Fast Multi-head Attention support on AMD ROCm #978

Merged · 540 commits · Mar 4, 2024

Commits (540)
bc23333
add option to build a standalone runner for splitk decoder; debugging…
tenpercent Dec 5, 2023
2c7b9bb
fix a few bugs
tenpercent Dec 6, 2023
709727f
fix an indexing bug
tenpercent Dec 6, 2023
785481c
stash changes
tenpercent Dec 6, 2023
ff0ebdb
Add benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py to benchmark mqa/gqa p…
qianfengz Dec 8, 2023
9a8baf7
Synchronize with latest update in composable_kernel_tiled feature/fmh…
qianfengz Dec 8, 2023
959ae7f
Tiny fix in benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py
qianfengz Dec 8, 2023
cc2f487
Synchronize with latest update in composable_kernel_tiled and make al…
qianfengz Dec 11, 2023
2162b45
Switch to new branch for composable_kernel_tiled submodule
qianfengz Dec 11, 2023
d6cf545
Add bfp16 instances for ck-tiled inference
qianfengz Dec 11, 2023
5cfda98
Update to test and benchmark scripts to include bfloat16
qianfengz Dec 11, 2023
ab60547
Tiny update to ck_tiled kernel
qianfengz Dec 11, 2023
a2af789
Change to benchmark_mem_eff_attn_mqa_gqa_ck_tiled benchmark cases
qianfengz Dec 11, 2023
d957dd9
stash changes
tenpercent Dec 11, 2023
40aa884
Use Async pipeline for no M/N0K1 padding cases
qianfengz Dec 11, 2023
73e97d8
Add CF_FMHA_FWD_FAST_EXP2 to building
qianfengz Dec 11, 2023
b0c7023
Add Triton FA2 forward op
sgrigory Dec 12, 2023
63c3523
Add Triton Flash Attention 2 to benchmarks
sgrigory Dec 12, 2023
fbd836a
Synchronize with latest third_party/composable_kernel and remove the …
qianfengz Dec 12, 2023
0d15f1b
stash split attention testing wip
tenpercent Dec 13, 2023
5c1bc54
Synchronize with latest third_party/composable_kernel again
qianfengz Dec 13, 2023
0172147
Merge branch 'develop' into ck-tiled-fa
qianfengz Dec 13, 2023
a018550
Synchronize with latest third_party/composable_kernel_tiled
qianfengz Dec 13, 2023
31da32e
Change to make ck decoder buildable with both ck tiled or non-tiled f…
qianfengz Dec 13, 2023
22c8d6f
Change to make ck decoder buildable with both ck tiled or non-tiled f…
qianfengz Dec 13, 2023
6428374
fix gqa for split-k=1
tenpercent Dec 13, 2023
f21e39a
Skip backward tests, fix import
sgrigory Dec 17, 2023
6c5540c
fix the mask for decoding; row max and lse are computed correctly; de…
tenpercent Dec 18, 2023
5225eef
make libtorch split-1 decoder implementation pass numerical correctness
tenpercent Dec 19, 2023
45727d6
Disable CK kernel for large shapes, better catch OOMs
sgrigory Dec 20, 2023
de5098e
Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/xfo…
qianfengz Dec 24, 2023
402ee91
Actually remove submodule composable_kernel_tiled from the branch
qianfengz Dec 24, 2023
7904096
Change the domain for the repo of composable_kernel submodule to ROCm
qianfengz Dec 24, 2023
defb8d9
Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/xfo…
qianfengz Dec 24, 2023
388a5ca
Merge pull request #5 from ROCmSoftwarePlatform/merge-upstream-merge
qianfengz Dec 26, 2023
b068558
Merge branch 'develop' into ck-tiled-fa
qianfengz Dec 26, 2023
44f6160
Update to validate_inputs() in common.py to support 4d mqa/gqa
qianfengz Dec 26, 2023
e03f67a
synchronize test_mem_eff_attention_ck.py with test_mem_eff_attention.py
qianfengz Dec 27, 2023
6aef46d
Tiny update in benchmark_mem_eff_attn_decoder_ck.py
qianfengz Dec 28, 2023
4a1cea0
Synchronize benchmark_mem_eff_attention_ck.py with benchmark_mem_eff_…
qianfengz Dec 28, 2023
ad024e4
Merge branch 'develop' into ck-tiled-fa
qianfengz Dec 28, 2023
c5ca494
Remove benchmark_mem_eff_attn_decoder_ck_tiled.py
qianfengz Dec 28, 2023
a74ee16
Merge branch 'develop' into decoder-splitk
tenpercent Jan 3, 2024
8ebfd5f
Support for Generic Attention Mask Coordinate
qianfengz Jan 3, 2024
43e7797
Merge pull request #6 from sgrigory/add-triton-fa2
qianfengz Jan 5, 2024
ba5fd52
Add ck.FwOp and ck.BwOp to dispatched operations
qianfengz Jan 5, 2024
6533aca
Add ck.FwOp and ck.BwOp to ALL_FW_OPS and ALL_BW_OPS
qianfengz Jan 5, 2024
7fc3620
Update in tests/readme_test_on_rocm.txt
qianfengz Jan 5, 2024
23e191a
Add ckF and ck_decoder to benchmark_mem_eff_attn_decoder.py
qianfengz Jan 5, 2024
b077cfc
Merge branch 'develop' into ck-tiled-fa
qianfengz Jan 5, 2024
45287b7
Synchronize with the latest ck-tiled commits
qianfengz Jan 8, 2024
1a74675
Add is_ck_tiled_used() c++ extension interface for judging if ck-tile…
qianfengz Jan 8, 2024
cbcc196
Remove composable_kernel_tiled submodule
qianfengz Jan 9, 2024
b4539f7
inner_product removed from splitk kernel code
tenpercent Jan 3, 2024
9c52e0e
remove some commented out debug code
tenpercent Jan 3, 2024
0a1aa5d
comment out debug code calling libtorch instead of hip implementation
tenpercent Jan 3, 2024
153d722
remove commented out old and incorrect code fragments
tenpercent Jan 3, 2024
eea5fef
add python version override to cmakelists
tenpercent Jan 3, 2024
d442fbe
add conversion from Argument struct to string; fix split1 test crash
tenpercent Jan 4, 2024
38c5e90
add f32 support in the python op
tenpercent Jan 5, 2024
b805813
refactor out input generation in cpp standalone
tenpercent Jan 5, 2024
03aed21
set loop unrolls to 1 in order to avoid index errors (will need to be…
tenpercent Jan 6, 2024
930dda1
fix output splits allocation
tenpercent Jan 8, 2024
bd50cf4
fix bug in split attention: sumexp needs timestep bounds in each split
tenpercent Jan 9, 2024
60c997d
clang-format-10
tenpercent Jan 9, 2024
b655ded
Merge remote-tracking branch 'origin/develop' into decoder-splitk
tenpercent Jan 9, 2024
588b3a0
Enable support of attn-bias types with LocalAttention
qianfengz Jan 10, 2024
04cf84b
Enable support of attn-bias types with LocalAttention
qianfengz Jan 10, 2024
a27403c
Synchronize submodule composable_kernel to the latest commits
qianfengz Jan 10, 2024
dfc2618
Make the efficient_attention_forward_ck() C++ interface consistent wi…
qianfengz Jan 10, 2024
5421612
Tiny fix in ck.py to make test_backward pass
qianfengz Jan 10, 2024
248efe1
Merge remote-tracking branch 'origin/develop' into decoder-splitk
tenpercent Jan 10, 2024
7948fe6
some refactorings for standalone tests
tenpercent Jan 11, 2024
e7ffe68
cleanup testing
tenpercent Jan 11, 2024
4953101
Make the efficient_attention_forward_ck() C++ interface consistent wi…
qianfengz Jan 10, 2024
e99fc1a
Tiny fix in ck.py to make test_backward pass
qianfengz Jan 10, 2024
d7721d2
fix split1 attention csrc test
tenpercent Jan 11, 2024
902910a
Enable support of flexible head-dim size (but <= 128) for ck-tiled fm…
qianfengz Jan 12, 2024
d1ef4bc
Use Async pipeline when no any padding used
qianfengz Jan 12, 2024
6cb0f60
implement general split-k split-attention in libtorch, use for testing
tenpercent Jan 12, 2024
0e04b17
fix split-max and split-sumexp shapes for split attention in libtorch
tenpercent Jan 12, 2024
e4d6b88
implement generic reduce split attention with libtorch
tenpercent Jan 13, 2024
17ec430
implement testing split reduce hip vs libtorch; tbd debug split-k=2 n…
tenpercent Jan 13, 2024
69f2f0a
refactor repetitive testing code
tenpercent Jan 15, 2024
2d54085
address code review: rearrange loops
tenpercent Jan 15, 2024
f937f06
address code review: add comment about number of iterations per split
tenpercent Jan 15, 2024
7f6b01f
address code review: remove comments
tenpercent Jan 15, 2024
187a4bc
address code review: possibly eliminate a bug by using correct timest…
tenpercent Jan 15, 2024
b157cba
address code review: add todo
tenpercent Jan 15, 2024
8581811
address code review: shift LDS access by tt_low to avoid smem overboo…
tenpercent Jan 16, 2024
b1638ad
address code review: simplify reduction loops in split attention
tenpercent Jan 16, 2024
10e76ab
Tiny update in ck-tiled forward kernel
qianfengz Jan 17, 2024
67009e0
address code review: merge for loops
tenpercent Jan 17, 2024
8673fa9
address code review: simplify coefficient pick
tenpercent Jan 17, 2024
3427dcc
fix runtime error message in testing code
tenpercent Jan 17, 2024
2e11d32
fix split reduce test
tenpercent Jan 17, 2024
dabc771
address code review: fix smem offsets
tenpercent Jan 17, 2024
6f1d5df
remove redundant comment
tenpercent Jan 17, 2024
8ee60d7
address code review: initialize split attention workspace as empty
tenpercent Jan 18, 2024
ff985d2
address code review: rename local vars
tenpercent Jan 18, 2024
d7132b9
address code review: remove unused _rand_seqlens
tenpercent Jan 18, 2024
f4d5263
address code review: cleanup python tests
tenpercent Jan 18, 2024
d81285a
remove redundant new_max local var
tenpercent Jan 18, 2024
eba46f1
address code review: rename seq_acc
tenpercent Jan 18, 2024
7f9ce55
re-enable loop unroll; adjust tests to handle splits with size divisi…
tenpercent Jan 18, 2024
f888b88
test a wider range of split-k in cpp tests; fix torch implementation …
tenpercent Jan 18, 2024
88afcea
Merge pull request #8 from ROCmSoftwarePlatform/decoder-splitk
qianfengz Jan 19, 2024
bad053f
Synchronize with ck-tiled update to support head-dim-256 and LSE storing
qianfengz Jan 19, 2024
391af2b
Add definition of FMHA_FWD_HEADDIM_SWITCH
qianfengz Jan 19, 2024
53719f9
Split the ck-tiled inference instances based on head-dim sizes to imp…
qianfengz Jan 19, 2024
92e088e
Setting k0n1_need_padding according to pipeline kQLoadOnce implementa…
qianfengz Jan 20, 2024
60a8e4a
Add fmha forward c++ extension for ck-tiled
qianfengz Jan 21, 2024
9357a24
Set SUPPORTED_MAX_K=256 in ck.py
qianfengz Jan 22, 2024
df479b5
Merge branch 'ck-tiled-fa' into develop
qianfengz Jan 22, 2024
04ddd4c
fix index in split-k attention
tenpercent Jan 24, 2024
c922d73
fix index in softmax reduce and complete fixing wavefronts per block …
tenpercent Jan 24, 2024
f666965
clang-format-10
tenpercent Jan 24, 2024
ecaf623
Fix v_dram_transposed transpose transform in the kernel
qianfengz Jan 24, 2024
8b337bd
Skip triton_splitk for test_forward in test_mem_eff_attention.py
qianfengz Jan 24, 2024
ee577e2
cleanup commented dead code
tenpercent Jan 24, 2024
a21ac03
enable ck split-k in benchmark_attn_decoding
tenpercent Jan 24, 2024
52dde22
Merge pull request #9 from ROCmSoftwarePlatform/decoder-splitk-opt
tenpercent Jan 25, 2024
5e3213f
add rocm_ci workflow
tenpercent Jan 24, 2024
0e47337
move scipy import from file level under function similar to _vec_bino…
tenpercent Jan 25, 2024
0bf3546
Merge pull request #11 from ROCmSoftwarePlatform/tests-imports
qianfengz Jan 28, 2024
1e1dca8
Merge branch 'develop' into ck-tiled-fa
qianfengz Jan 28, 2024
360201f
Add including of math_v2.hpp to ck_attention_forward_decoder_splitk.h
qianfengz Jan 28, 2024
faf1b16
move forward_splitk to ck_splitk; make dispatch aware of ck_splitk an…
tenpercent Jan 29, 2024
323ebae
Synchronize to latest ck-tiled and update accordingly
qianfengz Jan 30, 2024
9d2be4f
fix benchmark_attn_decoding
tenpercent Jan 30, 2024
7c3c766
Remove third_party/composable_kernel_tiled
qianfengz Jan 30, 2024
708c047
[Fix] use kK0BlockLength for HeadDim256 padding judging
qianfengz Jan 30, 2024
a0f2643
Tiny type change for custom_mask_type in param class
qianfengz Jan 31, 2024
96f3027
Change to use ROCm repo for ck-tiled submodule
qianfengz Feb 1, 2024
f3f2be4
Remove tests/test_forward_ck_tiled.py
qianfengz Feb 1, 2024
34466be
Update to test_mqa_forward_ck_tiled.py to use common create_attn_bias…
qianfengz Feb 1, 2024
2f92cde
Merge branch 'ck-tiled-fa' into develop
qianfengz Feb 1, 2024
351c766
Add ck-tiled checking in test_mqa_forward_ck_tiled.py
qianfengz Feb 1, 2024
ed26f5b
Merge branch 'ck-tiled-fa' into develop
qianfengz Feb 1, 2024
b58b4ed
rearrange smem access in softmax reduction
tenpercent Feb 2, 2024
5a026c0
Merge pull request #14 from ROCm/perf-adjustment-1
qianfengz Feb 2, 2024
5bbbe8f
Merge pull request #13 from ROCm/dispatcher
qianfengz Feb 2, 2024
8a40a31
Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/xfo…
qianfengz Feb 2, 2024
21062d1
Add test_decoder and test_splitk_decoder for ROCM into test_mem_eff_a…
qianfengz Feb 2, 2024
df7d523
Add ref_attention_splitk and its test to tests/test_mem_eff_attention.py
qianfengz Feb 2, 2024
ee633c8
Rename test_mem_eff_attention_ck.py as discarded
qianfengz Feb 2, 2024
2df5ed3
Add test_mqa_forward and ref_attention_mqa (for BMHK format mqa/gqa v…
qianfengz Feb 2, 2024
7d1219b
Rename test_mqa_forward_ck_tiled.py as discarded
qianfengz Feb 2, 2024
fe6f96e
Remove CK specific script benchmark_mem_eff_attn_decoder_ck.py
qianfengz Feb 2, 2024
5af967c
Refine benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py
qianfengz Feb 2, 2024
3f46c2f
Rename benchmark_mem_eff_attn_mqa_gqa_ck_tiled.py to benchmark_mem_ef…
qianfengz Feb 2, 2024
2c27aac
Remove the runtime_error with using logsumexp in attention_forward_ge…
qianfengz Feb 2, 2024
4b8ce7c
Add ck-tiled checking in ck.py
qianfengz Feb 2, 2024
0d311f5
Remove CK-specific benchmark scripts
qianfengz Feb 2, 2024
d57a5db
Don't require is_cpu_tensor for seqstart_q/seqstart_k/seqlen_k in att…
qianfengz Feb 3, 2024
b25c239
Remove seqlen_cpu from _PaddedSeqLenInfo in attn_bias.py
qianfengz Feb 3, 2024
1a3ce52
Change the branch for composable_kernel_tiled submodule and update to…
qianfengz Feb 4, 2024
f7bf9b4
Remove the using of seqlen_cpu in BwOp of ck.py
qianfengz Feb 4, 2024
15d2a72
Remove the using of seqlen_cpu in BwOp of ck.py
qianfengz Feb 4, 2024
bcd1936
Align .clang_format with main branch and re-format c++ files
qianfengz Feb 4, 2024
52ae8a3
Synchronize to latest ck-tiled commit
qianfengz Feb 4, 2024
af2aa86
Merge branch 'ck-tiled-fa' into develop
qianfengz Feb 4, 2024
7dd3aee
Add checking of IS_CK_TILED into some testing scripts
qianfengz Feb 4, 2024
5eb1235
Update to test_mem_eff_attention.py and ck.py
qianfengz Feb 5, 2024
dc0e67a
Merge branch 'ck-tiled-fa' into develop
qianfengz Feb 5, 2024
58e6101
Building xformers using ck-tiled as default
qianfengz Feb 5, 2024
1276abc
Merge branch 'ck-tiled-fa' into develop
qianfengz Feb 5, 2024
389dfb4
ensure ck_decoder does not dispatch
tenpercent Feb 5, 2024
f8d9043
Add disable_on_rocm on some test scripts
qianfengz Feb 5, 2024
78df6a9
Merge branch 'ck-tiled-fa' into develop
qianfengz Feb 5, 2024
6dae63c
Update to test_mem_eff_attention.py
qianfengz Feb 5, 2024
a7ed88c
Merge branch 'ck-tiled-fa' into develop
qianfengz Feb 5, 2024
20e178a
Merge pull request #16 from ROCm/fix_test_attn_bias_padded
qianfengz Feb 6, 2024
0624c92
apply isort
tenpercent Feb 6, 2024
b8ebf08
apply black
tenpercent Feb 6, 2024
3b33c5d
fix flake8 suggestions
tenpercent Feb 6, 2024
0a9c933
add license headers and reapply black
tenpercent Feb 6, 2024
47367a4
Merge pull request #17 from ROCm/linters
qianfengz Feb 6, 2024
fb46611
Merge pull request #10 from ROCm/enable-ci
qianfengz Feb 6, 2024
28d3672
Tiny update to rocm_ci.yml
qianfengz Feb 6, 2024
12fb41c
Add conditional compiling for cuda-depending codes in ROCM
qianfengz Feb 6, 2024
a9d83c6
Update to benchmark scripts
qianfengz Feb 7, 2024
9ab3831
Rename the one script file
qianfengz Feb 7, 2024
243dc6a
Revert "Add conditional compiling for cuda-depending codes in ROCM"
qianfengz Feb 7, 2024
3240ba1
Update to scripts
qianfengz Feb 7, 2024
0c51af1
Change and add readme for tests and benchmarks
qianfengz Feb 7, 2024
f36c93b
Remove the stuffs for supporting old ck
qianfengz Feb 7, 2024
9e4582d
Remove old composable_kernel from submodule list
qianfengz Feb 7, 2024
356cafd
Remove folder third_party/composable_kernel
qianfengz Feb 7, 2024
8415b00
Merge branch 'develop' into dev_to_upstream
qianfengz Feb 7, 2024
79c554c
Rename the folder
qianfengz Feb 8, 2024
2be6c04
Remove unused script file
qianfengz Feb 8, 2024
61d875a
apply black
tenpercent Feb 9, 2024
4616121
pacify mypy
tenpercent Feb 9, 2024
832e223
fix clang-format
tenpercent Feb 9, 2024
2b2967e
reapply black
tenpercent Feb 9, 2024
89fb7d6
Merge pull request #3 from tenpercent/lints
tenpercent Feb 12, 2024
3c9d4e5
fix lints
tenpercent Feb 13, 2024
1d474c5
make test_splitk_reference run on cpu
tenpercent Feb 13, 2024
d38a684
add ck modules to docs
tenpercent Feb 13, 2024
eccbf54
try fixing nvidia build by re-including sparse24 cpp folder into exte…
tenpercent Feb 13, 2024
1ef6c20
update cutlass to upstream commit
tenpercent Feb 13, 2024
9dfec0d
update flash-attention to upstream commit
tenpercent Feb 13, 2024
9fcda18
simplify setup.py
tenpercent Feb 13, 2024
01c2bfd
Merge branch 'main' of https://github.com/facebookresearch/xformers i…
tenpercent Feb 13, 2024
58d38d4
remove duplicate run_batched_infer_causalmask_attnbias_dispatched<f16…
tenpercent Feb 13, 2024
07183f0
add hip version and pytorch hip arch list to xformers build info
tenpercent Feb 14, 2024
993a90c
fix build
tenpercent Feb 14, 2024
d4a374b
patch around the unhappy path in get_hip_version
tenpercent Feb 14, 2024
ff59f19
skip test_grad_checkpointing for triton_splitk since it doesn't have …
tenpercent Feb 15, 2024
81bcfd5
re-enable test_mqa_forward since ck tiled is the current implementation
tenpercent Feb 15, 2024
a0f7f27
make skip test_wrong_alignment more generic
tenpercent Feb 15, 2024
a0d8dcc
reapply black
tenpercent Feb 15, 2024
bc7035c
simplify test_decoder
tenpercent Feb 15, 2024
f02d0d4
put python version check inside triton_splitk op
tenpercent Feb 15, 2024
77a6c13
fix logic
tenpercent Feb 15, 2024
a7cd678
cleanup python3.9 checks in tests
tenpercent Feb 15, 2024
dea783d
cleanup test_attentions
tenpercent Feb 15, 2024
acd6b7a
cleanup test_checkpoint as test running on cpu does not depend on gpu…
tenpercent Feb 16, 2024
f467a1d
fix lints
tenpercent Feb 16, 2024
d758eac
try fixing win build by conditional import of triton in triton op
tenpercent Feb 16, 2024
21f1904
re-enable test_triton_layernorm as it passes
tenpercent Feb 17, 2024
d880c36
re-enable test_triton_blocksparse as it passes
tenpercent Feb 17, 2024
059c84f
cleanup test_sparse_tensors
tenpercent Feb 17, 2024
8aa0bdc
cleanup test_custom_ops
tenpercent Feb 17, 2024
5bc7bbe
reapply black
tenpercent Feb 17, 2024
5b4ebe4
cleanup test_core_attention
tenpercent Feb 17, 2024
473ebc7
benchmark ck ops on rocm only
tenpercent Feb 17, 2024
2a7272e
Merge branch 'main' of https://github.com/facebookresearch/xformers i…
tenpercent Feb 19, 2024
5d3247f
fix mypy
tenpercent Feb 19, 2024
9be7f8d
Merge branch 'dev_upstream' of https://github.com/ROCm/xformers into …
tenpercent Feb 20, 2024
58b0f75
fix lint: black
tenpercent Feb 21, 2024
03b7294
fix lints: mypy
tenpercent Feb 21, 2024
a02ab9b
Rename HDim/headdim to MaxK/maxk
qianfengz Feb 22, 2024
fd36725
Move some headers files to ck examples for later reusing
qianfengz Feb 22, 2024
41f5ada
Merge branch 'develop' of https://github.com/ROCm/xformers into develop
qianfengz Feb 22, 2024
d8384c1
Replace using qs_ks_vs pipeline by qr_ks_vs pipeline while HeadDim is…
qianfengz Feb 22, 2024
e5d4a76
rm test_ck_7
tenpercent Feb 22, 2024
7d43238
fix lints
tenpercent Feb 26, 2024
6fbb383
Merge branch 'main' of https://github.com/facebookresearch/xformers i…
tenpercent Feb 26, 2024
1db3a5a
unskip test_unsupported_alignment
tenpercent Feb 28, 2024
57d7e96
move out test_splitk_reference
tenpercent Feb 28, 2024
14c831e
add license header to file created in prev commit
tenpercent Feb 28, 2024
d5a26a6
roll back fmha/common.py
tenpercent Feb 28, 2024
3560806
fix lint
tenpercent Feb 28, 2024
f654b3a
remove unused ref_attention_mqa
tenpercent Feb 28, 2024
99947ff
Merge pull request #5 from ROCm/roll-back-fmha-common
qianfengz Feb 29, 2024
c5ea221
resolve error in triton_splitk on rocm
tenpercent Mar 1, 2024
b585563
Merge branch 'main' of https://github.com/facebookresearch/xformers i…
tenpercent Mar 1, 2024
6752f07
disable partial attention tests on rocm
tenpercent Mar 1, 2024
Files changed
71 changes: 71 additions & 0 deletions .github/workflows/rocm_ci.yml
@@ -0,0 +1,71 @@
name: ROCM_CI

on:
  pull_request:
    types: [labeled, synchronize, reopened]

jobs:
  build:
    if: contains(github.event.label.name, 'rocm')
    runs-on: rocm

    steps:
    - uses: actions/checkout@v2
    - name: Get CPU info on Ubuntu
      if: contains(runner.os, 'linux')
      run: |
        cat /proc/cpuinfo
    - name: Get env vars
      run: |
        echo GITHUB_WORKFLOW = $GITHUB_WORKFLOW
        echo HOME = $HOME
        echo PWD = $PWD
        echo GITHUB_ACTION = $GITHUB_ACTION
        echo GITHUB_ACTIONS = $GITHUB_ACTIONS
        echo GITHUB_REPOSITORY = $GITHUB_REPOSITORY
        echo GITHUB_EVENT_NAME = $GITHUB_EVENT_NAME
        echo GITHUB_EVENT_PATH = $GITHUB_EVENT_PATH
        echo GITHUB_WORKSPACE = $GITHUB_WORKSPACE
        echo GITHUB_SHA = $GITHUB_SHA
        echo GITHUB_REF = $GITHUB_REF
        export GIT_BRANCH=${GITHUB_BASE_REF:-${GITHUB_REF#refs/heads/}}
        echo GIT_BRANCH = $GIT_BRANCH
        export ROCM_PATH=/opt/rocm
        echo ROCM_PATH = $ROCM_PATH
        export MAX_JOBS=64
        echo MAX_JOBS = $MAX_JOBS
        hipcc --version
        rocm-smi
        rocminfo | grep "gfx"
    - name: Build XFormers
      run: |
        git clone --recursive -b $GIT_BRANCH $GITHUB_REPOSITORY
        docker run -it --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --device=/dev/kfd --device=/dev/dri --group-add video --ipc=host --shm-size 8G -v $PWD/xformers:/xformers rocm/pytorch-nightly:latest
        pip3 install --upgrade pip
        pip3 uninstall -y xformers
        MAX_JOBS=$MAX_JOBS pip3 install -e /xformers --verbose
        pip3 install scipy==1.10
        python3 -c "import torch; print(torch.__version__)"
        python3 -m xformers.info
    - name: Run python tests
      run: |
        pytest -rpfs /xformers/tests/test_mem_eff_attention.py | tee test_mem_eff_attention.log
    - name: Archive logs
      uses: actions/upload-artifact@v3
      with:
        name: test results
        path: test_mem_eff_attention_ck.log

    - name: Process test results
      run: |
        echo "Processing test results TBD"
10 changes: 10 additions & 0 deletions .gitignore
@@ -60,3 +60,13 @@ outputs
xformers/_flash_attn
xformers/version.py
xformers/cpp_lib.json

## temporary files
xformers/csrc/attention/hip_fmha/*.cu
xformers/csrc/attention/hip_fmha/*.hip
xformers/csrc/attention/hip_fmha/*_hip.h
xformers/csrc/attention/hip_fmha/instances/*.cu
xformers/csrc/attention/hip_fmha/instances/*.hip
xformers/csrc/attention/hip_fmha/instances_tiled/*.cu
xformers/csrc/attention/hip_fmha/instances_tiled/*.hip

4 changes: 4 additions & 0 deletions .gitmodules
@@ -4,3 +4,7 @@
[submodule "third_party/flash-attention"]
  path = third_party/flash-attention
  url = https://github.com/Dao-AILab/flash-attention.git
[submodule "third_party/composable_kernel_tiled"]
  path = third_party/composable_kernel_tiled
  url = https://github.com/ROCm/composable_kernel.git
  branch = ck_tile/dev
14 changes: 13 additions & 1 deletion docs/source/components/ops.rst
@@ -22,13 +22,25 @@ Available implementations
   :member-order: bysource
Contributor comment: (Looks like decoder and triton_splitk should have been added here months ago. 🫢)


.. automodule:: xformers.ops.fmha.triton
   :members: FwOp, BwOp
   :members: FwOp
   :member-order: bysource

.. automodule:: xformers.ops.fmha.small_k
   :members: FwOp, BwOp
   :member-order: bysource

.. automodule:: xformers.ops.fmha.ck
   :members: FwOp, BwOp
   :member-order: bysource

.. automodule:: xformers.ops.fmha.ck_decoder
   :members: FwOp
   :member-order: bysource

.. automodule:: xformers.ops.fmha.ck_splitk
   :members: FwOp
   :member-order: bysource

Attention biases
~~~~~~~~~~~~~~~~~~~~

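For context on how the newly documented CK ops are used, here is a minimal call-site sketch (not part of this diff; shapes, dtype and device string are illustrative, and on ROCm the dispatcher can also be left to pick between ck, ck_decoder and ck_splitk automatically):

# Hedged sketch, assuming a ROCm build of xformers and fp16 tensors on an AMD GPU.
import torch
import xformers.ops as xops
from xformers.ops import fmha

# (batch, seqlen, heads, head_dim) layout expected by memory_efficient_attention
q = torch.randn(2, 1024, 8, 64, device="cuda", dtype=torch.float16)
k = torch.randn(2, 1024, 8, 64, device="cuda", dtype=torch.float16)
v = torch.randn(2, 1024, 8, 64, device="cuda", dtype=torch.float16)

# Inference-only path through the CK fused kernel documented above.
out = xops.memory_efficient_attention_forward(q, k, v, op=fmha.ck.FwOp)

# Or let xformers dispatch automatically.
out_auto = xops.memory_efficient_attention(q, k, v)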
2 changes: 1 addition & 1 deletion requirements-test.txt
@@ -25,7 +25,7 @@ hydra-core >= 1.1

# Dependency for Mixture of Experts
fairscale >= 0.4.5
scipy
scipy >= 1.7

# Dependency for fused layers, optional
cmake
82 changes: 82 additions & 0 deletions setup.py
@@ -125,6 +125,23 @@ def get_cuda_version(cuda_dir) -> int:
    return bare_metal_major * 100 + bare_metal_minor


def get_hip_version(rocm_dir) -> str:
    hipcc_bin = "hipcc" if rocm_dir is None else os.path.join(rocm_dir, "bin", "hipcc")
    try:
        raw_output = subprocess.check_output(
            [hipcc_bin, "--version"], universal_newlines=True
        )
    except Exception as e:
        print(
            f"hip installation not found: {e} ROCM_PATH={os.environ.get('ROCM_PATH')}"
        )
        return None
    for line in raw_output.split("\n"):
        if "HIP version" in line:
            return line.split()[-1]
    return None


def get_flash_attention_extensions(cuda_version: int, extra_compile_args):
    # XXX: Not supported on windows for cuda<12
    # https://github.com/Dao-AILab/flash-attention/issues/345
@@ -223,11 +240,27 @@ def get_flash_attention_extensions(cuda_version: int, extra_compile_args):
    ]


def rename_cpp_cu(cpp_files):
    for entry in cpp_files:
        shutil.copy(entry, os.path.splitext(entry)[0] + ".cu")


def get_extensions():
    extensions_dir = os.path.join("xformers", "csrc")

    sources = glob.glob(os.path.join(extensions_dir, "**", "*.cpp"), recursive=True)
    source_cuda = glob.glob(os.path.join(extensions_dir, "**", "*.cu"), recursive=True)
    source_hip = glob.glob(
        os.path.join(extensions_dir, "attention", "hip_fmha", "**", "*.cpp"),
        recursive=True,
    )
    source_hip_generated = glob.glob(
        os.path.join(extensions_dir, "attention", "hip_fmha", "**", "*.cu"),
        recursive=True,
    )
    # avoid the temporary .cu files generated under xformers/csrc/attention/hip_fmha
    source_cuda = list(set(source_cuda) - set(source_hip_generated))
    sources = list(set(sources) - set(source_hip))

    sputnik_dir = os.path.join(this_dir, "third_party", "sputnik")
    cutlass_dir = os.path.join(this_dir, "third_party", "cutlass", "include")
@@ -253,6 +286,7 @@ def get_extensions():
    include_dirs = [extensions_dir]
    ext_modules = []
    cuda_version = None
    hip_version = None
    flash_version = "0.0.0"

    if (
@@ -294,6 +328,7 @@ def get_extensions():
        flash_extensions = get_flash_attention_extensions(
            cuda_version=cuda_version, extra_compile_args=extra_compile_args
        )

        if flash_extensions:
            flash_version = get_flash_version()
            ext_modules += flash_extensions
@@ -306,6 +341,51 @@ def get_extensions():
                "--ptxas-options=-O2",
                "--ptxas-options=-allow-expensive-optimizations=true",
            ]
    elif torch.cuda.is_available() and torch.version.hip:
        rename_cpp_cu(source_hip)
        rocm_home = os.getenv("ROCM_PATH")
        hip_version = get_hip_version(rocm_home)

        source_hip_cu = []
        for ff in source_hip:
            source_hip_cu += [ff.replace(".cpp", ".cu")]

        extension = CUDAExtension
        sources += source_hip_cu
        include_dirs += [
            Path(this_dir) / "xformers" / "csrc" / "attention" / "hip_fmha"
        ]

        include_dirs += [
            Path(this_dir)
            / "third_party"
            / "composable_kernel_tiled"
            / "example"
            / "91_tile_program"
            / "xformers_fmha"
        ]

        include_dirs += [
            Path(this_dir) / "third_party" / "composable_kernel_tiled" / "include"
        ]

        generator_flag = []

        cc_flag = ["-DBUILD_PYTHON_PACKAGE"]
        extra_compile_args = {
            "cxx": ["-O3", "-std=c++17"] + generator_flag,
            "nvcc": [
                "-O3",
                "-std=c++17",
                f"--offload-arch={os.getenv('HIP_ARCHITECTURES', 'native')}",
                "-U__CUDA_NO_HALF_OPERATORS__",
                "-U__CUDA_NO_HALF_CONVERSIONS__",
                "-DCK_FMHA_FWD_FAST_EXP2=1",
                "-fgpu-flush-denormals-to-zero",
            ]
            + generator_flag
            + cc_flag,
        }

    ext_modules.append(
        extension(
@@ -320,6 +400,7 @@ def get_extensions():
    return ext_modules, {
        "version": {
            "cuda": cuda_version,
            "hip": hip_version,
            "torch": torch.__version__,
            "python": platform.python_version(),
            "flash": flash_version,
@@ -328,6 +409,7 @@ def get_extensions():
            k: os.environ.get(k)
            for k in [
                "TORCH_CUDA_ARCH_LIST",
                "PYTORCH_ROCM_ARCH",
                "XFORMERS_BUILD_TYPE",
                "XFORMERS_ENABLE_DEBUG_ASSERTIONS",
                "NVCC_FLAGS",
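To see which branch of get_extensions() a given machine would take, here is a standalone sketch of the same probing logic (not part of this diff; the hipcc location and environment variables are assumptions, and the parsing mirrors get_hip_version() above):

# Hedged sketch: probe for a ROCm toolchain the way setup.py's get_hip_version() does.
import os
import subprocess

import torch


def detect_hip_version(rocm_dir=None):
    hipcc = "hipcc" if rocm_dir is None else os.path.join(rocm_dir, "bin", "hipcc")
    try:
        out = subprocess.check_output([hipcc, "--version"], universal_newlines=True)
    except Exception:
        return None
    for line in out.split("\n"):
        if "HIP version" in line:
            return line.split()[-1]  # e.g. "5.7.xxxxx-..."
    return None


if torch.cuda.is_available() and torch.version.hip:
    print("ROCm/HIP build, HIP version:", detect_hip_version(os.getenv("ROCM_PATH")))
elif torch.cuda.is_available() and torch.version.cuda:
    print("CUDA build, CUDA version:", torch.version.cuda)
else:
    print("CPU-only build")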
13 changes: 13 additions & 0 deletions tests/readme_test_on_rocm.txt
@@ -0,0 +1,13 @@

1. #> pip install -e ./

2. verify testing for generic fmha inference on ROCM

#> pytest tests/test_mem_eff_attention.py::test_forward

3. verify testing for decoder fmha inference on ROCM

#> pytest tests/test_mem_eff_attention.py::test_decoder
#> pytest tests/test_mem_eff_attention.py::test_splitk_decoder
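
The same checks can be driven from a Python session; a small sketch using the standard pytest API (test node ids taken from the readme above, working directory assumed to be the repository root):

# Hedged sketch: run the ROCm fmha tests programmatically instead of from the shell.
import pytest

exit_code = pytest.main([
    "-rpfs",
    "tests/test_mem_eff_attention.py::test_forward",
    "tests/test_mem_eff_attention.py::test_decoder",
    "tests/test_mem_eff_attention.py::test_splitk_decoder",
])
print("pytest exit code:", exit_code)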


7 changes: 7 additions & 0 deletions tests/test_attentions.py
@@ -107,6 +107,13 @@ def test_order_invariance(
    causal: bool,
    device: torch.device,
):
    if (
        torch.version.hip
        and device == torch.device("cuda")
        and attention_name == "local"
    ):
        # Backend calls into Sputnik library which isn't built on ROCm
        device = torch.device("cpu")

    torch.manual_seed(42)
    torch.cuda.manual_seed_all(42)
15 changes: 14 additions & 1 deletion tests/test_checkpoint.py
@@ -111,7 +111,11 @@ def test_checkpoint_with_grad(policy_fn, input_requires_grad, grad_mode):
    "op",
    [
        xformers.ops.MemoryEfficientAttentionFlashAttentionOp,
        xformers.ops.MemoryEfficientAttentionCutlassOp,
        (
            xformers.ops.MemoryEfficientAttentionCutlassOp
            if torch.version.cuda
            else xformers.ops.MemoryEfficientAttentionCkOp
        ),
    ],
)
def test_checkpoint_attention(policy_fn, input_requires_grad, device, autocast, op):
@@ -121,6 +125,15 @@ def test_checkpoint_attention(policy_fn, input_requires_grad, device, autocast,
    ):
        pytest.skip("skipping operator not supported in this arch")

    if (
        op is xformers.ops.MemoryEfficientAttentionFlashAttentionOp
        and torch.version.hip
    ):
        pytest.skip("FlashAttentionOp is not supported on ROCM!")

    if op is xformers.ops.MemoryEfficientAttentionCkOp:
        pytest.skip("Gradience is currently not supported by ck-tiled!")

    class Attn(nn.Module):
        def forward(self, x):
            out = xformers.ops.memory_efficient_attention(x, x, x, op=op)
5 changes: 4 additions & 1 deletion tests/test_core_attention.py
@@ -31,7 +31,9 @@ def fn_and_catch_oor(*args, **kwargs):
    return fn_and_catch_oor


_devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
_devices = (
    ["cpu", "cuda"] if torch.cuda.is_available() and torch.version.cuda else ["cpu"]
)


def test_core_attention():
@@ -144,6 +146,7 @@ def test_amp_attention_sparsecs(device):
@pytest.mark.skipif(
    not _is_blocksparse_available, reason="Blocksparse is not available"
)
@pytest.mark.skipif(not torch.version.cuda, reason="Sparse ops not supported on ROCm")
@pytest.mark.parametrize("device", ["cuda"])
@pytest.mark.parametrize("data_type", [torch.float16, torch.float32])
@catch_oor
9 changes: 7 additions & 2 deletions tests/test_custom_ops.py
@@ -16,8 +16,13 @@
    _sparse_bmm,
)

cuda_only = pytest.mark.skipif(not torch.cuda.is_available(), reason="requires CUDA")
_devices = ["cpu", "cuda"] if torch.cuda.is_available() else ["cpu"]
cuda_only = pytest.mark.skipif(
    not torch.cuda.is_available() or not torch.version.cuda, reason="requires CUDA"
)

_devices = (
    ["cpu", "cuda"] if torch.cuda.is_available() and torch.version.cuda else ["cpu"]
)


def _baseline_matmul_with_sparse_mask(