Add support for ORT_DML backend for dx12 devices

AmusementClub · May 28, 2023 · d9e4111 · d9e4111
1 parent cf2bfbf
commit d9e4111
Show file tree

Hide file tree

Showing 6 changed files with 133 additions and 39 deletions.
diff --git a/.github/workflows/windows-ort.yml b/.github/workflows/windows-ort.yml
@@ -44,22 +44,22 @@ jobs:
       uses: actions/cache@v3
       with:
         path: vsort/protobuf/install
-        key: ${{ runner.os }}-vsort-protobuf-v3
+        key: ${{ runner.os }}-vsort-protobuf-v4
 
     - name: Checkout protobuf
       uses: actions/checkout@v3
       if: steps.cache-protobuf.outputs.cache-hit != 'true'
       with:
         repository: protocolbuffers/protobuf
-        # follows protobuf in https://github.com/AmusementClub/onnxruntime/tree/master/cmake/external
+        # follows protobuf in https://github.com/AmusementClub/onnxruntime/blob/master/cmake/external/onnxruntime_external_deps.cmake#L161
         # if you change this, remember to bump the version of the cache key.
-        ref: a902b39270841beafc307dfa709610aa1cac2f06
+        ref: v3.21.12
         fetch-depth: 1
         path: vsort/protobuf
 
     - name: Configure protobuf
       if: steps.cache-protobuf.outputs.cache-hit != 'true'
-      run: cmake -S protobuf\cmake -B protobuf\build_rel -G Ninja -LA
+      run: cmake -S protobuf -B protobuf\build_rel -G Ninja -LA
         -D CMAKE_BUILD_TYPE=Release
         -D protobuf_BUILD_SHARED_LIBS=OFF  -D protobuf_BUILD_TESTS=OFF
 
@@ -76,7 +76,7 @@ jobs:
       uses: actions/cache@v3
       with:
         path: vsort/onnx/install
-        key: ${{ runner.os }}-vsort-onnx-v3
+        key: ${{ runner.os }}-vsort-onnx-v4
 
     - name: Checkout onnx
       if: steps.cache-onnx.outputs.cache-hit != 'true'
@@ -85,7 +85,7 @@ jobs:
         repository: onnx/onnx
         # follows onnx in https://github.com/AmusementClub/onnxruntime/tree/master/cmake/external
         # if you change this, remember to bump the version of the cache key.
-        ref: 5a5f8a5935762397aa68429b5493084ff970f774
+        ref: a0d77f18516d2da7468a96b0de3b737266f23176
         fetch-depth: 1
         path: vsort/onnx
 
@@ -116,7 +116,7 @@ jobs:
 
     - name: Download ONNX Runtime Precompilation
       run: |
-        curl -s -o ortgpu.zip -LJO https://github.com/AmusementClub/onnxruntime/releases/latest/download/onnxruntime-gpu-win64.zip
+        curl -s -o ortgpu.zip -LJO https://github.com/AmusementClub/onnxruntime/releases/download/orttraining_rc2-5943-g73584f936-230528-0922/onnxruntime-gpu-win64.zip
         unzip -q ortgpu.zip
 
     - name: Cache CUDA
@@ -143,6 +143,7 @@ jobs:
         -D ONNX_RUNTIME_LIB_DIRECTORY=onnxruntime-gpu\lib
         -D ENABLE_CUDA=1
         -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8"
+        -D ENABLE_DML=1
         -D CMAKE_CXX_STANDARD=20
 
     - name: Build
@@ -157,6 +158,13 @@ jobs:
         copy onnxruntime-gpu\bin\*.dll artifact\vsort\
         copy onnxruntime-gpu\lib\*.dll artifact\vsort\
 
+    - name: Download DirectML Library
+      # follows DirectML in https://github.com/AmusementClub/onnxruntime/blob/master/cmake/external/dml.cmake#L44
+      run: |
+        curl -s -o directml.nupkg -LJO https://www.nuget.org/api/v2/package/Microsoft.AI.DirectML/1.12.0
+        unzip -q directml.nupkg -d dml
+        copy dml\bin\x64-win\DirectML.dll artifact\vsort\
+
     - name: Upload
       uses: actions/upload-artifact@v3
       with:

diff --git a/.github/workflows/windows-release.yml b/.github/workflows/windows-release.yml
@@ -71,7 +71,7 @@ jobs:
     - name: Compress scirpts.7z
       run: |
         cd scripts
-        7za a -t7z -bb3 -mx=3 ../scripts.${{ github.event.inputs.tag }}.7z .
+        7za a -t7z -bb3 -mx=9 ../scripts.${{ github.event.inputs.tag }}.7z .
 
     - name: Upload scripts release
       uses: actions/upload-artifact@v3
@@ -113,7 +113,7 @@ jobs:
         popd
         ls -lR
         du -sh
-        7za a -t7z -bb3 -mx=3 ../models.7z .
+        7za a -t7z -bb3 -mx=9 ../models.7z .
 
     - name: Upload model release
       uses: actions/upload-artifact@v3
@@ -144,7 +144,7 @@ jobs:
         popd
         ls -lR
         du -sh
-        7za a -t7z -bb3 -mx=3 ../ext-models.7z .
+        7za a -t7z -bb3 -mx=9 ../ext-models.7z .
 
     - name: Upload external model release
       uses: actions/upload-artifact@v3
@@ -175,7 +175,7 @@ jobs:
         popd
         ls -lR
         du -sh
-        7za a -t7z -bb3 -mx=3 ../contrib-models.7z .
+        7za a -t7z -bb3 -mx=9 ../contrib-models.7z .
 
     - name: Upload contrib model release
       uses: actions/upload-artifact@v3
@@ -264,7 +264,7 @@ jobs:
         cp scripts-release/*.py release-cpu/
         cd release-cpu
         ls -lR
-        7za a -t7z -bb3 -mx=3 ../vsmlrt-windows-x64-cpu.7z .
+        7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-cpu.7z .
 
     - name: Upload CPU-only release
       uses: actions/upload-artifact@v3
@@ -286,36 +286,36 @@ jobs:
         generate_release_notes: false
         prerelease: true
 
-    - name: Build non-CUDA GPU release
+    - name: Build generic GPU release
       shell: bash
       run: |
-        mkdir release-vk
-        cp -r models-release/models release-vk/
-        cp -r vsov-release/* release-vk/
-        cp -r vsort-release/* release-vk/
-        rm -f release-vk/vsort/onnxruntime_providers_*.dll
-        cp -r vsncnn-release/* release-vk/
-        cp scripts-release/*.py release-vk/
-        cd release-vk
+        mkdir release-generic-gpu
+        cp -r models-release/models release-generic-gpu/
+        cp -r vsov-release/* release-generic-gpu/
+        cp -r vsort-release/* release-generic-gpu/
+        rm -f release-generic-gpu/vsort/onnxruntime_providers_*.dll
+        cp -r vsncnn-release/* release-generic-gpu/
+        cp scripts-release/*.py release-generic-gpu/
+        cd release-generic-gpu
         ls -lR
-        7za a -t7z -bb3 -mx=3 ../vsmlrt-windows-x64-vk.7z .
+        7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-generic-gpu.7z .
 
     - name: Upload non-CUDA GPU release
       uses: actions/upload-artifact@v3
       if: false
       with:
-        name: vsmlrt-vk-release
-        path: vsmlrt-windows-x64-vk.7z
+        name: vsmlrt-generic-gpu-release
+        path: vsmlrt-windows-x64-generic-gpu.7z
         retention-days: 1
 
     - name: Rename release asset for non-CUDA GPU release
-      run: mv vsmlrt-windows-x64-vk.7z vsmlrt-windows-x64-vk.${{ github.event.inputs.tag }}.7z
+      run: mv vsmlrt-windows-x64-generic-gpu.7z vsmlrt-windows-x64-generic-gpu.${{ github.event.inputs.tag }}.7z
 
     - name: Release non-CUDA GPU
       uses: softprops/action-gh-release@v1
       with:
         tag_name: ${{ github.event.inputs.tag }}
-        files: vsmlrt-windows-x64-vk.${{ github.event.inputs.tag }}.7z
+        files: vsmlrt-windows-x64-generic-gpu.${{ github.event.inputs.tag }}.7z
         fail_on_unmatched_files: true
         generate_release_notes: false
         prerelease: true
@@ -339,7 +339,7 @@ jobs:
         cp scripts-release/*.py release-cuda/
         cd release-cuda
         ls -lR
-        7za a -t7z -bb3 -mx=3 ../vsmlrt-windows-x64-cuda.7z .
+        7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-cuda.7z .
 
     - name: Upload CUDA release
       uses: actions/upload-artifact@v3

diff --git a/scripts/vsmlrt.py b/scripts/vsmlrt.py
@@ -1,4 +1,4 @@
-__version__ = "3.15.23"
+__version__ = "3.15.24"
 
 __all__ = [
     "Backend", "BackendV2",
@@ -12,7 +12,7 @@
 ]
 
 import copy
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 import enum
 import math
 import os
@@ -172,14 +172,28 @@ class NCNN_VK:
         # internal backend attributes
         supports_onnx_serialization: bool = True
 
+    @dataclass(frozen=False)  # frozen=False: instances remain mutable after construction
+    class ORT_DML:
+        """ backend for directml (d3d12) devices """
+
+        device_id: int = 0  # forwarded as device_id= to core.ort.Model by _inference
+        num_streams: int = 1  # forwarded as num_streams= to core.ort.Model by _inference
+        verbosity: int = 2  # forwarded as verbosity= to core.ort.Model by _inference
+        fp16: bool = False  # forwarded as fp16= to core.ort.Model by _inference
+        fp16_blacklist_ops: typing.Optional[typing.Sequence[str]] = None  # forwarded as fp16_blacklist_ops=; None presumably selects the plugin default - TODO confirm
+
+        # internal backend attributes
+        supports_onnx_serialization: bool = True  # not passed to core.ort.Model in the ORT_DML branch of _inference
+
 
 backendT = typing.Union[
     Backend.OV_CPU,
     Backend.ORT_CPU,
     Backend.ORT_CUDA,
     Backend.TRT,
     Backend.OV_GPU,
-    Backend.NCNN_VK
+    Backend.NCNN_VK,
+    Backend.ORT_DML,
 ]
 
 
@@ -1399,6 +1413,18 @@ def _inference(
             path_is_serialization=path_is_serialization,
             fp16_blacklist_ops=backend.fp16_blacklist_ops
         )
+    elif isinstance(backend, Backend.ORT_DML):
+        clip = core.ort.Model(
+            clips, network_path,
+            overlap=overlap, tilesize=tilesize,
+            provider="DML", builtin=False,
+            device_id=backend.device_id,
+            num_streams=backend.num_streams,
+            verbosity=backend.verbosity,
+            fp16=backend.fp16,
+            path_is_serialization=path_is_serialization,
+            fp16_blacklist_ops=backend.fp16_blacklist_ops
+        )
     elif isinstance(backend, Backend.ORT_CUDA):
         clip = core.ort.Model(
             clips, network_path,
@@ -1701,6 +1727,20 @@ def OV_GPU(*,
             **kwargs
         )
 
+    @staticmethod
+    def ORT_DML(*,  # keyword-only factory returning a Backend.ORT_DML instance
+        device_id: int = 0,
+        num_streams: int = 1,
+        fp16: bool = False,
+        **kwargs  # any remaining Backend.ORT_DML fields, e.g. verbosity or fp16_blacklist_ops
+    ) -> Backend.ORT_DML:
+        return Backend.ORT_DML(
+            device_id=device_id,
+            num_streams=num_streams,
+            fp16=fp16,
+            **kwargs  # passed through unchanged; unknown names raise TypeError in the dataclass constructor
+        )
+
 
 def fmtc_resample(clip: vs.VideoNode, **kwargs) -> vs.VideoNode:
     clip_org = clip

diff --git a/vsort/CMakeLists.txt b/vsort/CMakeLists.txt
@@ -7,6 +7,7 @@ set(ONNX_RUNTIME_API_DIRECTORY "" CACHE PATH "Path to ONNX API headers")
 set(ONNX_RUNTIME_LIB_DIRECTORY "" CACHE PATH "Path to ONNX Runtime libraries")
 
 set(ENABLE_CUDA OFF CACHE BOOL "Enable CUDA backend")
+set(ENABLE_DML OFF CACHE BOOL "Enable DirectML backend")
 
 find_package(protobuf REQUIRED CONFIG)
 find_package(ONNX REQUIRED CONFIG)
@@ -52,6 +53,10 @@ if (ENABLE_CUDA)
     endif()
 endif()
 
+if (ENABLE_DML)  # ENABLE_DML is the BOOL cache option declared near the top of this file (default OFF)
+    add_compile_definitions(ENABLE_DML)  # directory-scoped define; NOTE(review): target_compile_definitions(vsort PRIVATE ENABLE_DML) would scope it to the target - confirm vsort is declared before this point
+endif()
+
 target_include_directories(vsort PUBLIC
     "${PROJECT_BINARY_DIR}"
 )