From d9e411125622b6bc00de5ae10fa1a32fbe5d5282 Mon Sep 17 00:00:00 2001
From: WolframRhodium
Date: Sun, 28 May 2023 22:47:56 +0800
Subject: [PATCH] Add support for `ORT_DML` backend for dx12 devices

---
 .github/workflows/windows-ort.yml     | 22 ++++++----
 .github/workflows/windows-release.yml | 40 +++++++++---------
 scripts/vsmlrt.py                     | 46 +++++++++++++++++++--
 vsort/CMakeLists.txt                  |  5 +++
 vsort/vs_onnxruntime.cpp              | 58 ++++++++++++++++++++++-----
 vsort/win32.cpp                       |  1 +
 6 files changed, 133 insertions(+), 39 deletions(-)

diff --git a/.github/workflows/windows-ort.yml b/.github/workflows/windows-ort.yml
index 0a35e3e..8261e5d 100644
--- a/.github/workflows/windows-ort.yml
+++ b/.github/workflows/windows-ort.yml
@@ -44,22 +44,22 @@ jobs:
         uses: actions/cache@v3
         with:
           path: vsort/protobuf/install
-          key: ${{ runner.os }}-vsort-protobuf-v3
+          key: ${{ runner.os }}-vsort-protobuf-v4

       - name: Checkout protobuf
         uses: actions/checkout@v3
         if: steps.cache-protobuf.outputs.cache-hit != 'true'
         with:
           repository: protocolbuffers/protobuf
-          # follows protobuf in https://github.com/AmusementClub/onnxruntime/tree/master/cmake/external
+          # follows protobuf in https://github.com/AmusementClub/onnxruntime/blob/master/cmake/external/onnxruntime_external_deps.cmake#L161
           # if you change this, remember to bump the version of the cache key.
-          ref: a902b39270841beafc307dfa709610aa1cac2f06
+          ref: v3.21.12
           fetch-depth: 1
           path: vsort/protobuf

       - name: Configure protobuf
         if: steps.cache-protobuf.outputs.cache-hit != 'true'
-        run: cmake -S protobuf\cmake -B protobuf\build_rel -G Ninja -LA
+        run: cmake -S protobuf -B protobuf\build_rel -G Ninja -LA
           -D CMAKE_BUILD_TYPE=Release
           -D protobuf_BUILD_SHARED_LIBS=OFF -D protobuf_BUILD_TESTS=OFF

@@ -76,7 +76,7 @@ jobs:
         uses: actions/cache@v3
         with:
           path: vsort/onnx/install
-          key: ${{ runner.os }}-vsort-onnx-v3
+          key: ${{ runner.os }}-vsort-onnx-v4

       - name: Checkout onnx
         if: steps.cache-onnx.outputs.cache-hit != 'true'
         uses: actions/checkout@v3
         with:
           repository: onnx/onnx
           # follows onnx in https://github.com/AmusementClub/onnxruntime/tree/master/cmake/external
           # if you change this, remember to bump the version of the cache key.
-          ref: 5a5f8a5935762397aa68429b5493084ff970f774
+          ref: a0d77f18516d2da7468a96b0de3b737266f23176
           fetch-depth: 1
           path: vsort/onnx

@@ -116,7 +116,7 @@ jobs:

       - name: Download ONNX Runtime Precompilation
         run: |
-          curl -s -o ortgpu.zip -LJO https://github.com/AmusementClub/onnxruntime/releases/latest/download/onnxruntime-gpu-win64.zip
+          curl -s -o ortgpu.zip -LJO https://github.com/AmusementClub/onnxruntime/releases/download/orttraining_rc2-5943-g73584f936-230528-0922/onnxruntime-gpu-win64.zip
           unzip -q ortgpu.zip

       - name: Cache CUDA
@@ -143,6 +143,7 @@ jobs:
           -D ONNX_RUNTIME_LIB_DIRECTORY=onnxruntime-gpu\lib
           -D ENABLE_CUDA=1
           -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8"
+          -D ENABLE_DML=1
           -D CMAKE_CXX_STANDARD=20

       - name: Build
@@ -157,6 +158,13 @@ jobs:
           copy onnxruntime-gpu\bin\*.dll artifact\vsort\
           copy onnxruntime-gpu\lib\*.dll artifact\vsort\

+      - name: Download DirectML Library
+        # follows DirectML in https://github.com/AmusementClub/onnxruntime/blob/master/cmake/external/dml.cmake#L44
+        run: |
+          curl -s -o directml.nupkg -LJO https://www.nuget.org/api/v2/package/Microsoft.AI.DirectML/1.12.0
+          unzip -q directml.nupkg -d dml
+          copy dml\bin\x64-win\DirectML.dll artifact\vsort\
+
       - name: Upload
         uses: actions/upload-artifact@v3
         with:

diff --git a/.github/workflows/windows-release.yml b/.github/workflows/windows-release.yml
index 40b0843..64318d3 100644
--- a/.github/workflows/windows-release.yml
+++ b/.github/workflows/windows-release.yml
@@ -71,7 +71,7 @@ jobs:
       - name: Compress scirpts.7z
         run: |
           cd scripts
-          7za a -t7z -bb3 -mx=3 ../scripts.${{ github.event.inputs.tag }}.7z .
+          7za a -t7z -bb3 -mx=9 ../scripts.${{ github.event.inputs.tag }}.7z .

       - name: Upload scripts release
         uses: actions/upload-artifact@v3
@@ -113,7 +113,7 @@ jobs:
           popd
           ls -lR
           du -sh
-          7za a -t7z -bb3 -mx=3 ../models.7z .
+          7za a -t7z -bb3 -mx=9 ../models.7z .

       - name: Upload model release
         uses: actions/upload-artifact@v3
@@ -144,7 +144,7 @@ jobs:
           popd
           ls -lR
           du -sh
-          7za a -t7z -bb3 -mx=3 ../ext-models.7z .
+          7za a -t7z -bb3 -mx=9 ../ext-models.7z .

       - name: Upload external model release
         uses: actions/upload-artifact@v3
@@ -175,7 +175,7 @@ jobs:
           popd
           ls -lR
           du -sh
-          7za a -t7z -bb3 -mx=3 ../contrib-models.7z .
+          7za a -t7z -bb3 -mx=9 ../contrib-models.7z .

       - name: Upload contrib model release
         uses: actions/upload-artifact@v3
@@ -264,7 +264,7 @@ jobs:
           cp scripts-release/*.py release-cpu/
           cd release-cpu
           ls -lR
-          7za a -t7z -bb3 -mx=3 ../vsmlrt-windows-x64-cpu.7z .
+          7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-cpu.7z .

       - name: Upload CPU-only release
         uses: actions/upload-artifact@v3
@@ -286,36 +286,36 @@ jobs:
           generate_release_notes: false
           prerelease: true

-      - name: Build non-CUDA GPU release
+      - name: Build generic GPU release
         shell: bash
         run: |
-          mkdir release-vk
-          cp -r models-release/models release-vk/
-          cp -r vsov-release/* release-vk/
-          cp -r vsort-release/* release-vk/
-          rm -f release-vk/vsort/onnxruntime_providers_*.dll
-          cp -r vsncnn-release/* release-vk/
-          cp scripts-release/*.py release-vk/
-          cd release-vk
+          mkdir release-generic-gpu
+          cp -r models-release/models release-generic-gpu/
+          cp -r vsov-release/* release-generic-gpu/
+          cp -r vsort-release/* release-generic-gpu/
+          rm -f release-generic-gpu/vsort/onnxruntime_providers_*.dll
+          cp -r vsncnn-release/* release-generic-gpu/
+          cp scripts-release/*.py release-generic-gpu/
+          cd release-generic-gpu
           ls -lR
-          7za a -t7z -bb3 -mx=3 ../vsmlrt-windows-x64-vk.7z .
+          7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-generic-gpu.7z .

       - name: Upload non-CUDA GPU release
         uses: actions/upload-artifact@v3
         if: false
         with:
-          name: vsmlrt-vk-release
-          path: vsmlrt-windows-x64-vk.7z
+          name: vsmlrt-generic-gpu-release
+          path: vsmlrt-windows-x64-generic-gpu.7z
           retention-days: 1

       - name: Rename release asset for non-CUDA GPU release
-        run: mv vsmlrt-windows-x64-vk.7z vsmlrt-windows-x64-vk.${{ github.event.inputs.tag }}.7z
+        run: mv vsmlrt-windows-x64-generic-gpu.7z vsmlrt-windows-x64-generic-gpu.${{ github.event.inputs.tag }}.7z

       - name: Release non-CUDA GPU
         uses: softprops/action-gh-release@v1
         with:
           tag_name: ${{ github.event.inputs.tag }}
-          files: vsmlrt-windows-x64-vk.${{ github.event.inputs.tag }}.7z
+          files: vsmlrt-windows-x64-generic-gpu.${{ github.event.inputs.tag }}.7z
           fail_on_unmatched_files: true
           generate_release_notes: false
           prerelease: true
@@ -339,7 +339,7 @@ jobs:
           cp scripts-release/*.py release-cuda/
           cd release-cuda
           ls -lR
-          7za a -t7z -bb3 -mx=3 ../vsmlrt-windows-x64-cuda.7z .
+          7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-cuda.7z .

       - name: Upload CUDA release
         uses: actions/upload-artifact@v3

diff --git a/scripts/vsmlrt.py b/scripts/vsmlrt.py
index 72dc876..0814efb 100644
--- a/scripts/vsmlrt.py
+++ b/scripts/vsmlrt.py
@@ -1,4 +1,4 @@
-__version__ = "3.15.23"
+__version__ = "3.15.24"

 __all__ = [
     "Backend", "BackendV2",
@@ -12,7 +12,7 @@
 ]

 import copy
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 import enum
 import math
 import os
@@ -172,6 +172,19 @@ class NCNN_VK:
         # internal backend attributes
         supports_onnx_serialization: bool = True

+    @dataclass(frozen=False)
+    class ORT_DML:
+        """ backend for directml (d3d12) devices """
+
+        device_id: int = 0
+        num_streams: int = 1
+        verbosity: int = 2
+        fp16: bool = False
+        fp16_blacklist_ops: typing.Optional[typing.Sequence[str]] = None
+
+        # internal backend attributes
+        supports_onnx_serialization: bool = True
+

 backendT = typing.Union[
     Backend.OV_CPU,
@@ -179,7 +192,8 @@ class NCNN_VK:
     Backend.ORT_CUDA,
     Backend.TRT,
     Backend.OV_GPU,
-    Backend.NCNN_VK
+    Backend.NCNN_VK,
+    Backend.ORT_DML,
 ]

@@ -1399,6 +1413,18 @@ def _inference(
             path_is_serialization=path_is_serialization,
             fp16_blacklist_ops=backend.fp16_blacklist_ops
         )
+    elif isinstance(backend, Backend.ORT_DML):
+        clip = core.ort.Model(
+            clips, network_path,
+            overlap=overlap, tilesize=tilesize,
+            provider="DML", builtin=False,
+            device_id=backend.device_id,
+            num_streams=backend.num_streams,
+            verbosity=backend.verbosity,
+            fp16=backend.fp16,
+            path_is_serialization=path_is_serialization,
+            fp16_blacklist_ops=backend.fp16_blacklist_ops
+        )
     elif isinstance(backend, Backend.ORT_CUDA):
         clip = core.ort.Model(
             clips, network_path,
@@ -1701,6 +1727,20 @@ def OV_GPU(*,
             **kwargs
         )

+    @staticmethod
+    def ORT_DML(*,
+        device_id: int = 0,
+        num_streams: int = 1,
+        fp16: bool = False,
+        **kwargs
+    ) -> Backend.ORT_DML:
+        return Backend.ORT_DML(
+            device_id=device_id,
+            num_streams=num_streams,
+            fp16=fp16,
+            **kwargs
+        )
+

 def fmtc_resample(clip: vs.VideoNode, **kwargs) -> vs.VideoNode:
     clip_org = clip

diff --git a/vsort/CMakeLists.txt b/vsort/CMakeLists.txt
index 0e97ba1..85dcae3 100644
--- a/vsort/CMakeLists.txt
+++ b/vsort/CMakeLists.txt
@@ -7,6 +7,7 @@ set(ONNX_RUNTIME_API_DIRECTORY "" CACHE PATH "Path to ONNX API headers")
 set(ONNX_RUNTIME_LIB_DIRECTORY "" CACHE PATH "Path to ONNX Runtime libraries")
 set(ENABLE_CUDA OFF CACHE BOOL "Enable CUDA backend")
+set(ENABLE_DML OFF CACHE BOOL "Enable DirectML backend")

 find_package(protobuf REQUIRED CONFIG)
 find_package(ONNX REQUIRED CONFIG)
@@ -52,6 +53,10 @@ if (ENABLE_CUDA)
     endif()
 endif()

+if (ENABLE_DML)
+    add_compile_definitions(ENABLE_DML)
+endif()
+
 target_include_directories(vsort PUBLIC
     "${PROJECT_BINARY_DIR}"
 )

diff --git a/vsort/vs_onnxruntime.cpp b/vsort/vs_onnxruntime.cpp
index 6477fa0..a69e71b 100644
--- a/vsort/vs_onnxruntime.cpp
+++ b/vsort/vs_onnxruntime.cpp
@@ -24,12 +24,19 @@ using namespace std::chrono_literals;
 #include <onnx/common/version.h>
 #include <onnx/onnx_pb.h>

+#define NOMINMAX
+
 #include <onnxruntime_c_api.h>

 #ifdef ENABLE_CUDA
 #include <cuda_runtime.h>
 #endif // ENABLE_CUDA

+#ifdef ENABLE_DML
+// include/onnxruntime/core/providers/dml/dml_provider_factory.h
+#include <../providers/dml/dml_provider_factory.h>
+#endif // ENABLE_DML
+
 #include "config.h"

@@ -387,7 +394,8 @@ struct TicketSemaphore {
 enum class Backend {
     CPU = 0,
     CUDA = 1,
-    COREML = 2
+    COREML = 2,
+    DML = 3
 };

 #ifdef ENABLE_CUDA
@@ -404,6 +412,8 @@ struct Resource {
     OrtValue * input_tensor;
     OrtValue * output_tensor;
     OrtIoBinding * binding;
+    char * input_name;
+    char * output_name;

 #ifdef ENABLE_CUDA
     cudaStream_t stream;
@@ -637,11 +647,21 @@ static const VSFrameRef *VS_CC vsOrtGetFrame(
         checkError(ortapi->RunWithBinding(resource.session, nullptr, resource.binding));

         // onnxruntime replays the graph itself in CUDAExecutionProvider::OnRunEnd
-    } else {
-#else // ENABLE_CUDA
-    {
+    } else
 #endif // ENABLE_CUDA
+    if (d->backend == Backend::CPU || d->backend == Backend::CUDA) {
         checkError(ortapi->RunWithBinding(resource.session, nullptr, resource.binding));
+    } else {
+        checkError(ortapi->Run(
+            resource.session,
+            nullptr,
+            &resource.input_name,
+            &resource.input_tensor,
+            1,
+            &resource.output_name,
+            1,
+            &resource.output_tensor
+        ));
     }

 #ifdef ENABLE_CUDA
@@ -867,6 +887,10 @@ static void VS_CC vsOrtCreate(
     } else if (strcmp(provider, "COREML") == 0) {
         d->backend = Backend::COREML;
 #endif // ENABLE_COREML
+#ifdef ENABLE_DML
+    } else if (strcmp(provider, "DML") == 0) {
+        d->backend = Backend::DML;
+#endif // ENABLE_DML
     } else {
         return set_error("unknwon provider "s + provider);
     }
@@ -1071,6 +1095,13 @@ static void VS_CC vsOrtCreate(
         ));
     }
 #endif // ENABLE_COREML
+#ifdef ENABLE_DML
+    else if (d->backend == Backend::DML) {
+        const OrtDmlApi * ortdmlapi {};
+        checkError(ortapi->GetExecutionProviderApi("DML", ORT_API_VERSION, (const void **) &ortdmlapi));
+        checkError(ortdmlapi->SessionOptionsAppendExecutionProvider_DML(session_options, d->device_id));
+    }
+#endif // ENABLE_DML

     checkError(ortapi->CreateSessionFromArray(
         d->environment,
@@ -1158,18 +1189,17 @@ static void VS_CC vsOrtCreate(

         checkError(ortapi->CreateIoBinding(resource.session, &resource.binding));

-        char * input_name;
         checkError(ortapi->SessionGetInputName(
-            resource.session, 0, cpu_allocator, &input_name
+            resource.session, 0, cpu_allocator, &resource.input_name
         ));

         char * output_name;
         checkError(ortapi->SessionGetOutputName(
-            resource.session, 0, cpu_allocator, &output_name
+            resource.session, 0, cpu_allocator, &resource.output_name
         ));

-        checkError(ortapi->BindInput(resource.binding, input_name, resource.input_tensor));
-        checkError(ortapi->BindOutput(resource.binding, output_name, resource.output_tensor));
+        checkError(ortapi->BindInput(resource.binding, resource.input_name, resource.input_tensor));
+        checkError(ortapi->BindOutput(resource.binding, resource.output_name, resource.output_tensor));

         if (auto err = checkNodesAndNetwork(resource.session, in_vis); err.has_value()) {
             return set_error(err.value());
@@ -1247,6 +1277,16 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit(
     );

     vsapi->propSetData(out, "path", vsapi->getPluginPath(myself), -1, paReplace);
+
+#ifdef ENABLE_CUDA
+    vsapi->propSetData(out, "providers", "CUDA", -1, paAppend);
+#endif
+#ifdef ENABLE_COREML
+    vsapi->propSetData(out, "providers", "COREML", -1, paAppend);
+#endif
+#ifdef ENABLE_DML
+    vsapi->propSetData(out, "providers", "DML", -1, paAppend);
+#endif
     };

     registerFunc("Version", "", getVersion, nullptr, plugin);
 }

diff --git a/vsort/win32.cpp b/vsort/win32.cpp
index 73505c5..8741167 100644
--- a/vsort/win32.cpp
+++ b/vsort/win32.cpp
@@ -14,6 +14,7 @@ namespace {
 std::vector<const wchar_t *> dlls = {
     // This list must be sorted by dependency.
+    L"DirectML.dll",
     L"onnxruntime.dll", // must be the last
 };
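
Usage sketch (not part of the patch): once vsort is built with ENABLE_DML and the updated vsmlrt.py is installed, the new backend is selected like any other. The snippet below assumes vsmlrt's public `inference()` helper, a running VapourSynth environment, and a placeholder ONNX model path:

```python
import vapoursynth as vs
from vsmlrt import Backend, inference

core = vs.core

# Stand-in source clip; any RGBS clip of suitable size works as model input.
clip = core.std.BlankClip(format=vs.RGBS, width=1280, height=720)

# Route inference through the DirectML execution provider on adapter 0.
# fp16 mirrors the corresponding ORT_CUDA flag; fp16_blacklist_ops is
# likewise available on the Backend.ORT_DML dataclass added above.
output = inference(
    clip,
    network_path="model.onnx",  # placeholder path, substitute a real model
    backend=Backend.ORT_DML(device_id=0, num_streams=1, fp16=False),
)
```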
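The `providers` property appended in `VapourSynthPluginInit` makes the compiled-in execution providers discoverable at runtime. A minimal probe, assuming VapourSynth's Python bindings return the multi-key output of `ort.Version()` as a dict with bytes values (single-entry properties may come back unwrapped):

```python
import vapoursynth as vs

core = vs.core

# Version() sets several keys; "providers" holds the entries appended via
# paAppend in this patch ("CUDA", "COREML", "DML"), one per enabled backend.
info = core.ort.Version()
providers = info.get("providers", [])
if isinstance(providers, (bytes, str)):  # single entry arrives unwrapped
    providers = [providers]
providers = [p.decode() if isinstance(p, bytes) else p for p in providers]

if "DML" in providers:
    print("vsort was built with the DirectML execution provider")
```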