diff --git a/.github/workflows/windows-ort.yml b/.github/workflows/windows-ort.yml
index 0a35e3e..8261e5d 100644
--- a/.github/workflows/windows-ort.yml
+++ b/.github/workflows/windows-ort.yml
@@ -44,22 +44,22 @@ jobs:
       uses: actions/cache@v3
       with:
         path: vsort/protobuf/install
-        key: ${{ runner.os }}-vsort-protobuf-v3
+        key: ${{ runner.os }}-vsort-protobuf-v4
 
     - name: Checkout protobuf
       uses: actions/checkout@v3
       if: steps.cache-protobuf.outputs.cache-hit != 'true'
       with:
         repository: protocolbuffers/protobuf
-        # follows protobuf in https://github.com/AmusementClub/onnxruntime/tree/master/cmake/external
+        # follows protobuf in https://github.com/AmusementClub/onnxruntime/blob/master/cmake/external/onnxruntime_external_deps.cmake#L161
         # if you change this, remember to bump the version of the cache key.
-        ref: a902b39270841beafc307dfa709610aa1cac2f06
+        ref: v3.21.12
         fetch-depth: 1
         path: vsort/protobuf
 
     - name: Configure protobuf
       if: steps.cache-protobuf.outputs.cache-hit != 'true'
-      run: cmake -S protobuf\cmake -B protobuf\build_rel -G Ninja -LA
+      run: cmake -S protobuf -B protobuf\build_rel -G Ninja -LA
         -D CMAKE_BUILD_TYPE=Release
         -D protobuf_BUILD_SHARED_LIBS=OFF  -D protobuf_BUILD_TESTS=OFF
 
@@ -76,7 +76,7 @@ jobs:
       uses: actions/cache@v3
       with:
         path: vsort/onnx/install
-        key: ${{ runner.os }}-vsort-onnx-v3
+        key: ${{ runner.os }}-vsort-onnx-v4
 
     - name: Checkout onnx
       if: steps.cache-onnx.outputs.cache-hit != 'true'
@@ -85,7 +85,7 @@ jobs:
         repository: onnx/onnx
         # follows onnx in https://github.com/AmusementClub/onnxruntime/tree/master/cmake/external
         # if you change this, remember to bump the version of the cache key.
-        ref: 5a5f8a5935762397aa68429b5493084ff970f774
+        ref: a0d77f18516d2da7468a96b0de3b737266f23176
         fetch-depth: 1
         path: vsort/onnx
 
@@ -116,7 +116,7 @@ jobs:
 
     - name: Download ONNX Runtime Precompilation
       run: |
-        curl -s -o ortgpu.zip -LJO https://github.com/AmusementClub/onnxruntime/releases/latest/download/onnxruntime-gpu-win64.zip
+        curl -fsSL -o ortgpu.zip https://github.com/AmusementClub/onnxruntime/releases/download/orttraining_rc2-5943-g73584f936-230528-0922/onnxruntime-gpu-win64.zip
         unzip -q ortgpu.zip
 
     - name: Cache CUDA
@@ -143,6 +143,7 @@ jobs:
         -D ONNX_RUNTIME_LIB_DIRECTORY=onnxruntime-gpu\lib
         -D ENABLE_CUDA=1
         -D CUDAToolkit_ROOT="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8"
+        -D ENABLE_DML=1
         -D CMAKE_CXX_STANDARD=20
 
     - name: Build
@@ -157,6 +158,13 @@ jobs:
         copy onnxruntime-gpu\bin\*.dll artifact\vsort\
         copy onnxruntime-gpu\lib\*.dll artifact\vsort\
 
+    - name: Download DirectML Library
+      # the pinned version (1.12.0) follows DirectML in https://github.com/AmusementClub/onnxruntime/blob/master/cmake/external/dml.cmake#L44
+      run: |
+        curl -fsSL -o directml.nupkg https://www.nuget.org/api/v2/package/Microsoft.AI.DirectML/1.12.0
+        unzip -q directml.nupkg -d dml
+        copy dml\bin\x64-win\DirectML.dll artifact\vsort\
+
     - name: Upload
       uses: actions/upload-artifact@v3
       with:
diff --git a/.github/workflows/windows-release.yml b/.github/workflows/windows-release.yml
index 40b0843..64318d3 100644
--- a/.github/workflows/windows-release.yml
+++ b/.github/workflows/windows-release.yml
@@ -71,7 +71,7 @@ jobs:
     - name: Compress scirpts.7z
       run: |
         cd scripts
-        7za a -t7z -bb3 -mx=3 ../scripts.${{ github.event.inputs.tag }}.7z .
+        7za a -t7z -bb3 -mx=9 ../scripts.${{ github.event.inputs.tag }}.7z .
 
     - name: Upload scripts release
       uses: actions/upload-artifact@v3
@@ -113,7 +113,7 @@ jobs:
         popd
         ls -lR
         du -sh
-        7za a -t7z -bb3 -mx=3 ../models.7z .
+        7za a -t7z -bb3 -mx=9 ../models.7z .
 
     - name: Upload model release
       uses: actions/upload-artifact@v3
@@ -144,7 +144,7 @@ jobs:
         popd
         ls -lR
         du -sh
-        7za a -t7z -bb3 -mx=3 ../ext-models.7z .
+        7za a -t7z -bb3 -mx=9 ../ext-models.7z .
 
     - name: Upload external model release
       uses: actions/upload-artifact@v3
@@ -175,7 +175,7 @@ jobs:
         popd
         ls -lR
         du -sh
-        7za a -t7z -bb3 -mx=3 ../contrib-models.7z .
+        7za a -t7z -bb3 -mx=9 ../contrib-models.7z .
 
     - name: Upload contrib model release
       uses: actions/upload-artifact@v3
@@ -264,7 +264,7 @@ jobs:
         cp scripts-release/*.py release-cpu/
         cd release-cpu
         ls -lR
-        7za a -t7z -bb3 -mx=3 ../vsmlrt-windows-x64-cpu.7z .
+        7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-cpu.7z .
 
     - name: Upload CPU-only release
       uses: actions/upload-artifact@v3
@@ -286,36 +286,36 @@ jobs:
         generate_release_notes: false
         prerelease: true
 
-    - name: Build non-CUDA GPU release
+    - name: Build generic GPU release
       shell: bash
       run: |
-        mkdir release-vk
-        cp -r models-release/models release-vk/
-        cp -r vsov-release/* release-vk/
-        cp -r vsort-release/* release-vk/
-        rm -f release-vk/vsort/onnxruntime_providers_*.dll
-        cp -r vsncnn-release/* release-vk/
-        cp scripts-release/*.py release-vk/
-        cd release-vk
+        mkdir release-generic-gpu
+        cp -r models-release/models release-generic-gpu/
+        cp -r vsov-release/* release-generic-gpu/
+        cp -r vsort-release/* release-generic-gpu/
+        rm -f release-generic-gpu/vsort/onnxruntime_providers_*.dll
+        cp -r vsncnn-release/* release-generic-gpu/
+        cp scripts-release/*.py release-generic-gpu/
+        cd release-generic-gpu
         ls -lR
-        7za a -t7z -bb3 -mx=3 ../vsmlrt-windows-x64-vk.7z .
+        7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-generic-gpu.7z .
 
     - name: Upload non-CUDA GPU release
       uses: actions/upload-artifact@v3
       if: false
       with:
-        name: vsmlrt-vk-release
-        path: vsmlrt-windows-x64-vk.7z
+        name: vsmlrt-generic-gpu-release
+        path: vsmlrt-windows-x64-generic-gpu.7z
         retention-days: 1
 
     - name: Rename release asset for non-CUDA GPU release
-      run: mv vsmlrt-windows-x64-vk.7z vsmlrt-windows-x64-vk.${{ github.event.inputs.tag }}.7z
+      run: mv vsmlrt-windows-x64-generic-gpu.7z vsmlrt-windows-x64-generic-gpu.${{ github.event.inputs.tag }}.7z
 
     - name: Release non-CUDA GPU
       uses: softprops/action-gh-release@v1
       with:
         tag_name: ${{ github.event.inputs.tag }}
-        files: vsmlrt-windows-x64-vk.${{ github.event.inputs.tag }}.7z
+        files: vsmlrt-windows-x64-generic-gpu.${{ github.event.inputs.tag }}.7z
         fail_on_unmatched_files: true
         generate_release_notes: false
         prerelease: true
@@ -339,7 +339,7 @@ jobs:
         cp scripts-release/*.py release-cuda/
         cd release-cuda
         ls -lR
-        7za a -t7z -bb3 -mx=3 ../vsmlrt-windows-x64-cuda.7z .
+        7za a -t7z -bb3 -mx=9 ../vsmlrt-windows-x64-cuda.7z .
 
     - name: Upload CUDA release
       uses: actions/upload-artifact@v3
diff --git a/scripts/vsmlrt.py b/scripts/vsmlrt.py
index 72dc876..0814efb 100644
--- a/scripts/vsmlrt.py
+++ b/scripts/vsmlrt.py
@@ -1,4 +1,4 @@
-__version__ = "3.15.23"
+__version__ = "3.15.24"
 
 __all__ = [
     "Backend", "BackendV2",
@@ -12,7 +12,7 @@
 ]
 
 import copy
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 import enum
 import math
 import os
@@ -172,6 +172,19 @@ class NCNN_VK:
         # internal backend attributes
         supports_onnx_serialization: bool = True
 
+    @dataclass(frozen=False)
+    class ORT_DML:
+        """ backend for DirectML (Direct3D 12) devices; _inference() runs it through core.ort.Model with provider="DML" """
+
+        device_id: int = 0  # forwarded as device_id to core.ort.Model
+        num_streams: int = 1  # forwarded as num_streams to core.ort.Model
+        verbosity: int = 2  # forwarded as verbosity to core.ort.Model
+        fp16: bool = False  # forwarded as fp16 to core.ort.Model
+        fp16_blacklist_ops: typing.Optional[typing.Sequence[str]] = None  # forwarded unchanged; None is the default
+
+        # internal backend attributes
+        supports_onnx_serialization: bool = True
+
 
 backendT = typing.Union[
     Backend.OV_CPU,
@@ -179,7 +192,8 @@ class NCNN_VK:
     Backend.ORT_CUDA,
     Backend.TRT,
     Backend.OV_GPU,
-    Backend.NCNN_VK
+    Backend.NCNN_VK,
+    Backend.ORT_DML,
 ]
 
 
@@ -1399,6 +1413,18 @@ def _inference(
             path_is_serialization=path_is_serialization,
             fp16_blacklist_ops=backend.fp16_blacklist_ops
         )
+    elif isinstance(backend, Backend.ORT_DML):
+        clip = core.ort.Model(
+            clips, network_path,
+            overlap=overlap, tilesize=tilesize,
+            provider="DML", builtin=False,
+            device_id=backend.device_id,
+            num_streams=backend.num_streams,
+            verbosity=backend.verbosity,
+            fp16=backend.fp16,
+            path_is_serialization=path_is_serialization,
+            fp16_blacklist_ops=backend.fp16_blacklist_ops
+        )
     elif isinstance(backend, Backend.ORT_CUDA):
         clip = core.ort.Model(
             clips, network_path,
@@ -1701,6 +1727,20 @@ def OV_GPU(*,
             **kwargs
         )
 
+    @staticmethod
+    def ORT_DML(*,  # keyword-only factory returning a Backend.ORT_DML instance
+        device_id: int = 0,
+        num_streams: int = 1,
+        fp16: bool = False,
+        **kwargs  # any other Backend.ORT_DML field, e.g. verbosity or fp16_blacklist_ops
+    ) -> Backend.ORT_DML:
+        return Backend.ORT_DML(
+            device_id=device_id,
+            num_streams=num_streams,
+            fp16=fp16,
+            **kwargs
+        )
+
 
 def fmtc_resample(clip: vs.VideoNode, **kwargs) -> vs.VideoNode:
     clip_org = clip
diff --git a/vsort/CMakeLists.txt b/vsort/CMakeLists.txt
index 0e97ba1..85dcae3 100644
--- a/vsort/CMakeLists.txt
+++ b/vsort/CMakeLists.txt
@@ -7,6 +7,7 @@ set(ONNX_RUNTIME_API_DIRECTORY "" CACHE PATH "Path to ONNX API headers")
 set(ONNX_RUNTIME_LIB_DIRECTORY "" CACHE PATH "Path to ONNX Runtime libraries")
 
 set(ENABLE_CUDA OFF CACHE BOOL "Enable CUDA backend")
+set(ENABLE_DML OFF CACHE BOOL "Enable DirectML backend")
 
 find_package(protobuf REQUIRED CONFIG)
 find_package(ONNX REQUIRED CONFIG)
@@ -52,6 +53,10 @@ if (ENABLE_CUDA)
     endif()
 endif()
 
+if (ENABLE_DML)
+    add_compile_definitions(ENABLE_DML)  # turns on the "#ifdef ENABLE_DML" sections of vs_onnxruntime.cpp
+endif()
+
 target_include_directories(vsort PUBLIC
     "${PROJECT_BINARY_DIR}"
 )
diff --git a/vsort/vs_onnxruntime.cpp b/vsort/vs_onnxruntime.cpp
index 6477fa0..a69e71b 100644
--- a/vsort/vs_onnxruntime.cpp
+++ b/vsort/vs_onnxruntime.cpp
@@ -24,12 +24,19 @@ using namespace std::chrono_literals;
 #include <onnx/common/version.h>
 #include <onnx/onnx_pb.h>
 
+#define NOMINMAX
+
 #include <onnxruntime_c_api.h>
 
 #ifdef ENABLE_CUDA
 #include <cuda_runtime.h>
 #endif // ENABLE_CUDA
 
+#ifdef ENABLE_DML
+// include/onnxruntime/core/providers/dml/dml_provider_factory.h
+#include <../providers/dml/dml_provider_factory.h>
+#endif // ENABLE_DML
+
 #include "config.h"
 
 
@@ -387,7 +394,8 @@ struct TicketSemaphore {
 enum class Backend {
     CPU = 0,
     CUDA = 1,
-    COREML = 2
+    COREML = 2,
+    DML = 3 // DirectML; chosen by provider "DML", which is only accepted in builds with ENABLE_DML
 };
 
 #ifdef ENABLE_CUDA
@@ -404,6 +412,8 @@ struct Resource {
     OrtValue * input_tensor;
     OrtValue * output_tensor;
     OrtIoBinding * binding;
+    char * input_name;
+    char * output_name;
 
 #ifdef ENABLE_CUDA
     cudaStream_t stream;
@@ -637,11 +647,21 @@ static const VSFrameRef *VS_CC vsOrtGetFrame(
                     checkError(ortapi->RunWithBinding(resource.session, nullptr, resource.binding));
 
                     // onnxruntime replays the graph itself in CUDAExecutionProvider::OnRunEnd
-                } else {
-#else // ENABLE_CUDA
-                {
+                } else
 #endif // ENABLE_CUDA
+                if (d->backend == Backend::CPU || d->backend == Backend::CUDA) {
                     checkError(ortapi->RunWithBinding(resource.session, nullptr, resource.binding));
+                } else {
+                    checkError(ortapi->Run(
+                        resource.session,
+                        nullptr,
+                        &resource.input_name,
+                        &resource.input_tensor,
+                        1,
+                        &resource.output_name,
+                        1,
+                        &resource.output_tensor
+                    ));
                 }
 
 #ifdef ENABLE_CUDA
@@ -867,6 +887,10 @@ static void VS_CC vsOrtCreate(
     } else if (strcmp(provider, "COREML") == 0) {
         d->backend = Backend::COREML;
 #endif // ENABLE_COREML
+#ifdef ENABLE_DML
+    } else if (strcmp(provider, "DML") == 0) {
+        d->backend = Backend::DML;
+#endif // ENABLE_DML
     } else {
         return set_error("unknwon provider "s + provider);
     }
@@ -1071,6 +1095,13 @@ static void VS_CC vsOrtCreate(
             ));
         }
 #endif // ENABLE_COREML
+#ifdef ENABLE_DML
+        else if (d->backend == Backend::DML) {
+            const OrtDmlApi * ortdmlapi {};
+            checkError(ortapi->GetExecutionProviderApi("DML", ORT_API_VERSION, (const void **) &ortdmlapi));
+            checkError(ortdmlapi->SessionOptionsAppendExecutionProvider_DML(session_options, d->device_id));
+        }
+#endif // ENABLE_DML
 
         checkError(ortapi->CreateSessionFromArray(
             d->environment,
@@ -1158,18 +1189,17 @@ static void VS_CC vsOrtCreate(
 
         checkError(ortapi->CreateIoBinding(resource.session, &resource.binding));
 
-        char * input_name;
         checkError(ortapi->SessionGetInputName(
-            resource.session, 0, cpu_allocator, &input_name
+            resource.session, 0, cpu_allocator, &resource.input_name
         ));
 
-        char * output_name;
+        // like input_name, the output name is stored in the resource so the ortapi->Run() inference path can pass it
         checkError(ortapi->SessionGetOutputName(
-            resource.session, 0, cpu_allocator, &output_name
+            resource.session, 0, cpu_allocator, &resource.output_name
         ));
 
-        checkError(ortapi->BindInput(resource.binding, input_name, resource.input_tensor));
-        checkError(ortapi->BindOutput(resource.binding, output_name, resource.output_tensor));
+        checkError(ortapi->BindInput(resource.binding, resource.input_name, resource.input_tensor));
+        checkError(ortapi->BindOutput(resource.binding, resource.output_name, resource.output_tensor));
 
         if (auto err = checkNodesAndNetwork(resource.session, in_vis); err.has_value()) {
             return set_error(err.value());
@@ -1247,6 +1277,16 @@ VS_EXTERNAL_API(void) VapourSynthPluginInit(
         );
 
         vsapi->propSetData(out, "path", vsapi->getPluginPath(myself), -1, paReplace);
+
+#ifdef ENABLE_CUDA
+        vsapi->propSetData(out, "providers", "CUDA", -1, paAppend);
+#endif
+#ifdef ENABLE_COREML
+        vsapi->propSetData(out, "providers", "COREML", -1, paAppend);
+#endif
+#ifdef ENABLE_DML
+        vsapi->propSetData(out, "providers", "DML", -1, paAppend);
+#endif
     };
     registerFunc("Version", "", getVersion, nullptr, plugin);
 }
diff --git a/vsort/win32.cpp b/vsort/win32.cpp
index 73505c5..8741167 100644
--- a/vsort/win32.cpp
+++ b/vsort/win32.cpp
@@ -14,6 +14,7 @@
 namespace {
 std::vector<std::wstring> dlls = {
     // This list must be sorted by dependency.
+    L"DirectML.dll", // NOTE(review): presumably needed by onnxruntime.dll's DML provider, hence listed first -- confirm
     L"onnxruntime.dll", // must be the last
 };