From 6a75b3ed364f78ad3c2b8242917b535a7d72d1a9 Mon Sep 17 00:00:00 2001 From: Jeff Bloomfield Date: Thu, 22 Jun 2023 18:53:20 -0700 Subject: [PATCH 1/5] Add option to use custom heaps for input and output tensors --- DxDispatch/src/dxdispatch/CommandLineArgs.cpp | 11 ++++ DxDispatch/src/dxdispatch/CommandLineArgs.h | 5 +- DxDispatch/src/dxdispatch/Device.cpp | 65 ++++++++++++++----- DxDispatch/src/dxdispatch/Device.h | 15 ++++- DxDispatch/src/dxdispatch/Executor.cpp | 9 ++- .../src/dxdispatch/OnnxDispatchable.cpp | 10 ++- 6 files changed, 94 insertions(+), 21 deletions(-) diff --git a/DxDispatch/src/dxdispatch/CommandLineArgs.cpp b/DxDispatch/src/dxdispatch/CommandLineArgs.cpp index 92f5a81b..b74feffa 100644 --- a/DxDispatch/src/dxdispatch/CommandLineArgs.cpp +++ b/DxDispatch/src/dxdispatch/CommandLineArgs.cpp @@ -114,6 +114,12 @@ CommandLineArgs::CommandLineArgs(int argc, char** argv) "Sets barrier types issued after every dispatch is recorded into a command list: none, uav, or uav+aliasing", cxxopts::value()->default_value("uav") ) + ( + "u, custom_heaps", + "Binds input and output resources are allocated from custom heaps. Write-combined caching and system memory (L0) are used when this is specified.", + cxxopts::value() + ) + // DxDispatch generates root signatures that are guaranteed to match HLSL source, which eliminates // having to write it inline in the HLSL file. 
DXC for Xbox precompiles shaders for Xbox (by default), // but precompilation requires the root signature to be in the HLSL source itself; to allow use of the @@ -271,6 +277,11 @@ CommandLineArgs::CommandLineArgs(int argc, char** argv) } } + if (result.count("custom_heaps")) + { + m_customHeaps = result["custom_heaps"].as(); + } + auto queueTypeStr = result["queue_type"].as(); if (queueTypeStr == "direct") { diff --git a/DxDispatch/src/dxdispatch/CommandLineArgs.h b/DxDispatch/src/dxdispatch/CommandLineArgs.h index 854403fe..9aaaf466 100644 --- a/DxDispatch/src/dxdispatch/CommandLineArgs.h +++ b/DxDispatch/src/dxdispatch/CommandLineArgs.h @@ -38,6 +38,8 @@ class CommandLineArgs bool GetUavBarrierAfterDispatch() const { return m_uavBarrierAfterDispatch; } bool GetAliasingBarrierAfterDispatch() const { return m_aliasingBarrierAfterDispatch; } + bool GetCustomHeaps() const { return m_customHeaps; } + // ONNX gsl::span> GetOnnxFreeDimensionNameOverrides() const { return m_onnxFreeDimensionNameOverrides; } gsl::span> GetOnnxFreeDimensionDenotationOverrides() const { return m_onnxFreeDimensionDenotationOverrides; } @@ -45,7 +47,7 @@ class CommandLineArgs const std::unordered_map>& GetOnnxBindingShapes() const { return m_onnxBindShapes; } std::optional GetOnnxGraphOptimizationLevel() const { return m_onnxGraphOptimizationLevel; } std::optional GetOnnxLoggingLevel() const { return m_onnxLoggingLevel; } - bool PrintVerboseOnnxBindingInfo() const { return m_onnxPrintVerboseBindingInfo; } + bool PrintVerboseOnnxBindingInfo() const { return m_onnxPrintVerboseBindingInfo; } private: bool m_showAdapters = false; @@ -58,6 +60,7 @@ class CommandLineArgs bool m_clearShaderCaches = false; bool m_uavBarrierAfterDispatch = true; bool m_aliasingBarrierAfterDispatch = false; + bool m_customHeaps = false; std::string m_adapterSubstring = ""; std::filesystem::path m_modelPath; std::string m_pixCaptureName = "dxdispatch"; diff --git a/DxDispatch/src/dxdispatch/Device.cpp 
b/DxDispatch/src/dxdispatch/Device.cpp index f99d16a1..7d280b51 100644 --- a/DxDispatch/src/dxdispatch/Device.cpp +++ b/DxDispatch/src/dxdispatch/Device.cpp @@ -163,6 +163,34 @@ ComPtr Device::CreateDefaultBuffer( return resource; } +ComPtr Device::CreateBuffer( + uint64_t sizeInBytes, + D3D12_CPU_PAGE_PROPERTY cpuPageProperty, + D3D12_MEMORY_POOL memoryPoolPreference, + D3D12_RESOURCE_FLAGS resourceFlags, + uint64_t alignment, + D3D12_HEAP_FLAGS heapFlags) +{ + if (cpuPageProperty == D3D12_CPU_PAGE_PROPERTY_UNKNOWN && memoryPoolPreference == D3D12_MEMORY_POOL_UNKNOWN) + { + return CreateDefaultBuffer(sizeInBytes, resourceFlags, alignment, heapFlags); + } + + auto resourceDesc = CD3DX12_RESOURCE_DESC::Buffer(sizeInBytes, resourceFlags, alignment); + auto heapProps = CD3DX12_HEAP_PROPERTIES(cpuPageProperty, memoryPoolPreference); + + ComPtr resource; + THROW_IF_FAILED(m_d3d->CreateCommittedResource( + &heapProps, + heapFlags, + &resourceDesc, + D3D12_RESOURCE_STATE_COMMON, + nullptr, + IID_GRAPHICS_PPV_ARGS(resource.ReleaseAndGetAddressOf()))); + + return resource; +} + ComPtr Device::CreateUploadBuffer( uint64_t sizeInBytes, D3D12_RESOURCE_FLAGS resourceFlags, @@ -247,23 +275,30 @@ void Device::RecordDispatch(const char* name, uint32_t threadGroupX, uint32_t th PIXEndEvent(m_commandList.Get()); } -Microsoft::WRL::ComPtr Device::Upload(uint64_t totalSize, gsl::span data, std::wstring_view name) +Microsoft::WRL::ComPtr Device::CreateBuffer( + uint64_t totalSize, + gsl::span data, + std::wstring_view name, + D3D12_CPU_PAGE_PROPERTY cpuPageProperty, + D3D12_MEMORY_POOL memoryPoolPreference +) { if (data.size() > totalSize) { throw std::invalid_argument("Attempting to upload more data than the size of the buffer"); } - auto defaultBuffer = CreateDefaultBuffer(totalSize); + auto buffer = CreateBuffer(totalSize, cpuPageProperty, memoryPoolPreference); + if (!name.empty()) { - defaultBuffer->SetName(name.data()); + buffer->SetName(name.data()); } if (data.empty()) { // No 
need to create an upload resource if the source data is empty. - return defaultBuffer; + return buffer; } auto uploadBuffer = CreateUploadBuffer(totalSize); @@ -279,20 +314,20 @@ Microsoft::WRL::ComPtr Device::Upload(uint64_t totalSize, gsl::s D3D12_RESOURCE_BARRIER barriers[] = { CD3DX12_RESOURCE_BARRIER::Transition( - defaultBuffer.Get(), + buffer.Get(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_DEST) }; m_commandList->ResourceBarrier(_countof(barriers), barriers); } - m_commandList->CopyResource(defaultBuffer.Get(), uploadBuffer.Get()); + m_commandList->CopyResource(buffer.Get(), uploadBuffer.Get()); { D3D12_RESOURCE_BARRIER barriers[] = { CD3DX12_RESOURCE_BARRIER::Transition( - defaultBuffer.Get(), + buffer.Get(), D3D12_RESOURCE_STATE_COPY_DEST, D3D12_RESOURCE_STATE_UNORDERED_ACCESS) }; @@ -301,32 +336,32 @@ Microsoft::WRL::ComPtr Device::Upload(uint64_t totalSize, gsl::s m_temporaryResources.push_back(std::move(uploadBuffer)); - return defaultBuffer; + return buffer; } -std::vector Device::Download(Microsoft::WRL::ComPtr defaultBuffer) +std::vector Device::Download(Microsoft::WRL::ComPtr buffer) { - auto readbackBuffer = CreateReadbackBuffer(defaultBuffer->GetDesc().Width); + auto readbackBuffer = CreateReadbackBuffer(buffer->GetDesc().Width); readbackBuffer->SetName(L"Device::Download"); { D3D12_RESOURCE_BARRIER barriers[] = { CD3DX12_RESOURCE_BARRIER::Transition( - defaultBuffer.Get(), + buffer.Get(), D3D12_RESOURCE_STATE_UNORDERED_ACCESS, D3D12_RESOURCE_STATE_COPY_SOURCE) }; m_commandList->ResourceBarrier(_countof(barriers), barriers); } - m_commandList->CopyResource(readbackBuffer.Get(), defaultBuffer.Get()); + m_commandList->CopyResource(readbackBuffer.Get(), buffer.Get()); { D3D12_RESOURCE_BARRIER barriers[] = { CD3DX12_RESOURCE_BARRIER::Transition( - defaultBuffer.Get(), + buffer.Get(), D3D12_RESOURCE_STATE_COPY_SOURCE, D3D12_RESOURCE_STATE_UNORDERED_ACCESS) }; @@ -335,9 +370,9 @@ std::vector 
Device::Download(Microsoft::WRL::ComPtr d ExecuteCommandListAndWait(); - std::vector outputBuffer(defaultBuffer->GetDesc().Width); + std::vector outputBuffer(buffer->GetDesc().Width); { - size_t dataSize = defaultBuffer->GetDesc().Width; + size_t dataSize = buffer->GetDesc().Width; CD3DX12_RANGE readRange(0, gsl::narrow(dataSize)); void* readbackBufferData = nullptr; THROW_IF_FAILED(readbackBuffer->Map(0, &readRange, &readbackBufferData)); diff --git a/DxDispatch/src/dxdispatch/Device.h b/DxDispatch/src/dxdispatch/Device.h index c8c588ef..276e2243 100644 --- a/DxDispatch/src/dxdispatch/Device.h +++ b/DxDispatch/src/dxdispatch/Device.h @@ -44,6 +44,14 @@ class Device uint64_t alignment = 0, D3D12_HEAP_FLAGS heapFlags = D3D12_HEAP_FLAG_NONE); + Microsoft::WRL::ComPtr CreateBuffer( + uint64_t sizeInBytes, + D3D12_CPU_PAGE_PROPERTY cpuPageProperty = D3D12_CPU_PAGE_PROPERTY_WRITE_COMBINE, + D3D12_MEMORY_POOL memoryPoolPreference = D3D12_MEMORY_POOL_L0, + D3D12_RESOURCE_FLAGS resourceFlags = D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS, + uint64_t alignment = 0, + D3D12_HEAP_FLAGS heapFlags = D3D12_HEAP_FLAG_NONE); + Microsoft::WRL::ComPtr CreateUploadBuffer( uint64_t sizeInBytes, D3D12_RESOURCE_FLAGS resourceFlags = D3D12_RESOURCE_FLAG_NONE, @@ -88,7 +96,12 @@ class Device m_temporaryResources.emplace_back(std::move(object)); } - Microsoft::WRL::ComPtr Upload(uint64_t totalSize, gsl::span data, std::wstring_view name = {}); + Microsoft::WRL::ComPtr CreateBuffer( + uint64_t totalSize, + gsl::span data, + std::wstring_view name, + D3D12_CPU_PAGE_PROPERTY cpuPageProperty = D3D12_CPU_PAGE_PROPERTY_WRITE_COMBINE, + D3D12_MEMORY_POOL memoryPoolPreference = D3D12_MEMORY_POOL_L0); std::vector Download(Microsoft::WRL::ComPtr); diff --git a/DxDispatch/src/dxdispatch/Executor.cpp b/DxDispatch/src/dxdispatch/Executor.cpp index 8e93fa30..22397b32 100644 --- a/DxDispatch/src/dxdispatch/Executor.cpp +++ b/DxDispatch/src/dxdispatch/Executor.cpp @@ -113,7 +113,14 @@ 
Executor::Executor(Model& model, std::shared_ptr device, const CommandLi assert(std::holds_alternative(desc.value)); auto& bufferDesc = std::get(desc.value); auto wName = std::wstring_convert>().from_bytes(desc.name); - m_resources[desc.name] = std::move(device->Upload(bufferDesc.sizeInBytes, bufferDesc.initialValues, wName)); + + m_resources[desc.name] = std::move(device->CreateBuffer( + bufferDesc.sizeInBytes, + bufferDesc.initialValues, + wName, + m_commandLineArgs.GetCustomHeaps() ? D3D12_CPU_PAGE_PROPERTY_WRITE_COMBINE : D3D12_CPU_PAGE_PROPERTY_UNKNOWN, + m_commandLineArgs.GetCustomHeaps() ? D3D12_MEMORY_POOL_L0 : D3D12_MEMORY_POOL_UNKNOWN + )); } } device->ExecuteCommandListAndWait(); diff --git a/DxDispatch/src/dxdispatch/OnnxDispatchable.cpp b/DxDispatch/src/dxdispatch/OnnxDispatchable.cpp index 22021bb3..3e15797a 100644 --- a/DxDispatch/src/dxdispatch/OnnxDispatchable.cpp +++ b/DxDispatch/src/dxdispatch/OnnxDispatchable.cpp @@ -401,12 +401,16 @@ void OnnxDispatchable::Bind(const Bindings& jsonBindings, uint32_t iteration) tensorShapeUint32.push_back(1); } - binding.resource = m_device->CreateDefaultBuffer(DMLCalcBufferTensorSize( + UINT64 resourceSize = DMLCalcBufferTensorSize( dataTypeInfo.dmlDataType, tensorShapeUint32.size(), tensorShapeUint32.data(), - nullptr - )); + nullptr); + + binding.resource = m_device->CreateBuffer( + resourceSize, + m_args.GetCustomHeaps() ? D3D12_CPU_PAGE_PROPERTY_WRITE_COMBINE : D3D12_CPU_PAGE_PROPERTY_UNKNOWN, + m_args.GetCustomHeaps() ? 
D3D12_MEMORY_POOL_L0 : D3D12_MEMORY_POOL_UNKNOWN); binding.ortValue = CreateTensorFromResource( m_ortDmlApi, From 3589b4a313b77e2a8902aba9f5a9455dd92b5d4d Mon Sep 17 00:00:00 2001 From: Jeff Bloomfield Date: Thu, 22 Jun 2023 18:58:32 -0700 Subject: [PATCH 2/5] Use feature level 1_0_core --- DxDispatch/src/dxdispatch/Device.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DxDispatch/src/dxdispatch/Device.cpp b/DxDispatch/src/dxdispatch/Device.cpp index 7d280b51..fe3aeee2 100644 --- a/DxDispatch/src/dxdispatch/Device.cpp +++ b/DxDispatch/src/dxdispatch/Device.cpp @@ -73,7 +73,7 @@ Device::Device( THROW_IF_FAILED(m_d3dModule->CreateDevice( adapter, - D3D_FEATURE_LEVEL_11_0, + D3D_FEATURE_LEVEL_1_0_CORE, IID_PPV_ARGS(&m_d3d))); if (debugLayersEnabled) From ce4065f32dfbff8fb21462b44990b4bc93c2d329 Mon Sep 17 00:00:00 2001 From: Jeff Bloomfield Date: Thu, 3 Aug 2023 09:07:05 -0700 Subject: [PATCH 3/5] Add option to disable GPU timing collection for ONNX models with "-x" --- DxDispatch/src/dxdispatch/CommandLineArgs.cpp | 10 +++ DxDispatch/src/dxdispatch/CommandLineArgs.h | 2 + DxDispatch/src/dxdispatch/Executor.cpp | 85 +++++++++++++------ .../src/dxdispatch/OnnxDispatchable.cpp | 18 ++-- 4 files changed, 82 insertions(+), 33 deletions(-) diff --git a/DxDispatch/src/dxdispatch/CommandLineArgs.cpp b/DxDispatch/src/dxdispatch/CommandLineArgs.cpp index b74feffa..a19d9a7a 100644 --- a/DxDispatch/src/dxdispatch/CommandLineArgs.cpp +++ b/DxDispatch/src/dxdispatch/CommandLineArgs.cpp @@ -181,6 +181,11 @@ CommandLineArgs::CommandLineArgs(int argc, char** argv) "Prints verbose ONNX model binding information.", cxxopts::value() ) + ( + "x,onnx_disable_gpu_timing", + "Disables collection of GPU timing and PIX events surrounding ONNX Runtime. 
This can decrease CPU time by preventing additional command list executions.", + cxxopts::value() + ) ; options.positional_help(""); @@ -418,5 +423,10 @@ CommandLineArgs::CommandLineArgs(int argc, char** argv) m_onnxPrintVerboseBindingInfo = result["print_onnx_bindings"].as(); } + if (result.count("onnx_disable_gpu_timing")) + { + m_onnxDisableGpuTiming = result["onnx_disable_gpu_timing"].as(); + } + m_helpText = options.help(); } \ No newline at end of file diff --git a/DxDispatch/src/dxdispatch/CommandLineArgs.h b/DxDispatch/src/dxdispatch/CommandLineArgs.h index 9aaaf466..6376903a 100644 --- a/DxDispatch/src/dxdispatch/CommandLineArgs.h +++ b/DxDispatch/src/dxdispatch/CommandLineArgs.h @@ -48,6 +48,7 @@ class CommandLineArgs std::optional GetOnnxGraphOptimizationLevel() const { return m_onnxGraphOptimizationLevel; } std::optional GetOnnxLoggingLevel() const { return m_onnxLoggingLevel; } bool PrintVerboseOnnxBindingInfo() const { return m_onnxPrintVerboseBindingInfo; } + bool DisableGpuTiming() const { return m_onnxDisableGpuTiming; } private: bool m_showAdapters = false; @@ -95,4 +96,5 @@ class CommandLineArgs std::optional m_onnxGraphOptimizationLevel; std::optional m_onnxLoggingLevel; bool m_onnxPrintVerboseBindingInfo = false; + bool m_onnxDisableGpuTiming = false; }; \ No newline at end of file diff --git a/DxDispatch/src/dxdispatch/Executor.cpp b/DxDispatch/src/dxdispatch/Executor.cpp index 22397b32..84310ff3 100644 --- a/DxDispatch/src/dxdispatch/Executor.cpp +++ b/DxDispatch/src/dxdispatch/Executor.cpp @@ -283,21 +283,37 @@ void Executor::operator()(const Model::DispatchCommand& command) // GPU timings are capped at a fixed size ring buffer. The first samples may have been // overwritten, in which case the warmup samples are dropped. 
- gpuTimings.rawSamples = m_device->ResolveTimingSamples(); - assert (cpuTimings.rawSamples.size() >= gpuTimings.rawSamples.size()); - uint32_t gpuSamplesOverwritten = cpuTimings.rawSamples.size() - gpuTimings.rawSamples.size(); - auto gpuStats = gpuTimings.ComputeStats(std::max(m_commandLineArgs.MaxWarmupSamples(), gpuSamplesOverwritten) - gpuSamplesOverwritten); + Timings::SampleStats gpuStats = {}; + uint32_t gpuSamplesOverwritten = 0; + if (!m_commandLineArgs.DisableGpuTiming()) + { + gpuTimings.rawSamples = m_device->ResolveTimingSamples(); + assert (cpuTimings.rawSamples.size() >= gpuTimings.rawSamples.size()); + gpuSamplesOverwritten = cpuTimings.rawSamples.size() - gpuTimings.rawSamples.size(); + gpuStats = gpuTimings.ComputeStats(std::max(m_commandLineArgs.MaxWarmupSamples(), gpuSamplesOverwritten) - gpuSamplesOverwritten); + } if (iterationsCompleted > 0) { if (m_commandLineArgs.GetTimingVerbosity() == TimingVerbosity::Basic) { - LogInfo(fmt::format("Dispatch '{}': {} iterations, {:.4f} ms median (CPU), {:.6f} ms median (GPU)", - command.dispatchableName, - iterationsCompleted, - cpuStats.hot.median, - gpuStats.hot.median - )); + if (!m_commandLineArgs.DisableGpuTiming()) + { + LogInfo(fmt::format("Dispatch '{}': {} iterations, {:.4f} ms median (CPU), {:.6f} ms median (GPU)", + command.dispatchableName, + iterationsCompleted, + cpuStats.hot.median, + gpuStats.hot.median + )); + } + else + { + LogInfo(fmt::format("Dispatch '{}': {} iterations, {:.4f} ms median (CPU)", + command.dispatchableName, + iterationsCompleted, + cpuStats.hot.median + )); + } } else { @@ -309,21 +325,27 @@ void Executor::operator()(const Model::DispatchCommand& command) cpuStats.cold.count, cpuStats.cold.average, cpuStats.cold.min, cpuStats.cold.median, cpuStats.cold.max )); - LogInfo(fmt::format("GPU Timings (Cold) : {} samples, {:.4f} ms average, {:.4f} ms min, {:.4f} ms median, {:.4f} ms max", - gpuStats.cold.count, gpuStats.cold.average, gpuStats.cold.min, gpuStats.cold.median, 
gpuStats.cold.max - )); + if (!m_commandLineArgs.DisableGpuTiming()) + { + LogInfo(fmt::format("GPU Timings (Cold) : {} samples, {:.4f} ms average, {:.4f} ms min, {:.4f} ms median, {:.4f} ms max", + gpuStats.cold.count, gpuStats.cold.average, gpuStats.cold.min, gpuStats.cold.median, gpuStats.cold.max + )); + } LogInfo(fmt::format("CPU Timings (Hot) : {} samples, {:.4f} ms average, {:.4f} ms min, {:.4f} ms median, {:.4f} ms max", cpuStats.hot.count, cpuStats.hot.average, cpuStats.hot.min, cpuStats.hot.median, cpuStats.hot.max )); - LogInfo(fmt::format("GPU Timings (Hot) : {} samples, {:.4f} ms average, {:.4f} ms min, {:.4f} ms median, {:.4f} ms max", - gpuStats.hot.count, gpuStats.hot.average, gpuStats.hot.min, gpuStats.hot.median, gpuStats.hot.max - )); - - if (gpuSamplesOverwritten > 0) + if (!m_commandLineArgs.DisableGpuTiming()) { - LogInfo(fmt::format("GPU samples buffer has {} samples overwritten.", gpuSamplesOverwritten)); + LogInfo(fmt::format("GPU Timings (Hot) : {} samples, {:.4f} ms average, {:.4f} ms min, {:.4f} ms median, {:.4f} ms max", + gpuStats.hot.count, gpuStats.hot.average, gpuStats.hot.min, gpuStats.hot.median, gpuStats.hot.max + )); + + if (gpuSamplesOverwritten > 0) + { + LogInfo(fmt::format("GPU samples buffer has {} samples overwritten.", gpuSamplesOverwritten)); + } } } @@ -333,18 +355,27 @@ void Executor::operator()(const Model::DispatchCommand& command) for (uint32_t i = 0; i < iterationsCompleted; ++i) { - if (i < gpuSamplesOverwritten) + if (!m_commandLineArgs.DisableGpuTiming()) { - // GPU samples are limited to a fixed size, so the initial iterations - // may not have timing information (overwritten timestamps). - LogInfo(fmt::format("iteration {}: {:.4f} ms (CPU)", - i, cpuTimings.rawSamples[i] - )); + if (i < gpuSamplesOverwritten) + { + // GPU samples are limited to a fixed size, so the initial iterations + // may not have timing information (overwritten timestamps). 
+ LogInfo(fmt::format("iteration {}: {:.4f} ms (CPU)", + i, cpuTimings.rawSamples[i] + )); + } + else + { + LogInfo(fmt::format("iteration {}: {:.4f} ms (CPU), {:.4f} ms (GPU)", + i, cpuTimings.rawSamples[i], gpuTimings.rawSamples[i - gpuSamplesOverwritten] + )); + } } else { - LogInfo(fmt::format("iteration {}: {:.4f} ms (CPU), {:.4f} ms (GPU)", - i, cpuTimings.rawSamples[i], gpuTimings.rawSamples[i - gpuSamplesOverwritten] + LogInfo(fmt::format("iteration {}: {:.4f} ms (CPU)", + i, cpuTimings.rawSamples[i] )); } } diff --git a/DxDispatch/src/dxdispatch/OnnxDispatchable.cpp b/DxDispatch/src/dxdispatch/OnnxDispatchable.cpp index 3e15797a..3c6e5c01 100644 --- a/DxDispatch/src/dxdispatch/OnnxDispatchable.cpp +++ b/DxDispatch/src/dxdispatch/OnnxDispatchable.cpp @@ -497,17 +497,23 @@ void OnnxDispatchable::Bind(const Bindings& jsonBindings, uint32_t iteration) void OnnxDispatchable::Dispatch(const Model::DispatchCommand& args, uint32_t iteration) { - PIXBeginEvent(m_device->GetCommandList(), PIX_COLOR(255, 255, 0), "ONNX: '%s'", args.dispatchableName.c_str()); - m_device->RecordTimestamp(); - m_device->ExecuteCommandList(); + if (!m_args.DisableGpuTiming()) + { + PIXBeginEvent(m_device->GetCommandList(), PIX_COLOR(255, 255, 0), "ONNX: '%s'", args.dispatchableName.c_str()); + m_device->RecordTimestamp(); + m_device->ExecuteCommandList(); + } Ort::RunOptions runOptions; for (uint32_t i = 0; i < m_args.DispatchRepeat(); i++) m_session->Run(runOptions, *m_ioBindings); - m_device->RecordTimestamp(); - PIXEndEvent(m_device->GetCommandList()); - m_device->ExecuteCommandList(); + if (!m_args.DisableGpuTiming()) + { + m_device->RecordTimestamp(); + PIXEndEvent(m_device->GetCommandList()); + m_device->ExecuteCommandList(); + } } void OnnxDispatchable::Wait() From b7a510fe9e435acf920eeee488f387d2b244e358 Mon Sep 17 00:00:00 2001 From: Jeff Bloomfield Date: Wed, 20 Sep 2023 16:17:51 -0700 Subject: [PATCH 4/5] Making Dispatch synchronous to address leak --- 
DxDispatch/src/dxdispatch/Device.cpp | 1 + DxDispatch/src/dxdispatch/Dispatchable.h | 1 - DxDispatch/src/dxdispatch/DmlDispatchable.cpp | 4 ---- DxDispatch/src/dxdispatch/DmlDispatchable.h | 1 - DxDispatch/src/dxdispatch/Executor.cpp | 1 - DxDispatch/src/dxdispatch/HlslDispatchable.cpp | 4 ---- DxDispatch/src/dxdispatch/HlslDispatchable.h | 1 - DxDispatch/src/dxdispatch/OnnxDispatchable.cpp | 17 +++++++++++------ DxDispatch/src/dxdispatch/OnnxDispatchable.h | 1 - 9 files changed, 12 insertions(+), 19 deletions(-) diff --git a/DxDispatch/src/dxdispatch/Device.cpp b/DxDispatch/src/dxdispatch/Device.cpp index fe3aeee2..0a20b92c 100644 --- a/DxDispatch/src/dxdispatch/Device.cpp +++ b/DxDispatch/src/dxdispatch/Device.cpp @@ -94,6 +94,7 @@ Device::Device( IID_GRAPHICS_PPV_ARGS(m_fence.ReleaseAndGetAddressOf()))); D3D12_COMMAND_QUEUE_DESC queueDesc = {}; + queueDesc.Flags = D3D12_COMMAND_QUEUE_FLAG_DISABLE_GPU_TIMEOUT; queueDesc.Type = commandListType; THROW_IF_FAILED(m_d3d->CreateCommandQueue( &queueDesc, diff --git a/DxDispatch/src/dxdispatch/Dispatchable.h b/DxDispatch/src/dxdispatch/Dispatchable.h index f4496a72..4b63f3b3 100644 --- a/DxDispatch/src/dxdispatch/Dispatchable.h +++ b/DxDispatch/src/dxdispatch/Dispatchable.h @@ -23,5 +23,4 @@ struct Dispatchable virtual void Initialize() = 0; virtual void Bind(const Bindings& bindings, uint32_t iteration) = 0; virtual void Dispatch(const Model::DispatchCommand& args, uint32_t iteration) = 0; - virtual void Wait() = 0; }; \ No newline at end of file diff --git a/DxDispatch/src/dxdispatch/DmlDispatchable.cpp b/DxDispatch/src/dxdispatch/DmlDispatchable.cpp index 16554c76..62293b18 100644 --- a/DxDispatch/src/dxdispatch/DmlDispatchable.cpp +++ b/DxDispatch/src/dxdispatch/DmlDispatchable.cpp @@ -244,9 +244,5 @@ void DmlDispatchable::Bind(const Bindings& bindings, uint32_t iteration) void DmlDispatchable::Dispatch(const Model::DispatchCommand& args, uint32_t iteration) { m_device->RecordDispatch(m_operatorCompiled.Get(), 
m_bindingTable.Get()); -} - -void DmlDispatchable::Wait() -{ m_device->ExecuteCommandListAndWait(); } \ No newline at end of file diff --git a/DxDispatch/src/dxdispatch/DmlDispatchable.h b/DxDispatch/src/dxdispatch/DmlDispatchable.h index 6ea39068..20b27162 100644 --- a/DxDispatch/src/dxdispatch/DmlDispatchable.h +++ b/DxDispatch/src/dxdispatch/DmlDispatchable.h @@ -12,7 +12,6 @@ class DmlDispatchable : public Dispatchable void Initialize() final; void Bind(const Bindings& bindings, uint32_t iteration) final; void Dispatch(const Model::DispatchCommand& args, uint32_t iteration) final; - void Wait() final; private: std::string m_name; diff --git a/DxDispatch/src/dxdispatch/Executor.cpp b/DxDispatch/src/dxdispatch/Executor.cpp index 84310ff3..e19c5df5 100644 --- a/DxDispatch/src/dxdispatch/Executor.cpp +++ b/DxDispatch/src/dxdispatch/Executor.cpp @@ -253,7 +253,6 @@ void Executor::operator()(const Model::DispatchCommand& command) // Dispatch dispatchTimer.Start(); dispatchable->Dispatch(command, iterationsCompleted); - dispatchable->Wait(); cpuTimings.rawSamples.push_back(dispatchTimer.End().DurationInMilliseconds() / m_commandLineArgs.DispatchRepeat()); // The dispatch interval defaults to 0 (dispatch as fast as possible). 
However, the user may increase it diff --git a/DxDispatch/src/dxdispatch/HlslDispatchable.cpp b/DxDispatch/src/dxdispatch/HlslDispatchable.cpp index 86275008..e68c26f2 100644 --- a/DxDispatch/src/dxdispatch/HlslDispatchable.cpp +++ b/DxDispatch/src/dxdispatch/HlslDispatchable.cpp @@ -468,9 +468,5 @@ void HlslDispatchable::Bind(const Bindings& bindings, uint32_t iteration) void HlslDispatchable::Dispatch(const Model::DispatchCommand& args, uint32_t iteration) { m_device->RecordDispatch(args.dispatchableName.c_str(), args.threadGroupCount[0], args.threadGroupCount[1], args.threadGroupCount[2]); -} - -void HlslDispatchable::Wait() -{ m_device->ExecuteCommandListAndWait(); } \ No newline at end of file diff --git a/DxDispatch/src/dxdispatch/HlslDispatchable.h b/DxDispatch/src/dxdispatch/HlslDispatchable.h index 532a8069..5d7ef22b 100644 --- a/DxDispatch/src/dxdispatch/HlslDispatchable.h +++ b/DxDispatch/src/dxdispatch/HlslDispatchable.h @@ -10,7 +10,6 @@ class HlslDispatchable : public Dispatchable void Initialize() final; void Bind(const Bindings& bindings, uint32_t iteration) final; void Dispatch(const Model::DispatchCommand& args, uint32_t iteration) final; - void Wait() final; enum class BufferViewType { diff --git a/DxDispatch/src/dxdispatch/OnnxDispatchable.cpp b/DxDispatch/src/dxdispatch/OnnxDispatchable.cpp index 3c6e5c01..ec1049a9 100644 --- a/DxDispatch/src/dxdispatch/OnnxDispatchable.cpp +++ b/DxDispatch/src/dxdispatch/OnnxDispatchable.cpp @@ -512,11 +512,16 @@ void OnnxDispatchable::Dispatch(const Model::DispatchCommand& args, uint32_t ite { m_device->RecordTimestamp(); PIXEndEvent(m_device->GetCommandList()); - m_device->ExecuteCommandList(); - } -} -void OnnxDispatchable::Wait() -{ - m_ioBindings->SynchronizeOutputs(); + // SynchronizeOutputs is not called after this because we've just synchronized on work added later to the same queue. + // It is not called before this because it would prevent parallelization of launching the command list. 
+ // If that were addressed by calling ExecuteCommandList before SynchronizeOutputs, then the command allocator would + // not be reset and lead to a leak. The command allocator cannot be reset while a new command list is recording with it, + // and a new command list is started with the allocator during ExecuteCommandList. + m_device->ExecuteCommandListAndWait(); + } + else + { + m_ioBindings->SynchronizeOutputs(); + } } \ No newline at end of file diff --git a/DxDispatch/src/dxdispatch/OnnxDispatchable.h b/DxDispatch/src/dxdispatch/OnnxDispatchable.h index cfe3c83f..3a26c2e6 100644 --- a/DxDispatch/src/dxdispatch/OnnxDispatchable.h +++ b/DxDispatch/src/dxdispatch/OnnxDispatchable.h @@ -15,7 +15,6 @@ class OnnxDispatchable : public Dispatchable void Initialize() final; void Bind(const Bindings& bindings, uint32_t iteration) final; void Dispatch(const Model::DispatchCommand& args, uint32_t iteration) final; - void Wait() final; private: std::shared_ptr m_device; From c9637652914431195fd6e6d1f8dbc2dfa8d1274b Mon Sep 17 00:00:00 2001 From: Jeff Bloomfield Date: Mon, 25 Sep 2023 14:20:34 -0700 Subject: [PATCH 5/5] Enable option to call SetStablePowerState (-g, --set_stable_power_state) --- DxDispatch/src/dxdispatch/CommandLineArgs.cpp | 10 ++++++++++ DxDispatch/src/dxdispatch/CommandLineArgs.h | 2 ++ DxDispatch/src/dxdispatch/Executor.cpp | 17 +++++++++++++++++ 3 files changed, 29 insertions(+) diff --git a/DxDispatch/src/dxdispatch/CommandLineArgs.cpp b/DxDispatch/src/dxdispatch/CommandLineArgs.cpp index a19d9a7a..9897dc70 100644 --- a/DxDispatch/src/dxdispatch/CommandLineArgs.cpp +++ b/DxDispatch/src/dxdispatch/CommandLineArgs.cpp @@ -119,6 +119,11 @@ CommandLineArgs::CommandLineArgs(int argc, char** argv) "Binds input and output resources are allocated from custom heaps.
Write-combined caching and system memory (L0) are used when this is specified.", cxxopts::value() ) + ( + "g, set_stable_power_state", + "Causes a stable power state to be triggered while running the test, using ID3D12Device::SetStablePowerState. This requires developer mode be enabled.", + cxxopts::value() + ) // DxDispatch generates root signatures that are guaranteed to match HLSL source, which eliminates // having to write it inline in the HLSL file. DXC for Xbox precompiles shaders for Xbox (by default), @@ -287,6 +292,11 @@ CommandLineArgs::CommandLineArgs(int argc, char** argv) m_customHeaps = result["custom_heaps"].as(); } + if (result.count("set_stable_power_state")) + { + m_setStablePowerState = result["set_stable_power_state"].as(); + } + auto queueTypeStr = result["queue_type"].as(); if (queueTypeStr == "direct") { diff --git a/DxDispatch/src/dxdispatch/CommandLineArgs.h b/DxDispatch/src/dxdispatch/CommandLineArgs.h index 6376903a..8f6df3ad 100644 --- a/DxDispatch/src/dxdispatch/CommandLineArgs.h +++ b/DxDispatch/src/dxdispatch/CommandLineArgs.h @@ -39,6 +39,7 @@ class CommandLineArgs bool GetAliasingBarrierAfterDispatch() const { return m_aliasingBarrierAfterDispatch; } bool GetCustomHeaps() const { return m_customHeaps; } + bool GetSetStablePowerState() const { return m_setStablePowerState; } // ONNX gsl::span> GetOnnxFreeDimensionNameOverrides() const { return m_onnxFreeDimensionNameOverrides; } @@ -62,6 +63,7 @@ class CommandLineArgs bool m_uavBarrierAfterDispatch = true; bool m_aliasingBarrierAfterDispatch = false; bool m_customHeaps = false; + bool m_setStablePowerState = false; std::string m_adapterSubstring = ""; std::filesystem::path m_modelPath; std::string m_pixCaptureName = "dxdispatch"; diff --git a/DxDispatch/src/dxdispatch/Executor.cpp b/DxDispatch/src/dxdispatch/Executor.cpp index e19c5df5..70c5d71d 100644 --- a/DxDispatch/src/dxdispatch/Executor.cpp +++ b/DxDispatch/src/dxdispatch/Executor.cpp @@ -233,6 +233,11 @@ void 
Executor::operator()(const Model::DispatchCommand& command) { Timer loopTimer, iterationTimer, bindTimer, dispatchTimer; + if (m_commandLineArgs.GetSetStablePowerState()) + { + THROW_IF_FAILED(m_device->D3D()->SetStablePowerState(TRUE)); + } + for (; !timedOut && iterationsCompleted < m_commandLineArgs.DispatchIterations(); iterationsCompleted++) { iterationTimer.Start(); @@ -274,8 +279,20 @@ void Executor::operator()(const Model::DispatchCommand& command) catch (const std::exception& e) { LogError(fmt::format("Failed to execute dispatchable: {}", e.what())); + + if (m_commandLineArgs.GetSetStablePowerState()) + { + m_device->D3D()->SetStablePowerState(FALSE); + } + return; } + + if (m_commandLineArgs.GetSetStablePowerState()) + { + m_device->D3D()->SetStablePowerState(FALSE); + } + PIXEndEvent(); auto cpuStats = cpuTimings.ComputeStats(m_commandLineArgs.MaxWarmupSamples());