From 6289adc2a79b176db20f3eefd5d0f3316ed71f4a Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 15 Aug 2024 17:26:18 +0200 Subject: [PATCH 01/21] Adding tasks for onnx runtime gpu inference on AMD GPU's --- Detectors/TPC/workflow/CMakeLists.txt | 4 + .../workflow/test/test_onnx_gpu_inference.cxx | 323 ++++++++++++++++++ 2 files changed, 327 insertions(+) create mode 100644 Detectors/TPC/workflow/test/test_onnx_gpu_inference.cxx diff --git a/Detectors/TPC/workflow/CMakeLists.txt b/Detectors/TPC/workflow/CMakeLists.txt index 3b05e5067108c..592fc1ef25d40 100644 --- a/Detectors/TPC/workflow/CMakeLists.txt +++ b/Detectors/TPC/workflow/CMakeLists.txt @@ -81,6 +81,10 @@ if(OpenMP_CXX_FOUND) target_link_libraries(${mergertargetName} PRIVATE OpenMP::OpenMP_CXX) endif() +o2_add_executable(onnx-gpu + COMPONENT_NAME test + SOURCES test/test_onnx_gpu_inference.cxx + PUBLIC_LINK_LIBRARIES O2::TPCWorkflow O2::SimulationDataFormat O2::TPCQC O2::DataFormatsTPC O2::TPCBase ONNXRuntime::ONNXRuntime Boost::thread O2::GPUTracking) o2_add_executable(reco-workflow COMPONENT_NAME tpc diff --git a/Detectors/TPC/workflow/test/test_onnx_gpu_inference.cxx b/Detectors/TPC/workflow/test/test_onnx_gpu_inference.cxx new file mode 100644 index 0000000000000..ee5d2d7fa15a0 --- /dev/null +++ b/Detectors/TPC/workflow/test/test_onnx_gpu_inference.cxx @@ -0,0 +1,323 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "Algorithm/RangeTokenizer.h" +#include "SimulationDataFormat/MCCompLabel.h" +#include "SimulationDataFormat/ConstMCTruthContainer.h" +#include "SimulationDataFormat/LabelContainer.h" +#include "SimulationDataFormat/IOMCTruthContainerView.h" +#include "SimulationDataFormat/MCTruthContainer.h" + +#include "Headers/DataHeader.h" + +#include "Steer/MCKinematicsReader.h" + +#include "DPLUtils/RootTreeReader.h" +#include "DPLUtils/MakeRootTreeWriterSpec.h" + +#include "DataFormatsTPC/WorkflowHelper.h" +#include "DataFormatsTPC/ClusterNativeHelper.h" +#include "DataFormatsTPC/ClusterNative.h" +#include "DataFormatsTPC/ClusterGroupAttribute.h" +#include "DataFormatsTPC/Constants.h" +#include "DataFormatsTPC/TrackTPC.h" +#include "DataFormatsGlobalTracking/TrackTuneParams.h" +#include "DataFormatsTPC/Defs.h" + +#include "TPCWorkflow/ProcessingHelpers.h" +#include "TPCQC/Clusters.h" +#include "TPCBase/Painter.h" +#include "TPCBase/CalDet.h" +#include "TPCBase/Mapper.h" + +#include "Framework/Logger.h" +#include "Framework/Task.h" +#include "Framework/DataProcessorSpec.h" +#include "Framework/ConfigParamRegistry.h" +#include "Framework/ControlService.h" +#include "Framework/CompletionPolicyHelpers.h" +#include "Framework/WorkflowSpec.h" +#include "Framework/CallbacksPolicy.h" + +#include "DetectorsRaw/HBFUtils.h" + +using namespace o2; +using namespace o2::tpc; +using namespace o2::framework; + +namespace o2 +{ +namespace tpc +{ +class onnxGPUinference : public Task +{ + public: + + onnxGPUinference(std::unordered_map options_map) { + model_path = options_map["path"]; + device = options_map["device"]; + dtype = options_map["dtype"]; + std::stringstream(options_map["device-id"]) >> device_id; + std::stringstream(options_map["num-iter"]) >> test_size_iter; + std::stringstream(options_map["execution-threads"]) >> execution_threads; + std::stringstream(options_map["threads-per-session-cpu"]) >> threads_per_session_cpu; + std::stringstream(options_map["num-tensors"]) >> test_num_tensors; + 
std::stringstream(options_map["size-tensor"]) >> test_size_tensor; + std::stringstream(options_map["measure-cycle"]) >> epochs_measure; + std::stringstream(options_map["logging-level"]) >> logging_level; + std::stringstream(options_map["enable-optimizations"]) >> enable_optimizations; + + LOG(info) << "Options loaded"; + + execution_threads = std::min((int)execution_threads, (int)boost::thread::hardware_concurrency()); + + // Set the environment variable to use ROCm execution provider + if(device=="GPU"){ + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(session_options, device_id)); + LOG(info) << "ROCM execution provider set"; + } else if(device=="CPU"){ + session_options.SetIntraOpNumThreads(threads_per_session_cpu); + if(threads_per_session_cpu > 0){ + LOG(info) << "CPU execution provider set with " << threads_per_session_cpu << " threads"; + } else { + threads_per_session_cpu = 0; + LOG(info) << "CPU execution provider set with default number of threads"; + } + if(threads_per_session_cpu > 1){ + session_options.SetExecutionMode(ExecutionMode::ORT_PARALLEL); + } + } else { + LOG(fatal) << "Device not recognized"; + } + // std::vector providers = session.GetProviders(); + // for (const auto& provider : providers) { + // LOG(info) << "Using execution provider: " << provider << std::endl; + // } + + if((int)enable_profiling){ + session_options.EnableProfiling((options_map["profiling-output-path"] + "/ORT_LOG_").c_str()); + } + if(enable_optimizations){ + session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED); + } + session_options.SetLogSeverityLevel(logging_level); + + env.resize(execution_threads); + session.resize(execution_threads); + for(int s = 0; s < execution_threads; s++){ + env[s] = Ort::Env(ORT_LOGGING_LEVEL_VERBOSE, "onnx_model_inference"); + session[s].reset(new Ort::Session{env[s], model_path.c_str(), session_options}); + } + LOG(info) << "Sessions created"; + + LOG(info) << "Number of iterations: " << test_size_iter << ", size of the test tensor: " << test_size_tensor << ", measuring every " << epochs_measure << " cycles, number of tensors: " << test_num_tensors << ", execution threads: " << execution_threads; + + for (size_t i = 0; i < session[0]->GetInputCount(); ++i) { + mInputNames.push_back(session[0]->GetInputNameAllocated(i, allocator).get()); + } + for (size_t i = 0; i < session[0]->GetInputCount(); ++i) { + mInputShapes.emplace_back(session[0]->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + for (size_t i = 0; i < session[0]->GetOutputCount(); ++i) { + mOutputNames.push_back(session[0]->GetOutputNameAllocated(i, allocator).get()); + } + for (size_t i = 0; i < session[0]->GetOutputCount(); ++i) { + mOutputShapes.emplace_back(session[0]->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + + LOG(info) << "Initializing ONNX names and sizes"; + inputNamesChar.resize(mInputNames.size(), nullptr); + std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), + [&](const std::string& str) { return str.c_str(); }); + outputNamesChar.resize(mOutputNames.size(), nullptr); + std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), + [&](const std::string& str) { return str.c_str(); }); + + // Print names + LOG(info) << "Input Nodes:"; + for (size_t i = 0; i < mInputNames.size(); i++) { + LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); + } + + LOG(info) << "Output Nodes:"; + for (size_t i = 0; i < 
mOutputNames.size(); i++) { + LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); + } + }; + + void runONNXGPUModel(std::vector>& input) { + std::vector threads(execution_threads); + for (int thrd = 0; thrd < execution_threads; thrd++) { + threads[thrd] = std::thread([&, thrd] { + auto outputTensors = session[thrd]->Run(runOptions, inputNamesChar.data(), input[thrd].data(), input[thrd].size(), outputNamesChar.data(), outputNamesChar.size()); + }); + } + for (auto& thread : threads) { + thread.join(); + } + }; + + void init(InitContext& ic) final {}; + void run(ProcessingContext& pc) final { + double time = 0; + + LOG(info) << "Preparing input data"; + // Prepare input data + std::vector inputShape{test_size_tensor, mInputShapes[0][1]}; + + LOG(info) << "Creating memory info"; + Ort::MemoryInfo mem_info("Cpu", OrtAllocatorType::OrtDeviceAllocator, device_id, OrtMemType::OrtMemTypeDefault); + + LOG(info) << "Creating ONNX tensor"; + std::vector> input_tensor(execution_threads); + if(dtype=="FP16"){ + std::vector input_data(mInputShapes[0][1] * test_size_tensor, (Ort::Float16_t)1.f); // Example input + for(int i = 0; i < execution_threads; i++){ + for(int j = 0; j < test_num_tensors; j++){ + input_tensor[i].emplace_back(Ort::Value::CreateTensor(mem_info, input_data.data(), input_data.size(), inputShape.data(), inputShape.size())); + } + } + } else { + std::vector input_data(mInputShapes[0][1] * test_size_tensor, 1.0f); // Example input + for(int i = 0; i < execution_threads; i++){ + for(int j = 0; j < test_num_tensors; j++){ + input_tensor[i].emplace_back(Ort::Value::CreateTensor(mem_info, input_data.data(), input_data.size(), inputShape.data(), inputShape.size())); + } + } + } + + LOG(info) << "Starting inference"; + for(int i = 0; i < test_size_iter; i++){ + auto start_network_eval = std::chrono::high_resolution_clock::now(); + runONNXGPUModel(input_tensor); + // std::vector output = model.inference(test); + auto end_network_eval = std::chrono::high_resolution_clock::now(); + time += std::chrono::duration>(end_network_eval - start_network_eval).count(); + if((i % epochs_measure == 0) && (i != 0)){ + time /= 1e9; + LOG(info) << "Total time: " << time << "s. 
Timing: " << uint64_t((double)test_size_tensor*epochs_measure*execution_threads*test_num_tensors/time) << " elements / s"; + time = 0; + } + } + + // for(auto out : output){ + // LOG(info) << "Test output: " << out; + // } + pc.services().get().endOfStream(); + pc.services().get().readyToQuit(QuitRequest::Me); + }; + + private: + + std::vector model_buffer; + std::string model_path, device, dtype; + int device_id, execution_threads, threads_per_session_cpu, enable_profiling, logging_level, enable_optimizations; + size_t test_size_iter, test_size_tensor, epochs_measure, test_num_tensors; + + Ort::RunOptions runOptions; + std::vector env; + std::vector> session; + Ort::SessionOptions session_options; + Ort::AllocatorWithDefaultOptions allocator; + + std::vector inputNamesChar, outputNamesChar; + std::vector mInputNames; + std::vector> mInputShapes; + std::vector mOutputNames; + std::vector> mOutputShapes; + + std::string printShape(const std::vector& v) + { + std::stringstream ss(""); + for (size_t i = 0; i < v.size() - 1; i++) + ss << v[i] << "x"; + ss << v[v.size() - 1]; + return ss.str(); + }; +}; +} +} + +void customize(std::vector& workflowOptions) +{ + std::vector options{ + {"path", VariantType::String, "./model.pt", {"Path to ONNX model"}}, + {"device", VariantType::String, "CPU", {"Device on which the ONNX model is run"}}, + {"device-id", VariantType::Int, 0, {"Device ID on which the ONNX model is run"}}, + {"dtype", VariantType::String, "-", {"Dtype in which the ONNX model is run (FP16 or FP32)"}}, + {"size-tensor", VariantType::Int, 100, {"Size of the input tensor"}}, + {"execution-threads", VariantType::Int, 1, {"If > 1 will run session->Run() with multiple threads as execution providers"}}, + {"threads-per-session-cpu", VariantType::Int, 0, {"Number of threads per session for CPU execution provider"}}, + {"num-tensors", VariantType::Int, 1, {"Number of tensors on which execution is being performed"}}, + {"num-iter", VariantType::Int, 100, {"Number of iterations"}}, + {"measure-cycle", VariantType::Int, 10, {"Epochs in which to measure"}}, + {"enable-profiling", VariantType::Int, 0, {"Enable profiling"}}, + {"profiling-output-path", VariantType::String, "/scratch/csonnabe/O2_new", {"Path to save profiling output"}}, + {"logging-level", VariantType::Int, 0, {"Logging level"}}, + {"enable-optimizations", VariantType::Int, 0, {"Enable optimizations"}} + }; + std::swap(workflowOptions, options); +} + +// --------------------------------- +#include "Framework/runDataProcessing.h" + +DataProcessorSpec testProcess(ConfigContext const& cfgc, std::vector& inputs, std::vector& outputs) +{ + + // A copy of the global workflow options from customize() to pass to the task + std::unordered_map options_map{ + {"path", cfgc.options().get("path")}, + {"device", cfgc.options().get("device")}, + {"device-id", std::to_string(cfgc.options().get("device-id"))}, + {"dtype", cfgc.options().get("dtype")}, + {"size-tensor", std::to_string(cfgc.options().get("size-tensor"))}, + {"execution-threads", std::to_string(cfgc.options().get("execution-threads"))}, + {"threads-per-session-cpu", std::to_string(cfgc.options().get("threads-per-session-cpu"))}, + {"num-tensors", std::to_string(cfgc.options().get("num-tensors"))}, + {"num-iter", std::to_string(cfgc.options().get("num-iter"))}, + {"measure-cycle", std::to_string(cfgc.options().get("measure-cycle"))}, + {"enable-profiling", std::to_string(cfgc.options().get("enable-profiling"))}, + {"profiling-output-path", cfgc.options().get("profiling-output-path")}, + 
{"logging-level", std::to_string(cfgc.options().get("logging-level"))}, + {"enable-optimizations", std::to_string(cfgc.options().get("enable-optimizations"))} + }; + + return DataProcessorSpec{ + "test-onnx-gpu", + inputs, + outputs, + adaptFromTask(options_map), + Options{ + {"somethingElse", VariantType::String, "-", {"Something else"}} + } + }; +} + +WorkflowSpec defineDataProcessing(ConfigContext const& cfgc) +{ + + WorkflowSpec specs; + + static std::vector inputs; + static std::vector outputs; + + specs.push_back(testProcess(cfgc, inputs, outputs)); + + return specs; +} \ No newline at end of file From e585588c155d2f37786061e6c9a2618bfc0678c2 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Wed, 28 Aug 2024 17:12:37 +0200 Subject: [PATCH 02/21] Working header file for ONNX model executions --- Common/CMakeLists.txt | 1 + Common/ML/CMakeLists.txt | 15 + Common/ML/include/ML/GPUORTFloat16.h | 875 ++++++++++++++++++ Common/ML/include/ML/ort_interface.h | 102 ++ Common/ML/src/ort_interface.cxx | 222 +++++ Detectors/TPC/workflow/CMakeLists.txt | 6 + .../workflow/test/test_onnx_gpu_inference.cxx | 486 +++++----- .../test/test_onnx_interface_headers.cxx | 227 +++++ 8 files changed, 1702 insertions(+), 232 deletions(-) create mode 100644 Common/ML/CMakeLists.txt create mode 100644 Common/ML/include/ML/GPUORTFloat16.h create mode 100644 Common/ML/include/ML/ort_interface.h create mode 100644 Common/ML/src/ort_interface.cxx create mode 100644 Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx diff --git a/Common/CMakeLists.txt b/Common/CMakeLists.txt index f435e269575aa..5419aaf1b1b92 100644 --- a/Common/CMakeLists.txt +++ b/Common/CMakeLists.txt @@ -14,6 +14,7 @@ add_subdirectory(MathUtils) add_subdirectory(Field) add_subdirectory(Types) add_subdirectory(Utils) +add_subdirectory(ML) add_subdirectory(SimConfig) add_subdirectory(DCAFitter) diff --git a/Common/ML/CMakeLists.txt b/Common/ML/CMakeLists.txt new file mode 100644 index 0000000000000..777f474e687fa --- /dev/null +++ b/Common/ML/CMakeLists.txt @@ -0,0 +1,15 @@ +# Copyright 2019-2020 CERN and copyright holders of ALICE O2. +# See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +# All rights not expressly granted are reserved. +# +# This software is distributed under the terms of the GNU General Public +# License v3 (GPL Version 3), copied verbatim in the file "COPYING". +# +# In applying this license CERN does not waive the privileges and immunities +# granted to it by virtue of its status as an Intergovernmental Organization +# or submit itself to any jurisdiction. + +o2_add_library(ML + SOURCES src/ort_interface.cxx + TARGETVARNAME targetName + PUBLIC_LINK_LIBRARIES O2::Framework ONNXRuntime::ONNXRuntime) \ No newline at end of file diff --git a/Common/ML/include/ML/GPUORTFloat16.h b/Common/ML/include/ML/GPUORTFloat16.h new file mode 100644 index 0000000000000..ce13f576205f8 --- /dev/null +++ b/Common/ML/include/ML/GPUORTFloat16.h @@ -0,0 +1,875 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". 
+// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file GPUORTFloat16.h +/// \author Christian Sonnabend +/// \brief An implementation of the ONNXRuntime Float16_t data-type for GPU acceleration + +#include +#include +#include +#include + +namespace o2 +{ + +namespace OrtDataType +{ + +namespace detail +{ + +enum class endian { +#if defined(_WIN32) + little = 0, + big = 1, + native = little, +#elif defined(__GNUC__) || defined(__clang__) + little = __ORDER_LITTLE_ENDIAN__, + big = __ORDER_BIG_ENDIAN__, + native = __BYTE_ORDER__, +#else +#error OrtDataType::detail::endian is not implemented in this environment. +#endif +}; + +static_assert( + endian::native == endian::little || endian::native == endian::big, + "Only little-endian or big-endian native byte orders are supported."); + +} // namespace detail + +/// +/// Shared implementation between public and internal classes. CRTP pattern. +/// +template +struct Float16Impl { + protected: + /// + /// Converts from float to uint16_t float16 representation + /// + /// + /// + constexpr static uint16_t ToUint16Impl(float v) noexcept; + + /// + /// Converts float16 to float + /// + /// float representation of float16 value + float ToFloatImpl() const noexcept; + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + uint16_t AbsImpl() const noexcept + { + return static_cast(val & ~kSignMask); + } + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + uint16_t NegateImpl() const noexcept + { + return IsNaN() ? val : static_cast(val ^ kSignMask); + } + + public: + // uint16_t special values + static constexpr uint16_t kSignMask = 0x8000U; + static constexpr uint16_t kBiasedExponentMask = 0x7C00U; + static constexpr uint16_t kPositiveInfinityBits = 0x7C00U; + static constexpr uint16_t kNegativeInfinityBits = 0xFC00U; + static constexpr uint16_t kPositiveQNaNBits = 0x7E00U; + static constexpr uint16_t kNegativeQNaNBits = 0xFE00U; + static constexpr uint16_t kEpsilonBits = 0x4170U; + static constexpr uint16_t kMinValueBits = 0xFBFFU; // Minimum normal number + static constexpr uint16_t kMaxValueBits = 0x7BFFU; // Largest normal number + static constexpr uint16_t kOneBits = 0x3C00U; + static constexpr uint16_t kMinusOneBits = 0xBC00U; + + uint16_t val{0}; + + Float16Impl() = default; + + /// + /// Checks if the value is negative + /// + /// true if negative + bool IsNegative() const noexcept + { + return static_cast(val) < 0; + } + + /// + /// Tests if the value is NaN + /// + /// true if NaN + bool IsNaN() const noexcept + { + return AbsImpl() > kPositiveInfinityBits; + } + + /// + /// Tests if the value is finite + /// + /// true if finite + bool IsFinite() const noexcept + { + return AbsImpl() < kPositiveInfinityBits; + } + + /// + /// Tests if the value represents positive infinity. + /// + /// true if positive infinity + bool IsPositiveInfinity() const noexcept + { + return val == kPositiveInfinityBits; + } + + /// + /// Tests if the value represents negative infinity + /// + /// true if negative infinity + bool IsNegativeInfinity() const noexcept + { + return val == kNegativeInfinityBits; + } + + /// + /// Tests if the value is either positive or negative infinity. 
+ /// + /// True if absolute value is infinity + bool IsInfinity() const noexcept + { + return AbsImpl() == kPositiveInfinityBits; + } + + /// + /// Tests if the value is NaN or zero. Useful for comparisons. + /// + /// True if NaN or zero. + bool IsNaNOrZero() const noexcept + { + auto abs = AbsImpl(); + return (abs == 0 || abs > kPositiveInfinityBits); + } + + /// + /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). + /// + /// True if so + bool IsNormal() const noexcept + { + auto abs = AbsImpl(); + return (abs < kPositiveInfinityBits) // is finite + && (abs != 0) // is not zero + && ((abs & kBiasedExponentMask) != 0); // is not subnormal (has a non-zero exponent) + } + + /// + /// Tests if the value is subnormal (denormal). + /// + /// True if so + bool IsSubnormal() const noexcept + { + auto abs = AbsImpl(); + return (abs < kPositiveInfinityBits) // is finite + && (abs != 0) // is not zero + && ((abs & kBiasedExponentMask) == 0); // is subnormal (has a zero exponent) + } + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + Derived Abs() const noexcept { return Derived::FromBits(AbsImpl()); } + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + Derived Negate() const noexcept { return Derived::FromBits(NegateImpl()); } + + /// + /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check + /// for two values by or'ing the private bits together and stripping the sign. They are both zero, + /// and therefore equivalent, if the resulting value is still zero. + /// + /// first value + /// second value + /// True if both arguments represent zero + static bool AreZero(const Float16Impl& lhs, const Float16Impl& rhs) noexcept + { + return static_cast((lhs.val | rhs.val) & ~kSignMask) == 0; + } + + bool operator==(const Float16Impl& rhs) const noexcept + { + if (IsNaN() || rhs.IsNaN()) { + // IEEE defines that NaN is not equal to anything, including itself. + return false; + } + return val == rhs.val; + } + + bool operator!=(const Float16Impl& rhs) const noexcept { return !(*this == rhs); } + + bool operator<(const Float16Impl& rhs) const noexcept + { + if (IsNaN() || rhs.IsNaN()) { + // IEEE defines that NaN is unordered with respect to everything, including itself. + return false; + } + + const bool left_is_negative = IsNegative(); + if (left_is_negative != rhs.IsNegative()) { + // When the signs of left and right differ, we know that left is less than right if it is + // the negative value. The exception to this is if both values are zero, in which case IEEE + // says they should be equal, even if the signs differ. + return left_is_negative && !AreZero(*this, rhs); + } + return (val != rhs.val) && ((val < rhs.val) ^ left_is_negative); + } +}; + +// The following Float16_t conversions are based on the code from +// Eigen library. + +// The conversion routines are Copyright (c) Fabian Giesen, 2016. +// The original license follows: +// +// Copyright (c) Fabian Giesen, 2016 +// All rights reserved. +// Redistribution and use in source and binary forms, with or without +// modification, are permitted. +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +namespace detail +{ +union float32_bits { + unsigned int u; + float f; +}; +}; // namespace detail + +template +inline constexpr uint16_t Float16Impl::ToUint16Impl(float v) noexcept +{ + detail::float32_bits f{}; + f.f = v; + + constexpr detail::float32_bits f32infty = {255 << 23}; + constexpr detail::float32_bits f16max = {(127 + 16) << 23}; + constexpr detail::float32_bits denorm_magic = {((127 - 15) + (23 - 10) + 1) << 23}; + constexpr unsigned int sign_mask = 0x80000000u; + uint16_t val = static_cast(0x0u); + + unsigned int sign = f.u & sign_mask; + f.u ^= sign; + + // NOTE all the integer compares in this function can be safely + // compiled into signed compares since all operands are below + // 0x80000000. Important if you want fast straight SSE2 code + // (since there's no unsigned PCMPGTD). + + if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set) + val = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf + } else { // (De)normalized number or zero + if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero + // use a magic value to align our 10 mantissa bits at the bottom of + // the float. as long as FP addition is round-to-nearest-even this + // just works. + f.f += denorm_magic.f; + + // and one integer subtract of the bias later, we have our final float! + val = static_cast(f.u - denorm_magic.u); + } else { + unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd + + // update exponent, rounding bias part 1 + // Equivalent to `f.u += ((unsigned int)(15 - 127) << 23) + 0xfff`, but + // without arithmetic overflow. + f.u += 0xc8000fffU; + // rounding bias part 2 + f.u += mant_odd; + // take the bits! + val = static_cast(f.u >> 13); + } + } + + val |= static_cast(sign >> 16); + return val; +} + +template +inline float Float16Impl::ToFloatImpl() const noexcept +{ + constexpr detail::float32_bits magic = {113 << 23}; + constexpr unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift + detail::float32_bits o{}; + + o.u = (val & 0x7fff) << 13; // exponent/mantissa bits + unsigned int exp = shifted_exp & o.u; // just the exponent + o.u += (127 - 15) << 23; // exponent adjust + + // handle exponent special cases + if (exp == shifted_exp) { // Inf/NaN? + o.u += (128 - 16) << 23; // extra exp adjust + } else if (exp == 0) { // Zero/Denormal? + o.u += 1 << 23; // extra exp adjust + o.f -= magic.f; // re-normalize + } + + // Attempt to workaround the Internal Compiler Error on ARM64 + // for bitwise | operator, including std::bitset +#if (defined _MSC_VER) && (defined _M_ARM || defined _M_ARM64 || defined _M_ARM64EC) + if (IsNegative()) { + return -o.f; + } +#else + // original code: + o.u |= (val & 0x8000U) << 16U; // sign bit +#endif + return o.f; +} + +/// Shared implementation between public and internal classes. CRTP pattern. 
+template +struct BFloat16Impl { + protected: + /// + /// Converts from float to uint16_t float16 representation + /// + /// + /// + static uint16_t ToUint16Impl(float v) noexcept; + + /// + /// Converts bfloat16 to float + /// + /// float representation of bfloat16 value + float ToFloatImpl() const noexcept; + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + uint16_t AbsImpl() const noexcept + { + return static_cast(val & ~kSignMask); + } + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + uint16_t NegateImpl() const noexcept + { + return IsNaN() ? val : static_cast(val ^ kSignMask); + } + + public: + // uint16_t special values + static constexpr uint16_t kSignMask = 0x8000U; + static constexpr uint16_t kBiasedExponentMask = 0x7F80U; + static constexpr uint16_t kPositiveInfinityBits = 0x7F80U; + static constexpr uint16_t kNegativeInfinityBits = 0xFF80U; + static constexpr uint16_t kPositiveQNaNBits = 0x7FC1U; + static constexpr uint16_t kNegativeQNaNBits = 0xFFC1U; + static constexpr uint16_t kSignaling_NaNBits = 0x7F80U; + static constexpr uint16_t kEpsilonBits = 0x0080U; + static constexpr uint16_t kMinValueBits = 0xFF7FU; + static constexpr uint16_t kMaxValueBits = 0x7F7FU; + static constexpr uint16_t kRoundToNearest = 0x7FFFU; + static constexpr uint16_t kOneBits = 0x3F80U; + static constexpr uint16_t kMinusOneBits = 0xBF80U; + + uint16_t val{0}; + + BFloat16Impl() = default; + + /// + /// Checks if the value is negative + /// + /// true if negative + bool IsNegative() const noexcept + { + return static_cast(val) < 0; + } + + /// + /// Tests if the value is NaN + /// + /// true if NaN + bool IsNaN() const noexcept + { + return AbsImpl() > kPositiveInfinityBits; + } + + /// + /// Tests if the value is finite + /// + /// true if finite + bool IsFinite() const noexcept + { + return AbsImpl() < kPositiveInfinityBits; + } + + /// + /// Tests if the value represents positive infinity. + /// + /// true if positive infinity + bool IsPositiveInfinity() const noexcept + { + return val == kPositiveInfinityBits; + } + + /// + /// Tests if the value represents negative infinity + /// + /// true if negative infinity + bool IsNegativeInfinity() const noexcept + { + return val == kNegativeInfinityBits; + } + + /// + /// Tests if the value is either positive or negative infinity. + /// + /// True if absolute value is infinity + bool IsInfinity() const noexcept + { + return AbsImpl() == kPositiveInfinityBits; + } + + /// + /// Tests if the value is NaN or zero. Useful for comparisons. + /// + /// True if NaN or zero. + bool IsNaNOrZero() const noexcept + { + auto abs = AbsImpl(); + return (abs == 0 || abs > kPositiveInfinityBits); + } + + /// + /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). + /// + /// True if so + bool IsNormal() const noexcept + { + auto abs = AbsImpl(); + return (abs < kPositiveInfinityBits) // is finite + && (abs != 0) // is not zero + && ((abs & kBiasedExponentMask) != 0); // is not subnormal (has a non-zero exponent) + } + + /// + /// Tests if the value is subnormal (denormal). + /// + /// True if so + bool IsSubnormal() const noexcept + { + auto abs = AbsImpl(); + return (abs < kPositiveInfinityBits) // is finite + && (abs != 0) // is not zero + && ((abs & kBiasedExponentMask) == 0); // is subnormal (has a zero exponent) + } + + /// + /// Creates an instance that represents absolute value. 
+ /// + /// Absolute value + Derived Abs() const noexcept { return Derived::FromBits(AbsImpl()); } + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + Derived Negate() const noexcept { return Derived::FromBits(NegateImpl()); } + + /// + /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check + /// for two values by or'ing the private bits together and stripping the sign. They are both zero, + /// and therefore equivalent, if the resulting value is still zero. + /// + /// first value + /// second value + /// True if both arguments represent zero + static bool AreZero(const BFloat16Impl& lhs, const BFloat16Impl& rhs) noexcept + { + // IEEE defines that positive and negative zero are equal, this gives us a quick equality check + // for two values by or'ing the private bits together and stripping the sign. They are both zero, + // and therefore equivalent, if the resulting value is still zero. + return static_cast((lhs.val | rhs.val) & ~kSignMask) == 0; + } +}; + +template +inline uint16_t BFloat16Impl::ToUint16Impl(float v) noexcept +{ + uint16_t result; + if (std::isnan(v)) { + result = kPositiveQNaNBits; + } else { + auto get_msb_half = [](float fl) { + uint16_t result; +#ifdef __cpp_if_constexpr + if constexpr (detail::endian::native == detail::endian::little) +#else + if (detail::endian::native == detail::endian::little) +#endif + { + std::memcpy(&result, reinterpret_cast(&fl) + sizeof(uint16_t), sizeof(uint16_t)); + } else { + std::memcpy(&result, &fl, sizeof(uint16_t)); + } + return result; + }; + + uint16_t upper_bits = get_msb_half(v); + union { + uint32_t U32; + float F32; + }; + F32 = v; + U32 += (upper_bits & 1) + kRoundToNearest; + result = get_msb_half(F32); + } + return result; +} + +template +inline float BFloat16Impl::ToFloatImpl() const noexcept +{ + if (IsNaN()) { + return std::numeric_limits::quiet_NaN(); + } + float result; + char* const first = reinterpret_cast(&result); + char* const second = first + sizeof(uint16_t); +#ifdef __cpp_if_constexpr + if constexpr (detail::endian::native == detail::endian::little) +#else + if (detail::endian::native == detail::endian::little) +#endif + { + std::memset(first, 0, sizeof(uint16_t)); + std::memcpy(second, &val, sizeof(uint16_t)); + } else { + std::memcpy(first, &val, sizeof(uint16_t)); + std::memset(second, 0, sizeof(uint16_t)); + } + return result; +} + +/** \brief IEEE 754 half-precision floating point data type + * + * \details This struct is used for converting float to float16 and back + * so the user could feed inputs and fetch outputs using these type. + * + * The size of the structure should align with uint16_t and one can freely cast + * uint16_t buffers to/from Ort::Float16_t to feed and retrieve data. + * + * \code{.unparsed} + * // This example demonstrates converion from float to float16 + * constexpr float values[] = {1.f, 2.f, 3.f, 4.f, 5.f}; + * std::vector fp16_values; + * fp16_values.reserve(std::size(values)); + * std::transform(std::begin(values), std::end(values), std::back_inserter(fp16_values), + * [](float value) { return Ort::Float16_t(value); }); + * + * \endcode + */ +struct Float16_t : OrtDataType::Float16Impl { + private: + /// + /// Constructor from a 16-bit representation of a float16 value + /// No conversion is done here. 
+ /// + /// 16-bit representation + constexpr explicit Float16_t(uint16_t v) noexcept { val = v; } + + public: + using Base = OrtDataType::Float16Impl; + + /// + /// Default constructor + /// + Float16_t() = default; + + /// + /// Explicit conversion to uint16_t representation of float16. + /// + /// uint16_t bit representation of float16 + /// new instance of Float16_t + constexpr static Float16_t FromBits(uint16_t v) noexcept { return Float16_t(v); } + + /// + /// __ctor from float. Float is converted into float16 16-bit representation. + /// + /// float value + explicit Float16_t(float v) noexcept { val = Base::ToUint16Impl(v); } + + /// + /// Converts float16 to float + /// + /// float representation of float16 value + float ToFloat() const noexcept { return Base::ToFloatImpl(); } + + /// + /// Checks if the value is negative + /// + /// true if negative + using Base::IsNegative; + + /// + /// Tests if the value is NaN + /// + /// true if NaN + using Base::IsNaN; + + /// + /// Tests if the value is finite + /// + /// true if finite + using Base::IsFinite; + + /// + /// Tests if the value represents positive infinity. + /// + /// true if positive infinity + using Base::IsPositiveInfinity; + + /// + /// Tests if the value represents negative infinity + /// + /// true if negative infinity + using Base::IsNegativeInfinity; + + /// + /// Tests if the value is either positive or negative infinity. + /// + /// True if absolute value is infinity + using Base::IsInfinity; + + /// + /// Tests if the value is NaN or zero. Useful for comparisons. + /// + /// True if NaN or zero. + using Base::IsNaNOrZero; + + /// + /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). + /// + /// True if so + using Base::IsNormal; + + /// + /// Tests if the value is subnormal (denormal). + /// + /// True if so + using Base::IsSubnormal; + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + using Base::Abs; + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + using Base::Negate; + + /// + /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check + /// for two values by or'ing the private bits together and stripping the sign. They are both zero, + /// and therefore equivalent, if the resulting value is still zero. + /// + /// first value + /// second value + /// True if both arguments represent zero + using Base::AreZero; + + /// + /// User defined conversion operator. Converts Float16_t to float. + /// + explicit operator float() const noexcept { return ToFloat(); } + + using Base::operator==; + using Base::operator!=; + using Base::operator<; +}; + +static_assert(sizeof(Float16_t) == sizeof(uint16_t), "Sizes must match"); + +/** \brief bfloat16 (Brain Floating Point) data type + * + * \details This struct is used for converting float to bfloat16 and back + * so the user could feed inputs and fetch outputs using these type. + * + * The size of the structure should align with uint16_t and one can freely cast + * uint16_t buffers to/from Ort::BFloat16_t to feed and retrieve data. 
+ * + * \code{.unparsed} + * // This example demonstrates converion from float to float16 + * constexpr float values[] = {1.f, 2.f, 3.f, 4.f, 5.f}; + * std::vector bfp16_values; + * bfp16_values.reserve(std::size(values)); + * std::transform(std::begin(values), std::end(values), std::back_inserter(bfp16_values), + * [](float value) { return Ort::BFloat16_t(value); }); + * + * \endcode + */ +struct BFloat16_t : OrtDataType::BFloat16Impl { + private: + /// + /// Constructor from a uint16_t representation of bfloat16 + /// used in FromBits() to escape overload resolution issue with + /// constructor from float. + /// No conversion is done. + /// + /// 16-bit bfloat16 value + constexpr explicit BFloat16_t(uint16_t v) noexcept { val = v; } + + public: + using Base = OrtDataType::BFloat16Impl; + + BFloat16_t() = default; + + /// + /// Explicit conversion to uint16_t representation of bfloat16. + /// + /// uint16_t bit representation of bfloat16 + /// new instance of BFloat16_t + static constexpr BFloat16_t FromBits(uint16_t v) noexcept { return BFloat16_t(v); } + + /// + /// __ctor from float. Float is converted into bfloat16 16-bit representation. + /// + /// float value + explicit BFloat16_t(float v) noexcept { val = Base::ToUint16Impl(v); } + + /// + /// Converts bfloat16 to float + /// + /// float representation of bfloat16 value + float ToFloat() const noexcept { return Base::ToFloatImpl(); } + + /// + /// Checks if the value is negative + /// + /// true if negative + using Base::IsNegative; + + /// + /// Tests if the value is NaN + /// + /// true if NaN + using Base::IsNaN; + + /// + /// Tests if the value is finite + /// + /// true if finite + using Base::IsFinite; + + /// + /// Tests if the value represents positive infinity. + /// + /// true if positive infinity + using Base::IsPositiveInfinity; + + /// + /// Tests if the value represents negative infinity + /// + /// true if negative infinity + using Base::IsNegativeInfinity; + + /// + /// Tests if the value is either positive or negative infinity. + /// + /// True if absolute value is infinity + using Base::IsInfinity; + + /// + /// Tests if the value is NaN or zero. Useful for comparisons. + /// + /// True if NaN or zero. + using Base::IsNaNOrZero; + + /// + /// Tests if the value is normal (not zero, subnormal, infinite, or NaN). + /// + /// True if so + using Base::IsNormal; + + /// + /// Tests if the value is subnormal (denormal). + /// + /// True if so + using Base::IsSubnormal; + + /// + /// Creates an instance that represents absolute value. + /// + /// Absolute value + using Base::Abs; + + /// + /// Creates a new instance with the sign flipped. + /// + /// Flipped sign instance + using Base::Negate; + + /// + /// IEEE defines that positive and negative zero are equal, this gives us a quick equality check + /// for two values by or'ing the private bits together and stripping the sign. They are both zero, + /// and therefore equivalent, if the resulting value is still zero. + /// + /// first value + /// second value + /// True if both arguments represent zero + using Base::AreZero; + + /// + /// User defined conversion operator. Converts BFloat16_t to float. 
+ /// + explicit operator float() const noexcept { return ToFloat(); } + + // We do not have an inherited impl for the below operators + // as the internal class implements them a little differently + bool operator==(const BFloat16_t& rhs) const noexcept; + bool operator!=(const BFloat16_t& rhs) const noexcept { return !(*this == rhs); } + bool operator<(const BFloat16_t& rhs) const noexcept; +}; + +static_assert(sizeof(BFloat16_t) == sizeof(uint16_t), "Sizes must match"); + +} // namespace OrtDataType + +} // namespace o2 \ No newline at end of file diff --git a/Common/ML/include/ML/ort_interface.h b/Common/ML/include/ML/ort_interface.h new file mode 100644 index 0000000000000..752acf33ef99a --- /dev/null +++ b/Common/ML/include/ML/ort_interface.h @@ -0,0 +1,102 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file ort_interface.h +/// \author Christian Sonnabend +/// \brief A header library for loading ONNX models and inferencing them on CPU and GPU + +#ifndef O2_ML_ONNX_INTERFACE_H +#define O2_ML_ONNX_INTERFACE_H + +// C++ and system includes +#include +#include +#include +#include +#include + +// ONNX includes +#include + +// O2 includes +#include "GPUORTFloat16.h" +#include "Framework/Logger.h" + +namespace o2 +{ + +namespace ml +{ + +class OrtModel +{ + + public: + // Constructor + OrtModel() = default; + OrtModel(std::unordered_map optionsMap){ reset(optionsMap); } + void init(std::unordered_map optionsMap){ reset(optionsMap); } + void reset(std::unordered_map); + + virtual ~OrtModel() = default; + + // Conversion + template + std::vector v2v(std::vector&, bool = true); + + // Inferencing + template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h + std::vector inference(std::vector&); + + template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h + std::vector inference(std::vector>&); + + // template // class I is the input data type, e.g. 
float, class T the throughput data type and class O is the output data type + // std::vector inference(std::vector&); + + // Reset session + void resetSession(); + + std::vector> getNumInputNodes() const { return mInputShapes; } + std::vector> getNumOutputNodes() const { return mOutputShapes; } + std::vector getInputNames() const { return mInputNames; } + std::vector getOutputNames() const { return mOutputNames; } + + void setActiveThreads(int threads) { intraOpNumThreads = threads; } + + private: + + // ORT runtime objects + Ort::RunOptions runOptions; + std::shared_ptr env = nullptr; + std::shared_ptr session = nullptr; ///< ONNX session + Ort::SessionOptions sessionOptions; + Ort::AllocatorWithDefaultOptions allocator; + Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); + + // Input & Output specifications of the loaded network + std::vector inputNamesChar, outputNamesChar; + std::vector mInputNames, mOutputNames; + std::vector> mInputShapes, mOutputShapes; + + // Environment settings + std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda + int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; + + std::string printShape(const std::vector&); + +}; + +} // namespace ml + +} // namespace ml + +#endif // O2_ML_ORT_INTERFACE_H \ No newline at end of file diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx new file mode 100644 index 0000000000000..ad02d2bd63a86 --- /dev/null +++ b/Common/ML/src/ort_interface.cxx @@ -0,0 +1,222 @@ +// Copyright 2019-2020 CERN and copyright holders of ALICE O2. +// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. +// All rights not expressly granted are reserved. +// +// This software is distributed under the terms of the GNU General Public +// License v3 (GPL Version 3), copied verbatim in the file "COPYING". +// +// In applying this license CERN does not waive the privileges and immunities +// granted to it by virtue of its status as an Intergovernmental Organization +// or submit itself to any jurisdiction. + +/// \file ort_interface.cxx +/// \author Christian Sonnabend +/// \brief A header library for loading ONNX models and inferencing them on CPU and GPU + +#include "ML/ort_interface.h" + +namespace o2 +{ + +namespace ml +{ + +void OrtModel::reset(std::unordered_map optionsMap){ + // Load from options map + if(!optionsMap.contains("model-path")){ + LOG(fatal) << "(ORT) Model path cannot be empty!"; + } + modelPath = optionsMap["model-path"]; + device = (optionsMap.contains("device") ? optionsMap["device"] : "cpu"); + dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); + deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); + allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); + intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); + loggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 0); + enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); + enableOptimizations = (optionsMap.contains("enable-optimizations") ? 
std::stoi(optionsMap["enable-optimizations"]) : 0); + + if(device == "rocm") { + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(sessionOptions, deviceId)); + LOG(info) << "(ORT) ROCM execution provider set"; + } else if(device == "migraphx") { + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(sessionOptions, deviceId)); + LOG(info) << "(ORT) MIGraphX execution provider set"; + } + if(allocateDeviceMemory){ + memoryInfo = Ort::MemoryInfo("Hip", OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); + LOG(info) << "(ORT) Memory info set to on-device memory (HIP)"; + } +#if defined(__CUDACC__) + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(sessionOptions, deviceId)); + if(allocateDeviceMemory){ + memoryInfo = Ort::MemoryInfo("Cuda", OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); + LOG(info) << "(ORT) Memory info set to on-device memory (CUDA)"; + } +#endif + + if(device == "cpu") { + sessionOptions.SetIntraOpNumThreads(intraOpNumThreads); + if(intraOpNumThreads > 1){ + sessionOptions.SetExecutionMode(ExecutionMode::ORT_PARALLEL); + } else if(intraOpNumThreads == 1){ + sessionOptions.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + } + LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " threads"; + } + + sessionOptions.DisableMemPattern(); + sessionOptions.DisableCpuMemArena(); + + if(enableProfiling){ + if(optionsMap.contains("profiling-output-path")){ + sessionOptions.EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); + } else { + LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. Disabling profiling for now."; + sessionOptions.DisableProfiling(); + } + } else { + sessionOptions.DisableProfiling(); + } + sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel(enableOptimizations)); + sessionOptions.SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); + + env = std::make_shared(OrtLoggingLevel(loggingLevel), (optionsMap["onnx-environment-name"].empty() ? 
"onnx_model_inference" : optionsMap["onnx-environment-name"].c_str())); + session.reset(new Ort::Session{*env, modelPath.c_str(), sessionOptions}); + + for (size_t i = 0; i < session->GetInputCount(); ++i) { + mInputNames.push_back(session->GetInputNameAllocated(i, allocator).get()); + } + for (size_t i = 0; i < session->GetInputCount(); ++i) { + mInputShapes.emplace_back(session->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + for (size_t i = 0; i < session->GetOutputCount(); ++i) { + mOutputNames.push_back(session->GetOutputNameAllocated(i, allocator).get()); + } + for (size_t i = 0; i < session->GetOutputCount(); ++i) { + mOutputShapes.emplace_back(session->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + + inputNamesChar.resize(mInputNames.size(), nullptr); + std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), + [&](const std::string& str) { return str.c_str(); }); + outputNamesChar.resize(mOutputNames.size(), nullptr); + std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), + [&](const std::string& str) { return str.c_str(); }); + + // Print names + LOG(info) << "Input Nodes:"; + for (size_t i = 0; i < mInputNames.size(); i++) { + LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); + } + + LOG(info) << "Output Nodes:"; + for (size_t i = 0; i < mOutputNames.size(); i++) { + LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); + } +} + +void OrtModel::resetSession() { + session.reset(new Ort::Session{*env, modelPath.c_str(), sessionOptions}); +} + +template +std::vector OrtModel::v2v(std::vector& input, bool clearInput) { + if constexpr (std::is_same_v){ + return input; + } else { + std::vector output(input.size()); + std::transform(std::begin(input), std::end(input), std::begin(output), [](I f) { return O(f); }); + if(clearInput) input.clear(); + return output; + } +} + +template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +std::vector OrtModel::inference(std::vector& input){ + std::vector inputShape{input.size() / mInputShapes[0][1], mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(memoryInfo, (v2v(input)).data(), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = session->Run(runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + O* outputValues = outputTensors[0].template GetTensorMutableData(); + outputTensors.clear(); + return std::vector{outputValues, outputValues + input.size() * mOutputShapes[0][1]}; +} + +template // class I is the input data type, e.g. float, class O is the output data type, e.g. 
O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +std::vector OrtModel::inference(std::vector>& input){ + std::vector inputTensor; + for(auto i : input){ + std::vector inputShape{i.size() / mInputShapes[0][1], mInputShapes[0][1]}; + inputTensor.emplace_back(Ort::Value::CreateTensor(memoryInfo, (v2v(i)).data(), i.size(), inputShape.data(), inputShape.size())); + } + // input.clear(); + auto outputTensors = session->Run(runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + O* outputValues = outputTensors[0].template GetTensorMutableData(); + outputTensors.clear(); + return std::vector{outputValues, outputValues + input.size() * mOutputShapes[0][1]}; +} + +// template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +// std::vector OrtModel::inference(std::vector& input){ +// std::vector inputShape{input.size(), mInputShapes[0][1]}; +// std::vector inputTensor; +// inputTensor.emplace_back(Ort::Value::CreateTensor(memoryInfo, (v2v(input)).data(), input.size(), inputShape.data(), inputShape.size())); +// input.clear(); +// auto outputTensors = session->Run(runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); +// O* outputValues = outputTensors[0].template GetTensorMutableData(); +// outputTensors.clear(); +// return std::vector{outputValues, outputValues + input.size() * mOutputShapes[0][1]}; +// } + +std::string OrtModel::printShape(const std::vector& v) +{ + std::stringstream ss(""); + for (size_t i = 0; i < v.size() - 1; i++) + ss << v[i] << "x"; + ss << v[v.size() - 1]; + return ss.str(); +} + +template std::vector OrtModel::v2v(std::vector&, bool); + +template std::vector OrtModel::inference(std::vector&); +template std::vector OrtModel::inference(std::vector&); + +template std::vector OrtModel::inference(std::vector>&); +template std::vector OrtModel::inference(std::vector>&); + +// template std::vector OrtModel::v2v(std::vector&, bool); +// template <> std::vector OrtModel::v2v(std::vector& input, bool clearInput) { +// std::vector output(input.size()); +// std::transform(std::begin(input), std::end(input), std::begin(output), [](OrtDataType::Float16_t f) { return Ort::Float16_t::FromBits(f.val); }); +// if(clearInput) input.clear(); +// return output; +// }; +// template <> std::vector OrtModel::v2v(std::vector& input, bool clearInput) { +// std::vector output(input.size()); +// std::transform(std::begin(input), std::end(input), std::begin(output), [](Ort::Float16_t f) { return OrtDataType::Float16_t::FromBits(f.val); }); +// if(clearInput) input.clear(); +// return output; +// }; +// template std::vector OrtModel::v2v(std::vector&, bool); +// +// // template std::vector OrtModel::inference(std::vector&); +// // template std::vector OrtModel::inference(std::vector&); +// // template std::vector OrtModel::inference(std::vector&); +// +// template <> std::vector OrtModel::inference(std::vector& input){ +// return OrtModel::inference(input); +// }; +// template <> std::vector OrtModel::inference(std::vector& input) { +// return OrtModel::inference(input); +// }; +// +// template <> std::vector OrtModel::inference(std::vector& input) { +// return OrtModel::inference(input); +// }; + +} // namespace ml + +} // namespace o2 \ No newline at end of file diff --git a/Detectors/TPC/workflow/CMakeLists.txt 
b/Detectors/TPC/workflow/CMakeLists.txt index 592fc1ef25d40..4060a4b7832f2 100644 --- a/Detectors/TPC/workflow/CMakeLists.txt +++ b/Detectors/TPC/workflow/CMakeLists.txt @@ -86,6 +86,12 @@ o2_add_executable(onnx-gpu SOURCES test/test_onnx_gpu_inference.cxx PUBLIC_LINK_LIBRARIES O2::TPCWorkflow O2::SimulationDataFormat O2::TPCQC O2::DataFormatsTPC O2::TPCBase ONNXRuntime::ONNXRuntime Boost::thread O2::GPUTracking) +o2_add_executable(onnx-interface + COMPONENT_NAME test + SOURCES test/test_onnx_interface_headers.cxx + PUBLIC_LINK_LIBRARIES O2::TPCWorkflow O2::SimulationDataFormat O2::TPCQC O2::DataFormatsTPC O2::TPCBase O2::ML Boost::thread O2::GPUTracking) + + o2_add_executable(reco-workflow COMPONENT_NAME tpc SOURCES src/tpc-reco-workflow.cxx diff --git a/Detectors/TPC/workflow/test/test_onnx_gpu_inference.cxx b/Detectors/TPC/workflow/test/test_onnx_gpu_inference.cxx index ee5d2d7fa15a0..4d5f2af587a4c 100644 --- a/Detectors/TPC/workflow/test/test_onnx_gpu_inference.cxx +++ b/Detectors/TPC/workflow/test/test_onnx_gpu_inference.cxx @@ -66,212 +66,233 @@ namespace tpc { class onnxGPUinference : public Task { - public: - - onnxGPUinference(std::unordered_map options_map) { - model_path = options_map["path"]; - device = options_map["device"]; - dtype = options_map["dtype"]; - std::stringstream(options_map["device-id"]) >> device_id; - std::stringstream(options_map["num-iter"]) >> test_size_iter; - std::stringstream(options_map["execution-threads"]) >> execution_threads; - std::stringstream(options_map["threads-per-session-cpu"]) >> threads_per_session_cpu; - std::stringstream(options_map["num-tensors"]) >> test_num_tensors; - std::stringstream(options_map["size-tensor"]) >> test_size_tensor; - std::stringstream(options_map["measure-cycle"]) >> epochs_measure; - std::stringstream(options_map["logging-level"]) >> logging_level; - std::stringstream(options_map["enable-optimizations"]) >> enable_optimizations; - - LOG(info) << "Options loaded"; - - execution_threads = std::min((int)execution_threads, (int)boost::thread::hardware_concurrency()); - - // Set the environment variable to use ROCm execution provider - if(device=="GPU"){ - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(session_options, device_id)); - LOG(info) << "ROCM execution provider set"; - } else if(device=="CPU"){ - session_options.SetIntraOpNumThreads(threads_per_session_cpu); - if(threads_per_session_cpu > 0){ - LOG(info) << "CPU execution provider set with " << threads_per_session_cpu << " threads"; - } else { - threads_per_session_cpu = 0; - LOG(info) << "CPU execution provider set with default number of threads"; - } - if(threads_per_session_cpu > 1){ - session_options.SetExecutionMode(ExecutionMode::ORT_PARALLEL); - } - } else { - LOG(fatal) << "Device not recognized"; - } - // std::vector providers = session.GetProviders(); - // for (const auto& provider : providers) { - // LOG(info) << "Using execution provider: " << provider << std::endl; - // } - - if((int)enable_profiling){ - session_options.EnableProfiling((options_map["profiling-output-path"] + "/ORT_LOG_").c_str()); - } - if(enable_optimizations){ - session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED); - } - session_options.SetLogSeverityLevel(logging_level); - - env.resize(execution_threads); - session.resize(execution_threads); - for(int s = 0; s < execution_threads; s++){ - env[s] = Ort::Env(ORT_LOGGING_LEVEL_VERBOSE, "onnx_model_inference"); - session[s].reset(new Ort::Session{env[s], model_path.c_str(), 
session_options}); - } - LOG(info) << "Sessions created"; - - LOG(info) << "Number of iterations: " << test_size_iter << ", size of the test tensor: " << test_size_tensor << ", measuring every " << epochs_measure << " cycles, number of tensors: " << test_num_tensors << ", execution threads: " << execution_threads; - - for (size_t i = 0; i < session[0]->GetInputCount(); ++i) { - mInputNames.push_back(session[0]->GetInputNameAllocated(i, allocator).get()); - } - for (size_t i = 0; i < session[0]->GetInputCount(); ++i) { - mInputShapes.emplace_back(session[0]->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); - } - for (size_t i = 0; i < session[0]->GetOutputCount(); ++i) { - mOutputNames.push_back(session[0]->GetOutputNameAllocated(i, allocator).get()); - } - for (size_t i = 0; i < session[0]->GetOutputCount(); ++i) { - mOutputShapes.emplace_back(session[0]->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); - } - - LOG(info) << "Initializing ONNX names and sizes"; - inputNamesChar.resize(mInputNames.size(), nullptr); - std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), - [&](const std::string& str) { return str.c_str(); }); - outputNamesChar.resize(mOutputNames.size(), nullptr); - std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), - [&](const std::string& str) { return str.c_str(); }); - - // Print names - LOG(info) << "Input Nodes:"; - for (size_t i = 0; i < mInputNames.size(); i++) { - LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); - } - - LOG(info) << "Output Nodes:"; - for (size_t i = 0; i < mOutputNames.size(); i++) { - LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); - } - }; - - void runONNXGPUModel(std::vector>& input) { - std::vector threads(execution_threads); - for (int thrd = 0; thrd < execution_threads; thrd++) { - threads[thrd] = std::thread([&, thrd] { - auto outputTensors = session[thrd]->Run(runOptions, inputNamesChar.data(), input[thrd].data(), input[thrd].size(), outputNamesChar.data(), outputNamesChar.size()); - }); - } - for (auto& thread : threads) { - thread.join(); - } - }; - - void init(InitContext& ic) final {}; - void run(ProcessingContext& pc) final { - double time = 0; - - LOG(info) << "Preparing input data"; - // Prepare input data - std::vector inputShape{test_size_tensor, mInputShapes[0][1]}; - - LOG(info) << "Creating memory info"; - Ort::MemoryInfo mem_info("Cpu", OrtAllocatorType::OrtDeviceAllocator, device_id, OrtMemType::OrtMemTypeDefault); - - LOG(info) << "Creating ONNX tensor"; - std::vector> input_tensor(execution_threads); - if(dtype=="FP16"){ - std::vector input_data(mInputShapes[0][1] * test_size_tensor, (Ort::Float16_t)1.f); // Example input - for(int i = 0; i < execution_threads; i++){ - for(int j = 0; j < test_num_tensors; j++){ - input_tensor[i].emplace_back(Ort::Value::CreateTensor(mem_info, input_data.data(), input_data.size(), inputShape.data(), inputShape.size())); - } - } - } else { - std::vector input_data(mInputShapes[0][1] * test_size_tensor, 1.0f); // Example input - for(int i = 0; i < execution_threads; i++){ - for(int j = 0; j < test_num_tensors; j++){ - input_tensor[i].emplace_back(Ort::Value::CreateTensor(mem_info, input_data.data(), input_data.size(), inputShape.data(), inputShape.size())); - } - } - } - - LOG(info) << "Starting inference"; - for(int i = 0; i < test_size_iter; i++){ - auto start_network_eval = std::chrono::high_resolution_clock::now(); - 
runONNXGPUModel(input_tensor); - // std::vector output = model.inference(test); - auto end_network_eval = std::chrono::high_resolution_clock::now(); - time += std::chrono::duration>(end_network_eval - start_network_eval).count(); - if((i % epochs_measure == 0) && (i != 0)){ - time /= 1e9; - LOG(info) << "Total time: " << time << "s. Timing: " << uint64_t((double)test_size_tensor*epochs_measure*execution_threads*test_num_tensors/time) << " elements / s"; - time = 0; - } - } - - // for(auto out : output){ - // LOG(info) << "Test output: " << out; - // } - pc.services().get().endOfStream(); - pc.services().get().readyToQuit(QuitRequest::Me); - }; - - private: - - std::vector model_buffer; - std::string model_path, device, dtype; - int device_id, execution_threads, threads_per_session_cpu, enable_profiling, logging_level, enable_optimizations; - size_t test_size_iter, test_size_tensor, epochs_measure, test_num_tensors; - - Ort::RunOptions runOptions; - std::vector env; - std::vector> session; - Ort::SessionOptions session_options; - Ort::AllocatorWithDefaultOptions allocator; - - std::vector inputNamesChar, outputNamesChar; - std::vector mInputNames; - std::vector> mInputShapes; - std::vector mOutputNames; - std::vector> mOutputShapes; - - std::string printShape(const std::vector& v) - { - std::stringstream ss(""); - for (size_t i = 0; i < v.size() - 1; i++) - ss << v[i] << "x"; - ss << v[v.size() - 1]; - return ss.str(); - }; + public: + + onnxGPUinference(std::unordered_map options_map) { + // Options map + model_path = options_map["path"]; + device = options_map["device"]; + dtype = options_map["dtype"]; + std::stringstream(options_map["device-id"]) >> device_id; + std::stringstream(options_map["num-iter"]) >> test_size_iter; + std::stringstream(options_map["execution-threads"]) >> execution_threads; + std::stringstream(options_map["threads-per-session-cpu"]) >> threads_per_session_cpu; + std::stringstream(options_map["num-tensors"]) >> test_num_tensors; + std::stringstream(options_map["size-tensor"]) >> test_size_tensor; + std::stringstream(options_map["measure-cycle"]) >> epochs_measure; + std::stringstream(options_map["logging-level"]) >> logging_level; + std::stringstream(options_map["enable-profiling"]) >> enable_profiling; + std::stringstream(options_map["enable-optimizations"]) >> enable_optimizations; + + LOG(info) << "Options loaded"; + + execution_threads = std::min((int)execution_threads, (int)boost::thread::hardware_concurrency()); + + // Set the environment variable to use ROCm execution provider + if(device=="ROCM"){ + // OrtROCMProviderOptions* rocm_options = nullptr; + // OrtApi::CreateROCMProviderOptions(&rocm_options); + // std::vector keys{"device_id", "gpu_mem_limit", "arena_extend_strategy"}; + // std::vector values{options_map["device-id"].c_str(), "34342961152", "kSameAsRequested"}; + // OrtApi::UpdateROCMProviderOptions(rocm_options, keys.data(), values.data(), keys.size()); + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(session_options, device_id)); + session_options.DisableMemPattern(); + session_options.DisableCpuMemArena(); + LOG(info) << "ROCM execution provider set"; + } else if (device=="MIGRAPHX") { + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(session_options, device_id)); + LOG(info) << "MIGraphX execution provider set"; + } else if(device=="CPU"){ + session_options.SetIntraOpNumThreads(threads_per_session_cpu); + if(threads_per_session_cpu > 0){ + LOG(info) << "CPU execution provider set with " << 
threads_per_session_cpu << " threads"; + } else { + threads_per_session_cpu = 0; + LOG(info) << "CPU execution provider set with default number of threads"; + } + if(threads_per_session_cpu > 1){ + session_options.SetExecutionMode(ExecutionMode::ORT_PARALLEL); + } + } else { + LOG(fatal) << "Device not recognized"; + } + // std::vector providers = session.GetProviders(); + // for (const auto& provider : providers) { + // LOG(info) << "Using execution provider: " << provider << std::endl; + // } + + if((bool)enable_profiling){ + LOG(info) << "Profiling enabled"; + session_options.EnableProfiling((options_map["profiling-output-path"] + "/ORT_LOG_").c_str()); + } else { + LOG(info) << "Profiling disabled"; + session_options.DisableProfiling(); + } + if(enable_optimizations){ + session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED); + } + session_options.SetLogSeverityLevel(logging_level); + + env.resize(execution_threads); + session.resize(execution_threads); + for(int s = 0; s < execution_threads; s++){ + env[s] = Ort::Env(OrtLoggingLevel(logging_level), "onnx_model_inference"); + session[s].reset(new Ort::Session{env[s], model_path.c_str(), session_options}); + } + LOG(info) << "Sessions created"; + + LOG(info) << "Number of iterations: " << test_size_iter << ", size of the test tensor: " << test_size_tensor << ", measuring every " << epochs_measure << " cycles, number of tensors: " << test_num_tensors << ", execution threads: " << execution_threads; + + for (size_t i = 0; i < session[0]->GetInputCount(); ++i) { + mInputNames.push_back(session[0]->GetInputNameAllocated(i, allocator).get()); + } + for (size_t i = 0; i < session[0]->GetInputCount(); ++i) { + mInputShapes.emplace_back(session[0]->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + for (size_t i = 0; i < session[0]->GetOutputCount(); ++i) { + mOutputNames.push_back(session[0]->GetOutputNameAllocated(i, allocator).get()); + } + for (size_t i = 0; i < session[0]->GetOutputCount(); ++i) { + mOutputShapes.emplace_back(session[0]->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + } + + LOG(info) << "Initializing ONNX names and sizes"; + inputNamesChar.resize(mInputNames.size(), nullptr); + std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), + [&](const std::string& str) { return str.c_str(); }); + outputNamesChar.resize(mOutputNames.size(), nullptr); + std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), + [&](const std::string& str) { return str.c_str(); }); + + // Print names + LOG(info) << "Input Nodes:"; + for (size_t i = 0; i < mInputNames.size(); i++) { + LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); + } + + LOG(info) << "Output Nodes:"; + for (size_t i = 0; i < mOutputNames.size(); i++) { + LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); + } + }; + + void runONNXGPUModel(std::vector>& input) { + std::vector threads(execution_threads); + for (int thrd = 0; thrd < execution_threads; thrd++) { + threads[thrd] = std::thread([&, thrd] { + auto outputTensors = session[thrd]->Run(runOptions, inputNamesChar.data(), input[thrd].data(), input[thrd].size(), outputNamesChar.data(), outputNamesChar.size()); + }); + } + for (auto& thread : threads) { + thread.join(); + } + }; + + void init(InitContext& ic) final {}; + void run(ProcessingContext& pc) final { + double time = 0; + + LOG(info) << "Preparing input data"; + // Prepare input data 
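The tensor-creation block that follows keeps everything in host memory: for every execution thread it wraps the same flat all-ones vector into test_num_tensors ONNX Runtime tensors of shape [test_size_tensor, mInputShapes[0][1]], while the commented-out lines hint at the on-device variant (a "Hip" Ort::MemoryInfo plus hipMemcpy into a device allocation). A minimal sketch of the host-memory case, assuming a model with a single 2-D input; variable names here are illustrative, only Ort calls already used in this file appear, and CreateTensor wraps the buffer without copying, so the vector must outlive the tensor:

    // Sketch (FP32 case): wrap a flat host vector as one [N, F] tensor,
    // N = test_size_tensor rows, F = mInputShapes[0][1] features per row.
    const int64_t nFeatures = mInputShapes[0][1];
    std::vector<float> host(test_size_tensor * nFeatures, 1.0f); // dummy all-ones input
    std::vector<int64_t> shape{static_cast<int64_t>(test_size_tensor), nFeatures};
    Ort::MemoryInfo cpuInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, device_id, OrtMemType::OrtMemTypeDefault);
    Ort::Value tensor = Ort::Value::CreateTensor<float>(cpuInfo, host.data(), host.size(),
                                                        shape.data(), shape.size()); // no copy: 'host' must stay alive while 'tensor' is used
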
+ std::vector inputShape{test_size_tensor, mInputShapes[0][1]}; + + LOG(info) << "Creating memory info"; + // Ort::MemoryInfo mem_info("Hip", OrtAllocatorType::OrtArenaAllocator, device_id, OrtMemType::OrtMemTypeDefault); + Ort::MemoryInfo mem_info("Cpu", OrtAllocatorType::OrtDeviceAllocator, device_id, OrtMemType::OrtMemTypeDefault); + + LOG(info) << "Creating ONNX tensor"; + std::vector> input_tensor(execution_threads); + if(dtype=="FP16"){ + std::vector input_data(mInputShapes[0][1] * test_size_tensor, (Ort::Float16_t)1.f); // Example input + for(int i = 0; i < execution_threads; i++){ + for(int j = 0; j < test_num_tensors; j++){ + // auto input_data_allocator = allocator.GetAllocation(input_data.size() * sizeof(Ort::Float16_t)); + // (void)hipMemcpy(input_data_allocator.get(), input_data.data(), sizeof(Ort::Float16_t) * input_data.size(), hipMemcpyHostToDevice); + // input_tensor[i].emplace_back(Ort::Value::CreateTensor(mem_info, reinterpret_cast(input_data_allocator.get()), input_data.size(), inputShape.data(), inputShape.size())); + input_tensor[i].emplace_back(Ort::Value::CreateTensor(mem_info, input_data.data(), input_data.size(), inputShape.data(), inputShape.size())); + } + } + } else { + std::vector input_data(mInputShapes[0][1] * test_size_tensor, 1.0f); // Example input + for(int i = 0; i < execution_threads; i++){ + for(int j = 0; j < test_num_tensors; j++){ + input_tensor[i].emplace_back(Ort::Value::CreateTensor(mem_info, input_data.data(), input_data.size(), inputShape.data(), inputShape.size())); + } + } + } + + LOG(info) << "Starting inference"; + for(int i = 0; i < test_size_iter; i++){ + auto start_network_eval = std::chrono::high_resolution_clock::now(); + runONNXGPUModel(input_tensor); + // std::vector output = model.inference(test); + auto end_network_eval = std::chrono::high_resolution_clock::now(); + time += std::chrono::duration>(end_network_eval - start_network_eval).count(); + if((i % epochs_measure == 0) && (i != 0)){ + time /= 1e9; + LOG(info) << "Total time: " << time << "s. 
Timing: " << uint64_t((double)test_size_tensor*epochs_measure*execution_threads*test_num_tensors/time) << " elements / s"; + time = 0; + } + } + + // for(auto out : output){ + // LOG(info) << "Test output: " << out; + // } + pc.services().get().endOfStream(); + pc.services().get().readyToQuit(QuitRequest::Me); + }; + + private: + + std::vector model_buffer; + std::string model_path, device, dtype; + int device_id, execution_threads, threads_per_session_cpu, enable_profiling, logging_level, enable_optimizations; + size_t test_size_iter, test_size_tensor, epochs_measure, test_num_tensors; + + Ort::RunOptions runOptions; + std::vector env; + std::vector> session; + Ort::SessionOptions session_options; + Ort::AllocatorWithDefaultOptions allocator; + + std::vector inputNamesChar, outputNamesChar; + std::vector mInputNames; + std::vector> mInputShapes; + std::vector mOutputNames; + std::vector> mOutputShapes; + + std::string printShape(const std::vector& v) + { + std::stringstream ss(""); + for (size_t i = 0; i < v.size() - 1; i++) + ss << v[i] << "x"; + ss << v[v.size() - 1]; + return ss.str(); + }; }; } } void customize(std::vector& workflowOptions) { - std::vector options{ - {"path", VariantType::String, "./model.pt", {"Path to ONNX model"}}, - {"device", VariantType::String, "CPU", {"Device on which the ONNX model is run"}}, - {"device-id", VariantType::Int, 0, {"Device ID on which the ONNX model is run"}}, - {"dtype", VariantType::String, "-", {"Dtype in which the ONNX model is run (FP16 or FP32)"}}, - {"size-tensor", VariantType::Int, 100, {"Size of the input tensor"}}, - {"execution-threads", VariantType::Int, 1, {"If > 1 will run session->Run() with multiple threads as execution providers"}}, - {"threads-per-session-cpu", VariantType::Int, 0, {"Number of threads per session for CPU execution provider"}}, - {"num-tensors", VariantType::Int, 1, {"Number of tensors on which execution is being performed"}}, - {"num-iter", VariantType::Int, 100, {"Number of iterations"}}, - {"measure-cycle", VariantType::Int, 10, {"Epochs in which to measure"}}, - {"enable-profiling", VariantType::Int, 0, {"Enable profiling"}}, - {"profiling-output-path", VariantType::String, "/scratch/csonnabe/O2_new", {"Path to save profiling output"}}, - {"logging-level", VariantType::Int, 0, {"Logging level"}}, - {"enable-optimizations", VariantType::Int, 0, {"Enable optimizations"}} - }; - std::swap(workflowOptions, options); + std::vector options{ + {"path", VariantType::String, "./model.pt", {"Path to ONNX model"}}, + {"device", VariantType::String, "CPU", {"Device on which the ONNX model is run"}}, + {"device-id", VariantType::Int, 0, {"Device ID on which the ONNX model is run"}}, + {"dtype", VariantType::String, "-", {"Dtype in which the ONNX model is run (FP16 or FP32)"}}, + {"size-tensor", VariantType::Int, 100, {"Size of the input tensor"}}, + {"execution-threads", VariantType::Int, 1, {"If > 1 will run session->Run() with multiple threads as execution providers"}}, + {"threads-per-session-cpu", VariantType::Int, 0, {"Number of threads per session for CPU execution provider"}}, + {"num-tensors", VariantType::Int, 1, {"Number of tensors on which execution is being performed"}}, + {"num-iter", VariantType::Int, 100, {"Number of iterations"}}, + {"measure-cycle", VariantType::Int, 10, {"Epochs in which to measure"}}, + {"enable-profiling", VariantType::Int, 0, {"Enable profiling"}}, + {"profiling-output-path", VariantType::String, "/scratch/csonnabe/O2_new", {"Path to save profiling output"}}, + {"logging-level", 
VariantType::Int, 1, {"Logging level"}}, + {"enable-optimizations", VariantType::Int, 0, {"Enable optimizations"}}, + {"allocate-device-memory", VariantType::Int, 0, {"Allocate the memory on device"}} + }; + std::swap(workflowOptions, options); } // --------------------------------- @@ -280,44 +301,45 @@ void customize(std::vector& workflowOptions) DataProcessorSpec testProcess(ConfigContext const& cfgc, std::vector& inputs, std::vector& outputs) { - // A copy of the global workflow options from customize() to pass to the task - std::unordered_map options_map{ - {"path", cfgc.options().get("path")}, - {"device", cfgc.options().get("device")}, - {"device-id", std::to_string(cfgc.options().get("device-id"))}, - {"dtype", cfgc.options().get("dtype")}, - {"size-tensor", std::to_string(cfgc.options().get("size-tensor"))}, - {"execution-threads", std::to_string(cfgc.options().get("execution-threads"))}, - {"threads-per-session-cpu", std::to_string(cfgc.options().get("threads-per-session-cpu"))}, - {"num-tensors", std::to_string(cfgc.options().get("num-tensors"))}, - {"num-iter", std::to_string(cfgc.options().get("num-iter"))}, - {"measure-cycle", std::to_string(cfgc.options().get("measure-cycle"))}, - {"enable-profiling", std::to_string(cfgc.options().get("enable-profiling"))}, - {"profiling-output-path", cfgc.options().get("profiling-output-path")}, - {"logging-level", std::to_string(cfgc.options().get("logging-level"))}, - {"enable-optimizations", std::to_string(cfgc.options().get("enable-optimizations"))} - }; - - return DataProcessorSpec{ - "test-onnx-gpu", - inputs, - outputs, - adaptFromTask(options_map), - Options{ - {"somethingElse", VariantType::String, "-", {"Something else"}} - } - }; + // A copy of the global workflow options from customize() to pass to the task + std::unordered_map options_map{ + {"path", cfgc.options().get("path")}, + {"device", cfgc.options().get("device")}, + {"device-id", std::to_string(cfgc.options().get("device-id"))}, + {"dtype", cfgc.options().get("dtype")}, + {"size-tensor", std::to_string(cfgc.options().get("size-tensor"))}, + {"execution-threads", std::to_string(cfgc.options().get("execution-threads"))}, + {"threads-per-session-cpu", std::to_string(cfgc.options().get("threads-per-session-cpu"))}, + {"num-tensors", std::to_string(cfgc.options().get("num-tensors"))}, + {"num-iter", std::to_string(cfgc.options().get("num-iter"))}, + {"measure-cycle", std::to_string(cfgc.options().get("measure-cycle"))}, + {"enable-profiling", std::to_string(cfgc.options().get("enable-profiling"))}, + {"profiling-output-path", cfgc.options().get("profiling-output-path")}, + {"logging-level", std::to_string(cfgc.options().get("logging-level"))}, + {"enable-optimizations", std::to_string(cfgc.options().get("enable-optimizations"))}, + {"allocate-device-memory", std::to_string(cfgc.options().get("allocate-device-memory"))} + }; + + return DataProcessorSpec{ + "test-onnx-gpu", + inputs, + outputs, + adaptFromTask(options_map), + Options{ + {"somethingElse", VariantType::String, "-", {"Something else"}} + } + }; } WorkflowSpec defineDataProcessing(ConfigContext const& cfgc) { - WorkflowSpec specs; + WorkflowSpec specs; - static std::vector inputs; - static std::vector outputs; + static std::vector inputs; + static std::vector outputs; - specs.push_back(testProcess(cfgc, inputs, outputs)); + specs.push_back(testProcess(cfgc, inputs, outputs)); - return specs; + return specs; } \ No newline at end of file diff --git a/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx 
b/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx new file mode 100644 index 0000000000000..0c9b9b48934e8 --- /dev/null +++ b/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx @@ -0,0 +1,227 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "Algorithm/RangeTokenizer.h" +#include "SimulationDataFormat/MCCompLabel.h" +#include "SimulationDataFormat/ConstMCTruthContainer.h" +#include "SimulationDataFormat/LabelContainer.h" +#include "SimulationDataFormat/IOMCTruthContainerView.h" +#include "SimulationDataFormat/MCTruthContainer.h" + +#include "Headers/DataHeader.h" + +#include "ML/ort_interface.h" + +#include "Steer/MCKinematicsReader.h" + +#include "DPLUtils/RootTreeReader.h" +#include "DPLUtils/MakeRootTreeWriterSpec.h" + +#include "DataFormatsTPC/WorkflowHelper.h" +#include "DataFormatsTPC/ClusterNativeHelper.h" +#include "DataFormatsTPC/ClusterNative.h" +#include "DataFormatsTPC/ClusterGroupAttribute.h" +#include "DataFormatsTPC/Constants.h" +#include "DataFormatsTPC/TrackTPC.h" +#include "DataFormatsGlobalTracking/TrackTuneParams.h" +#include "DataFormatsTPC/Defs.h" + +#include "TPCWorkflow/ProcessingHelpers.h" +#include "TPCQC/Clusters.h" +#include "TPCBase/Painter.h" +#include "TPCBase/CalDet.h" +#include "TPCBase/Mapper.h" + +#include "Framework/Logger.h" +#include "Framework/Task.h" +#include "Framework/DataProcessorSpec.h" +#include "Framework/ConfigParamRegistry.h" +#include "Framework/ControlService.h" +#include "Framework/CompletionPolicyHelpers.h" +#include "Framework/WorkflowSpec.h" +#include "Framework/CallbacksPolicy.h" + +#include "DetectorsRaw/HBFUtils.h" + +using namespace o2; +using namespace o2::ml; +using namespace o2::tpc; +using namespace o2::framework; + +namespace o2 +{ +namespace tpc +{ +class onnxInference : public Task +{ + public: + + onnxInference(std::unordered_map optionsMap) { + options_map = optionsMap; + models = std::vector(std::stoi(options_map["execution-threads"])); + for(int thrd = 0; thrd < std::stoi(options_map["execution-threads"]); thrd++) { + models[thrd].init(options_map); + } + }; + + template + void runONNXGPUModel(std::vector>& input, int execution_threads) { + std::vector threads(execution_threads); + for (int thrd = 0; thrd < execution_threads; thrd++) { + threads[thrd] = std::thread([&, thrd] { + auto outputTensors = models[thrd].inference(input[thrd]); + }); + } + for (auto& thread : threads) { + thread.join(); + } + }; + + template + void runONNXGPUModel(std::vector>>& input, int execution_threads) { + std::vector threads(execution_threads); + for (int thrd = 0; thrd < execution_threads; thrd++) { + threads[thrd] = std::thread([&, thrd] { + auto outputTensors = models[thrd].inference(input[thrd]); + }); + } + for (auto& thread : threads) { + thread.join(); + } + }; + + void init(InitContext& ic) final {}; + void run(ProcessingContext& pc) final { + double time = 0; + int test_size_tensor = std::stoi(options_map["size-tensor"]); + int epochs_measure = std::stoi(options_map["measure-cycle"]); + int execution_threads = std::stoi(options_map["execution-threads"]); + int test_num_tensors = std::stoi(options_map["num-tensors"]); + int test_size_iter = std::stoi(options_map["num-iter"]); + + LOG(info) << "Preparing input data"; + // Prepare input data + std::vector inputShape{test_size_tensor, models[0].getNumInputNodes()[0][1]}; + + LOG(info) << "Creating ONNX tensor"; + std::vector> 
input_tensor(execution_threads); + std::vector input_data(models[0].getNumInputNodes()[0][1] * test_size_tensor, Ort::Float16_t(1.0f)); // Example input + for(int i = 0; i < execution_threads; i++){ + input_tensor[i] = input_data; + // input_tensor[i].resize(test_num_tensors); + // for(int j = 0; j < test_num_tensors; j++){ + // input_tensor[i][j] = input_data; + // } + } + + LOG(info) << "Starting inference"; + for(int i = 0; i < test_size_iter; i++){ + auto start_network_eval = std::chrono::high_resolution_clock::now(); + runONNXGPUModel(input_tensor, execution_threads); + auto end_network_eval = std::chrono::high_resolution_clock::now(); + time += std::chrono::duration>(end_network_eval - start_network_eval).count(); + if((i % epochs_measure == 0) && (i != 0)){ + time /= 1e9; + LOG(info) << "Total time: " << time << "s. Timing: " << uint64_t((double)test_size_tensor*epochs_measure*execution_threads*test_num_tensors/time) << " elements / s"; + time = 0; + } + } + + // for(auto out : output){ + // LOG(info) << "Test output: " << out; + // } + pc.services().get().endOfStream(); + pc.services().get().readyToQuit(QuitRequest::Me); + }; + + private: + std::vector models; + std::unordered_map options_map; +}; +} +} + +void customize(std::vector& workflowOptions) +{ + std::vector options{ + {"path", VariantType::String, "./model.pt", {"Path to ONNX model"}}, + {"device", VariantType::String, "CPU", {"Device on which the ONNX model is run"}}, + {"device-id", VariantType::Int, 0, {"Device ID on which the ONNX model is run"}}, + {"dtype", VariantType::String, "-", {"Dtype in which the ONNX model is run (FP16 or FP32)"}}, + {"size-tensor", VariantType::Int, 100, {"Size of the input tensor"}}, + {"execution-threads", VariantType::Int, 1, {"If > 1 will run session->Run() with multiple threads as execution providers"}}, + {"intra-op-num-threads", VariantType::Int, 0, {"Number of threads per session for CPU execution provider"}}, + {"num-tensors", VariantType::Int, 1, {"Number of tensors on which execution is being performed"}}, + {"num-iter", VariantType::Int, 100, {"Number of iterations"}}, + {"measure-cycle", VariantType::Int, 10, {"Epochs in which to measure"}}, + {"enable-profiling", VariantType::Int, 0, {"Enable profiling"}}, + {"profiling-output-path", VariantType::String, "/scratch/csonnabe/O2_new", {"Path to save profiling output"}}, + {"logging-level", VariantType::Int, 1, {"Logging level"}}, + {"enable-optimizations", VariantType::Int, 0, {"Enable optimizations"}}, + {"allocate-device-memory", VariantType::Int, 0, {"Allocate the memory on device"}} + }; + std::swap(workflowOptions, options); +} + +// --------------------------------- +#include "Framework/runDataProcessing.h" + +DataProcessorSpec testProcess(ConfigContext const& cfgc, std::vector& inputs, std::vector& outputs) +{ + + // A copy of the global workflow options from customize() to pass to the task + std::unordered_map options_map{ + {"model-path", cfgc.options().get("path")}, + {"device", cfgc.options().get("device")}, + {"device-id", std::to_string(cfgc.options().get("device-id"))}, + {"dtype", cfgc.options().get("dtype")}, + {"size-tensor", std::to_string(cfgc.options().get("size-tensor"))}, + {"intra-op-num-threads", std::to_string(cfgc.options().get("intra-op-num-threads"))}, + {"execution-threads", std::to_string(cfgc.options().get("execution-threads"))}, + {"num-tensors", std::to_string(cfgc.options().get("num-tensors"))}, + {"num-iter", std::to_string(cfgc.options().get("num-iter"))}, + {"measure-cycle", 
std::to_string(cfgc.options().get("measure-cycle"))}, + {"enable-profiling", std::to_string(cfgc.options().get("enable-profiling"))}, + {"profiling-output-path", cfgc.options().get("profiling-output-path")}, + {"logging-level", std::to_string(cfgc.options().get("logging-level"))}, + {"enable-optimizations", std::to_string(cfgc.options().get("enable-optimizations"))}, + {"allocate-device-memory", std::to_string(cfgc.options().get("allocate-device-memory"))} + }; + + return DataProcessorSpec{ + "test-onnx-interface", + inputs, + outputs, + adaptFromTask(options_map), + Options{ + {"somethingElse", VariantType::String, "-", {"Something else"}} + } + }; +} + +WorkflowSpec defineDataProcessing(ConfigContext const& cfgc) +{ + + WorkflowSpec specs; + + static std::vector inputs; + static std::vector outputs; + + specs.push_back(testProcess(cfgc, inputs, outputs)); + + return specs; +} \ No newline at end of file From 792e7ec2bb05fddd9bae67304d5757fb5cc21fca Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Mon, 16 Sep 2024 22:57:03 +0200 Subject: [PATCH 03/21] Finally fixing casting issue with Oet::Float16_t and OrtDataType::Float16_t. ort_interface.h now usuable as library (not header!) :party: --- Common/ML/include/ML/ort_interface.h | 15 +-- Common/ML/src/ort_interface.cxx | 100 +++++++++++------- .../test/test_onnx_interface_headers.cxx | 14 +-- 3 files changed, 74 insertions(+), 55 deletions(-) diff --git a/Common/ML/include/ML/ort_interface.h b/Common/ML/include/ML/ort_interface.h index 752acf33ef99a..5266f7d390024 100644 --- a/Common/ML/include/ML/ort_interface.h +++ b/Common/ML/include/ML/ort_interface.h @@ -23,9 +23,6 @@ #include #include -// ONNX includes -#include - // O2 includes #include "GPUORTFloat16.h" #include "Framework/Logger.h" @@ -71,16 +68,12 @@ class OrtModel std::vector getOutputNames() const { return mOutputNames; } void setActiveThreads(int threads) { intraOpNumThreads = threads; } - + private: - // ORT runtime objects - Ort::RunOptions runOptions; - std::shared_ptr env = nullptr; - std::shared_ptr session = nullptr; ///< ONNX session - Ort::SessionOptions sessionOptions; - Ort::AllocatorWithDefaultOptions allocator; - Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); + // ORT variables -> need to be hidden as Pimpl + struct OrtVariables; + OrtVariables* pImplOrt; // Input & Output specifications of the loaded network std::vector inputNamesChar, outputNamesChar; diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx index ad02d2bd63a86..47c2144b7293f 100644 --- a/Common/ML/src/ort_interface.cxx +++ b/Common/ML/src/ort_interface.cxx @@ -11,17 +11,33 @@ /// \file ort_interface.cxx /// \author Christian Sonnabend -/// \brief A header library for loading ONNX models and inferencing them on CPU and GPU +/// \brief A header library for loading ONNX models and inferencing them on CPU and GPU #include "ML/ort_interface.h" +// ONNX includes +#include + namespace o2 { namespace ml { +struct OrtModel::OrtVariables { // The actual implementation is hidden in the .cxx file + // ORT runtime objects + Ort::RunOptions runOptions; + std::shared_ptr env = nullptr; + std::shared_ptr session = nullptr; ///< ONNX session + Ort::SessionOptions sessionOptions; + Ort::AllocatorWithDefaultOptions allocator; + Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); +}; + void OrtModel::reset(std::unordered_map optionsMap){ + 
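This is where the Pimpl indirection introduced in ort_interface.h takes effect: all Ort:: objects now live in OrtModel::OrtVariables, which the header only forward-declares and this translation unit defines, so code that includes ML/ort_interface.h no longer needs the ONNX Runtime headers. A minimal, generic sketch of the idiom, using hypothetical Widget/Impl names rather than the actual O2 classes:

    // widget.h : public header, no third-party includes, only an opaque pointer
    class Widget
    {
     public:
      Widget();
      ~Widget(); // defined in the .cxx, where Impl is a complete type
      void doWork();

     private:
      struct Impl; // forward declaration only
      Impl* mImpl; // the heavy dependency hides behind this pointer
    };

    // widget.cxx : the heavy header (here that would be onnxruntime_cxx_api.h) stays private to this file
    struct Widget::Impl {
      int state = 0; // stand-in for Ort::Env, Ort::Session, Ort::SessionOptions, ...
    };
    Widget::Widget() : mImpl(new Impl()) {}
    Widget::~Widget() { delete mImpl; }
    void Widget::doWork() { mImpl->state++; }

Since pImplOrt is created with a plain new just below, the matching delete (or a std::unique_ptr<OrtVariables> together with an out-of-line destructor) belongs in OrtModel's destructor, which lies outside this hunk.
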
+ pImplOrt = new OrtVariables(); + // Load from options map if(!optionsMap.contains("model-path")){ LOG(fatal) << "(ORT) Model path cannot be empty!"; @@ -37,64 +53,64 @@ void OrtModel::reset(std::unordered_map optionsMap){ enableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0); if(device == "rocm") { - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(sessionOptions, deviceId)); + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) ROCM execution provider set"; } else if(device == "migraphx") { - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(sessionOptions, deviceId)); + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) MIGraphX execution provider set"; } if(allocateDeviceMemory){ - memoryInfo = Ort::MemoryInfo("Hip", OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); + pImplOrt->memoryInfo = Ort::MemoryInfo("Hip", OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); LOG(info) << "(ORT) Memory info set to on-device memory (HIP)"; } #if defined(__CUDACC__) - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(sessionOptions, deviceId)); + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId)); if(allocateDeviceMemory){ - memoryInfo = Ort::MemoryInfo("Cuda", OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); + pImplOrt->memoryInfo = Ort::MemoryInfo("Cuda", OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); LOG(info) << "(ORT) Memory info set to on-device memory (CUDA)"; } #endif if(device == "cpu") { - sessionOptions.SetIntraOpNumThreads(intraOpNumThreads); + (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); if(intraOpNumThreads > 1){ - sessionOptions.SetExecutionMode(ExecutionMode::ORT_PARALLEL); + (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); } else if(intraOpNumThreads == 1){ - sessionOptions.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); + (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); } LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " threads"; } - sessionOptions.DisableMemPattern(); - sessionOptions.DisableCpuMemArena(); + (pImplOrt->sessionOptions).DisableMemPattern(); + (pImplOrt->sessionOptions).DisableCpuMemArena(); if(enableProfiling){ if(optionsMap.contains("profiling-output-path")){ - sessionOptions.EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); + (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); } else { LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. 
Disabling profiling for now."; - sessionOptions.DisableProfiling(); + (pImplOrt->sessionOptions).DisableProfiling(); } } else { - sessionOptions.DisableProfiling(); + (pImplOrt->sessionOptions).DisableProfiling(); } - sessionOptions.SetGraphOptimizationLevel(GraphOptimizationLevel(enableOptimizations)); - sessionOptions.SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); + (pImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(enableOptimizations)); + (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); - env = std::make_shared(OrtLoggingLevel(loggingLevel), (optionsMap["onnx-environment-name"].empty() ? "onnx_model_inference" : optionsMap["onnx-environment-name"].c_str())); - session.reset(new Ort::Session{*env, modelPath.c_str(), sessionOptions}); + pImplOrt->env = std::make_shared(OrtLoggingLevel(loggingLevel), (optionsMap["onnx-environment-name"].empty() ? "onnx_model_inference" : optionsMap["onnx-environment-name"].c_str())); + (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); - for (size_t i = 0; i < session->GetInputCount(); ++i) { - mInputNames.push_back(session->GetInputNameAllocated(i, allocator).get()); + for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { + mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); } - for (size_t i = 0; i < session->GetInputCount(); ++i) { - mInputShapes.emplace_back(session->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { + mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); } - for (size_t i = 0; i < session->GetOutputCount(); ++i) { - mOutputNames.push_back(session->GetOutputNameAllocated(i, allocator).get()); + for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { + mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); } - for (size_t i = 0; i < session->GetOutputCount(); ++i) { - mOutputShapes.emplace_back(session->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { + mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); } inputNamesChar.resize(mInputNames.size(), nullptr); @@ -117,7 +133,7 @@ void OrtModel::reset(std::unordered_map optionsMap){ } void OrtModel::resetSession() { - session.reset(new Ort::Session{*env, modelPath.c_str(), sessionOptions}); + (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); } template @@ -136,9 +152,9 @@ template // class I is the input data type, e.g. 
float, class std::vector OrtModel::inference(std::vector& input){ std::vector inputShape{input.size() / mInputShapes[0][1], mInputShapes[0][1]}; std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(memoryInfo, (v2v(input)).data(), input.size(), inputShape.data(), inputShape.size())); + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (v2v(input)).data(), input.size(), inputShape.data(), inputShape.size())); // input.clear(); - auto outputTensors = session->Run(runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); O* outputValues = outputTensors[0].template GetTensorMutableData(); outputTensors.clear(); return std::vector{outputValues, outputValues + input.size() * mOutputShapes[0][1]}; @@ -149,10 +165,10 @@ std::vector OrtModel::inference(std::vector>& input){ std::vector inputTensor; for(auto i : input){ std::vector inputShape{i.size() / mInputShapes[0][1], mInputShapes[0][1]}; - inputTensor.emplace_back(Ort::Value::CreateTensor(memoryInfo, (v2v(i)).data(), i.size(), inputShape.data(), inputShape.size())); + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (v2v(i)).data(), i.size(), inputShape.data(), inputShape.size())); } // input.clear(); - auto outputTensors = session->Run(runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); O* outputValues = outputTensors[0].template GetTensorMutableData(); outputTensors.clear(); return std::vector{outputValues, outputValues + input.size() * mOutputShapes[0][1]}; @@ -164,7 +180,7 @@ std::vector OrtModel::inference(std::vector>& input){ // std::vector inputTensor; // inputTensor.emplace_back(Ort::Value::CreateTensor(memoryInfo, (v2v(input)).data(), input.size(), inputShape.data(), inputShape.size())); // input.clear(); -// auto outputTensors = session->Run(runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); +// auto outputTensors = (pImplOrt->session)->Run(runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); // O* outputValues = outputTensors[0].template GetTensorMutableData(); // outputTensors.clear(); // return std::vector{outputValues, outputValues + input.size() * mOutputShapes[0][1]}; @@ -179,13 +195,23 @@ std::string OrtModel::printShape(const std::vector& v) return ss.str(); } -template std::vector OrtModel::v2v(std::vector&, bool); +// template std::vector OrtModel::v2v(std::vector&, bool); + +// template std::vector OrtModel::inference(std::vector&); -template std::vector OrtModel::inference(std::vector&); -template std::vector OrtModel::inference(std::vector&); +template <> std::vector OrtModel::inference(std::vector& input) { + std::vector inputShape{input.size() / mInputShapes[0][1], mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = 
(pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + outputTensors.clear(); + return std::vector{outputValues, outputValues + input.size() * mOutputShapes[0][1]}; +} -template std::vector OrtModel::inference(std::vector>&); -template std::vector OrtModel::inference(std::vector>&); +// template std::vector OrtModel::inference(std::vector>&); +// template std::vector OrtModel::inference(std::vector>&); // template std::vector OrtModel::v2v(std::vector&, bool); // template <> std::vector OrtModel::v2v(std::vector& input, bool clearInput) { diff --git a/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx b/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx index 0c9b9b48934e8..d4ddeef0a6401 100644 --- a/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx +++ b/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx @@ -118,8 +118,8 @@ class onnxInference : public Task std::vector inputShape{test_size_tensor, models[0].getNumInputNodes()[0][1]}; LOG(info) << "Creating ONNX tensor"; - std::vector> input_tensor(execution_threads); - std::vector input_data(models[0].getNumInputNodes()[0][1] * test_size_tensor, Ort::Float16_t(1.0f)); // Example input + std::vector> input_tensor(execution_threads); + std::vector input_data(models[0].getNumInputNodes()[0][1] * test_size_tensor, OrtDataType::Float16_t(1.0f)); // Example input for(int i = 0; i < execution_threads; i++){ input_tensor[i] = input_data; // input_tensor[i].resize(test_num_tensors); @@ -129,15 +129,15 @@ class onnxInference : public Task } LOG(info) << "Starting inference"; + auto start_network_eval = std::chrono::high_resolution_clock::now(); for(int i = 0; i < test_size_iter; i++){ - auto start_network_eval = std::chrono::high_resolution_clock::now(); - runONNXGPUModel(input_tensor, execution_threads); - auto end_network_eval = std::chrono::high_resolution_clock::now(); - time += std::chrono::duration>(end_network_eval - start_network_eval).count(); + runONNXGPUModel(input_tensor, execution_threads); if((i % epochs_measure == 0) && (i != 0)){ - time /= 1e9; + auto end_network_eval = std::chrono::high_resolution_clock::now(); + time = std::chrono::duration>(end_network_eval - start_network_eval).count()/1e9; LOG(info) << "Total time: " << time << "s. Timing: " << uint64_t((double)test_size_tensor*epochs_measure*execution_threads*test_num_tensors/time) << " elements / s"; time = 0; + start_network_eval = std::chrono::high_resolution_clock::now(); } } From 59ba4d9d7f3402c5fe4cf2df054d0c8821b31875 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 17 Sep 2024 13:19:54 +0200 Subject: [PATCH 04/21] Using reinterpret_cast for type conversion --- Common/ML/src/ort_interface.cxx | 79 +++++++++++---------------------- 1 file changed, 27 insertions(+), 52 deletions(-) diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx index 47c2144b7293f..e91ba8faf3e7b 100644 --- a/Common/ML/src/ort_interface.cxx +++ b/Common/ML/src/ort_interface.cxx @@ -152,10 +152,10 @@ template // class I is the input data type, e.g. 
float, class std::vector OrtModel::inference(std::vector& input){ std::vector inputShape{input.size() / mInputShapes[0][1], mInputShapes[0][1]}; std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (v2v(input)).data(), input.size(), inputShape.data(), inputShape.size())); + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(input)).data(), input.size(), inputShape.data(), inputShape.size())); // input.clear(); auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - O* outputValues = outputTensors[0].template GetTensorMutableData(); + O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); outputTensors.clear(); return std::vector{outputValues, outputValues + input.size() * mOutputShapes[0][1]}; } @@ -165,27 +165,15 @@ std::vector OrtModel::inference(std::vector>& input){ std::vector inputTensor; for(auto i : input){ std::vector inputShape{i.size() / mInputShapes[0][1], mInputShapes[0][1]}; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (v2v(i)).data(), i.size(), inputShape.data(), inputShape.size())); + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(i)).data(), i.size(), inputShape.data(), inputShape.size())); } // input.clear(); auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - O* outputValues = outputTensors[0].template GetTensorMutableData(); + O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); outputTensors.clear(); return std::vector{outputValues, outputValues + input.size() * mOutputShapes[0][1]}; } -// template // class I is the input data type, e.g. float, class O is the output data type, e.g. 
O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -// std::vector OrtModel::inference(std::vector& input){ -// std::vector inputShape{input.size(), mInputShapes[0][1]}; -// std::vector inputTensor; -// inputTensor.emplace_back(Ort::Value::CreateTensor(memoryInfo, (v2v(input)).data(), input.size(), inputShape.data(), inputShape.size())); -// input.clear(); -// auto outputTensors = (pImplOrt->session)->Run(runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); -// O* outputValues = outputTensors[0].template GetTensorMutableData(); -// outputTensors.clear(); -// return std::vector{outputValues, outputValues + input.size() * mOutputShapes[0][1]}; -// } - std::string OrtModel::printShape(const std::vector& v) { std::stringstream ss(""); @@ -195,10 +183,6 @@ std::string OrtModel::printShape(const std::vector& v) return ss.str(); } -// template std::vector OrtModel::v2v(std::vector&, bool); - -// template std::vector OrtModel::inference(std::vector&); - template <> std::vector OrtModel::inference(std::vector& input) { std::vector inputShape{input.size() / mInputShapes[0][1], mInputShapes[0][1]}; std::vector inputTensor; @@ -210,38 +194,29 @@ template <> std::vector OrtModel::inference{outputValues, outputValues + input.size() * mOutputShapes[0][1]}; } -// template std::vector OrtModel::inference(std::vector>&); -// template std::vector OrtModel::inference(std::vector>&); - -// template std::vector OrtModel::v2v(std::vector&, bool); -// template <> std::vector OrtModel::v2v(std::vector& input, bool clearInput) { -// std::vector output(input.size()); -// std::transform(std::begin(input), std::end(input), std::begin(output), [](OrtDataType::Float16_t f) { return Ort::Float16_t::FromBits(f.val); }); -// if(clearInput) input.clear(); -// return output; -// }; -// template <> std::vector OrtModel::v2v(std::vector& input, bool clearInput) { -// std::vector output(input.size()); -// std::transform(std::begin(input), std::end(input), std::begin(output), [](Ort::Float16_t f) { return OrtDataType::Float16_t::FromBits(f.val); }); -// if(clearInput) input.clear(); -// return output; -// }; -// template std::vector OrtModel::v2v(std::vector&, bool); -// -// // template std::vector OrtModel::inference(std::vector&); -// // template std::vector OrtModel::inference(std::vector&); -// // template std::vector OrtModel::inference(std::vector&); -// -// template <> std::vector OrtModel::inference(std::vector& input){ -// return OrtModel::inference(input); -// }; -// template <> std::vector OrtModel::inference(std::vector& input) { -// return OrtModel::inference(input); -// }; -// -// template <> std::vector OrtModel::inference(std::vector& input) { -// return OrtModel::inference(input); -// }; +template <> std::vector OrtModel::inference(std::vector& input) { + std::vector inputShape{input.size() / mInputShapes[0][1], mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + outputTensors.clear(); + return std::vector{outputValues, outputValues + input.size() * 
mOutputShapes[0][1]}; +} + +template <> std::vector OrtModel::inference(std::vector>& input) { + std::vector inputTensor; + for(auto i : input){ + std::vector inputShape{i.size() / mInputShapes[0][1], mInputShapes[0][1]}; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); + } + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + outputTensors.clear(); + return std::vector{outputValues, outputValues + input.size() * mOutputShapes[0][1]}; +} } // namespace ml From ae21472d9bc3d081e4830a7b0c7235305ce1e404 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 17 Sep 2024 13:26:33 +0200 Subject: [PATCH 05/21] Modifying test script --- Detectors/TPC/workflow/CMakeLists.txt | 5 - .../workflow/test/test_onnx_gpu_inference.cxx | 345 ------------------ 2 files changed, 350 deletions(-) delete mode 100644 Detectors/TPC/workflow/test/test_onnx_gpu_inference.cxx diff --git a/Detectors/TPC/workflow/CMakeLists.txt b/Detectors/TPC/workflow/CMakeLists.txt index 4060a4b7832f2..5bb4c49ab1075 100644 --- a/Detectors/TPC/workflow/CMakeLists.txt +++ b/Detectors/TPC/workflow/CMakeLists.txt @@ -81,11 +81,6 @@ if(OpenMP_CXX_FOUND) target_link_libraries(${mergertargetName} PRIVATE OpenMP::OpenMP_CXX) endif() -o2_add_executable(onnx-gpu - COMPONENT_NAME test - SOURCES test/test_onnx_gpu_inference.cxx - PUBLIC_LINK_LIBRARIES O2::TPCWorkflow O2::SimulationDataFormat O2::TPCQC O2::DataFormatsTPC O2::TPCBase ONNXRuntime::ONNXRuntime Boost::thread O2::GPUTracking) - o2_add_executable(onnx-interface COMPONENT_NAME test SOURCES test/test_onnx_interface_headers.cxx diff --git a/Detectors/TPC/workflow/test/test_onnx_gpu_inference.cxx b/Detectors/TPC/workflow/test/test_onnx_gpu_inference.cxx deleted file mode 100644 index 4d5f2af587a4c..0000000000000 --- a/Detectors/TPC/workflow/test/test_onnx_gpu_inference.cxx +++ /dev/null @@ -1,345 +0,0 @@ -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "Algorithm/RangeTokenizer.h" -#include "SimulationDataFormat/MCCompLabel.h" -#include "SimulationDataFormat/ConstMCTruthContainer.h" -#include "SimulationDataFormat/LabelContainer.h" -#include "SimulationDataFormat/IOMCTruthContainerView.h" -#include "SimulationDataFormat/MCTruthContainer.h" - -#include "Headers/DataHeader.h" - -#include "Steer/MCKinematicsReader.h" - -#include "DPLUtils/RootTreeReader.h" -#include "DPLUtils/MakeRootTreeWriterSpec.h" - -#include "DataFormatsTPC/WorkflowHelper.h" -#include "DataFormatsTPC/ClusterNativeHelper.h" -#include "DataFormatsTPC/ClusterNative.h" -#include "DataFormatsTPC/ClusterGroupAttribute.h" -#include "DataFormatsTPC/Constants.h" -#include "DataFormatsTPC/TrackTPC.h" -#include "DataFormatsGlobalTracking/TrackTuneParams.h" -#include "DataFormatsTPC/Defs.h" - -#include "TPCWorkflow/ProcessingHelpers.h" -#include "TPCQC/Clusters.h" -#include "TPCBase/Painter.h" -#include "TPCBase/CalDet.h" -#include "TPCBase/Mapper.h" - -#include "Framework/Logger.h" -#include "Framework/Task.h" -#include "Framework/DataProcessorSpec.h" -#include "Framework/ConfigParamRegistry.h" -#include "Framework/ControlService.h" -#include 
"Framework/CompletionPolicyHelpers.h" -#include "Framework/WorkflowSpec.h" -#include "Framework/CallbacksPolicy.h" - -#include "DetectorsRaw/HBFUtils.h" - -using namespace o2; -using namespace o2::tpc; -using namespace o2::framework; - -namespace o2 -{ -namespace tpc -{ -class onnxGPUinference : public Task -{ - public: - - onnxGPUinference(std::unordered_map options_map) { - // Options map - model_path = options_map["path"]; - device = options_map["device"]; - dtype = options_map["dtype"]; - std::stringstream(options_map["device-id"]) >> device_id; - std::stringstream(options_map["num-iter"]) >> test_size_iter; - std::stringstream(options_map["execution-threads"]) >> execution_threads; - std::stringstream(options_map["threads-per-session-cpu"]) >> threads_per_session_cpu; - std::stringstream(options_map["num-tensors"]) >> test_num_tensors; - std::stringstream(options_map["size-tensor"]) >> test_size_tensor; - std::stringstream(options_map["measure-cycle"]) >> epochs_measure; - std::stringstream(options_map["logging-level"]) >> logging_level; - std::stringstream(options_map["enable-profiling"]) >> enable_profiling; - std::stringstream(options_map["enable-optimizations"]) >> enable_optimizations; - - LOG(info) << "Options loaded"; - - execution_threads = std::min((int)execution_threads, (int)boost::thread::hardware_concurrency()); - - // Set the environment variable to use ROCm execution provider - if(device=="ROCM"){ - // OrtROCMProviderOptions* rocm_options = nullptr; - // OrtApi::CreateROCMProviderOptions(&rocm_options); - // std::vector keys{"device_id", "gpu_mem_limit", "arena_extend_strategy"}; - // std::vector values{options_map["device-id"].c_str(), "34342961152", "kSameAsRequested"}; - // OrtApi::UpdateROCMProviderOptions(rocm_options, keys.data(), values.data(), keys.size()); - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(session_options, device_id)); - session_options.DisableMemPattern(); - session_options.DisableCpuMemArena(); - LOG(info) << "ROCM execution provider set"; - } else if (device=="MIGRAPHX") { - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(session_options, device_id)); - LOG(info) << "MIGraphX execution provider set"; - } else if(device=="CPU"){ - session_options.SetIntraOpNumThreads(threads_per_session_cpu); - if(threads_per_session_cpu > 0){ - LOG(info) << "CPU execution provider set with " << threads_per_session_cpu << " threads"; - } else { - threads_per_session_cpu = 0; - LOG(info) << "CPU execution provider set with default number of threads"; - } - if(threads_per_session_cpu > 1){ - session_options.SetExecutionMode(ExecutionMode::ORT_PARALLEL); - } - } else { - LOG(fatal) << "Device not recognized"; - } - // std::vector providers = session.GetProviders(); - // for (const auto& provider : providers) { - // LOG(info) << "Using execution provider: " << provider << std::endl; - // } - - if((bool)enable_profiling){ - LOG(info) << "Profiling enabled"; - session_options.EnableProfiling((options_map["profiling-output-path"] + "/ORT_LOG_").c_str()); - } else { - LOG(info) << "Profiling disabled"; - session_options.DisableProfiling(); - } - if(enable_optimizations){ - session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED); - } - session_options.SetLogSeverityLevel(logging_level); - - env.resize(execution_threads); - session.resize(execution_threads); - for(int s = 0; s < execution_threads; s++){ - env[s] = Ort::Env(OrtLoggingLevel(logging_level), "onnx_model_inference"); - session[s].reset(new 
Ort::Session{env[s], model_path.c_str(), session_options}); - } - LOG(info) << "Sessions created"; - - LOG(info) << "Number of iterations: " << test_size_iter << ", size of the test tensor: " << test_size_tensor << ", measuring every " << epochs_measure << " cycles, number of tensors: " << test_num_tensors << ", execution threads: " << execution_threads; - - for (size_t i = 0; i < session[0]->GetInputCount(); ++i) { - mInputNames.push_back(session[0]->GetInputNameAllocated(i, allocator).get()); - } - for (size_t i = 0; i < session[0]->GetInputCount(); ++i) { - mInputShapes.emplace_back(session[0]->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); - } - for (size_t i = 0; i < session[0]->GetOutputCount(); ++i) { - mOutputNames.push_back(session[0]->GetOutputNameAllocated(i, allocator).get()); - } - for (size_t i = 0; i < session[0]->GetOutputCount(); ++i) { - mOutputShapes.emplace_back(session[0]->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); - } - - LOG(info) << "Initializing ONNX names and sizes"; - inputNamesChar.resize(mInputNames.size(), nullptr); - std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), - [&](const std::string& str) { return str.c_str(); }); - outputNamesChar.resize(mOutputNames.size(), nullptr); - std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), - [&](const std::string& str) { return str.c_str(); }); - - // Print names - LOG(info) << "Input Nodes:"; - for (size_t i = 0; i < mInputNames.size(); i++) { - LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); - } - - LOG(info) << "Output Nodes:"; - for (size_t i = 0; i < mOutputNames.size(); i++) { - LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); - } - }; - - void runONNXGPUModel(std::vector>& input) { - std::vector threads(execution_threads); - for (int thrd = 0; thrd < execution_threads; thrd++) { - threads[thrd] = std::thread([&, thrd] { - auto outputTensors = session[thrd]->Run(runOptions, inputNamesChar.data(), input[thrd].data(), input[thrd].size(), outputNamesChar.data(), outputNamesChar.size()); - }); - } - for (auto& thread : threads) { - thread.join(); - } - }; - - void init(InitContext& ic) final {}; - void run(ProcessingContext& pc) final { - double time = 0; - - LOG(info) << "Preparing input data"; - // Prepare input data - std::vector inputShape{test_size_tensor, mInputShapes[0][1]}; - - LOG(info) << "Creating memory info"; - // Ort::MemoryInfo mem_info("Hip", OrtAllocatorType::OrtArenaAllocator, device_id, OrtMemType::OrtMemTypeDefault); - Ort::MemoryInfo mem_info("Cpu", OrtAllocatorType::OrtDeviceAllocator, device_id, OrtMemType::OrtMemTypeDefault); - - LOG(info) << "Creating ONNX tensor"; - std::vector> input_tensor(execution_threads); - if(dtype=="FP16"){ - std::vector input_data(mInputShapes[0][1] * test_size_tensor, (Ort::Float16_t)1.f); // Example input - for(int i = 0; i < execution_threads; i++){ - for(int j = 0; j < test_num_tensors; j++){ - // auto input_data_allocator = allocator.GetAllocation(input_data.size() * sizeof(Ort::Float16_t)); - // (void)hipMemcpy(input_data_allocator.get(), input_data.data(), sizeof(Ort::Float16_t) * input_data.size(), hipMemcpyHostToDevice); - // input_tensor[i].emplace_back(Ort::Value::CreateTensor(mem_info, reinterpret_cast(input_data_allocator.get()), input_data.size(), inputShape.data(), inputShape.size())); - input_tensor[i].emplace_back(Ort::Value::CreateTensor(mem_info, input_data.data(), 
input_data.size(), inputShape.data(), inputShape.size())); - } - } - } else { - std::vector input_data(mInputShapes[0][1] * test_size_tensor, 1.0f); // Example input - for(int i = 0; i < execution_threads; i++){ - for(int j = 0; j < test_num_tensors; j++){ - input_tensor[i].emplace_back(Ort::Value::CreateTensor(mem_info, input_data.data(), input_data.size(), inputShape.data(), inputShape.size())); - } - } - } - - LOG(info) << "Starting inference"; - for(int i = 0; i < test_size_iter; i++){ - auto start_network_eval = std::chrono::high_resolution_clock::now(); - runONNXGPUModel(input_tensor); - // std::vector output = model.inference(test); - auto end_network_eval = std::chrono::high_resolution_clock::now(); - time += std::chrono::duration>(end_network_eval - start_network_eval).count(); - if((i % epochs_measure == 0) && (i != 0)){ - time /= 1e9; - LOG(info) << "Total time: " << time << "s. Timing: " << uint64_t((double)test_size_tensor*epochs_measure*execution_threads*test_num_tensors/time) << " elements / s"; - time = 0; - } - } - - // for(auto out : output){ - // LOG(info) << "Test output: " << out; - // } - pc.services().get().endOfStream(); - pc.services().get().readyToQuit(QuitRequest::Me); - }; - - private: - - std::vector model_buffer; - std::string model_path, device, dtype; - int device_id, execution_threads, threads_per_session_cpu, enable_profiling, logging_level, enable_optimizations; - size_t test_size_iter, test_size_tensor, epochs_measure, test_num_tensors; - - Ort::RunOptions runOptions; - std::vector env; - std::vector> session; - Ort::SessionOptions session_options; - Ort::AllocatorWithDefaultOptions allocator; - - std::vector inputNamesChar, outputNamesChar; - std::vector mInputNames; - std::vector> mInputShapes; - std::vector mOutputNames; - std::vector> mOutputShapes; - - std::string printShape(const std::vector& v) - { - std::stringstream ss(""); - for (size_t i = 0; i < v.size() - 1; i++) - ss << v[i] << "x"; - ss << v[v.size() - 1]; - return ss.str(); - }; -}; -} -} - -void customize(std::vector& workflowOptions) -{ - std::vector options{ - {"path", VariantType::String, "./model.pt", {"Path to ONNX model"}}, - {"device", VariantType::String, "CPU", {"Device on which the ONNX model is run"}}, - {"device-id", VariantType::Int, 0, {"Device ID on which the ONNX model is run"}}, - {"dtype", VariantType::String, "-", {"Dtype in which the ONNX model is run (FP16 or FP32)"}}, - {"size-tensor", VariantType::Int, 100, {"Size of the input tensor"}}, - {"execution-threads", VariantType::Int, 1, {"If > 1 will run session->Run() with multiple threads as execution providers"}}, - {"threads-per-session-cpu", VariantType::Int, 0, {"Number of threads per session for CPU execution provider"}}, - {"num-tensors", VariantType::Int, 1, {"Number of tensors on which execution is being performed"}}, - {"num-iter", VariantType::Int, 100, {"Number of iterations"}}, - {"measure-cycle", VariantType::Int, 10, {"Epochs in which to measure"}}, - {"enable-profiling", VariantType::Int, 0, {"Enable profiling"}}, - {"profiling-output-path", VariantType::String, "/scratch/csonnabe/O2_new", {"Path to save profiling output"}}, - {"logging-level", VariantType::Int, 1, {"Logging level"}}, - {"enable-optimizations", VariantType::Int, 0, {"Enable optimizations"}}, - {"allocate-device-memory", VariantType::Int, 0, {"Allocate the memory on device"}} - }; - std::swap(workflowOptions, options); -} - -// --------------------------------- -#include "Framework/runDataProcessing.h" - -DataProcessorSpec 
testProcess(ConfigContext const& cfgc, std::vector& inputs, std::vector& outputs) -{ - - // A copy of the global workflow options from customize() to pass to the task - std::unordered_map options_map{ - {"path", cfgc.options().get("path")}, - {"device", cfgc.options().get("device")}, - {"device-id", std::to_string(cfgc.options().get("device-id"))}, - {"dtype", cfgc.options().get("dtype")}, - {"size-tensor", std::to_string(cfgc.options().get("size-tensor"))}, - {"execution-threads", std::to_string(cfgc.options().get("execution-threads"))}, - {"threads-per-session-cpu", std::to_string(cfgc.options().get("threads-per-session-cpu"))}, - {"num-tensors", std::to_string(cfgc.options().get("num-tensors"))}, - {"num-iter", std::to_string(cfgc.options().get("num-iter"))}, - {"measure-cycle", std::to_string(cfgc.options().get("measure-cycle"))}, - {"enable-profiling", std::to_string(cfgc.options().get("enable-profiling"))}, - {"profiling-output-path", cfgc.options().get("profiling-output-path")}, - {"logging-level", std::to_string(cfgc.options().get("logging-level"))}, - {"enable-optimizations", std::to_string(cfgc.options().get("enable-optimizations"))}, - {"allocate-device-memory", std::to_string(cfgc.options().get("allocate-device-memory"))} - }; - - return DataProcessorSpec{ - "test-onnx-gpu", - inputs, - outputs, - adaptFromTask(options_map), - Options{ - {"somethingElse", VariantType::String, "-", {"Something else"}} - } - }; -} - -WorkflowSpec defineDataProcessing(ConfigContext const& cfgc) -{ - - WorkflowSpec specs; - - static std::vector inputs; - static std::vector outputs; - - specs.push_back(testProcess(cfgc, inputs, outputs)); - - return specs; -} \ No newline at end of file From c9cd4fa132baa3739cce153e4acbccf5ad07cae3 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Tue, 17 Sep 2024 13:33:14 +0200 Subject: [PATCH 06/21] Adding source for float16 implementation --- Common/ML/include/ML/GPUORTFloat16.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Common/ML/include/ML/GPUORTFloat16.h b/Common/ML/include/ML/GPUORTFloat16.h index ce13f576205f8..9328e91e65517 100644 --- a/Common/ML/include/ML/GPUORTFloat16.h +++ b/Common/ML/include/ML/GPUORTFloat16.h @@ -11,7 +11,7 @@ /// \file GPUORTFloat16.h /// \author Christian Sonnabend -/// \brief An implementation of the ONNXRuntime Float16_t data-type for GPU acceleration +/// \brief An implementation of the ONNXRuntime Float16_t data-type for GPU acceleration (source https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_float16.h) #include #include From f4b74de679ea01b8dfe2ff37633be2c4d446cd90 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 27 Sep 2024 10:23:59 +0200 Subject: [PATCH 07/21] Minor update on the 3rd party libraries --- Common/ML/CMakeLists.txt | 2 +- .../include/ML/{ => 3rdparty}/GPUORTFloat16.h | 18 +++++------------- Common/ML/include/ML/ort_interface.h | 1 - Common/ML/src/ort_interface.cxx | 1 + .../test/test_onnx_interface_headers.cxx | 1 + 5 files changed, 8 insertions(+), 15 deletions(-) rename Common/ML/include/ML/{ => 3rdparty}/GPUORTFloat16.h (96%) diff --git a/Common/ML/CMakeLists.txt b/Common/ML/CMakeLists.txt index 777f474e687fa..954d29d6e2793 100644 --- a/Common/ML/CMakeLists.txt +++ b/Common/ML/CMakeLists.txt @@ -12,4 +12,4 @@ o2_add_library(ML SOURCES src/ort_interface.cxx TARGETVARNAME targetName - PUBLIC_LINK_LIBRARIES O2::Framework ONNXRuntime::ONNXRuntime) \ No newline at end of file + PRIVATE_LINK_LIBRARIES O2::Framework 
ONNXRuntime::ONNXRuntime) \ No newline at end of file diff --git a/Common/ML/include/ML/GPUORTFloat16.h b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h similarity index 96% rename from Common/ML/include/ML/GPUORTFloat16.h rename to Common/ML/include/ML/3rdparty/GPUORTFloat16.h index 9328e91e65517..db65328409d3c 100644 --- a/Common/ML/include/ML/GPUORTFloat16.h +++ b/Common/ML/include/ML/3rdparty/GPUORTFloat16.h @@ -1,17 +1,9 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. -/// \file GPUORTFloat16.h -/// \author Christian Sonnabend -/// \brief An implementation of the ONNXRuntime Float16_t data-type for GPU acceleration (source https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_float16.h) +// This code was created from: +// - https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_float16.h +// - https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_cxx_api.h #include #include diff --git a/Common/ML/include/ML/ort_interface.h b/Common/ML/include/ML/ort_interface.h index 5266f7d390024..a365860db3279 100644 --- a/Common/ML/include/ML/ort_interface.h +++ b/Common/ML/include/ML/ort_interface.h @@ -24,7 +24,6 @@ #include // O2 includes -#include "GPUORTFloat16.h" #include "Framework/Logger.h" namespace o2 diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx index e91ba8faf3e7b..14113582bd7b4 100644 --- a/Common/ML/src/ort_interface.cxx +++ b/Common/ML/src/ort_interface.cxx @@ -14,6 +14,7 @@ /// \brief A header library for loading ONNX models and inferencing them on CPU and GPU #include "ML/ort_interface.h" +#include "ML/3rdparty/GPUORTFloat16.h" // ONNX includes #include diff --git a/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx b/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx index d4ddeef0a6401..c42faae731857 100644 --- a/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx +++ b/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx @@ -25,6 +25,7 @@ #include "Headers/DataHeader.h" #include "ML/ort_interface.h" +#include "ML/3rdparty/GPUORTFloat16.h" #include "Steer/MCKinematicsReader.h" From 62eadea32fba1db6f7abb846a2719a45669c6bf8 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 3 Oct 2024 13:49:01 +0200 Subject: [PATCH 08/21] Changing to #ifdef statements for O2 compilation --- Common/ML/src/ort_interface.cxx | 63 ++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 17 deletions(-) diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx index 14113582bd7b4..b7800a707c880 100644 --- a/Common/ML/src/ort_interface.cxx +++ b/Common/ML/src/ort_interface.cxx @@ -44,7 +44,7 @@ void OrtModel::reset(std::unordered_map optionsMap){ LOG(fatal) << "(ORT) Model path cannot be empty!"; } modelPath = optionsMap["model-path"]; - device = (optionsMap.contains("device") ? 
optionsMap["device"] : "cpu"); + device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU"); dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); @@ -53,26 +53,33 @@ void OrtModel::reset(std::unordered_map optionsMap){ enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); enableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0); - if(device == "rocm") { + std::string dev_mem_str = "Hip"; +#ifdef ORT_ROCM_BUILD + if(device == "ROCM") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) ROCM execution provider set"; - } else if(device == "migraphx") { + } +#endif +#ifdef ORT_MIGRAPHX_BUILD + if(device == "MIGRAPHX") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) MIGraphX execution provider set"; } - if(allocateDeviceMemory){ - pImplOrt->memoryInfo = Ort::MemoryInfo("Hip", OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); - LOG(info) << "(ORT) Memory info set to on-device memory (HIP)"; +#endif +#ifdef ORT_CUDA_BUILD + if(device == "CUDA") { + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId)); + LOG(info) << "(ORT) CUDA execution provider set"; + dev_mem_str = "Cuda"; } -#if defined(__CUDACC__) - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId)); +#endif + if(allocateDeviceMemory){ - pImplOrt->memoryInfo = Ort::MemoryInfo("Cuda", OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); - LOG(info) << "(ORT) Memory info set to on-device memory (CUDA)"; + pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); + LOG(info) << "(ORT) Memory info set to on-device memory"; } -#endif - if(device == "cpu") { + if(device == "CPU") { (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); if(intraOpNumThreads > 1){ (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); @@ -158,7 +165,7 @@ std::vector OrtModel::inference(std::vector& input){ auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); outputTensors.clear(); - return std::vector{outputValues, outputValues + input.size() * mOutputShapes[0][1]}; + return std::vector{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; } template // class I is the input data type, e.g. float, class O is the output data type, e.g. 
O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h @@ -172,7 +179,7 @@ std::vector OrtModel::inference(std::vector>& input){ auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); outputTensors.clear(); - return std::vector{outputValues, outputValues + input.size() * mOutputShapes[0][1]}; + return std::vector{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; } std::string OrtModel::printShape(const std::vector& v) @@ -184,6 +191,28 @@ std::string OrtModel::printShape(const std::vector& v) return ss.str(); } +template <> std::vector OrtModel::inference(std::vector& input) { + std::vector inputShape{input.size() / mInputShapes[0][1], mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + float* outputValues = outputTensors[0].template GetTensorMutableData(); + outputTensors.clear(); + return std::vector{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; +} + +template <> std::vector OrtModel::inference(std::vector& input) { + std::vector inputShape{input.size() / mInputShapes[0][1], mInputShapes[0][1]}; + std::vector inputTensor; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); + // input.clear(); + auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); + float* outputValues = outputTensors[0].template GetTensorMutableData(); + outputTensors.clear(); + return std::vector{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; +} + template <> std::vector OrtModel::inference(std::vector& input) { std::vector inputShape{input.size() / mInputShapes[0][1], mInputShapes[0][1]}; std::vector inputTensor; @@ -192,7 +221,7 @@ template <> std::vector OrtModel::inferencesession)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); outputTensors.clear(); - return std::vector{outputValues, outputValues + input.size() * mOutputShapes[0][1]}; + return std::vector{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; } template <> std::vector OrtModel::inference(std::vector& input) { @@ -203,7 +232,7 @@ template <> std::vector OrtModel::inferencesession)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); outputTensors.clear(); - return std::vector{outputValues, outputValues + input.size() * mOutputShapes[0][1]}; + return std::vector{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; } template <> std::vector 
OrtModel::inference(std::vector>& input) { @@ -216,7 +245,7 @@ template <> std::vector OrtModel::inferencesession)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); outputTensors.clear(); - return std::vector{outputValues, outputValues + input.size() * mOutputShapes[0][1]}; + return std::vector{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; } } // namespace ml From 224a137a61148b568114e0ec6e501dc543695bc6 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Thu, 3 Oct 2024 11:49:37 +0000 Subject: [PATCH 09/21] Please consider the following formatting changes --- Common/ML/include/ML/ort_interface.h | 76 +++-- Common/ML/src/ort_interface.cxx | 86 +++--- .../test/test_onnx_interface_headers.cxx | 271 +++++++++--------- 3 files changed, 223 insertions(+), 210 deletions(-) diff --git a/Common/ML/include/ML/ort_interface.h b/Common/ML/include/ML/ort_interface.h index a365860db3279..2fe9a44a0623c 100644 --- a/Common/ML/include/ML/ort_interface.h +++ b/Common/ML/include/ML/ort_interface.h @@ -35,60 +35,58 @@ namespace ml class OrtModel { - public: - // Constructor - OrtModel() = default; - OrtModel(std::unordered_map optionsMap){ reset(optionsMap); } - void init(std::unordered_map optionsMap){ reset(optionsMap); } - void reset(std::unordered_map); + public: + // Constructor + OrtModel() = default; + OrtModel(std::unordered_map optionsMap) { reset(optionsMap); } + void init(std::unordered_map optionsMap) { reset(optionsMap); } + void reset(std::unordered_map); - virtual ~OrtModel() = default; + virtual ~OrtModel() = default; - // Conversion - template - std::vector v2v(std::vector&, bool = true); + // Conversion + template + std::vector v2v(std::vector&, bool = true); - // Inferencing - template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h - std::vector inference(std::vector&); + // Inferencing + template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h + std::vector inference(std::vector&); - template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h - std::vector inference(std::vector>&); + template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h + std::vector inference(std::vector>&); - // template // class I is the input data type, e.g. float, class T the throughput data type and class O is the output data type - // std::vector inference(std::vector&); + // template // class I is the input data type, e.g. 
float, class T the throughput data type and class O is the output data type + // std::vector inference(std::vector&); - // Reset session - void resetSession(); + // Reset session + void resetSession(); - std::vector> getNumInputNodes() const { return mInputShapes; } - std::vector> getNumOutputNodes() const { return mOutputShapes; } - std::vector getInputNames() const { return mInputNames; } - std::vector getOutputNames() const { return mOutputNames; } + std::vector> getNumInputNodes() const { return mInputShapes; } + std::vector> getNumOutputNodes() const { return mOutputShapes; } + std::vector getInputNames() const { return mInputNames; } + std::vector getOutputNames() const { return mOutputNames; } - void setActiveThreads(int threads) { intraOpNumThreads = threads; } + void setActiveThreads(int threads) { intraOpNumThreads = threads; } - private: + private: + // ORT variables -> need to be hidden as Pimpl + struct OrtVariables; + OrtVariables* pImplOrt; - // ORT variables -> need to be hidden as Pimpl - struct OrtVariables; - OrtVariables* pImplOrt; + // Input & Output specifications of the loaded network + std::vector inputNamesChar, outputNamesChar; + std::vector mInputNames, mOutputNames; + std::vector> mInputShapes, mOutputShapes; - // Input & Output specifications of the loaded network - std::vector inputNamesChar, outputNamesChar; - std::vector mInputNames, mOutputNames; - std::vector> mInputShapes, mOutputShapes; - - // Environment settings - std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda - int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; - - std::string printShape(const std::vector&); + // Environment settings + std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda + int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; + std::string printShape(const std::vector&); }; } // namespace ml -} // namespace ml +} // namespace o2 #endif // O2_ML_ORT_INTERFACE_H \ No newline at end of file diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx index b7800a707c880..9686437006ffc 100644 --- a/Common/ML/src/ort_interface.cxx +++ b/Common/ML/src/ort_interface.cxx @@ -25,7 +25,7 @@ namespace o2 namespace ml { -struct OrtModel::OrtVariables { // The actual implementation is hidden in the .cxx file +struct OrtModel::OrtVariables { // The actual implementation is hidden in the .cxx file // ORT runtime objects Ort::RunOptions runOptions; std::shared_ptr env = nullptr; @@ -35,12 +35,13 @@ struct OrtModel::OrtVariables { // The actual implementation is hidden in the . Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); }; -void OrtModel::reset(std::unordered_map optionsMap){ +void OrtModel::reset(std::unordered_map optionsMap) +{ pImplOrt = new OrtVariables(); // Load from options map - if(!optionsMap.contains("model-path")){ + if (!optionsMap.contains("model-path")) { LOG(fatal) << "(ORT) Model path cannot be empty!"; } modelPath = optionsMap["model-path"]; @@ -48,42 +49,42 @@ void OrtModel::reset(std::unordered_map optionsMap){ dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); deviceId = (optionsMap.contains("device-id") ? 
std::stoi(optionsMap["device-id"]) : 0); allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); - intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); + intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); loggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 0); enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); enableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0); std::string dev_mem_str = "Hip"; #ifdef ORT_ROCM_BUILD - if(device == "ROCM") { + if (device == "ROCM") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) ROCM execution provider set"; } #endif #ifdef ORT_MIGRAPHX_BUILD - if(device == "MIGRAPHX") { + if (device == "MIGRAPHX") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) MIGraphX execution provider set"; } #endif #ifdef ORT_CUDA_BUILD - if(device == "CUDA") { + if (device == "CUDA") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) CUDA execution provider set"; dev_mem_str = "Cuda"; } #endif - if(allocateDeviceMemory){ + if (allocateDeviceMemory) { pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); LOG(info) << "(ORT) Memory info set to on-device memory"; } - if(device == "CPU") { + if (device == "CPU") { (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); - if(intraOpNumThreads > 1){ + if (intraOpNumThreads > 1) { (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); - } else if(intraOpNumThreads == 1){ + } else if (intraOpNumThreads == 1) { (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); } LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " threads"; @@ -92,8 +93,8 @@ void OrtModel::reset(std::unordered_map optionsMap){ (pImplOrt->sessionOptions).DisableMemPattern(); (pImplOrt->sessionOptions).DisableCpuMemArena(); - if(enableProfiling){ - if(optionsMap.contains("profiling-output-path")){ + if (enableProfiling) { + if (optionsMap.contains("profiling-output-path")) { (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); } else { LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. 
Disabling profiling for now."; @@ -109,24 +110,24 @@ void OrtModel::reset(std::unordered_map optionsMap){ (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); + mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); } for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); } for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); + mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); } for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); + mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); } inputNamesChar.resize(mInputNames.size(), nullptr); std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), - [&](const std::string& str) { return str.c_str(); }); + [&](const std::string& str) { return str.c_str(); }); outputNamesChar.resize(mOutputNames.size(), nullptr); std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), - [&](const std::string& str) { return str.c_str(); }); + [&](const std::string& str) { return str.c_str(); }); // Print names LOG(info) << "Input Nodes:"; @@ -140,24 +141,28 @@ void OrtModel::reset(std::unordered_map optionsMap){ } } -void OrtModel::resetSession() { +void OrtModel::resetSession() +{ (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); } -template -std::vector OrtModel::v2v(std::vector& input, bool clearInput) { - if constexpr (std::is_same_v){ +template +std::vector OrtModel::v2v(std::vector& input, bool clearInput) +{ + if constexpr (std::is_same_v) { return input; } else { std::vector output(input.size()); std::transform(std::begin(input), std::end(input), std::begin(output), [](I f) { return O(f); }); - if(clearInput) input.clear(); + if (clearInput) + input.clear(); return output; } } -template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -std::vector OrtModel::inference(std::vector& input){ +template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{input.size() / mInputShapes[0][1], mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(input)).data(), input.size(), inputShape.data(), inputShape.size())); @@ -168,10 +173,11 @@ std::vector OrtModel::inference(std::vector& input){ return std::vector{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; } -template // class I is the input data type, e.g. 
float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -std::vector OrtModel::inference(std::vector>& input){ +template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h +std::vector OrtModel::inference(std::vector>& input) +{ std::vector inputTensor; - for(auto i : input){ + for (auto i : input) { std::vector inputShape{i.size() / mInputShapes[0][1], mInputShapes[0][1]}; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(i)).data(), i.size(), inputShape.data(), inputShape.size())); } @@ -191,7 +197,9 @@ std::string OrtModel::printShape(const std::vector& v) return ss.str(); } -template <> std::vector OrtModel::inference(std::vector& input) { +template <> +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{input.size() / mInputShapes[0][1], mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); @@ -202,7 +210,9 @@ template <> std::vector OrtModel::inference(std::vector{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; } -template <> std::vector OrtModel::inference(std::vector& input) { +template <> +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{input.size() / mInputShapes[0][1], mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); @@ -213,7 +223,9 @@ template <> std::vector OrtModel::inference{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; } -template <> std::vector OrtModel::inference(std::vector& input) { +template <> +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{input.size() / mInputShapes[0][1], mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); @@ -224,7 +236,9 @@ template <> std::vector OrtModel::inference{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; } -template <> std::vector OrtModel::inference(std::vector& input) { +template <> +std::vector OrtModel::inference(std::vector& input) +{ std::vector inputShape{input.size() / mInputShapes[0][1], mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); @@ -235,9 +249,11 @@ template <> std::vector OrtModel::inference{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; } -template <> std::vector OrtModel::inference(std::vector>& input) { +template <> +std::vector OrtModel::inference(std::vector>& input) +{ std::vector inputTensor; - for(auto i : input){ + for (auto i : input) { std::vector inputShape{i.size() / mInputShapes[0][1], mInputShapes[0][1]}; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); } diff --git a/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx b/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx index c42faae731857..45a13bf874b0d 100644 --- 
a/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx +++ b/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx @@ -69,113 +69,115 @@ namespace tpc { class onnxInference : public Task { - public: - - onnxInference(std::unordered_map optionsMap) { - options_map = optionsMap; - models = std::vector(std::stoi(options_map["execution-threads"])); - for(int thrd = 0; thrd < std::stoi(options_map["execution-threads"]); thrd++) { - models[thrd].init(options_map); - } - }; - - template - void runONNXGPUModel(std::vector>& input, int execution_threads) { - std::vector threads(execution_threads); - for (int thrd = 0; thrd < execution_threads; thrd++) { - threads[thrd] = std::thread([&, thrd] { - auto outputTensors = models[thrd].inference(input[thrd]); - }); - } - for (auto& thread : threads) { - thread.join(); - } - }; - - template - void runONNXGPUModel(std::vector>>& input, int execution_threads) { - std::vector threads(execution_threads); - for (int thrd = 0; thrd < execution_threads; thrd++) { - threads[thrd] = std::thread([&, thrd] { - auto outputTensors = models[thrd].inference(input[thrd]); - }); - } - for (auto& thread : threads) { - thread.join(); - } - }; - - void init(InitContext& ic) final {}; - void run(ProcessingContext& pc) final { - double time = 0; - int test_size_tensor = std::stoi(options_map["size-tensor"]); - int epochs_measure = std::stoi(options_map["measure-cycle"]); - int execution_threads = std::stoi(options_map["execution-threads"]); - int test_num_tensors = std::stoi(options_map["num-tensors"]); - int test_size_iter = std::stoi(options_map["num-iter"]); - - LOG(info) << "Preparing input data"; - // Prepare input data - std::vector inputShape{test_size_tensor, models[0].getNumInputNodes()[0][1]}; - - LOG(info) << "Creating ONNX tensor"; - std::vector> input_tensor(execution_threads); - std::vector input_data(models[0].getNumInputNodes()[0][1] * test_size_tensor, OrtDataType::Float16_t(1.0f)); // Example input - for(int i = 0; i < execution_threads; i++){ - input_tensor[i] = input_data; - // input_tensor[i].resize(test_num_tensors); - // for(int j = 0; j < test_num_tensors; j++){ - // input_tensor[i][j] = input_data; - // } - } - - LOG(info) << "Starting inference"; - auto start_network_eval = std::chrono::high_resolution_clock::now(); - for(int i = 0; i < test_size_iter; i++){ - runONNXGPUModel(input_tensor, execution_threads); - if((i % epochs_measure == 0) && (i != 0)){ - auto end_network_eval = std::chrono::high_resolution_clock::now(); - time = std::chrono::duration>(end_network_eval - start_network_eval).count()/1e9; - LOG(info) << "Total time: " << time << "s. 
Timing: " << uint64_t((double)test_size_tensor*epochs_measure*execution_threads*test_num_tensors/time) << " elements / s"; - time = 0; - start_network_eval = std::chrono::high_resolution_clock::now(); - } - } - - // for(auto out : output){ - // LOG(info) << "Test output: " << out; - // } - pc.services().get().endOfStream(); - pc.services().get().readyToQuit(QuitRequest::Me); - }; - - private: - std::vector models; - std::unordered_map options_map; + public: + onnxInference(std::unordered_map optionsMap) + { + options_map = optionsMap; + models = std::vector(std::stoi(options_map["execution-threads"])); + for (int thrd = 0; thrd < std::stoi(options_map["execution-threads"]); thrd++) { + models[thrd].init(options_map); + } + }; + + template + void runONNXGPUModel(std::vector>& input, int execution_threads) + { + std::vector threads(execution_threads); + for (int thrd = 0; thrd < execution_threads; thrd++) { + threads[thrd] = std::thread([&, thrd] { + auto outputTensors = models[thrd].inference(input[thrd]); + }); + } + for (auto& thread : threads) { + thread.join(); + } + }; + + template + void runONNXGPUModel(std::vector>>& input, int execution_threads) + { + std::vector threads(execution_threads); + for (int thrd = 0; thrd < execution_threads; thrd++) { + threads[thrd] = std::thread([&, thrd] { + auto outputTensors = models[thrd].inference(input[thrd]); + }); + } + for (auto& thread : threads) { + thread.join(); + } + }; + + void init(InitContext& ic) final {}; + void run(ProcessingContext& pc) final + { + double time = 0; + int test_size_tensor = std::stoi(options_map["size-tensor"]); + int epochs_measure = std::stoi(options_map["measure-cycle"]); + int execution_threads = std::stoi(options_map["execution-threads"]); + int test_num_tensors = std::stoi(options_map["num-tensors"]); + int test_size_iter = std::stoi(options_map["num-iter"]); + + LOG(info) << "Preparing input data"; + // Prepare input data + std::vector inputShape{test_size_tensor, models[0].getNumInputNodes()[0][1]}; + + LOG(info) << "Creating ONNX tensor"; + std::vector> input_tensor(execution_threads); + std::vector input_data(models[0].getNumInputNodes()[0][1] * test_size_tensor, OrtDataType::Float16_t(1.0f)); // Example input + for (int i = 0; i < execution_threads; i++) { + input_tensor[i] = input_data; + // input_tensor[i].resize(test_num_tensors); + // for(int j = 0; j < test_num_tensors; j++){ + // input_tensor[i][j] = input_data; + // } + } + + LOG(info) << "Starting inference"; + auto start_network_eval = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < test_size_iter; i++) { + runONNXGPUModel(input_tensor, execution_threads); + if ((i % epochs_measure == 0) && (i != 0)) { + auto end_network_eval = std::chrono::high_resolution_clock::now(); + time = std::chrono::duration>(end_network_eval - start_network_eval).count() / 1e9; + LOG(info) << "Total time: " << time << "s. 
Timing: " << uint64_t((double)test_size_tensor * epochs_measure * execution_threads * test_num_tensors / time) << " elements / s"; + time = 0; + start_network_eval = std::chrono::high_resolution_clock::now(); + } + } + + // for(auto out : output){ + // LOG(info) << "Test output: " << out; + // } + pc.services().get().endOfStream(); + pc.services().get().readyToQuit(QuitRequest::Me); + }; + + private: + std::vector models; + std::unordered_map options_map; }; -} -} +} // namespace tpc +} // namespace o2 void customize(std::vector& workflowOptions) { - std::vector options{ - {"path", VariantType::String, "./model.pt", {"Path to ONNX model"}}, - {"device", VariantType::String, "CPU", {"Device on which the ONNX model is run"}}, - {"device-id", VariantType::Int, 0, {"Device ID on which the ONNX model is run"}}, - {"dtype", VariantType::String, "-", {"Dtype in which the ONNX model is run (FP16 or FP32)"}}, - {"size-tensor", VariantType::Int, 100, {"Size of the input tensor"}}, - {"execution-threads", VariantType::Int, 1, {"If > 1 will run session->Run() with multiple threads as execution providers"}}, - {"intra-op-num-threads", VariantType::Int, 0, {"Number of threads per session for CPU execution provider"}}, - {"num-tensors", VariantType::Int, 1, {"Number of tensors on which execution is being performed"}}, - {"num-iter", VariantType::Int, 100, {"Number of iterations"}}, - {"measure-cycle", VariantType::Int, 10, {"Epochs in which to measure"}}, - {"enable-profiling", VariantType::Int, 0, {"Enable profiling"}}, - {"profiling-output-path", VariantType::String, "/scratch/csonnabe/O2_new", {"Path to save profiling output"}}, - {"logging-level", VariantType::Int, 1, {"Logging level"}}, - {"enable-optimizations", VariantType::Int, 0, {"Enable optimizations"}}, - {"allocate-device-memory", VariantType::Int, 0, {"Allocate the memory on device"}} - }; - std::swap(workflowOptions, options); + std::vector options{ + {"path", VariantType::String, "./model.pt", {"Path to ONNX model"}}, + {"device", VariantType::String, "CPU", {"Device on which the ONNX model is run"}}, + {"device-id", VariantType::Int, 0, {"Device ID on which the ONNX model is run"}}, + {"dtype", VariantType::String, "-", {"Dtype in which the ONNX model is run (FP16 or FP32)"}}, + {"size-tensor", VariantType::Int, 100, {"Size of the input tensor"}}, + {"execution-threads", VariantType::Int, 1, {"If > 1 will run session->Run() with multiple threads as execution providers"}}, + {"intra-op-num-threads", VariantType::Int, 0, {"Number of threads per session for CPU execution provider"}}, + {"num-tensors", VariantType::Int, 1, {"Number of tensors on which execution is being performed"}}, + {"num-iter", VariantType::Int, 100, {"Number of iterations"}}, + {"measure-cycle", VariantType::Int, 10, {"Epochs in which to measure"}}, + {"enable-profiling", VariantType::Int, 0, {"Enable profiling"}}, + {"profiling-output-path", VariantType::String, "/scratch/csonnabe/O2_new", {"Path to save profiling output"}}, + {"logging-level", VariantType::Int, 1, {"Logging level"}}, + {"enable-optimizations", VariantType::Int, 0, {"Enable optimizations"}}, + {"allocate-device-memory", VariantType::Int, 0, {"Allocate the memory on device"}}}; + std::swap(workflowOptions, options); } // --------------------------------- @@ -184,45 +186,42 @@ void customize(std::vector& workflowOptions) DataProcessorSpec testProcess(ConfigContext const& cfgc, std::vector& inputs, std::vector& outputs) { - // A copy of the global workflow options from customize() to pass to the task - 
std::unordered_map options_map{ - {"model-path", cfgc.options().get("path")}, - {"device", cfgc.options().get("device")}, - {"device-id", std::to_string(cfgc.options().get("device-id"))}, - {"dtype", cfgc.options().get("dtype")}, - {"size-tensor", std::to_string(cfgc.options().get("size-tensor"))}, - {"intra-op-num-threads", std::to_string(cfgc.options().get("intra-op-num-threads"))}, - {"execution-threads", std::to_string(cfgc.options().get("execution-threads"))}, - {"num-tensors", std::to_string(cfgc.options().get("num-tensors"))}, - {"num-iter", std::to_string(cfgc.options().get("num-iter"))}, - {"measure-cycle", std::to_string(cfgc.options().get("measure-cycle"))}, - {"enable-profiling", std::to_string(cfgc.options().get("enable-profiling"))}, - {"profiling-output-path", cfgc.options().get("profiling-output-path")}, - {"logging-level", std::to_string(cfgc.options().get("logging-level"))}, - {"enable-optimizations", std::to_string(cfgc.options().get("enable-optimizations"))}, - {"allocate-device-memory", std::to_string(cfgc.options().get("allocate-device-memory"))} - }; - - return DataProcessorSpec{ - "test-onnx-interface", - inputs, - outputs, - adaptFromTask(options_map), - Options{ - {"somethingElse", VariantType::String, "-", {"Something else"}} - } - }; + // A copy of the global workflow options from customize() to pass to the task + std::unordered_map options_map{ + {"model-path", cfgc.options().get("path")}, + {"device", cfgc.options().get("device")}, + {"device-id", std::to_string(cfgc.options().get("device-id"))}, + {"dtype", cfgc.options().get("dtype")}, + {"size-tensor", std::to_string(cfgc.options().get("size-tensor"))}, + {"intra-op-num-threads", std::to_string(cfgc.options().get("intra-op-num-threads"))}, + {"execution-threads", std::to_string(cfgc.options().get("execution-threads"))}, + {"num-tensors", std::to_string(cfgc.options().get("num-tensors"))}, + {"num-iter", std::to_string(cfgc.options().get("num-iter"))}, + {"measure-cycle", std::to_string(cfgc.options().get("measure-cycle"))}, + {"enable-profiling", std::to_string(cfgc.options().get("enable-profiling"))}, + {"profiling-output-path", cfgc.options().get("profiling-output-path")}, + {"logging-level", std::to_string(cfgc.options().get("logging-level"))}, + {"enable-optimizations", std::to_string(cfgc.options().get("enable-optimizations"))}, + {"allocate-device-memory", std::to_string(cfgc.options().get("allocate-device-memory"))}}; + + return DataProcessorSpec{ + "test-onnx-interface", + inputs, + outputs, + adaptFromTask(options_map), + Options{ + {"somethingElse", VariantType::String, "-", {"Something else"}}}}; } WorkflowSpec defineDataProcessing(ConfigContext const& cfgc) { - WorkflowSpec specs; + WorkflowSpec specs; - static std::vector inputs; - static std::vector outputs; + static std::vector inputs; + static std::vector outputs; - specs.push_back(testProcess(cfgc, inputs, outputs)); + specs.push_back(testProcess(cfgc, inputs, outputs)); - return specs; + return specs; } \ No newline at end of file From 9c8f167df18c0e8b1e5bf02db23bb108660bac7e Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 28 Nov 2024 14:30:19 +0100 Subject: [PATCH 10/21] Changing names to CamelCase and adding test task for onnx model --- Common/ML/CMakeLists.txt | 16 +++++++++++++++- .../ML/{ort_interface.h => OrtInterface.h} | 4 ++-- .../src/{ort_interface.cxx => OrtInterface.cxx} | 16 +++++++++++----- .../test/test_onnx_interface_headers.cxx | 4 ++-- 4 files changed, 30 insertions(+), 10 deletions(-) rename 
Common/ML/include/ML/{ort_interface.h => OrtInterface.h} (98%) rename Common/ML/src/{ort_interface.cxx => OrtInterface.cxx} (98%) diff --git a/Common/ML/CMakeLists.txt b/Common/ML/CMakeLists.txt index 954d29d6e2793..94394ace6dc86 100644 --- a/Common/ML/CMakeLists.txt +++ b/Common/ML/CMakeLists.txt @@ -9,7 +9,21 @@ # granted to it by virtue of its status as an Intergovernmental Organization # or submit itself to any jurisdiction. +# Pass ORT variables as a preprocessor definition +if(DEFINED ENV{ORT_ROCM_BUILD}) + add_compile_definitions(ORT_ROCM_BUILD=$ENV{ORT_ROCM_BUILD}) +endif() +if(DEFINED ENV{ORT_CUDA_BUILD}) + add_compile_definitions(ORT_CUDA_BUILD=$ENV{ORT_CUDA_BUILD}) +endif() +if(DEFINED ENV{ORT_MIGRAPHX_BUILD}) + add_compile_definitions(ORT_MIGRAPHX_BUILD=$ENV{ORT_MIGRAPHX_BUILD}) +endif() +if(DEFINED ENV{ORT_TENSORRT_BUILD}) + add_compile_definitions(ORT_TENSORRT_BUILD=$ENV{ORT_TENSORRT_BUILD}) +endif() + o2_add_library(ML - SOURCES src/ort_interface.cxx + SOURCES src/OrtInterface.cxx TARGETVARNAME targetName PRIVATE_LINK_LIBRARIES O2::Framework ONNXRuntime::ONNXRuntime) \ No newline at end of file diff --git a/Common/ML/include/ML/ort_interface.h b/Common/ML/include/ML/OrtInterface.h similarity index 98% rename from Common/ML/include/ML/ort_interface.h rename to Common/ML/include/ML/OrtInterface.h index 2fe9a44a0623c..965e9365e4372 100644 --- a/Common/ML/include/ML/ort_interface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -9,7 +9,7 @@ // granted to it by virtue of its status as an Intergovernmental Organization // or submit itself to any jurisdiction. -/// \file ort_interface.h +/// \file OrtInterface.h /// \author Christian Sonnabend /// \brief A header library for loading ONNX models and inferencing them on CPU and GPU @@ -89,4 +89,4 @@ class OrtModel } // namespace o2 -#endif // O2_ML_ORT_INTERFACE_H \ No newline at end of file +#endif // O2_ML_OrtInterface_H \ No newline at end of file diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/OrtInterface.cxx similarity index 98% rename from Common/ML/src/ort_interface.cxx rename to Common/ML/src/OrtInterface.cxx index 9686437006ffc..2991565556882 100644 --- a/Common/ML/src/ort_interface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -9,11 +9,11 @@ // granted to it by virtue of its status as an Intergovernmental Organization // or submit itself to any jurisdiction. -/// \file ort_interface.cxx +/// \file OrtInterface.cxx /// \author Christian Sonnabend /// \brief A header library for loading ONNX models and inferencing them on CPU and GPU -#include "ML/ort_interface.h" +#include "ML/OrtInterface.h" #include "ML/3rdparty/GPUORTFloat16.h" // ONNX includes @@ -55,24 +55,30 @@ void OrtModel::reset(std::unordered_map optionsMap) enableOptimizations = (optionsMap.contains("enable-optimizations") ? 
std::stoi(optionsMap["enable-optimizations"]) : 0); std::string dev_mem_str = "Hip"; -#ifdef ORT_ROCM_BUILD +#if defined(ORT_ROCM_BUILD) + #if ORT_ROCM_BUILD == 1 if (device == "ROCM") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) ROCM execution provider set"; } + #endif #endif -#ifdef ORT_MIGRAPHX_BUILD +#if defined(ORT_MIGRAPHX_BUILD) + #if ORT_MIGRAPHX_BUILD == 1 if (device == "MIGRAPHX") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) MIGraphX execution provider set"; } + #endif #endif -#ifdef ORT_CUDA_BUILD +#if defined(ORT_CUDA_BUILD) + #if ORT_CUDA_BUILD == 1 if (device == "CUDA") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) CUDA execution provider set"; dev_mem_str = "Cuda"; } + #endif #endif if (allocateDeviceMemory) { diff --git a/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx b/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx index 45a13bf874b0d..62843c251132d 100644 --- a/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx +++ b/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx @@ -24,7 +24,7 @@ #include "Headers/DataHeader.h" -#include "ML/ort_interface.h" +#include "ML/OrtInterface.h" #include "ML/3rdparty/GPUORTFloat16.h" #include "Steer/MCKinematicsReader.h" @@ -139,7 +139,7 @@ class onnxInference : public Task if ((i % epochs_measure == 0) && (i != 0)) { auto end_network_eval = std::chrono::high_resolution_clock::now(); time = std::chrono::duration>(end_network_eval - start_network_eval).count() / 1e9; - LOG(info) << "Total time: " << time << "s. Timing: " << uint64_t((double)test_size_tensor * epochs_measure * execution_threads * test_num_tensors / time) << " elements / s"; + LOG(info) << "Total time: " << time << "s. Timing: " << uint64_t((double)test_size_tensor * epochs_measure * execution_threads / time) << " elements / s"; time = 0; start_network_eval = std::chrono::high_resolution_clock::now(); } From de1ae504d7e291fb258dadda9b884fedda8c5750 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Thu, 28 Nov 2024 15:01:55 +0100 Subject: [PATCH 11/21] Fixing warning of narrowing conversion --- Common/ML/src/OrtInterface.cxx | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 2991565556882..0f1bc36d939f7 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -169,7 +169,7 @@ std::vector OrtModel::v2v(std::vector& input, bool clearInput) template // class I is the input data type, e.g. float, class O is the output data type, e.g. 
O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h std::vector OrtModel::inference(std::vector& input) { - std::vector inputShape{input.size() / mInputShapes[0][1], mInputShapes[0][1]}; + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(input)).data(), input.size(), inputShape.data(), inputShape.size())); // input.clear(); @@ -184,7 +184,7 @@ std::vector OrtModel::inference(std::vector>& input) { std::vector inputTensor; for (auto i : input) { - std::vector inputShape{i.size() / mInputShapes[0][1], mInputShapes[0][1]}; + std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), mInputShapes[0][1]}; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(i)).data(), i.size(), inputShape.data(), inputShape.size())); } // input.clear(); @@ -206,7 +206,7 @@ std::string OrtModel::printShape(const std::vector& v) template <> std::vector OrtModel::inference(std::vector& input) { - std::vector inputShape{input.size() / mInputShapes[0][1], mInputShapes[0][1]}; + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); // input.clear(); @@ -219,7 +219,7 @@ std::vector OrtModel::inference(std::vector& input) template <> std::vector OrtModel::inference(std::vector& input) { - std::vector inputShape{input.size() / mInputShapes[0][1], mInputShapes[0][1]}; + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); // input.clear(); @@ -232,7 +232,7 @@ std::vector OrtModel::inference(std::vecto template <> std::vector OrtModel::inference(std::vector& input) { - std::vector inputShape{input.size() / mInputShapes[0][1], mInputShapes[0][1]}; + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); // input.clear(); @@ -245,7 +245,7 @@ std::vector OrtModel::inference std::vector OrtModel::inference(std::vector& input) { - std::vector inputShape{input.size() / mInputShapes[0][1], mInputShapes[0][1]}; + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); // input.clear(); @@ -260,7 +260,7 @@ std::vector OrtModel::inference inputTensor; for (auto i : input) { - std::vector inputShape{i.size() / mInputShapes[0][1], mInputShapes[0][1]}; + std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), mInputShapes[0][1]}; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); } // input.clear(); From a3bf6af90934a5e4ce2577d94f246209c64d7464 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 29 Nov 2024 11:59:03 +0100 Subject: [PATCH 12/21] Adding mapping 
of ORT logging to InfoLogger and disabling telemetry events --- Common/ML/src/OrtInterface.cxx | 44 ++++++++++++++----- .../test/test_onnx_interface_headers.cxx | 2 +- 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index 0f1bc36d939f7..a69808c6c9930 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -50,35 +50,35 @@ void OrtModel::reset(std::unordered_map optionsMap) deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); - loggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 0); + loggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 2); enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); enableOptimizations = (optionsMap.contains("enable-optimizations") ? std::stoi(optionsMap["enable-optimizations"]) : 0); std::string dev_mem_str = "Hip"; #if defined(ORT_ROCM_BUILD) - #if ORT_ROCM_BUILD == 1 +#if ORT_ROCM_BUILD == 1 if (device == "ROCM") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) ROCM execution provider set"; } - #endif +#endif #endif #if defined(ORT_MIGRAPHX_BUILD) - #if ORT_MIGRAPHX_BUILD == 1 +#if ORT_MIGRAPHX_BUILD == 1 if (device == "MIGRAPHX") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) MIGraphX execution provider set"; } - #endif +#endif #endif #if defined(ORT_CUDA_BUILD) - #if ORT_CUDA_BUILD == 1 +#if ORT_CUDA_BUILD == 1 if (device == "CUDA") { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId)); LOG(info) << "(ORT) CUDA execution provider set"; dev_mem_str = "Cuda"; } - #endif +#endif #endif if (allocateDeviceMemory) { @@ -112,7 +112,27 @@ void OrtModel::reset(std::unordered_map optionsMap) (pImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(enableOptimizations)); (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); - pImplOrt->env = std::make_shared(OrtLoggingLevel(loggingLevel), (optionsMap["onnx-environment-name"].empty() ? "onnx_model_inference" : optionsMap["onnx-environment-name"].c_str())); + pImplOrt->env = std::make_shared( + OrtLoggingLevel(loggingLevel), + (optionsMap["onnx-environment-name"].empty() ? 
"onnx_model_inference" : optionsMap["onnx-environment-name"].c_str()), + // Integrate ORT logging into Fairlogger + [](void* param, OrtLoggingLevel severity, const char* category, const char* logid, const char* code_location, const char* message) { + if(severity == ORT_LOGGING_LEVEL_VERBOSE) { + LOG(debug) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; + } else if(severity == ORT_LOGGING_LEVEL_INFO) { + LOG(info) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; + } else if(severity == ORT_LOGGING_LEVEL_WARNING) { + LOG(warning) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; + } else if(severity == ORT_LOGGING_LEVEL_ERROR) { + LOG(error) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; + } else if(severity == ORT_LOGGING_LEVEL_FATAL) { + LOG(fatal) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; + } else { + LOG(info) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; + } + }, + (void*)3); + (pImplOrt->env)->DisableTelemetryEvents(); // Disable telemetry events (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { @@ -136,14 +156,14 @@ void OrtModel::reset(std::unordered_map optionsMap) [&](const std::string& str) { return str.c_str(); }); // Print names - LOG(info) << "Input Nodes:"; + LOG(info) << "\tInput Nodes:"; for (size_t i = 0; i < mInputNames.size(); i++) { - LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); + LOG(info) << "\t\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); } - LOG(info) << "Output Nodes:"; + LOG(info) << "\tOutput Nodes:"; for (size_t i = 0; i < mOutputNames.size(); i++) { - LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); + LOG(info) << "\t\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); } } diff --git a/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx b/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx index 62843c251132d..b1a872f59f292 100644 --- a/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx +++ b/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx @@ -174,7 +174,7 @@ void customize(std::vector& workflowOptions) {"measure-cycle", VariantType::Int, 10, {"Epochs in which to measure"}}, {"enable-profiling", VariantType::Int, 0, {"Enable profiling"}}, {"profiling-output-path", VariantType::String, "/scratch/csonnabe/O2_new", {"Path to save profiling output"}}, - {"logging-level", VariantType::Int, 1, {"Logging level"}}, + {"logging-level", VariantType::Int, 2, {"Logging level"}}, {"enable-optimizations", VariantType::Int, 0, {"Enable optimizations"}}, {"allocate-device-memory", VariantType::Int, 0, {"Allocate the memory on device"}}}; std::swap(workflowOptions, options); From a19e595fd1b73efa21a1b20980ec4861aa434008 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 29 Nov 2024 12:12:17 +0100 Subject: [PATCH 13/21] Removing old files and adding whitespace --- Common/ML/include/ML/ort_interface.h | 92 ------ Common/ML/src/ort_interface.cxx | 280 ------------------ .../test/test_onnx_interface_headers.cxx | 2 +- 3 files changed, 1 insertion(+), 373 deletions(-) delete mode 100644 Common/ML/include/ML/ort_interface.h delete mode 100644 Common/ML/src/ort_interface.cxx diff --git 
a/Common/ML/include/ML/ort_interface.h b/Common/ML/include/ML/ort_interface.h deleted file mode 100644 index e2049b8508cb4..0000000000000 --- a/Common/ML/include/ML/ort_interface.h +++ /dev/null @@ -1,92 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. -// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. - -/// \file ort_interface.h -/// \author Christian Sonnabend -/// \brief A header library for loading ONNX models and inferencing them on CPU and GPU - -#ifndef O2_ML_ONNX_INTERFACE_H -#define O2_ML_ONNX_INTERFACE_H - -// C++ and system includes -#include -#include -#include -#include -#include - -// O2 includes -#include "Framework/Logger.h" - -namespace o2 -{ - -namespace ml -{ - -class OrtModel -{ - - public: - // Constructor - OrtModel() = default; - OrtModel(std::unordered_map optionsMap) { reset(optionsMap); } - void init(std::unordered_map optionsMap) { reset(optionsMap); } - void reset(std::unordered_map); - - virtual ~OrtModel() = default; - - // Conversion - template - std::vector v2v(std::vector&, bool = true); - - // Inferencing - template // class I is the input data type, e.g. float, class O is the output data type, e.g. OrtDataType::Float16_t from O2/Common/ML/include/ML/GPUORTFloat16.h - std::vector inference(std::vector&); - - template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h - std::vector inference(std::vector>&); - - // template // class I is the input data type, e.g. float, class T the throughput data type and class O is the output data type - // std::vector inference(std::vector&); - - // Reset session - void resetSession(); - - std::vector> getNumInputNodes() const { return mInputShapes; } - std::vector> getNumOutputNodes() const { return mOutputShapes; } - std::vector getInputNames() const { return mInputNames; } - std::vector getOutputNames() const { return mOutputNames; } - - void setActiveThreads(int threads) { intraOpNumThreads = threads; } - - private: - // ORT variables -> need to be hidden as Pimpl - struct OrtVariables; - OrtVariables* pImplOrt; - - // Input & Output specifications of the loaded network - std::vector inputNamesChar, outputNamesChar; - std::vector mInputNames, mOutputNames; - std::vector> mInputShapes, mOutputShapes; - - // Environment settings - std::string modelPath, device = "cpu", dtype = "float"; // device options should be cpu, rocm, migraphx, cuda - int intraOpNumThreads = 0, deviceId = 0, enableProfiling = 0, loggingLevel = 0, allocateDeviceMemory = 0, enableOptimizations = 0; - - std::string printShape(const std::vector&); -}; - -} // namespace ml - -} // namespace o2 - -#endif // O2_ML_ORT_INTERFACE_H diff --git a/Common/ML/src/ort_interface.cxx b/Common/ML/src/ort_interface.cxx deleted file mode 100644 index 27ac8eee16b7b..0000000000000 --- a/Common/ML/src/ort_interface.cxx +++ /dev/null @@ -1,280 +0,0 @@ -// Copyright 2019-2020 CERN and copyright holders of ALICE O2. -// See https://alice-o2.web.cern.ch/copyright for details of the copyright holders. 
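For orientation, here is a hedged usage sketch of the OrtModel API whose old copy is deleted above; the class itself lives on in Common/ML/include/ML/OrtInterface.h. The option keys mirror the ones parsed in reset(), while the model path, batch size and return handling are placeholders.

#include "ML/OrtInterface.h"
#include <string>
#include <unordered_map>
#include <vector>

int main()
{
  // Options map with the keys read by OrtModel::reset().
  std::unordered_map<std::string, std::string> opts{
    {"model-path", "/path/to/model.onnx"}, // placeholder
    {"device", "CPU"},                     // "ROCM", "MIGRAPHX" or "CUDA" on GPU-enabled builds
    {"device-id", "0"},
    {"intra-op-num-threads", "4"},
    {"logging-level", "2"},
    {"onnx-environment-name", "onnx_model_inference"}};

  o2::ml::OrtModel model(opts); // the constructor simply forwards to reset()

  // Flat input: N rows times nFeatures values, nFeatures taken from the model itself.
  const auto nFeatures = model.getNumInputNodes()[0][1];
  std::vector<float> input(10 * nFeatures, 0.5f);
  std::vector<float> output = model.inference<float, float>(input);
  return output.empty() ? 1 : 0;
}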
-// All rights not expressly granted are reserved. -// -// This software is distributed under the terms of the GNU General Public -// License v3 (GPL Version 3), copied verbatim in the file "COPYING". -// -// In applying this license CERN does not waive the privileges and immunities -// granted to it by virtue of its status as an Intergovernmental Organization -// or submit itself to any jurisdiction. - -/// \file ort_interface.cxx -/// \author Christian Sonnabend -/// \brief A header library for loading ONNX models and inferencing them on CPU and GPU - -#include "ML/ort_interface.h" -#include "ML/3rdparty/GPUORTFloat16.h" - -// ONNX includes -#include - -namespace o2 -{ - -namespace ml -{ - -struct OrtModel::OrtVariables { // The actual implementation is hidden in the .cxx file - // ORT runtime objects - Ort::RunOptions runOptions; - std::shared_ptr env = nullptr; - std::shared_ptr session = nullptr; ///< ONNX session - Ort::SessionOptions sessionOptions; - Ort::AllocatorWithDefaultOptions allocator; - Ort::MemoryInfo memoryInfo = Ort::MemoryInfo("Cpu", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault); -}; - -void OrtModel::reset(std::unordered_map optionsMap) -{ - - pImplOrt = new OrtVariables(); - - // Load from options map - if (!optionsMap.contains("model-path")) { - LOG(fatal) << "(ORT) Model path cannot be empty!"; - } - modelPath = optionsMap["model-path"]; - device = (optionsMap.contains("device") ? optionsMap["device"] : "CPU"); - dtype = (optionsMap.contains("dtype") ? optionsMap["dtype"] : "float"); - deviceId = (optionsMap.contains("device-id") ? std::stoi(optionsMap["device-id"]) : 0); - allocateDeviceMemory = (optionsMap.contains("allocate-device-memory") ? std::stoi(optionsMap["allocate-device-memory"]) : 0); - intraOpNumThreads = (optionsMap.contains("intra-op-num-threads") ? std::stoi(optionsMap["intra-op-num-threads"]) : 0); - loggingLevel = (optionsMap.contains("logging-level") ? std::stoi(optionsMap["logging-level"]) : 0); - enableProfiling = (optionsMap.contains("enable-profiling") ? std::stoi(optionsMap["enable-profiling"]) : 0); - enableOptimizations = (optionsMap.contains("enable-optimizations") ? 
std::stoi(optionsMap["enable-optimizations"]) : 0); - - std::string dev_mem_str = "Hip"; -#ifdef ORT_ROCM_BUILD - if (device == "ROCM") { - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(pImplOrt->sessionOptions, deviceId)); - LOG(info) << "(ORT) ROCM execution provider set"; - } -#endif -#ifdef ORT_MIGRAPHX_BUILD - if (device == "MIGRAPHX") { - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_MIGraphX(pImplOrt->sessionOptions, deviceId)); - LOG(info) << "(ORT) MIGraphX execution provider set"; - } -#endif -#ifdef ORT_CUDA_BUILD - if (device == "CUDA") { - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(pImplOrt->sessionOptions, deviceId)); - LOG(info) << "(ORT) CUDA execution provider set"; - dev_mem_str = "Cuda"; - } -#endif - - if (allocateDeviceMemory) { - pImplOrt->memoryInfo = Ort::MemoryInfo(dev_mem_str.c_str(), OrtAllocatorType::OrtDeviceAllocator, deviceId, OrtMemType::OrtMemTypeDefault); - LOG(info) << "(ORT) Memory info set to on-device memory"; - } - - if (device == "CPU") { - (pImplOrt->sessionOptions).SetIntraOpNumThreads(intraOpNumThreads); - if (intraOpNumThreads > 1) { - (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_PARALLEL); - } else if (intraOpNumThreads == 1) { - (pImplOrt->sessionOptions).SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL); - } - LOG(info) << "(ORT) CPU execution provider set with " << intraOpNumThreads << " threads"; - } - - (pImplOrt->sessionOptions).DisableMemPattern(); - (pImplOrt->sessionOptions).DisableCpuMemArena(); - - if (enableProfiling) { - if (optionsMap.contains("profiling-output-path")) { - (pImplOrt->sessionOptions).EnableProfiling((optionsMap["profiling-output-path"] + "/ORT_LOG_").c_str()); - } else { - LOG(warning) << "(ORT) If profiling is enabled, optionsMap[\"profiling-output-path\"] should be set. Disabling profiling for now."; - (pImplOrt->sessionOptions).DisableProfiling(); - } - } else { - (pImplOrt->sessionOptions).DisableProfiling(); - } - (pImplOrt->sessionOptions).SetGraphOptimizationLevel(GraphOptimizationLevel(enableOptimizations)); - (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); - - pImplOrt->env = std::make_shared(OrtLoggingLevel(loggingLevel), (optionsMap["onnx-environment-name"].empty() ? 
"onnx_model_inference" : optionsMap["onnx-environment-name"].c_str())); - pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); - - for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); - } - for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { - mInputShapes.emplace_back((pImplOrt->session)->GetInputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); - } - for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputNames.push_back((pImplOrt->session)->GetOutputNameAllocated(i, pImplOrt->allocator).get()); - } - for (size_t i = 0; i < (pImplOrt->session)->GetOutputCount(); ++i) { - mOutputShapes.emplace_back((pImplOrt->session)->GetOutputTypeInfo(i).GetTensorTypeAndShapeInfo().GetShape()); - } - - inputNamesChar.resize(mInputNames.size(), nullptr); - std::transform(std::begin(mInputNames), std::end(mInputNames), std::begin(inputNamesChar), - [&](const std::string& str) { return str.c_str(); }); - outputNamesChar.resize(mOutputNames.size(), nullptr); - std::transform(std::begin(mOutputNames), std::end(mOutputNames), std::begin(outputNamesChar), - [&](const std::string& str) { return str.c_str(); }); - - // Print names - if (loggingLevel > 1) { - LOG(info) << "Input Nodes:"; - for (size_t i = 0; i < mInputNames.size(); i++) { - LOG(info) << "\t" << mInputNames[i] << " : " << printShape(mInputShapes[i]); - } - - LOG(info) << "Output Nodes:"; - for (size_t i = 0; i < mOutputNames.size(); i++) { - LOG(info) << "\t" << mOutputNames[i] << " : " << printShape(mOutputShapes[i]); - } - } -} - -void OrtModel::resetSession() -{ - pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); -} - -template -std::vector OrtModel::v2v(std::vector& input, bool clearInput) -{ - if constexpr (std::is_same_v) { - return input; - } else { - std::vector output(input.size()); - std::transform(std::begin(input), std::end(input), std::begin(output), [](I f) { return O(f); }); - if (clearInput) { - input.clear(); - } - return output; - } -} - -template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -std::vector OrtModel::inference(std::vector& input) -{ - std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); - std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; - outputTensors.clear(); - return outputValuesVec; -} - -template // class I is the input data type, e.g. float, class O is the output data type, e.g. 
O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h -std::vector OrtModel::inference(std::vector>& input) -{ - std::vector inputTensor; - for (auto i : input) { - std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); - } - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); - std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; - outputTensors.clear(); - return outputValuesVec; -} - -std::string OrtModel::printShape(const std::vector& v) -{ - std::stringstream ss(""); - for (size_t i = 0; i < v.size() - 1; i++) { - ss << v[i] << "x"; - } - ss << v[v.size() - 1]; - return ss.str(); -} - -template <> -std::vector OrtModel::inference(std::vector& input) -{ - std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - float* outputValues = outputTensors[0].template GetTensorMutableData(); - std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; - outputTensors.clear(); - return outputValuesVec; -} - -template <> -std::vector OrtModel::inference(std::vector& input) -{ - std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - float* outputValues = outputTensors[0].template GetTensorMutableData(); - std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; - outputTensors.clear(); - return outputValuesVec; -} - -template <> -std::vector OrtModel::inference(std::vector& input) -{ - std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); - std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; - outputTensors.clear(); - return outputValuesVec; -} - -template <> -std::vector OrtModel::inference(std::vector& 
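All of these inference specializations repeat the same three steps: wrap the flat input in an Ort::Value with an int64_t [N, nFeatures] shape (which is why the batch dimension is computed and cast explicitly), call Run(), and copy the result into an owning std::vector before the output Ort::Value is destroyed. Condensed into one standalone helper, with placeholder node names and a single input and output:

#include <onnxruntime_cxx_api.h>
#include <cstdint>
#include <vector>

// Run a flat [N, nFeatures] float batch through a single-input/single-output session
// and return an owning copy of the output values.
std::vector<float> runBatch(Ort::Session& session, std::vector<float>& input,
                            int64_t nFeatures, int64_t nOutputsPerRow)
{
  Ort::MemoryInfo memInfo = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
  // ORT expects the shape as int64_t, hence the explicit cast of the batch size.
  std::vector<int64_t> shape{static_cast<int64_t>(input.size()) / nFeatures, nFeatures};

  Ort::Value tensor = Ort::Value::CreateTensor<float>(memInfo, input.data(), input.size(),
                                                      shape.data(), shape.size());

  const char* inputNames[] = {"input"};   // placeholder node name
  const char* outputNames[] = {"output"}; // placeholder node name
  auto outputs = session.Run(Ort::RunOptions{nullptr}, inputNames, &tensor, 1, outputNames, 1);

  // Copy into an owning vector; the data behind GetTensorMutableData lives only as
  // long as the returned Ort::Value does.
  float* data = outputs[0].GetTensorMutableData<float>();
  return {data, data + shape[0] * nOutputsPerRow};
}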
input) -{ - std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); - std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; - outputTensors.clear(); - return outputValuesVec; -} - -template <> -std::vector OrtModel::inference(std::vector>& input) -{ - std::vector inputTensor; - for (auto i : input) { - std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); - } - // input.clear(); - auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); - OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); - std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; - outputTensors.clear(); - return outputValuesVec; -} - -} // namespace ml - -} // namespace o2 diff --git a/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx b/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx index b1a872f59f292..762857c3f10c7 100644 --- a/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx +++ b/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx @@ -224,4 +224,4 @@ WorkflowSpec defineDataProcessing(ConfigContext const& cfgc) specs.push_back(testProcess(cfgc, inputs, outputs)); return specs; -} \ No newline at end of file +} From 9df2dfb0f1644dc0486b5146dd35f8857c8243ed Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 29 Nov 2024 12:15:59 +0100 Subject: [PATCH 14/21] Removing add_subdirectory (duplicate) --- Common/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/Common/CMakeLists.txt b/Common/CMakeLists.txt index c0d31ae11e364..0b92758e45f43 100644 --- a/Common/CMakeLists.txt +++ b/Common/CMakeLists.txt @@ -14,7 +14,6 @@ add_subdirectory(MathUtils) add_subdirectory(Field) add_subdirectory(Types) add_subdirectory(Utils) -add_subdirectory(ML) add_subdirectory(SimConfig) add_subdirectory(DCAFitter) add_subdirectory(ML) From c4bc6b61da4c9c9df694c14458c874045f0f9577 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 29 Nov 2024 12:29:11 +0100 Subject: [PATCH 15/21] Reformatting to adjust to current dev branch --- Common/ML/include/ML/OrtInterface.h | 6 ++-- Common/ML/src/OrtInterface.cxx | 51 +++++++++++++++++------------ 2 files changed, 33 insertions(+), 24 deletions(-) diff --git a/Common/ML/include/ML/OrtInterface.h b/Common/ML/include/ML/OrtInterface.h index 965e9365e4372..89631d59a3846 100644 --- a/Common/ML/include/ML/OrtInterface.h +++ b/Common/ML/include/ML/OrtInterface.h @@ -13,8 +13,8 @@ /// \author Christian Sonnabend /// \brief A header library for loading ONNX models and inferencing them on CPU and GPU -#ifndef O2_ML_ONNX_INTERFACE_H -#define 
O2_ML_ONNX_INTERFACE_H +#ifndef O2_ML_ORTINTERFACE_H +#define O2_ML_ORTINTERFACE_H // C++ and system includes #include @@ -89,4 +89,4 @@ class OrtModel } // namespace o2 -#endif // O2_ML_OrtInterface_H \ No newline at end of file +#endif // O2_ML_ORTINTERFACE_H diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index a69808c6c9930..d0f80f3b6239b 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -133,7 +133,7 @@ void OrtModel::reset(std::unordered_map optionsMap) }, (void*)3); (pImplOrt->env)->DisableTelemetryEvents(); // Disable telemetry events - (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); + pImplOrt->session = std::make_shared({*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); @@ -169,7 +169,7 @@ void OrtModel::reset(std::unordered_map optionsMap) void OrtModel::resetSession() { - (pImplOrt->session).reset(new Ort::Session{*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); + pImplOrt->session = std::make_shared({*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); } template @@ -180,8 +180,9 @@ std::vector OrtModel::v2v(std::vector& input, bool clearInput) } else { std::vector output(input.size()); std::transform(std::begin(input), std::end(input), std::begin(output), [](I f) { return O(f); }); - if (clearInput) + if (clearInput){ input.clear(); + } return output; } } @@ -189,14 +190,15 @@ std::vector OrtModel::v2v(std::vector& input, bool clearInput) template // class I is the input data type, e.g. float, class O is the output data type, e.g. O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h std::vector OrtModel::inference(std::vector& input) { - std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), mInputShapes[0][1]}; + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(input)).data(), input.size(), inputShape.data(), inputShape.size())); + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); // input.clear(); auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; outputTensors.clear(); - return std::vector{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + return outputValuesVec; } template // class I is the input data type, e.g. float, class O is the output data type, e.g. 
O2::gpu::OrtDataType::Float16_t from O2/GPU/GPUTracking/ML/convert_float16.h @@ -204,21 +206,23 @@ std::vector OrtModel::inference(std::vector>& input) { std::vector inputTensor; for (auto i : input) { - std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), mInputShapes[0][1]}; - inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, (reinterpret_cast(i)).data(), i.size(), inputShape.data(), inputShape.size())); + std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; + inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); } // input.clear(); auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); O* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; outputTensors.clear(); - return std::vector{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; + return outputValuesVec; } std::string OrtModel::printShape(const std::vector& v) { std::stringstream ss(""); - for (size_t i = 0; i < v.size() - 1; i++) + for (size_t i = 0; i < v.size() - 1; i++) { ss << v[i] << "x"; + } ss << v[v.size() - 1]; return ss.str(); } @@ -226,53 +230,57 @@ std::string OrtModel::printShape(const std::vector& v) template <> std::vector OrtModel::inference(std::vector& input) { - std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), mInputShapes[0][1]}; + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, input.data(), input.size(), inputShape.data(), inputShape.size())); // input.clear(); auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); float* outputValues = outputTensors[0].template GetTensorMutableData(); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; outputTensors.clear(); - return std::vector{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + return outputValuesVec; } template <> std::vector OrtModel::inference(std::vector& input) { - std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), mInputShapes[0][1]}; + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); // input.clear(); auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); float* outputValues = outputTensors[0].template GetTensorMutableData(); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; outputTensors.clear(); - return std::vector{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + return outputValuesVec; } template <> std::vector OrtModel::inference(std::vector& input) { - std::vector 
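The vector-of-vectors overload above builds one Ort::Value per entry and hands all of them to a single Run() call, which is how a model with several named inputs is fed. A standalone sketch of such a multi-input call, with two float inputs and placeholder names and shapes:

#include <onnxruntime_cxx_api.h>
#include <array>
#include <cstdint>
#include <vector>

// Feed a two-input model in one Run() call, one Ort::Value per input node.
std::vector<Ort::Value> runTwoInputs(Ort::Session& session,
                                     std::vector<float>& a, std::vector<float>& b)
{
  Ort::MemoryInfo memInfo = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
  std::vector<int64_t> shapeA{1, static_cast<int64_t>(a.size())}; // placeholder shapes
  std::vector<int64_t> shapeB{1, static_cast<int64_t>(b.size())};

  std::vector<Ort::Value> inputs;
  inputs.emplace_back(Ort::Value::CreateTensor<float>(memInfo, a.data(), a.size(),
                                                      shapeA.data(), shapeA.size()));
  inputs.emplace_back(Ort::Value::CreateTensor<float>(memInfo, b.data(), b.size(),
                                                      shapeB.data(), shapeB.size()));

  // The names must match the node names of the ONNX graph; these are placeholders.
  std::array<const char*, 2> inputNames{"input_a", "input_b"};
  std::array<const char*, 1> outputNames{"output"};
  return session.Run(Ort::RunOptions{nullptr}, inputNames.data(), inputs.data(), inputs.size(),
                     outputNames.data(), outputNames.size());
}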
inputShape{(int64_t)(input.size() / mInputShapes[0][1]), mInputShapes[0][1]}; + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); // input.clear(); auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; outputTensors.clear(); - return std::vector{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + return outputValuesVec; } template <> std::vector OrtModel::inference(std::vector& input) { - std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), mInputShapes[0][1]}; + std::vector inputShape{(int64_t)(input.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; std::vector inputTensor; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(input.data()), input.size(), inputShape.data(), inputShape.size())); // input.clear(); auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; outputTensors.clear(); - return std::vector{outputValues, outputValues + inputShape[0] * mOutputShapes[0][1]}; + return outputValuesVec; } template <> @@ -280,16 +288,17 @@ std::vector OrtModel::inference inputTensor; for (auto i : input) { - std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), mInputShapes[0][1]}; + std::vector inputShape{(int64_t)(i.size() / mInputShapes[0][1]), (int64_t)mInputShapes[0][1]}; inputTensor.emplace_back(Ort::Value::CreateTensor(pImplOrt->memoryInfo, reinterpret_cast(i.data()), i.size(), inputShape.data(), inputShape.size())); } // input.clear(); auto outputTensors = (pImplOrt->session)->Run(pImplOrt->runOptions, inputNamesChar.data(), inputTensor.data(), inputTensor.size(), outputNamesChar.data(), outputNamesChar.size()); OrtDataType::Float16_t* outputValues = reinterpret_cast(outputTensors[0].template GetTensorMutableData()); + std::vector outputValuesVec{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; outputTensors.clear(); - return std::vector{outputValues, outputValues + inputTensor.size() / mInputShapes[0][1] * mOutputShapes[0][1]}; + return outputValuesVec; } } // namespace ml -} // namespace o2 \ No newline at end of file +} // namespace o2 From e427f0a724ffd6c3433607dab1fa93125c2e0b6a Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 29 Nov 2024 12:31:36 +0100 Subject: [PATCH 16/21] Adding whitespace --- Common/ML/src/OrtInterface.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index d0f80f3b6239b..a9a39da2f4ad2 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -180,7 +180,7 @@ std::vector OrtModel::v2v(std::vector& input, bool clearInput) } else { 
std::vector output(input.size()); std::transform(std::begin(input), std::end(input), std::begin(output), [](I f) { return O(f); }); - if (clearInput){ + if (clearInput) { input.clear(); } return output; From e436204262908698fb72d3be74b6b3ce3e850f56 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 29 Nov 2024 12:38:54 +0100 Subject: [PATCH 17/21] Removing test task --- Detectors/TPC/workflow/CMakeLists.txt | 6 - .../test/test_onnx_interface_headers.cxx | 227 ------------------ 2 files changed, 233 deletions(-) delete mode 100644 Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx diff --git a/Detectors/TPC/workflow/CMakeLists.txt b/Detectors/TPC/workflow/CMakeLists.txt index 5bb4c49ab1075..ad52e407bea84 100644 --- a/Detectors/TPC/workflow/CMakeLists.txt +++ b/Detectors/TPC/workflow/CMakeLists.txt @@ -81,12 +81,6 @@ if(OpenMP_CXX_FOUND) target_link_libraries(${mergertargetName} PRIVATE OpenMP::OpenMP_CXX) endif() -o2_add_executable(onnx-interface - COMPONENT_NAME test - SOURCES test/test_onnx_interface_headers.cxx - PUBLIC_LINK_LIBRARIES O2::TPCWorkflow O2::SimulationDataFormat O2::TPCQC O2::DataFormatsTPC O2::TPCBase O2::ML Boost::thread O2::GPUTracking) - - o2_add_executable(reco-workflow COMPONENT_NAME tpc SOURCES src/tpc-reco-workflow.cxx diff --git a/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx b/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx deleted file mode 100644 index 762857c3f10c7..0000000000000 --- a/Detectors/TPC/workflow/test/test_onnx_interface_headers.cxx +++ /dev/null @@ -1,227 +0,0 @@ -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "Algorithm/RangeTokenizer.h" -#include "SimulationDataFormat/MCCompLabel.h" -#include "SimulationDataFormat/ConstMCTruthContainer.h" -#include "SimulationDataFormat/LabelContainer.h" -#include "SimulationDataFormat/IOMCTruthContainerView.h" -#include "SimulationDataFormat/MCTruthContainer.h" - -#include "Headers/DataHeader.h" - -#include "ML/OrtInterface.h" -#include "ML/3rdparty/GPUORTFloat16.h" - -#include "Steer/MCKinematicsReader.h" - -#include "DPLUtils/RootTreeReader.h" -#include "DPLUtils/MakeRootTreeWriterSpec.h" - -#include "DataFormatsTPC/WorkflowHelper.h" -#include "DataFormatsTPC/ClusterNativeHelper.h" -#include "DataFormatsTPC/ClusterNative.h" -#include "DataFormatsTPC/ClusterGroupAttribute.h" -#include "DataFormatsTPC/Constants.h" -#include "DataFormatsTPC/TrackTPC.h" -#include "DataFormatsGlobalTracking/TrackTuneParams.h" -#include "DataFormatsTPC/Defs.h" - -#include "TPCWorkflow/ProcessingHelpers.h" -#include "TPCQC/Clusters.h" -#include "TPCBase/Painter.h" -#include "TPCBase/CalDet.h" -#include "TPCBase/Mapper.h" - -#include "Framework/Logger.h" -#include "Framework/Task.h" -#include "Framework/DataProcessorSpec.h" -#include "Framework/ConfigParamRegistry.h" -#include "Framework/ControlService.h" -#include "Framework/CompletionPolicyHelpers.h" -#include "Framework/WorkflowSpec.h" -#include "Framework/CallbacksPolicy.h" - -#include "DetectorsRaw/HBFUtils.h" - -using namespace o2; -using namespace o2::ml; -using namespace o2::tpc; -using namespace o2::framework; - -namespace o2 -{ -namespace tpc -{ -class onnxInference : public Task -{ - public: - onnxInference(std::unordered_map optionsMap) - { - options_map = optionsMap; - models = std::vector(std::stoi(options_map["execution-threads"])); - for (int thrd = 0; thrd < std::stoi(options_map["execution-threads"]); thrd++) 
{ - models[thrd].init(options_map); - } - }; - - template - void runONNXGPUModel(std::vector>& input, int execution_threads) - { - std::vector threads(execution_threads); - for (int thrd = 0; thrd < execution_threads; thrd++) { - threads[thrd] = std::thread([&, thrd] { - auto outputTensors = models[thrd].inference(input[thrd]); - }); - } - for (auto& thread : threads) { - thread.join(); - } - }; - - template - void runONNXGPUModel(std::vector>>& input, int execution_threads) - { - std::vector threads(execution_threads); - for (int thrd = 0; thrd < execution_threads; thrd++) { - threads[thrd] = std::thread([&, thrd] { - auto outputTensors = models[thrd].inference(input[thrd]); - }); - } - for (auto& thread : threads) { - thread.join(); - } - }; - - void init(InitContext& ic) final {}; - void run(ProcessingContext& pc) final - { - double time = 0; - int test_size_tensor = std::stoi(options_map["size-tensor"]); - int epochs_measure = std::stoi(options_map["measure-cycle"]); - int execution_threads = std::stoi(options_map["execution-threads"]); - int test_num_tensors = std::stoi(options_map["num-tensors"]); - int test_size_iter = std::stoi(options_map["num-iter"]); - - LOG(info) << "Preparing input data"; - // Prepare input data - std::vector inputShape{test_size_tensor, models[0].getNumInputNodes()[0][1]}; - - LOG(info) << "Creating ONNX tensor"; - std::vector> input_tensor(execution_threads); - std::vector input_data(models[0].getNumInputNodes()[0][1] * test_size_tensor, OrtDataType::Float16_t(1.0f)); // Example input - for (int i = 0; i < execution_threads; i++) { - input_tensor[i] = input_data; - // input_tensor[i].resize(test_num_tensors); - // for(int j = 0; j < test_num_tensors; j++){ - // input_tensor[i][j] = input_data; - // } - } - - LOG(info) << "Starting inference"; - auto start_network_eval = std::chrono::high_resolution_clock::now(); - for (int i = 0; i < test_size_iter; i++) { - runONNXGPUModel(input_tensor, execution_threads); - if ((i % epochs_measure == 0) && (i != 0)) { - auto end_network_eval = std::chrono::high_resolution_clock::now(); - time = std::chrono::duration>(end_network_eval - start_network_eval).count() / 1e9; - LOG(info) << "Total time: " << time << "s. 
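The benchmark pattern of the removed test task (one model instance per execution thread, threads joined every iteration, throughput reported every few cycles) in a condensed, self-contained form. The Model struct is a dummy stand-in for o2::ml::OrtModel so that the sketch compiles without the O2 framework; sizes and thread counts are placeholders that correspond to the workflow options named in the comments.

#include <chrono>
#include <iostream>
#include <thread>
#include <vector>

struct Model { // dummy stand-in for o2::ml::OrtModel
  std::vector<float> inference(std::vector<float>& in) { return in; }
};

int main()
{
  const int nThreads = 4;    // "execution-threads"
  const int nIter = 100;     // "num-iter"
  const int batchRows = 512; // "size-tensor"
  std::vector<Model> models(nThreads);
  std::vector<float> batch(batchRows * 10, 1.0f); // placeholder: 10 features per row

  auto start = std::chrono::high_resolution_clock::now();
  for (int iter = 0; iter < nIter; ++iter) {
    std::vector<std::thread> threads;
    for (int t = 0; t < nThreads; ++t) {
      threads.emplace_back([&, t] { auto out = models[t].inference(batch); });
    }
    for (auto& th : threads) {
      th.join();
    }
  }
  double seconds = std::chrono::duration<double>(std::chrono::high_resolution_clock::now() - start).count();
  std::cout << (double)batchRows * nIter * nThreads / seconds << " elements / s\n";
  return 0;
}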
Timing: " << uint64_t((double)test_size_tensor * epochs_measure * execution_threads / time) << " elements / s"; - time = 0; - start_network_eval = std::chrono::high_resolution_clock::now(); - } - } - - // for(auto out : output){ - // LOG(info) << "Test output: " << out; - // } - pc.services().get().endOfStream(); - pc.services().get().readyToQuit(QuitRequest::Me); - }; - - private: - std::vector models; - std::unordered_map options_map; -}; -} // namespace tpc -} // namespace o2 - -void customize(std::vector& workflowOptions) -{ - std::vector options{ - {"path", VariantType::String, "./model.pt", {"Path to ONNX model"}}, - {"device", VariantType::String, "CPU", {"Device on which the ONNX model is run"}}, - {"device-id", VariantType::Int, 0, {"Device ID on which the ONNX model is run"}}, - {"dtype", VariantType::String, "-", {"Dtype in which the ONNX model is run (FP16 or FP32)"}}, - {"size-tensor", VariantType::Int, 100, {"Size of the input tensor"}}, - {"execution-threads", VariantType::Int, 1, {"If > 1 will run session->Run() with multiple threads as execution providers"}}, - {"intra-op-num-threads", VariantType::Int, 0, {"Number of threads per session for CPU execution provider"}}, - {"num-tensors", VariantType::Int, 1, {"Number of tensors on which execution is being performed"}}, - {"num-iter", VariantType::Int, 100, {"Number of iterations"}}, - {"measure-cycle", VariantType::Int, 10, {"Epochs in which to measure"}}, - {"enable-profiling", VariantType::Int, 0, {"Enable profiling"}}, - {"profiling-output-path", VariantType::String, "/scratch/csonnabe/O2_new", {"Path to save profiling output"}}, - {"logging-level", VariantType::Int, 2, {"Logging level"}}, - {"enable-optimizations", VariantType::Int, 0, {"Enable optimizations"}}, - {"allocate-device-memory", VariantType::Int, 0, {"Allocate the memory on device"}}}; - std::swap(workflowOptions, options); -} - -// --------------------------------- -#include "Framework/runDataProcessing.h" - -DataProcessorSpec testProcess(ConfigContext const& cfgc, std::vector& inputs, std::vector& outputs) -{ - - // A copy of the global workflow options from customize() to pass to the task - std::unordered_map options_map{ - {"model-path", cfgc.options().get("path")}, - {"device", cfgc.options().get("device")}, - {"device-id", std::to_string(cfgc.options().get("device-id"))}, - {"dtype", cfgc.options().get("dtype")}, - {"size-tensor", std::to_string(cfgc.options().get("size-tensor"))}, - {"intra-op-num-threads", std::to_string(cfgc.options().get("intra-op-num-threads"))}, - {"execution-threads", std::to_string(cfgc.options().get("execution-threads"))}, - {"num-tensors", std::to_string(cfgc.options().get("num-tensors"))}, - {"num-iter", std::to_string(cfgc.options().get("num-iter"))}, - {"measure-cycle", std::to_string(cfgc.options().get("measure-cycle"))}, - {"enable-profiling", std::to_string(cfgc.options().get("enable-profiling"))}, - {"profiling-output-path", cfgc.options().get("profiling-output-path")}, - {"logging-level", std::to_string(cfgc.options().get("logging-level"))}, - {"enable-optimizations", std::to_string(cfgc.options().get("enable-optimizations"))}, - {"allocate-device-memory", std::to_string(cfgc.options().get("allocate-device-memory"))}}; - - return DataProcessorSpec{ - "test-onnx-interface", - inputs, - outputs, - adaptFromTask(options_map), - Options{ - {"somethingElse", VariantType::String, "-", {"Something else"}}}}; -} - -WorkflowSpec defineDataProcessing(ConfigContext const& cfgc) -{ - - WorkflowSpec specs; - - static std::vector 
inputs; - static std::vector outputs; - - specs.push_back(testProcess(cfgc, inputs, outputs)); - - return specs; -} From 27fa75289cd72d10169a5966e54393a367029419 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 29 Nov 2024 12:41:16 +0100 Subject: [PATCH 18/21] Adding back the white space --- Detectors/TPC/workflow/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/Detectors/TPC/workflow/CMakeLists.txt b/Detectors/TPC/workflow/CMakeLists.txt index ad52e407bea84..3b05e5067108c 100644 --- a/Detectors/TPC/workflow/CMakeLists.txt +++ b/Detectors/TPC/workflow/CMakeLists.txt @@ -81,6 +81,7 @@ if(OpenMP_CXX_FOUND) target_link_libraries(${mergertargetName} PRIVATE OpenMP::OpenMP_CXX) endif() + o2_add_executable(reco-workflow COMPONENT_NAME tpc SOURCES src/tpc-reco-workflow.cxx From 7b82026009be2324695f4e522b75ee518a18e9fe Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 29 Nov 2024 12:44:28 +0100 Subject: [PATCH 19/21] Removing brackets --- Common/ML/src/OrtInterface.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index a9a39da2f4ad2..ae0a96920d707 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -169,7 +169,7 @@ void OrtModel::reset(std::unordered_map optionsMap) void OrtModel::resetSession() { - pImplOrt->session = std::make_shared({*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); + pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); } template From 43eb1774fcf05228b281538fcafecaeb44324ac7 Mon Sep 17 00:00:00 2001 From: Christian Sonnabend Date: Fri, 29 Nov 2024 12:46:30 +0100 Subject: [PATCH 20/21] Removing curly braces --- Common/ML/src/OrtInterface.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index ae0a96920d707..f0d0bacb5094f 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -133,7 +133,7 @@ void OrtModel::reset(std::unordered_map optionsMap) }, (void*)3); (pImplOrt->env)->DisableTelemetryEvents(); // Disable telemetry events - pImplOrt->session = std::make_shared({*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions}); + pImplOrt->session = std::make_shared(*(pImplOrt->env), modelPath.c_str(), pImplOrt->sessionOptions); for (size_t i = 0; i < (pImplOrt->session)->GetInputCount(); ++i) { mInputNames.push_back((pImplOrt->session)->GetInputNameAllocated(i, pImplOrt->allocator).get()); From 2cc5d3eb1040c2dcfa640e23eb81792b913e6297 Mon Sep 17 00:00:00 2001 From: ALICE Action Bot Date: Fri, 29 Nov 2024 11:48:22 +0000 Subject: [PATCH 21/21] Please consider the following formatting changes --- Common/ML/src/OrtInterface.cxx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Common/ML/src/OrtInterface.cxx b/Common/ML/src/OrtInterface.cxx index f0d0bacb5094f..eb124ff6f12c9 100644 --- a/Common/ML/src/OrtInterface.cxx +++ b/Common/ML/src/OrtInterface.cxx @@ -113,19 +113,19 @@ void OrtModel::reset(std::unordered_map optionsMap) (pImplOrt->sessionOptions).SetLogSeverityLevel(OrtLoggingLevel(loggingLevel)); pImplOrt->env = std::make_shared( - OrtLoggingLevel(loggingLevel), + OrtLoggingLevel(loggingLevel), (optionsMap["onnx-environment-name"].empty() ? 
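For context on patches 19 and 20 above ("Removing brackets" / "Removing curly braces"): the reformatting in patch 15 had turned the session creation into std::make_shared with a braced argument list. That cannot compile, because make_shared perfect-forwards its arguments to the constructor and a braced-init-list has no type, so the forwarding template parameters cannot be deduced from it; the previous spelling with new Ort::Session{...} was fine because there the braces directly list-initialize the object. A minimal illustration with a stand-in type (Widget is not part of the codebase):

#include <memory>

struct Widget {
  Widget(int, double) {}
};

int main()
{
  // OK: make_shared perfect-forwards the constructor arguments.
  auto a = std::make_shared<Widget>(1, 2.0);

  // Does not compile: a braced-init-list has no type, so the parameter pack of
  // make_shared cannot be deduced from {1, 2.0}.
  // auto b = std::make_shared<Widget>({1, 2.0});

  // The pre-patch-15 spelling worked because new Widget{1, 2.0} list-initializes
  // the object itself before handing the raw pointer to the smart pointer.
  std::shared_ptr<Widget> c(new Widget{1, 2.0});

  return (a && c) ? 0 : 1;
}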
"onnx_model_inference" : optionsMap["onnx-environment-name"].c_str()), // Integrate ORT logging into Fairlogger [](void* param, OrtLoggingLevel severity, const char* category, const char* logid, const char* code_location, const char* message) { - if(severity == ORT_LOGGING_LEVEL_VERBOSE) { + if (severity == ORT_LOGGING_LEVEL_VERBOSE) { LOG(debug) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; - } else if(severity == ORT_LOGGING_LEVEL_INFO) { + } else if (severity == ORT_LOGGING_LEVEL_INFO) { LOG(info) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; - } else if(severity == ORT_LOGGING_LEVEL_WARNING) { + } else if (severity == ORT_LOGGING_LEVEL_WARNING) { LOG(warning) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; - } else if(severity == ORT_LOGGING_LEVEL_ERROR) { + } else if (severity == ORT_LOGGING_LEVEL_ERROR) { LOG(error) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; - } else if(severity == ORT_LOGGING_LEVEL_FATAL) { + } else if (severity == ORT_LOGGING_LEVEL_FATAL) { LOG(fatal) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message; } else { LOG(info) << "(ORT) [" << logid << "|" << category << "|" << code_location << "]: " << message;