Skip to content

Commit

Permalink
[experimental feature] Add timing mechanism to measure IAA work (#1077)
Browse files Browse the repository at this point in the history
  • Loading branch information
mcao59 authored Oct 11, 2024
1 parent 3c74801 commit 117ec5b
Show file tree
Hide file tree
Showing 8 changed files with 203 additions and 0 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
option(SANITIZE_MEMORY "Enables memory sanitizing" OFF)
option(SANITIZE_THREADS "Enables threads sanitizing" OFF)
option(LOG_HW_INIT "Enables HW initialization log" OFF)
option(QPL_LOG_IAA_TIME "(Experimental feature) Enables timestamps to track IAA execution time in single-threaded application" OFF)
option(EFFICIENT_WAIT "Enables usage of efficient wait instructions" OFF)
option(LIB_FUZZING_ENGINE "Enables fuzzy testing" OFF)
option(DYNAMIC_LOADING_LIBACCEL_CONFIG "Loads the accelerator configuration library (libaccel-config) dynamically with dlopen" ON)
Expand All @@ -49,6 +50,7 @@ option(QPL_USE_CLANG_TIDY "Run clang-tidy" OFF)
message(STATUS "Memory sanitizing build: ${SANITIZE_MEMORY}")
message(STATUS "Threads sanitizing build: ${SANITIZE_THREADS}")
message(STATUS "Hardware initialization logging: ${LOG_HW_INIT}")
message(STATUS "IAA execution time logging: ${QPL_LOG_IAA_TIME}")
message(STATUS "Efficient wait instructions: ${EFFICIENT_WAIT}")
message(STATUS "Fuzz testing build: ${LIB_FUZZING_ENGINE}")
message(STATUS "Load libaccel-config dynamically with dlopen: ${DYNAMIC_LOADING_LIBACCEL_CONFIG}")
Expand Down
1 change: 1 addition & 0 deletions sources/c_api/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ target_compile_definitions(qpl_c_api
PUBLIC -DQPL_LIB # needed for middle_layer_lib
PUBLIC $<$<BOOL:${DYNAMIC_LOADING_LIBACCEL_CONFIG}>:DYNAMIC_LOADING_LIBACCEL_CONFIG> # needed for middle_layer_lib
PUBLIC $<$<BOOL:${LOG_HW_INIT}>:LOG_HW_INIT> # needed for middle_layer_lib
PUBLIC $<$<BOOL:${QPL_LOG_IAA_TIME}>:QPL_LOG_IAA_TIME> # needed for middle_layer_lib
PUBLIC $<$<C_COMPILER_ID:MSVC>:_ENABLE_EXTENDED_ALIGNED_STORAGE>)

set_property(GLOBAL APPEND PROPERTY QPL_LIB_DEPS
Expand Down
1 change: 1 addition & 0 deletions sources/middle-layer/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ target_compile_definitions(middle_layer_lib
PUBLIC QPL_VERSION="${CMAKE_PROJECT_VERSION}"
PUBLIC $<$<C_COMPILER_ID:MSVC>:_ENABLE_EXTENDED_ALIGNED_STORAGE>
PUBLIC $<$<BOOL:${LOG_HW_INIT}>:LOG_HW_INIT>
PUBLIC $<$<BOOL:${QPL_LOG_IAA_TIME}>:QPL_LOG_IAA_TIME>
PUBLIC $<$<BOOL:${EFFICIENT_WAIT}>:QPL_EFFICIENT_WAIT>
PUBLIC QPL_BADARG_CHECK
PUBLIC $<$<BOOL:${DYNAMIC_LOADING_LIBACCEL_CONFIG}>:DYNAMIC_LOADING_LIBACCEL_CONFIG>)
Expand Down
18 changes: 18 additions & 0 deletions sources/middle-layer/dispatcher/hw_dispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@

#include "hw_dispatcher.hpp"

#ifdef QPL_LOG_IAA_TIME
#include "util/hw_timing_util.hpp"
#endif

#if defined(__linux__)

#endif
Expand All @@ -22,6 +26,15 @@

namespace qpl::ml::dispatcher {

#ifdef QPL_LOG_IAA_TIME
/**
* @brief Global vector that stores the timestamps and meta data for
* operation executed on Intel® In-Memory Analytics Accelerator (Intel® IAA).
*/
std::vector<iaa_operation_timestamp> iaa_timestamps;

#endif //QPL_LOG_IAA_TIME

hw_dispatcher::hw_dispatcher() noexcept : hw_init_status_(hw_dispatcher::initialize_hw()) {
hw_support_ = hw_init_status_ == HW_ACCELERATOR_STATUS_OK;
}
Expand Down Expand Up @@ -78,6 +91,11 @@ auto hw_dispatcher::initialize_hw() noexcept -> hw_accelerator_status {
}

hw_dispatcher::~hw_dispatcher() noexcept {
#ifdef QPL_LOG_IAA_TIME
// Print elapsed time if hw timing is enabled
calculate_iaa_elapsed_time();
#endif

#if defined(__linux__)
// Variables
auto* context_ptr = hw_context_.get_driver_context_ptr();
Expand Down
22 changes: 22 additions & 0 deletions sources/middle-layer/dispatcher/hw_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@
#include "libaccel_config.h"
#endif //DYNAMIC_LOADING_LIBACCEL_CONFIG

#ifdef QPL_LOG_IAA_TIME
#include "util/hw_timing_util.hpp"
#endif //QPL_LOG_IAA_TIME

#define QPL_HWSTS_RET(expr, err_code) \
{ \
if (expr) { return (err_code); } \
Expand Down Expand Up @@ -119,6 +123,14 @@ auto hw_queue::enqueue_descriptor(void* desc_ptr) const noexcept -> qpl_status {
: "=r"(retry)
: "a"(current_place_ptr), "d"(desc_ptr));

#ifdef QPL_LOG_IAA_TIME
if (!retry) {
// If enqueue is successful, record the start time and meta data
record_meta_data(desc_ptr);
record_iaa_start_time();
}
#endif

// add with different verbosity level to not crowd output
// DIAG(" ENQCMD submitted\n");

Expand All @@ -130,6 +142,11 @@ auto hw_queue::enqueue_descriptor(void* desc_ptr) const noexcept -> qpl_status {
// DIAG(" write submitted\n");

if (ret == sizeof(hw_decompress_analytics_descriptor)) {
#ifdef QPL_LOG_IAA_TIME
// If enqueue is successful, record the start time and meta data
record_meta_data(desc_ptr);
record_iaa_start_time();
#endif
return QPL_STS_OK;
} else {
DIAG(" write returned %ld, expected %ld\n", ret, sizeof(hw_decompress_analytics_descriptor));
Expand All @@ -152,6 +169,11 @@ auto hw_queue::execute_noop() const noexcept -> qpl_status {
hw_iaa_descriptor_set_completion_record(&desc, &completion_record);

const qpl_status status = enqueue_descriptor(&desc);

#ifdef QPL_LOG_IAA_TIME
record_invalid_end_time_to_skip_iaa_timing();
#endif

if (QPL_STS_OK == status) {
while (completion_record.status == 0) {
_mm_pause();
Expand Down
9 changes: 9 additions & 0 deletions sources/middle-layer/util/awaiter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@

#include "awaiter.hpp"

#ifdef QPL_LOG_IAA_TIME
#include "hw_timing_util.hpp"
#endif

#if defined(__linux__)

#include <x86intrin.h>
Expand Down Expand Up @@ -66,6 +70,11 @@ awaiter::~awaiter() noexcept {
_mm_pause();
}
#endif

#ifdef QPL_LOG_IAA_TIME
// Record the end time right after descriptor is completed
dispatcher::record_iaa_end_time();
#endif
}

void awaiter::wait_for(volatile void* address, uint8_t initial_value) noexcept {
Expand Down
9 changes: 9 additions & 0 deletions sources/middle-layer/util/descriptor_processing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
#include "util/completion_record.hpp"
#include "util/hw_status_converting.hpp"

#ifdef QPL_LOG_IAA_TIME
#include "util/hw_timing_util.hpp"
#endif

namespace qpl::ml::util {

enum class execution_mode_t { sync, async };
Expand Down Expand Up @@ -101,6 +105,11 @@ inline auto process_descriptor(hw_descriptor* const desc
reinterpret_cast<hw_decompress_analytics_descriptor*>(descriptor_ptr)->src1_size;
}
} else {
// Async path
#ifdef QPL_LOG_IAA_TIME
qpl::ml::dispatcher::record_invalid_end_time_to_skip_iaa_timing();
#endif

if constexpr (std::is_same<other::crc_operation_result_t, return_t>::value) {
operation_result.status_code_ = convert_hw_accelerator_status_to_qpl_status(accel_status);
} else {
Expand Down
141 changes: 141 additions & 0 deletions sources/middle-layer/util/hw_timing_util.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
/*******************************************************************************
* Copyright (C) 2024 Intel Corporation
*
* SPDX-License-Identifier: MIT
******************************************************************************/

#ifndef QPL_SOURCES_MIDDLE_LAYER_UTIL_HW_TIMING_UTIL_HPP_
#define QPL_SOURCES_MIDDLE_LAYER_UTIL_HW_TIMING_UTIL_HPP_

#ifdef QPL_LOG_IAA_TIME

#include <chrono>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

#include "hw_definitions.h"
#include "hw_iaa_flags.h"

namespace qpl::ml::dispatcher {

// The offset of opcode in the field that packs (Opcode 31:24, opflags 23:0)
#define QPL_OPCODE_OFFSET_IN_OPCODE_OPFLAGS 24U

struct iaa_operation_timestamp {
uint8_t operation;
uint32_t src_size;
std::chrono::time_point<std::chrono::high_resolution_clock> start_time;
std::chrono::time_point<std::chrono::high_resolution_clock> end_time;
};

// Global vector of structures that hold the timestamps and meta data
// for operations executed on Intel® In-Memory Analytics Accelerator (Intel® IAA).
// Global vector is defined in sources/middle-layer/dispatcher/hw_dispatcher.cpp.
extern std::vector<iaa_operation_timestamp> iaa_timestamps;

/**
* @brief Records the meta data for operation executed on Intel IAA
* by adding a new entry to the vector that stores operation timestamps, and
* setting the fields.
*
* @note The application must be single-threaded to ensure accurate
* correspondence between recorded values.
*/
inline void record_meta_data(void* desc_ptr) {
// The locations of opcode and source 1 size are the same in
// descriptor structures of different operation types,
// so just cast into hw_compress_descriptor
hw_compress_descriptor* const this_ptr = (hw_compress_descriptor*)desc_ptr;
uint32_t op_code_op_flags = this_ptr->op_code_op_flags;
uint8_t op = static_cast<uint8_t>(op_code_op_flags >> QPL_OPCODE_OFFSET_IN_OPCODE_OPFLAGS);
uint32_t source_size = this_ptr->source_size;

iaa_operation_timestamp iaa_op_timestamp;
iaa_op_timestamp.operation = op;
iaa_op_timestamp.src_size = source_size;
iaa_timestamps.push_back(iaa_op_timestamp);
}

/**
* @brief Records the current time as the start time for operation executed
* on Intel IAA by setting the start time field in the most recent entry of the vector
* that stores operation timestamps.
*
* @note The application must be single-threaded to ensure accurate
* correspondence between recorded start times and end times.
*/
inline void record_iaa_start_time() {
if (!iaa_timestamps.empty()) iaa_timestamps.back().start_time = std::chrono::high_resolution_clock::now();
}

/**
* @brief Records the current time as the end time for operation executed
* on Intel IAA by setting the end time field in the most recent entry of the vector
* that stores operation timestamps.
*
* @note The application must be single-threaded to ensure accurate
* correspondence between recorded start times and end times.
*/
inline void record_iaa_end_time() {
if (!iaa_timestamps.empty()) iaa_timestamps.back().end_time = std::chrono::high_resolution_clock::now();
}

/**
* @brief Skips the time measurement of Intel IAA by storing an invalid
* value as the end time. For example, since execution in async path
* is non-blocking, accurate time measurement is not possible. Therefore, timing
* should be skipped.
*/
inline void record_invalid_end_time_to_skip_iaa_timing() {
if (!iaa_timestamps.empty())
iaa_timestamps.back().end_time = std::chrono::time_point<std::chrono::high_resolution_clock>::min();
}

inline std::string get_name_from_opcode(uint8_t opcode) {
switch (opcode) {
case QPL_OPCODE_DECOMPRESS: return "decompress";
case QPL_OPCODE_COMPRESS: return "compress";
case QPL_OPCODE_CRC64: return "crc64";
case QPL_OPCODE_SCAN: return "scan";
case QPL_OPCODE_EXTRACT: return "extract";
case QPL_OPCODE_SELECT: return "select";
case QPL_OPCODE_EXPAND: return "expand";
default: return "unsupported operation";
}
}

static std::ostream& operator<<(std::ostream& os, const iaa_operation_timestamp& timestamp) {
auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(timestamp.end_time - timestamp.start_time);
double elapsedTime = elapsed.count();
std::string operation = get_name_from_opcode(timestamp.operation);

os << "Time taken to complete HW descriptor: " << elapsedTime << " ns, operation: " << operation
<< ", source size: " << timestamp.src_size;

return os;
}

/**
* @brief Calculates the elapsed time for execution on Intel IAA by finding the
* differences between the recorded start times and end times.
*
* This function is called in the destructor of the singleton hw_dispatcher
* object to output the timing values before the application terminates.
*
* @note The application must be single-threaded to ensure accurate
* correspondence between recorded start times and end times
*/
inline void calculate_iaa_elapsed_time() {
for (std::size_t i = 0; i < iaa_timestamps.size(); ++i) {
if (iaa_timestamps[i].end_time != std::chrono::time_point<std::chrono::high_resolution_clock>::min()) {
std::cout << "#" << i << ": " << iaa_timestamps[i] << "\n";
}
}
}

} // namespace qpl::ml::dispatcher

#endif //QPL_LOG_IAA_TIME
#endif //QPL_SOURCES_MIDDLE_LAYER_UTIL_HW_TIMING_UTIL_HPP_

0 comments on commit 117ec5b

Please sign in to comment.