
Commit

memory alignment

yuanlehome committed Sep 8, 2022
1 parent 0c74110 commit 0b5cf16
Showing 3 changed files with 33 additions and 21 deletions.
7 changes: 4 additions & 3 deletions paddle/fluid/inference/api/analysis_config.cc
@@ -547,17 +547,18 @@ void AnalysisConfig::EnableTensorRtEngine(
}

use_tensorrt_ = true;

#if PADDLE_WITH_TENSORRT
// https://forums.developer.nvidia.com/t/nvinfer1-createexecutioncontextwithoutdevicememory-returns-nullptr/111878/2
// when trt version less than 7.2,
// createExecutionContextWithoutDeviceMemory() has bug
// so, we cannot enable engine context memory sharing
// createExecutionContextWithoutDeviceMemory() has bug.
// so, we cannot enable engine context memory sharing.
#if IS_TRT_VERSION_GE(7200)
trt_engine_memory_sharing_ = true;
#else
LOG(WARNING)
<< "TensorRT engine context memory sharing needs version 7.2 and after.";
trt_engine_memory_sharing_ = false;
#endif
#endif
tensorrt_workspace_size_ = workspace_size;
tensorrt_max_batchsize_ = max_batch_size;
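For context, the IS_TRT_VERSION_GE(7200) gate above compares a packed TensorRT version number against 7200 (i.e. 7.2.0.0). A minimal sketch of how such a macro can be built from the NV_TENSORRT_* macros in NvInfer.h; the exact Paddle definition lives elsewhere in the tree and may differ slightly:

#include <NvInfer.h>

// Packs major/minor/patch/build into one integer, e.g. TensorRT 7.2.3.4
// becomes 7234, so IS_TRT_VERSION_GE(7200) holds for any 7.2.x release.
#define IS_TRT_VERSION_GE(version)                        \
  ((NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 +  \
    NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD) >= version)
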
5 changes: 4 additions & 1 deletion paddle/fluid/inference/tensorrt/engine.cc
@@ -124,7 +124,10 @@ void TensorRTEngine::Execute(int batch_size,
void *context_memory{nullptr};
context_memory =
inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
.getContextMemory(this);
.getContextMemory(
predictor_id_per_thread,
phi::GPUPlace(device_id_),
phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
infer_context->setDeviceMemory(context_memory);
}
if (!with_dynamic_shape()) {
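The new getContextMemory() signature takes the calling predictor's id, the GPU the engine is bound to, and the CUDA stream it executes on, so the manager can allocate through Paddle's allocator rather than raw cudaMalloc. A small sketch of wrapping the raw CUDA handles into the phi types used above, assuming phi::StreamId is an integral handle wide enough to hold a cudaStream_t (which is what the reinterpret_cast in the diff relies on):

#include <cuda_runtime.h>
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/stream.h"

void example() {
  cudaStream_t raw_stream = nullptr;
  cudaStreamCreate(&raw_stream);

  // Device 0 and the stream above, in the forms expected by getContextMemory().
  phi::GPUPlace place(/*device_id=*/0);
  phi::Stream stream(reinterpret_cast<phi::StreamId>(raw_stream));
  // place and stream can now be forwarded to
  // TRTEngineManager::getContextMemory(predictor_id, place, stream).
}
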
42 changes: 25 additions & 17 deletions paddle/fluid/inference/tensorrt/engine.h
@@ -16,6 +16,7 @@ limitations under the License. */

#include <NvInfer.h>

#include <cstdint>
#include <map>
#include <memory>
#include <mutex> // NOLINT
@@ -37,6 +38,8 @@ limitations under the License. */
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/stream.h"
#include "paddle/utils/any.h"

namespace paddle {
@@ -705,6 +708,7 @@ class TensorRTEngine {

class TRTEngineManager {
using PredictorID = int;
using AllocationPtr = phi::Allocator::AllocationPtr;

public:
bool Empty() const {
@@ -753,7 +757,7 @@
}

void DeleteAll() {
// std::unique_lock<std::mutex> lock(mutex_);
std::unique_lock<std::mutex> lock(mutex_);
for (auto& item : engines_) {
item.second.reset(nullptr);
}
@@ -785,37 +789,41 @@
}
}

void* getContextMemory(TensorRTEngine* trt_engine) {
void* getContextMemory(PredictorID predictor_id,
const phi::GPUPlace& place,
const phi::Stream& stream) {
std::unique_lock<std::mutex> lock(mutex_);
auto predictor_id = trt_engine->predictor_id_per_thread;
auto alignment = getAlignmentSize(place);
if (context_memorys_.count(predictor_id) == 0) {
void* context_memory{nullptr};
cudaMalloc(&context_memory, max_ctx_mem_size_);
if (context_memory == nullptr) {
PADDLE_ENFORCE_EQ(
max_ctx_mem_size_,
0,
platform::errors::InvalidArgument(
"The context memory size is non-zero, but the "
"memory address we applied for is NULL, we failed to set it."));
}
context_memorys_[predictor_id] = context_memory;
auto context_memory =
memory::Alloc(place, max_ctx_mem_size_ + alignment, stream);
// context_memory_[predictor_id].reset(context_memory.release());
context_memorys_[predictor_id] = std::move(context_memory);
}
return context_memorys_[predictor_id];
return getAlignedMemory(context_memorys_[predictor_id]->ptr(), alignment);
}

void releaseContextMemory(PredictorID predictor_id) {
std::unique_lock<std::mutex> lock(mutex_);
if (context_memorys_.count(predictor_id)) {
cudaFree(context_memorys_[predictor_id]);
context_memorys_[predictor_id].reset(nullptr);
context_memorys_.erase(predictor_id);
}
}

private:
size_t getAlignmentSize(const phi::GPUPlace& place) {
const auto& prop = platform::GetDeviceProperties(place.GetDeviceId());
return prop.textureAlignment;
}

void* getAlignedMemory(void* addr, size_t alignment) {
return reinterpret_cast<void*>(uintptr_t(addr) & (~(alignment - 1)));
}

mutable std::mutex mutex_;
size_t max_ctx_mem_size_{0};
std::unordered_map<PredictorID, void*> context_memorys_;
std::unordered_map<PredictorID, AllocationPtr> context_memorys_;
std::unordered_map<std::string, std::unique_ptr<TensorRTEngine>> engines_;
};

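The private helpers above size the allocation to max_ctx_mem_size_ + alignment and then mask the returned pointer with ~(alignment - 1). That mask clears the low bits, i.e. it rounds an address down to the previous multiple of alignment (alignment is assumed to be a power of two, as cudaDeviceProp::textureAlignment is in practice). A self-contained sketch of the arithmetic with a hypothetical pointer value; the round-up variant that first adds alignment - 1 is shown for comparison:

#include <cinttypes>
#include <cstdint>
#include <cstdio>

int main() {
  const std::uintptr_t alignment = 512;        // e.g. prop.textureAlignment
  const std::uintptr_t addr = 0x7f0000001234;  // hypothetical raw pointer value

  std::uintptr_t down = addr & ~(alignment - 1);                  // 0x7f0000001200
  std::uintptr_t up = (addr + alignment - 1) & ~(alignment - 1);  // 0x7f0000001400

  std::printf("raw:        0x%" PRIxPTR "\n", addr);
  std::printf("round down: 0x%" PRIxPTR "\n", down);
  std::printf("round up:   0x%" PRIxPTR "\n", up);
  return 0;
}
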
