[CPU] optimize TensorIterator DynamicBuffer (openvinotoolkit#15163)
* optimize TensorIterator DynamicBuffer by preallocating a large chunk of intermediate buffer.

code cleanup.

review update: always copy in transfer, since avoiding the copy is not worthwhile.

review update: hold mem_holder_buffer as a dnnl::memory instead of a shared_ptr to it.

review update: reuse mem_holder_buffer even if the shape changes.

review update: grow the buffer by a growth factor.

review update: bug fix.

* fix code style

* review update: rewrite the dynamic buffer using the CPU plugin Memory class instead of dnnl::memory

* Update src/plugins/intel_cpu/src/nodes/tensoriterator.cpp

Co-authored-by: Maksim Kutakov <[email protected]>

* Update src/plugins/intel_cpu/src/nodes/tensoriterator.cpp

Co-authored-by: Maksim Kutakov <[email protected]>

* review update: minor fix

---------

Co-authored-by: Maksim Kutakov <[email protected]>
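At its core, the change replaces per-iteration reallocation with an amortized, geometrically growing concat buffer. The following standalone sketch illustrates that strategy under simplified assumptions: the class name GrowingConcatBuffer, the std::vector backing store, and the memcpy-based append are illustrative stand-ins for the PR's Memory-based implementation, and only the positive-stride (append-at-end) case is modeled.

#include <cstdint>
#include <cstring>
#include <vector>

// Sketch of the preallocation strategy described above (illustrative names,
// not the PR's actual classes).
class GrowingConcatBuffer {
public:
    explicit GrowingConcatBuffer(size_t chunk_bytes, int max_iter_count = -1)
        : chunk_bytes_(chunk_bytes), max_iter_count_(max_iter_count) {}

    // Append one iteration's output chunk; growth is amortized O(1) per append.
    void append(const void* src) {
        if ((num_execs_ + 1) * chunk_bytes_ > data_.size()) {
            // If the trip count is known, size for all iterations at once;
            // otherwise grow geometrically (growth factor 2), as create_buffer() does.
            const int iters = (max_iter_count_ != -1) ? max_iter_count_
                                                      : (num_execs_ == 0 ? 1 : 2 * num_execs_);
            data_.resize(static_cast<size_t>(iters) * chunk_bytes_);  // keeps existing data, like move_buffer()
        }
        std::memcpy(data_.data() + static_cast<size_t>(num_execs_) * chunk_bytes_, src, chunk_bytes_);
        ++num_execs_;
    }

    // Only the first num_execs_ chunks are valid, mirroring DynamicBuffer::transfer().
    const uint8_t* data() const { return data_.data(); }
    size_t valid_bytes() const { return static_cast<size_t>(num_execs_) * chunk_bytes_; }

private:
    size_t chunk_bytes_;
    int max_iter_count_;
    int num_execs_ = 0;
    std::vector<uint8_t> data_;
};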
ceciliapeng2011 and maxnick authored Feb 21, 2023
1 parent 8153664 commit 86b5004
Showing 2 changed files with 119 additions and 64 deletions.
162 changes: 105 additions & 57 deletions src/plugins/intel_cpu/src/nodes/tensoriterator.cpp
@@ -231,97 +231,148 @@ DynamicBuffer::DynamicBuffer(const MemoryPtr &from_, const std::vector<MemoryPtr> &to_, const PortMap &map_rule_)
 }
 
 void DynamicBuffer::execute(const dnnl::engine& eng, const int iter) {
+    if (from->getStaticDims()[map_rule.axis] != std::abs(map_rule.stride))
+        IE_THROW() << "TensorIterator (Loop) has incorrect output shape[axis] after iteration for concatenation. " << std::abs(map_rule.stride) <<
+                " is expected, but actual: " << from->getStaticDims()[map_rule.axis];
+
     if (iter == 0) {
         init(eng);
-        return;
     }
 
-    auto new_buffer = create_buffer(eng);
-    move_buffer(new_buffer);
+    // if chunk_offset_in_byte is out of range of the buffer holder, reallocate a larger chunk
+    if (check_buffer()) {
+        auto new_buffer = create_buffer(eng);
+        move_buffer(new_buffer);
+    }
+
     move_data();
 }
 
-void DynamicBuffer::init(const dnnl::engine& eng) {
-    chunk_offset_in_byte = 0;
-    buffer_offset_in_byte = 0;
+void DynamicBuffer::reset(int max_iter_count_) {
+    max_iter_count = max_iter_count_;
+}
 
-    const auto axis = map_rule.axis;
+void DynamicBuffer::init(const dnnl::engine& eng) {
     const auto stride = map_rule.stride;
     const auto abs_stride = std::abs(stride);
 
-    auto src_mem = from->GetPrimitive();
-    auto src_desc = src_mem.get_desc();
-    auto dims = src_desc.dims();
-
-    if (dims[axis] != abs_stride)
-        IE_THROW() << "TensorIterator (Loop) has incorrect output shape[axis] after iteration for concatenation. " << abs_stride <<
-                " is expected, but actual: " << dims[axis];
-
+    // We have no idea of the "from" node memory dims until the sub_graph has been executed.
+    const auto& src_mem = from->GetPrimitive();
+    const auto& src_desc = src_mem.get_desc();
+    const auto& dims = src_desc.dims();
     count = std::accumulate(dims.begin(), dims.begin() + map_rule.axis, size_t(1), std::multiplies<size_t>());
     len = std::accumulate(dims.begin() + map_rule.axis + 1, dims.end(), elem_size, std::multiplies<size_t>());
-    mem_holder_buffer = memory(src_desc, eng);
-    copy(reinterpret_cast<const uint8_t*>(from->GetPtr()), get_ptr(mem_holder_buffer), 0, 0, 1, from->GetSize());
-}
+    chunk_unit_in_byte = abs_stride * len;
 
-dnnl::memory DynamicBuffer::create_buffer(const dnnl::engine& eng) {
-    const auto axis = map_rule.axis;
-    const auto stride = map_rule.stride;
-    const auto abs_stride = std::abs(stride);
-
-    const auto old_desc = mem_holder_buffer.get_desc();
-    auto dims = old_desc.dims();
-
-    if (from->getStaticDims()[axis] != abs_stride)
-        IE_THROW() << "TensorIterator (Loop) has incorrect output shape[axis] after iteration for concatenation. " << abs_stride <<
-                " is expected, but actual: " << from->getStaticDims()[axis];
+    if (!mem_holder_buffer) { // else reuse the buffer holder of the last inference
+        // preallocate a large chunk of memory to hold the intermediate concatenated outputs of all iterations
+        mem_holder_buffer = create_buffer(eng);
+    }
 
-    dims[axis] += abs_stride;
-    dnnl::memory::desc new_buffer_desc(dims, old_desc.data_type(), DnnlExtensionUtils::GetPlainFormatByRank(dims.size()));
+    // reset chunk_offset_in_byte before the first execution
+    chunk_stride_in_byte = mem_holder_buffer->GetSize() / count;
+    chunk_offset_in_byte = stride > 0 ? 0 : (chunk_stride_in_byte - chunk_unit_in_byte);
+    num_execs = 0;
+}
 
-    if (stride > 0.0f) {
-        chunk_offset_in_byte += new_buffer_desc.data.format_desc.blocking.strides[axis] * elem_size * abs_stride;
+bool DynamicBuffer::check_buffer() {
+    if (map_rule.stride > 0) {
+        if (chunk_offset_in_byte + chunk_unit_in_byte > chunk_stride_in_byte) return true;
     } else {
-        buffer_offset_in_byte = from->GetPrimitive().get_desc().data.format_desc.blocking.strides[axis] * elem_size * abs_stride;
+        if (chunk_offset_in_byte < 0) return true;
     }
+    return false;
+}
+
+MemoryPtr DynamicBuffer::create_buffer(const dnnl::engine& eng) {
+    const auto abs_stride = std::abs(map_rule.stride);
 
-    return dnnl::memory(new_buffer_desc, eng);
+    const auto estimate_iters = [&] () {
+        if (max_iter_count != -1) return max_iter_count;
+
+        // in case we have no idea of the memory upper boundary
+        return (num_execs == 0) ? 1 : 2 * num_execs; // growth factor 2
+    };
+    const auto estimated_iters = estimate_iters();
+    const Shape _shape = Shape({count, static_cast<size_t>(abs_stride * estimated_iters), len / elem_size});
+    auto _descCreator = BlockedDescCreator::getCommonCreators().at(LayoutType::ncsp);
+    auto new_buffer_desc = _descCreator->createSharedDesc(from->getDesc().getPrecision(), _shape);
+
+    auto _ptr = std::make_shared<Memory>(eng);
+    _ptr->Create(*new_buffer_desc);
+    return _ptr;
 }
 
-void DynamicBuffer::move_buffer(dnnl::memory new_buffer) {
-    const auto axis = map_rule.axis;
-    const auto src_stride = mem_holder_buffer.get_desc().dims()[axis] * len;
-    const auto dst_stride = new_buffer.get_desc().dims()[axis] * len;
+void DynamicBuffer::move_buffer(const MemoryPtr& new_buffer) {
+    const auto stride = map_rule.stride;
 
-    copy(get_ptr(mem_holder_buffer), get_ptr(new_buffer) + buffer_offset_in_byte,
-            src_stride, dst_stride, count, src_stride);
+    // copy data from the old buffer to the new buffer
+    const auto src_stride = chunk_stride_in_byte;
+    const auto dst_stride = new_buffer->getStaticDims()[1] * len;
+
+    const auto valid_size = chunk_unit_in_byte * num_execs;
+    const auto src_offset_in_byte = stride > 0 ? 0 : (src_stride - valid_size);
+    chunk_offset_in_byte = stride > 0 ? 0 : (dst_stride - valid_size); // reset chunk_offset_in_byte
+
+    copy(reinterpret_cast<uint8_t*>(mem_holder_buffer->GetPtr()) + src_offset_in_byte,
+            reinterpret_cast<uint8_t*>(new_buffer->GetPtr()) + chunk_offset_in_byte,
+            src_stride, dst_stride, count, valid_size);
 
+    // assign mem_holder_buffer
     mem_holder_buffer = new_buffer;
+    chunk_stride_in_byte = mem_holder_buffer->GetSize() / count;
+
+    // adjust for the next execution
+    if (stride > 0) {
+        chunk_offset_in_byte += valid_size;
+    } else {
+        chunk_offset_in_byte -= chunk_unit_in_byte;
+    }
 }
 
 void DynamicBuffer::move_data() {
-    const auto axis = map_rule.axis;
     const auto src_stride = abs(map_rule.stride) * len;
-    const auto dst_stride = mem_holder_buffer.get_desc().dims()[axis] * len;
+    const auto dst_stride = chunk_stride_in_byte;
 
-    copy(reinterpret_cast<const uint8_t*>(from->GetPtr()), get_ptr(mem_holder_buffer) + chunk_offset_in_byte,
-            src_stride, dst_stride, count, src_stride);
+    copy(reinterpret_cast<const uint8_t*>(from->GetPtr()),
+            reinterpret_cast<uint8_t*>(mem_holder_buffer->GetPtr()) + chunk_offset_in_byte,
+            src_stride, dst_stride, count, chunk_unit_in_byte);
+
+    // adjust for the next execution
+    num_execs++;
+    if (map_rule.stride > 0) {
+        chunk_offset_in_byte += chunk_unit_in_byte;
+    } else {
+        chunk_offset_in_byte -= chunk_unit_in_byte;
+    }
 }
 
 void DynamicBuffer::transfer(const Node* node) {
-    if (mem_holder_buffer) {
+    if (mem_holder_buffer && num_execs > 0) {
+        const auto axis = map_rule.axis;
+        const auto stride = map_rule.stride;
+        const auto abs_stride = std::abs(stride);
+
+        const auto& src_mem = from->GetPrimitive();
+        const auto& src_desc = src_mem.get_desc();
+        auto dims = src_desc.dims();
+        dims[axis] = abs_stride * num_execs;
         const auto desc = node->getBaseMemDescAtOutputPort(map_rule.from)->cloneWithNewDims(
-                DnnlExtensionUtils::convertToVectorDims(mem_holder_buffer.get_desc().dims()));
+                DnnlExtensionUtils::convertToVectorDims(dims));
 
         redefineToMemories(to, desc);
 
-        copy(get_ptr(mem_holder_buffer), reinterpret_cast<uint8_t*>(to.front()->GetPtr()), 0, 0, 1, to.front()->GetSize());
+        const auto src_stride = chunk_stride_in_byte;
+        const auto dst_stride = to.front()->getStaticDims()[axis] * len;
+        const auto valid_size = chunk_unit_in_byte * num_execs;
+        const auto src_offset_in_byte = stride > 0 ? 0 : (src_stride - valid_size);
+        copy(reinterpret_cast<uint8_t*>(mem_holder_buffer->GetPtr()) + src_offset_in_byte,
+                reinterpret_cast<uint8_t*>(to.front()->GetPtr()),
+                src_stride, dst_stride, count, dst_stride);
     } else {
         VectorDims newDims = to.front()->GetShape().getDims();
         nullifyUndefinedDims(newDims);
 
         const auto desc = node->getBaseMemDescAtOutputPort(map_rule.from)->cloneWithNewDims(newDims);
         redefineToMemories(to, desc);
     }
-
-    mem_holder_buffer.reset(nullptr);
 }
 
 void DynamicBuffer::copy(const uint8_t* src, uint8_t* dst, const size_t src_stride, const size_t dst_stride, const size_t count, const size_t len) {
@@ -330,13 +381,6 @@ void DynamicBuffer::copy(const uint8_t* src, uint8_t* dst, const size_t src_stride, const size_t dst_stride, const size_t count, const size_t len) {
     });
 }
 
-uint8_t* DynamicBuffer::get_ptr(dnnl::memory& prim) {
-    auto ptr = static_cast<uint8_t*>(prim.get_data_handle());
-    auto md = prim.get_desc().data;
-    dnnl::impl::memory_desc_wrapper wrapper(md);
-    return ptr + wrapper.offset0() * wrapper.data_type_size();
-}
-
 bool TensorIterator::isSupportedOperation(const std::shared_ptr<const ov::Node>& op, std::string& errorMessage) noexcept {
     try {
         if (!one_of(op->get_type_info(),
@@ -517,6 +561,10 @@ void TensorIterator::prepareParams() {
         prepareOutputPorts();
         prepareBackEdges();
     }
+
+    // reset local states of DynamicBuffer
+    for (auto& buffer : buffers)
+        buffer->reset(lastUsedTripCount);
 }
 }

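For readers of the diff above: copy() is a plain strided block copy, and the chunk_* bookkeeping reduces to choosing its offsets. A minimal serial sketch of its semantics follows; the real helper parallelizes the outer loop with parallel_for, and the name strided_copy is illustrative.

#include <cstddef>
#include <cstdint>
#include <cstring>

// Sketch of the copy() helper's semantics: move `count` blocks of `len` bytes,
// block i going from src + i * src_stride to dst + i * dst_stride.
void strided_copy(const uint8_t* src, uint8_t* dst,
                  size_t src_stride, size_t dst_stride,
                  size_t count, size_t len) {
    for (size_t i = 0; i < count; ++i)
        std::memcpy(dst + i * dst_stride, src + i * src_stride, len);
}

With the holder laid out as {count, capacity, len / elem_size}, chunk_stride_in_byte is the byte size of one outer slice, and move_data() appends one chunk_unit_in_byte block per slice at chunk_offset_in_byte, so the concatenation needs no per-iteration reallocation until the capacity is exhausted.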
21 changes: 14 additions & 7 deletions src/plugins/intel_cpu/src/nodes/tensoriterator.h
@@ -65,33 +65,40 @@ class PortChecker {
 class DynamicBuffer {
 public:
     DynamicBuffer(const MemoryPtr &from_, const std::vector<MemoryPtr> &to_, const PortMap &map_rule_);
     ~DynamicBuffer() = default;
 
     void execute(const dnnl::engine& eng, const int iter);
     void transfer(const Node* node);
 
+    void reset(int max_iter_count_); // reset local states
+
 private:
     void init(const dnnl::engine& eng);
 
     /* methods for resize and refill buffer */
-    dnnl::memory create_buffer(const dnnl::engine& eng);
-    void move_buffer(dnnl::memory new_buffer);
+    bool check_buffer();
+    MemoryPtr create_buffer(const dnnl::engine& eng);
+    void move_buffer(const MemoryPtr& new_buffer);
     void move_data();
 
     static void copy(const uint8_t* src, uint8_t* dst, const size_t src_stride, const size_t dst_stride, const size_t count, const size_t len);
-    static uint8_t* get_ptr(dnnl::memory& prim);
 
     /* variable states */
     size_t len = 1lu;
     size_t count = 1lu;
-    size_t elem_size = 0lu;
 
+    ptrdiff_t chunk_stride_in_byte = 0;
     ptrdiff_t chunk_offset_in_byte = 0;
-    ptrdiff_t buffer_offset_in_byte = 0;
+    size_t chunk_unit_in_byte = 0lu; // the number of bytes copied per count slice per execution (iteration)
+    int num_execs = 0; // the number of executions that have happened
+    int max_iter_count = -1; // estimated maximum iteration count
 
     /* invariable states */
     MemoryPtr from;
     std::vector<MemoryPtr> to;
     PortMap map_rule;
+    size_t elem_size = 0lu;
 
-    dnnl::memory mem_holder_buffer;
+    MemoryPtr mem_holder_buffer;
 };
 
 class TensorIterator : public Node {
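One way to see the effect of the growth factor: when the trip count is unknown (max_iter_count == -1), create_buffer() sizes the holder for 2 * num_execs chunks each time check_buffer() fires, so the number of reallocations is logarithmic in the iteration count. A small self-contained simulation of that schedule, under the simplifying assumptions of a positive stride and an illustrative iteration count of 1000:

#include <cstdio>

// Simulation of the reallocation schedule mirroring check_buffer() and the
// "2 * num_execs" estimate in create_buffer() when the trip count is unknown.
int main() {
    int capacity_chunks = 1;  // init(): the first buffer holds one iteration's chunk
    int reallocs = 0;
    for (int num_execs = 0; num_execs < 1000; ++num_execs) {
        if (num_execs + 1 > capacity_chunks) {  // the next chunk would not fit
            capacity_chunks = 2 * num_execs;    // growth factor 2
            ++reallocs;
        }
    }
    std::printf("1000 iterations -> %d reallocations\n", reallocs);  // prints 10
    return 0;
}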
