[GPU] Improve memory pool access performance (openvinotoolkit#22977)
### Details:
 - Previously, the memory-pool conflict check compared `std::string` keys (`primitive_id`), which was time-consuming.
 - Memory dependencies (`mem_dep`) are now tracked by the node's integer `unique_id` instead of `std::string`; a minimal sketch of the idea follows the ticket list.

### Tickets:
 - 131916
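
A minimal sketch of why the switch helps (illustration only, not code from this patch; `id_restrictions` and `has_conflict_sketch` are hypothetical names that merely mirror the shape of `memory_pool::has_conflict` in the diff below): with `size_t` keys, every `std::set` lookup in the conflict check is a single integer comparison instead of a character-by-character `std::string` comparison.

```cpp
#include <cstddef>
#include <set>
#include <string>

// Before: restrictions keyed by primitive_id (std::string), so each set lookup
// during the conflict check pays for string comparisons.
using string_restrictions = std::set<std::string>;

// After: restrictions keyed by the node's unique_id (size_t), so each lookup
// is a plain integer comparison.
using id_restrictions = std::set<size_t>;

// Hypothetical stand-in for the conflict check: for each user of a pooled
// allocation, test membership in the requesting node's restriction set.
bool has_conflict_sketch(const id_restrictions& pool_users,
                         const id_restrictions& restrictions) {
    for (size_t user_id : pool_users) {
        if (restrictions.count(user_id) != 0)
            return true;  // pooled memory is still needed by a restricted user
    }
    return false;
}
```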
yeonbok authored Mar 7, 2024
1 parent e70e59b commit 5872549
Showing 12 changed files with 91 additions and 70 deletions.
29 changes: 17 additions & 12 deletions src/plugins/intel_gpu/include/intel_gpu/runtime/memory_pool.hpp
@@ -27,14 +27,15 @@ using primitive_id = std::string;
using memory_ptr = std::shared_ptr<memory>;

struct memory_user {
primitive_id _id;
size_t _unique_id;
uint32_t _network_id;
primitive_id _prim_id;

memory_user(primitive_id id, uint32_t network_id)
: _id(id), _network_id(network_id) {}
memory_user(size_t unique_id, uint32_t network_id, primitive_id prim_id)
: _unique_id(unique_id), _network_id(network_id), _prim_id(prim_id) {}

friend std::ostream& operator<<(std::ostream& os, const memory_user& memory_user) {
os << memory_user._id << "(" << memory_user._network_id << ")";
os << memory_user._prim_id << " (unique_id:" << memory_user._unique_id << ", net_id:" << memory_user._network_id << ")";
return os;
}
};
@@ -43,7 +44,7 @@ struct memory_user_comparer {
bool operator()(const memory_user& l_mu, const memory_user& r_mu) const {
if (l_mu._network_id != r_mu._network_id)
return l_mu._network_id < r_mu._network_id;
return l_mu._id < r_mu._id;
return l_mu._unique_id < r_mu._unique_id;
}
};

@@ -91,7 +92,7 @@ class memory_pool {
memory_pool();

memory_ptr alloc_memory(const layout& layout, allocation_type type, bool reset = true);
static bool has_conflict(const memory_set&, const std::set<primitive_id>&, uint32_t network_id);
static bool has_conflict(const memory_set&, const std::set<size_t>&, uint32_t network_id);

std::multimap<uint64_t, memory_record> _non_padded_pool;
std::map<layout, std::list<memory_record>, padded_pool_comparer> _padded_pool;
@@ -103,29 +104,33 @@ class memory_pool {
~memory_pool();
memory_ptr get_memory(const layout& layout,
const primitive_id& id,
size_t unique_id,
uint32_t network_id,
const std::set<primitive_id>& restrictions,
const std::set<size_t>& restrictions,
allocation_type type,
bool reusable = true,
bool reset = true); // get from pool or create memory allocation
memory_ptr get_memory(const layout& layout, allocation_type type, bool reset = true);
memory_ptr get_from_non_padded_pool(const layout& layout,
const primitive_id& id,
const primitive_id& prim_id,
size_t unique_id,
uint32_t network_id,
const std::set<primitive_id>&,
const std::set<size_t>&,
allocation_type type,
bool reset = true);
memory_ptr get_from_padded_pool(const layout& layout,
const primitive_id& id,
const primitive_id& prim_id,
size_t unique_id,
uint32_t network_id,
const std::set<primitive_id>& restrictions,
const std::set<size_t>& restrictions,
allocation_type type);
memory_ptr get_from_across_networks_pool(const layout& layout,
const primitive_id& id,
size_t unique_id,
uint32_t network_id,
allocation_type type);
void clear_pool_for_network(uint32_t network_id);
void release_memory(memory* memory, const primitive_id& id, uint32_t network_id);
void release_memory(memory* memory, const size_t& unique_id, primitive_id prim_id, uint32_t network_id);

size_t get_non_padded_pool_size() {
return _non_padded_pool.size();
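
For reference, a small standalone usage sketch of the new `memory_user` key (assumed usage that mirrors the `memory_user` and `memory_user_comparer` definitions in the hunk above; the sample ids are invented): pool users are ordered by `(_network_id, _unique_id)` only, and `_prim_id` is carried purely for logging.

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <set>
#include <string>

// Mirrors the patched memory_user: the string id is kept only for debug output.
struct memory_user {
    size_t _unique_id;
    uint32_t _network_id;
    std::string _prim_id;
};

// Mirrors memory_user_comparer: ordering uses integers only.
struct memory_user_comparer {
    bool operator()(const memory_user& l, const memory_user& r) const {
        if (l._network_id != r._network_id)
            return l._network_id < r._network_id;
        return l._unique_id < r._unique_id;
    }
};

int main() {
    std::set<memory_user, memory_user_comparer> users;
    users.insert(memory_user{42, 0, "convolution:conv1"});     // invented example entries
    users.insert(memory_user{7, 0, "reorder:input_reorder"});

    // Iteration order is (network_id, unique_id); _prim_id never affects it.
    for (const auto& u : users)
        std::cout << u._prim_id << " (unique_id:" << u._unique_id
                  << ", net_id:" << u._network_id << ")\n";
    return 0;
}
```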
@@ -20,7 +20,7 @@ using namespace cldnn;
void basic_memory_dependencies::run(program& p) {
OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "pass::BasicMemoryDependencies");
auto itr = p.get_processing_order().begin();
std::vector<primitive_id> past_outputs;
std::vector<size_t> past_outputs;
while (itr != p.get_processing_order().end()) {
auto& node = *itr;
itr++;
@@ -62,7 +62,7 @@ void basic_memory_dependencies::run(program& p) {
node->add_memory_dependency(past_outputs);
// if current node is an output add it to the outputs list after restriction.
if (node->is_output()) {
past_outputs.push_back(node->id());
past_outputs.push_back(node->get_unique_id());
if (node->is_type<mutable_data>()) {
// if output is mutable data, then propagate output flag to its dependencies
for (auto& dep : node->get_dependencies()) {
@@ -288,10 +288,10 @@ void prepare_quantization::prepare_scale_shift_opt(program &p, quantize_node& qu
p.add_connection(in_shift_node, new_quantize_node);
p.add_connection(out_scale_node, new_quantize_node);
p.add_connection(out_shift_node, new_quantize_node);
new_quantize_node.add_memory_dependency(in_scale_node.id());
new_quantize_node.add_memory_dependency(in_shift_node.id());
new_quantize_node.add_memory_dependency(out_scale_node.id());
new_quantize_node.add_memory_dependency(out_shift_node.id());
new_quantize_node.add_memory_dependency(in_scale_node.get_unique_id());
new_quantize_node.add_memory_dependency(in_shift_node.get_unique_id());
new_quantize_node.add_memory_dependency(out_scale_node.get_unique_id());
new_quantize_node.add_memory_dependency(out_shift_node.get_unique_id());
p.get_processing_order().insert(&new_quantize_node, &in_shift_node);
p.get_processing_order().insert(&new_quantize_node, &in_scale_node);
p.get_processing_order().insert(&new_quantize_node, &out_shift_node);
2 changes: 1 addition & 1 deletion src/plugins/intel_gpu/src/graph/include/pass_manager.h
@@ -327,7 +327,7 @@ class memory_dependency_pass : public base_pass {
explicit memory_dependency_pass(const std::string& pass_name) : base_pass(pass_name) {}
void add_memory_dependency(program_node* node, program_node* dep) {
if (node->can_be_optimized() || !dep->can_be_optimized()) {
node->add_memory_dependency(dep->id());
node->add_memory_dependency(static_cast<int32_t>(dep->get_unique_id()));
} else {
if (node->id() == dep->id()) {
return;
6 changes: 3 additions & 3 deletions src/plugins/intel_gpu/src/graph/include/primitive_inst.h
@@ -188,7 +188,7 @@ class primitive_inst {
}
return _network.get_primitives(users);
}
std::set<primitive_id> get_runtime_memory_dependencies() { return _runtime_memory_dependencies; }
std::set<size_t> get_runtime_memory_dependencies() { return _runtime_memory_dependencies; }

const kernel_impl_params* get_impl_params() const { return _impl_params.get(); }
// return pointer to const to prevent arbitrary 'execute' call -> use primitive_inst.execute() instead
@@ -264,7 +264,7 @@ class primitive_inst {
memory_pool& pool,
const program_node& _node,
const kernel_impl_params& impl_params,
const std::set<primitive_id>& memory_dependencies,
const std::set<size_t>& memory_dependencies,
uint32_t net_id,
bool is_internal,
size_t idx = 0,
@@ -333,7 +333,7 @@ class primitive_inst {
std::vector<cldnn::primitive_id> _exec_dep_ids;

// List of primitive ids that this primitive can't share memory buffers with
std::set<primitive_id> _runtime_memory_dependencies;
std::set<size_t> _runtime_memory_dependencies;

// This is sub-network generated on demand to execute unfused primitives sequence instead of single fused primitive
// Needed for dynamic path only, as fusion in some cases may be illegal, but it can't be checked on program build phase,
13 changes: 9 additions & 4 deletions src/plugins/intel_gpu/src/graph/include/program_node.h
@@ -202,9 +202,9 @@ struct program_node {
size_t get_dependency_index(const program_node& node) const;
size_t get_user_index(const program_node& node) const;

std::set<primitive_id> get_memory_dependencies() const;
void add_memory_dependency(primitive_id);
void add_memory_dependency(std::vector<primitive_id>);
std::set<size_t> get_memory_dependencies() const;
void add_memory_dependency(size_t);
void add_memory_dependency(std::vector<size_t>);

template <class PType>
bool have_user_with_type() const {
@@ -425,6 +425,11 @@ struct program_node {
unique_id = cur_id++;
}

void set_unique_id(size_t _id) {
unique_id = _id;
}


static void reset_unique_id() {
cur_id = 0;
}
@@ -473,7 +478,7 @@ struct program_node {
std::list<program_node*> users;

// list of primitives that can reuse same memory buffers due to execution order conflicts
std::set<primitive_id> memory_dependencies;
std::set<size_t> memory_dependencies;

impl_types impl_type = impl_types::any;
bool constant = false;
2 changes: 1 addition & 1 deletion src/plugins/intel_gpu/src/graph/network.cpp
@@ -1379,7 +1379,7 @@ void network::transfer_memory_to_device(std::shared_ptr<primitive_inst> instance
auto device_mem = inst_mem.get_engine()->allocate_memory(inst_mem.get_layout(), allocation_type::usm_device, false);
device_mem->copy_from(get_stream(), inst_mem);
GPU_DEBUG_LOG << "[" << node.id() << ": constant]" << std::endl;
_memory_pool->release_memory(&inst_mem, node.id(), get_id());
_memory_pool->release_memory(&inst_mem, node.get_unique_id(), node.id(), get_id());
instance->set_output_memory(device_mem);
}
}
13 changes: 6 additions & 7 deletions src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -152,16 +152,16 @@ static memory::ptr get_memory_from_pool(engine& _engine,
const layout& layout,
allocation_type type,
bool reusable_across_network,
const std::set<std::string>& memory_dependencies,
const std::set<size_t>& memory_dependencies,
bool reset = true,
memory* curr_memory = nullptr) {
OPENVINO_ASSERT(!layout.is_dynamic() || layout.has_upper_bound(),
"[GPU] Can't allocate output for dynamic layout without upper bound");
// Use layout with max tensor for dynamic shape with upper bound
if (_node.get_program().get_config().get_property(ov::intel_gpu::enable_memory_pool)) {
if (curr_memory != nullptr)
pool.release_memory(curr_memory, _node.id(), net_id);
return pool.get_memory(layout, _node.id(), net_id, memory_dependencies, type, reusable_across_network, reset);
pool.release_memory(curr_memory, _node.get_unique_id(), _node.id(), net_id);
return pool.get_memory(layout, _node.id(), _node.get_unique_id(), net_id, memory_dependencies, type, reusable_across_network, reset);
}
return pool.get_memory(layout, type, reset);
}
@@ -962,7 +962,7 @@ void primitive_inst::do_runtime_skip_reorder() {
update_memory_dependencies = [&](std::vector<primitive_inst*> users) {
for (auto& user : users) {
GPU_DEBUG_TRACE_DETAIL << "[do runtime skip reorder] add " << id() << " to restriction list of " << user->id() << std::endl;
user->_runtime_memory_dependencies.insert(id());
user->_runtime_memory_dependencies.insert(get_node().get_unique_id());
if (user->can_be_optimized())
update_memory_dependencies(user->get_user_insts());
}
@@ -1465,7 +1465,7 @@ primitive_inst::primitive_inst(network& network)
, _mem_allocated(false)
, _type(nullptr) {}

primitive_inst::primitive_inst(network& network, program_node const& node, bool allocate_memory)
primitive_inst::primitive_inst(network & network, program_node const& node, bool allocate_memory)
: _network(network)
, _node(&node)
, _node_output_layout(node.get_output_layout())
@@ -1775,7 +1775,7 @@ memory::ptr primitive_inst::allocate_output(engine& _engine,
memory_pool& pool,
const program_node& _node,
const kernel_impl_params& impl_params,
const std::set<primitive_id>& memory_dependencies,
const std::set<size_t>& memory_dependencies,
uint32_t net_id,
bool is_internal,
size_t idx,
@@ -2124,5 +2124,4 @@ std::string primitive_inst::get_implementation_name() const {

return "undef";
}

} // namespace cldnn
12 changes: 9 additions & 3 deletions src/plugins/intel_gpu/src/graph/program.cpp
@@ -769,7 +769,9 @@ const std::vector<primitive_id>& program::get_allocating_order(bool forced_updat
void program::prepare_memory_dependencies() {
if (!_config.get_property(ov::intel_gpu::enable_memory_pool))
return;

for (auto& node : get_processing_order()) {
node->add_memory_dependency(node->get_unique_id());
}
apply_opt_pass<basic_memory_dependencies>();
apply_opt_pass<skipped_branch_memory_dependencies>();
apply_opt_pass<oooq_memory_dependencies>();
@@ -781,9 +783,13 @@ std::string program::get_memory_dependencies_string() const {
while (itr != processing_order.end()) {
auto& node = *itr;
itr++;
mem_dep = mem_dep.append("primitive: ").append(node->id()).append(" restricted list: ");
mem_dep = mem_dep.append("primitive: ")
.append(node->id())
.append("(unique_id:")
.append(std::to_string(node->get_unique_id()))
.append(") restricted list: ");
for (auto it : node->get_memory_dependencies())
mem_dep = mem_dep.append(it).append(", ");
mem_dep = mem_dep.append(std::to_string(it)).append(",");
mem_dep = mem_dep.append("\n");
}
return mem_dep;
9 changes: 5 additions & 4 deletions src/plugins/intel_gpu/src/graph/program_node.cpp
@@ -60,7 +60,6 @@ program_node::program_node(std::shared_ptr<primitive> prim, program& prog)
output_layouts.push_back(output_layout);
valid_output_layouts.push_back(false);
}
add_memory_dependency(id());
}
}

@@ -196,11 +195,11 @@ void program_node::remove_dependency(size_t idx) {
dependencies.erase(dependencies.begin() + idx);
}

std::set<primitive_id> program_node::get_memory_dependencies() const { return memory_dependencies; }
std::set<size_t> program_node::get_memory_dependencies() const { return memory_dependencies; }

void program_node::add_memory_dependency(primitive_id prim) { memory_dependencies.insert(prim); }
void program_node::add_memory_dependency(size_t prim) { memory_dependencies.insert(prim); }

void program_node::add_memory_dependency(std::vector<primitive_id> prim_list) {
void program_node::add_memory_dependency(std::vector<size_t> prim_list) {
memory_dependencies.insert(prim_list.begin(), prim_list.end());
}

@@ -639,6 +638,7 @@ void program_node::add_dependant_shape_of_node(const program_node* node) {
}

void program_node::save(cldnn::BinaryOutputBuffer& ob) const {
ob << unique_id;
ob << valid_output_layouts;
ob << output_layouts;

@@ -775,6 +775,7 @@ void program_node::save(cldnn::BinaryOutputBuffer& ob) const {
}

void program_node::load(cldnn::BinaryInputBuffer& ib) {
ib >> unique_id;
ib >> valid_output_layouts;
ib >> output_layouts;
