[NPU] Adding support for the set_tensors method (#26823)
### Details:
 - *Adding support for the set_tensors method*

set_tensors behaves differently depending on whether the plugin or the
compiler handles the batch:
- when the compiler handles batching, a contiguous L0 tensor has to be
created and every user tensor is copied into that big tensor, even when
the tensors belong to the same L0 context
- when the plugin handles batching and the remote tensor feature is
supported, no copy is performed if the tensors belong to the same L0
context (see the usage sketch after this list).
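For context, here is a minimal sketch of how a caller could exercise this path through the public OpenVINO 2.0 API; the model file, input shape, and batch size are illustrative assumptions, not part of this change:

```cpp
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    // Hypothetical single-input model whose batch dimension sits on axis 0.
    auto compiled = core.compile_model("model.xml", "NPU");
    ov::InferRequest request = compiled.create_infer_request();

    // One tensor per batch item; each tensor keeps a batch size of 1.
    std::vector<ov::Tensor> batch;
    for (size_t i = 0; i < 4; ++i) {
        batch.emplace_back(ov::element::f32, ov::Shape{1, 3, 224, 224});
    }

    // Hand all batch items to the plugin at once. Depending on who handles
    // batching, the NPU plugin either copies them into one contiguous L0
    // tensor or binds them directly without a copy.
    request.set_tensors(compiled.input(), batch);
    request.infer();
    return 0;
}
```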

### Tickets:
 - *EISW-116494*
pereanub authored Oct 8, 2024
1 parent 4d3a534 commit 4a30cb8
Showing 9 changed files with 872 additions and 135 deletions.
27 changes: 22 additions & 5 deletions src/plugins/intel_npu/src/al/include/sync_infer_request.hpp
@@ -22,7 +22,7 @@ namespace intel_npu {
*/
class SyncInferRequest : public ov::IInferRequest {
public:
explicit SyncInferRequest(const std::shared_ptr<const ICompiledModel>& compiledModel);
explicit SyncInferRequest(const std::shared_ptr<const ICompiledModel>& compiledModel, const Config& config);

/**
* @brief Gets an input/output tensor for inference.
@@ -50,8 +50,8 @@ class SyncInferRequest : public ov::IInferRequest {
* @brief Currently there is no support implemented for batches of tensors, thus this call is a simple redirection
* to the "set_tensor" one.
*/
void set_tensors(const ov::Output<const ov::Node>& port,
const std::vector<ov::SoPtr<ov::ITensor>>& tensors) override;
virtual void set_tensors(const ov::Output<const ov::Node>& port,
const std::vector<ov::SoPtr<ov::ITensor>>& tensors) override;

/**
* @brief Gets inputs for infer request
@@ -126,6 +126,15 @@ class SyncInferRequest : public ov::IInferRequest {
*/
void check_tensor(const ov::Output<const ov::Node>& port, const ov::SoPtr<ov::ITensor>& tensor) const;

/**
* @brief Basic checks for input tensors
*
* @param port Input port
* @param tensors Input tensors
*/
void check_batched_tensors(const ov::Output<const ov::Node>& port,
const std::vector<ov::SoPtr<ov::ITensor>>& tensors) const;

/**
* @brief Check that all tensors are valid. Throws an exception if it's not.
*/
@@ -153,14 +162,22 @@ class SyncInferRequest : public ov::IInferRequest {
const ov::Allocator& allocator = {},
const std::optional<std::size_t> batchSize = std::nullopt) const;

bool is_batched_input(size_t idx) const;

ov::SoPtr<ov::ITensor>& get_user_input(size_t index) const;
std::vector<ov::SoPtr<ov::ITensor>>& get_user_inputs(size_t index) const;

// This is an intel_npu::ICompiledModel pointer, but we need to use the OV base class because
// ov::IInferRequest::get_compiled_model returns a reference to a shared_ptr!
std::shared_ptr<const ov::ICompiledModel> _compiledModel;

NetworkMetadata _metadata;

mutable std::vector<std::shared_ptr<ov::ITensor>> _userInputTensors;
mutable std::vector<std::shared_ptr<ov::ITensor>> _userOutputTensors;
Logger _logger;

// In case set_tensors is called, we receive a vector with N tensors; otherwise only 1 tensor is needed
mutable std::vector<std::vector<ov::SoPtr<ov::ITensor>>> _userInputTensors;
mutable std::vector<ov::SoPtr<ov::ITensor>> _userOutputTensors;

mutable std::vector<ov::SoPtr<ov::IVariableState>> _variableStates;

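The new members above store, for every input port, a vector of user tensors instead of a single pointer: a plain set_tensor call keeps exactly one element, while set_tensors keeps one element per batch item. A simplified sketch of that invariant (a stand-in type and struct for illustration, not the actual class):

```cpp
#include <cstddef>
#include <memory>
#include <vector>

// Stand-in for ov::SoPtr<ov::ITensor>, used only to show the data layout.
using TensorPtr = std::shared_ptr<void>;

struct UserInputStorage {
    // One inner vector per input port.
    std::vector<std::vector<TensorPtr>> perPort;

    // True when set_tensors stored several batch items for this port.
    bool is_batched_input(std::size_t idx) const {
        return perPort.at(idx).size() > 1;
    }

    // Single-tensor view used by the regular set_tensor/get_tensor path.
    TensorPtr& get_user_input(std::size_t idx) {
        return perPort.at(idx).at(0);
    }
};
```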
125 changes: 111 additions & 14 deletions src/plugins/intel_npu/src/al/src/sync_infer_request.cpp
@@ -19,11 +19,12 @@ constexpr size_t BATCH_AXIS = 0;

namespace intel_npu {

SyncInferRequest::SyncInferRequest(const std::shared_ptr<const ICompiledModel>& compiledModel)
SyncInferRequest::SyncInferRequest(const std::shared_ptr<const ICompiledModel>& compiledModel, const Config& config)
: _compiledModel(compiledModel),
_metadata(compiledModel->get_network_metadata()),
_userInputTensors(_metadata.inputs.size(), nullptr),
_userOutputTensors(_metadata.outputs.size(), nullptr) {
_logger("SyncInferRequest", config.get<LOG_LEVEL>()),
_userInputTensors(_metadata.inputs.size(), std::vector<ov::SoPtr<ov::ITensor>>(1, {nullptr})),
_userOutputTensors(_metadata.outputs.size(), {nullptr}) {
OPENVINO_ASSERT(_compiledModel);

if (get_outputs().empty()) {
@@ -121,7 +122,7 @@ ov::SoPtr<ov::ITensor> SyncInferRequest::get_tensor(const ov::Output<const ov::N
OPENVINO_ASSERT(foundPort.found(), "Cannot find tensor for port ", port);

if (foundPort.is_input()) {
return _userInputTensors.at(foundPort.idx);
return get_user_input(foundPort.idx);
}
return _userOutputTensors.at(foundPort.idx);
}
@@ -138,17 +139,22 @@ void SyncInferRequest::set_tensor(const ov::Output<const ov::Node>& port, const
}

if (foundPort.is_input()) {
_userInputTensors.at(foundPort.idx) = tensor._ptr;
get_user_input(foundPort.idx) = tensor;
} else {
_userOutputTensors.at(foundPort.idx) = tensor._ptr;
_userOutputTensors.at(foundPort.idx) = tensor;
}
}

std::vector<ov::SoPtr<ov::ITensor>> SyncInferRequest::get_tensors(const ov::Output<const ov::Node>& /*port*/) const {
std::vector<ov::SoPtr<ov::ITensor>> SyncInferRequest::get_tensors(const ov::Output<const ov::Node>& port) const {
OV_ITT_SCOPED_TASK(ov::itt::domains::Plugin, "get_tensors");

// Using batches of tensors is currently not supported by the NPU plugin. In this scenario, the OpenVINO API demands
// returning an empty vector.
auto foundPort = find_port(port);
OPENVINO_ASSERT(foundPort.found(), "Cannot find input tensors for port ", port);

if (foundPort.is_input() && is_batched_input(foundPort.idx)) {
return get_user_inputs(foundPort.idx);
}

return {};
}

@@ -192,11 +198,89 @@ void SyncInferRequest::check_tensor(const ov::Output<const ov::Node>& port,
"Tensor data equal nullptr!");
}

void SyncInferRequest::check_batched_tensors(const ov::Output<const ov::Node>& port,
const std::vector<ov::SoPtr<ov::ITensor>>& tensors) const {
OPENVINO_ASSERT(!tensors.empty(), "set_input_tensors/set_tensors can't be called with empty tensors");
OPENVINO_ASSERT(
tensors.size() != 1,
"Internal error (plugin): check_batched_tensors is not allowed to have only one tensor inside batch");

auto layout = ov::layout::get_layout(port);

int64_t batch_idx;

if (layout.empty()) {
_logger.warning("set_input_tensors/set_tensors layout is not set, assuming batch dimension is found on 0 axis");
batch_idx = BATCH_AXIS;
} else {
OPENVINO_ASSERT(ov::layout::has_batch(layout),
"set_input_tensors/set_tensors can be used only for inputs with N(batch) dimension"
" 'layout' defined. Current layout is ",
layout.to_string());
batch_idx = ov::layout::batch_idx(layout);
}

if (batch_idx < 0) {
batch_idx += static_cast<int64_t>(tensors[BATCH_AXIS]->get_shape().size());
}
OPENVINO_ASSERT(batch_idx == BATCH_AXIS,
"set_input_tensors/set_tensors is not currently supported for batch dimension index ",
batch_idx,
" != 0");
std::for_each(tensors.begin(), tensors.end(), [&batch_idx](const ov::SoPtr<ov::ITensor>& item) {
OPENVINO_ASSERT(item, "Unintialized tensor is provided!");
OPENVINO_ASSERT(item->get_shape()[batch_idx] == 1,
"set_input_tensors/set_tensors. Tensors shall represent one item in a batch, ",
item->get_shape()[batch_idx],
" provided");
});
auto tensors_size = static_cast<int>(tensors.size());
if (port.get_partial_shape().rank().is_static()) {
OPENVINO_ASSERT(batch_idx >= 0 && batch_idx < port.get_partial_shape().rank().get_length(),
"set_input_tensors/set_tensors error. Layout ",
layout.to_string(),
" is incorrect for operation with shape ",
port.get_partial_shape());
auto batch = port.get_partial_shape()[batch_idx];

OPENVINO_ASSERT(batch.is_dynamic() || batch.get_length() == tensors_size,
"set_input_tensors/set_tensors error. Input shape ",
port.get_partial_shape(),
"batch ",
batch,
"doesn't match with total blobs count: ",
tensors_size);
}

auto batched_shape = tensors[BATCH_AXIS]->get_shape();
auto element_type = tensors[BATCH_AXIS]->get_element_type();
batched_shape[batch_idx] = tensors_size;
for (const auto& item : tensors) {
OPENVINO_ASSERT(item, "Unintialized tensor is provided!");
auto item_shape = item->get_shape();
item_shape[batch_idx] = batched_shape[batch_idx];
OPENVINO_ASSERT(item_shape == batched_shape && item->get_element_type() == element_type,
"set_input_tensors/set_tensors error. Tensor with element type ",
item->get_element_type(),
" and shape ",
item_shape,
" is not compatible with batched tensor with element type ",
element_type,
" and shape ",
batched_shape);
OPENVINO_ASSERT(item->is_continuous(), "Strides for batched tensors should be default.");
}
}

void SyncInferRequest::check_tensors() const {
const auto& inputs = _compiledModel->inputs();
for (size_t i = 0; i < inputs.size(); i++) {
if (_userInputTensors.at(i)) {
check_tensor(inputs[i], _userInputTensors.at(i));
if (is_batched_input(i)) {
check_batched_tensors(inputs[i], get_user_inputs(i));
continue;
}
if (get_user_input(i)) {
check_tensor(inputs[i], get_user_input(i));
}
}

@@ -229,16 +313,16 @@ std::shared_ptr<ov::ITensor> SyncInferRequest::allocate_tensor(const IODescripto
OPENVINO_ASSERT(descriptor.relatedDescriptorIndex.has_value(),
"The link between state descriptors is missing, state name: ",
descriptor.nameFromCompiler);
tensor = _userInputTensors.at(*descriptor.relatedDescriptorIndex);
tensor = get_user_input(*descriptor.relatedDescriptorIndex)._ptr;
} else if (allocator) {
tensor = ov::make_tensor(descriptor.precision, allocatedTensorShape, allocator);
} else {
tensor = ov::make_tensor(descriptor.precision, allocatedTensorShape);
}

if (isInput) {
if (_userInputTensors.at(index) == nullptr) {
_userInputTensors.at(index) = tensor;
if (get_user_input(index) == nullptr) {
get_user_input(index) = tensor;
}

if (descriptor.isStateInput) {
@@ -250,4 +334,17 @@

return tensor;
}

bool SyncInferRequest::is_batched_input(size_t idx) const {
return _userInputTensors.at(idx).size() > 1;
}

ov::SoPtr<ov::ITensor>& SyncInferRequest::get_user_input(size_t index) const {
return _userInputTensors.at(index).at(0);
}

std::vector<ov::SoPtr<ov::ITensor>>& SyncInferRequest::get_user_inputs(size_t index) const {
return _userInputTensors.at(index);
}

} // namespace intel_npu
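Note that check_batched_tensors above falls back to axis 0 as the batch dimension when the port has no layout, and otherwise requires the N dimension to sit at index 0. A hedged sketch of declaring that layout on a model before compilation, assuming a single-input model; the layout string is illustrative:

```cpp
#include <openvino/openvino.hpp>

// Mark the batch dimension explicitly so the layout-based branch of
// check_batched_tensors is taken instead of the axis-0 fallback.
void declare_batch_layout(const std::shared_ptr<ov::Model>& model) {
    // "N..." means: first dimension is the batch, remaining dimensions unnamed.
    model->get_parameters().at(0)->set_layout(ov::Layout("N..."));
}
```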
14 changes: 11 additions & 3 deletions src/plugins/intel_npu/src/backend/include/zero_infer_request.hpp
@@ -27,6 +27,8 @@ class ZeroInferRequest final : public SyncInferRequest {

ov::SoPtr<ov::ITensor> get_tensor(const ov::Output<const ov::Node>& port) const override;
void set_tensor(const ov::Output<const ov::Node>& port, const ov::SoPtr<ov::ITensor>& tensor) override;
void set_tensors(const ov::Output<const ov::Node>& port,
const std::vector<ov::SoPtr<ov::ITensor>>& tensors) override;

void infer() override;
void infer_async() override;
@@ -54,7 +56,7 @@ class ZeroInferRequest final : public SyncInferRequest {
* @returns The batch size deduced by the algorithm or the default value of 1 if batching cannot be performed inside
* the plugin.
*/
std::optional<size_t> getBatchSize(const NetworkMetadata& metadata);
std::optional<size_t> get_batch_size(const NetworkMetadata& metadata);

/**
* @brief Check the received tensor and set the Level Zero tensor accordingly
@@ -75,6 +77,12 @@
void check_network_precision(const ov::element::Type_t precision) const override;
void create_pipeline();

std::shared_ptr<ov::ITensor>& get_level_zero_input(size_t index, size_t tensorNo = 0) const;
std::vector<std::shared_ptr<ov::ITensor>>& get_level_zero_inputs(size_t index) const;

std::optional<TensorData>& get_input_tensor_data(size_t index, size_t tensorNo = 0) const;
std::vector<std::optional<TensorData>>& get_input_tensors_data(size_t index) const;

const std::shared_ptr<ZeroInitStructsHolder> _initStructs;
const std::shared_ptr<const IExecutor> _executorPtr;
const ZeroExecutor* _executor;
@@ -83,10 +91,10 @@

// A copy of each tensor is needed to maintain the original L0 memory allocation in case the user provides another
// memory area for the tensor.
mutable std::vector<std::shared_ptr<ov::ITensor>> _levelZeroInputTensors;
mutable std::vector<std::vector<std::shared_ptr<ov::ITensor>>> _levelZeroInputTensors;
mutable std::vector<std::shared_ptr<ov::ITensor>> _levelZeroOutputTensors;

mutable std::vector<std::optional<TensorData>> _inputTensorsData;
mutable std::vector<std::vector<std::optional<TensorData>>> _inputTensorsData;
mutable std::vector<std::optional<TensorData>> _outputTensorsData;

ze_device_properties_t _properties = {};
5 changes: 3 additions & 2 deletions src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp
@@ -25,7 +25,7 @@ struct Pipeline {
zeroProfiling::ProfilingPool& profiling_pool,
zeroProfiling::ProfilingQuery& profiling_query,
std::shared_ptr<zeroProfiling::NpuInferProfiling> npu_profiling,
const std::vector<std::optional<TensorData>>& inputTensorsData,
const std::vector<std::vector<std::optional<TensorData>>>& inputTensorsData,
const std::vector<std::optional<TensorData>>& outputTensorsData,
const size_t numberOfCommandLists);

@@ -37,7 +37,8 @@
void pull();
void reset() const;

void updateCommandList(const TensorData& tensorsData, const uint32_t index);
void updateCommandList(const TensorData& tensorsData, uint32_t index);
void updateCommandList(const TensorData& tensorsData, uint32_t index, size_t commandListIndex);

protected:
const Config _config;