Skip to content

Commit

Permalink
[CPU] [ARM] FullyConnected: int8 support
Browse files Browse the repository at this point in the history
  • Loading branch information
eshoguli committed Jul 31, 2024
1 parent 1ecaa15 commit 1de5490
Show file tree
Hide file tree
Showing 96 changed files with 637 additions and 53 deletions.
5 changes: 3 additions & 2 deletions src/plugins/intel_cpu/src/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ struct Config {
Unknown
};

bool collectPerfCounters = false;
// TODO: workaround to collect performance counters
bool collectPerfCounters = true;
bool exclusiveAsyncRequests = false;
SnippetsMode snippetsMode = SnippetsMode::Enable;
std::string dumpToDot = {};
Expand Down Expand Up @@ -79,7 +80,7 @@ struct Config {
LPTransformsMode lpTransformsMode = LPTransformsMode::On;
#else
// Currently INT8 mode is not optimized on ARM / RISCV or other non-x86 platforms, fallback to FP32 mode.
LPTransformsMode lpTransformsMode = LPTransformsMode::Off;
LPTransformsMode lpTransformsMode = LPTransformsMode::On;
#endif
// default inference precision
ov::element::Type inferencePrecision = ov::element::f32;
Expand Down
3 changes: 2 additions & 1 deletion src/plugins/intel_cpu/src/cpu_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,7 @@ void DnnlMemoryMngr::notifyUpdate() {

StaticMemory::StaticMemory(const dnnl::engine& eng, MemoryDescPtr desc, const void* data, bool pads_zeroing) :
m_eng(eng), m_pMemDesc(desc) {
OPENVINO_ASSERT(!desc->empty() || (desc->empty() && (data == nullptr)));
if (desc->getPrecision() == element::string) {
OPENVINO_THROW("[CPU] StaticMemory object cannot be created for string data.");
}
Expand All @@ -484,7 +485,7 @@ StaticMemory::StaticMemory(const dnnl::engine& eng, MemoryDescPtr desc, const vo

m_size = m_pMemDesc->getCurrentMemSize();

if (data) {
if (data || desc->empty()) {
m_pMemMngr = std::make_shared<StaticMemoryMngr>(const_cast<void*>(data), m_size);
} else {
m_pMemMngr = std::make_shared<StaticMemoryMngr>(m_size);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
//

#include "acl_common_executor.hpp"

#include <ostream>

#include "acl_utils.hpp"
#include "nodes/executors/memory_arguments.hpp"
#include "utils/debug_capabilities.h"
Expand Down Expand Up @@ -38,9 +41,9 @@ static void initACLTensorParams(const MemoryPtr& memoryPtr,
}
}

static ACLInfo initTensorInfo(const arm_compute::TensorShape& tensorShape,
const arm_compute::DataType& dataType,
const arm_compute::DataLayout& dataLayout) {
ACLInfo ACLCommonExecutor::initTensorInfo(const arm_compute::TensorShape& tensorShape,
const arm_compute::DataType& dataType,
const arm_compute::DataLayout& dataLayout) {
ACLInfo aclMemoryInfo = nullptr;
if (dataType != arm_compute::DataType::UNKNOWN) {
aclMemoryInfo = std::make_shared<arm_compute::TensorInfo>(
Expand Down Expand Up @@ -72,6 +75,9 @@ bool ACLCommonExecutor::update(const MemoryArgs &memory) {
ACLMemoryTypes aclDataType{};
ACLMemoryLayouts aclDataLayout{};
for (auto& cpu_mem_ptr : memory) {
if (cpu_mem_ptr.second->getSize() == 0) {
continue;
}
const ACLArgs index = argConvert.at(cpu_mem_ptr.first);
initACLTensorParams(cpu_mem_ptr.second, aclTensorAttrs,
aclMemoryShapes[index],
Expand Down Expand Up @@ -108,18 +114,79 @@ bool ACLCommonExecutor::update(const MemoryArgs &memory) {
configureThreadSafe([&] {
iFunction = configureFunction(aclMemoryTensors);
});

// for (auto& cpu_mem_ptr : memory) {
// const ACLArgs index = argConvert.at(cpu_mem_ptr.first);
// if (aclTensorAttrs.memoryUsageIndicator[index]) {
// aclMemoryTensors[index]->allocator()->import_memory(memory.at(cpu_mem_ptr.first)->getData());
// }
// }
return true;
}

//namespace {
//std::ostream& operator<<(std::ostream& os, const arm_compute::ITensorInfo* tensor_info) {
// const auto data_type = tensor_info->data_type();
// switch (data_type) {
// case arm_compute::DataType::S8: {
// return os << "S8";
// }
// case arm_compute::DataType::QSYMM8: {
// return os << "QSYMM8";
// }
// case arm_compute::DataType::QASYMM8: {
// return os << "QASYMM8";
// }
// case arm_compute::DataType::QASYMM8_SIGNED: {
// return os << "QASYMM8_SIGNED";
// }
// case arm_compute::DataType::S32: {
// return os << "S32";
// }
// case arm_compute::DataType::F32: {
// return os << "F32";
// }
// default: {
// return os << "[UNKNOWN]";
// }
// }
//}
//} // namespace

void ACLCommonExecutor::execute(const MemoryArgs &memory) {
    // TODO: Move import_memory() to update() function - CVS-145871
    // Bind the caller-provided CPU buffers to the ACL tensors right before
    // running the configured function. Slots whose memory is owned elsewhere
    // (memoryUsageIndicator[index] == false) are skipped, as are slots for
    // which no tensor was created during update() (null entry — e.g. empty
    // memory arguments).
    for (auto& cpu_mem_ptr : memory) {
        const ACLArgs index = argConvert.at(cpu_mem_ptr.first);
        if (aclTensorAttrs.memoryUsageIndicator[index] && aclMemoryTensors[index]) {
            aclMemoryTensors[index]->allocator()->import_memory(memory.at(cpu_mem_ptr.first)->getData());
        }
    }

    // All inputs/outputs are bound — run the prepared ACL function once.
    iFunction->run();
}

ACLCommonExecutor::~ACLCommonExecutor() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,11 @@ class ACLCommonExecutor : public Executor {

protected:
ACLTensorAttrs aclTensorAttrs;

virtual ACLInfo initTensorInfo(const arm_compute::TensorShape& tensorShape,
const arm_compute::DataType& dataType,
const arm_compute::DataLayout& dataLayout);

private:
ACLMemoryTensors aclMemoryTensors;
ACLFunction iFunction = nullptr;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_lowp_fullyconnected.hpp"

#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"

#include "nodes/executors/executor.hpp"
#include "nodes/executors/memory_arguments.hpp"
#include "utils/debug_capabilities.h"
#include "nodes/executors/debug_messages.hpp"
#include "nodes/executors/implementation_utils.hpp"
#include "acl_weights.hpp"
#include "acl_utils.hpp"

namespace ov {
namespace intel_cpu {

// Translates generic FullyConnected attributes and post-ops into the
// ACL-specific tensor attributes, FC attributes and GEMM configuration used
// by the low-precision executor.
//
// @param attrs          generic FC attributes (weights transposition, scales)
// @param aclTensorAttrs [out] ACL tensor attributes (layout)
// @param aclfcAttrs     [out] ACL FC attributes (precision, weights handling)
// @param memory         memory arguments; ARG_SRC / ARG_WEI descriptors are read
// @param gemmInfo       [out] ACL GEMM configuration
// @param postOps        post-operations; a single activation may be fused
static void initFCAttrs(const FCAttrs& attrs,
                        ACLTensorAttrs& aclTensorAttrs,
                        ACLFCAttrs& aclfcAttrs,
                        const MemoryArgs& memory,
                        arm_compute::GEMMInfo& gemmInfo,
                        const PostOps& postOps) {
    aclTensorAttrs.hasLayoutTypeNHWC = memory.at(ARG_SRC)->getDescPtr()->hasLayoutType(LayoutType::nspc);
    aclfcAttrs.inputPrecision = memory.at(ARG_SRC)->getDescPtr()->getPrecision();
    // Weights come pre-arranged by prepareWeightMemory(), so ACL is not asked
    // to pre-transpose the B matrix. NOTE(review): confirm this holds for all
    // weight layouts accepted by supports().
    gemmInfo.set_pretranspose_B(false);
    aclfcAttrs.weightsNonTransposed = attrs.weightsNonTransposed;

    // Fuse a single activation post-op into the GEMM, if present.
    // (postOps.size() == 1 already implies the list is non-empty.)
    if (postOps.size() == 1) {
        if (const auto activation = std::dynamic_pointer_cast<ActivationPostOp>(postOps[0])) {
            gemmInfo.set_activation_info(getActivationLayerInfo(convertToEltwiseAlgorithm(activation->type()),
                                                                activation->alpha(),
                                                                activation->beta(),
                                                                activation->gamma()));
        }
    }

    // A src/weights precision mismatch means the weights must be converted.
    if (memory.at(ARG_SRC)->getPrecision() != memory.at(ARG_WEI)->getPrecision()) {
        aclfcAttrs.isConvertedWeights = true;
    }
}

// Builds the low-precision FC executor: derives the ACL attributes/GEMM info
// from the FC config, then pre-packs the weights. packedWeights keeps the
// repacked blob alive so configureFunction() can import it later.
ACLLowpFullyConnectedExecutor::ACLLowpFullyConnectedExecutor(const FCAttrs &attrs,
                                                             const PostOps &postOps,
                                                             const MemoryArgs &memory,
                                                             const ExecutorContext::CPtr context) {
    // Must run first: prepareWeightMemory() reads aclTensorAttrs/aclfcAttrs
    // filled in by initFCAttrs().
    initFCAttrs(attrs, aclTensorAttrs, aclfcAttrs, memory, gemmInfo, postOps);
    packedWeights = acl_fc_executor::prepareWeightMemory(memory, context, attrs, aclTensorAttrs, aclfcAttrs, postOps);
}

// Capability check for the low-precision (int8/uint8) FC executor.
// Returns false when any dequantization scale differs from 1.0, when the
// source precision is not i8/u8, when post-ops are present, when ranks fall
// outside [2..4], or when per-channel quantization is requested.
bool ACLLowpFullyConnectedExecutor::supports(const FCConfig &config) {
    // TODO: check weights layout
    const auto attrs = static_cast<FCAttrs>(config.attrs);
    // Non-unit dequantization scales are not handled by this executor.
    if (std::any_of(attrs.dequantizationScales.begin(),
                    attrs.dequantizationScales.end(),
                    [](float value) { return value != 1.f; })) {
        return false;
    }

    const auto precision = srcType(config);
    VERIFY(one_of(precision, ov::element::i8, ov::element::u8), UNSUPPORTED_SRC_PRECISIONS);
    // NOTE(review): rejecting all post-ops here conflicts with the activation
    // fusing path in initFCAttrs() — confirm which behavior is intended.
    VERIFY(postOpsNumbers(config) == 0, UNSUPPORTED_NUMBER_OF_POSTOPS);
    VERIFY(one_of(srcRank(config), 2U, 3U, 4U), UNSUPPORTED_SRC_RANK);
    VERIFY(one_of(weiRank(config), 2U, 3U, 4U), UNSUPPORTED_WEI_RANK);
    VERIFY(attrs.dequantizationScales.size() <= 1, UNSUPPORTED_PER_CHANNEL_QUANTIZATION);
    return true;
}

// Normalizes the ACL tensor shapes for the FC case by delegating to the
// helper shared with the regular (non-quantized) FC executor.
void ACLLowpFullyConnectedExecutor::updateTensorsShapes(ACLMemoryShapes& aclMemoryShapes) {
    acl_fc_executor::updateFCTensorsShapes(aclMemoryShapes);
}

// Asks ACL whether a low-precision GEMM can be built for the given tensor
// descriptions. Bias is deliberately not forwarded — this executor does not
// pass a bias tensor to NEGEMMLowpMatrixMultiplyCore (see configureFunction).
arm_compute::Status ACLLowpFullyConnectedExecutor::validateTensorsInfo(const ACLMemoryInfo & aclMemoryInfos) {
    return arm_compute::NEGEMMLowpMatrixMultiplyCore::validate(aclMemoryInfos[ACLArgs::ACL_SRC_0].get(),
                                                               aclMemoryInfos[ACLArgs::ACL_WEI].get(),
                                                               nullptr,  // bias not supported here
                                                               aclMemoryInfos[ACLArgs::ACL_DST].get(),
                                                               gemmInfo);
}

// Creates and configures the ACL low-precision GEMM for the prepared tensors.
// Bias is not passed, mirroring validateTensorsInfo().
ACLFunction ACLLowpFullyConnectedExecutor::configureFunction(const ACLMemoryTensors & aclMemoryTensors) {
    auto lowpGemm = std::make_unique<arm_compute::NEGEMMLowpMatrixMultiplyCore>();
    lowpGemm->configure(aclMemoryTensors[ACLArgs::ACL_SRC_0].get(),
                        aclMemoryTensors[ACLArgs::ACL_WEI].get(),
                        nullptr,  // bias not supported here
                        aclMemoryTensors.at(ACLArgs::ACL_DST).get(),
                        gemmInfo);

    // When the weights were converted or repacked, bind the pre-packed blob
    // directly and clear the WEI usage indicator so execute() does not
    // re-import the caller's (unpacked) weight memory over it.
    const bool bindPackedWeights = aclfcAttrs.isConvertedWeights || !aclfcAttrs.weightsNonTransposed;
    if (bindPackedWeights) {
        aclTensorAttrs.memoryUsageIndicator[ACLArgs::ACL_WEI] = false;
        aclMemoryTensors[ACLArgs::ACL_WEI]->allocator()->import_memory(packedWeights->getData());
    }
    return lowpGemm;
}

// TODO: move to ACLLowpExecutor
// TODO: move to ACLLowpExecutor
// Remaps plain 8-bit integer types to their ACL quantized counterparts
// (S8 -> QASYMM8_SIGNED, U8 -> QASYMM8) before building the TensorInfo —
// presumably because the lowp GEMM kernels expect quantized data types;
// confirm against the ACL NEGEMMLowpMatrixMultiplyCore documentation.
// All other data types are passed through unchanged.
ACLInfo ACLLowpFullyConnectedExecutor::initTensorInfo(const arm_compute::TensorShape& tensorShape,
                                                      const arm_compute::DataType& dataType,
                                                      const arm_compute::DataLayout& dataLayout) {
    const auto quantizedType = [&dataType]() {
        if (dataType == arm_compute::DataType::S8) {
            return arm_compute::DataType::QASYMM8_SIGNED;
        }
        if (dataType == arm_compute::DataType::U8) {
            return arm_compute::DataType::QASYMM8;
        }
        return dataType;
    }();

    return ACLCommonExecutor::initTensorInfo(tensorShape, quantizedType, dataLayout);
}

} // namespace intel_cpu
} // namespace ov
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "acl_common_executor.hpp"
#include "nodes/executors/fullyconnected_config.hpp"
#include "acl_weights.hpp"

namespace ov {
namespace intel_cpu {

// Low-precision (int8/uint8) FullyConnected executor backed by
// arm_compute::NEGEMMLowpMatrixMultiplyCore.
class ACLLowpFullyConnectedExecutor : public ACLCommonExecutor {
public:
    // Derives ACL attributes from the FC config and pre-packs the weights.
    ACLLowpFullyConnectedExecutor(const FCAttrs& attrs,
                                  const PostOps& postOps,
                                  const MemoryArgs& memory,
                                  const ExecutorContext::CPtr context);

    // Static capability check used during executor selection.
    static bool supports(const FCConfig& config);

    void updateTensorsShapes(ACLMemoryShapes& aclMemoryShapes) override;

    arm_compute::Status validateTensorsInfo(const ACLMemoryInfo & aclMemoryInfos) override;

    ACLFunction configureFunction(const ACLMemoryTensors & aclMemoryTensors) override;

    impl_desc_type implType() const override {
        return impl_desc_type::gemm_acl;
    }

protected:
    // Remaps U8/S8 to ACL quantized data types before creating TensorInfo.
    ACLInfo initTensorInfo(const arm_compute::TensorShape& tensorShape,
                           const arm_compute::DataType& dataType,
                           const arm_compute::DataLayout& dataLayout) override;

private:
    arm_compute::GEMMInfo gemmInfo;
    // NOTE(review): weightsInfo appears unused in the implementation file —
    // confirm before removing.
    arm_compute::WeightsInfo weightsInfo;

    // Keeps the repacked weights blob alive for the executor's lifetime.
    MemoryCPtr packedWeights;
    ACLFCAttrs aclfcAttrs;
};

using ACLLowpFullyConnectedExecutorPtr = std::shared_ptr<ACLLowpFullyConnectedExecutor>;

} // namespace intel_cpu
} // namespace ov
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ inline int axisCast(const std::size_t axis, const std::size_t shapeSize, ACLAxis
* @param precision precision to be converted
* @return ComputeLibrary DataType or UNKNOWN if precision is not mapped to DataType
*/
inline arm_compute::DataType precisionToAclDataType(ov::element::Type precision) {
inline arm_compute::DataType precisionToAclDataType(const ov::element::Type& precision) {
switch (precision) {
case ov::element::i8: return arm_compute::DataType::S8;
case ov::element::u8: return arm_compute::DataType::U8;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#define UNSUPPORTED_DST_RANK " unsupported dst rank"
#define UNSUPPORTED_DST_STRIDES " unsupported dst strides"
#define HEURISTICS_MISMATCH " heuristics mismatch"
#define UNSUPPORTED_PER_CHANNEL_QUANTIZATION " unsupported per-channel quantization"

#define VERIFY(condition, ...) \
do { \
Expand Down
Loading

0 comments on commit 1de5490

Please sign in to comment.