Skip to content

Commit

Permalink
[CPU] [ARM] int8: GEMM support
Browse files Browse the repository at this point in the history
  • Loading branch information
eshoguli committed Jul 18, 2024
1 parent ea6c2b2 commit 4d3f499
Show file tree
Hide file tree
Showing 27 changed files with 772 additions and 173 deletions.
3 changes: 2 additions & 1 deletion src/plugins/intel_cpu/src/config.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ struct Config {
Unknown
};

bool collectPerfCounters = false;
// TODO: workaround to collect performance counters
bool collectPerfCounters = true;
bool exclusiveAsyncRequests = false;
SnippetsMode snippetsMode = SnippetsMode::Enable;
std::string dumpToDot = {};
Expand Down
3 changes: 2 additions & 1 deletion src/plugins/intel_cpu/src/cpu_memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,7 @@ void DnnlMemoryMngr::notifyUpdate() {

StaticMemory::StaticMemory(const dnnl::engine& eng, MemoryDescPtr desc, const void* data, bool pads_zeroing) :
m_eng(eng), m_pMemDesc(desc) {
OPENVINO_ASSERT(!desc->empty() || (desc->empty() && (data == nullptr)));
if (desc->getPrecision() == element::string) {
OPENVINO_THROW("[CPU] StaticMemory object cannot be created for string data.");
}
Expand All @@ -484,7 +485,7 @@ StaticMemory::StaticMemory(const dnnl::engine& eng, MemoryDescPtr desc, const vo

m_size = m_pMemDesc->getCurrentMemSize();

if (data) {
if (data || desc->empty()) {
m_pMemMngr = std::make_shared<StaticMemoryMngr>(const_cast<void*>(data), m_size);
} else {
m_pMemMngr = std::make_shared<StaticMemoryMngr>(m_size);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
// SPDX-License-Identifier: Apache-2.0
//

#include "src/core/utils/quantization/AsymmHelpers.h"

#include "acl_common_executor.hpp"
#include "acl_utils.hpp"
#include "nodes/executors/memory_arguments.hpp"
Expand Down Expand Up @@ -66,7 +68,16 @@ bool ACLCommonExecutor::update(const MemoryArgs &memory) {
ACLMemoryTypes aclDataType{};
ACLMemoryLayouts aclDataLayout{};
for (auto& cpu_mem_ptr : memory) {
// TODO: don't init empty tensor
if (cpu_mem_ptr.second->getSize() == 0) {
continue;
}
const ACLArgs index = argConvert.at(cpu_mem_ptr.first);

if (index == ACLArgs::ACL_DST) {
std::cout << std::endl;
}

initACLTensorParams(cpu_mem_ptr.second, aclTensorAttrs,
aclMemoryShapes[index],
aclDataType[index],
Expand All @@ -79,6 +90,9 @@ bool ACLCommonExecutor::update(const MemoryArgs &memory) {
// Initialize arm_compute::TensorInfo objects
ACLMemoryInfo aclMemoryInfos;
for (int i = 0; i < ACLArgs::COUNT_OF_ARGS; i++) {
if (i == ACLArgs::ACL_DST) {
std::cout << std::endl;
}
aclMemoryInfos[i] = initTensorInfo(aclMemoryShapes[i], aclDataType[i], aclDataLayout[i]);
}

Expand Down Expand Up @@ -108,6 +122,7 @@ void ACLCommonExecutor::execute(const MemoryArgs &memory) {
aclMemoryTensors[index]->allocator()->import_memory(memory.at(cpu_mem_ptr.first)->getData());
}
}

iFunction->run();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,6 @@ ACLFullyConnectedExecutor::ACLFullyConnectedExecutor(const FCAttrs &attrs, const
aclTensorAttrs.hasLayoutTypeNHWC = memory.at(ARG_SRC)->getDescPtr()->hasLayoutType(LayoutType::nspc);
fullyConnectedLayerInfo.weights_trained_layout = getAclDataLayoutByMemoryDesc(memory.at(ARG_WEI)->getDescPtr());
fullyConnectedLayerInfo.transpose_weights = !attrs.weightsNonTransposed;
if (!attrs.dequantizationScales.empty()) {
dequantizationScale = attrs.dequantizationScales[0];
}

// Add postops
if (!postOps.empty() && postOps.size() == 1) {
Expand All @@ -35,20 +32,10 @@ ACLFullyConnectedExecutor::ACLFullyConnectedExecutor(const FCAttrs &attrs, const
}

bool ACLFullyConnectedExecutor::supports(const FCConfig &config) {
// issue #<create and put number here>
const auto attrs = static_cast<FCAttrs>(config.attrs);
if (std::any_of(
attrs.dequantizationScales.begin(),
attrs.dequantizationScales.end(),
[](float value) { return value != 1.f;})) {
return false;
}

VERIFY(one_of(srcType(config), ov::element::f16, ov::element::f32, ov::element::i8), UNSUPPORTED_SRC_PRECISIONS);
VERIFY(one_of(srcType(config), ov::element::f16, ov::element::f32), UNSUPPORTED_SRC_PRECISIONS);
VERIFY(postOpsNumbers(config) < 2, UNSUPPORTED_NUMBER_OF_POSTOPS);
VERIFY(one_of(srcRank(config), 2U, 3U, 4U), UNSUPPORTED_SRC_RANK);
VERIFY(one_of(weiRank(config), 2U, 3U), UNSUPPORTED_WEI_RANK);
VERIFY(static_cast<FCAttrs>(config.attrs).dequantizationScales.size() <= 1, UNSUPPORTED_PER_CHANNEL_QUANTIZATION);
return true;
}

Expand Down Expand Up @@ -87,43 +74,16 @@ arm_compute::Status ACLFullyConnectedExecutor::validateTensorsInfo(const ACLMemo
}

ACLFunction ACLFullyConnectedExecutor::configureFunction(const ACLMemoryTensors & aclMemoryTensors) {
const auto dstTensor = aclMemoryTensors.at(ACLArgs::ACL_DST).get();
if (dequantizationScale != 1.0) {
dstTensor->info()->set_quantization_info(arm_compute::QuantizationInfo(dequantizationScale, 0));
}

auto neFC = std::make_unique<arm_compute::NEFullyConnectedLayer>();
neFC->configure(
aclMemoryTensors[ACLArgs::ACL_SRC_0].get(),
aclMemoryTensors[ACLArgs::ACL_WEI].get(),
aclMemoryTensors[ACLArgs::ACL_BIAS].get(),
dstTensor,
aclMemoryTensors[ACLArgs::ACL_DST].get(),
fullyConnectedLayerInfo,
weightsInfo);
return neFC;
}

ACLInfo ACLFullyConnectedExecutor::initTensorInfo(const arm_compute::TensorShape& tensorShape,
const arm_compute::DataType& dataType,
const arm_compute::DataLayout& dataLayout) {
arm_compute::DataType fcDataType;
switch (dataType) {
case arm_compute::DataType::S8: {
fcDataType = arm_compute::DataType::QASYMM8_SIGNED;
break;
}
case arm_compute::DataType::U8: {
fcDataType = arm_compute::DataType::QASYMM8;
break;
}
default: {
fcDataType = dataType;
break;
}
}

return ACLCommonExecutor::initTensorInfo(tensorShape, fcDataType, dataLayout);
}

} // namespace intel_cpu
} // namespace ov
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,9 @@ class ACLFullyConnectedExecutor : public ACLCommonExecutor {
impl_desc_type implType() const override {
return impl_desc_type::gemm_acl;
}

protected:
ACLInfo initTensorInfo(const arm_compute::TensorShape& tensorShape,
const arm_compute::DataType& dataType,
const arm_compute::DataLayout& dataLayout) override;

private:
arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo;
arm_compute::WeightsInfo weightsInfo;
float dequantizationScale = 1.f;
};

using ACLFullyConnectedExecutorPtr = std::shared_ptr<ACLFullyConnectedExecutor>;
Expand Down
92 changes: 92 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_gemm.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_gemm.hpp"

#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"

#include "nodes/executors/executor.hpp"
#include "nodes/executors/memory_arguments.hpp"
#include "utils/debug_capabilities.h"
#include "nodes/executors/debug_messages.hpp"
#include "nodes/executors/implementation_utils.hpp"

namespace ov {
namespace intel_cpu {

// Constructs the ACL int8 GEMM executor.
// Only `memory` is consumed here (to pick up the NHWC layout hint); the remaining
// parameters are part of the common executor-factory signature and are not used yet,
// so their names are commented out to avoid -Wunused-parameter noise.
ACLGEMMExecutor::ACLGEMMExecutor(const GEMMAttrs& /*attrs*/,
                                 const PostOps& /*postOps*/,
                                 const MemoryArgs& memory,
                                 const ExecutorContext::CPtr /*context*/) {
    aclTensorAttrs.hasLayoutTypeNHWC = memory.at(ARG_SRC)->getDescPtr()->hasLayoutType(LayoutType::nspc);
}

// Static capability check for the ACL low-precision GEMM executor.
// Returns true only for configurations the NEGEMMLowpMatrixMultiplyCore path can handle.
bool ACLGEMMExecutor::supports(const GEMMConfig &config) {
    // TODO: check weights layout
    // Bind by const reference: config.attrs is already a GEMMAttrs, copying it
    // (twice, as the original code did via static_cast) is wasteful.
    const auto& attrs = config.attrs;

    // Per-tensor dequantization with a non-trivial scale is not supported yet:
    // reject if any scale differs from 1.0.
    if (std::any_of(attrs.dequantizationScales.begin(),
                    attrs.dequantizationScales.end(),
                    [](float value) { return value != 1.f; })) {
        return false;
    }

    // NOTE(review): removed unused src1_dims/src2_dims locals — they dereferenced the
    // result of std::dynamic_pointer_cast without a null check (latent crash) and the
    // values were never read.
    VERIFY(one_of(srcType(config), ov::element::f16, ov::element::f32, ov::element::i8, ov::element::u8), UNSUPPORTED_SRC_PRECISIONS);
    VERIFY(postOpsNumbers(config) < 2, UNSUPPORTED_NUMBER_OF_POSTOPS);
    VERIFY(one_of(srcRank(config), 2U, 3U, 4U), UNSUPPORTED_SRC_RANK);
    VERIFY(one_of(weiRank(config), 2U, 3U, 4U), UNSUPPORTED_WEI_RANK);
    VERIFY(attrs.dequantizationScales.size() <= 1, UNSUPPORTED_PER_CHANNEL_QUANTIZATION);
    return true;
}

void ACLGEMMExecutor::updateTensorsShapes(ACLMemoryShapes& aclMemoryShapes) {}

// Asks ACL whether NEGEMMLowpMatrixMultiplyCore accepts the given tensor descriptions.
arm_compute::Status ACLGEMMExecutor::validateTensorsInfo(const ACLMemoryInfo & aclMemoryInfos) {
    const auto* srcInfo  = aclMemoryInfos[ACLArgs::ACL_SRC_0].get();
    const auto* weiInfo  = aclMemoryInfos[ACLArgs::ACL_WEI].get();
    const auto* biasInfo = aclMemoryInfos[ACLArgs::ACL_BIAS].get();
    const auto* dstInfo  = aclMemoryInfos[ACLArgs::ACL_DST].get();
    return arm_compute::NEGEMMLowpMatrixMultiplyCore::validate(srcInfo, weiInfo, biasInfo, dstInfo, gemmInfo);
}

// Builds and configures the ACL low-precision GEMM function over the prepared tensors.
// Note: configure() is called without an explicit GEMMInfo, so ACL's default is used here
// (gemmInfo is only consulted in validateTensorsInfo).
ACLFunction ACLGEMMExecutor::configureFunction(const ACLMemoryTensors & aclMemoryTensors) {
    auto gemmFunction = std::make_unique<arm_compute::NEGEMMLowpMatrixMultiplyCore>();
    gemmFunction->configure(aclMemoryTensors.at(ACLArgs::ACL_SRC_0).get(),
                            aclMemoryTensors.at(ACLArgs::ACL_WEI).get(),
                            // TODO: fix me — bias is deliberately not wired up yet
                            nullptr, // aclMemoryTensors.at(ACLArgs::ACL_BIAS).get(),
                            aclMemoryTensors.at(ACLArgs::ACL_DST).get());
    return gemmFunction;
}

// Creates the ACL TensorInfo, first remapping plain 8-bit integer data types onto
// their asymmetric-quantized ACL counterparts (S8 -> QASYMM8_SIGNED, U8 -> QASYMM8),
// which is what the low-precision GEMM kernels expect. All other types pass through.
ACLInfo ACLGEMMExecutor::initTensorInfo(const arm_compute::TensorShape& tensorShape,
                                        const arm_compute::DataType& dataType,
                                        const arm_compute::DataLayout& dataLayout) {
    auto mappedType = dataType;
    if (dataType == arm_compute::DataType::S8) {
        mappedType = arm_compute::DataType::QASYMM8_SIGNED;
    } else if (dataType == arm_compute::DataType::U8) {
        mappedType = arm_compute::DataType::QASYMM8;
    }
    return ACLCommonExecutor::initTensorInfo(tensorShape, mappedType, dataLayout);
}

} // namespace intel_cpu
} // namespace ov
46 changes: 46 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_gemm.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "acl_common_executor.hpp"
#include "nodes/executors/gemm_config.hpp"

namespace ov {
namespace intel_cpu {

// Executor that runs int8 matrix multiplication through Arm Compute Library's
// NEGEMMLowpMatrixMultiplyCore.
class ACLGEMMExecutor : public ACLCommonExecutor {
public:
    ACLGEMMExecutor(const GEMMAttrs& attrs,
                    const PostOps& postOps,
                    const MemoryArgs& memory,
                    const ExecutorContext::CPtr context);

    // Static capability check used by executor selection before construction.
    static bool supports(const GEMMConfig& config);

    void updateTensorsShapes(ACLMemoryShapes& aclMemoryShapes) override;

    arm_compute::Status validateTensorsInfo(const ACLMemoryInfo & aclMemoryInfos) override;

    ACLFunction configureFunction(const ACLMemoryTensors & aclMemoryTensors) override;

    // NOTE(review): same impl type as the ACL FC executor — confirm this does not
    // make the two indistinguishable in perf-counter reports.
    impl_desc_type implType() const override {
        return impl_desc_type::gemm_acl;
    }

protected:
    // Remaps plain 8-bit integer types onto ACL quantized types before TensorInfo creation.
    ACLInfo initTensorInfo(const arm_compute::TensorShape& tensorShape,
                           const arm_compute::DataType& dataType,
                           const arm_compute::DataLayout& dataLayout) override;

private:
    // NOTE(review): removed unused `fullyConnectedLayerInfo` and `weightsInfo` members —
    // copy-paste leftovers from ACLFullyConnectedExecutor, never referenced by acl_gemm.cpp.
    arm_compute::GEMMInfo gemmInfo;
};

using ACLGEMMExecutorPtr = std::shared_ptr<ACLGEMMExecutor>;

} // namespace intel_cpu
} // namespace ov
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ struct FCAttrs {
// @todo probably we don't want with bias flag, since this information is already
// a part of src memory descs
bool withBias = false;
bool weightsNonTransposed = false;
// TODO: why default is false???
bool weightsNonTransposed = true;
bool sparseWeights = false;
// @todo only memory descriptors should be a part of attributes
// actual memory should be passed into "execute" or "prepareMemory" calls
Expand Down
34 changes: 34 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/gemm_attrs.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
// Copyright (C) 2023 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <vector>

#include "cpu_memory.h"
#include "executor_config.hpp"

namespace ov {
namespace intel_cpu {

// @todo require explicit initialization of all the attributes?
// Attributes describing a GEMM operation for executor selection and construction.
struct GEMMAttrs {
    // @todo probably we don't want with bias flag, since this information is already
    // a part of src memory descs
    bool withBias = false;
    // NOTE(review): default is `true` here (mirroring the FCAttrs change in this commit),
    // while the historical default was `false` — confirm this is the intended default.
    bool weightsNonTransposed = true;
    bool sparseWeights = false;
    // @todo only memory descriptors should be a part of attributes
    // actual memory should be passed into "execute" or "prepareMemory" calls
    std::vector<float> dequantizationScales;
    // @todo should be passed as an additional memory input?
    MemoryCPtr decompressionSubtractPtr;
    MemoryCPtr decompressionMultiplyPtr;
    // Fixed: was left uninitialized, so reading it was undefined behavior.
    // 0 is presumed to mean "dynamic quantization disabled" — TODO confirm against callers.
    uint64_t dynamicQuantizationGroupSize = 0;
    ov::intel_cpu::Config::ModelType modelType = ov::intel_cpu::Config::ModelType::Unknown;
};

} // namespace intel_cpu
} // namespace ov
14 changes: 14 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/gemm_config.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "executor_config.hpp"
#include "gemm_attrs.hpp"

namespace ov {
namespace intel_cpu {
// Executor configuration specialized for GEMM — presumably bundles memory descriptors,
// GEMMAttrs and post-ops (see executor::Config in executor_config.hpp).
using GEMMConfig = ov::intel_cpu::executor::Config<GEMMAttrs>;
} // namespace intel_cpu
} // namespace ov
Loading

0 comments on commit 4d3f499

Please sign in to comment.