[GNA] Fixed scale factors propagation for Eltwise with very different inputs ranges (openvinotoolkit#7305)

* [GNA] Fix scale factors propagation for Eltwise with very different inputs ranges

* [GNA] Added test

* [GNA] Added exception for scale factor <= 0

* [GNA] Disable tests with integer weights

* [GNA] Added assert for CNNLayer in getScaleFactor()

* [GNA] Added check if scale factor is inf

* [GNA] Fixed legacy tests
elilobanova authored and dood-apo committed Aug 24, 2023
1 parent cbaaf0a commit 55fdc12
Showing 3 changed files with 161 additions and 121 deletions.
48 changes: 48 additions & 0 deletions inference-engine/src/gna_plugin/frontend/layer_quantizer.hpp
@@ -699,5 +699,53 @@ using QuantI8_I8 = frontend::QuantPair<frontend::QuantI8_I8, frontend::QuantI8_I
using FakeQuantI16 = frontend::QuantPair<frontend::FakeQuantI16, frontend::FakeQuantI16>;
using FakeQuantI8 = frontend::QuantPair<frontend::FakeQuantI8, frontend::FakeQuantI16>;

enum class QuantizedDataType {
    input,
    output,
    weights,
    bias
};

/**
 * @brief Returns a scale factor for specific layer data
 * @param layer Layer to be quantized
 * @param data_type Type of data to be quantized
 * @return scale factor
 */
inline float getScaleFactor(InferenceEngine::CNNLayerPtr layer, QuantizedDataType data_type) {
    IE_ASSERT(layer != nullptr);
    auto quantized = InferenceEngine::getInjectedData<QuantizedLayerParams>(layer);
    float scale_factor;
    if (!quantized) {
        scale_factor = 1.0f;
    } else {
        switch (data_type) {
        case QuantizedDataType::input:
            scale_factor = quantized->_src_quant.GetScale();
            break;
        case QuantizedDataType::output:
            scale_factor = quantized->_dst_quant.GetScale();
            break;
        case QuantizedDataType::weights:
            scale_factor = quantized->_weights_quant.GetScale();
            break;
        case QuantizedDataType::bias:
            scale_factor = quantized->_bias_quant.GetScale();
            break;
        default:
            THROW_GNA_LAYER_EXCEPTION(layer) << "Unsupported data type for quantization: " << static_cast<int>(data_type);
        }
    }

    auto isZero = [](float p1) {
        return std::abs(p1) <= 0.00001f;
    };

    if (scale_factor < 0.0 || isZero(scale_factor) || std::isinf(scale_factor)) {
        THROW_GNA_LAYER_EXCEPTION(layer) << "Invalid scale factor: " << scale_factor;
    }

    return scale_factor;
}

} // namespace GNAPluginNS
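
For orientation, a minimal sketch of how the new helper might be used elsewhere in the plugin. The variable `layer` and the call site are illustrative assumptions and not part of this commit; only getScaleFactor and QuantizedDataType come from the change above.

// Hypothetical call site (illustrative only); `layer` is assumed to be an
// InferenceEngine::CNNLayerPtr available in the surrounding pass.
auto prev = InferenceEngine::CNNNetPrevLayer(layer, 0);
float input_scale = GNAPluginNS::getScaleFactor(prev, GNAPluginNS::QuantizedDataType::output);
float weight_scale = GNAPluginNS::getScaleFactor(layer, GNAPluginNS::QuantizedDataType::weights);
// Layers without injected quantization data yield 1.0f; a negative, (near-)zero,
// or infinite stored scale now throws via THROW_GNA_LAYER_EXCEPTION instead of
// being propagated further.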
154 changes: 82 additions & 72 deletions inference-engine/src/gna_plugin/frontend/scale_factor_calc.hpp
@@ -490,7 +490,8 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {

if ((!fakeQuantize && quantSibling->_dst_quant.IsScaleSet()) ||
(fakeQuantize && quantSibling->_dst_quant.IsScaleSet() && !fp32eq(quantSibling->_dst_quant.GetScale(), 1.0) &&
quantSibling->_dst_quant.GetScale() < inputQuant->_dst_quant.GetScale()) || infiniteLoopCount > 0) {
quantSibling->_dst_quant.GetScale() < inputQuant->_dst_quant.GetScale()) ||
quantSibling->_dst_quant.IsScaleSet() && infiniteLoopCount > 0) {
// means we already restarted propagation input memory layer
// need to search for requantiseable layer prior memory output layer
InferenceEngine::CNNLayerPtr restartedLayer;
@@ -657,6 +658,73 @@ class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {

template<>
class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
private:
    bool requantizeEltwiseInput(InferenceEngine::EltwiseLayer* eltwiseLayer, uint8_t inputIx, int16_t maxValue,
                                bool fakeQuantize, ScaleFactorUpdateResult &result) {
        auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*eltwiseLayer);
        auto in = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, inputIx);
        bool has8BOr16BOut = LayerInfo(in).has8BOr16BOutput();
        auto quantParams =
            InferenceEngine::getInjectedData<QuantizedLayerParams>(InferenceEngine::CNNNetPrevLayer(eltwiseLayer, inputIx));
        // trick to get opposite index (for 0 -> 1 for 1 -> 0) by inversing i.
        auto quantParamsOpposite =
            InferenceEngine::getInjectedData<QuantizedLayerParams>(InferenceEngine::CNNNetPrevLayer(eltwiseLayer, !inputIx));

        while (in && !LayerInfo(in).isInput() && !LayerInfo(in).isMemory() && !LayerInfo(in).isCopy()) {
            auto info = LayerInfo(in);
            if (info.isActivation() || info.isConst()) {
                auto quantDataForInputLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
                float newOutputScale;
                if (has8BOr16BOut) {
                    newOutputScale = quantParamsOpposite->_dst_quant.GetScale() / maxValue;
                } else {
                    newOutputScale = quantDataForInputLayer->_dst_quant.GetScale() *
                        quantParamsOpposite->_dst_quant.GetScale() * maxValue /
                        quantParams->_dst_quant.GetScale();
                }
                if (info.isActivation() && newOutputScale > static_cast<float>(std::numeric_limits<int16_t>::max()) / 2) {
                    return false;
                }
                gnawarn() << "[WARNING] saturated weights for " << eltwiseLayer->name
                    << ". Layer new output scale: " << in->name << ", output_scale=" << newOutputScale
                    << ", was " << quantDataForInputLayer->_dst_quant.GetScale() << "\n" << std::flush;
                quantDataForInputLayer->_dst_quant.SetScale(newOutputScale);
                result = ScaleFactorUpdateResult(in.get());
                return true;
            }

            if (fakeQuantize && info.isWeightableIdentity()) {
                auto quantDataForInputLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
                if (!fp32eq(quantDataForInputLayer->_weights_quant.GetScale(), 1.0f)) {
                    auto reducer = quantData->_weights_quant.GetScale() / maxValue;
                    reducer = std::max(1.0f, reducer);
                    auto newWeightsScale = quantDataForInputLayer->_weights_quant.GetScale() / reducer;
                    newWeightsScale = std::max(1.0f, newWeightsScale);
                    quantDataForInputLayer->_weights_quant.SetScale(static_cast<int32_t>(newWeightsScale));
                    quantDataForInputLayer->_dst_quant.SetScale(quantDataForInputLayer->_weights_quant.GetScale() *
                        quantDataForInputLayer->_src_quant.GetScale());

                    result = ScaleFactorUpdateResult(in.get());
                    return true;
                }
            }

            // if we are here it means that we are in the port 1
            if (info.isFullyConnected() || info.isConvolution()) {
                auto quantDataForInputLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
                auto newOutputScale = quantParamsOpposite->_dst_quant.GetScale() * maxValue;
                auto newWeightScale = newOutputScale / quantDataForInputLayer->_src_quant.GetScale();
                quantDataForInputLayer->_dst_quant.SetScale(newOutputScale);
                quantDataForInputLayer->_weights_quant.SetScale(newWeightScale);
                result = ScaleFactorUpdateResult(in.get());
                return true;
            }

            in = InferenceEngine::CNNNetHasPrevLayer(in.get()) ? InferenceEngine::CNNNetPrevLayer(in) : nullptr;
        }
        return false;
    }

public:
    bool operator()(InferenceEngine::EltwiseLayer* eltwiseLayer, int weightsSize, int inputsSize, ScaleFactorUpdateResult &result,
                    bool fakeQuantize, int infiniteLoopCount) {
@@ -734,7 +802,7 @@ class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
}
}

if (!fp32eq(bestWeightsScale, quantParams1->_weights_quant.GetScale())) {
if (bestWeightsScale > 0.0f && !fp32eq(bestWeightsScale, quantParams1->_weights_quant.GetScale())) {
quantParams1->_weights_quant.SetScale(bestWeightsScale);
quantParams1->_dst_quant.SetScale(quantParams1->_weights_quant.GetScale() * quantParams1->_src_quant.GetScale());
result = ScaleFactorUpdateResult(in1.get());
@@ -746,79 +814,22 @@
quantData->_dst_quant.SetScale(quantParams1->_dst_quant.GetScale());

// eltwise will work in int16 or int8 if low precision inputs are used
auto maxValue = lowPrecision ? (std::numeric_limits<int8_t>::max() - 1) : (std::numeric_limits<int16_t>::max() - 1);
if (quantData->_weights_quant.GetScale() > maxValue + 1) {
// rescaling it's activation input
// iterating thru previous layers of eltwise
for (uint8_t i = 0; i < 2; ++i) {
InferenceEngine::CNNLayerPtr in = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, i);
bool has8BOr16BOut = LayerInfo(in).has8BOr16BOutput();
auto quantParams =
InferenceEngine::getInjectedData<QuantizedLayerParams>(InferenceEngine::CNNNetPrevLayer(eltwiseLayer, i));
// trick to get opposite index (for 0 -> 1 for 1 -> 0) by inversing i.
auto quantParamsOpposite =
InferenceEngine::getInjectedData<QuantizedLayerParams>(InferenceEngine::CNNNetPrevLayer(eltwiseLayer, !i));

for (; InferenceEngine::CNNNetHasPrevLayer(in.get()); in = CNNNetPrevLayer(in)) {
auto info = LayerInfo(in);
if (info.isSplit() || info.isSlice() || info.isConcat() || info.isNonFunctional()) {
continue;
} else if (info.has8BOr16BOutput() && info.isActivation()) {
auto quantDataForActivation = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
float newOutputScale;
if (has8BOr16BOut) {
newOutputScale = quantParamsOpposite->_dst_quant.GetScale() / maxValue;
} else {
newOutputScale = quantDataForActivation->_dst_quant.GetScale() *
quantParamsOpposite->_dst_quant.GetScale() * maxValue /
quantParams->_dst_quant.GetScale();
}
if (newOutputScale > static_cast<float>(std::numeric_limits<int16_t>::max()) / 2) {
break;
}
gnawarn() << "[WARNING] saturated weights for " << eltwiseLayer->name
<< ". Layer new output scale: " << in->name << ", output_scale=" << newOutputScale
<< ", was " << quantDataForActivation->_dst_quant.GetScale() <<"\n" << std::flush;
quantDataForActivation->_dst_quant.SetScale(newOutputScale);
result = ScaleFactorUpdateResult(in.get());
return true;
} else if (info.has8BOr16BOutput()) {
break;
}

if (fakeQuantize && info.isWeightableIdentity()) {
auto quantDataForInputLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
if (!fp32eq(quantDataForInputLayer->_weights_quant.GetScale(), 1.0f)) {
auto reducer = quantData->_weights_quant.GetScale() / std::numeric_limits<int16_t>::max();
reducer = std::max(1.0f, reducer);
auto newWeightsScale = quantDataForInputLayer->_weights_quant.GetScale() / reducer;
newWeightsScale = std::max(1.0f, newWeightsScale);
quantDataForInputLayer->_weights_quant.SetScale(static_cast<int32_t>(newWeightsScale));
quantDataForInputLayer->_dst_quant.SetScale(quantDataForInputLayer->_weights_quant.GetScale() *
quantDataForInputLayer->_src_quant.GetScale());

result = ScaleFactorUpdateResult(in.get());
return true;
}
}
auto maxValue = lowPrecision ? std::numeric_limits<int8_t>::max() : std::numeric_limits<int16_t>::max();
if (quantData->_weights_quant.GetScale() <= maxValue) {
return true;
}

// if we are here it means that we are in the port 1
if (info.isFullyConnected() || info.isConvolution()) {
auto quantDataForInputLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
auto newOutputScale = quantParamsOpposite->_dst_quant.GetScale() * maxValue;
auto newWeightScale = newOutputScale / quantDataForInputLayer->_src_quant.GetScale();
quantDataForInputLayer->_dst_quant.SetScale(newOutputScale);
quantDataForInputLayer->_weights_quant.SetScale(newWeightScale);
result = ScaleFactorUpdateResult(in.get());
return true;
}
}
// rescaling it's activation input
// iterating thru previous layers of eltwise
for (uint8_t i = 0; i < 2; ++i) {
if (requantizeEltwiseInput(eltwiseLayer, i, maxValue - 1, fakeQuantize, result)) {
return true;
}
// we unable to rescale the input - results might be bad
gnawarn() << "[INFO] weights saturated for " << eltwiseLayer->name << "\n";
}
break;
// we unable to rescale the input - results might be bad
gnawarn() << "[INFO] weights saturated for " << eltwiseLayer->name << "\n";
}
break;
default : THROW_GNA_EXCEPTION << "Unsupported Eltwise layer for quantisation: " << eltwiseLayer->_operation;
}
return true;
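
To make the rescaling arithmetic above concrete, here is a standalone sketch with invented scale values; it assumes, as the saturation handling above implies, that the implicit eltwise weight scale is the ratio of the two input scales. It is not code from the plugin.

// Standalone illustration (not plugin code) of the requantizeEltwiseInput() math
// for the 16-bit path, using made-up scales that mimic very different input ranges.
#include <cstdint>
#include <iostream>
#include <limits>

int main() {
    const float maxValue = std::numeric_limits<int16_t>::max() - 1;  // 32766, as passed at the call site

    float act_scale = 1.0e6f;        // hypothetical output scale of the activation feeding input 0
    float input0_scale = act_scale;  // nothing sits between that activation and the eltwise
    float input1_scale = 4.0f;       // hypothetical output scale of the other eltwise input

    // Implicit eltwise weight that aligns the two inputs: the ratio of their scales.
    float weight_scale = input0_scale / input1_scale;  // 250000 -> does not fit int16
    std::cout << "before: weight scale = " << weight_scale << "\n";

    if (weight_scale > maxValue) {
        // Same formula as the non-8/16-bit-output branch above: shrink the
        // activation output scale so the ratio becomes exactly maxValue.
        float new_act_scale = act_scale * input1_scale * maxValue / input0_scale;
        input0_scale = new_act_scale;
        weight_scale = input0_scale / input1_scale;  // == 32766, now fits int16
        std::cout << "after: activation scale = " << new_act_scale
                  << ", weight scale = " << weight_scale << "\n";
    }
    return 0;
}

Once one input has been adjusted, the function records the restart point via ScaleFactorUpdateResult and returns, so scale-factor propagation can be re-run from that layer.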
@@ -1153,7 +1164,6 @@ class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
}
quant->_weights_quant.SetScale(quant->_weights_quant.GetScale() / weights_reducer);
}

double tmp_dst_quant_scale = quant->_weights_quant.GetScale() * quant->_src_quant.GetScale();
if (weightsSize == 1) {
auto itt = thresholds.begin();