diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp
index f29b36307..870bf52d5 100644
--- a/src/common/config_parser.cpp
+++ b/src/common/config_parser.cpp
@@ -696,6 +696,15 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
      "Use approximate knn search in output layer (currently only in transformer)")
     ->implicit_val("100 1024");
 
+  // parameters for on-line quantization
+  cli.add<bool>("--optimize",
+      "Optimize the graph on-the-fly", false);
+  cli.add<std::string>("--gemm-type,-g",
+      "GEMM type to be used for on-line quantization/packing: float32, packed16, packed8", "float32");
+  cli.add<float>("--quantize-range",
+      "Range for the on-line quantization of the weight matrix, expressed as a multiple of the standard deviation; 0.0 means min/max quantization",
+      0.f);
+
 #if 0 // @TODO: Ask Hany if there are any decoding-time options
   // add ULR settings
   addSuboptionsULR(cli);
@@ -747,6 +756,15 @@ void ConfigParser::addOptionsScoring(cli::CLIWrapper& cli) {
       "Mixed precision for inference, set parameter type in expression graph",
       {"float32"});
 
+  // parameters for on-line quantization
+  cli.add<bool>("--optimize",
+      "Optimize the graph on-the-fly", false);
+  cli.add<std::string>("--gemm-type,-g",
+      "GEMM type to be used for on-line quantization/packing: float32, packed16, packed8", "float32");
+  cli.add<float>("--quantize-range",
+      "Range for the on-line quantization of the weight matrix, expressed as a multiple of the standard deviation; 0.0 means min/max quantization",
+      0.f);
+
   cli.switchGroup(previous_group);
   // clang-format on
 }
diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp
index 6c7ef91ce..e4a4b0899 100644
--- a/src/graph/expression_operators.cpp
+++ b/src/graph/expression_operators.cpp
@@ -483,7 +483,45 @@ Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) {
   // --optimize --cpu-thread=N with N > 0 are set.
   if(device == DeviceType::cpu) {
     if(isFloat(aElementType) && isFloat(bElementType)) {
-      return Expression<DotNodeOp>(a, b, transA, transB, scale);
+      if(b->memoize() && (a->graph()->getBackend()->getGemmType() == GemmType::FbFp16Packed ||
+                          a->graph()->getBackend()->getGemmType() == GemmType::FbInt8Packed)) {
+#if USE_FBGEMM
+        if(a->graph()->getBackend()->getGemmType() == GemmType::FbFp16Packed) {
+          auto packedB = cpu::variant::pack(
+              marian::Type::packed16, b, cpu::variant::PackMatrix::B, transB);
+          return cpu::variant::dot(marian::Type::packed16,
+                                   a, packedB, b->shape(), transA, transB, scale);
+        } else {
+          float quantizeRange = b->graph()->getBackend()->getQuantizeRange();
+          if(fbgemm::fbgemmHasAvx512Support()) {
+            auto packedB = cpu::variant::pack(marian::Type::packed8avx512,
+                                              b,
+                                              cpu::variant::PackMatrix::B,
+                                              transB,
+                                              quantizeRange);
+            return cpu::variant::dot(marian::Type::packed8avx512,
+                                     a, packedB, b->shape(), transA, transB, scale);
+          } else if(fbgemm::fbgemmHasAvx2Support()) {
+            auto packedB = cpu::variant::pack(marian::Type::packed8avx2,
+                                              b,
+                                              cpu::variant::PackMatrix::B,
+                                              transB,
+                                              quantizeRange);
+            return cpu::variant::dot(marian::Type::packed8avx2,
+                                     a, packedB, b->shape(), transA, transB, scale);
+          } else {
+            ABORT(
+                "AVX2 is not available. At least AVX2 is needed to use fbgemm-based packed "
+                "GEMM");
+          }
+        }
+#else
+        ABORT("Packed GEMM is not available in this build");
+#endif // USE_FBGEMM
+      } else {
+        return Expression<DotNodeOp>(
+            a, b, transA, transB, scale);
+      }
     } else if(isFloat(aElementType) && isIntgemm(bElementType)) {
       return cpu::integer::affineOrDot(a, b, nullptr, transA, transB, scale);
     } else if(isFloat(aElementType) && isPacked(bElementType)) {
@@ -495,7 +533,8 @@ Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) {
       // and this cpu lookup is executed only once and the state is kept in FBGEMM.
       if(fbgemm::fbgemmHasAvx2Support()) {
         // This variant of dot product can handle matrix multiplications with packed8 and packed16 weight matrix (B).
-        return cpu::variant::dot(a,
+        return cpu::variant::dot(b->value_type(),
+                                 a,
                                  b,
                                  b->shape(),
                                  transA,
@@ -541,7 +580,48 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
 
   if(device == DeviceType::cpu) {
     if(isFloat(aElementType) && isFloat(bElementType)) {
-      return affineDefault(a, b, bias, transA, transB, scale);
+      if(a->graph()->getBackend()->isOptimized()) {
+        if(b->memoize() && (a->graph()->getBackend()->getGemmType() == GemmType::FbFp16Packed ||
+                            a->graph()->getBackend()->getGemmType() == GemmType::FbInt8Packed)) {
+#if USE_FBGEMM
+          if(a->graph()->getBackend()->getGemmType() == GemmType::FbFp16Packed) {
+            auto packedB = cpu::variant::pack(
+                marian::Type::packed16, b, cpu::variant::PackMatrix::B, transB);
+            return cpu::variant::affine(marian::Type::packed16,
+                                        a, packedB, b->shape(), bias, transA, transB, scale);
+          } else {
+            float quantizeRange = b->graph()->getBackend()->getQuantizeRange();
+            if(fbgemm::fbgemmHasAvx512Support()) {
+              auto packedB = cpu::variant::pack(marian::Type::packed8avx512,
+                                                b,
+                                                cpu::variant::PackMatrix::B,
+                                                transB,
+                                                quantizeRange);
+              return cpu::variant::affine(marian::Type::packed8avx512,
+                                          a, packedB, b->shape(), bias, transA, transB, scale);
+            } else if(fbgemm::fbgemmHasAvx2Support()) {
+              auto packedB = cpu::variant::pack(marian::Type::packed8avx2,
+                                                b,
+                                                cpu::variant::PackMatrix::B,
+                                                transB,
+                                                quantizeRange);
+              return cpu::variant::affine(marian::Type::packed8avx2,
+                                          a, packedB, b->shape(), bias, transA, transB, scale);
+            } else {
+              ABORT(
+                  "AVX2 is not available. At least AVX2 is needed to use fbgemm-based packed "
+                  "GEMM");
+            }
+          }
+#else
+          ABORT("Packed GEMM is not available in this build");
+#endif // USE_FBGEMM
+        } else {
+          return affineDefault(a, b, bias, transA, transB, scale);
+        }
+      } else {
+        return affineDefault(a, b, bias, transA, transB, scale);
+      }
     } else if(isFloat(aElementType) && isIntgemm(bElementType)) {
       return cpu::integer::affineOrDot(a, b, bias, transA, transB, scale);
     } else if(isFloat(aElementType) && isPacked(bElementType)) {
@@ -553,7 +633,8 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
       // and this cpu lookup is executed only once and the state is kept in FBGEMM.
       if(fbgemm::fbgemmHasAvx2Support()) {
         // This variant of affine product can handle matrix multiplications with packed8 and packed16 weight matrix (B).
-        return cpu::variant::affine(a,
+        return cpu::variant::affine(b->value_type(),
+                                    a,
                                     b,
                                     b->shape(),
                                     bias,
diff --git a/src/layers/generic.h b/src/layers/generic.h
index 8f390bd7d..9af033df5 100644
--- a/src/layers/generic.h
+++ b/src/layers/generic.h
@@ -177,6 +177,8 @@ static inline std::function<Expr(Expr)> activationByName(const std::string& actN
     return (ActivationFunction*)swish;
   else if (actName == "gelu")
     return (ActivationFunction*)gelu;
+  else if (actName == "sigmoid")
+    return (ActivationFunction*)sigmoid;
   else if (actName == "") // return identity function if activation name is empty
     return [](Expr x) { return x; };
   ABORT("Invalid activation name '{}'", actName);
diff --git a/src/tensors/backend.h b/src/tensors/backend.h
index 160b828d3..e0e93039e 100644
--- a/src/tensors/backend.h
+++ b/src/tensors/backend.h
@@ -5,6 +5,14 @@
 
 namespace marian {
 
+// GEMM type enum
+typedef enum {
+  Auto = 0,           // auto tuning between available GEMMs
+  Float32 = 1,        // MKL based GEMM, fp32
+  FbFp16Packed = 10,  // FBGEMM based fp16 GEMM with packing
+  FbInt8Packed = 11   // FBGEMM based int8 GEMM with packing
+} GemmType;
+
 class Backend {
 protected:
   DeviceId deviceId_;
@@ -21,6 +29,19 @@ class Backend {
   // for GPU only, calls cudaSetDevice, does nothing on CPU. Maybe change name.
   virtual void setDevice() = 0;
   virtual void synchronize() = 0;
+
+  // for CPU, sets whether to use optimized code for inference.
+  // for GPU, this is not supported; isOptimized() always returns false.
+  virtual void setOptimized(bool optimize) = 0;
+  virtual bool isOptimized() = 0;
+  // for CPU, selects the GEMM type used for inference.
+  // for GPU, there is no GEMM type, so this does nothing.
+  virtual void setGemmType(std::string gemmType) = 0;
+  virtual GemmType getGemmType() = 0;
+  // for CPU, sets the quantization range of weight matrices for inference.
+  // for GPU, there is no quantization, so this does nothing.
+  virtual void setQuantizeRange(float range) = 0;
+  virtual float getQuantizeRange() = 0;
 };
 
 Ptr<Backend> BackendByDeviceId(DeviceId deviceId, size_t seed);
diff --git a/src/tensors/cpu/backend.h b/src/tensors/cpu/backend.h
index 398e24240..f52ff6a33 100644
--- a/src/tensors/cpu/backend.h
+++ b/src/tensors/cpu/backend.h
@@ -10,10 +10,34 @@ namespace marian {
 namespace cpu {
 
 class Backend : public marian::Backend {
+protected:
+  bool optimized_{false};
+  GemmType gemmType_{GemmType::Float32};
+  float quantizeRange_{0.f};
+
 public:
   Backend(DeviceId deviceId, size_t seed) : marian::Backend(deviceId, seed) {}
   void setDevice() override {}
   void synchronize() override {}
+
+  // for CPU & inference only, sets whether to use optimized code. Does nothing for GPU.
+  void setOptimized(bool optimize) override { optimized_ = optimize; }
+  bool isOptimized() override { return optimized_; }
+  // for CPU only, selects the GEMM type used for inference. Does nothing for GPU.
+  void setGemmType(std::string gemmType) override {
+    if (gemmType == "auto") gemmType_ = GemmType::Auto;
+    else if (gemmType == "float32") gemmType_ = GemmType::Float32;
+#if USE_FBGEMM
+    else if (gemmType == "packed16") gemmType_ = GemmType::FbFp16Packed;
+    else if (gemmType.find("packed8") == 0) gemmType_ = GemmType::FbInt8Packed;
+#endif // USE_FBGEMM
+    else ABORT("Unknown GEMM type - '{}'", gemmType);
+  }
+  GemmType getGemmType() override { return gemmType_; }
+  // for CPU, sets the quantization range of weight matrices for inference.
+  // for GPU, there is no quantization, so this does nothing.
+  void setQuantizeRange(float range) override { quantizeRange_ = range; }
+  float getQuantizeRange() override { return quantizeRange_; }
 };
 
 }  // namespace cpu
diff --git a/src/tensors/cpu/fbgemm/expanded_gemm.h b/src/tensors/cpu/fbgemm/expanded_gemm.h
index fb07bbad5..2c376d6e2 100644
--- a/src/tensors/cpu/fbgemm/expanded_gemm.h
+++ b/src/tensors/cpu/fbgemm/expanded_gemm.h
@@ -138,15 +138,18 @@ struct FbgemmPacked8PackNodeOp : public UnaryNodeOp {
   int nrow_;
   int ncol_;
   uint64_t packsize_;
+  float quantizeRange_;
 
   FbgemmPacked8PackNodeOp(Expr a,
                           PackMatrix packMat,
                           marian::Type packType,
-                          bool transpose)
-      : UnaryNodeOp(a, newShape(a, transpose), Type::uint8),
+                          bool transpose,
+                          float quantizeRange)
+      : UnaryNodeOp(a, newShape(a, packType, transpose), Type::uint8),
         packMat_(packMat),
         packType_(packType),
-        transpose_(transpose) {
+        transpose_(transpose),
+        quantizeRange_(quantizeRange) {
     if(packMat != PackMatrix::B)
       ABORT("Only prepacking of B (weight matrix) is supported");
     if(!memoize_)
@@ -161,7 +164,8 @@
                                      transpose_,
                                      nrow_,
                                      ncol_,
-                                     packsize_))
+                                     packsize_,
+                                     quantizeRange_))
     };
 #else // USE_FBGEMM
     ABORT("FbgemmPacked8PackNodeOp can only be used with FBGEMM enabled.");
@@ -177,13 +181,19 @@
   const std::string type() override { return "packMatInt8"; }
 
 #if USE_FBGEMM
-  Shape newShape(Expr a, bool transpose) {
-    fbgemmPacked8PackInfo(a->shape(), packType_, transpose, nrow_, ncol_, packsize_);
+  Shape newShape(Expr a, marian::Type packType, bool transpose) {
+    fbgemmPacked8PackInfo(
+        a->shape(),
+        packType,
+        transpose,
+        nrow_,
+        ncol_,
+        packsize_);
     Shape outShape({(int)packsize_});
     return outShape;
   }
 #else
-  Shape newShape(Expr /*a*/, bool /*transpose*/) {
+  Shape newShape(Expr /*a*/, marian::Type /*packType*/, bool /*transpose*/) {
     ABORT("Packed GEMM requires a build with USE_FBGEMM enabled");
     return Shape();
   }
@@ -282,10 +292,17 @@ class FbgemmPacked8AffineNodeOp : public NaryNodeOp {
   size_t k_;
   bool transA_;
   bool transB_;
+  Type elementType_;
 
 public:
-  FbgemmPacked8AffineNodeOp(const std::vector<Expr>& nodes, Shape bShape, bool transA, bool transB, float /*scalar*/)
-      : NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32)/*, scalar_(scalar) */ {
+  FbgemmPacked8AffineNodeOp(Type elementType,
+                            const std::vector<Expr>& nodes,
+                            Shape bShape,
+                            bool transA,
+                            bool transB,
+                            float /*scalar*/)
+      : NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32),
+        elementType_(elementType) {
     transA_ = transA;
     transB_ = transB;
     m_ = nodes[0]->shape().elements() / nodes[0]->shape()[-1];
@@ -324,7 +341,8 @@ class FbgemmPacked8AffineNodeOp : public NaryNodeOp {
 #if USE_FBGEMM
     // Do addBias only if it has a bias term
     if (children().size() > 2) {
-      nodeOps = { NodeOp(fbgemmPacked8Gemm(val_,
+      nodeOps = { NodeOp(fbgemmPacked8Gemm(elementType_,
+                                           val_,
                                            child(0)->val(),
                                            child(1)->val(),
                                            m_,
@@ -334,7 +352,8 @@
                                            transB_);
                          marian::cpu::integer::AddBias(val_, child(2)->val())) };
     } else {
-      nodeOps = { NodeOp(fbgemmPacked8Gemm(val_,
+      nodeOps = { NodeOp(fbgemmPacked8Gemm(elementType_,
+                                           val_,
                                            child(0)->val(),
                                            child(1)->val(),
                                            m_,
@@ -358,39 +377,46 @@ class FbgemmPacked8AffineNodeOp : public NaryNodeOp {
 
   const std::string type() override { return "gemmPacked8"; }
 };
 
-static inline Expr affine(Expr a, Expr b, Shape bShape, Expr c, bool transA, bool transB, float scalar) {
+static inline Expr affine(Type elementType,
+                          Expr a,
+                          Expr b,
+                          Shape bShape,
+                          Expr c,
+                          bool transA,
+                          bool transB,
+                          float scalar) {
   std::vector<Expr> nodes = {a, b, c};
-  Type elementType = b->value_type();
 
   if (elementType == Type::packed16)
     return Expression<cpu::variant::FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
   else if (isPacked(elementType) && sizeOf(elementType) == 1)
-    return Expression<cpu::variant::FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
+    return Expression<cpu::variant::FbgemmPacked8AffineNodeOp>(
+        elementType, nodes, bShape, transA, transB, scalar);
   else {
     ABORT("Only int8 and fp16 are available. {}", elementType);
     return nullptr;
   }
 }
 
-static inline Expr pack(Type elementType, Expr a, PackMatrix packMat, bool transpose) {
+static inline Expr pack(Type elementType, Expr a, PackMatrix packMat, bool transpose, float quantizeRange = 0.f) {
   if (elementType == Type::packed16)
     return Expression<cpu::variant::FbgemmPacked16PackNodeOp>(a, packMat, transpose);
   else if (isPacked(elementType) && sizeOf(elementType) == 1)
-    return Expression<cpu::variant::FbgemmPacked8PackNodeOp>(a, packMat, elementType, transpose);
+    return Expression<cpu::variant::FbgemmPacked8PackNodeOp>(a, packMat, elementType, transpose, quantizeRange);
  else {
     ABORT("Only int8 and fp16 are available. {}", elementType);
     return nullptr;
   }
 }
 
-static inline Expr dot(Expr a, Expr b, Shape bShape, bool transA, bool transB, float scalar) {
+static inline Expr dot(Type elementType, Expr a, Expr b, Shape bShape, bool transA, bool transB, float scalar) {
   std::vector<Expr> nodes = {a, b};
-  Type elementType = b->value_type();
 
   if (elementType == Type::packed16)
     return Expression<cpu::variant::FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
   else if (isPacked(elementType) && sizeOf(elementType) == 1)
-    return Expression<cpu::variant::FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
+    return Expression<cpu::variant::FbgemmPacked8AffineNodeOp>(
+        elementType, nodes, bShape, transA, transB, scalar);
   else {
     ABORT("Only int8 and fp16 are available. {}", elementType);
     return nullptr;
diff --git a/src/tensors/cpu/fbgemm/packed_gemm.cpp b/src/tensors/cpu/fbgemm/packed_gemm.cpp
index 65dca1f70..dd81d0f7f 100644
--- a/src/tensors/cpu/fbgemm/packed_gemm.cpp
+++ b/src/tensors/cpu/fbgemm/packed_gemm.cpp
@@ -360,10 +360,10 @@ void fbgemmPacked8Pack(marian::Tensor out,
   const float* data = inData;
   float val = 0;
-
-  // Use half of the quantization range to prevent overflow of VPMADDUBSW
-  constexpr static int quantizedRange = 127;
-  constexpr static int quantizedMax = 63;
+
+  // Use half of the quantization range to prevent overflow of VPMADDUBSW
+  constexpr static int quantizedRange = 127;
+  constexpr static int quantizedMax = 63;
 
   // This routine computes the quantization range for each column - either the min/max range or the quantRangeStdDevs sigma range.
   for (size_t jj = 0; jj < n; jj++) { // for each column, collect stats (min/max or mean/std.dev.)
@@ -371,32 +371,32 @@ void fbgemmPacked8Pack(marian::Tensor out,
     double mean = 0, sqrSum = 0;
     for (size_t ii = 0; ii < k; ii++) { // in a column, go through all the rows and collect stats
       val = getVal2dArr(data, ii, jj, k, n, transpose);
-      // If quantRangeStdDevs is 0.f, the min/max values of the column are used as the quantization range
-      if(quantRangeStdDevs == 0.f) {
-        if(min > val)
-          min = val;
-        if(max < val)
-          max = val;
-      } else {
-        // Quantize by std.dev. range
-        mean += val;
-        sqrSum += val * val;
-      }
-    }
-    // If a quantization range (in multiples of std. dev.) is given with a non-zero value,
-    // it calculates the range for this column (a different quantization scale/offset is used for each column)
-    if(quantRangeStdDevs != 0.f) {
-      mean /= k;
-      sqrSum /= k;
-      sqrSum -= mean * mean;
-      sqrSum = sqrt(sqrSum);
-      min = (float)(mean - quantRangeStdDevs * sqrSum);
-      max = (float)(mean + quantRangeStdDevs * sqrSum);
-    }
-    // based on the quantization range, this computes the scale and offset for the quantization
-    quantScaleB[jj] = (max - min) / quantizedRange;
-    quantZeropointB[jj] = (int32_t)(quantizedMax - max / quantScaleB[jj]);
-  }
+      // If quantRangeStdDevs is 0.f, the min/max values of the column are used as the quantization range
+      if(quantRangeStdDevs == 0.f) {
+        if(min > val)
+          min = val;
+        if(max < val)
+          max = val;
+      } else {
+        // Quantize by std.dev. range
+        mean += val;
+        sqrSum += val * val;
+      }
+    }
+    // If a quantization range (in multiples of std. dev.) is given with a non-zero value,
+    // it calculates the range for this column (a different quantization scale/offset is used for each column)
+    if(quantRangeStdDevs != 0.f) {
+      mean /= k;
+      sqrSum /= k;
+      sqrSum -= mean * mean;
+      sqrSum = sqrt(sqrSum);
+      min = (float)(mean - quantRangeStdDevs * sqrSum);
+      max = (float)(mean + quantRangeStdDevs * sqrSum);
+    }
+    // based on the quantization range, this computes the scale and offset for the quantization
+    quantScaleB[jj] = (max - min) / quantizedRange;
+    quantZeropointB[jj] = (int32_t)(quantizedMax - max / quantScaleB[jj]);
+  }
 
   // 2. quantize
   int8_t* quantized = 0;
@@ -410,7 +410,7 @@ void fbgemmPacked8Pack(marian::Tensor out,
     TensorQuantizationParams bQuantParam;
     bQuantParam.scale = quantScaleB[jj];
     bQuantParam.zero_point = quantZeropointB[jj];
-    bQuantParam.precision = 7; // Use half of the quantization range to prevent overflow of VPMADDUBSW
+    bQuantParam.precision = 7; // Use half of the quantization range to prevent overflow of VPMADDUBSW
 
     if (transpose)
       fbgemm::Quantize<int8_t>(data + jj * k, quantized + jj * k, k, bQuantParam);
@@ -536,7 +536,8 @@ void fbgemmPacked16Gemm(marian::Tensor C,
 // k: the number of columns in A and the number of rows in B
 // transA: whether A matrix is transposed or not
 // transB: whether B matrix is transposed or not
-void fbgemmPacked8Gemm(marian::Tensor C,
+void fbgemmPacked8Gemm(Type packType,
+                       marian::Tensor C,
                        const marian::Tensor A,
                        const marian::Tensor B,
                        const size_t m,
@@ -544,9 +545,6 @@ void fbgemmPacked8Gemm(marian::Tensor C,
                        const size_t k,
                        const int transA,
                        const int transB) {
-  // pack type
-  marian::Type packType = B->type();
-
   const fbgemm::BlockingFactors* params = getBlockingFactors(packType);
 
   // Check if the packed format matches with the available AVX instruction set in the machine
diff --git a/src/tensors/cpu/fbgemm/packed_gemm.h b/src/tensors/cpu/fbgemm/packed_gemm.h
index 694860d48..e5740a434 100644
--- a/src/tensors/cpu/fbgemm/packed_gemm.h
+++ b/src/tensors/cpu/fbgemm/packed_gemm.h
@@ -135,7 +135,8 @@ void fbgemmPacked16Gemm(marian::Tensor C,
 // k: the number of columns in A and rows in B
 // transA: transpose of A matrix
 // transB: transpose of B matrix
-void fbgemmPacked8Gemm(marian::Tensor C,
+void fbgemmPacked8Gemm(Type packType,
+                       marian::Tensor C,
                        const marian::Tensor A,
                        const marian::Tensor B,
                        const size_t m,
diff --git a/src/tensors/gpu/backend.h b/src/tensors/gpu/backend.h
index 75cc604da..410b41a49 100644
--- a/src/tensors/gpu/backend.h
+++ b/src/tensors/gpu/backend.h
@@ -64,6 +64,36 @@ class Backend : public marian::Backend {
     return cusparseHandle_;
   }
 
+  // for CPU, sets whether to use optimized code for inference.
+  // for GPU, this is not supported; isOptimized() always returns false.
+  void setOptimized(bool optimize) override {
+    LOG_ONCE(info, "setOptimized() not supported for GPU_{}", optimize);
+  }
+  bool isOptimized() override {
+    LOG_ONCE(info, "isOptimized() not supported for GPU");
+    return false;
+  };
+
+  // for CPU, selects the GEMM type used for inference.
+  // for GPU, there is no GEMM type, so this does nothing.
+  void setGemmType(std::string gemmType) override {
+    LOG_ONCE(info, "setGemmType() not supported for GPU_{}", gemmType);
+  }
+  GemmType getGemmType() override {
+    LOG_ONCE(info, "getGemmType() not supported for GPU");
+    return GemmType::Float32;
+  }
+
+  // for CPU, sets the quantization range of weight matrices for inference.
+  // for GPU, there is no quantization, so this does nothing.
+  void setQuantizeRange(float range) override {
+    LOG_ONCE(info, "setQuantizeRange() not supported for GPU_{}", range);
+  }
+  float getQuantizeRange() override {
+    LOG_ONCE(info, "getQuantizeRange() not supported for GPU");
+    return 0.f;
+  }
+
   CudaCompute getCudaComputeCapability() { return compute_; }
 
 private:
diff --git a/src/translator/translator.h b/src/translator/translator.h
index fe01065b6..579f126de 100644
--- a/src/translator/translator.h
+++ b/src/translator/translator.h
@@ -89,6 +89,11 @@ class Translate : public ModelTask {
       auto prec = options_->get<std::vector<std::string>>("precision", {"float32"});
       graph->setDefaultElementType(typeFromString(prec[0]));
       graph->setDevice(device);
+      if (device.type == DeviceType::cpu) {
+        graph->getBackend()->setOptimized(options_->get<bool>("optimize"));
+        graph->getBackend()->setGemmType(options_->get<std::string>("gemm-type"));
+        graph->getBackend()->setQuantizeRange(options_->get<float>("quantize-range"));
+      }
       graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
       graphs_[id] = graph;
 
@@ -282,6 +287,11 @@ class TranslateService : public ModelServiceTask {
       auto precison = options_->get<std::vector<std::string>>("precision", {"float32"});
       graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph
       graph->setDevice(device);
+      if (device.type == DeviceType::cpu) {
+        graph->getBackend()->setOptimized(options_->get<bool>("optimize"));
+        graph->getBackend()->setGemmType(options_->get<std::string>("gemm-type"));
+        graph->getBackend()->setQuantizeRange(options_->get<float>("quantize-range"));
+      }
       graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
       graphs_.push_back(graph);
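Note on wiring (illustrative, not part of the patch): the sketch below shows how the three new options are expected to reach the CPU backend before graph construction, mirroring the translator.h hunks above. The helper name createCpuGraphForInference() and the "opts" object are hypothetical stand-ins used only for this example; the option names come from the config_parser.cpp hunks.

// Minimal sketch, assuming a marian Options object already populated from the CLI.
#include "graph/expression_graph.h"
#include "common/options.h"

using namespace marian;

Ptr<ExpressionGraph> createCpuGraphForInference(Ptr<Options> opts, size_t deviceNo) {
  auto graph = New<ExpressionGraph>(/*inference=*/true);
  graph->setDevice(DeviceId{deviceNo, DeviceType::cpu});

  // New in this patch: dot()/affine() in expression_operators.cpp query these
  // settings via isOptimized(), getGemmType() and getQuantizeRange() to decide
  // whether to pack the weight matrix B into packed16/packed8 on the fly.
  graph->getBackend()->setOptimized(opts->get<bool>("optimize"));
  graph->getBackend()->setGemmType(opts->get<std::string>("gemm-type"));
  graph->getBackend()->setQuantizeRange(opts->get<float>("quantize-range"));

  graph->reserveWorkspaceMB(opts->get<size_t>("workspace"));
  return graph;
}

At the command line this corresponds to something like "--cpu-threads 8 --optimize --gemm-type packed8 --quantize-range 0.0"; the flag names are taken from the config_parser.cpp hunks, while the exact invocation is only an example.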