Online quantization (#847)
* Enable on-line packing/quantization
* Add half-precision min/max quantization for model weights
* Change the default quantization of the B matrix to min/max; revert an erroneous commit for AggregateAll
* Fix missing half-precision quantization
* Fix the quantization range for A
* Set all default values for the quantize range to 0.f
* Use a 7-bit clip for the weight matrix quantization to avoid overflowing VPMADDUBSW (sketched after the file summary below)
ykim362 authored May 25, 2021
1 parent 3133a9b commit 9fa166b
Showing 10 changed files with 270 additions and 59 deletions.
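
The commit message's quantization scheme in a nutshell: weights are quantized to 8-bit integers using either the min/max of the weight matrix or a multiple of its standard deviation, and the quantized values are clipped to 7 bits. The sketch below is not code from this commit and all names in it are hypothetical; it only illustrates the arithmetic. The 7-bit clip exists because VPMADDUBSW multiplies unsigned 8-bit activations by signed 8-bit weights and adds adjacent pairs into a 16-bit accumulator, so weights restricted to +/-63 keep the worst-case pairwise sum (2 * 255 * 63 = 32130) inside the int16 range.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Hypothetical helper: min/max int8 quantization of a weight matrix with a
// 7-bit clip, so the int16 accumulator of VPMADDUBSW cannot overflow.
static std::vector<int8_t> quantizeMinMax7bit(const std::vector<float>& w, float& scaleOut) {
  float absMax = 0.f;
  for(float v : w)
    absMax = std::max(absMax, std::abs(v));
  const int kClip = 63;  // 7-bit signed range
  scaleOut = absMax > 0.f ? absMax / kClip : 1.f;
  std::vector<int8_t> q(w.size());
  for(size_t i = 0; i < w.size(); ++i) {
    int v = (int)std::round(w[i] / scaleOut);
    q[i] = (int8_t)std::max(-kClip, std::min(kClip, v));
  }
  return q;
}

When --quantize-range is non-zero, absMax above would instead be derived from the standard deviation of the weights; see the sketch after the CLI options below.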
18 changes: 18 additions & 0 deletions src/common/config_parser.cpp
@@ -696,6 +696,15 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
"Use approximate knn search in output layer (currently only in transformer)")
->implicit_val("100 1024");

// parameters for on-line quantization
cli.add<bool>("--optimize",
"Optimize the graph on-the-fly", false);
cli.add<std::string>("--gemm-type,-g",
"GEMM Type to be used for on-line quantization/packing: float32, packed16, packed8", "float32");
cli.add<float>("--quantize-range",
"Range for the on-line quantiziation of weight matrix in multiple of this range and standard deviation, 0.0 means min/max quantization",
0.f);

#if 0 // @TODO: Ask Hany if there are any decoding-time options
// add ULR settings
addSuboptionsULR(cli);
@@ -747,6 +756,15 @@ void ConfigParser::addOptionsScoring(cli::CLIWrapper& cli) {
"Mixed precision for inference, set parameter type in expression graph",
{"float32"});

// parameters for on-line quantization
cli.add<bool>("--optimize",
"Optimize the graph on-the-fly", false);
cli.add<std::string>("--gemm-type,-g",
"GEMM Type to be used for on-line quantization/packing: float32, packed16, packed8", "float32");
cli.add<float>("--quantize-range",
"Range for the on-line quantiziation of weight matrix in multiple of this range and standard deviation, 0.0 means min/max quantization",
0.f);

cli.switchGroup(previous_group);
// clang-format on
}
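
To make the --quantize-range help text above concrete: 0.0 selects min/max quantization, while a positive value r clips the weights at r standard deviations of the weight distribution. The helper below is a hypothetical illustration of that choice, not code from this commit.

#include <cmath>
#include <vector>

// Hypothetical helper illustrating the --quantize-range semantics:
// 0.0   -> use the absolute maximum of the weights (min/max quantization),
// r > 0 -> clip the weights at r standard deviations.
static float pickQuantizeRange(const std::vector<float>& w, float quantizeRange) {
  if(quantizeRange == 0.f) {
    float absMax = 0.f;
    for(float v : w)
      absMax = std::fmax(absMax, std::fabs(v));
    return absMax;
  }
  double sum = 0.0, sumSq = 0.0;
  for(float v : w) {
    sum += v;
    sumSq += (double)v * v;
  }
  double mean = sum / w.size();
  double var  = sumSq / w.size() - mean * mean;
  return quantizeRange * (float)std::sqrt(std::fmax(var, 0.0));
}

On the command line these options are meant to be combined, e.g. --optimize --gemm-type packed8 --quantize-range 3.0; a useful range value is model-dependent.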
89 changes: 85 additions & 4 deletions src/graph/expression_operators.cpp
@@ -483,7 +483,45 @@ Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) {
// --optimize --cpu-thread=N with N > 0 are set.
if(device == DeviceType::cpu) {
if(isFloat(aElementType) && isFloat(bElementType)) {
return Expression<DotNodeOp>(a, b, transA, transB, scale);
if(b->memoize() && (a->graph()->getBackend()->getGemmType() == GemmType::FbFp16Packed ||
a->graph()->getBackend()->getGemmType() == GemmType::FbInt8Packed)) {
#if USE_FBGEMM
if(a->graph()->getBackend()->getGemmType() == GemmType::FbFp16Packed) {
auto packedB = cpu::variant::pack(
marian::Type::packed16, b, cpu::variant::PackMatrix::B, transB);
return cpu::variant::dot(marian::Type::packed16,
a, packedB, b->shape(), transA, transB, scale);
} else {
float quantizeRange = b->graph()->getBackend()->getQuantizeRange();
if(fbgemm::fbgemmHasAvx512Support()) {
auto packedB = cpu::variant::pack(marian::Type::packed8avx512,
b,
cpu::variant::PackMatrix::B,
transB,
quantizeRange);
return cpu::variant::dot(marian::Type::packed8avx512,
a, packedB, b->shape(), transA, transB, scale);
} else if(fbgemm::fbgemmHasAvx2Support()) {
auto packedB = cpu::variant::pack(marian::Type::packed8avx2,
b,
cpu::variant::PackMatrix::B,
transB,
quantizeRange);
return cpu::variant::dot(marian::Type::packed8avx2,
a, packedB, b->shape(), transA, transB, scale);
} else {
ABORT(
"AVX2 is not available. At least, AVX2 is needed to use fbgemm-based packed "
"GEMM");
}
}
#else
ABORT("Packed GEMM is not available in this build");
#endif // USE_FBGEMM
} else {
return Expression<DotNodeOp>(
a, b, transA, transB, scale);
}
} else if(isFloat(aElementType) && isIntgemm(bElementType)) {
return cpu::integer::affineOrDot(a, b, nullptr, transA, transB, scale);
} else if(isFloat(aElementType) && isPacked(bElementType)) {
@@ -495,7 +533,8 @@ Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) {
// and this cpu lookup is executed only once and the state is kept in FBGEMM.
if(fbgemm::fbgemmHasAvx2Support()) {
// This variant of dot product can handle matrix multiplications with packed8 and packed16 weight matrix (B).
return cpu::variant::dot(a,
return cpu::variant::dot(b->value_type(),
a,
b,
b->shape(),
transA,
@@ -541,7 +580,48 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {

if(device == DeviceType::cpu) {
if(isFloat(aElementType) && isFloat(bElementType)) {
return affineDefault(a, b, bias, transA, transB, scale);
if(a->graph()->getBackend()->isOptimized()) {
if(b->memoize() && (a->graph()->getBackend()->getGemmType() == GemmType::FbFp16Packed ||
a->graph()->getBackend()->getGemmType() == GemmType::FbInt8Packed)) {
#if USE_FBGEMM
if(a->graph()->getBackend()->getGemmType() == GemmType::FbFp16Packed) {
auto packedB = cpu::variant::pack(
marian::Type::packed16, b, cpu::variant::PackMatrix::B, transB);
return cpu::variant::affine(marian::Type::packed16,
a, packedB, b->shape(), bias, transA, transB, scale);
} else {
float quantizeRange = b->graph()->getBackend()->getQuantizeRange();
if(fbgemm::fbgemmHasAvx512Support()) {
auto packedB = cpu::variant::pack(marian::Type::packed8avx512,
b,
cpu::variant::PackMatrix::B,
transB,
quantizeRange);
return cpu::variant::affine(marian::Type::packed8avx512,
a, packedB, b->shape(), bias, transA, transB, scale);
} else if(fbgemm::fbgemmHasAvx2Support()) {
auto packedB = cpu::variant::pack(marian::Type::packed8avx2,
b,
cpu::variant::PackMatrix::B,
transB,
quantizeRange);
return cpu::variant::affine(marian::Type::packed8avx2,
a, packedB, b->shape(), bias, transA, transB, scale);
} else {
ABORT(
"AVX2 is not available. At least, AVX2 is needed to use fbgemm-based packed "
"GEMM");
}
}
#else
ABORT("Packed GEMM is not available in this build");
#endif // USE_FBGEMM
} else {
return affineDefault(a, b, bias, transA, transB, scale);
}
} else {
return affineDefault(a, b, bias, transA, transB, scale);
}
} else if(isFloat(aElementType) && isIntgemm(bElementType)) {
return cpu::integer::affineOrDot(a, b, bias, transA, transB, scale);
} else if(isFloat(aElementType) && isPacked(bElementType)) {
@@ -553,7 +633,8 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
// and this cpu lookup is executed only once and the state is kept in FBGEMM.
if(fbgemm::fbgemmHasAvx2Support()) {
// This variant of affine product can handle matrix multiplications with packed8 and packed16 weight matrix (B).
return cpu::variant::affine(a,
return cpu::variant::affine(b->value_type(),
a,
b,
b->shape(),
bias,
2 changes: 2 additions & 0 deletions src/layers/generic.h
@@ -177,6 +177,8 @@ static inline std::function<Expr(Expr)> activationByName(const std::string& actName)
return (ActivationFunction*)swish;
else if (actName == "gelu")
return (ActivationFunction*)gelu;
else if (actName == "sigmoid")
return (ActivationFunction*)sigmoid;
else if (actName == "") // return identity function if activation name is empty
return [](Expr x) { return x; };
ABORT("Invalid activation name '{}'", actName);
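
A tiny usage sketch for the lookup above (x stands for any existing Expr; this is not code from the commit):

// The returned std::function<Expr(Expr)> is applied directly to an expression;
// the new "sigmoid" entry behaves like the existing "swish" and "gelu" ones.
auto act = activationByName("sigmoid");
Expr y = act(x);  // same as sigmoid(x); an empty name returns the identity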
21 changes: 21 additions & 0 deletions src/tensors/backend.h
@@ -5,6 +5,14 @@

namespace marian {

// GEMM type enum
typedef enum {
Auto = 0, // auto tuning between available GEMMs
Float32 = 1, // MKL based GEMM, fp32
FbFp16Packed = 10, // FBGEMM based fp16 GEMM with packing
FbInt8Packed = 11 // FBGEMM based int8 GEMM with packing
} GemmType;

class Backend {
protected:
DeviceId deviceId_;
@@ -21,6 +29,19 @@ class Backend {
// for GPU only, calls cudaSetDevice, does nothing on CPU. Maybe change name.
virtual void setDevice() = 0;
virtual void synchronize() = 0;

// for CPU, enables the optimized inference code path.
// for GPU, this is not applicable; isOptimized() always returns false.
virtual void setOptimized(bool optimize) = 0;
virtual bool isOptimized() = 0;
// for CPU, selects the GEMM type used for inference.
// for GPU, there is no GEMM type, so this does nothing.
virtual void setGemmType(std::string gemmType) = 0;
virtual GemmType getGemmType() = 0;
// for CPU, sets the quantization range of weight matrices for inference.
// for GPU, there is no quantization, so this does nothing.
virtual void setQuantizeRange(float range) = 0;
virtual float getQuantizeRange() = 0;
};

Ptr<Backend> BackendByDeviceId(DeviceId deviceId, size_t seed);
24 changes: 24 additions & 0 deletions src/tensors/cpu/backend.h
@@ -10,10 +10,34 @@ namespace marian {
namespace cpu {

class Backend : public marian::Backend {
protected:
bool optimized_{false};
GemmType gemmType_{GemmType::Float32};
float quantizeRange_{0.f};

public:
Backend(DeviceId deviceId, size_t seed) : marian::Backend(deviceId, seed) {}
void setDevice() override {}
void synchronize() override {}

// for CPU & inference only, sets to use optimized code for inference. Does nothing for GPU.
void setOptimized(bool optimize) override { optimized_ = optimize; }
bool isOptimized() override { return optimized_; }
// for CPU only, selects the GEMM type used for inference. Does nothing for GPU.
void setGemmType(std::string gemmType) override {
if (gemmType == "auto") gemmType_ = GemmType::Auto;
else if (gemmType == "float32") gemmType_ = GemmType::Float32;
#if USE_FBGEMM
else if (gemmType == "packed16") gemmType_ = GemmType::FbFp16Packed;
else if (gemmType.find("packed8") == 0) gemmType_ = GemmType::FbInt8Packed;
#endif // USE_FBGEMM
else ABORT("Unknown GEMM type - '{}'", gemmType);
}
GemmType getGemmType() override { return gemmType_; }
// for CPU, sets the quantization range of weight matrices for inference.
// for GPU, there is no quantization, so this does nothing.
void setQuantizeRange(float range) override { quantizeRange_ = range; }
float getQuantizeRange() override { return quantizeRange_; }
};

} // namespace cpu
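
The setters implemented above presumably receive their values from the CLI options added in config_parser.cpp; the function below is a hypothetical piece of glue showing that wiring (configureBackendForInference is not a real Marian function).

// Hypothetical glue: forward the new inference options to the CPU backend.
// The base-class comments above describe these as CPU-only, hence the guard.
void configureBackendForInference(Ptr<ExpressionGraph> graph, Ptr<Options> options) {
  if(graph->getBackend()->getDeviceId().type == DeviceType::cpu) {
    graph->getBackend()->setOptimized(options->get<bool>("optimize"));
    graph->getBackend()->setGemmType(options->get<std::string>("gemm-type"));
    graph->getBackend()->setQuantizeRange(options->get<float>("quantize-range"));
  }
}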
64 changes: 45 additions & 19 deletions src/tensors/cpu/fbgemm/expanded_gemm.h
@@ -138,15 +138,18 @@ struct FbgemmPacked8PackNodeOp : public UnaryNodeOp {
int nrow_;
int ncol_;
uint64_t packsize_;
float quantizeRange_;

FbgemmPacked8PackNodeOp(Expr a,
PackMatrix packMat,
marian::Type packType,
bool transpose)
: UnaryNodeOp(a, newShape(a, transpose), Type::uint8),
bool transpose,
float quantizeRange)
: UnaryNodeOp(a, newShape(a, packType, transpose), Type::uint8),
packMat_(packMat),
packType_(packType),
transpose_(transpose) {
transpose_(transpose),
quantizeRange_(quantizeRange) {
if(packMat != PackMatrix::B)
ABORT("Only prepacking of B (weight matrix) is supported");
if(!memoize_)
@@ -161,7 +164,8 @@ struct FbgemmPacked8PackNodeOp : public UnaryNodeOp {
transpose_,
nrow_,
ncol_,
packsize_))
packsize_,
quantizeRange_))
};
#else // USE_FBGEMM
ABORT("FbgemmPacked8PackNodeOp can only be used with FBGEMM enabled.");
@@ -177,13 +181,19 @@ struct FbgemmPacked8PackNodeOp : public UnaryNodeOp {
const std::string type() override { return "packMatInt8"; }

#if USE_FBGEMM
Shape newShape(Expr a, bool transpose) {
fbgemmPacked8PackInfo(a->shape(), packType_, transpose, nrow_, ncol_, packsize_);
Shape newShape(Expr a, marian::Type packType, bool transpose) {
fbgemmPacked8PackInfo(
a->shape(),
packType,
transpose,
nrow_,
ncol_,
packsize_);
Shape outShape({(int)packsize_});
return outShape;
}
#else
Shape newShape(Expr /*a*/, bool /*transpose*/) {
Shape newShape(Expr /*a*/, marian::Type /*packType*/, bool /*transpose*/) {
ABORT("Packed GEMM requires a build with USE_FBGEMM enabled");
return Shape();
}
@@ -282,10 +292,17 @@ class FbgemmPacked8AffineNodeOp : public NaryNodeOp {
size_t k_;
bool transA_;
bool transB_;
Type elementType_;

public:
FbgemmPacked8AffineNodeOp(const std::vector<Expr>& nodes, Shape bShape, bool transA, bool transB, float /*scalar*/)
: NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32)/*, scalar_(scalar) */ {
FbgemmPacked8AffineNodeOp(Type elementType,
const std::vector<Expr>& nodes,
Shape bShape,
bool transA,
bool transB,
float /*scalar*/)
: NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32),
elementType_(elementType) {
transA_ = transA;
transB_ = transB;
m_ = nodes[0]->shape().elements() / nodes[0]->shape()[-1];
@@ -324,7 +341,8 @@
#if USE_FBGEMM
// Do addBias only if it has a bias term
if (children().size() > 2) {
nodeOps = { NodeOp(fbgemmPacked8Gemm(val_,
nodeOps = { NodeOp(fbgemmPacked8Gemm(elementType_,
val_,
child(0)->val(),
child(1)->val(),
m_,
@@ -334,7 +352,8 @@
transB_);
marian::cpu::integer::AddBias(val_, child(2)->val())) };
} else {
nodeOps = { NodeOp(fbgemmPacked8Gemm(val_,
nodeOps = { NodeOp(fbgemmPacked8Gemm(elementType_,
val_,
child(0)->val(),
child(1)->val(),
m_,
@@ -358,39 +377,46 @@
const std::string type() override { return "gemmPacked8"; }
};

static inline Expr affine(Expr a, Expr b, Shape bShape, Expr c, bool transA, bool transB, float scalar) {
static inline Expr affine(Type elementType,
Expr a,
Expr b,
Shape bShape,
Expr c,
bool transA,
bool transB,
float scalar) {
std::vector<Expr> nodes = {a, b, c};
Type elementType = b->value_type();

if (elementType == Type::packed16)
return Expression<FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
else if (isPacked(elementType) && sizeOf(elementType) == 1)
return Expression<FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
return Expression<cpu::variant::FbgemmPacked8AffineNodeOp>(
elementType, nodes, bShape, transA, transB, scalar);
else {
ABORT("Only int8 and fp16 are available. {}", elementType);
return nullptr;
}
}

static inline Expr pack(Type elementType, Expr a, PackMatrix packMat, bool transpose) {
static inline Expr pack(Type elementType, Expr a, PackMatrix packMat, bool transpose, float quantizeRange = 0.f) {
if (elementType == Type::packed16)
return Expression<FbgemmPacked16PackNodeOp>(a, packMat, transpose);
else if (isPacked(elementType) && sizeOf(elementType) == 1)
return Expression<FbgemmPacked8PackNodeOp>(a, packMat, elementType, transpose);
return Expression<cpu::variant::FbgemmPacked8PackNodeOp>(a, packMat, elementType, transpose, quantizeRange);
else {
ABORT("Only int8 and fp16 are available. {}", elementType);
return nullptr;
}
}

static inline Expr dot(Expr a, Expr b, Shape bShape, bool transA, bool transB, float scalar) {
static inline Expr dot(Type elementType, Expr a, Expr b, Shape bShape, bool transA, bool transB, float scalar) {
std::vector<Expr> nodes = {a, b};
Type elementType = b->value_type();

if (elementType == Type::packed16)
return Expression<FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
else if (isPacked(elementType) && sizeOf(elementType) == 1)
return Expression<FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
return Expression<cpu::variant::FbgemmPacked8AffineNodeOp>(
elementType, nodes, bShape, transA, transB, scalar);
else {
ABORT("Only int8 and fp16 are available. {}", elementType);
return nullptr;
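
Summarizing the changed signatures in this header: callers now pass the packed element type explicitly to affine() and dot(), and pack() takes an optional quantization range. A rough sketch of the resulting call pattern, mirroring the dispatch in expression_operators.cpp above (W, x and bias are placeholder expressions):

// Hypothetical call pattern: pack the constant weight matrix once, choosing the
// packed type by CPU capability, then run the packed GEMM with a bias.
marian::Type packType = fbgemm::fbgemmHasAvx512Support() ? marian::Type::packed8avx512
                                                         : marian::Type::packed8avx2;
auto packedW = cpu::variant::pack(packType, W, cpu::variant::PackMatrix::B,
                                  /*transpose=*/false, /*quantizeRange=*/0.f);
auto y = cpu::variant::affine(packType, x, packedW, W->shape(), bias,
                              /*transA=*/false, /*transB=*/false, /*scalar=*/1.f);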