diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp
index f29b36307..870bf52d5 100644
--- a/src/common/config_parser.cpp
+++ b/src/common/config_parser.cpp
@@ -696,6 +696,15 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
      "Use approximate knn search in output layer (currently only in transformer)")
     ->implicit_val("100 1024");
 
+  // parameters for on-line quantization
+  cli.add<bool>("--optimize",
+      "Optimize the graph on-the-fly", false);
+  cli.add<std::string>("--gemm-type,-g",
+      "GEMM type to be used for on-line quantization/packing: float32, packed16, packed8", "float32");
+  cli.add<float>("--quantize-range",
+      "Range for the on-line quantization of the weight matrix, expressed as a multiple of the standard deviation; 0.0 means min/max quantization",
+      0.f);
+
 #if 0 // @TODO: Ask Hany if there are any decoding-time options
   // add ULR settings
   addSuboptionsULR(cli);
@@ -747,6 +756,15 @@ void ConfigParser::addOptionsScoring(cli::CLIWrapper& cli) {
       "Mixed precision for inference, set parameter type in expression graph",
       {"float32"});
 
+  // parameters for on-line quantization
+  cli.add<bool>("--optimize",
+      "Optimize the graph on-the-fly", false);
+  cli.add<std::string>("--gemm-type,-g",
+      "GEMM type to be used for on-line quantization/packing: float32, packed16, packed8", "float32");
+  cli.add<float>("--quantize-range",
+      "Range for the on-line quantization of the weight matrix, expressed as a multiple of the standard deviation; 0.0 means min/max quantization",
+      0.f);
+
   cli.switchGroup(previous_group);
   // clang-format on
 }
diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp
index 6c7ef91ce..e4a4b0899 100644
--- a/src/graph/expression_operators.cpp
+++ b/src/graph/expression_operators.cpp
@@ -483,7 +483,45 @@ Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) {
   // --optimize --cpu-thread=N with N > 0 are set.
   if(device == DeviceType::cpu) {
     if(isFloat(aElementType) && isFloat(bElementType)) {
-      return Expression<DotNodeOp>(a, b, transA, transB, scale);
+      if(b->memoize() && (a->graph()->getBackend()->getGemmType() == GemmType::FbFp16Packed ||
+                          a->graph()->getBackend()->getGemmType() == GemmType::FbInt8Packed)) {
+#if USE_FBGEMM
+        if(a->graph()->getBackend()->getGemmType() == GemmType::FbFp16Packed) {
+          auto packedB = cpu::variant::pack(
+              marian::Type::packed16, b, cpu::variant::PackMatrix::B, transB);
+          return cpu::variant::dot(marian::Type::packed16,
+                                   a, packedB, b->shape(), transA, transB, scale);
+        } else {
+          float quantizeRange = b->graph()->getBackend()->getQuantizeRange();
+          if(fbgemm::fbgemmHasAvx512Support()) {
+            auto packedB = cpu::variant::pack(marian::Type::packed8avx512,
+                                              b,
+                                              cpu::variant::PackMatrix::B,
+                                              transB,
+                                              quantizeRange);
+            return cpu::variant::dot(marian::Type::packed8avx512,
+                                     a, packedB, b->shape(), transA, transB, scale);
+          } else if(fbgemm::fbgemmHasAvx2Support()) {
+            auto packedB = cpu::variant::pack(marian::Type::packed8avx2,
+                                              b,
+                                              cpu::variant::PackMatrix::B,
+                                              transB,
+                                              quantizeRange);
+            return cpu::variant::dot(marian::Type::packed8avx2,
+                                     a, packedB, b->shape(), transA, transB, scale);
+          } else {
+            ABORT(
+                "AVX2 is not available. At least AVX2 is needed to use fbgemm-based packed "
+                "GEMM");
+          }
+        }
+#else
+        ABORT("Packed GEMM is not available in this build");
+#endif // USE_FBGEMM
+      } else {
+        return Expression<DotNodeOp>(
+            a, b, transA, transB, scale);
+      }
     } else if(isFloat(aElementType) && isIntgemm(bElementType)) {
       return cpu::integer::affineOrDot(a, b, nullptr, transA, transB, scale);
     } else if(isFloat(aElementType) && isPacked(bElementType)) {
@@ -495,7 +533,8 @@ Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) {
       // and this cpu lookup is executed only once and the state is kept in FBGEMM.
       if(fbgemm::fbgemmHasAvx2Support()) {
         // This variant of dot product can handle matrix multiplications with packed8 and packed16 weight matrix (B).
-        return cpu::variant::dot(a,
+        return cpu::variant::dot(b->value_type(),
+                                 a,
                                  b,
                                  b->shape(),
                                  transA,
@@ -541,7 +580,48 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
 
   if(device == DeviceType::cpu) {
     if(isFloat(aElementType) && isFloat(bElementType)) {
-      return affineDefault(a, b, bias, transA, transB, scale);
+      if(a->graph()->getBackend()->isOptimized()) {
+        if(b->memoize() && (a->graph()->getBackend()->getGemmType() == GemmType::FbFp16Packed ||
+                            a->graph()->getBackend()->getGemmType() == GemmType::FbInt8Packed)) {
+#if USE_FBGEMM
+          if(a->graph()->getBackend()->getGemmType() == GemmType::FbFp16Packed) {
+            auto packedB = cpu::variant::pack(
+                marian::Type::packed16, b, cpu::variant::PackMatrix::B, transB);
+            return cpu::variant::affine(marian::Type::packed16,
+                                        a, packedB, b->shape(), bias, transA, transB, scale);
+          } else {
+            float quantizeRange = b->graph()->getBackend()->getQuantizeRange();
+            if(fbgemm::fbgemmHasAvx512Support()) {
+              auto packedB = cpu::variant::pack(marian::Type::packed8avx512,
+                                                b,
+                                                cpu::variant::PackMatrix::B,
+                                                transB,
+                                                quantizeRange);
+              return cpu::variant::affine(marian::Type::packed8avx512,
+                                          a, packedB, b->shape(), bias, transA, transB, scale);
+            } else if(fbgemm::fbgemmHasAvx2Support()) {
+              auto packedB = cpu::variant::pack(marian::Type::packed8avx2,
+                                                b,
+                                                cpu::variant::PackMatrix::B,
+                                                transB,
+                                                quantizeRange);
+              return cpu::variant::affine(marian::Type::packed8avx2,
+                                          a, packedB, b->shape(), bias, transA, transB, scale);
+            } else {
+              ABORT(
+                  "AVX2 is not available. At least AVX2 is needed to use fbgemm-based packed "
+                  "GEMM");
+            }
+          }
+#else
+          ABORT("Packed GEMM is not available in this build");
+#endif // USE_FBGEMM
+        } else {
+          return affineDefault(a, b, bias, transA, transB, scale);
+        }
+      } else {
+        return affineDefault(a, b, bias, transA, transB, scale);
+      }
     } else if(isFloat(aElementType) && isIntgemm(bElementType)) {
       return cpu::integer::affineOrDot(a, b, bias, transA, transB, scale);
     } else if(isFloat(aElementType) && isPacked(bElementType)) {
@@ -553,7 +633,8 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
       // and this cpu lookup is executed only once and the state is kept in FBGEMM.
       if(fbgemm::fbgemmHasAvx2Support()) {
         // This variant of affine product can handle matrix multiplications with packed8 and packed16 weight matrix (B).
-        return cpu::variant::affine(a,
+        return cpu::variant::affine(b->value_type(),
+                                    a,
                                     b,
                                     b->shape(),
                                     bias,
diff --git a/src/layers/generic.h b/src/layers/generic.h
index 8f390bd7d..9af033df5 100644
--- a/src/layers/generic.h
+++ b/src/layers/generic.h
@@ -177,6 +177,8 @@ static inline std::function<Expr(Expr)> activationByName(const std::string& actN
     return (ActivationFunction*)swish;
   else if (actName == "gelu")
     return (ActivationFunction*)gelu;
+  else if (actName == "sigmoid")
+    return (ActivationFunction*)sigmoid;
   else if (actName == "") // return identity function if activation name is empty
     return [](Expr x) { return x; };
   ABORT("Invalid activation name '{}'", actName);
diff --git a/src/tensors/backend.h b/src/tensors/backend.h
index 160b828d3..e0e93039e 100644
--- a/src/tensors/backend.h
+++ b/src/tensors/backend.h
@@ -5,6 +5,14 @@
 
 namespace marian {
 
+// GEMM type enum
+typedef enum {
+  Auto = 0,           // auto tuning between available GEMMs
+  Float32 = 1,        // MKL based GEMM, fp32
+  FbFp16Packed = 10,  // FBGEMM based fp16 GEMM with packing
+  FbInt8Packed = 11   // FBGEMM based int8 GEMM with packing
+} GemmType;
+
 class Backend {
 protected:
   DeviceId deviceId_;
@@ -21,6 +29,19 @@ class Backend {
   // for GPU only, calls cudaSetDevice, does nothing on CPU. Maybe change name.
   virtual void setDevice() = 0;
   virtual void synchronize() = 0;
+
+  // for CPU, sets whether to use optimized code for inference.
+  // for GPU, this is not supported; isOptimized() always returns false.
+  virtual void setOptimized(bool optimize) = 0;
+  virtual bool isOptimized() = 0;
+  // for CPU, selects the GEMM type used for inference.
+  // for GPU, there is no GEMM type, so this does nothing.
+  virtual void setGemmType(std::string gemmType) = 0;
+  virtual GemmType getGemmType() = 0;
+  // for CPU, sets the quantization range of weight matrices for inference.
+  // for GPU, there is no quantization, so this does nothing.
+  virtual void setQuantizeRange(float range) = 0;
+  virtual float getQuantizeRange() = 0;
 };
 
 Ptr<Backend> BackendByDeviceId(DeviceId deviceId, size_t seed);
diff --git a/src/tensors/cpu/backend.h b/src/tensors/cpu/backend.h
index 398e24240..f52ff6a33 100644
--- a/src/tensors/cpu/backend.h
+++ b/src/tensors/cpu/backend.h
@@ -10,10 +10,34 @@ namespace marian {
 namespace cpu {
 
 class Backend : public marian::Backend {
+protected:
+  bool optimized_{false};
+  GemmType gemmType_{GemmType::Float32};
+  float quantizeRange_{0.f};
+
 public:
   Backend(DeviceId deviceId, size_t seed) : marian::Backend(deviceId, seed) {}
   void setDevice() override {}
   void synchronize() override {}
+
+  // for CPU & inference only, sets whether to use optimized code. Does nothing for GPU.
+  void setOptimized(bool optimize) override { optimized_ = optimize; }
+  bool isOptimized() override { return optimized_; }
+  // for CPU only, selects the GEMM type used for inference. Does nothing for GPU.
+  void setGemmType(std::string gemmType) override {
+    if (gemmType == "auto") gemmType_ = GemmType::Auto;
+    else if (gemmType == "float32") gemmType_ = GemmType::Float32;
+#if USE_FBGEMM
+    else if (gemmType == "packed16") gemmType_ = GemmType::FbFp16Packed;
+    else if (gemmType.find("packed8") == 0) gemmType_ = GemmType::FbInt8Packed;
+#endif // USE_FBGEMM
+    else ABORT("Unknown GEMM type - '{}'", gemmType);
+  }
+  GemmType getGemmType() override { return gemmType_; }
+  // for CPU, sets the quantization range of weight matrices for inference.
+  // for GPU, there is no quantization, so this does nothing.
+  void setQuantizeRange(float range) override { quantizeRange_ = range; }
+  float getQuantizeRange() override { return quantizeRange_; }
 };
 
 }  // namespace cpu
diff --git a/src/tensors/cpu/fbgemm/expanded_gemm.h b/src/tensors/cpu/fbgemm/expanded_gemm.h
index fb07bbad5..2c376d6e2 100644
--- a/src/tensors/cpu/fbgemm/expanded_gemm.h
+++ b/src/tensors/cpu/fbgemm/expanded_gemm.h
@@ -138,15 +138,18 @@ struct FbgemmPacked8PackNodeOp : public UnaryNodeOp {
   int nrow_;
   int ncol_;
   uint64_t packsize_;
+  float quantizeRange_;
 
   FbgemmPacked8PackNodeOp(Expr a,
                           PackMatrix packMat,
                           marian::Type packType,
-                          bool transpose)
-      : UnaryNodeOp(a, newShape(a, transpose), Type::uint8),
+                          bool transpose,
+                          float quantizeRange)
+      : UnaryNodeOp(a, newShape(a, packType, transpose), Type::uint8),
         packMat_(packMat),
         packType_(packType),
-        transpose_(transpose) {
+        transpose_(transpose),
+        quantizeRange_(quantizeRange) {
     if(packMat != PackMatrix::B)
       ABORT("Only prepacking of B (weight matrix) is supported");
     if(!memoize_)
@@ -161,7 +164,8 @@
                                      transpose_,
                                      nrow_,
                                      ncol_,
-                                     packsize_))
+                                     packsize_,
+                                     quantizeRange_))
     };
 #else // USE_FBGEMM
     ABORT("FbgemmPacked8PackNodeOp can only be used with FBGEMM enabled.");
@@ -177,13 +181,19 @@
   const std::string type() override { return "packMatInt8"; }
 
 #if USE_FBGEMM
-  Shape newShape(Expr a, bool transpose) {
-    fbgemmPacked8PackInfo(a->shape(), packType_, transpose, nrow_, ncol_, packsize_);
+  Shape newShape(Expr a, marian::Type packType, bool transpose) {
+    fbgemmPacked8PackInfo(
+        a->shape(),
+        packType,
+        transpose,
+        nrow_,
+        ncol_,
+        packsize_);
     Shape outShape({(int)packsize_});
     return outShape;
   }
 #else
-  Shape newShape(Expr /*a*/, bool /*transpose*/) {
+  Shape newShape(Expr /*a*/, marian::Type /*packType*/, bool /*transpose*/) {
     ABORT("Packed GEMM requires a build with USE_FBGEMM enabled");
     return Shape();
   }
@@ -282,10 +292,17 @@ class FbgemmPacked8AffineNodeOp : public NaryNodeOp {
   size_t k_;
   bool transA_;
   bool transB_;
+  Type elementType_;
 
 public:
-  FbgemmPacked8AffineNodeOp(const std::vector<Expr>& nodes, Shape bShape, bool transA, bool transB, float /*scalar*/)
-      : NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32)/*, scalar_(scalar) */ {
+  FbgemmPacked8AffineNodeOp(Type elementType,
+                            const std::vector<Expr>& nodes,
+                            Shape bShape,
+                            bool transA,
+                            bool transB,
+                            float /*scalar*/)
+      : NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32),
+        elementType_(elementType) {
     transA_ = transA;
     transB_ = transB;
     m_ = nodes[0]->shape().elements() / nodes[0]->shape()[-1];
@@ -324,7 +341,8 @@ class FbgemmPacked8AffineNodeOp : public NaryNodeOp {
 #if USE_FBGEMM
     // Do addBias only if it has a bias term
     if (children().size() > 2) {
-      nodeOps = { NodeOp(fbgemmPacked8Gemm(val_,
+      nodeOps = { NodeOp(fbgemmPacked8Gemm(elementType_,
+                                           val_,
                                            child(0)->val(),
                                            child(1)->val(),
                                            m_,
@@ -334,7 +352,8 @@
                                            transB_);
                          marian::cpu::integer::AddBias(val_, child(2)->val())) };
     } else {
-      nodeOps = { NodeOp(fbgemmPacked8Gemm(val_,
+      nodeOps = { NodeOp(fbgemmPacked8Gemm(elementType_,
+                                           val_,
                                            child(0)->val(),
                                            child(1)->val(),
                                            m_,
@@ -358,39 +377,46 @@ class FbgemmPacked8AffineNodeOp : public NaryNodeOp {
 
   const std::string type() override { return "gemmPacked8"; }
 };
 
-static inline Expr affine(Expr a, Expr b, Shape bShape, Expr c, bool transA, bool transB, float scalar) {
+static inline Expr affine(Type elementType,
+                          Expr a,
+                          Expr b,
+                          Shape bShape,
+                          Expr c,
+                          bool transA,
+                          bool transB,
+                          float scalar) {
   std::vector<Expr> nodes = {a, b, c};
-  Type elementType = b->value_type();
 
   if (elementType == Type::packed16)
     return Expression<cpu::variant::FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
   else if (isPacked(elementType) && sizeOf(elementType) == 1)
-    return Expression<cpu::variant::FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
+    return Expression<cpu::variant::FbgemmPacked8AffineNodeOp>(
+        elementType, nodes, bShape, transA, transB, scalar);
   else {
     ABORT("Only int8 and fp16 are available. {}", elementType);
     return nullptr;
   }
 }
 
-static inline Expr pack(Type elementType, Expr a, PackMatrix packMat, bool transpose) {
+static inline Expr pack(Type elementType, Expr a, PackMatrix packMat, bool transpose, float quantizeRange = 0.f) {
   if (elementType == Type::packed16)
     return Expression<cpu::variant::FbgemmPacked16PackNodeOp>(a, packMat, transpose);
   else if (isPacked(elementType) && sizeOf(elementType) == 1)
-    return Expression<cpu::variant::FbgemmPacked8PackNodeOp>(a, packMat, elementType, transpose);
+    return Expression<cpu::variant::FbgemmPacked8PackNodeOp>(a, packMat, elementType, transpose, quantizeRange);
  else {
     ABORT("Only int8 and fp16 are available. {}", elementType);
     return nullptr;
   }
 }
 
-static inline Expr dot(Expr a, Expr b, Shape bShape, bool transA, bool transB, float scalar) {
+static inline Expr dot(Type elementType, Expr a, Expr b, Shape bShape, bool transA, bool transB, float scalar) {
   std::vector<Expr> nodes = {a, b};
-  Type elementType = b->value_type();
 
   if (elementType == Type::packed16)
     return Expression<cpu::variant::FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
   else if (isPacked(elementType) && sizeOf(elementType) == 1)
-    return Expression<cpu::variant::FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
+    return Expression<cpu::variant::FbgemmPacked8AffineNodeOp>(
+        elementType, nodes, bShape, transA, transB, scalar);
   else {
     ABORT("Only int8 and fp16 are available. {}", elementType);
     return nullptr;
diff --git a/src/tensors/cpu/fbgemm/packed_gemm.cpp b/src/tensors/cpu/fbgemm/packed_gemm.cpp
index 65dca1f70..dd81d0f7f 100644
--- a/src/tensors/cpu/fbgemm/packed_gemm.cpp
+++ b/src/tensors/cpu/fbgemm/packed_gemm.cpp
@@ -360,10 +360,10 @@ void fbgemmPacked8Pack(marian::Tensor out,
   const float* data = inData;
   float val = 0;
-
-  // Use half of the quantization range to prevent overflow of VPMADDUBSW
-  constexpr static int quantizedRange = 127;
-  constexpr static int quantizedMax = 63;
+
+  // Use half of the quantization range to prevent overflow of VPMADDUBSW
+  constexpr static int quantizedRange = 127;
+  constexpr static int quantizedMax = 63;
 
   // This routine computes the quantization range for each column - either the min/max range or the quantRangeStdDevs sigma range.
   for (size_t jj = 0; jj < n; jj++) { // for each column, collect stats (min/max or mean/std.dev.)
@@ -371,32 +371,32 @@ void fbgemmPacked8Pack(marian::Tensor out,
     double mean = 0, sqrSum = 0;
     for (size_t ii = 0; ii < k; ii++) { // in a column, go through all the rows and collect stats
       val = getVal2dArr(data, ii, jj, k, n, transpose);
-      // If quantRangeStdDevs is 0.f, the min/max values of the column are used as the quantization range
-      if(quantRangeStdDevs == 0.f) {
-        if(min > val)
-          min = val;
-        if(max < val)
-          max = val;
-      } else {
-        // Quantize by std.dev. range
-        mean += val;
-        sqrSum += val * val;
-      }
-    }
-    // If a quantization range (in multiples of std. dev.) is given with a non-zero value,
-    // it calculates the range for this column (a different quantization scale/offset is used for each column)
-    if(quantRangeStdDevs != 0.f) {
-      mean /= k;
-      sqrSum /= k;
-      sqrSum -= mean * mean;
-      sqrSum = sqrt(sqrSum);
-      min = (float)(mean - quantRangeStdDevs * sqrSum);
-      max = (float)(mean + quantRangeStdDevs * sqrSum);
-    }
-    // based on the quantization range, this computes the scale and offset for the quantization
-    quantScaleB[jj] = (max - min) / quantizedRange;
-    quantZeropointB[jj] = (int32_t)(quantizedMax - max / quantScaleB[jj]);
-  }
+      // If quantRangeStdDevs is 0.f, the min/max values of the column are used as the quantization range
+      if(quantRangeStdDevs == 0.f) {
+        if(min > val)
+          min = val;
+        if(max < val)
+          max = val;
+      } else {
+        // Quantize by std.dev. range
+        mean += val;
+        sqrSum += val * val;
+      }
+    }
+    // If a quantization range (in multiples of std. dev.) is given with a non-zero value,
+    // it calculates the range for this column (a different quantization scale/offset is used for each column)
+    if(quantRangeStdDevs != 0.f) {
+      mean /= k;
+      sqrSum /= k;
+      sqrSum -= mean * mean;
+      sqrSum = sqrt(sqrSum);
+      min = (float)(mean - quantRangeStdDevs * sqrSum);
+      max = (float)(mean + quantRangeStdDevs * sqrSum);
+    }
+    // based on the quantization range, this computes the scale and offset for the quantization
+    quantScaleB[jj] = (max - min) / quantizedRange;
+    quantZeropointB[jj] = (int32_t)(quantizedMax - max / quantScaleB[jj]);
+  }
 
   // 2. quantize
   int8_t* quantized = 0;
@@ -410,7 +410,7 @@ void fbgemmPacked8Pack(marian::Tensor out,
     TensorQuantizationParams bQuantParam;
     bQuantParam.scale = quantScaleB[jj];
     bQuantParam.zero_point = quantZeropointB[jj];
-    bQuantParam.precision = 7; // Use half of the quantization range to prevent overflow of VPMADDUBSW
+    bQuantParam.precision = 7; // Use half of the quantization range to prevent overflow of VPMADDUBSW
 
     if (transpose)
       fbgemm::Quantize<int8_t>(data + jj * k, quantized + jj * k, k, bQuantParam);
@@ -536,7 +536,8 @@ void fbgemmPacked16Gemm(marian::Tensor C,
 // k: the number of columns in A and the number of rows in B
 // transA: whether A matrix is transposed or not
 // transB: whether B matrix is transposed or not
-void fbgemmPacked8Gemm(marian::Tensor C,
+void fbgemmPacked8Gemm(Type packType,
+                       marian::Tensor C,
                        const marian::Tensor A,
                        const marian::Tensor B,
                        const size_t m,
@@ -544,9 +545,6 @@ void fbgemmPacked8Gemm(marian::Tensor C,
                        const size_t k,
                        const int transA,
                        const int transB) {
-  // pack type
-  marian::Type packType = B->type();
-
   const fbgemm::BlockingFactors* params = getBlockingFactors(packType);
 
   // Check if the packed format matches with the available AVX instruction set in the machine
diff --git a/src/tensors/cpu/fbgemm/packed_gemm.h b/src/tensors/cpu/fbgemm/packed_gemm.h
index 694860d48..e5740a434 100644
--- a/src/tensors/cpu/fbgemm/packed_gemm.h
+++ b/src/tensors/cpu/fbgemm/packed_gemm.h
@@ -135,7 +135,8 @@ void fbgemmPacked16Gemm(marian::Tensor C,
 // k: the number of columns in A and rows in B
 // transA: transpose of A matrix
 // transB: transpose of B matrix
-void fbgemmPacked8Gemm(marian::Tensor C,
+void fbgemmPacked8Gemm(Type packType,
+                       marian::Tensor C,
                        const marian::Tensor A,
                        const marian::Tensor B,
                        const size_t m,
diff --git a/src/tensors/gpu/backend.h b/src/tensors/gpu/backend.h
index 75cc604da..410b41a49 100644
--- a/src/tensors/gpu/backend.h
+++ b/src/tensors/gpu/backend.h
@@ -64,6 +64,36 @@ class Backend : public marian::Backend {
     return cusparseHandle_;
   }
 
+  // for CPU, sets whether to use optimized code for inference.
+  // for GPU, this is not supported; isOptimized() always returns false.
+  void setOptimized(bool optimize) override {
+    LOG_ONCE(info, "setOptimized() not supported for GPU_{}", optimize);
+  }
+  bool isOptimized() override {
+    LOG_ONCE(info, "isOptimized() not supported for GPU");
+    return false;
+  };
+
+  // for CPU, selects the GEMM type used for inference.
+  // for GPU, there is no GEMM type, so this does nothing.
+  void setGemmType(std::string gemmType) override {
+    LOG_ONCE(info, "setGemmType() not supported for GPU_{}", gemmType);
+  }
+  GemmType getGemmType() override {
+    LOG_ONCE(info, "getGemmType() not supported for GPU");
+    return GemmType::Float32;
+  }
+
+  // for CPU, sets the quantization range of weight matrices for inference.
+  // for GPU, there is no quantization, so this does nothing.
+  void setQuantizeRange(float range) override {
+    LOG_ONCE(info, "setQuantizeRange() not supported for GPU_{}", range);
+  }
+  float getQuantizeRange() override {
+    LOG_ONCE(info, "getQuantizeRange() not supported for GPU");
+    return 0.f;
+  }
+
   CudaCompute getCudaComputeCapability() { return compute_; }
 
 private:
diff --git a/src/translator/translator.h b/src/translator/translator.h
index fe01065b6..579f126de 100644
--- a/src/translator/translator.h
+++ b/src/translator/translator.h
@@ -89,6 +89,11 @@ class Translate : public ModelTask {
       auto prec = options_->get<std::vector<std::string>>("precision", {"float32"});
       graph->setDefaultElementType(typeFromString(prec[0]));
       graph->setDevice(device);
+      if (device.type == DeviceType::cpu) {
+        graph->getBackend()->setOptimized(options_->get<bool>("optimize"));
+        graph->getBackend()->setGemmType(options_->get<std::string>("gemm-type"));
+        graph->getBackend()->setQuantizeRange(options_->get<float>("quantize-range"));
+      }
       graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
       graphs_[id] = graph;
 
@@ -282,6 +287,11 @@ class TranslateService : public ModelServiceTask {
       auto precison = options_->get<std::vector<std::string>>("precision", {"float32"});
       graph->setDefaultElementType(typeFromString(precison[0])); // only use first type, used for parameter type in graph
       graph->setDevice(device);
+      if (device.type == DeviceType::cpu) {
+        graph->getBackend()->setOptimized(options_->get<bool>("optimize"));
+        graph->getBackend()->setGemmType(options_->get<std::string>("gemm-type"));
+        graph->getBackend()->setQuantizeRange(options_->get<float>("quantize-range"));
+      }
       graph->reserveWorkspaceMB(options_->get<size_t>("workspace"));
       graphs_.push_back(graph);
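Note on wiring (illustrative, not part of the patch): the sketch below shows how the three new options are expected to reach the CPU backend before graph construction, mirroring the translator.h hunks above. The helper name createCpuGraphForInference() and the "opts" object are hypothetical stand-ins used only for this example; the option names come from the config_parser.cpp hunks.

// Minimal sketch, assuming a marian Options object already populated from the CLI.
#include "graph/expression_graph.h"
#include "common/options.h"

using namespace marian;

Ptr<ExpressionGraph> createCpuGraphForInference(Ptr<Options> opts, size_t deviceNo) {
  auto graph = New<ExpressionGraph>(/*inference=*/true);
  graph->setDevice(DeviceId{deviceNo, DeviceType::cpu});

  // New in this patch: dot()/affine() in expression_operators.cpp query these
  // settings via isOptimized(), getGemmType() and getQuantizeRange() to decide
  // whether to pack the weight matrix B into packed16/packed8 on the fly.
  graph->getBackend()->setOptimized(opts->get<bool>("optimize"));
  graph->getBackend()->setGemmType(opts->get<std::string>("gemm-type"));
  graph->getBackend()->setQuantizeRange(opts->get<float>("quantize-range"));

  graph->reserveWorkspaceMB(opts->get<size_t>("workspace"));
  return graph;
}

At the command line this corresponds to something like "--cpu-threads 8 --optimize --gemm-type packed8 --quantize-range 0.0"; the flag names are taken from the config_parser.cpp hunks, while the exact invocation is only an example.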