Online quantization (#847)
* Enable on-line packing/quantization
* Add half-precision min/max quantization for model weights
* Change the default quantization of the B matrix to min/max; revert an erroneous commit for AggregateAll
* Fix missing half-precision quantization
* Fix the quantization range for A
* Set all default values for the quantize range to 0.f
* Use a 7-bit clip for the weight matrix quantization to avoid overflowing VPMADDUBSW (sketched after the file summary below)
ykim362 authored May 25, 2021
1 parent 3133a9b commit 9fa166b
Showing 10 changed files with 270 additions and 59 deletions.
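
The commit message's quantization scheme in a nutshell: weights are quantized to 8-bit integers using either the min/max of the weight matrix or a multiple of its standard deviation, and the quantized values are clipped to 7 bits. The sketch below is not code from this commit and all names in it are hypothetical; it only illustrates the arithmetic. The 7-bit clip exists because VPMADDUBSW multiplies unsigned 8-bit activations by signed 8-bit weights and adds adjacent pairs into a 16-bit accumulator, so weights restricted to +/-63 keep the worst-case pairwise sum (2 * 255 * 63 = 32130) inside the int16 range.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Hypothetical helper: min/max int8 quantization of a weight matrix with a
// 7-bit clip, so the int16 accumulator of VPMADDUBSW cannot overflow.
static std::vector<int8_t> quantizeMinMax7bit(const std::vector<float>& w, float& scaleOut) {
  float absMax = 0.f;
  for(float v : w)
    absMax = std::max(absMax, std::abs(v));
  const int kClip = 63;  // 7-bit signed range
  scaleOut = absMax > 0.f ? absMax / kClip : 1.f;
  std::vector<int8_t> q(w.size());
  for(size_t i = 0; i < w.size(); ++i) {
    int v = (int)std::round(w[i] / scaleOut);
    q[i] = (int8_t)std::max(-kClip, std::min(kClip, v));
  }
  return q;
}

When --quantize-range is non-zero, absMax above would instead be derived from the standard deviation of the weights; see the sketch after the CLI options below.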
18 changes: 18 additions & 0 deletions src/common/config_parser.cpp
@@ -696,6 +696,15 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
"Use approximate knn search in output layer (currently only in transformer)")
->implicit_val("100 1024");

// parameters for on-line quantization
cli.add<bool>("--optimize",
"Optimize the graph on-the-fly", false);
cli.add<std::string>("--gemm-type,-g",
"GEMM Type to be used for on-line quantization/packing: float32, packed16, packed8", "float32");
cli.add<float>("--quantize-range",
"Range for the on-line quantiziation of weight matrix in multiple of this range and standard deviation, 0.0 means min/max quantization",
0.f);

#if 0 // @TODO: Ask Hany if there are any decoding-time options
// add ULR settings
addSuboptionsULR(cli);
@@ -747,6 +756,15 @@ void ConfigParser::addOptionsScoring(cli::CLIWrapper& cli) {
"Mixed precision for inference, set parameter type in expression graph",
{"float32"});

// parameters for on-line quantization
cli.add<bool>("--optimize",
"Optimize the graph on-the-fly", false);
cli.add<std::string>("--gemm-type,-g",
"GEMM Type to be used for on-line quantization/packing: float32, packed16, packed8", "float32");
cli.add<float>("--quantize-range",
"Range for the on-line quantiziation of weight matrix in multiple of this range and standard deviation, 0.0 means min/max quantization",
0.f);

cli.switchGroup(previous_group);
// clang-format on
}
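
To make the --quantize-range help text above concrete: 0.0 selects min/max quantization, while a positive value r clips the weights at r standard deviations of the weight distribution. The helper below is a hypothetical illustration of that choice, not code from this commit.

#include <cmath>
#include <vector>

// Hypothetical helper illustrating the --quantize-range semantics:
// 0.0   -> use the absolute maximum of the weights (min/max quantization),
// r > 0 -> clip the weights at r standard deviations.
static float pickQuantizeRange(const std::vector<float>& w, float quantizeRange) {
  if(quantizeRange == 0.f) {
    float absMax = 0.f;
    for(float v : w)
      absMax = std::fmax(absMax, std::fabs(v));
    return absMax;
  }
  double sum = 0.0, sumSq = 0.0;
  for(float v : w) {
    sum += v;
    sumSq += (double)v * v;
  }
  double mean = sum / w.size();
  double var  = sumSq / w.size() - mean * mean;
  return quantizeRange * (float)std::sqrt(std::fmax(var, 0.0));
}

On the command line these options are meant to be combined, e.g. --optimize --gemm-type packed8 --quantize-range 3.0; a useful range value is model-dependent.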
89 changes: 85 additions & 4 deletions src/graph/expression_operators.cpp
@@ -483,7 +483,45 @@ Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) {
// --optimize --cpu-thread=N with N > 0 are set.
if(device == DeviceType::cpu) {
if(isFloat(aElementType) && isFloat(bElementType)) {
return Expression<DotNodeOp>(a, b, transA, transB, scale);
if(b->memoize() && (a->graph()->getBackend()->getGemmType() == GemmType::FbFp16Packed ||
a->graph()->getBackend()->getGemmType() == GemmType::FbInt8Packed)) {
#if USE_FBGEMM
if(a->graph()->getBackend()->getGemmType() == GemmType::FbFp16Packed) {
auto packedB = cpu::variant::pack(
marian::Type::packed16, b, cpu::variant::PackMatrix::B, transB);
return cpu::variant::dot(marian::Type::packed16,
a, packedB, b->shape(), transA, transB, scale);
} else {
float quantizeRange = b->graph()->getBackend()->getQuantizeRange();
if(fbgemm::fbgemmHasAvx512Support()) {
auto packedB = cpu::variant::pack(marian::Type::packed8avx512,
b,
cpu::variant::PackMatrix::B,
transB,
quantizeRange);
return cpu::variant::dot(marian::Type::packed8avx512,
a, packedB, b->shape(), transA, transB, scale);
} else if(fbgemm::fbgemmHasAvx2Support()) {
auto packedB = cpu::variant::pack(marian::Type::packed8avx2,
b,
cpu::variant::PackMatrix::B,
transB,
quantizeRange);
return cpu::variant::dot(marian::Type::packed8avx2,
a, packedB, b->shape(), transA, transB, scale);
} else {
ABORT(
"AVX2 is not available. At least, AVX2 is needed to use fbgemm-based packed "
"GEMM");
}
}
#else
ABORT("Packed GEMM is not available in this build");
#endif // USE_FBGEMM
} else {
return Expression<DotNodeOp>(
a, b, transA, transB, scale);
}
} else if(isFloat(aElementType) && isIntgemm(bElementType)) {
return cpu::integer::affineOrDot(a, b, nullptr, transA, transB, scale);
} else if(isFloat(aElementType) && isPacked(bElementType)) {
@@ -495,7 +533,8 @@ Expr dot(Expr a, Expr b, bool transA, bool transB, float scale) {
// and this cpu lookup is executed only once and the state is kept in FBGEMM.
if(fbgemm::fbgemmHasAvx2Support()) {
// This variant of dot product can handle matrix multiplications with packed8 and packed16 weight matrix (B).
return cpu::variant::dot(a,
return cpu::variant::dot(b->value_type(),
a,
b,
b->shape(),
transA,
@@ -541,7 +580,48 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {

if(device == DeviceType::cpu) {
if(isFloat(aElementType) && isFloat(bElementType)) {
return affineDefault(a, b, bias, transA, transB, scale);
if(a->graph()->getBackend()->isOptimized()) {
if(b->memoize() && (a->graph()->getBackend()->getGemmType() == GemmType::FbFp16Packed ||
a->graph()->getBackend()->getGemmType() == GemmType::FbInt8Packed)) {
#if USE_FBGEMM
if(a->graph()->getBackend()->getGemmType() == GemmType::FbFp16Packed) {
auto packedB = cpu::variant::pack(
marian::Type::packed16, b, cpu::variant::PackMatrix::B, transB);
return cpu::variant::affine(marian::Type::packed16,
a, packedB, b->shape(), bias, transA, transB, scale);
} else {
float quantizeRange = b->graph()->getBackend()->getQuantizeRange();
if(fbgemm::fbgemmHasAvx512Support()) {
auto packedB = cpu::variant::pack(marian::Type::packed8avx512,
b,
cpu::variant::PackMatrix::B,
transB,
quantizeRange);
return cpu::variant::affine(marian::Type::packed8avx512,
a, packedB, b->shape(), bias, transA, transB, scale);
} else if(fbgemm::fbgemmHasAvx2Support()) {
auto packedB = cpu::variant::pack(marian::Type::packed8avx2,
b,
cpu::variant::PackMatrix::B,
transB,
quantizeRange);
return cpu::variant::affine(marian::Type::packed8avx2,
a, packedB, b->shape(), bias, transA, transB, scale);
} else {
ABORT(
"AVX2 is not available. At least, AVX2 is needed to use fbgemm-based packed "
"GEMM");
}
}
#else
ABORT("Packed GEMM is not available in this build");
#endif // USE_FBGEMM
} else {
return affineDefault(a, b, bias, transA, transB, scale);
}
} else {
return affineDefault(a, b, bias, transA, transB, scale);
}
} else if(isFloat(aElementType) && isIntgemm(bElementType)) {
return cpu::integer::affineOrDot(a, b, bias, transA, transB, scale);
} else if(isFloat(aElementType) && isPacked(bElementType)) {
@@ -553,7 +633,8 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
// and this cpu lookup is executed only once and the state is kept in FBGEMM.
if(fbgemm::fbgemmHasAvx2Support()) {
// This variant of affine product can handle matrix multiplications with packed8 and packed16 weight matrix (B).
return cpu::variant::affine(a,
return cpu::variant::affine(b->value_type(),
a,
b,
b->shape(),
bias,
2 changes: 2 additions & 0 deletions src/layers/generic.h
@@ -177,6 +177,8 @@ static inline std::function<Expr(Expr)> activationByName(const std::string& actName)
return (ActivationFunction*)swish;
else if (actName == "gelu")
return (ActivationFunction*)gelu;
else if (actName == "sigmoid")
return (ActivationFunction*)sigmoid;
else if (actName == "") // return identity function if activation name is empty
return [](Expr x) { return x; };
ABORT("Invalid activation name '{}'", actName);
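
A tiny usage sketch for the lookup above (x stands for any existing Expr; this is not code from the commit):

// The returned std::function<Expr(Expr)> is applied directly to an expression;
// the new "sigmoid" entry behaves like the existing "swish" and "gelu" ones.
auto act = activationByName("sigmoid");
Expr y = act(x);  // same as sigmoid(x); an empty name returns the identity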
21 changes: 21 additions & 0 deletions src/tensors/backend.h
@@ -5,6 +5,14 @@

namespace marian {

// GEMM type enum
typedef enum {
Auto = 0, // auto tuning between available GEMMs
Float32 = 1, // MKL based GEMM, fp32
FbFp16Packed = 10, // FBGEMM based fp16 GEMM with packing
FbInt8Packed = 11 // FBGEMM based int8 GEMM with packing
} GemmType;

class Backend {
protected:
DeviceId deviceId_;
@@ -21,6 +29,19 @@ class Backend {
// for GPU only, calls cudaSetDevice, does nothing on CPU. Maybe change name.
virtual void setDevice() = 0;
virtual void synchronize() = 0;

// for CPU, enables the optimized inference code path.
// for GPU, this is not applicable; isOptimized() always returns false.
virtual void setOptimized(bool optimize) = 0;
virtual bool isOptimized() = 0;
// for CPU, selects the GEMM type used for inference.
// for GPU, there is no GEMM type, so this does nothing.
virtual void setGemmType(std::string gemmType) = 0;
virtual GemmType getGemmType() = 0;
// for CPU, sets the quantization range of weight matrices for inference.
// for GPU, there is no quantization, so this does nothing.
virtual void setQuantizeRange(float range) = 0;
virtual float getQuantizeRange() = 0;
};

Ptr<Backend> BackendByDeviceId(DeviceId deviceId, size_t seed);
24 changes: 24 additions & 0 deletions src/tensors/cpu/backend.h
@@ -10,10 +10,34 @@ namespace marian {
namespace cpu {

class Backend : public marian::Backend {
protected:
bool optimized_{false};
GemmType gemmType_{GemmType::Float32};
float quantizeRange_{0.f};

public:
Backend(DeviceId deviceId, size_t seed) : marian::Backend(deviceId, seed) {}
void setDevice() override {}
void synchronize() override {}

// for CPU & inference only, sets to use optimized code for inference. Does nothing for GPU.
void setOptimized(bool optimize) override { optimized_ = optimize; }
bool isOptimized() override { return optimized_; }
// for CPU only, selects the GEMM type used for inference. Does nothing for GPU.
void setGemmType(std::string gemmType) override {
if (gemmType == "auto") gemmType_ = GemmType::Auto;
else if (gemmType == "float32") gemmType_ = GemmType::Float32;
#if USE_FBGEMM
else if (gemmType == "packed16") gemmType_ = GemmType::FbFp16Packed;
else if (gemmType.find("packed8") == 0) gemmType_ = GemmType::FbInt8Packed;
#endif // USE_FBGEMM
else ABORT("Unknown GEMM type - '{}'", gemmType);
}
GemmType getGemmType() override { return gemmType_; }
// for CPU, sets the quantization range of weight matrices for inference.
// for GPU, there is no quantization, so this does nothing.
void setQuantizeRange(float range) override { quantizeRange_ = range; }
float getQuantizeRange() override { return quantizeRange_; }
};

} // namespace cpu
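
The setters implemented above presumably receive their values from the CLI options added in config_parser.cpp; the function below is a hypothetical piece of glue showing that wiring (configureBackendForInference is not a real Marian function).

// Hypothetical glue: forward the new inference options to the CPU backend.
// The base-class comments above describe these as CPU-only, hence the guard.
void configureBackendForInference(Ptr<ExpressionGraph> graph, Ptr<Options> options) {
  if(graph->getBackend()->getDeviceId().type == DeviceType::cpu) {
    graph->getBackend()->setOptimized(options->get<bool>("optimize"));
    graph->getBackend()->setGemmType(options->get<std::string>("gemm-type"));
    graph->getBackend()->setQuantizeRange(options->get<float>("quantize-range"));
  }
}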
64 changes: 45 additions & 19 deletions src/tensors/cpu/fbgemm/expanded_gemm.h
@@ -138,15 +138,18 @@ struct FbgemmPacked8PackNodeOp : public UnaryNodeOp {
int nrow_;
int ncol_;
uint64_t packsize_;
float quantizeRange_;

FbgemmPacked8PackNodeOp(Expr a,
PackMatrix packMat,
marian::Type packType,
bool transpose)
: UnaryNodeOp(a, newShape(a, transpose), Type::uint8),
bool transpose,
float quantizeRange)
: UnaryNodeOp(a, newShape(a, packType, transpose), Type::uint8),
packMat_(packMat),
packType_(packType),
transpose_(transpose) {
transpose_(transpose),
quantizeRange_(quantizeRange) {
if(packMat != PackMatrix::B)
ABORT("Only prepacking of B (weight matrix) is supported");
if(!memoize_)
@@ -161,7 +164,8 @@ struct FbgemmPacked8PackNodeOp : public UnaryNodeOp {
transpose_,
nrow_,
ncol_,
packsize_))
packsize_,
quantizeRange_))
};
#else // USE_FBGEMM
ABORT("FbgemmPacked8PackNodeOp can only be used with FBGEMM enabled.");
@@ -177,13 +181,19 @@ struct FbgemmPacked8PackNodeOp : public UnaryNodeOp {
const std::string type() override { return "packMatInt8"; }

#if USE_FBGEMM
Shape newShape(Expr a, bool transpose) {
fbgemmPacked8PackInfo(a->shape(), packType_, transpose, nrow_, ncol_, packsize_);
Shape newShape(Expr a, marian::Type packType, bool transpose) {
fbgemmPacked8PackInfo(
a->shape(),
packType,
transpose,
nrow_,
ncol_,
packsize_);
Shape outShape({(int)packsize_});
return outShape;
}
#else
Shape newShape(Expr /*a*/, bool /*transpose*/) {
Shape newShape(Expr /*a*/, marian::Type /*packType*/, bool /*transpose*/) {
ABORT("Packed GEMM requires a build with USE_FBGEMM enabled");
return Shape();
}
@@ -282,10 +292,17 @@ class FbgemmPacked8AffineNodeOp : public NaryNodeOp {
size_t k_;
bool transA_;
bool transB_;
Type elementType_;

public:
FbgemmPacked8AffineNodeOp(const std::vector<Expr>& nodes, Shape bShape, bool transA, bool transB, float /*scalar*/)
: NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32)/*, scalar_(scalar) */ {
FbgemmPacked8AffineNodeOp(Type elementType,
const std::vector<Expr>& nodes,
Shape bShape,
bool transA,
bool transB,
float /*scalar*/)
: NaryNodeOp(nodes, newShape(nodes[0], bShape, transA, transB), Type::float32),
elementType_(elementType) {
transA_ = transA;
transB_ = transB;
m_ = nodes[0]->shape().elements() / nodes[0]->shape()[-1];
@@ -324,7 +341,8 @@
#if USE_FBGEMM
// Do addBias only if it has a bias term
if (children().size() > 2) {
nodeOps = { NodeOp(fbgemmPacked8Gemm(val_,
nodeOps = { NodeOp(fbgemmPacked8Gemm(elementType_,
val_,
child(0)->val(),
child(1)->val(),
m_,
@@ -334,7 +352,8 @@
transB_);
marian::cpu::integer::AddBias(val_, child(2)->val())) };
} else {
nodeOps = { NodeOp(fbgemmPacked8Gemm(val_,
nodeOps = { NodeOp(fbgemmPacked8Gemm(elementType_,
val_,
child(0)->val(),
child(1)->val(),
m_,
@@ -358,39 +377,46 @@
const std::string type() override { return "gemmPacked8"; }
};

static inline Expr affine(Expr a, Expr b, Shape bShape, Expr c, bool transA, bool transB, float scalar) {
static inline Expr affine(Type elementType,
Expr a,
Expr b,
Shape bShape,
Expr c,
bool transA,
bool transB,
float scalar) {
std::vector<Expr> nodes = {a, b, c};
Type elementType = b->value_type();

if (elementType == Type::packed16)
return Expression<FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
else if (isPacked(elementType) && sizeOf(elementType) == 1)
return Expression<FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
return Expression<cpu::variant::FbgemmPacked8AffineNodeOp>(
elementType, nodes, bShape, transA, transB, scalar);
else {
ABORT("Only int8 and fp16 are available. {}", elementType);
return nullptr;
}
}

static inline Expr pack(Type elementType, Expr a, PackMatrix packMat, bool transpose) {
static inline Expr pack(Type elementType, Expr a, PackMatrix packMat, bool transpose, float quantizeRange = 0.f) {
if (elementType == Type::packed16)
return Expression<FbgemmPacked16PackNodeOp>(a, packMat, transpose);
else if (isPacked(elementType) && sizeOf(elementType) == 1)
return Expression<FbgemmPacked8PackNodeOp>(a, packMat, elementType, transpose);
return Expression<cpu::variant::FbgemmPacked8PackNodeOp>(a, packMat, elementType, transpose, quantizeRange);
else {
ABORT("Only int8 and fp16 are available. {}", elementType);
return nullptr;
}
}

static inline Expr dot(Expr a, Expr b, Shape bShape, bool transA, bool transB, float scalar) {
static inline Expr dot(Type elementType, Expr a, Expr b, Shape bShape, bool transA, bool transB, float scalar) {
std::vector<Expr> nodes = {a, b};
Type elementType = b->value_type();

if (elementType == Type::packed16)
return Expression<FbgemmPacked16AffineNodeOp>(nodes, bShape, transA, transB, scalar);
else if (isPacked(elementType) && sizeOf(elementType) == 1)
return Expression<FbgemmPacked8AffineNodeOp>(nodes, bShape, transA, transB, scalar);
return Expression<cpu::variant::FbgemmPacked8AffineNodeOp>(
elementType, nodes, bShape, transA, transB, scalar);
else {
ABORT("Only int8 and fp16 are available. {}", elementType);
return nullptr;
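
Summarizing the changed signatures in this header: callers now pass the packed element type explicitly to affine() and dot(), and pack() takes an optional quantization range. A rough sketch of the resulting call pattern, mirroring the dispatch in expression_operators.cpp above (W, x and bias are placeholder expressions):

// Hypothetical call pattern: pack the constant weight matrix once, choosing the
// packed type by CPU capability, then run the packed GEMM with a bias.
marian::Type packType = fbgemm::fbgemmHasAvx512Support() ? marian::Type::packed8avx512
                                                         : marian::Type::packed8avx2;
auto packedW = cpu::variant::pack(packType, W, cpu::variant::PackMatrix::B,
                                  /*transpose=*/false, /*quantizeRange=*/0.f);
auto y = cpu::variant::affine(packType, x, packedW, W->shape(), bias,
                              /*transA=*/false, /*transB=*/false, /*scalar=*/1.f);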