From 133e49d04e5f870b2bfcf5776e9bc43c8da8b829 Mon Sep 17 00:00:00 2001
From: Animesh Jain
Date: Thu, 5 Sep 2019 18:45:57 +0000
Subject: [PATCH] [QNN][TFLite] Parsing TFLite quantized models.

---
 python/tvm/relay/frontend/tflite.py          | 175 +++++++++++++++++--
 tests/python/frontend/tflite/test_forward.py |  45 +++++
 2 files changed, 204 insertions(+), 16 deletions(-)

diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py
index a519c6fd8b44..237c9c10d38f 100644
--- a/python/tvm/relay/frontend/tflite.py
+++ b/python/tvm/relay/frontend/tflite.py
@@ -24,6 +24,7 @@
 from .. import expr as _expr
 from .. import module as _module
 from .. import op as _op
+from .. import qnn as _qnn
 from ... import nd as _nd
 from .common import ExprTable
 from .common import infer_shape as _infer_shape
@@ -32,10 +33,11 @@
 class TensorWrapper(object):
     """Tensor wrapper for TFLite Tensor"""
 
-    def __init__(self, tensor_idx, tensor, buffer):
+    def __init__(self, tensor_idx, tensor, buffer, qnn_params=None):
         self.tensor_idx = tensor_idx
         self.tensor = tensor
         self.buffer = buffer
+        self.qnn_params = qnn_params
 
 class OperatorConverter(object):
     """Operator Converted for converting TFLite ops to Relay ops"""
@@ -160,7 +162,19 @@ def get_tensors(self, tensors_idx_list):
             tensor = self.subgraph.Tensors(tensor_idx)
             buffer_idx = tensor.Buffer()
             buffer = self.model.Buffers(buffer_idx)
-            return_list.append(TensorWrapper(tensor_idx, tensor, buffer))
+
+            # Check if the tensors are quantized. Parse if yes.
+            qnn_params = None
+            tflite_qnn_params = tensor.Quantization()
+            if tflite_qnn_params is not None:
+                scale = float(tflite_qnn_params.ScaleAsNumpy())
+                zero_point = int(tflite_qnn_params.ZeroPointAsNumpy())
+                # Check that the scale and zero points are valid.
+                if scale != 0 or zero_point != 0:
+                    qnn_params = dict()
+                    qnn_params['scale'] = scale
+                    qnn_params['zero_point'] = zero_point
+            return_list.append(TensorWrapper(tensor_idx, tensor, buffer, qnn_params))
         return return_list
 
     def get_tensor_value(self, tensor_wrapper):
@@ -200,6 +214,10 @@ def get_tensor_type_str(self, tensor_type):
         raise NotImplementedError("Tensor type {} is currently not supported"
                                   .format(str(tensor_type)))
 
+    def has_same_qnn_params(self, lhs_tensor, rhs_tensor):
+        return lhs_tensor.qnn_params['scale'] == rhs_tensor.qnn_params['scale'] and \
+               lhs_tensor.qnn_params['zero_point'] == rhs_tensor.qnn_params['zero_point']
+
     def convert_conv2d(self, op):
         """Convert TFLite conv2d"""
         return self.convert_conv(op, "conv2d")
@@ -238,8 +256,15 @@ def convert_reshape(self, op):
             target_shape = reshape_options.NewShapeAsNumpy()
 
         in_expr = self.get_expr(input_tensor_idx)
-        out = _op.reshape(in_expr, newshape=tuple(target_shape))
 
+        # If the tensors are quantized, ensure that the input/output qnn params are the same.
+        if input_tensor.qnn_params:
+            output_tensors = self.get_output_tensors(op)
+            assert len(output_tensors) == 1, "There should be only 1 output tensor"
+            output_tensor = output_tensors[0]
+            assert self.has_same_qnn_params(input_tensor, output_tensor), \
+                "TFLite reshape requires input and output scale and zero points to be equal"
+        out = _op.reshape(in_expr, newshape=tuple(target_shape))
         return out
 
     def _convert_resize(self, method, op):
@@ -324,10 +349,33 @@ def convert_softmax(self, op):
 
         input_tensor = input_tensors[0]
         input_tensor_idx = input_tensor.tensor_idx
+
+        output_tensors = self.get_output_tensors(op)
+        assert len(output_tensors) == 1, "output tensors length should be 1"
+        output_tensor = output_tensors[0]
+        output_tensor_type = output_tensor.tensor.Type()
+        output_tensor_type_str = self.get_tensor_type_str(output_tensor_type)
+
         params = {'axis': 1}  # 1 is channel
         in_expr = self.get_expr(input_tensor_idx)
+
+        # TODO - A naive int8 softmax implementation leads to bad accuracy. Currently, we
+        # dequantize to FP32 and perform the softmax on FP32. We can investigate an
+        # integer-only softmax implementation in the future.
+        if input_tensor.qnn_params:
+            in_expr = _qnn.op.dequantize(data=in_expr,
+                                         input_scale=input_tensor.qnn_params['scale'],
+                                         input_zero_point=input_tensor.qnn_params['zero_point'])
+
         out = _op.nn.softmax(in_expr, **params)
+
+        # Go back to the integer datatype if the original operator was quantized.
+        if output_tensor.qnn_params:
+            out = _qnn.op.quantize(data=out,
+                                   output_scale=output_tensor.qnn_params['scale'],
+                                   output_zero_point=output_tensor.qnn_params['zero_point'],
+                                   out_dtype=output_tensor_type_str)
+
         return out
 
     def convert_tanh(self, op):
@@ -380,7 +428,8 @@ def convert_concatenation(self, op):
         in_exprs = [self.get_expr(input_tensor.tensor_idx) for input_tensor in input_tensors]
 
         output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "output tensors should be 1"
+        assert len(output_tensors) == 1, "output tensors length should be 1"
+        output_tensor = output_tensors[0]
 
         assert op.BuiltinOptionsType() == BuiltinOptions.ConcatenationOptions
         op_options = op.BuiltinOptions()
@@ -389,12 +438,27 @@ def convert_concatenation(self, op):
         concatenation_axis = concatenation_options.Axis()
         fused_activation_fn = concatenation_options.FusedActivationFunction()
 
-        # with axis in N H W C
-        out = _op.concatenate(in_exprs, axis=concatenation_axis)
+        if not input_tensors[0].qnn_params:
+            out = _op.concatenate(in_exprs, axis=concatenation_axis)
+        else:
+            input_scales = [input_tensor.qnn_params['scale'] for input_tensor in input_tensors]
+            input_zero_points = \
+                [input_tensor.qnn_params['zero_point'] for input_tensor in input_tensors]
+            out = _qnn.op.concatenate(in_exprs,
+                                      input_scales=input_scales,
+                                      input_zero_points=input_zero_points,
+                                      output_scale=output_tensor.qnn_params['scale'],
+                                      output_zero_point=output_tensor.qnn_params['zero_point'],
+                                      axis=concatenation_axis)
 
         # if we have activation fn
         if fused_activation_fn != ActivationFunctionType.NONE:
-            out = self.convert_fused_activation_function(out, fused_activation_fn)
+            if not output_tensor.qnn_params:
+                out = self.convert_fused_activation_function(out, fused_activation_fn)
+            else:
+                raise tvm.error.OpNotImplemented(
+                    'Operator {} with fused activation is not supported yet.'
+                    .format('qnn.op.concatenate'))
         return out
 
     def _convert_elemwise(self, relay_op, op):
@@ -557,6 +621,12 @@ def convert_fully_connected(self, op):
         input_tensor_idx = input_tensor.tensor_idx
         weight_tensor = input_tensors[1]
 
+        output_tensors = self.get_output_tensors(op)
+        assert len(output_tensors) == 1, "output tensors length should be 1"
+        output_tensor = output_tensors[0]
+        output_tensor_type = output_tensor.tensor.Type()
+        output_tensor_type_str = self.get_tensor_type_str(output_tensor_type)
+
         input_tensor_shape = input_tensor.tensor.ShapeAsNumpy()
         weight_tensor_shape = weight_tensor.tensor.ShapeAsNumpy()
 
@@ -584,7 +654,13 @@ def convert_fully_connected(self, op):
         weight_value = self.get_tensor_value(weight_tensor)
         weight_expr = self.exp_tab.new_const(weight_value, dtype=weight_tensor_type_str)
 
-        out = _op.nn.dense(in_expr, weight_expr)
+        if input_tensor.qnn_params:
+            out = _qnn.op.dense(in_expr, weight_expr,
+                                input_zero_point=input_tensor.qnn_params['zero_point'],
+                                kernel_zero_point=weight_tensor.qnn_params['zero_point'],
+                                out_dtype='int32')
+        else:
+            out = _op.nn.dense(in_expr, weight_expr)
 
         # if we have bias
         if len(input_tensors) == 3:
@@ -599,7 +675,23 @@ def convert_fully_connected(self, op):
 
         # If we have fused activations
         if fused_activation_fn != ActivationFunctionType.NONE:
-            out = self.convert_fused_activation_function(out, fused_activation_fn)
+            if not output_tensor.qnn_params:
+                out = self.convert_fused_activation_function(out, fused_activation_fn)
+            else:
+                raise tvm.error.OpNotImplemented(
+                    'Operator {} with fused activation is not supported yet.'
+                    .format('qnn.op.dense'))
+
+        # Finally, if the dense is quantized, add a requantize at the end.
+        if output_tensor.qnn_params:
+            input_scale = input_tensor.qnn_params['scale'] * weight_tensor.qnn_params['scale']
+            input_zero_point = 0
+            out = _qnn.op.requantize(out,
+                                     input_scale=input_scale,
+                                     input_zero_point=input_zero_point,
+                                     output_scale=output_tensor.qnn_params['scale'],
+                                     output_zero_point=output_tensor.qnn_params['zero_point'],
+                                     out_dtype=output_tensor_type_str)
 
         return out
 
@@ -671,6 +763,12 @@ def convert_conv(self, op, conv_type):
         input_tensor_idx = input_tensor.tensor_idx
         weight_tensor = input_tensors[1]
 
+        output_tensors = self.get_output_tensors(op)
+        assert len(output_tensors) == 1, "output tensors length should be 1"
+        output_tensor = output_tensors[0]
+        output_tensor_type = output_tensor.tensor.Type()
+        output_tensor_type_str = self.get_tensor_type_str(output_tensor_type)
+
         is_depthwise_conv = False
         if conv_type == 'conv2d':
             assert op.BuiltinOptionsType() == BuiltinOptions.Conv2DOptions
@@ -758,7 +856,14 @@ def convert_conv(self, op, conv_type):
             raise tvm.error.OpAttributeUnImplemented(
                 'Padding format {} is not supported for operator Conv.'.format(padding))
 
-        out = _op.nn.conv2d(data=in_expr, weight=weight_expr, **params)
+        if input_tensor.qnn_params:
+            qnn_conv2d_params = dict(params)
+            qnn_conv2d_params['input_zero_point'] = input_tensor.qnn_params['zero_point']
+            qnn_conv2d_params['kernel_zero_point'] = weight_tensor.qnn_params['zero_point']
+            qnn_conv2d_params['out_dtype'] = 'int32'
+            out = _qnn.op.conv2d(in_expr, weight_expr, **qnn_conv2d_params)
+        else:
+            out = _op.nn.conv2d(in_expr, weight_expr, **params)
 
         # if we have bias
         if len(input_tensors) == 3:
@@ -774,7 +879,23 @@ def convert_conv(self, op, conv_type):
 
         # If we have fused activations
        if fused_activation_fn != ActivationFunctionType.NONE:
-            out = self.convert_fused_activation_function(out, fused_activation_fn)
+            if not output_tensor.qnn_params:
+                out = self.convert_fused_activation_function(out, fused_activation_fn)
+            else:
+                raise tvm.error.OpNotImplemented(
+                    'Operator {} with fused activation is not supported yet.'
+                    .format('qnn.op.conv2d'))
+
+        # Finally, if the conv is quantized, add a requantize at the end.
+        if output_tensor.qnn_params:
+            input_scale = input_tensor.qnn_params['scale'] * weight_tensor.qnn_params['scale']
+            input_zero_point = 0
+            out = _qnn.op.requantize(out,
+                                     input_scale=input_scale,
+                                     input_zero_point=input_zero_point,
+                                     output_scale=output_tensor.qnn_params['scale'],
+                                     output_zero_point=output_tensor.qnn_params['zero_point'],
+                                     out_dtype=output_tensor_type_str)
 
         return out
 
@@ -879,6 +1000,12 @@ def convert_pool2d(self, op, pool_type):
         input_tensor = input_tensors[0]
         input_tensor_idx = input_tensor.tensor_idx
 
+        output_tensors = self.get_output_tensors(op)
+        assert len(output_tensors) == 1, "output tensors length should be 1"
+        output_tensor = output_tensors[0]
+        output_tensor_type = output_tensor.tensor.Type()
+        output_tensor_type_str = self.get_tensor_type_str(output_tensor_type)
+
         assert op.BuiltinOptionsType() == BuiltinOptions.Pool2DOptions
         op_options = op.BuiltinOptions()
         pool2d_options = Pool2DOptions()
@@ -909,8 +1036,19 @@ def convert_pool2d(self, op, pool_type):
                 'Padding format {} for operator Pool2D is not supported.'.format(padding))
 
         if pool_type == "average":
-            out = _op.nn.avg_pool2d(in_expr, **params)
+            if input_tensor.qnn_params:
+                assert self.has_same_qnn_params(input_tensor, output_tensor), \
+                    'TFLite avg_pool2d requires input and output scale ' \
+                    'and zero points to be equal'
+                out = _op.cast(in_expr, dtype="int32")
+                out = _op.nn.avg_pool2d(out, **params)
+                out = _op.cast(out, dtype=output_tensor_type_str)
+            else:
+                out = _op.nn.avg_pool2d(in_expr, **params)
         elif pool_type == "max":
+            if input_tensor.qnn_params:
+                assert self.has_same_qnn_params(input_tensor, output_tensor), \
+                    "qnn.op.max_pool2d requires input and output qnn params to be the same"
             out = _op.nn.max_pool2d(in_expr, **params)
         else:
             raise tvm.error.OpNotImplemented(
@@ -918,8 +1056,12 @@ def convert_pool2d(self, op, pool_type):
 
         # If we have fused activations
         if fused_activation_fn != ActivationFunctionType.NONE:
-            out = self.convert_fused_activation_function(out, fused_activation_fn)
-
+            if input_tensor.qnn_params:
+                raise tvm.error.OpNotImplemented(
+                    'Operator {} with fused activation is not supported yet.'
+                    .format('qnn.op.pool2d'))
+            else:
+                out = self.convert_fused_activation_function(out, fused_activation_fn)
         return out
 
     def convert_pad(self, op):
@@ -962,7 +1104,7 @@ def convert_pack(self, op):
         in_exprs = [self.get_expr(input_tensor.tensor_idx) for input_tensor in input_tensors]
 
         output_tensors = self.get_output_tensors(op)
-        assert len(output_tensors) == 1, "output tensors should be 1"
+        assert len(output_tensors) == 1, "output tensors length should be 1"
 
         assert op.BuiltinOptionsType() == BuiltinOptions.PackOptions
         op_options = op.BuiltinOptions()
@@ -1210,4 +1352,5 @@ def from_tflite(model, shape_dict, dtype_dict):
     outputs = [exp_tab.get_expr(get_tensor_name(subgraph, i)) for i in model_outputs]
     outputs = outputs[0] if len(outputs) == 1 else _expr.Tuple(outputs)
     func = _expr.Function(analysis.free_vars(outputs), outputs)
-    return _module.Module.from_expr(func), params
+    mod = _module.Module.from_expr(func)
+    return mod, params
diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py
index 670e85ba8384..1b2e8afca979 100644
--- a/tests/python/frontend/tflite/test_forward.py
+++ b/tests/python/frontend/tflite/test_forward.py
@@ -979,6 +979,46 @@ def test_forward_inception_v4_net():
     tvm.testing.assert_allclose(np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]),
                                 rtol=1e-5, atol=1e-5)
 
+def test_forward_qnn_inception_v1_net():
+    """Test the Quantized TFLite Inception model."""
+    # InceptionV1
+    tflite_model_file = tf_testing.get_workload_official(
+        "https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_224_quant_20181026.tgz",
+        "inception_v1_224_quant.tflite")
+    with open(tflite_model_file, "rb") as f:
+        tflite_model_buf = f.read()
+    # Checking the labels because the requantize implementation is different between TFLite and
+    # Relay. This causes the final output numbers to mismatch. So, we test accuracy via labels.
+    np.random.seed(0)
+    data = np.random.random_integers(low=0, high=128, size=(1, 224, 224, 3)).astype('uint8')
+    tflite_output = run_tflite_graph(tflite_model_buf, data)
+    tflite_predictions = np.squeeze(tflite_output)
+    tflite_sorted_labels = tflite_predictions.argsort()[-3:][::-1]
+    tvm_output = run_tvm_graph(tflite_model_buf, data, 'input')
+    tvm_predictions = np.squeeze(tvm_output)
+    tvm_sorted_labels = tvm_predictions.argsort()[-3:][::-1]
+    tvm.testing.assert_allclose(tvm_sorted_labels, tflite_sorted_labels)
+
+def test_forward_qnn_mobilenet_v1_net():
+    """Test the Quantized TFLite Mobilenet V1 model."""
+    # MobilenetV1
+    tflite_model_file = tf_testing.get_workload_official(
+        "https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz",
+        "mobilenet_v1_1.0_224_quant.tflite")
+    with open(tflite_model_file, "rb") as f:
+        tflite_model_buf = f.read()
+    # Checking the labels because the requantize implementation is different between TFLite and
+    # Relay. This causes the final output numbers to mismatch. So, we test accuracy via labels.
+ np.random.seed(0) + data = np.random.random_integers(low=0, high=128, size=(1, 224, 224, 3)).astype('uint8') + tflite_output = run_tflite_graph(tflite_model_buf, data) + tflite_predictions = np.squeeze(tflite_output) + tflite_sorted_labels = tflite_predictions.argsort()[-3:][::-1] + tvm_output = run_tvm_graph(tflite_model_buf, data, 'input') + tvm_predictions = np.squeeze(tvm_output) + tvm_sorted_labels = tvm_predictions.argsort()[-3:][::-1] + tvm.testing.assert_allclose(tvm_sorted_labels, tflite_sorted_labels) + ####################################################################### # SSD Mobilenet # ------------- @@ -1048,3 +1088,8 @@ def test_forward_ssd_mobilenet_v1(): test_forward_inception_v3_net() test_forward_inception_v4_net() test_forward_ssd_mobilenet_v1() + + # End to End quantized + # TODO - MobilenetV2 fails for now. Remove when fixed. + test_forward_qnn_inception_v1_net() + test_forward_qnn_mobilenet_v1_net()
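
Reviewer note (not part of the patch): the sketch below shows how a pre-quantized TFLite model can be fed through the updated frontend, which now returns a Relay module containing qnn ops (qnn.conv2d, qnn.dense, qnn.concatenate, qnn.requantize) followed by a regular Relay build. The model file name and the input tensor name 'input' are placeholder assumptions; this mirrors what run_tvm_graph does in the tests rather than adding any new API.

    # Illustrative usage sketch. Assumes the 'tflite' Python package is installed and a
    # quantized .tflite file is available locally; file and tensor names are placeholders.
    import tflite.Model
    from tvm import relay

    with open("mobilenet_v1_1.0_224_quant.tflite", "rb") as f:
        tflite_model_buf = f.read()
    tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)

    # Quantized TFLite graphs take uint8 inputs; the frontend emits qnn.* ops internally
    # and the returned module can be compiled like any other Relay module.
    mod, params = relay.frontend.from_tflite(tflite_model,
                                             shape_dict={"input": (1, 224, 224, 3)},
                                             dtype_dict={"input": "uint8"})

    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(mod, target="llvm", params=params)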