Add DynamicTensorQuantizer class and use it for gradient quantization in aqt_einsum

PiperOrigin-RevId: 569497504
Cerebra Catalyst Team authored and copybara-github committed Sep 29, 2023
1 parent db2bdaa commit 452ab61
Showing 5 changed files with 776 additions and 298 deletions.
64 changes: 22 additions & 42 deletions aqt/tensorflow/aqt_einsum.py
@@ -324,29 +324,6 @@ def get_einsum_transpose(eq: str, swap_ans: bool = False) -> str:
return '{},{}->{}'.format(out_dims, y_dims, x_dims)


def _maybe_random(
random_gen: Optional[tf.random.Generator],
shape: Iterable[int],
dtype: tf.dtypes.DType,
) -> Optional[tf.Tensor]:
"""Maybe generate random floats in [-0.5, 0.5] to perturb gradients."""
if random_gen is None:
return None
return random_gen.uniform(shape, -0.5, 0.5, dtype=dtype)


def _round(
x_quantizer: aqt_tensor.TensorQuantizer,
x: tf.Tensor,
random: Optional[tf.Tensor],
train: bool,
) -> tf.Tensor:
if random is None:
return x_quantizer._to_quant(x, train=train)
assert x.shape == random.shape, (x.shape, random.shape)
return x_quantizer._to_quant(x + random, train=train)


def einsum(
eq: str, #
lhs_quantizer: aqt_tensor.TensorQuantizer,
@@ -355,9 +332,8 @@ def einsum(
rhs: tf.Tensor,
train: bool = True,
quantize_bwd: bool = False,
lhs_grad_quantizer: Optional[aqt_tensor.TensorQuantizer] = None,
rhs_grad_quantizer: Optional[aqt_tensor.TensorQuantizer] = None,
random_gen: Optional[tf.random.Generator] = None,
lhs_grad_quantizer: Optional[aqt_tensor.DynamicTensorQuantizer] = None,
rhs_grad_quantizer: Optional[aqt_tensor.DynamicTensorQuantizer] = None,
**tf_einsum_kwargs,
) -> tf.Tensor:
"""Performs a quantized two-argument :py:func:`tf.einsum`.
@@ -382,9 +358,6 @@ def einsum(
the einsum equation, `grad,rhs->lhs_grad`, in the backward pass.
rhs_grad_quantizer: A `TensorQuantizer` for grad, which is used to quantize
the einsum equation, `grad,lhs->rhs_grad`, in the backward pass.
random_gen: A `tf.random.Generator` used to generate random numbers between
    [-0.5, 0.5] added to the gradients before quantization in the backward
pass.
**tf_einsum_kwargs: Keyword arguments to pass onto `einsum`.
Returns:
@@ -526,9 +499,17 @@ def bwd(grad: tf.Tensor) -> tf.Tensor:
lhs_scaled = lhs_scale * lhs
rhs_scaled = rhs_scale * rhs

if quantize_bwd:
# Stochastic rounding is necessary for gradient quantization. We
# call uniform() once and share it across both scaled gradients to
# avoid potential bottlenecks with random number generation.
random = tf.random.uniform(
tf.shape(grad), -0.5, 0.5, dtype=grad.dtype
)

def _bwd(
eq: str,
grad_quantizer: Optional[aqt_tensor.TensorQuantizer],
grad_quantizer: Optional[aqt_tensor.DynamicTensorQuantizer],
y_quantizer: aqt_tensor.TensorQuantizer,
grad: tf.Tensor,
qy: tf.Tensor,
@@ -552,19 +533,18 @@ def _bwd(
# We assume the backward-pass quantization is dynamic so no need
# to pass weight when updating stats but still need _last_update
# to switch tensor configs.
update = grad_quantizer.update(
grad,
weight=None,
event_count=lhs_quantizer._last_update,
grad_scale, grad_inv_scale = (
grad_quantizer._get_dynamic_quant_scale(
grad,
weight=None,
event_count=lhs_quantizer._last_update,
train=train,
)
)
with tf.control_dependencies([update]):
grad_scale, grad_inv_scale = grad_quantizer._get_quant_scale(
train
)
grad_scaled = grad_scale * grad
random = _maybe_random(random_gen, grad.shape, grad.dtype)
qgrad = _round(grad_quantizer, grad_scaled, random, train)
assert len(grad_inv_scale.shape) == len(qgrad.shape)
grad_scaled = grad_scale * grad
grad_scaled = grad_scaled + random
qgrad = grad_quantizer._to_quant(grad_scaled, train=train)
assert len(grad_inv_scale.shape) == len(qgrad.shape)
else:
qgrad = grad
grad_inv_scale = None
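
A minimal sketch, not part of the commit, condensing the gradient-quantization step added above into one standalone helper. It reuses only calls visible in the diff (`_get_dynamic_quant_scale`, `_to_quant`, `_last_update`) plus standard TensorFlow ops; the helper name `quantize_grad_sketch` is invented for illustration, and the downstream application of `grad_inv_scale` is omitted because the diff is truncated at that point.

import tensorflow.compat.v1 as tf

def quantize_grad_sketch(grad_quantizer, lhs_quantizer, grad, train=True):
  # Dynamic quantization: the scale is derived from `grad` itself, so no
  # weight variable is passed; `_last_update` only switches tensor configs.
  grad_scale, grad_inv_scale = grad_quantizer._get_dynamic_quant_scale(
      grad, weight=None, event_count=lhs_quantizer._last_update, train=train)
  # Stochastic rounding: one shared uniform draw in [-0.5, 0.5) makes the
  # rounded value an unbiased estimate of the scaled gradient.
  noise = tf.random.uniform(tf.shape(grad), -0.5, 0.5, dtype=grad.dtype)
  qgrad = grad_quantizer._to_quant(grad_scale * grad + noise, train=train)
  return qgrad, grad_inv_scale
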
50 changes: 16 additions & 34 deletions aqt/tensorflow/aqt_einsum_test.py
@@ -93,7 +93,6 @@ def _einsum_op(
quantize_bwd: bool = False,
lhs_bwd_config: Optional[aqt_config.AqtScheduleConfig] = None,
rhs_bwd_config: Optional[aqt_config.AqtScheduleConfig] = None,
random_noise_seed: Optional[int] = 1234,
**einsum_kwargs,
) -> tf.Tensor:
"""Updates quantizers at event_count=0 and computes einsum."""
@@ -105,17 +104,13 @@ def _einsum_op(
lhs_bwd_tq, rhs_bwd_tq = None, None
grad_shape = aqt_einsum.get_out_shape(eq, lhs.shape, rhs.shape)
if lhs_bwd_config:
lhs_bwd_tq = aqt_tensor.TensorQuantizer(
lhs_bwd_tq = aqt_tensor.DynamicTensorQuantizer(
grad_shape, lhs_bwd_config, name="lhs_bwd"
)
if rhs_bwd_config:
rhs_bwd_tq = aqt_tensor.TensorQuantizer(
rhs_bwd_tq = aqt_tensor.DynamicTensorQuantizer(
grad_shape, rhs_bwd_config, name="rhs_bwd"
)
if quantize_bwd and random_noise_seed is not None:
random_gen = tf.random.Generator.from_seed(random_noise_seed)
else:
random_gen = None

event_count = tf.constant(0, tf.int64)
updates = [
@@ -133,7 +128,6 @@ def _einsum_op(
quantize_bwd,
lhs_bwd_tq,
rhs_bwd_tq,
random_gen=random_gen,
**einsum_kwargs,
)

@@ -623,6 +617,9 @@ def _get_grad_config(eq: str,
bwd_eq = aqt_einsum.get_einsum_transpose(eq, swap_ans=swap_ans)
# 16 bits to preserve gradients
grad_config, _ = _exact_schedule_config(16, bwd_eq, 1.0)
grad_config.use_quantized_variable = False
for tc in grad_config.tensor_configs:
tc.freeze_scale_at_begin = False
return grad_config

lhs_bwd_config = _get_grad_config(eq, False)
@@ -721,16 +718,14 @@ def test_vars_over_inputs_at_inference(self, eq, quantize_bwd):
rhs_tq = aqt_tensor.TensorQuantizer(rhs.shape, rhs_config, name="rhs")
if quantize_bwd:
grad_shape = aqt_einsum.get_out_shape(eq, lhs.shape, rhs.shape)
lhs_bwd_tq = aqt_tensor.TensorQuantizer(
lhs_bwd_tq = aqt_tensor.DynamicTensorQuantizer(
grad_shape, lhs_bwd_config, name="lhs_bwd"
)
rhs_bwd_tq = aqt_tensor.TensorQuantizer(
rhs_bwd_tq = aqt_tensor.DynamicTensorQuantizer(
grad_shape, rhs_bwd_config, name="rhs_bwd"
)
random_gen = tf.random.Generator.from_seed(1234)
else:
lhs_bwd_tq = rhs_bwd_tq = None
random_gen = None

# Update at least once to initialize scale, then grab the expected
# value while in training mode.
@@ -750,7 +745,6 @@ def test_vars_over_inputs_at_inference(self, eq, quantize_bwd):
quantize_bwd=quantize_bwd,
lhs_grad_quantizer=lhs_bwd_tq,
rhs_grad_quantizer=rhs_bwd_tq,
random_gen=random_gen,
)

with self.cached_session() as sess, sess.as_default():
@@ -767,7 +761,6 @@ def test_vars_over_inputs_at_inference(self, eq, quantize_bwd):
quantize_bwd=quantize_bwd,
lhs_grad_quantizer=lhs_bwd_tq,
rhs_grad_quantizer=rhs_bwd_tq,
random_gen=random_gen,
)

self.assertAllEqual(actual, expected)
@@ -792,16 +785,14 @@ def test_float_config_not_save_quantized_var(self, eq, quantize_bwd):
rhs_tq = aqt_tensor.TensorQuantizer(rhs.shape, rhs_config, name="rhs")
if quantize_bwd:
grad_shape = aqt_einsum.get_out_shape(eq, lhs.shape, rhs.shape)
lhs_bwd_tq = aqt_tensor.TensorQuantizer(
lhs_bwd_tq = aqt_tensor.DynamicTensorQuantizer(
grad_shape, lhs_bwd_config, name="lhs_bwd"
)
rhs_bwd_tq = aqt_tensor.TensorQuantizer(
rhs_bwd_tq = aqt_tensor.DynamicTensorQuantizer(
grad_shape, rhs_bwd_config, name="rhs_bwd"
)
random_gen = tf.random.Generator.from_seed(1234)
else:
lhs_bwd_tq = rhs_bwd_tq = None
random_gen = None

event_count = tf.constant(0, tf.int64)

@@ -820,7 +811,6 @@ def test_float_config_not_save_quantized_var(self, eq, quantize_bwd):
quantize_bwd=quantize_bwd,
lhs_grad_quantizer=lhs_bwd_tq,
rhs_grad_quantizer=rhs_bwd_tq,
random_gen=random_gen,
)
# Although the input tensors are non-zeros, the result of einsum with
# inference mode should be zeros because lhs uses zero-initialized
@@ -875,7 +865,6 @@ def test_exact_grads(self, eq, quantize_bwd):
)
)

random_noise_seed = 1234 if quantize_bwd else None
actual_fwd = _einsum_op(
eq,
lhs,
Expand All @@ -885,7 +874,6 @@ def test_exact_grads(self, eq, quantize_bwd):
quantize_bwd=quantize_bwd,
lhs_bwd_config=lhs_bwd_config,
rhs_bwd_config=rhs_bwd_config,
random_noise_seed=random_noise_seed,
)
expected_fwd = tf.einsum(eq, lhs, rhs)

@@ -980,7 +968,7 @@ def test_consistent_bwd_improves_grads(self, eq):
eq, quantize_bwd=True, dynamic_bwd_quant=True,
)
)
def get_perturbed_gradients(random_noise_seed):
def get_perturbed_gradients(step_i):
actual_fwd = _einsum_op(
eq,
lhs,
@@ -990,17 +978,13 @@ def get_perturbed_gradients(random_noise_seed):
quantize_bwd=True,
lhs_bwd_config=lhs_bwd_config,
rhs_bwd_config=rhs_bwd_config,
random_noise_seed=random_noise_seed,
varscope_name=f"einsum_seed_{random_noise_seed}",
varscope_name=f"einsum_seed_{step_i}",
)
return tf.gradients([actual_fwd], [lhs, rhs])

exact_fwd = tf.einsum(eq, lhs, rhs)
exact = tf.gradients([exact_fwd], [lhs, rhs])

biased = get_perturbed_gradients(None)
biased_errors = [tf.linalg.norm(i - j) for i, j in zip(biased, exact)]

num_samples = 8
qgrad_samples = [get_perturbed_gradients(i) for i in range(num_samples)]
estimate1 = qgrad_samples[0]
@@ -1015,13 +999,11 @@ def get_error(estimate):
with self.cached_session() as sess, sess.as_default():
tf.global_variables_initializer().run()

for biased_g, exact_g, sample_error, ensemble_err, biased_err in zip(
biased, exact, sample_errors, ensemble_errors, biased_errors
):
# Check dynamic backward quant is inexact
self.assertNotAllEqual(biased_g, exact_g)
# unbiased estimate should have smaller errors than the biased one
self.assertAllLess(ensemble_err, biased_err)
for estimate1_g, exact_g, sample_error, ensemble_err in zip(
estimate1, exact, sample_errors, ensemble_errors
):
# Check dynamic backward quant should be close
self.assertAllClose(estimate1_g, exact_g, rtol=1e-2)
# the unbiased estimate should eventually converge or make improvement
self.assertAllLess(ensemble_err, sample_error)

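
The ensemble-error assertions above rest on stochastic rounding being unbiased: for noise u drawn uniformly from [-0.5, 0.5), E[round(x + u)] = x, so averaging independently quantized gradient samples approaches the exact gradient. A tiny standalone check of that property, not taken from the commit (the array values are arbitrary):

import numpy as np

rng = np.random.default_rng(0)
x = np.array([0.3, 1.7, -2.2])  # stand-in for a scaled gradient
samples = [np.round(x + rng.uniform(-0.5, 0.5, x.shape)) for _ in range(10_000)]
print(np.mean(samples, axis=0))  # close to [0.3, 1.7, -2.2]; a single sample is not
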