[ TEST ] Add more unit tests and fixes for mixed precision
This PR adds more unit tests and fixes for mixed precision.
. Model unit tests
  . A two-FC-layer model that generates NaN or Inf gradients, verified
  against Torch.
  . MSE loss, checking the whole procedure of mixed precision training
  (a standalone sketch of the loss-scaling policy follows this list).
  . Even though the FC model has only one weight, it is sufficient to
  validate mixed precision.
  . The Torch model also works in a way similar to NNTrainer.
  . Fixes to the execution order of applying gradients when mixed
  precision is enabled.
  . Update SGD to support mixed precision training.
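
The mechanism these tests exercise is dynamic loss scaling. Below is a minimal, standalone C++ sketch of that policy, assuming the defaults mentioned in the diffs in this commit (backoff factor 0.5, growth factor 2.0, growth interval 2000); the `LossScaler` class and its method names are illustrative, not NNTrainer's API.

```cpp
// Minimal standalone sketch of a dynamic loss-scaling policy, assuming the
// defaults referenced in this commit: backoff factor 0.5, growth factor 2.0,
// growth interval 2000. Illustrative only, not NNTrainer code.
#include <algorithm>
#include <cstdio>

class LossScaler {
public:
  explicit LossScaler(float initial_scale = 65536.0f) : scale_(initial_scale) {}

  // Call after each backward pass with whether any gradient was NaN/Inf.
  // Returns true if the step may be applied, false if it must be retried.
  bool update(bool found_nan_or_inf) {
    if (found_nan_or_inf) {
      // Back off: halve the scale (never below 1.0) and skip this step.
      scale_ = std::max(scale_ * 0.5f, 1.0f);
      good_steps_ = 0;
      return false;
    }
    // Grow the scale after a long run of finite gradients.
    if (++good_steps_ >= 2000) {
      scale_ *= 2.0f;
      good_steps_ = 0;
    }
    return true;
  }

  float scale() const { return scale_; }

private:
  float scale_;
  int good_steps_ = 0;
};

int main() {
  LossScaler scaler;
  // Pretend the first backward pass overflowed and the second one did not.
  std::printf("apply? %d scale=%.1f\n", scaler.update(true), scaler.scale());
  std::printf("apply? %d scale=%.1f\n", scaler.update(false), scaler.scale());
  return 0;
}
```

The same back-off and growth arithmetic appears in `NetworkGraph::backwarding` in the diff below.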

**Changes proposed in this PR:**
- Added TOC generator for README.md

Resolves:

**Self evaluation:**
1. Build test:	 [X]Passed [ ]Failed [ ]Skipped
2. Run test:	 [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: jijoong.moon <[email protected]>
jijoongmoon committed May 24, 2024
1 parent 6c29394 commit f95f152
Showing 13 changed files with 398 additions and 273 deletions.
1 change: 1 addition & 0 deletions debian/nntrainer-dev.install
@@ -16,6 +16,7 @@
/usr/include/nntrainer/blas_interface.h
/usr/include/nntrainer/var_grad.h
/usr/include/nntrainer/weight.h
/usr/include/nntrainer/blas_avx.h
# todo: update dataset headers
/usr/include/nntrainer/databuffer.h
/usr/include/nntrainer/databuffer_factory.h
25 changes: 19 additions & 6 deletions nntrainer/graph/network_graph.cpp
@@ -426,24 +426,34 @@ bool NetworkGraph::backwarding(
PROFILE_TIME_END(profile_keys.at(ln->getType()));

if (!is_valid) {
std::cout << "Gradient has NaN" << std::endl;
std::cout << ln->getName() << " : Gradient has NaN --> "
<< ln->getRunContext().getLossScale() << std::endl;
break;
}
}

if (!is_valid) {
/** if has NaN
* 1. reset the loss scale.
* 1. reset the loss scale. : @todo Backoff_factor : default --> 0.5
* 2. run forwarding from cur_iter to cend() && !stop_cb(userdata);
* 3. return false --> run backwarding again;
*/
float scale = (*iter_)->getRunContext().getLossScale();
float s = scale > 1.5f ? scale - 0.5f : 1.0f;

NNTR_THROW_IF(scale == 1.0f, std::invalid_argument)
<< "Loss Scale Factor is 1.0f";

float s = scale > 1.5f ? scale * 0.5f : 1.0f;

resetLossScale(s);

auto f_iter = cbegin() + graph.getSortedNodeIdx((*iter_)->getName());

for (auto iter = f_iter; iter != cend() && !stop_cb(userdata); iter++) {
auto &ln = *iter;
ln->needsOutputSetZero(true);
}

for (auto iter = f_iter; iter != cend() && !stop_cb(userdata); iter++) {
auto &ln = *iter;
PROFILE_TIME_START(profile_keys.at(ln->getType()));
@@ -479,9 +489,12 @@
}
nan_count++;

if (nan_count > 10) {
/** @todo : handle as property : growth_interval : default --> 2000 */

if (nan_count > 2000) {
float scale = (*iter_)->getRunContext().getLossScale();
float s = scale + 2.0f;
/** @todo growth_factor : default --> 2.0 */
float s = scale * 2.0f;
resetLossScale(s);
nan_count = 0;
}
@@ -1251,7 +1264,7 @@ int NetworkGraph::initialize(ExecutionMode mode,
*/
if (tensor_manager->isLastAccess(rc.getWeightGrad(i).getName(),
last_grad_access) ||
(rc.isGradientClipByGlobalNorm(i) &&
((rc.isGradientClipByGlobalNorm(i) || rc.isMixedPrecision(i)) &&
tensor_manager->isSecondLastAccess(rc.getWeightGrad(i).getName(),
last_grad_access))) {
rc.getWeightObject(i).setAsGradientLastAccess();
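
To make the retry path in `NetworkGraph::backwarding` easier to follow, here is a small self-contained simulation of the same control flow: when a non-finite gradient is detected, the loss scale is backed off, the affected nodes are marked so their outputs are zeroed on the next forwarding, forwarding is re-run, and backwarding is retried. All names and numeric values below are stand-ins, not NNTrainer classes.

```cpp
// Self-contained toy simulation of the retry flow in NetworkGraph::backwarding.
// Node, backward_is_finite, and the numbers are stand-ins used only to show
// the order of operations.
#include <cstdio>
#include <vector>

struct Node {
  float output = 0.0f;
  bool needs_output_set_zero = false;
};

// Pretend that backwarding only produces finite gradients once the loss
// scale has been reduced to 256 or below.
bool backward_is_finite(float loss_scale) { return loss_scale <= 256.0f; }

int main() {
  std::vector<Node> nodes(3);
  float scale = 1024.0f;

  while (!backward_is_finite(scale)) {
    // 1. Back off the loss scale (backoff factor 0.5, floored at 1.0).
    scale = scale > 1.5f ? scale * 0.5f : 1.0f;

    // 2. Mark the nodes so their cached outputs are cleared before reuse.
    for (auto &n : nodes)
      n.needs_output_set_zero = true;

    // 3. Re-run forwarding; each node drops stale values before recomputing.
    for (auto &n : nodes) {
      if (n.needs_output_set_zero) {
        n.output = 0.0f; // discard possibly non-finite cached activations
        n.needs_output_set_zero = false;
      }
      n.output = 1.0f; // stand-in for the real forward computation
    }
    std::printf("retrying backwarding with loss scale %.1f\n", scale);
  }
  std::printf("gradients finite at loss scale %.1f\n", scale);
  return 0;
}
```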
17 changes: 16 additions & 1 deletion nntrainer/layers/layer_node.cpp
@@ -180,6 +180,7 @@ LayerNode::LayerNode(std::unique_ptr<nntrainer::Layer> &&l) :
inplace(InPlace::NONE),
needs_calc_derivative(false),
needs_calc_gradient(false),

output_connections(),
run_context(nullptr),
layer_node_props(
@@ -190,7 +191,8 @@ LayerNode::LayerNode(std::unique_ptr<nntrainer::Layer> &&l) :
new RealizationPropsType(props::Flatten(), props::Activation())),
loss(new props::Loss()),
regularization_loss(0.0f),
exec_order({0, 0, 0, 0}) {
exec_order({0, 0, 0, 0}),
needs_output_set_zero(false) {
if (layer && layer->getType() == TimeDistLayer::type) {
std::get<props::Distribute>(*layer_node_props).set(true);
}
@@ -751,8 +753,21 @@ LayerNode::refinalize(const std::vector<TensorDim> &input_dims) {
*/
void LayerNode::forwarding(bool training) {
loss->set(run_context->getRegularizationLoss());

PROFILE_TIME_START(forward_event_key);
if (needsOutputSetZero()) {
for (unsigned int i = 0; i < run_context->getNumOutputs(); ++i) {
run_context->getOutput(i).setValue(0);
run_context->getOutgoingDerivative(i).setValue(0);
}

for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
run_context->getWeightGrad(i).setValue(0);
}
}

layer->forwarding(*run_context, training);
needsOutputSetZero(false);
PROFILE_TIME_END(forward_event_key);
TRACE_MEMORY() << getName() + ": F";
TRACE_TIME() << getName() + ": F";
17 changes: 17 additions & 0 deletions nntrainer/layers/layer_node.h
@@ -879,6 +879,13 @@ class LayerNode final : public ml::train::Layer, public GraphNode {
needs_calc_derivative = nb;
}

/**
* @brief Set whether the layer output needs to be set to zero (mixed precision)
*
* @param nb true if the layer output needs to be set to zero, else false
*/
void needsOutputSetZero(bool nb) { needs_output_set_zero = nb; }

/**
* @brief Set if the layer needs to do calculation of gradients
*
@@ -900,6 +907,13 @@ class LayerNode final : public ml::train::Layer, public GraphNode {
*/
bool needsCalcGradient() { return needs_calc_gradient; }

/**
* @brief Get whether the layer output needs to be set to zero (mixed precision)
*
* @return true if the layer output needs to be set to zero, else false
*/
bool needsOutputSetZero() { return needs_output_set_zero; }

private:
/**
* @brief Get the Input Layers object
@@ -966,6 +980,9 @@ properties in the context/graph unless intended. */
ExecutionOrder exec_order; /**< order/location of execution for this node
in forward and backwarding operations */

bool needs_output_set_zero; /**< cache whether this layer's output needs to be
set to zero (mixed precision) */

/**
* @brief Get the effective layer managed by this layer node
*
15 changes: 14 additions & 1 deletion nntrainer/optimizers/sgd.cpp
@@ -16,7 +16,20 @@
namespace nntrainer {

void SGD::applyGradient(RunOptimizerContext &context) {
context.applyGradient(context.getLearningRate());
// @todo This could go inside the context.
Tensor empty_tensor;

Tensor &x_grad =
context.getGradient().getDataType() == ml::train::TensorDim::DataType::FP32
? context.getGradient()
: empty_tensor;

if (x_grad.empty()) {
x_grad = context.getGradient().clone(ml::train::TensorDim::DataType::FP32);
context.applyLossScale(x_grad);
}

context.applyGradient(context.getLearningRate(), x_grad);
}

} // namespace nntrainer
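
The change to `SGD::applyGradient` keeps the weight update in FP32: when the stored gradient is not FP32, it is cloned to an FP32 tensor and the loss scale is applied before the update. Below is a minimal standalone sketch of that pattern, using plain `std::vector` in place of NNTrainer tensors and assuming that applying the loss scale means dividing the scaled gradient back down before the update.

```cpp
// Standalone sketch of the FP32 master-gradient pattern used in
// SGD::applyGradient. std::vector stands in for NNTrainer tensors; the
// division by loss_scale is an assumption about what applyLossScale does.
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  // A gradient computed in half precision still carries the loss scale that
  // was multiplied into the loss during backwarding.
  std::vector<float> scaled_grad = {128.0f, -64.0f};
  const float loss_scale = 128.0f;
  const float learning_rate = 0.1f;
  std::vector<float> weight = {1.0f, 1.0f};

  // 1. Clone into an FP32 working copy and remove the loss scale.
  std::vector<float> fp32_grad(scaled_grad.begin(), scaled_grad.end());
  for (auto &g : fp32_grad)
    g /= loss_scale;

  // 2. Apply the gradient in FP32: w -= lr * g.
  for (std::size_t i = 0; i < weight.size(); ++i)
    weight[i] -= learning_rate * fp32_grad[i];

  std::printf("updated weights: %.2f %.2f\n", weight[0], weight[1]);
  return 0;
}
```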
3 changes: 2 additions & 1 deletion nntrainer/tensor/manager.cpp
@@ -407,7 +407,8 @@ std::vector<Weight *> Manager::requestWeights(
* order with the max exec order where it will be used for clipping and then
* applied to the weight.
*/
if (Weight::isGradientClipByGlobalNorm(clip_by_global_norm)) {
if (Weight::isGradientClipByGlobalNorm(clip_by_global_norm) ||
isMixedPrecision()) {
grad_exec_order.push_back(TensorPool::PERSIST_END_ORDER);
// TODO: We need double check if it is OK not to add PERSIST_END_ORDER
// here or add other conditions
5 changes: 5 additions & 0 deletions nntrainer/tensor/manager.h
@@ -495,6 +495,11 @@ class Manager {
exec_mode = mode;
};

/**
* @brief return true if mixed precision is enabled
*/
bool isMixedPrecision() { return !istrequal(tensor_dtype[0], "FP32"); }

private:
/** @todo: merge this list to one */
std::vector<std::unique_ptr<Weight>> weights_v2; /**< weights for the layers
1 change: 0 additions & 1 deletion nntrainer/tensor/weight.h
@@ -356,7 +356,6 @@
*/
void setLossScale(float scale) { loss_scale = scale; };


/**
* @brief get loss scale
*
Binary file modified packaging/unittest_models_v3.tar.gz
Binary file not shown.