[Wait for #2568] [ Tensor ] add is_NaN check in Tensor @open sesame 05/10 14:17 #2574

Closed · wants to merge 6 commits
2 changes: 1 addition & 1 deletion Applications/KNN/jni/meson.build
@@ -15,4 +15,4 @@ e = executable('knn_sample',
install_dir: application_install_dir
)

test('app_knn', e, args: [nntr_app_resdir / 'KNN'])
test('app_knn', e, args: [nntr_app_resdir / 'KNN/'])
17 changes: 11 additions & 6 deletions meson.build
@@ -64,9 +64,19 @@ warning_c_flags = [
'-Wno-error=varargs'
]

arch = host_machine.cpu_family()

if get_option('enable-avx')
extra_defines += '-DUSE_AVX=1'
if get_option('platform') == 'tizen'
add_project_arguments(['-mavx2'], language: ['c','cpp'])
Reviewer comment (Contributor): would the Tizen platform always support AVX2 instructions?

else
add_project_arguments(['-march=native'], language: ['c','cpp'])
endif
message('-march=native added for AVX hardware acceleration.')
endif

if get_option('enable-fp16')
arch = host_machine.cpu_family()
if get_option('platform') == 'android'
add_project_arguments('-mfp16-format=ieee', language: ['c', 'cpp'])
extra_defines += '-DENABLE_FP16=1'
@@ -105,11 +115,6 @@ if get_option('enable-fp16')
if cc.version().version_compare('>=12.1.0')
message ('Float16 for x86_64 enabled. Modern gcc-x64 generally supports float16 with _Float16.')
extra_defines += '-DENABLE_FP16=1'
if get_option('enable-avx')
extra_defines += '-DUSE_AVX=1'
add_project_arguments(['-march=native'], language: ['c','cpp'])
message('-march=native added for AVX hardware acceleration.')
endif
else
warning ('Float16 for x86_64 enabled. However, software emulation is applied for fp16, making it slower and inconsistent. Use GCC 12+ for FP16 support. This build will probably fail unless you bring a compiler that supports fp16 for x64.')
endif
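If AVX2 cannot be assumed on every Tizen target (the question raised in the review comment above), one complementary option is a runtime CPU-feature check around the AVX kernels. The sketch below is illustrative only and not part of this PR; the helper name is hypothetical, while the builtins are standard GCC/Clang x86 intrinsics.

#include <cstdio>

// Hypothetical helper (not part of this PR): detect AVX2 at runtime so a
// scalar fallback can be chosen on CPUs or Tizen devices without AVX2.
static bool cpu_has_avx2() {
#if (defined(__GNUC__) || defined(__clang__)) && (defined(__x86_64__) || defined(__i386__))
  __builtin_cpu_init();
  return __builtin_cpu_supports("avx2") != 0;
#else
  return false; // non-x86 targets: no AVX2
#endif
}

int main() {
  std::printf("AVX2 available: %s\n", cpu_has_avx2() ? "yes" : "no");
  return 0;
}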
8 changes: 5 additions & 3 deletions nntrainer/graph/network_graph.cpp
@@ -768,9 +768,10 @@ NetworkGraph::finalizeContext(const std::shared_ptr<LayerNode> &lnode,
* node is going to be used with in-place optimizations.
*/
auto out_specs = init_context.getOutSpecs();

/// @note try move inplace control to finalize
bool shared_var = false, shared_grad = false;
if (lnode->executeInPlace() != InPlace::NONE) {
if (lnode->executeInPlace() != InPlace::NONE && lnode->supportInPlace()) {
setInplaceSharedMemoryConfigByLayer(lnode, shared_var, shared_grad);
for (unsigned int i = 0; i < out_specs.size(); ++i) {
auto &s = out_specs.at(i);
@@ -1556,8 +1557,9 @@ void NetworkGraph::requestOptimizerVariable(
const TensorDim &dim = w->getDim();
std::vector<TensorDim> dims = cb(dim);
w->setOptimizerVariables(tensor_manager->requestWeightOptimizerVariables(
dims, w->getName(), TensorLifespan::MAX_LIFESPAN,
w->isGradientClipByGlobalNorm(), Tensor::Initializer::ZEROS));
dims, w->getName(), ":opt", TensorLifespan::MAX_LIFESPAN,
w->isGradientClipByGlobalNorm(), w->isMixedPrecision(),
Tensor::Initializer::ZEROS));
}
}
}
19 changes: 16 additions & 3 deletions nntrainer/layers/input_layer.cpp
@@ -33,8 +33,7 @@ namespace nntrainer {
static constexpr size_t SINGLE_INOUT_IDX = 0;

InputLayer::InputLayer() :
Layer(),
input_props(props::Normalization(), props::Standardization()) {}
Layer(), input_props(props::Normalization(), props::Standardization()) {}

void InputLayer::setProperty(const std::vector<std::string> &values) {
auto remain_props = loadProperties(values, input_props);
@@ -47,7 +46,7 @@ void InputLayer::forwarding(RunLayerContext &context, bool training) {
Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX);
if (!context.executeInPlace()) {
Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);
hidden_.copy(input_);
hidden_.copyData(input_);
}

if (std::get<props::Normalization>(input_props))
@@ -70,7 +69,21 @@ void InputLayer::finalize(InitLayerContext &context) {

std::vector<TensorDim> output_dims = context.getInputDimensions();

for (auto &d : output_dims) {
d.setDataType(context.getActivationDataType());
}

context.setOutputDimensions(output_dims);

is_inplace = true;

/**
* @note The input layer assumes that its incoming tensor is always FP32.
* Therefore, if the activation data type is not FP32, the layer cannot
* perform its operation in place.
*/
if (context.getActivationDataType() != ml::train::TensorDim::DataType::FP32)
is_inplace = false;
}

} /* namespace nntrainer */
3 changes: 2 additions & 1 deletion nntrainer/layers/input_layer.h
@@ -82,7 +82,7 @@ class InputLayer : public Layer {
/**
* @copydoc Layer::supportInPlace()
*/
bool supportInPlace() const override { return true; }
bool supportInPlace() const override { return is_inplace; }

/**
* @copydoc Layer::exportTo(Exporter &exporter, ml::train::ExportMethods
@@ -105,6 +105,7 @@

private:
std::tuple<props::Normalization, props::Standardization> input_props;
bool is_inplace = true;
};
} // namespace nntrainer

13 changes: 13 additions & 0 deletions nntrainer/layers/layer_context.cpp
@@ -169,6 +169,19 @@ Tensor &RunLayerContext::getWeightGrad(unsigned int idx) const {
return weights[idx]->getGradientRef();
}

/**
* @brief Get the FP32 variable of the weight tensor object
*
* @param idx Identifier of the weight
* @return Tensor& Reference to the FP32 weight tensor
*/
Tensor &RunLayerContext::getWeightFP32(unsigned int idx) const {
if (!weights[idx]->hasGradient())
throw std::invalid_argument(
"Requesting gradient for a non-trainable weight.");
return weights[idx]->getVariableFP32Ref();
}

/**
* @brief Get the Weight Optimizer Variable tensor object
*
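For context, getWeightFP32() exposes the full-precision variable of a weight. A minimal, self-contained sketch of the data flow such an accessor typically serves in mixed-precision training is below; the types and names are stand-ins rather than the nntrainer API, and the low-precision copy is modelled with float to keep the example portable.

#include <cstddef>
#include <vector>

// Stand-in type for illustration; "low precision" is modelled as float.
struct MixedPrecisionWeight {
  std::vector<float> master_fp32; // full-precision copy the optimizer updates
  std::vector<float> active_low;  // low-precision copy used by forward/backward
};

// Apply an FP32 gradient to the FP32 master copy, then refresh the
// low-precision weight that the layers actually compute with.
void apply_gradient(MixedPrecisionWeight &w,
                    const std::vector<float> &grad_fp32, float lr) {
  for (std::size_t i = 0; i < w.master_fp32.size(); ++i) {
    w.master_fp32[i] -= lr * grad_fp32[i]; // accumulate in full precision
    w.active_low[i] = w.master_fp32[i];    // would round to fp16 in practice
  }
}

int main() {
  MixedPrecisionWeight w{{1.0f, 2.0f}, {1.0f, 2.0f}};
  apply_gradient(w, {0.1f, 0.1f}, 0.5f);
  return 0; // w.master_fp32 == {0.95f, 1.95f}; w.active_low mirrors it
}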
9 changes: 9 additions & 0 deletions nntrainer/layers/layer_context.h
@@ -463,6 +463,15 @@ class RunLayerContext {
Tensor &getWeightGrad(unsigned int idx) const;

/**
* @brief Get the FP32 variable of the weight tensor object
*
* @param idx Identifier of the weight
* @return Tensor& Reference to the FP32 weight tensor
*/
Tensor &getWeightFP32(unsigned int idx) const;

/**
* @brief Get the Weight Optimizer Variable tensor object
*
* @param idx Identifier of the weight
9 changes: 5 additions & 4 deletions nntrainer/layers/layer_node.h
@@ -487,6 +487,7 @@ class LayerNode final : public ml::train::Layer, public GraphNode {
const std::vector<TensorDim> getOutputDimensions() const;
/**
* @brief Get the Weight object
* Currently, only unit tests use this function.
*
* @param idx Identifier of the weight
* @return Weight& Reference to the weight
@@ -495,11 +496,11 @@
NNTR_THROW_IF(!run_context, std::runtime_error)
<< __func__ << " layer needs to be finalized first!";
if (run_context->weightHasGradient(idx)) {
return Weight(run_context->getWeight(idx),
run_context->getWeightGrad(idx),
run_context->getWeightName(idx));
return Weight(
run_context->getWeight(idx), run_context->getWeightGrad(idx),
run_context->getWeightFP32(idx), run_context->getWeightName(idx));
} else {
return Weight(run_context->getWeight(idx), Tensor(),
return Weight(run_context->getWeight(idx), Tensor(), Tensor(),
run_context->getWeightName(idx));
}
}
11 changes: 10 additions & 1 deletion nntrainer/layers/loss/mse_loss_layer.cpp
@@ -20,7 +20,16 @@ static constexpr size_t SINGLE_INOUT_IDX = 0;

void MSELossLayer::forwarding(RunLayerContext &context, bool training) {
Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX);
Tensor &y = context.getInput(SINGLE_INOUT_IDX);

Tensor empty_tensor;
Tensor &y = context.getInput(SINGLE_INOUT_IDX).getDataType() ==
ml::train::TensorDim::DataType::FP32
? context.getInput(SINGLE_INOUT_IDX)
: empty_tensor;

if (y.empty())
y = context.getInput(SINGLE_INOUT_IDX)
.clone(ml::train::TensorDim::DataType::FP32);

// hidden_ <- y2 - y;
if (context.isLabelAvailable(SINGLE_INOUT_IDX)) {
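The change above binds a reference to the existing input when it is already FP32 and only clones into a local FP32 tensor otherwise, so the common case pays no extra copy. A small self-contained sketch of the same borrow-or-clone idiom, using stand-in types whose names are hypothetical rather than the nntrainer Tensor API:

#include <iostream>
#include <string>
#include <vector>

// Stand-in "tensor" used only to illustrate the borrow-or-clone idiom; the
// clone here copies values without a real dtype conversion.
struct Buf {
  std::string dtype; // e.g. "fp32" or "fp16"
  std::vector<float> data;
  bool empty() const { return data.empty(); }
  Buf cloneAs(const std::string &t) const { return Buf{t, data}; }
};

int main() {
  Buf input{"fp16", {1.0f, 2.0f}};

  Buf scratch;                                       // local fallback storage
  Buf &y = input.dtype == "fp32" ? input : scratch;  // borrow when possible
  if (y.empty())
    y = input.cloneAs("fp32");                       // otherwise convert once

  std::cout << y.dtype << " " << y.data.size() << '\n'; // prints: fp32 2
  return 0;
}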
27 changes: 22 additions & 5 deletions nntrainer/optimizers/adam.cpp
@@ -36,7 +36,15 @@ Adam::~Adam() {}
enum AdamParams { wm, wv };

std::vector<TensorDim> Adam::getOptimizerVariableDim(const TensorDim &dim) {
return {dim, dim};
/**
* @note The optimizer parameters are kept in full precision to maintain
* accuracy even in mixed-precision training.
*/
TensorDim wm_dim(dim);
TensorDim wv_dim(dim);
wm_dim.setDataType(ml::train::TensorDim::DataType::FP32);
wv_dim.setDataType(ml::train::TensorDim::DataType::FP32);
return {wm_dim, wv_dim};
}

void Adam::exportTo(Exporter &exporter,
@@ -64,7 +72,15 @@ double Adam::getUpdatedLearningRate(unsigned int iteration, double ll) const {
}

void Adam::applyGradient(RunOptimizerContext &context) {
Tensor &x_grad = context.getGradient();
Tensor empty_tensor;

Tensor &x_grad =
context.getGradient().getDataType() == ml::train::TensorDim::DataType::FP32
? context.getGradient()
: empty_tensor;

if (x_grad.empty())
x_grad = context.getGradient().clone(ml::train::TensorDim::DataType::FP32);

auto &beta1 = std::get<PropsB1>(adam_props).get();
auto &beta2 = std::get<PropsB2>(adam_props).get();
@@ -91,7 +107,7 @@
denom.add_i(epsilon);
wm.divide(denom, x_grad);

context.applyGradient(context.getLearningRate() / biasCorrection1);
context.applyGradient(context.getLearningRate() / biasCorrection1, x_grad);

} else {
std::function<double(double)> sqrtEps = [epsilon](double f) {
@@ -100,8 +116,9 @@

x_grad = wv.apply<float>(sqrtEps, x_grad);
x_grad.multiply_i(wm);
context.applyGradient(getUpdatedLearningRate(context.getIteration(),
context.getLearningRate()));
context.applyGradient(
getUpdatedLearningRate(context.getIteration(), context.getLearningRate()),
x_grad);
}
}

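Both the Adam moments and the gradient handed to applyGradient() are promoted to FP32 above because very small per-step updates can be rounded away entirely in low precision. A self-contained illustration of that effect, using float versus double as a stand-in for fp16 versus fp32 (fp16 is not portable standard C++, and the numbers here are made up):

#include <cstdio>

int main() {
  const double step = 1e-8; // a tiny per-iteration update, e.g. lr * m_hat / (sqrt(v_hat) + eps)
  float w_low = 1.0f;       // low-precision accumulator (stands in for an fp16 state)
  double w_high = 1.0;      // full-precision accumulator (stands in for an fp32 state)

  for (int i = 0; i < 1000000; ++i) {
    w_low += static_cast<float>(step); // each addition rounds back to 1.0f, so the update is lost
    w_high += step;                    // accumulates normally
  }

  std::printf("low precision : %.8f\n", w_low);  // 1.00000000
  std::printf("full precision: %.8f\n", w_high); // ~1.01000000
  return 0;
}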
7 changes: 7 additions & 0 deletions nntrainer/optimizers/optimizer_context.cpp
@@ -42,4 +42,11 @@ Tensor &RunOptimizerContext::getOptimizerVariable(unsigned int idx) const {
void RunOptimizerContext::applyGradient(double lr) const {
weight->applyGradient(lr);
}

/**
* @brief Apply the gradient with the given learning rate and gradient
*/
void RunOptimizerContext::applyGradient(double lr, Tensor &updated_grad) const {
weight->applyGradient(lr, updated_grad);
}
} // namespace nntrainer
14 changes: 11 additions & 3 deletions nntrainer/optimizers/optimizer_context.h
@@ -35,9 +35,7 @@ class RunOptimizerContext {
*
*/
RunOptimizerContext(Weight *w = nullptr, size_t iter = 0, double lr = 0.0) :
weight(w),
iteration(iter),
learning_rate(lr) {}
weight(w), iteration(iter), learning_rate(lr) {}

/**
* @brief Get the Weight tensor object
@@ -75,6 +73,16 @@
*/
void applyGradient(double lr) const;

/**
* @brief Apply the gradient with the given learning rate and updated
* gradient
*
* @param lr learning rate
* @param updated_grad the already-updated gradient tensor to apply (usually
* FP32 in mixed-precision training)
*/
void applyGradient(double lr, Tensor &updated_grad) const;

/**
* @brief Get the current iteration value
*
Expand Down