[ TEST ] Add more unit tests and fixes for mixed precision
This PR adds more unit tests and fixes for mixed precision.
. Model unit tests
  . A two-FC-layer model that generates NaN or Inf gradients, verified
  against Torch.
  . MSE loss, checking the whole procedure of mixed precision training
  (a standalone sketch of the loss-scaling policy follows this list).
  . Even though the FC model has only one weight, it is sufficient to
  validate mixed precision.
  . The Torch model also works in a way similar to NNTrainer.
  . Fixes to the execution order of applying gradients when mixed
  precision is enabled.
  . Update SGD to support mixed precision training.
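
The mechanism these tests exercise is dynamic loss scaling. Below is a minimal, standalone C++ sketch of that policy, assuming the defaults mentioned in the diffs in this commit (backoff factor 0.5, growth factor 2.0, growth interval 2000); the `LossScaler` class and its method names are illustrative, not NNTrainer's API.

```cpp
// Minimal standalone sketch of a dynamic loss-scaling policy, assuming the
// defaults referenced in this commit: backoff factor 0.5, growth factor 2.0,
// growth interval 2000. Illustrative only, not NNTrainer code.
#include <algorithm>
#include <cstdio>

class LossScaler {
public:
  explicit LossScaler(float initial_scale = 65536.0f) : scale_(initial_scale) {}

  // Call after each backward pass with whether any gradient was NaN/Inf.
  // Returns true if the step may be applied, false if it must be retried.
  bool update(bool found_nan_or_inf) {
    if (found_nan_or_inf) {
      // Back off: halve the scale (never below 1.0) and skip this step.
      scale_ = std::max(scale_ * 0.5f, 1.0f);
      good_steps_ = 0;
      return false;
    }
    // Grow the scale after a long run of finite gradients.
    if (++good_steps_ >= 2000) {
      scale_ *= 2.0f;
      good_steps_ = 0;
    }
    return true;
  }

  float scale() const { return scale_; }

private:
  float scale_;
  int good_steps_ = 0;
};

int main() {
  LossScaler scaler;
  // Pretend the first backward pass overflowed and the second one did not.
  std::printf("apply? %d scale=%.1f\n", scaler.update(true), scaler.scale());
  std::printf("apply? %d scale=%.1f\n", scaler.update(false), scaler.scale());
  return 0;
}
```

The same back-off and growth arithmetic appears in `NetworkGraph::backwarding` in the diff below.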

**Changes proposed in this PR:**
- Added TOC generator for README.md

Resolves:

**Self evaluation:**
1. Build test:	 [X]Passed [ ]Failed [ ]Skipped
2. Run test:	 [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: jijoong.moon <[email protected]>
jijoongmoon committed May 24, 2024
1 parent 6c29394 commit f95f152
Showing 13 changed files with 398 additions and 273 deletions.
1 change: 1 addition & 0 deletions debian/nntrainer-dev.install
@@ -16,6 +16,7 @@
/usr/include/nntrainer/blas_interface.h
/usr/include/nntrainer/var_grad.h
/usr/include/nntrainer/weight.h
/usr/include/nntrainer/blas_avx.h
# todo: update dataset headers
/usr/include/nntrainer/databuffer.h
/usr/include/nntrainer/databuffer_factory.h
25 changes: 19 additions & 6 deletions nntrainer/graph/network_graph.cpp
@@ -426,24 +426,34 @@ bool NetworkGraph::backwarding(
PROFILE_TIME_END(profile_keys.at(ln->getType()));

if (!is_valid) {
std::cout << "Gradient has NaN" << std::endl;
std::cout << ln->getName() << " : Gradient has NaN --> "
<< ln->getRunContext().getLossScale() << std::endl;
break;
}
}

if (!is_valid) {
/** if has NaN
* 1. reset the loss scale.
* 1. reset the loss scale. : @todo Backoff_factor : default --> 0.5
* 2. run forwarding from cur_iter to cend() && !stop_cb(userdata);
* 3. return false --> run backwarding again;
*/
float scale = (*iter_)->getRunContext().getLossScale();
float s = scale > 1.5f ? scale - 0.5f : 1.0f;

NNTR_THROW_IF(scale == 1.0f, std::invalid_argument)
<< "Loss Scale Factor is 1.0f";

float s = scale > 1.5f ? scale * 0.5f : 1.0f;

resetLossScale(s);

auto f_iter = cbegin() + graph.getSortedNodeIdx((*iter_)->getName());

for (auto iter = f_iter; iter != cend() && !stop_cb(userdata); iter++) {
auto &ln = *iter;
ln->needsOutputSetZero(true);
}

for (auto iter = f_iter; iter != cend() && !stop_cb(userdata); iter++) {
auto &ln = *iter;
PROFILE_TIME_START(profile_keys.at(ln->getType()));
@@ -479,9 +489,12 @@
}
nan_count++;

if (nan_count > 10) {
/** @todo : handle as property : growth_interval : default --> 2000 */

if (nan_count > 2000) {
float scale = (*iter_)->getRunContext().getLossScale();
float s = scale + 2.0f;
/** @todo growth_factor : default --> 2.0 */
float s = scale * 2.0f;
resetLossScale(s);
nan_count = 0;
}
@@ -1251,7 +1264,7 @@ int NetworkGraph::initialize(ExecutionMode mode,
*/
if (tensor_manager->isLastAccess(rc.getWeightGrad(i).getName(),
last_grad_access) ||
(rc.isGradientClipByGlobalNorm(i) &&
((rc.isGradientClipByGlobalNorm(i) || rc.isMixedPrecision(i)) &&
tensor_manager->isSecondLastAccess(rc.getWeightGrad(i).getName(),
last_grad_access))) {
rc.getWeightObject(i).setAsGradientLastAccess();
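
To make the retry path in `NetworkGraph::backwarding` easier to follow, here is a small self-contained simulation of the same control flow: when a non-finite gradient is detected, the loss scale is backed off, the affected nodes are marked so their outputs are zeroed on the next forwarding, forwarding is re-run, and backwarding is retried. All names and numeric values below are stand-ins, not NNTrainer classes.

```cpp
// Self-contained toy simulation of the retry flow in NetworkGraph::backwarding.
// Node, backward_is_finite, and the numbers are stand-ins used only to show
// the order of operations.
#include <cstdio>
#include <vector>

struct Node {
  float output = 0.0f;
  bool needs_output_set_zero = false;
};

// Pretend that backwarding only produces finite gradients once the loss
// scale has been reduced to 256 or below.
bool backward_is_finite(float loss_scale) { return loss_scale <= 256.0f; }

int main() {
  std::vector<Node> nodes(3);
  float scale = 1024.0f;

  while (!backward_is_finite(scale)) {
    // 1. Back off the loss scale (backoff factor 0.5, floored at 1.0).
    scale = scale > 1.5f ? scale * 0.5f : 1.0f;

    // 2. Mark the nodes so their cached outputs are cleared before reuse.
    for (auto &n : nodes)
      n.needs_output_set_zero = true;

    // 3. Re-run forwarding; each node drops stale values before recomputing.
    for (auto &n : nodes) {
      if (n.needs_output_set_zero) {
        n.output = 0.0f; // discard possibly non-finite cached activations
        n.needs_output_set_zero = false;
      }
      n.output = 1.0f; // stand-in for the real forward computation
    }
    std::printf("retrying backwarding with loss scale %.1f\n", scale);
  }
  std::printf("gradients finite at loss scale %.1f\n", scale);
  return 0;
}
```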
17 changes: 16 additions & 1 deletion nntrainer/layers/layer_node.cpp
@@ -180,6 +180,7 @@ LayerNode::LayerNode(std::unique_ptr<nntrainer::Layer> &&l) :
inplace(InPlace::NONE),
needs_calc_derivative(false),
needs_calc_gradient(false),

output_connections(),
run_context(nullptr),
layer_node_props(
@@ -190,7 +191,8 @@ LayerNode::LayerNode(std::unique_ptr<nntrainer::Layer> &&l) :
new RealizationPropsType(props::Flatten(), props::Activation())),
loss(new props::Loss()),
regularization_loss(0.0f),
exec_order({0, 0, 0, 0}) {
exec_order({0, 0, 0, 0}),
needs_output_set_zero(false) {
if (layer && layer->getType() == TimeDistLayer::type) {
std::get<props::Distribute>(*layer_node_props).set(true);
}
@@ -751,8 +753,21 @@ LayerNode::refinalize(const std::vector<TensorDim> &input_dims) {
*/
void LayerNode::forwarding(bool training) {
loss->set(run_context->getRegularizationLoss());

PROFILE_TIME_START(forward_event_key);
if (needsOutputSetZero()) {
for (unsigned int i = 0; i < run_context->getNumOutputs(); ++i) {
run_context->getOutput(i).setValue(0);
run_context->getOutgoingDerivative(i).setValue(0);
}

for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
run_context->getWeightGrad(i).setValue(0);
}
}

layer->forwarding(*run_context, training);
needsOutputSetZero(false);
PROFILE_TIME_END(forward_event_key);
TRACE_MEMORY() << getName() + ": F";
TRACE_TIME() << getName() + ": F";
17 changes: 17 additions & 0 deletions nntrainer/layers/layer_node.h
@@ -879,6 +879,13 @@ class LayerNode final : public ml::train::Layer, public GraphNode {
needs_calc_derivative = nb;
}

/**
* @brief Set whether the layer output needs to be set to zero (mixed precision)
*
* @param nb true if the layer output needs to be set to zero, else false
*/
void needsOutputSetZero(bool nb) { needs_output_set_zero = nb; }

/**
* @brief Set if the layer needs to do calculation of gradients
*
@@ -900,6 +907,13 @@ class LayerNode final : public ml::train::Layer, public GraphNode {
*/
bool needsCalcGradient() { return needs_calc_gradient; }

/**
* @brief Get whether the layer output needs to be set to zero (mixed precision)
*
* @return true if the layer output needs to be set to zero, else false
*/
bool needsOutputSetZero() { return needs_output_set_zero; }

private:
/**
* @brief Get the Input Layers object
@@ -966,6 +980,9 @@ properties in the context/graph unless intended. */
ExecutionOrder exec_order; /**< order/location of execution for this node
in forward and backwarding operations */

bool needs_output_set_zero; /**< cache whether this layer's output needs to be
set to zero (mixed precision) */

/**
* @brief Get the effective layer managed by this layer node
*
15 changes: 14 additions & 1 deletion nntrainer/optimizers/sgd.cpp
@@ -16,7 +16,20 @@
namespace nntrainer {

void SGD::applyGradient(RunOptimizerContext &context) {
context.applyGradient(context.getLearningRate());
// @todo This could go inside the context.
Tensor empty_tensor;

Tensor &x_grad =
context.getGradient().getDataType() == ml::train::TensorDim::DataType::FP32
? context.getGradient()
: empty_tensor;

if (x_grad.empty()) {
x_grad = context.getGradient().clone(ml::train::TensorDim::DataType::FP32);
context.applyLossScale(x_grad);
}

context.applyGradient(context.getLearningRate(), x_grad);
}

} // namespace nntrainer
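
The change to `SGD::applyGradient` keeps the weight update in FP32: when the stored gradient is not FP32, it is cloned to an FP32 tensor and the loss scale is applied before the update. Below is a minimal standalone sketch of that pattern, using plain `std::vector` in place of NNTrainer tensors and assuming that applying the loss scale means dividing the scaled gradient back down before the update.

```cpp
// Standalone sketch of the FP32 master-gradient pattern used in
// SGD::applyGradient. std::vector stands in for NNTrainer tensors; the
// division by loss_scale is an assumption about what applyLossScale does.
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  // A gradient computed in half precision still carries the loss scale that
  // was multiplied into the loss during backwarding.
  std::vector<float> scaled_grad = {128.0f, -64.0f};
  const float loss_scale = 128.0f;
  const float learning_rate = 0.1f;
  std::vector<float> weight = {1.0f, 1.0f};

  // 1. Clone into an FP32 working copy and remove the loss scale.
  std::vector<float> fp32_grad(scaled_grad.begin(), scaled_grad.end());
  for (auto &g : fp32_grad)
    g /= loss_scale;

  // 2. Apply the gradient in FP32: w -= lr * g.
  for (std::size_t i = 0; i < weight.size(); ++i)
    weight[i] -= learning_rate * fp32_grad[i];

  std::printf("updated weights: %.2f %.2f\n", weight[0], weight[1]);
  return 0;
}
```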
3 changes: 2 additions & 1 deletion nntrainer/tensor/manager.cpp
@@ -407,7 +407,8 @@ std::vector<Weight *> Manager::requestWeights(
* order with the max exec order where it will be used for clipping and then
* applied to the weight.
*/
if (Weight::isGradientClipByGlobalNorm(clip_by_global_norm)) {
if (Weight::isGradientClipByGlobalNorm(clip_by_global_norm) ||
isMixedPrecision()) {
grad_exec_order.push_back(TensorPool::PERSIST_END_ORDER);
// TODO: We need double check if it is OK not to add PERSIST_END_ORDER
// here or add other conditions
5 changes: 5 additions & 0 deletions nntrainer/tensor/manager.h
@@ -495,6 +495,11 @@ class Manager {
exec_mode = mode;
};

/**
* @brief return true if mixed precision is enabled
*/
bool isMixedPrecision() { return !istrequal(tensor_dtype[0], "FP32"); }

private:
/** @todo: merge this list to one */
std::vector<std::unique_ptr<Weight>> weights_v2; /**< weights for the layers
1 change: 0 additions & 1 deletion nntrainer/tensor/weight.h
@@ -356,7 +356,6 @@
*/
void setLossScale(float scale) { loss_scale = scale; };


/**
* @brief get loss scale
*
Binary file modified packaging/unittest_models_v3.tar.gz
Binary file not shown.