From 62d381a446512d6a5ba46b86dc9a1e9188132789 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 11 Sep 2020 13:33:36 -0700 Subject: [PATCH 01/13] Initial cherry-pick --- src/executor/cuda_graphs.h | 529 ++++++++++++++++++++++++++++++ src/executor/exec_pass.h | 10 + src/executor/graph_executor.cc | 40 ++- src/imperative/imperative_utils.h | 15 + 4 files changed, 581 insertions(+), 13 deletions(-) create mode 100644 src/executor/cuda_graphs.h diff --git a/src/executor/cuda_graphs.h b/src/executor/cuda_graphs.h new file mode 100644 index 000000000000..512efae0b73c --- /dev/null +++ b/src/executor/cuda_graphs.h @@ -0,0 +1,529 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2020 by Contributors + * \file cuda_graphs.h + * \brief Wrappers for use of CUDA Graphs API + */ +#ifndef MXNET_EXECUTOR_CUDA_GRAPHS_H_ +#define MXNET_EXECUTOR_CUDA_GRAPHS_H_ + +#include +#include +#include +#include +#include + +#include "./exec_pass.h" +#include "../common/cuda_utils.h" + +#if MXNET_USE_CUDA +#define CUDA_GRAPHS_AVAILABLE (CUDA_VERSION >= 10010) +#else +#define CUDA_GRAPHS_AVAILABLE (0) +#endif + +#if CUDA_GRAPHS_AVAILABLE + +namespace mxnet { +namespace cuda_graphs { + +inline std::string CudaDim3ToString(const dim3& dims) { + std::stringstream ss; + if (dims.z != 1) + ss << "(" << dims.x << "," << dims.y << "," << dims.z << ")"; + else if (dims.y != 1) + ss << "(" << dims.x << "," << dims.y << ")"; + else + ss << "(" << dims.x << ")"; + return ss.str(); +} + +#define CU_CALL(func) \ + { \ + CUresult e = (func); \ + if (e != CUDA_SUCCESS) { \ + std::cerr << "cuda driver failure code: " << e << std::endl; \ + exit(1); \ + } \ + } + +// Get the type of a CUDA Graph node (e.g. kernel launch, memcpy, etc.) +inline CUgraphNodeType CudaGraphNodeType(const cudaGraphNode_t node) { + CUgraphNode cu_node = node; + CUgraphNodeType t; + CU_CALL(cuGraphNodeGetType(cu_node, &t)); + return t; +} + +// Return the list of CUDA Graph nodes from a graph +inline std::vector GetCudaGraphNodes(cudaGraph_t cuda_graph) { + size_t numNodes; + CUDA_CALL(cudaGraphGetNodes(cuda_graph, static_cast(nullptr), &numNodes)); + if (numNodes == 0) + return std::vector(); + std::vector graphNodes(numNodes); + CUDA_CALL(cudaGraphGetNodes(cuda_graph, graphNodes.data(), &numNodes)); + return graphNodes; +} + +inline std::string CudaGraphNodeToString(const cudaGraphNode_t node) { + std::stringstream ss; + + // The following introspection calls are made through the driver API in order to bypass + // problems that would arise if multiple statically-linked copies of the runtime exist. 
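+  // Descriptive note on the flow below: the runtime handle is converted to a driver-API
+  // CUgraphNode, its type is queried via cuGraphNodeGetType(), and the switch statement
+  // pretty-prints the parameters of kernel, memcpy and memset nodes.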
+ + CUgraphNode cu_node = node; + CUgraphNodeType t; + CU_CALL(cuGraphNodeGetType(cu_node, &t)); + switch (t) { + case CU_GRAPH_NODE_TYPE_KERNEL: + { + CUDA_KERNEL_NODE_PARAMS kparams; + auto err = cuGraphKernelNodeGetParams(cu_node, &kparams); + if (err == CUDA_SUCCESS) { + ss << "GPUKernel@" << kparams.func; + dim3 gridDim(kparams.gridDimX, kparams.gridDimY, kparams.gridDimZ); + dim3 blockDim(kparams.blockDimX, kparams.blockDimY, kparams.blockDimZ); + ss << "<<>>"; + ss << "(..."; + if (kparams.sharedMemBytes != 0) + ss << ", dynSharedMemBytes=" << kparams.sharedMemBytes; + ss << ")"; + } else { + ss << "GPU Kernel: cuGraphKernelNodeGetParams() fails with " << err; + } + } + break; + case CU_GRAPH_NODE_TYPE_MEMCPY: + { + cudaMemcpy3DParms mparams = {}; + CUDA_CALL(cudaGraphMemcpyNodeGetParams(node, &mparams)); + // If memcpy is seen, return without setting up runnable executor + switch (mparams.kind) { + case cudaMemcpyHostToHost: ss << "Host->Host "; break; + case cudaMemcpyHostToDevice: ss << "Host->Device "; break; + case cudaMemcpyDeviceToHost: ss << "Device->Host "; break; + case cudaMemcpyDeviceToDevice: ss << "Device->Device "; break; + default: break; + } + ss << "Memcpy"; + } + break; + case CU_GRAPH_NODE_TYPE_MEMSET: + { + cudaMemsetParams mparams = {}; + CUDA_CALL(cudaGraphMemsetNodeGetParams(node, &mparams)); + if (mparams.height == 1 && mparams.elementSize == 1) { + ss << "cudaMemset(devPtr=" << mparams.dst << ", value=" << mparams.value + << ", count=" << mparams.width << ")"; + } else { + if (mparams.elementSize == 1) + ss << "cudaMemset2D"; + else + ss << "MemSet"; + ss << "(devPtr=" << mparams.dst << ", pitch=" << mparams.pitch + << ", value=" << mparams.value << ", width=" << mparams.width + << ", height=" << mparams.height << ")"; + } + } + break; + case CU_GRAPH_NODE_TYPE_HOST: ss << "Host (executable) node"; break; + case CU_GRAPH_NODE_TYPE_GRAPH: ss << "Node which executes an embedded graph"; break; + case CU_GRAPH_NODE_TYPE_EMPTY: ss << "Empty (no-op) node"; break; + default: ss << "Unknown/Invalid node type " << t; + } + return ss.str(); +} + + +// CUDA Graphs are managed in RAII fashion by smart pointers below. +// Function objects (preferred for readability) provide the deleter function. +class CudaGraphDeleter { + public: + void operator() (cudaGraph_t graph) { + if (graph != nullptr) + CUDA_CALL(cudaGraphDestroy(graph)); + } +}; + +// CUDA Graphs Executors are managed in RAII fashion by smart pointers below. +// Function objects (preferred for readability) provide the deleter function. +class CudaGraphExecDeleter { + public: + void operator() (cudaGraphExec_t graph_exec) { + if (graph_exec != nullptr) + CUDA_CALL(cudaGraphExecDestroy(graph_exec)); + } +}; + +// A CUDA Graphs executor for a portion of an Operator Segment (i.e. a 'SubSegment'), +// characterized by a starting index in the OpExecutor list and a number of ops. 
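+// Lifecycle: the constructor captures the ops into a cudaGraph_t and instantiates an
+// executable graph when the ops are flagged as graph-compatible; Update() re-captures
+// the ops and patches the existing executor in place (re-instantiating it if the update
+// fails); RunSubSeg() launches the instantiated graph, or runs the ops conventionally
+// when no executor could be built.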
+class CudaGraphsSubSegExec { + public: + CudaGraphsSubSegExec(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu, + bool verbose, + int from_op_idx, + int num_ops, + bool ops_are_cuda_graph_compatible = true) : + from_op_idx_(from_op_idx), + num_ops_(num_ops), + graph_(nullptr), + graph_exec_(nullptr) { + if (ops_are_cuda_graph_compatible) { + MakeGraph(exec_list, rctx, is_gpu, verbose, from_op_idx, num_ops); + MakeGraphExec(); + } + } + + void Update(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu, + bool verbose) { + // Current executor should be Runnable with the same parameters + CHECK(IsRunnable()); + MakeGraph(exec_list, rctx, is_gpu, verbose, from_op_idx_, num_ops_); + + cudaGraphExecUpdateResult update_result = cudaGraphExecUpdateError; + cudaGraphNode_t error_node; + CUDA_CALL(cudaGraphExecUpdate(graph_exec_.get(), graph_.get(), + &error_node, &update_result)); + // If update fails make a new executor, discarding old one. + if (update_result != cudaGraphExecUpdateSuccess) + MakeGraphExec(); + } + + void RunSubSeg(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu) { + if (IsRunnable()) { + auto s = rctx.get_stream(); + const cudaStream_t cu_s = mshadow::Stream::GetStream(s); + CUDA_CALL(cudaGraphLaunch(graph_exec_.get(), cu_s)); + } else { + // No CUDA Graph could be made for this portion of the OpSegment. Run conventionally. + for (int i = 0; i != num_ops_; ++i) + exec_list[from_op_idx_ + i]->Run(rctx, is_gpu); + } + } + + bool IsRunnable() { return graph_exec_ != nullptr; } + + private: + void MakeGraph(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu, + bool verbose, + int from_op_idx, + int num_ops) { + auto s = rctx.get_stream(); + const cudaStream_t cu_s = mshadow::Stream::GetStream(s); + // Create CUDA Graph + // Use of cudaStreamCaptureModeThreadLocal allows other threads like GPU Copy workers + // to sync their streams without disturbing this capture. + CUDA_CALL(cudaStreamBeginCapture(cu_s, cudaStreamCaptureModeThreadLocal)); + // Run those oprs in the sub segment while capturing- no actual GPU work is launched. 
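+    // Each GPU operation launched on cu_s between BeginCapture and EndCapture is recorded
+    // as a node of the resulting cudaGraph_t, whose ownership is then transferred to graph_.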
+ for (int i = 0; i != num_ops; ++i) + exec_list[from_op_idx + i]->Run(rctx, is_gpu); + cudaGraph_t cuda_graph = nullptr; + CUDA_CALL(cudaStreamEndCapture(cu_s, &cuda_graph)); + graph_.reset(cuda_graph, CudaGraphDeleter()); + + if (verbose) { + std::vector graph_nodes = GetCudaGraphNodes(cuda_graph); + size_t num_nodes = graph_nodes.size(); + LOG(INFO) << " Graph has " << num_nodes << " nodes:"; + for (size_t i = 0; i != num_nodes; ++i) { + LOG(INFO) << " node " << i << " = " + << CudaGraphNodeToString(graph_nodes[i]); + } + } + } + + void MakeGraphExec() { + cudaGraphExec_t cuda_graph_exec; + cudaGraphNode_t error_node; + char log_buffer[1000]; + + CUDA_CALL(cudaGraphInstantiate(&cuda_graph_exec, graph_.get(), + &error_node, log_buffer, 1000)); + graph_exec_.reset(cuda_graph_exec, CudaGraphExecDeleter()); + + // At this point we have a CUDA Graph executor + static int num_graph_creations_logged = 0; + static int max_log_entries = dmlc::GetEnv("MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES", 0); + if (num_graph_creations_logged < max_log_entries) { + num_graph_creations_logged++; + LOG(INFO) << "Created CUDA graph " << num_graph_creations_logged; + if (num_graph_creations_logged == max_log_entries) + LOG(INFO) << "Further CUDA graph creation log messages are suppressed."; + } + } + + int from_op_idx_; + int num_ops_; + using cudaGraphStruct_t = typename std::remove_pointer::type; + using cudaGraphExecStruct_t = typename std::remove_pointer::type; + std::shared_ptr graph_; + std::shared_ptr graph_exec_; +}; + +// The CudaGraph executor and associated Tempspace ptrs for which it is valid. +struct CudaGraphInfo { + std::vector cuda_graph_subseg_execs; + bool has_been_run_conventionally = false; + std::vector tempspace_dptrs; +}; +// A CUDA graph is maintained for every combination of cudaStream_t (i.e. GPU Worker) and +// the state of the is_train flag of the OpContext. If the tempspace_dptrs change, we +// don't expect to ever see the old tempspace_dptrs config again, so we discard the CUDA graph. +struct CudaGraphCacheKey { + cudaStream_t cu_s; + bool is_train; + // overload '<' so CudaGraphCacheKey can be used as a std::map key + bool operator<(const CudaGraphCacheKey &other) const { + return cu_s < other.cu_s || (cu_s == other.cu_s && is_train < other.is_train); + } +}; +using CudaGraphCache = std::map; + +class CudaGraphsExec { + public: + CudaGraphsExec(const std::vector > &exec_list, + bool is_gpu, + const char *opr_names) : + verbose_(false), is_enabled_(false) { + opr_names_ = opr_names ? 
std::string(opr_names) : std::string(); + if (is_gpu) { + is_enabled_ = dmlc::GetEnv("MXNET_ENABLE_CUDA_GRAPHS", false); + verbose_ = dmlc::GetEnv("MXNET_CUDA_GRAPHS_VERBOSE", false); + SetTempSpaces(exec_list); + } + } + + void RunAll(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu) { + // If this a CPU op or CUDA Graphs use isn't possible, run normally and return + if (!is_gpu || !is_enabled_) { + // Run all opr in the sub-graph + exec::OpExecutor::RunAll(exec_list, rctx, is_gpu); + return; + } + + // Also if we're in a warm-up period where tempspace pointers are likely + // to change, run normally and return + auto s = rctx.get_stream(); + const cudaStream_t cu_s = mshadow::Stream::GetStream(s); + // All the ops in the bulked segment will have the same setting of is_train as the first op + const bool is_train = exec_list.size() > 0 && exec_list[0]->op_ctx.is_train; + const CudaGraphCacheKey key = {cu_s, is_train}; + // Look-up the CUDA Graph info for this combo of stream and is_train setting + // This may create a default-initialized new entry. + auto &cuda_graph_info = cache_[key]; + if (!cuda_graph_info.has_been_run_conventionally) { + // Run all opr in the sub-graph + exec::OpExecutor::RunAll(exec_list, rctx, is_gpu); + cuda_graph_info.has_been_run_conventionally = true; + return; + } + + // At this point we will launch one or more CUDA Graphs through CUDA Graphs 'executors' + // (there might be more than one executor if some ops in the segment are not capturable) + auto before_exec_tempspace_ptrs = GetGPUTempspacePtrs(s); + + // Executors exist, but the tempspace pts have changed, so update them in-place via 'recapture'. + if (cuda_graph_info.cuda_graph_subseg_execs.size() > 0 && + cuda_graph_info.tempspace_dptrs != before_exec_tempspace_ptrs) { + // Update all runnable executors. Non-runnable executors launch their ops conventionally. + for (auto &subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) { + if (subseg_exec.IsRunnable()) + subseg_exec.Update(exec_list, rctx, is_gpu, verbose_); + } + } else if (cuda_graph_info.cuda_graph_subseg_execs.size() == 0) { + // No executors exist yet, so create them. + if (verbose_) + LOG(INFO) << "Capturing CUDA graph of op segment " << opr_names_; + // Make one or more CUDA Graphs, avoiding ops that are not compatible. + for (size_t first_op_idx = 0; first_op_idx != exec_list.size();) { + int num_good_ops = 0; + for (size_t last_op_idx = first_op_idx; last_op_idx != exec_list.size(); ++last_op_idx) { + if (OpOK(exec_list[last_op_idx])) + num_good_ops++; + else + break; + } + if (num_good_ops > 0) { + CreateSubExecOverRegion(exec_list, rctx, is_gpu, + first_op_idx, + first_op_idx + num_good_ops, + &cuda_graph_info.cuda_graph_subseg_execs); + first_op_idx += num_good_ops; + } + if (first_op_idx != exec_list.size()) { + // We had to have hit an op that was not OK. + if (verbose_) { + LOG(INFO) << "Bypassing notOK op segment[" << first_op_idx << "," << first_op_idx << "]" + << " of op segment " << opr_names_; + } + CudaGraphsSubSegExec notOK_opseg(exec_list, rctx, is_gpu, false, first_op_idx, 1, false); + cuda_graph_info.cuda_graph_subseg_execs.push_back(notOK_opseg); + first_op_idx++; + } + } + // During graph capture, the ops may be asking for the tempworkspace. This should + // not alter the base pointers, since this op seg has been executed before on this + // stream (i.e. on this gpu worker). Safest to double-check this though. 
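+      // If the base pointers had moved, the captured graphs would reference stale addresses
+      // when replayed, hence the hard failure below rather than risking silent corruption.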
+ auto after_capture_tempspace_ptrs = GetGPUTempspacePtrs(s); + if (before_exec_tempspace_ptrs != after_capture_tempspace_ptrs) + LOG(FATAL) << "Internal error: saw change in TempSpace ptrs during CUDA graph use."; + cuda_graph_info.tempspace_dptrs = before_exec_tempspace_ptrs; + } + // Now execute the CUDA Graph that we either just created or looked-up in the cache. + if (verbose_) { + int runnable_execs = 0; + int bypassed_ops = 0; + for (auto &subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) { + if (subseg_exec.IsRunnable()) + runnable_execs++; + else + bypassed_ops++; + } + LOG(INFO) << "Launching " << runnable_execs + << " captured CUDA graph(s) for op segment " << opr_names_; + if (bypassed_ops > 0) + LOG(INFO) << " (bypassing " << bypassed_ops << " un-capturable ops)"; + } + for (auto &subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) + subseg_exec.RunSubSeg(exec_list, rctx, is_gpu); + } + + private: + // Make a CUDA Graph of the region of ops [from_op_idx, upto_op_idx). If such a graph + // is not runnable, e.g. if it includes memcpys from unpinned cpu memory, then make a + // number of smaller graphs that avoid those ops with the memcpys. + void CreateSubExecOverRegion(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu, + size_t from_op_idx, + size_t upto_op_idx, + std::vector *cuda_graph_subseg_execs) { + // Optimistically try to create a CUDA Graph of the entire op segment region + + int num_ops = upto_op_idx - from_op_idx; + CudaGraphsSubSegExec full_opseg(exec_list, rctx, is_gpu, verbose_, from_op_idx, num_ops); + if (full_opseg.IsRunnable()) { + cuda_graph_subseg_execs->push_back(full_opseg); + } else { + if (verbose_) + LOG(INFO) << " Graph was not runnable- creating op sub-segments..."; + // Enter fall-back approach to making many sub-execs + for (size_t first_op_idx = from_op_idx; first_op_idx != upto_op_idx; ) { + int num_good_ops = 0; + for (size_t last_op_idx = first_op_idx; last_op_idx != upto_op_idx; ++last_op_idx) { + CudaGraphsSubSegExec single_opseg(exec_list, rctx, is_gpu, false, last_op_idx, 1); + if (single_opseg.IsRunnable()) + num_good_ops++; + // Is it time to create a subseg exec from accumulated good ops? + if (num_good_ops > 0 && + (last_op_idx == upto_op_idx - 1 || !single_opseg.IsRunnable())) { + if (verbose_) + LOG(INFO) << "Capturing CUDA graph of op sub segment[" + << first_op_idx << ":" << (first_op_idx + num_good_ops - 1) << "]" + << " of op segment " << opr_names_; + CudaGraphsSubSegExec good_opseg(exec_list, rctx, is_gpu, verbose_, + first_op_idx, num_good_ops); + CHECK(good_opseg.IsRunnable()) << "Unexpected issue with CUDA Graphs creation"; + cuda_graph_subseg_execs->push_back(good_opseg); + first_op_idx += num_good_ops; + } + // If the last single op was not runnable, use the exec to handle that op conventionally + if (!single_opseg.IsRunnable()) { + if (verbose_) { + LOG(INFO) << "Bypassing op sub segment[" << last_op_idx << "," << last_op_idx << "]" + << " of op segment " << opr_names_; + // Generate throw-away exec in order to produce a diagnostic listing of graph nodes + CudaGraphsSubSegExec dummy(exec_list, rctx, is_gpu, verbose_, last_op_idx, 1); + } + cuda_graph_subseg_execs->push_back(single_opseg); + first_op_idx++; + break; + } + } + } + } + } + + // Is the Op OK to make part of a CUDA Graph? 
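+  // In this initial version, an op qualifies only if every resource it requests is a
+  // TempSpace (or a cuDNN dropout descriptor when MXNET_CUDA_GRAPHS_ALLOW_DROPOUT is set);
+  // any other resource type is conservatively treated as incompatible with graph capture.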
+ bool OpOK(const std::shared_ptr &exec) { + static bool allow_dropout_resource = + dmlc::GetEnv("MXNET_CUDA_GRAPHS_ALLOW_DROPOUT", false); + for (auto& resource : exec->op_ctx.requested) { + if (!(resource.req.type == ResourceRequest::kTempSpace || + (resource.req.type == ResourceRequest::kCuDNNDropoutDesc && allow_dropout_resource))) { + return false; + } + } + return true; + } + + // Determine Tempspaces used by ops. Other resource uses disable CUDA Graphs. + void SetTempSpaces(const std::vector > &exec_list) { + // Gather info about the ops use of TempSpace. + if (is_enabled_) { + std::set tempspaces_set; + for (auto& exec : exec_list) { + for (auto& resource : exec->op_ctx.requested) { + if (resource.req.type == ResourceRequest::kTempSpace) { + tempspaces_set.insert(&resource); + } + } + } + tempspaces_.assign(tempspaces_set.begin(), tempspaces_set.end()); + } + } + + // Return the addresses of the gpu TempSpace areas + std::vector GetGPUTempspacePtrs(mshadow::Stream *s) { + std::vector ret; + for (const auto& resource : tempspaces_) { + // Ask for minimal allocation to get base pointer without increasing the size + auto *base_ptr = resource->get_space_typed(mshadow::Shape1(1), s).dptr_; + ret.push_back(static_cast(base_ptr)); + } + return ret; + } + + CudaGraphCache cache_; + std::vector tempspaces_; + std::string opr_names_; + bool verbose_; + bool is_enabled_; +}; + +#endif // CUDA_GRAPHS_AVAILABLE + + +} // namespace cuda_graphs +} // namespace mxnet +#endif // MXNET_EXECUTOR_CUDA_GRAPHS_H_ diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index e3d2fa459bc3..b5d3b2471b8d 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -100,6 +100,16 @@ class OpExecutor { * \param rctx The runtime context passed in by environment. */ virtual void Run(RunContext rctx, bool is_gpu) = 0; + /*! + * \brief run the operators of a vector of execs, given runtime context on device. + * This function call does not synchronize the stream. + * \param rctx The runtime context passed in by environment. + */ + static void RunAll(const std::vector > &execs, + RunContext rctx, bool is_gpu) { + for (auto &exec : execs) + exec->Run(rctx, is_gpu); + } /*! \return the execution type */ virtual ExecType exec_type() const = 0; /*! 
\return return engine variable for operator states */ diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 13bab2e544bf..c7febc4712f1 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -31,6 +31,7 @@ #include "./exec_pass.h" #include "./graph_executor.h" +#include "./cuda_graphs.h" #include "../profiler/profiler.h" #include "../common/utils.h" #include "../common/exec_utils.h" @@ -84,7 +85,8 @@ void GraphExecutor::Forward(bool is_train) { void GraphExecutor::PartialForward(bool is_train, int step, int *step_left) { size_t sstep = static_cast(step); if (sstep >= num_forward_nodes_) { - *step_left = 0; return; + *step_left = 0; + return; } RunOps(is_train, sstep, sstep + 1); *step_left = static_cast(num_forward_nodes_ - sstep - 1); @@ -166,11 +168,12 @@ void GraphExecutor::Backward(const std::vector& head_grads, bool is_tra } void GraphExecutor::Print(std::ostream &os) const { // NOLINT(*) - nnvm::Symbol s; s.outputs = graph_.outputs; + nnvm::Symbol s; + s.outputs = graph_.outputs; s.Print(os); // message to be backward compatible with the memonger size_t total_bytes = graph_.GetAttr("storage_allocated_bytes"); - os << "Total " << (total_bytes >> 20UL) <<" MB allocated\n"; + os << "Total " << (total_bytes >> 20UL) << " MB allocated\n"; os << "Total " << 11 << " TempSpace resource requested\n"; } @@ -337,7 +340,8 @@ nnvm::Graph GraphExecutor::InitFullGraph(nnvm::Symbol symbol, g = exec::EliminateCommonExpr(std::move(g)); need_grad_ = false; for (OpReqType req : grad_req_types) { - if (req != kNullOp) need_grad_ = true; + if (req != kNullOp) + need_grad_ = true; } if (!need_grad_) return g; for (size_t i = 0; i < g.outputs.size(); ++i) { @@ -1312,12 +1316,12 @@ void GraphExecutor::InitCachedOps() { // call on complete only if it is async op if (!is_async) { if (is_gpu) { - #if MXNET_USE_CUDA +#if MXNET_USE_CUDA // Wait GPU kernel to finish. ctx.get_stream()->Wait(); - #else +#else LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; - #endif +#endif } on_complete(); } @@ -1607,21 +1611,31 @@ GraphExecutor::CachedSegOpr GraphExecutor::CreateCachedSegOpr(size_t topo_start, opr_names += inode.source->op()->name + ","; } - if (pctx == nullptr) return ret; + if (pctx == nullptr) + return ret; ret.ctx = *pctx; Engine::Get()->DeduplicateVarHandle(&use_vars, &mutate_vars); bool is_gpu = pctx->dev_mask() == gpu::kDevMask; + +#if CUDA_GRAPHS_AVAILABLE + // Provide initialized `cuda_graphs_exec`, which when captured + // by exec_fun, acts like a static variable inside the mutable closure. + cuda_graphs::CudaGraphsExec cuda_graphs_exec(exec_list, is_gpu, opr_names.c_str()); + auto exec_fun = [cuda_graphs_exec, exec_list, is_gpu] ( + RunContext rctx, Engine::CallbackOnComplete on_complete) mutable { + // Run all opr in the sub-graph with CUDA graphs executor if possible + cuda_graphs_exec.RunAll(exec_list, rctx, is_gpu); +#else auto exec_fun = [exec_list, is_gpu] ( - RunContext ctx, Engine::CallbackOnComplete on_complete) { + RunContext rctx, Engine::CallbackOnComplete on_complete) { // Run all opr in the sub-graph - for (auto &exec : exec_list) { - exec->Run(ctx, is_gpu); - } + OpExecutor::RunAll(exec_list, rctx, is_gpu); +#endif if (is_gpu) { #if MXNET_USE_CUDA // Wait GPU kernel to finish. 
- ctx.get_stream()->Wait(); + rctx.get_stream()->Wait(); #else LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; #endif diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index 156013857d6a..4933808ad892 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -26,6 +26,7 @@ #include #include #include "../executor/graph_executor.h" +#include "../executor/cuda_graphs.h" #include "../executor/exec_pass.h" #include "../c_api/c_api_common.h" #include "../common/utils.h" @@ -985,12 +986,26 @@ inline Engine::OprHandle CreateEngineOp( bool is_gpu = default_ctx.dev_mask() == gpu::kDevMask; bool is_async = execs.size() > 1 ? false : execs[0]->exec_type() == ExecType::kAsync; +#if CUDA_GRAPHS_AVAILABLE + // Provide initialized `cuda_graphs_exec`, which when captured + // by exec_fun, acts like a static variable inside the mutable closure. + cuda_graphs::CudaGraphsExec cuda_graphs_exec(execs, is_gpu, opr_names); + auto exec_fun = [cuda_graphs_exec, execs, is_async, is_gpu] ( + RunContext ctx, Engine::CallbackOnComplete on_complete) mutable { + if (is_async) { + execs[0]->op_ctx.async_on_complete = on_complete; + } + // Run all opr in the sub-graph with CUDA graphs executor if possible + cuda_graphs_exec.RunAll(execs, ctx, is_gpu); +#else auto exec_fun = [execs, is_async, is_gpu] ( RunContext ctx, Engine::CallbackOnComplete on_complete) { if (is_async) { execs[0]->op_ctx.async_on_complete = on_complete; } for (const auto& exec : execs) exec->Run(ctx, is_gpu); + OpExecutor::RunAll(execs, ctx, is_gpu); +#endif // call on complete only if it is async op if (!is_async) { if (is_gpu) { From c15357a378a33a0fc14e1207b7d256b23f447dbe Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 11 Sep 2020 14:07:27 -0700 Subject: [PATCH 02/13] Store NodeAttrs in OpExecutor --- src/executor/attach_op_execs_pass.cc | 36 ++++++++++++++-------------- src/executor/exec_pass.h | 4 ++++ 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc index 8f47bc29db13..8a32a249f9ac 100644 --- a/src/executor/attach_op_execs_pass.cc +++ b/src/executor/attach_op_execs_pass.cc @@ -45,8 +45,9 @@ namespace exec { // FComputeExecutor and FStatefulComputeExecutor inherit from this class class StorageFallbackOpExecutor : public OpExecutor { public: - explicit StorageFallbackOpExecutor(const std::vector &mutate_idx) - : mutate_idx_(mutate_idx) {} + explicit StorageFallbackOpExecutor(const NodeAttrs& attrs, + const std::vector &mutate_idx) + : OpExecutor(attrs), mutate_idx_(mutate_idx) {} void Setup() override { init_ = false; @@ -136,11 +137,12 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor { return state_; } - explicit StatefulComputeExecutor(const OpStatePtr& state, + explicit StatefulComputeExecutor(const NodeAttrs& attrs, + const OpStatePtr& state, const FStatefulCompute& fcompute, ExecType exec_type, const std::vector &mutate_idx) - : StorageFallbackOpExecutor(mutate_idx), + : StorageFallbackOpExecutor(attrs, mutate_idx), state_(state), fcompute_(fcompute), exec_type_(exec_type) {} private: @@ -159,7 +161,7 @@ class StatefulComputeExExecutor : public OpExecutor { InvalidateOutputs(out_array, req); // TODO(alex): (MXNET-847) Remove this fallback feature after subgraph implemented const auto is_mkldnn = Op::GetAttr("TIsMKLDNN"); - if (!is_mkldnn.get(attrs_.op, false)) { + if (!is_mkldnn.get(attrs.op, false)) { CreateDefaultInputs(in_array, &in_array_fallback); 
fcompute_(state_, op_ctx, in_array_fallback, req, out_array); return; @@ -186,10 +188,9 @@ class StatefulComputeExExecutor : public OpExecutor { const OpStatePtr& state, const FStatefulComputeEx& fcompute, ExecType exec_type) - : attrs_(attrs), state_(state), fcompute_(fcompute), exec_type_(exec_type) {} + : OpExecutor(attrs), state_(state), fcompute_(fcompute), exec_type_(exec_type) {} private: - NodeAttrs attrs_; OpStatePtr state_; FStatefulComputeEx fcompute_; ExecType exec_type_; @@ -206,7 +207,7 @@ class FComputeExecutor : public StorageFallbackOpExecutor { InvalidateOutputs(out_array, req); #endif PreFCompute(is_gpu); - fcompute_(attrs_, op_ctx, in_data_, req, out_data_); + fcompute_(attrs, op_ctx, in_data_, req, out_data_); PostFCompute(is_gpu); } @@ -216,12 +217,11 @@ class FComputeExecutor : public StorageFallbackOpExecutor { explicit FComputeExecutor(const NodeAttrs& attrs, FCompute fcompute, ExecType exec_type, const std::vector &mutate_idx) - : StorageFallbackOpExecutor(mutate_idx), - attrs_(attrs), fcompute_(fcompute), exec_type_(exec_type) { + : StorageFallbackOpExecutor(attrs, mutate_idx), + fcompute_(fcompute), exec_type_(exec_type) { } private: - NodeAttrs attrs_; FCompute fcompute_; ExecType exec_type_; }; @@ -235,13 +235,13 @@ class FComputeExExecutor : public OpExecutor { InvalidateOutputs(out_array, req); // TODO(alex): (MXNET-847) Remove this fallback feature after subgraph implemented const auto is_mkldnn = Op::GetAttr("TIsMKLDNN"); - if (!is_mkldnn.get(attrs_.op, false)) { + if (!is_mkldnn.get(attrs.op, false)) { CreateDefaultInputs(in_array, &in_array_fallback); - fcompute_(attrs_, op_ctx, in_array_fallback, req, out_array); + fcompute_(attrs, op_ctx, in_array_fallback, req, out_array); return; } #endif - fcompute_(attrs_, op_ctx, in_array, req, out_array); + fcompute_(attrs, op_ctx, in_array, req, out_array); } void Setup() override {} @@ -252,11 +252,10 @@ class FComputeExExecutor : public OpExecutor { explicit FComputeExExecutor(const NodeAttrs& attrs, FComputeEx fcompute, ExecType exec_type) - : attrs_(attrs), fcompute_(fcompute), exec_type_(exec_type) { + : OpExecutor(attrs), fcompute_(fcompute), exec_type_(exec_type) { } private: - NodeAttrs attrs_; FComputeEx fcompute_; ExecType exec_type_; }; @@ -318,7 +317,8 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, CHECK(fcompute != nullptr) << "One of FStatefulCompute and FStatefulComputeEx must be registered " << "for stateful operator " << op->name; - ret[i] = std::make_shared(state, fcompute, + ret[i] = std::make_shared(inode.source->attrs, + state, fcompute, exec_type, mutate_index); } } else if (is_layer_backward.get(op, false)) { @@ -339,7 +339,7 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, CHECK(fcompute != nullptr) << "One of FStatefulCompute and FStatefulComputeEx must be registered " << "for stateful operator " << op->name; - ret[i] = std::make_shared( + ret[i] = std::make_shared(inode.source->attrs, ret[fwd_id].get()->state(), fcompute, exec_type, mutate_index); } } else { diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index b5d3b2471b8d..e364187f2f9c 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -86,6 +87,9 @@ class OpExecutor { std::vector req; /*! \brief runtime op context, contains allocated resources */ OpContext op_ctx; + /*! 
\brief attributes of the node */ + NodeAttrs attrs; + explicit OpExecutor(NodeAttrs attrs) : attrs(std::move(attrs)) {} /*! \brief virtual destructor */ virtual ~OpExecutor() {} /*! From 3945895ba8f808114462726bcf7bf25a51875a14 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 11 Sep 2020 15:14:27 -0700 Subject: [PATCH 03/13] Do not allow stateful operations in CUDA graphs and provide mechanism for marking ops as safe --- include/mxnet/op_attr_types.h | 13 +++++++++++++ src/executor/cuda_graphs.h | 17 +++++++++++++---- src/operator/nn/dropout.cu | 5 +++++ 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/include/mxnet/op_attr_types.h b/include/mxnet/op_attr_types.h index a0ac301b1818..88d21f23ea0c 100644 --- a/include/mxnet/op_attr_types.h +++ b/include/mxnet/op_attr_types.h @@ -362,6 +362,19 @@ using FNeedCalibrateInput = std::function (const NodeAttrs& att */ using FNeedCalibrateOutput = std::function (const NodeAttrs& attrs)>; +#if MXNET_USE_CUDA + +/*! + * \brief Register a function to determine if + * the operator implementation is compatible + * with CUDA graphs. This requires the execution + * to stay the same as long as the shape and type + * of input stays the same. + */ +using FIsCUDAGraphsCompatible = std::function; + +#endif + } // namespace mxnet #endif // MXNET_OP_ATTR_TYPES_H_ diff --git a/src/executor/cuda_graphs.h b/src/executor/cuda_graphs.h index 512efae0b73c..105e91f9bf44 100644 --- a/src/executor/cuda_graphs.h +++ b/src/executor/cuda_graphs.h @@ -476,11 +476,20 @@ class CudaGraphsExec { // Is the Op OK to make part of a CUDA Graph? bool OpOK(const std::shared_ptr &exec) { - static bool allow_dropout_resource = - dmlc::GetEnv("MXNET_CUDA_GRAPHS_ALLOW_DROPOUT", false); + static auto& fstateful = Op::GetAttr("FCreateOpState"); + static auto& fgraphcompatible = Op::GetAttr("FIsCUDAGraphsCompatible"); + const auto& attrs = exec->attrs; + if (attrs.op != nullptr) { + const auto& f = fgraphcompatible.get(attrs.op, nullptr); + if (f != nullptr) { + return f(attrs, exec->op_ctx.is_train); + } + if (fstateful.get(attrs.op, nullptr) != nullptr) { + return false; + } + } for (auto& resource : exec->op_ctx.requested) { - if (!(resource.req.type == ResourceRequest::kTempSpace || - (resource.req.type == ResourceRequest::kCuDNNDropoutDesc && allow_dropout_resource))) { + if (!resource.req.type == ResourceRequest::kTempSpace) { return false; } } diff --git a/src/operator/nn/dropout.cu b/src/operator/nn/dropout.cu index 20c5714dd904..f3a99eb431a4 100644 --- a/src/operator/nn/dropout.cu +++ b/src/operator/nn/dropout.cu @@ -30,6 +30,11 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(Dropout) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool is_train) { + // Dropout is just passthrough during inference + return !is_train; + }) .set_attr("FStatefulCompute", DropoutCompute); NNVM_REGISTER_OP(_backward_Dropout) From 599986a8dbfbf24fb4f2c698149a431884cdc3ee Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 11 Sep 2020 18:35:47 -0700 Subject: [PATCH 04/13] Guard against using ops with synchronization --- src/executor/attach_op_execs_pass.cc | 38 +++++++++++-------- src/executor/cuda_graphs.h | 21 +++++++++- src/executor/exec_pass.h | 6 ++- src/operator/contrib/adamw.cu | 16 ++++++++ src/operator/numpy/linalg/np_eig.cu | 8 ++++ src/operator/numpy/linalg/np_eigvals.cu | 8 ++++ src/operator/numpy/linalg/np_norm_backward.cu | 5 +++ src/operator/numpy/linalg/np_norm_forward.cu | 5 +++ src/operator/numpy/np_boolean_mask_assign.cu | 8 ++++ 
src/operator/numpy/np_constraint_check.cu | 4 ++ src/operator/numpy/np_percentile_op.cu | 4 ++ src/operator/numpy/random/np_bernoulli_op.cu | 4 ++ .../numpy/random/np_exponential_op.cu | 4 ++ src/operator/numpy/random/np_gamma_op.cu | 4 ++ .../numpy/random/np_multinomial_op.cu | 4 ++ src/operator/numpy/random/np_normal_op.cu | 8 ++++ src/operator/numpy/random/np_pareto_op.cu | 4 ++ src/operator/numpy/random/np_power_op.cu | 4 ++ src/operator/numpy/random/np_rayleigh_op.cu | 4 ++ src/operator/numpy/random/np_weibull_op.cu | 4 ++ src/operator/tensor/indexing_op.cu | 4 ++ 21 files changed, 149 insertions(+), 18 deletions(-) diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc index 8a32a249f9ac..ccf536920fb0 100644 --- a/src/executor/attach_op_execs_pass.cc +++ b/src/executor/attach_op_execs_pass.cc @@ -46,8 +46,9 @@ namespace exec { class StorageFallbackOpExecutor : public OpExecutor { public: explicit StorageFallbackOpExecutor(const NodeAttrs& attrs, + const DispatchMode& dispatch_mode, const std::vector &mutate_idx) - : OpExecutor(attrs), mutate_idx_(mutate_idx) {} + : OpExecutor(attrs, dispatch_mode), mutate_idx_(mutate_idx) {} void Setup() override { init_ = false; @@ -138,11 +139,12 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor { } explicit StatefulComputeExecutor(const NodeAttrs& attrs, + const DispatchMode dispatch_mode, const OpStatePtr& state, const FStatefulCompute& fcompute, ExecType exec_type, const std::vector &mutate_idx) - : StorageFallbackOpExecutor(attrs, mutate_idx), + : StorageFallbackOpExecutor(attrs, dispatch_mode, mutate_idx), state_(state), fcompute_(fcompute), exec_type_(exec_type) {} private: @@ -185,10 +187,12 @@ class StatefulComputeExExecutor : public OpExecutor { } explicit StatefulComputeExExecutor(const NodeAttrs& attrs, + const DispatchMode& dispatch_mode, const OpStatePtr& state, const FStatefulComputeEx& fcompute, ExecType exec_type) - : OpExecutor(attrs), state_(state), fcompute_(fcompute), exec_type_(exec_type) {} + : OpExecutor(attrs, dispatch_mode), state_(state), fcompute_(fcompute), + exec_type_(exec_type) {} private: OpStatePtr state_; @@ -215,9 +219,10 @@ class FComputeExecutor : public StorageFallbackOpExecutor { return exec_type_; } - explicit FComputeExecutor(const NodeAttrs& attrs, FCompute fcompute, - ExecType exec_type, const std::vector &mutate_idx) - : StorageFallbackOpExecutor(attrs, mutate_idx), + explicit FComputeExecutor(const NodeAttrs& attrs, const DispatchMode dispatch_mode, + FCompute fcompute, ExecType exec_type, + const std::vector &mutate_idx) + : StorageFallbackOpExecutor(attrs, dispatch_mode, mutate_idx), fcompute_(fcompute), exec_type_(exec_type) { } @@ -250,9 +255,9 @@ class FComputeExExecutor : public OpExecutor { return exec_type_; } - explicit FComputeExExecutor(const NodeAttrs& attrs, FComputeEx fcompute, - ExecType exec_type) - : OpExecutor(attrs), fcompute_(fcompute), exec_type_(exec_type) { + explicit FComputeExExecutor(const NodeAttrs& attrs, const DispatchMode dispatch_mode, + FComputeEx fcompute, ExecType exec_type) + : OpExecutor(attrs, dispatch_mode), fcompute_(fcompute), exec_type_(exec_type) { } private: @@ -309,7 +314,8 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, op, "FStatefulComputeEx", vctx[i]); // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) { - ret[i] = std::make_shared(inode.source->attrs, 
state, + ret[i] = std::make_shared(inode.source->attrs, + dispatch_modes[i], state, fcompute_ex, exec_type); } else { FStatefulCompute fcompute = common::GetFCompute( @@ -318,6 +324,7 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, << "One of FStatefulCompute and FStatefulComputeEx must be registered " << "for stateful operator " << op->name; ret[i] = std::make_shared(inode.source->attrs, + dispatch_modes[i], state, fcompute, exec_type, mutate_index); } @@ -331,8 +338,8 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) { ret[i] = std::make_shared( - inode.source->attrs, ret[fwd_id].get()->state(), fcompute_ex, - exec_type); + inode.source->attrs, dispatch_modes[i], ret[fwd_id].get()->state(), + fcompute_ex, exec_type); } else { FStatefulCompute fcompute = common::GetFCompute( op, "FStatefulCompute", vctx[i]); @@ -340,17 +347,18 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, << "One of FStatefulCompute and FStatefulComputeEx must be registered " << "for stateful operator " << op->name; ret[i] = std::make_shared(inode.source->attrs, - ret[fwd_id].get()->state(), fcompute, exec_type, mutate_index); + dispatch_modes[i], ret[fwd_id].get()->state(), fcompute, exec_type, + mutate_index); } } else { FCompute fcompute = common::GetFCompute(op, "FCompute", vctx[i]); FComputeEx fcomp_ex = common::GetFCompute(op, "FComputeEx", vctx[i]); if (fcomp_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) { ret[i] = std::make_shared( - inode.source->attrs, fcomp_ex, exec_type); + inode.source->attrs, dispatch_modes[i], fcomp_ex, exec_type); } else if (fcompute != nullptr) { ret[i] = std::make_shared( - inode.source->attrs, fcompute, exec_type, mutate_index); + inode.source->attrs, dispatch_modes[i], fcompute, exec_type, mutate_index); } else { LOG(INFO) << "Neither FCompute nor FComputeEx registered " << op->name; } diff --git a/src/executor/cuda_graphs.h b/src/executor/cuda_graphs.h index 105e91f9bf44..482d6f9bfde4 100644 --- a/src/executor/cuda_graphs.h +++ b/src/executor/cuda_graphs.h @@ -480,16 +480,33 @@ class CudaGraphsExec { static auto& fgraphcompatible = Op::GetAttr("FIsCUDAGraphsCompatible"); const auto& attrs = exec->attrs; if (attrs.op != nullptr) { - const auto& f = fgraphcompatible.get(attrs.op, nullptr); + const auto f = fgraphcompatible.get(attrs.op, nullptr); if (f != nullptr) { return f(attrs, exec->op_ctx.is_train); } if (fstateful.get(attrs.op, nullptr) != nullptr) { + if (verbose_) { + LOG(INFO) << "Omitting stateful operator" << attrs.op->name << " from CUDA graph."; + } + return false; + } + if (exec->dispatch_mode == DispatchMode::kFComputeEx || + exec->dispatch_mode == DispatchMode::kFComputeFallback) { + if (verbose_) { + LOG(INFO) << "Omitting operator" << attrs.op->name + << " from CUDA graph due to dispatch mode " + << static_cast(exec->dispatch_mode); + } return false; } } for (auto& resource : exec->op_ctx.requested) { - if (!resource.req.type == ResourceRequest::kTempSpace) { + if (!(resource.req.type == ResourceRequest::kTempSpace)) { + if (verbose_) { + LOG(INFO) << "Omitting operator" << attrs.op->name + << " from CUDA graph due to using the resource type " + << static_cast(resource.req.type); + } return false; } } diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index 
e364187f2f9c..4552fa173fe4 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -89,7 +89,11 @@ class OpExecutor { OpContext op_ctx; /*! \brief attributes of the node */ NodeAttrs attrs; - explicit OpExecutor(NodeAttrs attrs) : attrs(std::move(attrs)) {} + /*! \brief dispatch mode of the executor */ + DispatchMode dispatch_mode; + + explicit OpExecutor(NodeAttrs attrs, DispatchMode dispatch_mode) : + attrs(std::move(attrs)), dispatch_mode(dispatch_mode) {} /*! \brief virtual destructor */ virtual ~OpExecutor() {} /*! diff --git a/src/operator/contrib/adamw.cu b/src/operator/contrib/adamw.cu index 2b0040e5f6ac..1a81ba0fa99e 100644 --- a/src/operator/contrib/adamw.cu +++ b/src/operator/contrib/adamw.cu @@ -41,15 +41,31 @@ void GetScaleFloat(mshadow::Stream *s, const TBlob &scale_blob, float } NNVM_REGISTER_OP(_adamw_update) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", MPUpdate>); NNVM_REGISTER_OP(_mp_adamw_update) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", MPUpdate>); NNVM_REGISTER_OP(_multi_adamw_update) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", multiMPUpdate); NNVM_REGISTER_OP(_multi_mp_adamw_update) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", multiMPUpdate); } // namespace op diff --git a/src/operator/numpy/linalg/np_eig.cu b/src/operator/numpy/linalg/np_eig.cu index c0184ad221d5..ce4d5a173ced 100644 --- a/src/operator/numpy/linalg/np_eig.cu +++ b/src/operator/numpy/linalg/np_eig.cu @@ -29,11 +29,19 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_eig) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", EigOpForward); #if MXNET_USE_CUSOLVER == 1 NNVM_REGISTER_OP(_npi_eigh) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", EighOpForward); #endif diff --git a/src/operator/numpy/linalg/np_eigvals.cu b/src/operator/numpy/linalg/np_eigvals.cu index 974dedc6172e..0b21ffe24262 100644 --- a/src/operator/numpy/linalg/np_eigvals.cu +++ b/src/operator/numpy/linalg/np_eigvals.cu @@ -29,11 +29,19 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_eigvals) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", EigvalsOpForward); #if MXNET_USE_CUSOLVER == 1 NNVM_REGISTER_OP(_npi_eigvalsh) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", EigvalshOpForward); #endif diff --git a/src/operator/numpy/linalg/np_norm_backward.cu b/src/operator/numpy/linalg/np_norm_backward.cu index 09e85ab36f19..1936cf937419 100644 --- a/src/operator/numpy/linalg/np_norm_backward.cu +++ b/src/operator/numpy/linalg/np_norm_backward.cu @@ -27,6 +27,11 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_backward_npi_norm) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { + const NumpyNormParam& param = nnvm::get(attrs.parsed); + return param.axis.value().ndim() == 2; + }) .set_attr("FCompute", NumpyNormComputeBackward); } // namespace op diff --git a/src/operator/numpy/linalg/np_norm_forward.cu b/src/operator/numpy/linalg/np_norm_forward.cu index 6feecb09a09e..7f8673cd0ec9 100644 --- 
a/src/operator/numpy/linalg/np_norm_forward.cu +++ b/src/operator/numpy/linalg/np_norm_forward.cu @@ -27,6 +27,11 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_norm) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { + const NumpyNormParam& param = nnvm::get(attrs.parsed); + return param.axis.value().ndim() == 2; + }) .set_attr("FCompute", NumpyNormComputeForward); } // namespace op diff --git a/src/operator/numpy/np_boolean_mask_assign.cu b/src/operator/numpy/np_boolean_mask_assign.cu index 6fa59bea7710..909a7de63788 100644 --- a/src/operator/numpy/np_boolean_mask_assign.cu +++ b/src/operator/numpy/np_boolean_mask_assign.cu @@ -263,9 +263,17 @@ void NumpyBooleanAssignForwardGPU(const nnvm::NodeAttrs& attrs, } NNVM_REGISTER_OP(_npi_boolean_mask_assign_scalar) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyBooleanAssignForwardGPU); NNVM_REGISTER_OP(_npi_boolean_mask_assign_tensor) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyBooleanAssignForwardGPU); } // namespace op diff --git a/src/operator/numpy/np_constraint_check.cu b/src/operator/numpy/np_constraint_check.cu index f83fca0e5c33..60260affdef3 100644 --- a/src/operator/numpy/np_constraint_check.cu +++ b/src/operator/numpy/np_constraint_check.cu @@ -39,6 +39,10 @@ void GetReduceOutput(mshadow::Stream *s, const TBlob &output_blob, boo } NNVM_REGISTER_OP(_npx_constraint_check) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", ConstraintCheckForward); } // namespace op diff --git a/src/operator/numpy/np_percentile_op.cu b/src/operator/numpy/np_percentile_op.cu index b24c230d6216..09cdd2e30796 100644 --- a/src/operator/numpy/np_percentile_op.cu +++ b/src/operator/numpy/np_percentile_op.cu @@ -48,6 +48,10 @@ bool CheckInvalidInput(mshadow::Stream *s, const QType *data, } NNVM_REGISTER_OP(_npi_percentile) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyPercentileForward); } // namespace op diff --git a/src/operator/numpy/random/np_bernoulli_op.cu b/src/operator/numpy/random/np_bernoulli_op.cu index a73bf9929db3..5ac005f85db6 100644 --- a/src/operator/numpy/random/np_bernoulli_op.cu +++ b/src/operator/numpy/random/np_bernoulli_op.cu @@ -29,6 +29,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_bernoulli) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyBernoulliForward); } // namespace op diff --git a/src/operator/numpy/random/np_exponential_op.cu b/src/operator/numpy/random/np_exponential_op.cu index 4740b644a60c..4161a4bf7826 100644 --- a/src/operator/numpy/random/np_exponential_op.cu +++ b/src/operator/numpy/random/np_exponential_op.cu @@ -29,6 +29,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_exponential) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyExponentialForward); NNVM_REGISTER_OP(_backward_broadcast_exponential) diff --git a/src/operator/numpy/random/np_gamma_op.cu b/src/operator/numpy/random/np_gamma_op.cu index 5be15c7b9d13..8c3bca0ca44f 100644 --- a/src/operator/numpy/random/np_gamma_op.cu +++ b/src/operator/numpy/random/np_gamma_op.cu @@ -30,6 +30,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_gamma) 
+.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyGammaForward); } // namespace op diff --git a/src/operator/numpy/random/np_multinomial_op.cu b/src/operator/numpy/random/np_multinomial_op.cu index 132d67beeb6a..23de9a72bde6 100644 --- a/src/operator/numpy/random/np_multinomial_op.cu +++ b/src/operator/numpy/random/np_multinomial_op.cu @@ -43,6 +43,10 @@ void CheckPvalGPU(const OpContext& ctx, DType* input, int prob_length) { } NNVM_REGISTER_OP(_npi_multinomial) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyMultinomialForward); } // namespace op diff --git a/src/operator/numpy/random/np_normal_op.cu b/src/operator/numpy/random/np_normal_op.cu index d45bc2321bd7..dc5b73254b08 100644 --- a/src/operator/numpy/random/np_normal_op.cu +++ b/src/operator/numpy/random/np_normal_op.cu @@ -29,12 +29,20 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_normal) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyNormalForward); NNVM_REGISTER_OP(_backward_broadcast_normal) .set_attr("FCompute", NormalReparamBackward); NNVM_REGISTER_OP(_npi_normal_n) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyNormalForward); } // namespace op diff --git a/src/operator/numpy/random/np_pareto_op.cu b/src/operator/numpy/random/np_pareto_op.cu index d8a8a896e653..eecc2a8d44e8 100644 --- a/src/operator/numpy/random/np_pareto_op.cu +++ b/src/operator/numpy/random/np_pareto_op.cu @@ -29,6 +29,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_pareto) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyParetoForward); NNVM_REGISTER_OP(_backward_broadcast_pareto) diff --git a/src/operator/numpy/random/np_power_op.cu b/src/operator/numpy/random/np_power_op.cu index d5067f83bb02..154ef2e18f45 100644 --- a/src/operator/numpy/random/np_power_op.cu +++ b/src/operator/numpy/random/np_power_op.cu @@ -29,6 +29,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_powerd) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyPowerForward); } // namespace op diff --git a/src/operator/numpy/random/np_rayleigh_op.cu b/src/operator/numpy/random/np_rayleigh_op.cu index d35ac073fc4d..4eb6e390a922 100644 --- a/src/operator/numpy/random/np_rayleigh_op.cu +++ b/src/operator/numpy/random/np_rayleigh_op.cu @@ -29,6 +29,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_rayleigh) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyRayleighForward); NNVM_REGISTER_OP(_backward_broadcast_rayleigh) diff --git a/src/operator/numpy/random/np_weibull_op.cu b/src/operator/numpy/random/np_weibull_op.cu index 57d609d62768..645c8c4366d3 100644 --- a/src/operator/numpy/random/np_weibull_op.cu +++ b/src/operator/numpy/random/np_weibull_op.cu @@ -29,6 +29,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_weibull) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyWeibullForward); NNVM_REGISTER_OP(_backward_broadcast_weibull) diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu index 6904656b304b..44be673e8dcd 
100644 --- a/src/operator/tensor/indexing_op.cu +++ b/src/operator/tensor/indexing_op.cu @@ -877,6 +877,10 @@ NNVM_REGISTER_OP(one_hot) .set_attr("FCompute", OneHotOpForward); NNVM_REGISTER_OP(gather_nd) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", GatherNDForwardGPU); NNVM_REGISTER_OP(scatter_nd) From f7aa6545b6ef161ed43ace557bcfd1d945df659f Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 14 Sep 2020 13:12:53 -0700 Subject: [PATCH 05/13] Cleaning --- src/executor/cuda_graphs.h | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/src/executor/cuda_graphs.h b/src/executor/cuda_graphs.h index 482d6f9bfde4..704100bfcd7d 100644 --- a/src/executor/cuda_graphs.h +++ b/src/executor/cuda_graphs.h @@ -56,20 +56,11 @@ inline std::string CudaDim3ToString(const dim3& dims) { return ss.str(); } -#define CU_CALL(func) \ - { \ - CUresult e = (func); \ - if (e != CUDA_SUCCESS) { \ - std::cerr << "cuda driver failure code: " << e << std::endl; \ - exit(1); \ - } \ - } - // Get the type of a CUDA Graph node (e.g. kernel launch, memcpy, etc.) inline CUgraphNodeType CudaGraphNodeType(const cudaGraphNode_t node) { CUgraphNode cu_node = node; CUgraphNodeType t; - CU_CALL(cuGraphNodeGetType(cu_node, &t)); + CUDA_DRIVER_CALL(cuGraphNodeGetType(cu_node, &t)); return t; } @@ -92,7 +83,7 @@ inline std::string CudaGraphNodeToString(const cudaGraphNode_t node) { CUgraphNode cu_node = node; CUgraphNodeType t; - CU_CALL(cuGraphNodeGetType(cu_node, &t)); + CUDA_DRIVER_CALL(cuGraphNodeGetType(cu_node, &t)); switch (t) { case CU_GRAPH_NODE_TYPE_KERNEL: { From 5d507107f6732a78f3dd2e93261a4e4e69c43229 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 14 Sep 2020 14:31:38 -0700 Subject: [PATCH 06/13] Properly guard graphs --- src/executor/cuda_graphs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/executor/cuda_graphs.h b/src/executor/cuda_graphs.h index 704100bfcd7d..be5ea111e3c9 100644 --- a/src/executor/cuda_graphs.h +++ b/src/executor/cuda_graphs.h @@ -538,9 +538,9 @@ class CudaGraphsExec { bool is_enabled_; }; -#endif // CUDA_GRAPHS_AVAILABLE - - } // namespace cuda_graphs } // namespace mxnet + +#endif // CUDA_GRAPHS_AVAILABLE + #endif // MXNET_EXECUTOR_CUDA_GRAPHS_H_ From bb8740e064fe228325cc2d0078b861f5bfdf55bb Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 14 Sep 2020 15:12:47 -0700 Subject: [PATCH 07/13] Limit graphs to CUDA 10.2+ --- src/executor/cuda_graphs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/executor/cuda_graphs.h b/src/executor/cuda_graphs.h index be5ea111e3c9..d2e1ba659c2f 100644 --- a/src/executor/cuda_graphs.h +++ b/src/executor/cuda_graphs.h @@ -35,7 +35,7 @@ #include "../common/cuda_utils.h" #if MXNET_USE_CUDA -#define CUDA_GRAPHS_AVAILABLE (CUDA_VERSION >= 10010) +#define CUDA_GRAPHS_AVAILABLE (CUDA_VERSION >= 10020) #else #define CUDA_GRAPHS_AVAILABLE (0) #endif From 1dcb30e8f14740622441dff71e675795283df002 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 15 Sep 2020 09:27:16 -0700 Subject: [PATCH 08/13] Fix the compilation when graphs are not available --- src/imperative/imperative_utils.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index 4933808ad892..498e868a2b39 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -1003,8 +1003,7 @@ inline Engine::OprHandle CreateEngineOp( if 
(is_async) { execs[0]->op_ctx.async_on_complete = on_complete; } - for (const auto& exec : execs) exec->Run(ctx, is_gpu); - OpExecutor::RunAll(execs, ctx, is_gpu); + exec::OpExecutor::RunAll(execs, ctx, is_gpu); #endif // call on complete only if it is async op if (!is_async) { From 3746543dd814d21b56f964a0bf3f93cb5624e56a Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 15 Sep 2020 11:54:57 -0700 Subject: [PATCH 09/13] Guarding the libcuda.so usage behind RTC compilation flag --- src/executor/cuda_graphs.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/executor/cuda_graphs.h b/src/executor/cuda_graphs.h index d2e1ba659c2f..cd39ce024903 100644 --- a/src/executor/cuda_graphs.h +++ b/src/executor/cuda_graphs.h @@ -56,14 +56,6 @@ inline std::string CudaDim3ToString(const dim3& dims) { return ss.str(); } -// Get the type of a CUDA Graph node (e.g. kernel launch, memcpy, etc.) -inline CUgraphNodeType CudaGraphNodeType(const cudaGraphNode_t node) { - CUgraphNode cu_node = node; - CUgraphNodeType t; - CUDA_DRIVER_CALL(cuGraphNodeGetType(cu_node, &t)); - return t; -} - // Return the list of CUDA Graph nodes from a graph inline std::vector GetCudaGraphNodes(cudaGraph_t cuda_graph) { size_t numNodes; @@ -75,6 +67,10 @@ inline std::vector GetCudaGraphNodes(cudaGraph_t cuda_graph) { return graphNodes; } +// It does not really involve RTC, but requires libcuda.so, +// which is linked only when RTC is enabled. +#if MXNET_ENABLE_CUDA_RTC + inline std::string CudaGraphNodeToString(const cudaGraphNode_t node) { std::stringstream ss; @@ -145,6 +141,7 @@ inline std::string CudaGraphNodeToString(const cudaGraphNode_t node) { return ss.str(); } +#endif // MXNET_ENABLE_CUDA_RTC // CUDA Graphs are managed in RAII fashion by smart pointers below. // Function objects (preferred for readability) provide the deleter function. @@ -244,10 +241,12 @@ class CudaGraphsSubSegExec { std::vector graph_nodes = GetCudaGraphNodes(cuda_graph); size_t num_nodes = graph_nodes.size(); LOG(INFO) << " Graph has " << num_nodes << " nodes:"; +#if MXNET_ENABLE_CUDA_RTC for (size_t i = 0; i != num_nodes; ++i) { LOG(INFO) << " node " << i << " = " << CudaGraphNodeToString(graph_nodes[i]); } +#endif // MXNET_ENABLE_CUDA_RTC } } From a925cebc9b96cd85e7a61e98fb1f123d8f8f74a4 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 15 Sep 2020 20:37:52 -0700 Subject: [PATCH 10/13] Document the env variables --- docs/static_site/src/pages/api/faq/env_var.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/static_site/src/pages/api/faq/env_var.md b/docs/static_site/src/pages/api/faq/env_var.md index e0b70a658b62..831f7ee3e043 100644 --- a/docs/static_site/src/pages/api/faq/env_var.md +++ b/docs/static_site/src/pages/api/faq/env_var.md @@ -134,6 +134,16 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 * MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_BWD - Values: Int ```(default=)``` - The maximum number of nodes in the subgraph executed in bulk during training (not inference) in the backward pass. +* MXNET_ENABLE_CUDA_GRAPHS + - Values: 0(false) or 1(true) ```(default=0)``` + - If set to `1`, MXNet will utilize CUDA graphs when executing models on the GPU when possible. + - For CUDA graphs execution, one needs to use either symbolic model or Gluon model hybridized with options `static_alloc` and `static_shape` set to True. 
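+  - Example (illustrative, with a hypothetical `train.py` script): set `MXNET_ENABLE_CUDA_GRAPHS=1` in the environment, call `net.hybridize(static_alloc=True, static_shape=True)` on the Gluon model, and run the usual loop; the first execution of each op segment runs conventionally and capture happens on a later one.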
+* MXNET_CUDA_GRAPHS_VERBOSE + - Values: 0(false) or 1(true) ```(default=0)``` + - If set to `1`, CUDA graphs executor will provide information about the graph being captured and executed. +* MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES + - Values: Int ```(default=0)``` + - The maximum number of log messages generated by CUDA graphs executor. ## Control the Data Communication From ae4ef9a7ababcacf6050b51482c65454b877900e Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 17 Sep 2020 13:49:00 -0700 Subject: [PATCH 11/13] Add test --- tests/python/gpu/test_gluon_gpu.py | 74 +++++++++++++++++++++++++++++- 1 file changed, 73 insertions(+), 1 deletion(-) diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py index 42a2424c7d9b..54d912eb60b1 100644 --- a/tests/python/gpu/test_gluon_gpu.py +++ b/tests/python/gpu/test_gluon_gpu.py @@ -22,7 +22,7 @@ import time import mxnet as mx import multiprocessing as mp -from mxnet.test_utils import check_consistency, set_default_context, assert_almost_equal, rand_ndarray +from mxnet.test_utils import check_consistency, set_default_context, assert_almost_equal, rand_ndarray, rand_shape_nd import mxnet.ndarray as nd import numpy as np import math @@ -635,6 +635,78 @@ def test_gemms_true_fp16(): atol=atol, rtol=rtol) os.environ["MXNET_FC_TRUE_FP16"] = "0" +@with_seed() +def test_cuda_graphs(): + class GraphTester(gluon.HybridBlock): + def __init__(self, function_to_test, **kwargs) + super(GraphTester, self).__init__(**kwargs) + with self.name_scope(): + self.f = function_to_test() + + def hybrid_forward(self, F, *args): + # We need to isolate the operation to be fully inside the graph + # in order for graphs usage to be possible + copied_args = [F.identity(a) for a in args] + outputs = self.f(copied_args) + if isinstance(outputs, (list, tuple)): + return [F.identity(o) for o in outputs] + else: + return F.identity(outputs) + + class TestDesc: + def __init__(self, name, f, num_inputs=1, input_dim=4): + self.name = name + self.f = f + self.num_inputs = num_inputs + self.input_dim = input_dim + + def generate_inputs(self): + shape = rand_shape_nd(self.input_dim) + return [mx.random.uniform(shape=shape) for _ in num_inputs] + + tested_ops = [ + TestDesc('add', lambda: (lambda x, y: x + y), num_inputs = 2), + TestDesc('add_scalar', lambda: (lambda x: x + 0.5)), + TestDesc('Conv', lambda: mx.gluon.nn.Conv2D(channels=32, kernel_size=(3,3))), + TestDesc('Dense', lambda: mx.gluon.nn.Dense(units=128)), + TestDesc('Activation', lambda: mx.gluon.nn.Activation(act_type='tanh')), + ] + + N = 5 + + graph_env = 'MXNET_ENABLE_CUDA_GRAPHS' + if graph_env in os.environ: + old_env_value = os.environ[graph_env] + else: + old_env_value = None + + os.environ[graph_env] = '1' + + for test_desc in tested_ops: + print("Testing ", test_desc.name) + inputs = test_desc.generate_inputs() + net = GraphTester(test_desc.f) + netg = GraphTester(test_desc.f) + + # initialize parameters + net.initialize() + netg.initialize() + + net(inputs) + + for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): + p2.set_data(p1.data()) + + netg.hybridize(static_alloc=True, static_shape=True) + + for _ in range(N): + assert_almost_equal(net(inputs), netg(inputs)) + + + if old_env_value is not None: + os.environ[graph_env] = old_env_value + else: + del(os.environ[graph_env]) if __name__ == '__main__': import nose From adae98a4c6b2ca1a723cb6ffe143a0ac3b899ba3 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 17 Sep 2020 14:09:20 -0700 Subject: [PATCH 
12/13] Fix the test --- src/executor/cuda_graphs.h | 10 ++-- tests/python/gpu/test_gluon_gpu.py | 85 ++++++++++++++++++++++-------- 2 files changed, 68 insertions(+), 27 deletions(-) diff --git a/src/executor/cuda_graphs.h b/src/executor/cuda_graphs.h index cd39ce024903..e669d7d1d2e3 100644 --- a/src/executor/cuda_graphs.h +++ b/src/executor/cuda_graphs.h @@ -468,6 +468,7 @@ class CudaGraphsExec { bool OpOK(const std::shared_ptr &exec) { static auto& fstateful = Op::GetAttr("FCreateOpState"); static auto& fgraphcompatible = Op::GetAttr("FIsCUDAGraphsCompatible"); + static auto& fcompute_ex = Op::GetAttr("FComputeEx"); const auto& attrs = exec->attrs; if (attrs.op != nullptr) { const auto f = fgraphcompatible.get(attrs.op, nullptr); @@ -476,14 +477,15 @@ class CudaGraphsExec { } if (fstateful.get(attrs.op, nullptr) != nullptr) { if (verbose_) { - LOG(INFO) << "Omitting stateful operator" << attrs.op->name << " from CUDA graph."; + LOG(INFO) << "Omitting stateful operator " << attrs.op->name << " from CUDA graph."; } return false; } - if (exec->dispatch_mode == DispatchMode::kFComputeEx || + if ((fcompute_ex.get(attrs.op, nullptr) != nullptr && + exec->dispatch_mode == DispatchMode::kFComputeEx) || exec->dispatch_mode == DispatchMode::kFComputeFallback) { if (verbose_) { - LOG(INFO) << "Omitting operator" << attrs.op->name + LOG(INFO) << "Omitting operator " << attrs.op->name << " from CUDA graph due to dispatch mode " << static_cast(exec->dispatch_mode); } @@ -493,7 +495,7 @@ class CudaGraphsExec { for (auto& resource : exec->op_ctx.requested) { if (!(resource.req.type == ResourceRequest::kTempSpace)) { if (verbose_) { - LOG(INFO) << "Omitting operator" << attrs.op->name + LOG(INFO) << "Omitting operator " << attrs.op->name << " from CUDA graph due to using the resource type " << static_cast(resource.req.type); } diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py index 54d912eb60b1..87aaf5a5566b 100644 --- a/tests/python/gpu/test_gluon_gpu.py +++ b/tests/python/gpu/test_gluon_gpu.py @@ -20,9 +20,10 @@ import os import tempfile import time +import random import mxnet as mx import multiprocessing as mp -from mxnet.test_utils import check_consistency, set_default_context, assert_almost_equal, rand_ndarray, rand_shape_nd +from mxnet.test_utils import check_consistency, set_default_context, assert_almost_equal, rand_ndarray import mxnet.ndarray as nd import numpy as np import math @@ -30,7 +31,7 @@ curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.insert(0, os.path.join(curr_path, '../unittest')) -from common import setup_module, with_seed, teardown, assert_raises_cudnn_not_satisfied, run_in_spawned_process +from common import setup_module, with_seed, teardown, assert_raises_cudnn_not_satisfied, run_in_spawned_process, random_seed from test_gluon import * from test_loss import * from test_gluon_rnn import * @@ -638,7 +639,7 @@ def test_gemms_true_fp16(): @with_seed() def test_cuda_graphs(): class GraphTester(gluon.HybridBlock): - def __init__(self, function_to_test, **kwargs) + def __init__(self, function_to_test, **kwargs): super(GraphTester, self).__init__(**kwargs) with self.name_scope(): self.f = function_to_test() @@ -647,7 +648,7 @@ def hybrid_forward(self, F, *args): # We need to isolate the operation to be fully inside the graph # in order for graphs usage to be possible copied_args = [F.identity(a) for a in args] - outputs = self.f(copied_args) + outputs = self.f(*copied_args) if isinstance(outputs, (list, tuple)): 
return [F.identity(o) for o in outputs] else: @@ -661,30 +662,48 @@ def __init__(self, name, f, num_inputs=1, input_dim=4): self.input_dim = input_dim def generate_inputs(self): - shape = rand_shape_nd(self.input_dim) - return [mx.random.uniform(shape=shape) for _ in num_inputs] + shape = tuple(np.random.randint(4, 11, size=self.input_dim)) + ret = [mx.random.uniform(shape=shape) for _ in range(self.num_inputs)] + for r in ret: + r.attach_grad() + return ret tested_ops = [ TestDesc('add', lambda: (lambda x, y: x + y), num_inputs = 2), TestDesc('add_scalar', lambda: (lambda x: x + 0.5)), - TestDesc('Conv', lambda: mx.gluon.nn.Conv2D(channels=32, kernel_size=(3,3))), + TestDesc('Conv', lambda: mx.gluon.nn.Conv2D(channels=32, kernel_size=(1,1))), + TestDesc('ConvTranspose', lambda: mx.gluon.nn.Conv2DTranspose(channels=32, kernel_size=(1,1))), TestDesc('Dense', lambda: mx.gluon.nn.Dense(units=128)), - TestDesc('Activation', lambda: mx.gluon.nn.Activation(act_type='tanh')), + TestDesc('Activation', lambda: mx.gluon.nn.Activation('tanh')), + #TestDesc('Dropout', lambda: mx.gluon.nn.Dropout(0.5)), + TestDesc('Flatten', lambda: mx.gluon.nn.Flatten()), + TestDesc('MaxPool', lambda: mx.gluon.nn.MaxPool2D()), + TestDesc('AvgPool', lambda: mx.gluon.nn.AvgPool2D()), + TestDesc('GlobalMaxPool', lambda: mx.gluon.nn.GlobalMaxPool2D()), + TestDesc('GlobalAvgPool', lambda: mx.gluon.nn.GlobalAvgPool2D()), + TestDesc('ReflectionPad2D', lambda: mx.gluon.nn.ReflectionPad2D()), + TestDesc('BatchNorm', lambda: mx.gluon.nn.BatchNorm()), + TestDesc('InstanceNorm', lambda: mx.gluon.nn.InstanceNorm()), + TestDesc('LayerNorm', lambda: mx.gluon.nn.LayerNorm()), + TestDesc('LeakyReLU', lambda: mx.gluon.nn.LeakyReLU(0.1)), + TestDesc('PReLU', lambda: mx.gluon.nn.PReLU()), + TestDesc('ELU', lambda: mx.gluon.nn.ELU()), + TestDesc('SELU', lambda: mx.gluon.nn.SELU()), + TestDesc('Swish', lambda: mx.gluon.nn.Swish()), ] - N = 5 + N = 10 - graph_env = 'MXNET_ENABLE_CUDA_GRAPHS' - if graph_env in os.environ: - old_env_value = os.environ[graph_env] - else: - old_env_value = None - - os.environ[graph_env] = '1' + os.environ['MXNET_ENABLE_CUDA_GRAPHS'] = '1' + os.environ['MXNET_USE_FUSION'] = '0' for test_desc in tested_ops: print("Testing ", test_desc.name) inputs = test_desc.generate_inputs() + inputsg = [i.copy() for i in inputs] + for i in inputsg: + i.attach_grad() + seed = random.randint(0, 10000) net = GraphTester(test_desc.f) netg = GraphTester(test_desc.f) @@ -692,21 +711,41 @@ def generate_inputs(self): net.initialize() netg.initialize() - net(inputs) + net(*inputs) for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): p2.set_data(p1.data()) netg.hybridize(static_alloc=True, static_shape=True) - for _ in range(N): - assert_almost_equal(net(inputs), netg(inputs)) + print("Testing inference mode") + with random_seed(seed): + for _ in range(N): + assert_almost_equal(net(*inputs), netg(*inputsg)) + mx.nd.waitall() + print("Testing training mode") + for _ in range(N): + with random_seed(seed): + with mx.autograd.record(): + out = net(*inputs) + out.backward() + + with random_seed(seed): + with mx.autograd.record(): + outg = netg(*inputsg) + outg.backward() + + assert_almost_equal(out, outg) + for i, ig in zip(inputs, inputsg): + assert_almost_equal(i.grad, ig.grad) + + for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): + assert_almost_equal(p1.data(), p2.data()) + if p1.grad_req != 'null': + assert_almost_equal(p1.grad(), p2.grad()) + mx.nd.waitall() - if 
old_env_value is not None: - os.environ[graph_env] = old_env_value - else: - del(os.environ[graph_env]) if __name__ == '__main__': import nose From 83b0b41a22e3e0cdbd30d6fb72eba222b6754e82 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 18 Sep 2020 11:46:54 -0700 Subject: [PATCH 13/13] Use with_environment --- tests/python/gpu/test_gluon_gpu.py | 91 +++++++++++++++--------------- 1 file changed, 45 insertions(+), 46 deletions(-) diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py index 28eca456a0c9..6b5b4fbbd764 100644 --- a/tests/python/gpu/test_gluon_gpu.py +++ b/tests/python/gpu/test_gluon_gpu.py @@ -702,57 +702,56 @@ def generate_inputs(self): N = 10 - os.environ['MXNET_ENABLE_CUDA_GRAPHS'] = '1' - os.environ['MXNET_USE_FUSION'] = '0' - - for test_desc in tested_ops: - print("Testing ", test_desc.name) - inputs = test_desc.generate_inputs() - inputsg = [i.copy() for i in inputs] - for i in inputsg: - i.attach_grad() - seed = random.randint(0, 10000) - net = GraphTester(test_desc.f) - netg = GraphTester(test_desc.f) - - # initialize parameters - net.initialize() - netg.initialize() - - net(*inputs) - - for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): - p2.set_data(p1.data()) + with environment({'MXNET_ENABLE_CUDA_GRAPHS': '1', + 'MXNET_USE_FUSION': '0'}): + for test_desc in tested_ops: + print("Testing ", test_desc.name) + inputs = test_desc.generate_inputs() + inputsg = [i.copy() for i in inputs] + for i in inputsg: + i.attach_grad() + seed = random.randint(0, 10000) + net = GraphTester(test_desc.f) + netg = GraphTester(test_desc.f) + + # initialize parameters + net.initialize() + netg.initialize() + + net(*inputs) - netg.hybridize(static_alloc=True, static_shape=True) - - print("Testing inference mode") - with random_seed(seed): - for _ in range(N): - assert_almost_equal(net(*inputs), netg(*inputsg)) + for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): + p2.set_data(p1.data()) - mx.nd.waitall() - print("Testing training mode") - for _ in range(N): - with random_seed(seed): - with mx.autograd.record(): - out = net(*inputs) - out.backward() + netg.hybridize(static_alloc=True, static_shape=True) + print("Testing inference mode") with random_seed(seed): - with mx.autograd.record(): - outg = netg(*inputsg) - outg.backward() - - assert_almost_equal(out, outg) - for i, ig in zip(inputs, inputsg): - assert_almost_equal(i.grad, ig.grad) + for _ in range(N): + assert_almost_equal(net(*inputs), netg(*inputsg)) - for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): - assert_almost_equal(p1.data(), p2.data()) - if p1.grad_req != 'null': - assert_almost_equal(p1.grad(), p2.grad()) - mx.nd.waitall() + mx.nd.waitall() + print("Testing training mode") + for _ in range(N): + with random_seed(seed): + with mx.autograd.record(): + out = net(*inputs) + out.backward() + + with random_seed(seed): + with mx.autograd.record(): + outg = netg(*inputsg) + outg.backward() + + assert_almost_equal(out, outg) + for i, ig in zip(inputs, inputsg): + assert_almost_equal(i.grad, ig.grad) + + for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): + assert_almost_equal(p1.data(), p2.data()) + if p1.grad_req != 'null': + assert_almost_equal(p1.grad(), p2.grad()) + mx.nd.waitall() if __name__ == '__main__':
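The final patch swaps the manual save/set/delete handling of `MXNET_ENABLE_CUDA_GRAPHS` for a scoped `environment` context manager, so both variables are restored even when an assertion inside the block fails. A minimal sketch of that pattern follows; it assumes nothing about the real helper shipped with the test utilities beyond the save-and-restore behaviour visible in the diff above.

import os
from contextlib import contextmanager

@contextmanager
def environment(env_vars):
    """Temporarily set environment variables, restoring the previous values on exit."""
    saved = {name: os.environ.get(name) for name in env_vars}
    try:
        for name, value in env_vars.items():
            os.environ[name] = value
        yield
    finally:
        for name, old_value in saved.items():
            if old_value is None:
                os.environ.pop(name, None)
            else:
                os.environ[name] = old_value

# Usage mirroring the test: the variables are reset even if an assert fires inside.
with environment({'MXNET_ENABLE_CUDA_GRAPHS': '1', 'MXNET_USE_FUSION': '0'}):
    pass  # run the CUDA graphs consistency checks here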