diff --git a/docs/static_site/src/pages/api/faq/env_var.md b/docs/static_site/src/pages/api/faq/env_var.md
index e0b70a658b62..831f7ee3e043 100644
--- a/docs/static_site/src/pages/api/faq/env_var.md
+++ b/docs/static_site/src/pages/api/faq/env_var.md
@@ -134,6 +134,16 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
 * MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_BWD
   - Values: Int ```(default=<limit_of_int>)```
   - The maximum number of nodes in the subgraph executed in bulk during training (not inference) in the backward pass.
+* MXNET_ENABLE_CUDA_GRAPHS
+  - Values: 0(false) or 1(true) ```(default=0)```
+  - If set to `1`, MXNet will utilize CUDA graphs when executing models on the GPU, whenever possible.
+  - CUDA graphs execution requires either a symbolic model or a Gluon model hybridized with the options `static_alloc` and `static_shape` set to `True`.
+* MXNET_CUDA_GRAPHS_VERBOSE
+  - Values: 0(false) or 1(true) ```(default=0)```
+  - If set to `1`, the CUDA graphs executor logs information about the graphs being captured and executed.
+* MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES
+  - Values: Int ```(default=0)```
+  - The maximum number of log messages generated by the CUDA graphs executor.
 
 ## Control the Data Communication
 
diff --git a/include/mxnet/op_attr_types.h b/include/mxnet/op_attr_types.h
index a0ac301b1818..88d21f23ea0c 100644
--- a/include/mxnet/op_attr_types.h
+++ b/include/mxnet/op_attr_types.h
@@ -362,6 +362,19 @@ using FNeedCalibrateInput = std::function<std::vector<int> (const NodeAttrs& att
  */
 using FNeedCalibrateOutput = std::function<std::vector<int> (const NodeAttrs& attrs)>;
 
+#if MXNET_USE_CUDA
+
+/*!
+ * \brief Register a function to determine if
+ *        the operator implementation is compatible
+ *        with CUDA graphs. This requires the execution
+ *        to stay the same as long as the shape and type
+ *        of input stays the same.
+ */ +using FIsCUDAGraphsCompatible = std::function; + +#endif + } // namespace mxnet #endif // MXNET_OP_ATTR_TYPES_H_ diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc index 8f47bc29db13..ccf536920fb0 100644 --- a/src/executor/attach_op_execs_pass.cc +++ b/src/executor/attach_op_execs_pass.cc @@ -45,8 +45,10 @@ namespace exec { // FComputeExecutor and FStatefulComputeExecutor inherit from this class class StorageFallbackOpExecutor : public OpExecutor { public: - explicit StorageFallbackOpExecutor(const std::vector &mutate_idx) - : mutate_idx_(mutate_idx) {} + explicit StorageFallbackOpExecutor(const NodeAttrs& attrs, + const DispatchMode& dispatch_mode, + const std::vector &mutate_idx) + : OpExecutor(attrs, dispatch_mode), mutate_idx_(mutate_idx) {} void Setup() override { init_ = false; @@ -136,11 +138,13 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor { return state_; } - explicit StatefulComputeExecutor(const OpStatePtr& state, + explicit StatefulComputeExecutor(const NodeAttrs& attrs, + const DispatchMode dispatch_mode, + const OpStatePtr& state, const FStatefulCompute& fcompute, ExecType exec_type, const std::vector &mutate_idx) - : StorageFallbackOpExecutor(mutate_idx), + : StorageFallbackOpExecutor(attrs, dispatch_mode, mutate_idx), state_(state), fcompute_(fcompute), exec_type_(exec_type) {} private: @@ -159,7 +163,7 @@ class StatefulComputeExExecutor : public OpExecutor { InvalidateOutputs(out_array, req); // TODO(alex): (MXNET-847) Remove this fallback feature after subgraph implemented const auto is_mkldnn = Op::GetAttr("TIsMKLDNN"); - if (!is_mkldnn.get(attrs_.op, false)) { + if (!is_mkldnn.get(attrs.op, false)) { CreateDefaultInputs(in_array, &in_array_fallback); fcompute_(state_, op_ctx, in_array_fallback, req, out_array); return; @@ -183,13 +187,14 @@ class StatefulComputeExExecutor : public OpExecutor { } explicit StatefulComputeExExecutor(const NodeAttrs& attrs, + const DispatchMode& dispatch_mode, const OpStatePtr& state, const FStatefulComputeEx& fcompute, ExecType exec_type) - : attrs_(attrs), state_(state), fcompute_(fcompute), exec_type_(exec_type) {} + : OpExecutor(attrs, dispatch_mode), state_(state), fcompute_(fcompute), + exec_type_(exec_type) {} private: - NodeAttrs attrs_; OpStatePtr state_; FStatefulComputeEx fcompute_; ExecType exec_type_; @@ -206,7 +211,7 @@ class FComputeExecutor : public StorageFallbackOpExecutor { InvalidateOutputs(out_array, req); #endif PreFCompute(is_gpu); - fcompute_(attrs_, op_ctx, in_data_, req, out_data_); + fcompute_(attrs, op_ctx, in_data_, req, out_data_); PostFCompute(is_gpu); } @@ -214,14 +219,14 @@ class FComputeExecutor : public StorageFallbackOpExecutor { return exec_type_; } - explicit FComputeExecutor(const NodeAttrs& attrs, FCompute fcompute, - ExecType exec_type, const std::vector &mutate_idx) - : StorageFallbackOpExecutor(mutate_idx), - attrs_(attrs), fcompute_(fcompute), exec_type_(exec_type) { + explicit FComputeExecutor(const NodeAttrs& attrs, const DispatchMode dispatch_mode, + FCompute fcompute, ExecType exec_type, + const std::vector &mutate_idx) + : StorageFallbackOpExecutor(attrs, dispatch_mode, mutate_idx), + fcompute_(fcompute), exec_type_(exec_type) { } private: - NodeAttrs attrs_; FCompute fcompute_; ExecType exec_type_; }; @@ -235,13 +240,13 @@ class FComputeExExecutor : public OpExecutor { InvalidateOutputs(out_array, req); // TODO(alex): (MXNET-847) Remove this fallback feature after subgraph implemented const auto is_mkldnn = 
Op::GetAttr("TIsMKLDNN"); - if (!is_mkldnn.get(attrs_.op, false)) { + if (!is_mkldnn.get(attrs.op, false)) { CreateDefaultInputs(in_array, &in_array_fallback); - fcompute_(attrs_, op_ctx, in_array_fallback, req, out_array); + fcompute_(attrs, op_ctx, in_array_fallback, req, out_array); return; } #endif - fcompute_(attrs_, op_ctx, in_array, req, out_array); + fcompute_(attrs, op_ctx, in_array, req, out_array); } void Setup() override {} @@ -250,13 +255,12 @@ class FComputeExExecutor : public OpExecutor { return exec_type_; } - explicit FComputeExExecutor(const NodeAttrs& attrs, FComputeEx fcompute, - ExecType exec_type) - : attrs_(attrs), fcompute_(fcompute), exec_type_(exec_type) { + explicit FComputeExExecutor(const NodeAttrs& attrs, const DispatchMode dispatch_mode, + FComputeEx fcompute, ExecType exec_type) + : OpExecutor(attrs, dispatch_mode), fcompute_(fcompute), exec_type_(exec_type) { } private: - NodeAttrs attrs_; FComputeEx fcompute_; ExecType exec_type_; }; @@ -310,7 +314,8 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, op, "FStatefulComputeEx", vctx[i]); // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) { - ret[i] = std::make_shared(inode.source->attrs, state, + ret[i] = std::make_shared(inode.source->attrs, + dispatch_modes[i], state, fcompute_ex, exec_type); } else { FStatefulCompute fcompute = common::GetFCompute( @@ -318,7 +323,9 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, CHECK(fcompute != nullptr) << "One of FStatefulCompute and FStatefulComputeEx must be registered " << "for stateful operator " << op->name; - ret[i] = std::make_shared(state, fcompute, + ret[i] = std::make_shared(inode.source->attrs, + dispatch_modes[i], + state, fcompute, exec_type, mutate_index); } } else if (is_layer_backward.get(op, false)) { @@ -331,26 +338,27 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) { ret[i] = std::make_shared( - inode.source->attrs, ret[fwd_id].get()->state(), fcompute_ex, - exec_type); + inode.source->attrs, dispatch_modes[i], ret[fwd_id].get()->state(), + fcompute_ex, exec_type); } else { FStatefulCompute fcompute = common::GetFCompute( op, "FStatefulCompute", vctx[i]); CHECK(fcompute != nullptr) << "One of FStatefulCompute and FStatefulComputeEx must be registered " << "for stateful operator " << op->name; - ret[i] = std::make_shared( - ret[fwd_id].get()->state(), fcompute, exec_type, mutate_index); + ret[i] = std::make_shared(inode.source->attrs, + dispatch_modes[i], ret[fwd_id].get()->state(), fcompute, exec_type, + mutate_index); } } else { FCompute fcompute = common::GetFCompute(op, "FCompute", vctx[i]); FComputeEx fcomp_ex = common::GetFCompute(op, "FComputeEx", vctx[i]); if (fcomp_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) { ret[i] = std::make_shared( - inode.source->attrs, fcomp_ex, exec_type); + inode.source->attrs, dispatch_modes[i], fcomp_ex, exec_type); } else if (fcompute != nullptr) { ret[i] = std::make_shared( - inode.source->attrs, fcompute, exec_type, mutate_index); + inode.source->attrs, dispatch_modes[i], fcompute, exec_type, mutate_index); } else { LOG(INFO) << "Neither FCompute nor FComputeEx registered " << op->name; } diff --git 
a/src/executor/cuda_graphs.h b/src/executor/cuda_graphs.h new file mode 100644 index 000000000000..e669d7d1d2e3 --- /dev/null +++ b/src/executor/cuda_graphs.h @@ -0,0 +1,547 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2020 by Contributors + * \file cuda_graphs.h + * \brief Wrappers for use of CUDA Graphs API + */ +#ifndef MXNET_EXECUTOR_CUDA_GRAPHS_H_ +#define MXNET_EXECUTOR_CUDA_GRAPHS_H_ + +#include +#include +#include +#include +#include + +#include "./exec_pass.h" +#include "../common/cuda_utils.h" + +#if MXNET_USE_CUDA +#define CUDA_GRAPHS_AVAILABLE (CUDA_VERSION >= 10020) +#else +#define CUDA_GRAPHS_AVAILABLE (0) +#endif + +#if CUDA_GRAPHS_AVAILABLE + +namespace mxnet { +namespace cuda_graphs { + +inline std::string CudaDim3ToString(const dim3& dims) { + std::stringstream ss; + if (dims.z != 1) + ss << "(" << dims.x << "," << dims.y << "," << dims.z << ")"; + else if (dims.y != 1) + ss << "(" << dims.x << "," << dims.y << ")"; + else + ss << "(" << dims.x << ")"; + return ss.str(); +} + +// Return the list of CUDA Graph nodes from a graph +inline std::vector GetCudaGraphNodes(cudaGraph_t cuda_graph) { + size_t numNodes; + CUDA_CALL(cudaGraphGetNodes(cuda_graph, static_cast(nullptr), &numNodes)); + if (numNodes == 0) + return std::vector(); + std::vector graphNodes(numNodes); + CUDA_CALL(cudaGraphGetNodes(cuda_graph, graphNodes.data(), &numNodes)); + return graphNodes; +} + +// It does not really involve RTC, but requires libcuda.so, +// which is linked only when RTC is enabled. +#if MXNET_ENABLE_CUDA_RTC + +inline std::string CudaGraphNodeToString(const cudaGraphNode_t node) { + std::stringstream ss; + + // The following introspection calls are made through the driver API in order to bypass + // problems that would arise if multiple statically-linked copies of the runtime exist. 
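As a side note, the node-enumeration helper `GetCudaGraphNodes` earlier in this header follows the standard two-call CUDA query pattern: ask for the count first, then fill a typed buffer. A minimal standalone sketch of that pattern (the helper name `ListGraphNodes` is illustrative, and error checking via `CUDA_CALL` is omitted):

```cpp
#include <cuda_runtime_api.h>
#include <vector>

// Sketch of the two-call query pattern: the first cudaGraphGetNodes call
// (null node buffer) only reports the node count; the second fills a vector
// sized to that count.
std::vector<cudaGraphNode_t> ListGraphNodes(cudaGraph_t graph) {
  size_t num_nodes = 0;
  cudaGraphGetNodes(graph, nullptr, &num_nodes);
  std::vector<cudaGraphNode_t> nodes(num_nodes);
  if (num_nodes > 0)
    cudaGraphGetNodes(graph, nodes.data(), &num_nodes);
  return nodes;
}
```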
+ + CUgraphNode cu_node = node; + CUgraphNodeType t; + CUDA_DRIVER_CALL(cuGraphNodeGetType(cu_node, &t)); + switch (t) { + case CU_GRAPH_NODE_TYPE_KERNEL: + { + CUDA_KERNEL_NODE_PARAMS kparams; + auto err = cuGraphKernelNodeGetParams(cu_node, &kparams); + if (err == CUDA_SUCCESS) { + ss << "GPUKernel@" << kparams.func; + dim3 gridDim(kparams.gridDimX, kparams.gridDimY, kparams.gridDimZ); + dim3 blockDim(kparams.blockDimX, kparams.blockDimY, kparams.blockDimZ); + ss << "<<>>"; + ss << "(..."; + if (kparams.sharedMemBytes != 0) + ss << ", dynSharedMemBytes=" << kparams.sharedMemBytes; + ss << ")"; + } else { + ss << "GPU Kernel: cuGraphKernelNodeGetParams() fails with " << err; + } + } + break; + case CU_GRAPH_NODE_TYPE_MEMCPY: + { + cudaMemcpy3DParms mparams = {}; + CUDA_CALL(cudaGraphMemcpyNodeGetParams(node, &mparams)); + // If memcpy is seen, return without setting up runnable executor + switch (mparams.kind) { + case cudaMemcpyHostToHost: ss << "Host->Host "; break; + case cudaMemcpyHostToDevice: ss << "Host->Device "; break; + case cudaMemcpyDeviceToHost: ss << "Device->Host "; break; + case cudaMemcpyDeviceToDevice: ss << "Device->Device "; break; + default: break; + } + ss << "Memcpy"; + } + break; + case CU_GRAPH_NODE_TYPE_MEMSET: + { + cudaMemsetParams mparams = {}; + CUDA_CALL(cudaGraphMemsetNodeGetParams(node, &mparams)); + if (mparams.height == 1 && mparams.elementSize == 1) { + ss << "cudaMemset(devPtr=" << mparams.dst << ", value=" << mparams.value + << ", count=" << mparams.width << ")"; + } else { + if (mparams.elementSize == 1) + ss << "cudaMemset2D"; + else + ss << "MemSet"; + ss << "(devPtr=" << mparams.dst << ", pitch=" << mparams.pitch + << ", value=" << mparams.value << ", width=" << mparams.width + << ", height=" << mparams.height << ")"; + } + } + break; + case CU_GRAPH_NODE_TYPE_HOST: ss << "Host (executable) node"; break; + case CU_GRAPH_NODE_TYPE_GRAPH: ss << "Node which executes an embedded graph"; break; + case CU_GRAPH_NODE_TYPE_EMPTY: ss << "Empty (no-op) node"; break; + default: ss << "Unknown/Invalid node type " << t; + } + return ss.str(); +} + +#endif // MXNET_ENABLE_CUDA_RTC + +// CUDA Graphs are managed in RAII fashion by smart pointers below. +// Function objects (preferred for readability) provide the deleter function. +class CudaGraphDeleter { + public: + void operator() (cudaGraph_t graph) { + if (graph != nullptr) + CUDA_CALL(cudaGraphDestroy(graph)); + } +}; + +// CUDA Graphs Executors are managed in RAII fashion by smart pointers below. +// Function objects (preferred for readability) provide the deleter function. +class CudaGraphExecDeleter { + public: + void operator() (cudaGraphExec_t graph_exec) { + if (graph_exec != nullptr) + CUDA_CALL(cudaGraphExecDestroy(graph_exec)); + } +}; + +// A CUDA Graphs executor for a portion of an Operator Segment (i.e. a 'SubSegment'), +// characterized by a starting index in the OpExecutor list and a number of ops. 
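Before the class itself, here is a condensed, hypothetical sketch of the CUDA Graphs lifecycle it wraps: capture work enqueued on a stream into a graph, instantiate the graph into an executable, then replay that executable. The function and parameter names are placeholders and error handling (the `CUDA_CALL` macro used in the real code) is omitted; the same runtime calls appear in `MakeGraph`/`MakeGraphExec` below.

```cpp
#include <cuda_runtime_api.h>
#include <functional>

// Sketch: capture whatever `launch_ops` enqueues on `stream` into a CUDA graph,
// instantiate it once, then replay it cheaply any number of times.
void CaptureAndReplay(cudaStream_t stream,
                      const std::function<void()>& launch_ops,
                      int num_replays) {
  cudaGraph_t graph = nullptr;
  cudaGraphExec_t graph_exec = nullptr;

  // 1. Capture: kernels launched on `stream` are recorded, not executed.
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
  launch_ops();
  cudaStreamEndCapture(stream, &graph);

  // 2. Instantiate the captured graph into an executable form.
  cudaGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0);

  // 3. Replay: one launch call replaces many individual kernel launches.
  for (int i = 0; i < num_replays; ++i)
    cudaGraphLaunch(graph_exec, stream);

  cudaGraphExecDestroy(graph_exec);
  cudaGraphDestroy(graph);
}
```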
+class CudaGraphsSubSegExec { + public: + CudaGraphsSubSegExec(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu, + bool verbose, + int from_op_idx, + int num_ops, + bool ops_are_cuda_graph_compatible = true) : + from_op_idx_(from_op_idx), + num_ops_(num_ops), + graph_(nullptr), + graph_exec_(nullptr) { + if (ops_are_cuda_graph_compatible) { + MakeGraph(exec_list, rctx, is_gpu, verbose, from_op_idx, num_ops); + MakeGraphExec(); + } + } + + void Update(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu, + bool verbose) { + // Current executor should be Runnable with the same parameters + CHECK(IsRunnable()); + MakeGraph(exec_list, rctx, is_gpu, verbose, from_op_idx_, num_ops_); + + cudaGraphExecUpdateResult update_result = cudaGraphExecUpdateError; + cudaGraphNode_t error_node; + CUDA_CALL(cudaGraphExecUpdate(graph_exec_.get(), graph_.get(), + &error_node, &update_result)); + // If update fails make a new executor, discarding old one. + if (update_result != cudaGraphExecUpdateSuccess) + MakeGraphExec(); + } + + void RunSubSeg(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu) { + if (IsRunnable()) { + auto s = rctx.get_stream(); + const cudaStream_t cu_s = mshadow::Stream::GetStream(s); + CUDA_CALL(cudaGraphLaunch(graph_exec_.get(), cu_s)); + } else { + // No CUDA Graph could be made for this portion of the OpSegment. Run conventionally. + for (int i = 0; i != num_ops_; ++i) + exec_list[from_op_idx_ + i]->Run(rctx, is_gpu); + } + } + + bool IsRunnable() { return graph_exec_ != nullptr; } + + private: + void MakeGraph(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu, + bool verbose, + int from_op_idx, + int num_ops) { + auto s = rctx.get_stream(); + const cudaStream_t cu_s = mshadow::Stream::GetStream(s); + // Create CUDA Graph + // Use of cudaStreamCaptureModeThreadLocal allows other threads like GPU Copy workers + // to sync their streams without disturbing this capture. + CUDA_CALL(cudaStreamBeginCapture(cu_s, cudaStreamCaptureModeThreadLocal)); + // Run those oprs in the sub segment while capturing- no actual GPU work is launched. 
+ for (int i = 0; i != num_ops; ++i) + exec_list[from_op_idx + i]->Run(rctx, is_gpu); + cudaGraph_t cuda_graph = nullptr; + CUDA_CALL(cudaStreamEndCapture(cu_s, &cuda_graph)); + graph_.reset(cuda_graph, CudaGraphDeleter()); + + if (verbose) { + std::vector graph_nodes = GetCudaGraphNodes(cuda_graph); + size_t num_nodes = graph_nodes.size(); + LOG(INFO) << " Graph has " << num_nodes << " nodes:"; +#if MXNET_ENABLE_CUDA_RTC + for (size_t i = 0; i != num_nodes; ++i) { + LOG(INFO) << " node " << i << " = " + << CudaGraphNodeToString(graph_nodes[i]); + } +#endif // MXNET_ENABLE_CUDA_RTC + } + } + + void MakeGraphExec() { + cudaGraphExec_t cuda_graph_exec; + cudaGraphNode_t error_node; + char log_buffer[1000]; + + CUDA_CALL(cudaGraphInstantiate(&cuda_graph_exec, graph_.get(), + &error_node, log_buffer, 1000)); + graph_exec_.reset(cuda_graph_exec, CudaGraphExecDeleter()); + + // At this point we have a CUDA Graph executor + static int num_graph_creations_logged = 0; + static int max_log_entries = dmlc::GetEnv("MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES", 0); + if (num_graph_creations_logged < max_log_entries) { + num_graph_creations_logged++; + LOG(INFO) << "Created CUDA graph " << num_graph_creations_logged; + if (num_graph_creations_logged == max_log_entries) + LOG(INFO) << "Further CUDA graph creation log messages are suppressed."; + } + } + + int from_op_idx_; + int num_ops_; + using cudaGraphStruct_t = typename std::remove_pointer::type; + using cudaGraphExecStruct_t = typename std::remove_pointer::type; + std::shared_ptr graph_; + std::shared_ptr graph_exec_; +}; + +// The CudaGraph executor and associated Tempspace ptrs for which it is valid. +struct CudaGraphInfo { + std::vector cuda_graph_subseg_execs; + bool has_been_run_conventionally = false; + std::vector tempspace_dptrs; +}; +// A CUDA graph is maintained for every combination of cudaStream_t (i.e. GPU Worker) and +// the state of the is_train flag of the OpContext. If the tempspace_dptrs change, we +// don't expect to ever see the old tempspace_dptrs config again, so we discard the CUDA graph. +struct CudaGraphCacheKey { + cudaStream_t cu_s; + bool is_train; + // overload '<' so CudaGraphCacheKey can be used as a std::map key + bool operator<(const CudaGraphCacheKey &other) const { + return cu_s < other.cu_s || (cu_s == other.cu_s && is_train < other.is_train); + } +}; +using CudaGraphCache = std::map; + +class CudaGraphsExec { + public: + CudaGraphsExec(const std::vector > &exec_list, + bool is_gpu, + const char *opr_names) : + verbose_(false), is_enabled_(false) { + opr_names_ = opr_names ? 
std::string(opr_names) : std::string(); + if (is_gpu) { + is_enabled_ = dmlc::GetEnv("MXNET_ENABLE_CUDA_GRAPHS", false); + verbose_ = dmlc::GetEnv("MXNET_CUDA_GRAPHS_VERBOSE", false); + SetTempSpaces(exec_list); + } + } + + void RunAll(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu) { + // If this a CPU op or CUDA Graphs use isn't possible, run normally and return + if (!is_gpu || !is_enabled_) { + // Run all opr in the sub-graph + exec::OpExecutor::RunAll(exec_list, rctx, is_gpu); + return; + } + + // Also if we're in a warm-up period where tempspace pointers are likely + // to change, run normally and return + auto s = rctx.get_stream(); + const cudaStream_t cu_s = mshadow::Stream::GetStream(s); + // All the ops in the bulked segment will have the same setting of is_train as the first op + const bool is_train = exec_list.size() > 0 && exec_list[0]->op_ctx.is_train; + const CudaGraphCacheKey key = {cu_s, is_train}; + // Look-up the CUDA Graph info for this combo of stream and is_train setting + // This may create a default-initialized new entry. + auto &cuda_graph_info = cache_[key]; + if (!cuda_graph_info.has_been_run_conventionally) { + // Run all opr in the sub-graph + exec::OpExecutor::RunAll(exec_list, rctx, is_gpu); + cuda_graph_info.has_been_run_conventionally = true; + return; + } + + // At this point we will launch one or more CUDA Graphs through CUDA Graphs 'executors' + // (there might be more than one executor if some ops in the segment are not capturable) + auto before_exec_tempspace_ptrs = GetGPUTempspacePtrs(s); + + // Executors exist, but the tempspace pts have changed, so update them in-place via 'recapture'. + if (cuda_graph_info.cuda_graph_subseg_execs.size() > 0 && + cuda_graph_info.tempspace_dptrs != before_exec_tempspace_ptrs) { + // Update all runnable executors. Non-runnable executors launch their ops conventionally. + for (auto &subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) { + if (subseg_exec.IsRunnable()) + subseg_exec.Update(exec_list, rctx, is_gpu, verbose_); + } + } else if (cuda_graph_info.cuda_graph_subseg_execs.size() == 0) { + // No executors exist yet, so create them. + if (verbose_) + LOG(INFO) << "Capturing CUDA graph of op segment " << opr_names_; + // Make one or more CUDA Graphs, avoiding ops that are not compatible. + for (size_t first_op_idx = 0; first_op_idx != exec_list.size();) { + int num_good_ops = 0; + for (size_t last_op_idx = first_op_idx; last_op_idx != exec_list.size(); ++last_op_idx) { + if (OpOK(exec_list[last_op_idx])) + num_good_ops++; + else + break; + } + if (num_good_ops > 0) { + CreateSubExecOverRegion(exec_list, rctx, is_gpu, + first_op_idx, + first_op_idx + num_good_ops, + &cuda_graph_info.cuda_graph_subseg_execs); + first_op_idx += num_good_ops; + } + if (first_op_idx != exec_list.size()) { + // We had to have hit an op that was not OK. + if (verbose_) { + LOG(INFO) << "Bypassing notOK op segment[" << first_op_idx << "," << first_op_idx << "]" + << " of op segment " << opr_names_; + } + CudaGraphsSubSegExec notOK_opseg(exec_list, rctx, is_gpu, false, first_op_idx, 1, false); + cuda_graph_info.cuda_graph_subseg_execs.push_back(notOK_opseg); + first_op_idx++; + } + } + // During graph capture, the ops may be asking for the tempworkspace. This should + // not alter the base pointers, since this op seg has been executed before on this + // stream (i.e. on this gpu worker). Safest to double-check this though. 
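The double-check mentioned here relies on being able to read a temp space's base pointer cheaply. A simplified illustration of that trick, assuming MXNet's `Resource::get_space_typed` API (the real helper is `GetGPUTempspacePtrs` further below): requesting a minimal one-element allocation returns the start of the backing buffer without growing it, and a captured CUDA graph remains valid only while these base pointers are unchanged.

```cpp
#include <mxnet/resource.h>
#include <mshadow/tensor.h>

// Simplified sketch (not the verbatim helper): read the base pointer of a
// requested temp space by asking for a 1-element allocation, which does not
// enlarge the underlying buffer.
void* TempspaceBasePtr(const mxnet::Resource* resource,
                       mshadow::Stream<mxnet::gpu>* s) {
  auto tensor = resource->get_space_typed<mxnet::gpu, 1, char>(mshadow::Shape1(1), s);
  return static_cast<void*>(tensor.dptr_);
}
```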
+ auto after_capture_tempspace_ptrs = GetGPUTempspacePtrs(s); + if (before_exec_tempspace_ptrs != after_capture_tempspace_ptrs) + LOG(FATAL) << "Internal error: saw change in TempSpace ptrs during CUDA graph use."; + cuda_graph_info.tempspace_dptrs = before_exec_tempspace_ptrs; + } + // Now execute the CUDA Graph that we either just created or looked-up in the cache. + if (verbose_) { + int runnable_execs = 0; + int bypassed_ops = 0; + for (auto &subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) { + if (subseg_exec.IsRunnable()) + runnable_execs++; + else + bypassed_ops++; + } + LOG(INFO) << "Launching " << runnable_execs + << " captured CUDA graph(s) for op segment " << opr_names_; + if (bypassed_ops > 0) + LOG(INFO) << " (bypassing " << bypassed_ops << " un-capturable ops)"; + } + for (auto &subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) + subseg_exec.RunSubSeg(exec_list, rctx, is_gpu); + } + + private: + // Make a CUDA Graph of the region of ops [from_op_idx, upto_op_idx). If such a graph + // is not runnable, e.g. if it includes memcpys from unpinned cpu memory, then make a + // number of smaller graphs that avoid those ops with the memcpys. + void CreateSubExecOverRegion(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu, + size_t from_op_idx, + size_t upto_op_idx, + std::vector *cuda_graph_subseg_execs) { + // Optimistically try to create a CUDA Graph of the entire op segment region + + int num_ops = upto_op_idx - from_op_idx; + CudaGraphsSubSegExec full_opseg(exec_list, rctx, is_gpu, verbose_, from_op_idx, num_ops); + if (full_opseg.IsRunnable()) { + cuda_graph_subseg_execs->push_back(full_opseg); + } else { + if (verbose_) + LOG(INFO) << " Graph was not runnable- creating op sub-segments..."; + // Enter fall-back approach to making many sub-execs + for (size_t first_op_idx = from_op_idx; first_op_idx != upto_op_idx; ) { + int num_good_ops = 0; + for (size_t last_op_idx = first_op_idx; last_op_idx != upto_op_idx; ++last_op_idx) { + CudaGraphsSubSegExec single_opseg(exec_list, rctx, is_gpu, false, last_op_idx, 1); + if (single_opseg.IsRunnable()) + num_good_ops++; + // Is it time to create a subseg exec from accumulated good ops? + if (num_good_ops > 0 && + (last_op_idx == upto_op_idx - 1 || !single_opseg.IsRunnable())) { + if (verbose_) + LOG(INFO) << "Capturing CUDA graph of op sub segment[" + << first_op_idx << ":" << (first_op_idx + num_good_ops - 1) << "]" + << " of op segment " << opr_names_; + CudaGraphsSubSegExec good_opseg(exec_list, rctx, is_gpu, verbose_, + first_op_idx, num_good_ops); + CHECK(good_opseg.IsRunnable()) << "Unexpected issue with CUDA Graphs creation"; + cuda_graph_subseg_execs->push_back(good_opseg); + first_op_idx += num_good_ops; + } + // If the last single op was not runnable, use the exec to handle that op conventionally + if (!single_opseg.IsRunnable()) { + if (verbose_) { + LOG(INFO) << "Bypassing op sub segment[" << last_op_idx << "," << last_op_idx << "]" + << " of op segment " << opr_names_; + // Generate throw-away exec in order to produce a diagnostic listing of graph nodes + CudaGraphsSubSegExec dummy(exec_list, rctx, is_gpu, verbose_, last_op_idx, 1); + } + cuda_graph_subseg_execs->push_back(single_opseg); + first_op_idx++; + break; + } + } + } + } + } + + // Is the Op OK to make part of a CUDA Graph? 
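Before the check itself (`OpOK`, below), a sketch of how its verdicts are consumed by the segmentation logic above: the op segment is walked greedily, grouping consecutive compatible ops into one capturable sub-segment and emitting each incompatible op as a single, conventionally-run entry. The names here are illustrative only.

```cpp
#include <functional>
#include <vector>

// Sketch of the segmentation strategy: returns (start, length, capturable)
// triples describing alternating capturable runs and bypassed single ops.
struct SubSeg { size_t start; size_t len; bool capturable; };

std::vector<SubSeg> PartitionSegment(size_t num_ops,
                                     const std::function<bool(size_t)>& op_ok) {
  std::vector<SubSeg> result;
  for (size_t first = 0; first < num_ops;) {
    size_t good = 0;
    while (first + good < num_ops && op_ok(first + good))
      ++good;
    if (good > 0)
      result.push_back({first, good, true});   // captured as one CUDA graph
    first += good;
    if (first < num_ops) {
      result.push_back({first, 1, false});     // bypassed, run conventionally
      ++first;
    }
  }
  return result;
}
```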
+ bool OpOK(const std::shared_ptr &exec) { + static auto& fstateful = Op::GetAttr("FCreateOpState"); + static auto& fgraphcompatible = Op::GetAttr("FIsCUDAGraphsCompatible"); + static auto& fcompute_ex = Op::GetAttr("FComputeEx"); + const auto& attrs = exec->attrs; + if (attrs.op != nullptr) { + const auto f = fgraphcompatible.get(attrs.op, nullptr); + if (f != nullptr) { + return f(attrs, exec->op_ctx.is_train); + } + if (fstateful.get(attrs.op, nullptr) != nullptr) { + if (verbose_) { + LOG(INFO) << "Omitting stateful operator " << attrs.op->name << " from CUDA graph."; + } + return false; + } + if ((fcompute_ex.get(attrs.op, nullptr) != nullptr && + exec->dispatch_mode == DispatchMode::kFComputeEx) || + exec->dispatch_mode == DispatchMode::kFComputeFallback) { + if (verbose_) { + LOG(INFO) << "Omitting operator " << attrs.op->name + << " from CUDA graph due to dispatch mode " + << static_cast(exec->dispatch_mode); + } + return false; + } + } + for (auto& resource : exec->op_ctx.requested) { + if (!(resource.req.type == ResourceRequest::kTempSpace)) { + if (verbose_) { + LOG(INFO) << "Omitting operator " << attrs.op->name + << " from CUDA graph due to using the resource type " + << static_cast(resource.req.type); + } + return false; + } + } + return true; + } + + // Determine Tempspaces used by ops. Other resource uses disable CUDA Graphs. + void SetTempSpaces(const std::vector > &exec_list) { + // Gather info about the ops use of TempSpace. + if (is_enabled_) { + std::set tempspaces_set; + for (auto& exec : exec_list) { + for (auto& resource : exec->op_ctx.requested) { + if (resource.req.type == ResourceRequest::kTempSpace) { + tempspaces_set.insert(&resource); + } + } + } + tempspaces_.assign(tempspaces_set.begin(), tempspaces_set.end()); + } + } + + // Return the addresses of the gpu TempSpace areas + std::vector GetGPUTempspacePtrs(mshadow::Stream *s) { + std::vector ret; + for (const auto& resource : tempspaces_) { + // Ask for minimal allocation to get base pointer without increasing the size + auto *base_ptr = resource->get_space_typed(mshadow::Shape1(1), s).dptr_; + ret.push_back(static_cast(base_ptr)); + } + return ret; + } + + CudaGraphCache cache_; + std::vector tempspaces_; + std::string opr_names_; + bool verbose_; + bool is_enabled_; +}; + +} // namespace cuda_graphs +} // namespace mxnet + +#endif // CUDA_GRAPHS_AVAILABLE + +#endif // MXNET_EXECUTOR_CUDA_GRAPHS_H_ diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index e3d2fa459bc3..4552fa173fe4 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -86,6 +87,13 @@ class OpExecutor { std::vector req; /*! \brief runtime op context, contains allocated resources */ OpContext op_ctx; + /*! \brief attributes of the node */ + NodeAttrs attrs; + /*! \brief dispatch mode of the executor */ + DispatchMode dispatch_mode; + + explicit OpExecutor(NodeAttrs attrs, DispatchMode dispatch_mode) : + attrs(std::move(attrs)), dispatch_mode(dispatch_mode) {} /*! \brief virtual destructor */ virtual ~OpExecutor() {} /*! @@ -100,6 +108,16 @@ class OpExecutor { * \param rctx The runtime context passed in by environment. */ virtual void Run(RunContext rctx, bool is_gpu) = 0; + /*! + * \brief run the operators of a vector of execs, given runtime context on device. + * This function call does not synchronize the stream. + * \param rctx The runtime context passed in by environment. 
+ */ + static void RunAll(const std::vector > &execs, + RunContext rctx, bool is_gpu) { + for (auto &exec : execs) + exec->Run(rctx, is_gpu); + } /*! \return the execution type */ virtual ExecType exec_type() const = 0; /*! \return return engine variable for operator states */ diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 13bab2e544bf..c7febc4712f1 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -31,6 +31,7 @@ #include "./exec_pass.h" #include "./graph_executor.h" +#include "./cuda_graphs.h" #include "../profiler/profiler.h" #include "../common/utils.h" #include "../common/exec_utils.h" @@ -84,7 +85,8 @@ void GraphExecutor::Forward(bool is_train) { void GraphExecutor::PartialForward(bool is_train, int step, int *step_left) { size_t sstep = static_cast(step); if (sstep >= num_forward_nodes_) { - *step_left = 0; return; + *step_left = 0; + return; } RunOps(is_train, sstep, sstep + 1); *step_left = static_cast(num_forward_nodes_ - sstep - 1); @@ -166,11 +168,12 @@ void GraphExecutor::Backward(const std::vector& head_grads, bool is_tra } void GraphExecutor::Print(std::ostream &os) const { // NOLINT(*) - nnvm::Symbol s; s.outputs = graph_.outputs; + nnvm::Symbol s; + s.outputs = graph_.outputs; s.Print(os); // message to be backward compatible with the memonger size_t total_bytes = graph_.GetAttr("storage_allocated_bytes"); - os << "Total " << (total_bytes >> 20UL) <<" MB allocated\n"; + os << "Total " << (total_bytes >> 20UL) << " MB allocated\n"; os << "Total " << 11 << " TempSpace resource requested\n"; } @@ -337,7 +340,8 @@ nnvm::Graph GraphExecutor::InitFullGraph(nnvm::Symbol symbol, g = exec::EliminateCommonExpr(std::move(g)); need_grad_ = false; for (OpReqType req : grad_req_types) { - if (req != kNullOp) need_grad_ = true; + if (req != kNullOp) + need_grad_ = true; } if (!need_grad_) return g; for (size_t i = 0; i < g.outputs.size(); ++i) { @@ -1312,12 +1316,12 @@ void GraphExecutor::InitCachedOps() { // call on complete only if it is async op if (!is_async) { if (is_gpu) { - #if MXNET_USE_CUDA +#if MXNET_USE_CUDA // Wait GPU kernel to finish. ctx.get_stream()->Wait(); - #else +#else LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; - #endif +#endif } on_complete(); } @@ -1607,21 +1611,31 @@ GraphExecutor::CachedSegOpr GraphExecutor::CreateCachedSegOpr(size_t topo_start, opr_names += inode.source->op()->name + ","; } - if (pctx == nullptr) return ret; + if (pctx == nullptr) + return ret; ret.ctx = *pctx; Engine::Get()->DeduplicateVarHandle(&use_vars, &mutate_vars); bool is_gpu = pctx->dev_mask() == gpu::kDevMask; + +#if CUDA_GRAPHS_AVAILABLE + // Provide initialized `cuda_graphs_exec`, which when captured + // by exec_fun, acts like a static variable inside the mutable closure. + cuda_graphs::CudaGraphsExec cuda_graphs_exec(exec_list, is_gpu, opr_names.c_str()); + auto exec_fun = [cuda_graphs_exec, exec_list, is_gpu] ( + RunContext rctx, Engine::CallbackOnComplete on_complete) mutable { + // Run all opr in the sub-graph with CUDA graphs executor if possible + cuda_graphs_exec.RunAll(exec_list, rctx, is_gpu); +#else auto exec_fun = [exec_list, is_gpu] ( - RunContext ctx, Engine::CallbackOnComplete on_complete) { + RunContext rctx, Engine::CallbackOnComplete on_complete) { // Run all opr in the sub-graph - for (auto &exec : exec_list) { - exec->Run(ctx, is_gpu); - } + OpExecutor::RunAll(exec_list, rctx, is_gpu); +#endif if (is_gpu) { #if MXNET_USE_CUDA // Wait GPU kernel to finish. 
- ctx.get_stream()->Wait(); + rctx.get_stream()->Wait(); #else LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; #endif diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index 156013857d6a..498e868a2b39 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -26,6 +26,7 @@ #include #include #include "../executor/graph_executor.h" +#include "../executor/cuda_graphs.h" #include "../executor/exec_pass.h" #include "../c_api/c_api_common.h" #include "../common/utils.h" @@ -985,12 +986,25 @@ inline Engine::OprHandle CreateEngineOp( bool is_gpu = default_ctx.dev_mask() == gpu::kDevMask; bool is_async = execs.size() > 1 ? false : execs[0]->exec_type() == ExecType::kAsync; +#if CUDA_GRAPHS_AVAILABLE + // Provide initialized `cuda_graphs_exec`, which when captured + // by exec_fun, acts like a static variable inside the mutable closure. + cuda_graphs::CudaGraphsExec cuda_graphs_exec(execs, is_gpu, opr_names); + auto exec_fun = [cuda_graphs_exec, execs, is_async, is_gpu] ( + RunContext ctx, Engine::CallbackOnComplete on_complete) mutable { + if (is_async) { + execs[0]->op_ctx.async_on_complete = on_complete; + } + // Run all opr in the sub-graph with CUDA graphs executor if possible + cuda_graphs_exec.RunAll(execs, ctx, is_gpu); +#else auto exec_fun = [execs, is_async, is_gpu] ( RunContext ctx, Engine::CallbackOnComplete on_complete) { if (is_async) { execs[0]->op_ctx.async_on_complete = on_complete; } - for (const auto& exec : execs) exec->Run(ctx, is_gpu); + exec::OpExecutor::RunAll(execs, ctx, is_gpu); +#endif // call on complete only if it is async op if (!is_async) { if (is_gpu) { diff --git a/src/operator/contrib/adamw.cu b/src/operator/contrib/adamw.cu index 2b0040e5f6ac..1a81ba0fa99e 100644 --- a/src/operator/contrib/adamw.cu +++ b/src/operator/contrib/adamw.cu @@ -41,15 +41,31 @@ void GetScaleFloat(mshadow::Stream *s, const TBlob &scale_blob, float } NNVM_REGISTER_OP(_adamw_update) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", MPUpdate>); NNVM_REGISTER_OP(_mp_adamw_update) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", MPUpdate>); NNVM_REGISTER_OP(_multi_adamw_update) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", multiMPUpdate); NNVM_REGISTER_OP(_multi_mp_adamw_update) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", multiMPUpdate); } // namespace op diff --git a/src/operator/nn/dropout.cu b/src/operator/nn/dropout.cu index 20c5714dd904..f3a99eb431a4 100644 --- a/src/operator/nn/dropout.cu +++ b/src/operator/nn/dropout.cu @@ -30,6 +30,11 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(Dropout) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool is_train) { + // Dropout is just passthrough during inference + return !is_train; + }) .set_attr("FStatefulCompute", DropoutCompute); NNVM_REGISTER_OP(_backward_Dropout) diff --git a/src/operator/numpy/linalg/np_eig.cu b/src/operator/numpy/linalg/np_eig.cu index c0184ad221d5..ce4d5a173ced 100644 --- a/src/operator/numpy/linalg/np_eig.cu +++ b/src/operator/numpy/linalg/np_eig.cu @@ -29,11 +29,19 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_eig) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", EigOpForward); 
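The operator-level changes in this patch all follow the same shape: register an `FIsCUDAGraphsCompatible` lambda with the signature `bool(const NodeAttrs&, const bool is_train)` to opt an operator out of capture, either unconditionally or depending on the training flag (as Dropout does, since it is a pass-through during inference). A hypothetical registration fragment illustrating the pattern (the operator name is a placeholder, not part of this patch):

```cpp
// Hypothetical registration: allow capture only outside of training, e.g.
// because the kernel takes a data-dependent path when is_train is true.
// Return false unconditionally to always exclude the op from CUDA graphs.
NNVM_REGISTER_OP(_contrib_my_gpu_op)
.set_attr<FIsCUDAGraphsCompatible>("FIsCUDAGraphsCompatible",
    [](const NodeAttrs&, const bool is_train) {
      return !is_train;  // capturable only during inference
    });
```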
#if MXNET_USE_CUSOLVER == 1 NNVM_REGISTER_OP(_npi_eigh) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", EighOpForward); #endif diff --git a/src/operator/numpy/linalg/np_eigvals.cu b/src/operator/numpy/linalg/np_eigvals.cu index 974dedc6172e..0b21ffe24262 100644 --- a/src/operator/numpy/linalg/np_eigvals.cu +++ b/src/operator/numpy/linalg/np_eigvals.cu @@ -29,11 +29,19 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_eigvals) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", EigvalsOpForward); #if MXNET_USE_CUSOLVER == 1 NNVM_REGISTER_OP(_npi_eigvalsh) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", EigvalshOpForward); #endif diff --git a/src/operator/numpy/linalg/np_norm_backward.cu b/src/operator/numpy/linalg/np_norm_backward.cu index 09e85ab36f19..1936cf937419 100644 --- a/src/operator/numpy/linalg/np_norm_backward.cu +++ b/src/operator/numpy/linalg/np_norm_backward.cu @@ -27,6 +27,11 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_backward_npi_norm) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { + const NumpyNormParam& param = nnvm::get(attrs.parsed); + return param.axis.value().ndim() == 2; + }) .set_attr("FCompute", NumpyNormComputeBackward); } // namespace op diff --git a/src/operator/numpy/linalg/np_norm_forward.cu b/src/operator/numpy/linalg/np_norm_forward.cu index 6feecb09a09e..7f8673cd0ec9 100644 --- a/src/operator/numpy/linalg/np_norm_forward.cu +++ b/src/operator/numpy/linalg/np_norm_forward.cu @@ -27,6 +27,11 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_norm) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { + const NumpyNormParam& param = nnvm::get(attrs.parsed); + return param.axis.value().ndim() == 2; + }) .set_attr("FCompute", NumpyNormComputeForward); } // namespace op diff --git a/src/operator/numpy/np_boolean_mask_assign.cu b/src/operator/numpy/np_boolean_mask_assign.cu index 6fa59bea7710..909a7de63788 100644 --- a/src/operator/numpy/np_boolean_mask_assign.cu +++ b/src/operator/numpy/np_boolean_mask_assign.cu @@ -263,9 +263,17 @@ void NumpyBooleanAssignForwardGPU(const nnvm::NodeAttrs& attrs, } NNVM_REGISTER_OP(_npi_boolean_mask_assign_scalar) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyBooleanAssignForwardGPU); NNVM_REGISTER_OP(_npi_boolean_mask_assign_tensor) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyBooleanAssignForwardGPU); } // namespace op diff --git a/src/operator/numpy/np_constraint_check.cu b/src/operator/numpy/np_constraint_check.cu index f83fca0e5c33..60260affdef3 100644 --- a/src/operator/numpy/np_constraint_check.cu +++ b/src/operator/numpy/np_constraint_check.cu @@ -39,6 +39,10 @@ void GetReduceOutput(mshadow::Stream *s, const TBlob &output_blob, boo } NNVM_REGISTER_OP(_npx_constraint_check) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", ConstraintCheckForward); } // namespace op diff --git a/src/operator/numpy/np_percentile_op.cu b/src/operator/numpy/np_percentile_op.cu index b24c230d6216..09cdd2e30796 100644 --- a/src/operator/numpy/np_percentile_op.cu +++ b/src/operator/numpy/np_percentile_op.cu @@ -48,6 +48,10 @@ bool 
CheckInvalidInput(mshadow::Stream *s, const QType *data, } NNVM_REGISTER_OP(_npi_percentile) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyPercentileForward); } // namespace op diff --git a/src/operator/numpy/random/np_bernoulli_op.cu b/src/operator/numpy/random/np_bernoulli_op.cu index a73bf9929db3..5ac005f85db6 100644 --- a/src/operator/numpy/random/np_bernoulli_op.cu +++ b/src/operator/numpy/random/np_bernoulli_op.cu @@ -29,6 +29,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_bernoulli) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyBernoulliForward); } // namespace op diff --git a/src/operator/numpy/random/np_exponential_op.cu b/src/operator/numpy/random/np_exponential_op.cu index 4740b644a60c..4161a4bf7826 100644 --- a/src/operator/numpy/random/np_exponential_op.cu +++ b/src/operator/numpy/random/np_exponential_op.cu @@ -29,6 +29,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_exponential) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyExponentialForward); NNVM_REGISTER_OP(_backward_broadcast_exponential) diff --git a/src/operator/numpy/random/np_gamma_op.cu b/src/operator/numpy/random/np_gamma_op.cu index 5be15c7b9d13..8c3bca0ca44f 100644 --- a/src/operator/numpy/random/np_gamma_op.cu +++ b/src/operator/numpy/random/np_gamma_op.cu @@ -30,6 +30,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_gamma) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyGammaForward); } // namespace op diff --git a/src/operator/numpy/random/np_multinomial_op.cu b/src/operator/numpy/random/np_multinomial_op.cu index 132d67beeb6a..23de9a72bde6 100644 --- a/src/operator/numpy/random/np_multinomial_op.cu +++ b/src/operator/numpy/random/np_multinomial_op.cu @@ -43,6 +43,10 @@ void CheckPvalGPU(const OpContext& ctx, DType* input, int prob_length) { } NNVM_REGISTER_OP(_npi_multinomial) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyMultinomialForward); } // namespace op diff --git a/src/operator/numpy/random/np_normal_op.cu b/src/operator/numpy/random/np_normal_op.cu index d45bc2321bd7..dc5b73254b08 100644 --- a/src/operator/numpy/random/np_normal_op.cu +++ b/src/operator/numpy/random/np_normal_op.cu @@ -29,12 +29,20 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_normal) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyNormalForward); NNVM_REGISTER_OP(_backward_broadcast_normal) .set_attr("FCompute", NormalReparamBackward); NNVM_REGISTER_OP(_npi_normal_n) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyNormalForward); } // namespace op diff --git a/src/operator/numpy/random/np_pareto_op.cu b/src/operator/numpy/random/np_pareto_op.cu index d8a8a896e653..eecc2a8d44e8 100644 --- a/src/operator/numpy/random/np_pareto_op.cu +++ b/src/operator/numpy/random/np_pareto_op.cu @@ -29,6 +29,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_pareto) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyParetoForward); NNVM_REGISTER_OP(_backward_broadcast_pareto) diff --git 
a/src/operator/numpy/random/np_power_op.cu b/src/operator/numpy/random/np_power_op.cu index d5067f83bb02..154ef2e18f45 100644 --- a/src/operator/numpy/random/np_power_op.cu +++ b/src/operator/numpy/random/np_power_op.cu @@ -29,6 +29,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_powerd) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyPowerForward); } // namespace op diff --git a/src/operator/numpy/random/np_rayleigh_op.cu b/src/operator/numpy/random/np_rayleigh_op.cu index d35ac073fc4d..4eb6e390a922 100644 --- a/src/operator/numpy/random/np_rayleigh_op.cu +++ b/src/operator/numpy/random/np_rayleigh_op.cu @@ -29,6 +29,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_rayleigh) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyRayleighForward); NNVM_REGISTER_OP(_backward_broadcast_rayleigh) diff --git a/src/operator/numpy/random/np_weibull_op.cu b/src/operator/numpy/random/np_weibull_op.cu index 57d609d62768..645c8c4366d3 100644 --- a/src/operator/numpy/random/np_weibull_op.cu +++ b/src/operator/numpy/random/np_weibull_op.cu @@ -29,6 +29,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_weibull) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyWeibullForward); NNVM_REGISTER_OP(_backward_broadcast_weibull) diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu index 6904656b304b..44be673e8dcd 100644 --- a/src/operator/tensor/indexing_op.cu +++ b/src/operator/tensor/indexing_op.cu @@ -877,6 +877,10 @@ NNVM_REGISTER_OP(one_hot) .set_attr("FCompute", OneHotOpForward); NNVM_REGISTER_OP(gather_nd) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", GatherNDForwardGPU); NNVM_REGISTER_OP(scatter_nd) diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py index 60a90c9f5c0c..6b5b4fbbd764 100644 --- a/tests/python/gpu/test_gluon_gpu.py +++ b/tests/python/gpu/test_gluon_gpu.py @@ -20,6 +20,7 @@ import os import tempfile import time +import random import mxnet as mx import multiprocessing as mp from mxnet.test_utils import check_consistency, set_default_context, assert_almost_equal, rand_ndarray, environment @@ -30,7 +31,7 @@ curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.insert(0, os.path.join(curr_path, '../unittest')) -from common import setup_module, with_seed, teardown, assert_raises_cudnn_not_satisfied, run_in_spawned_process +from common import setup_module, with_seed, teardown, assert_raises_cudnn_not_satisfied, run_in_spawned_process, random_seed from test_gluon import * from test_loss import * from test_gluon_rnn import * @@ -643,6 +644,115 @@ def test_gemms_true_fp16(): assert_almost_equal(ref_results.asnumpy(), results_trueFP16.asnumpy(), atol=atol, rtol=rtol) +@with_seed() +def test_cuda_graphs(): + class GraphTester(gluon.HybridBlock): + def __init__(self, function_to_test, **kwargs): + super(GraphTester, self).__init__(**kwargs) + with self.name_scope(): + self.f = function_to_test() + + def hybrid_forward(self, F, *args): + # We need to isolate the operation to be fully inside the graph + # in order for graphs usage to be possible + copied_args = [F.identity(a) for a in args] + outputs = self.f(*copied_args) + if isinstance(outputs, (list, tuple)): + return [F.identity(o) for 
o in outputs] + else: + return F.identity(outputs) + + class TestDesc: + def __init__(self, name, f, num_inputs=1, input_dim=4): + self.name = name + self.f = f + self.num_inputs = num_inputs + self.input_dim = input_dim + + def generate_inputs(self): + shape = tuple(np.random.randint(4, 11, size=self.input_dim)) + ret = [mx.random.uniform(shape=shape) for _ in range(self.num_inputs)] + for r in ret: + r.attach_grad() + return ret + + tested_ops = [ + TestDesc('add', lambda: (lambda x, y: x + y), num_inputs = 2), + TestDesc('add_scalar', lambda: (lambda x: x + 0.5)), + TestDesc('Conv', lambda: mx.gluon.nn.Conv2D(channels=32, kernel_size=(1,1))), + TestDesc('ConvTranspose', lambda: mx.gluon.nn.Conv2DTranspose(channels=32, kernel_size=(1,1))), + TestDesc('Dense', lambda: mx.gluon.nn.Dense(units=128)), + TestDesc('Activation', lambda: mx.gluon.nn.Activation('tanh')), + #TestDesc('Dropout', lambda: mx.gluon.nn.Dropout(0.5)), + TestDesc('Flatten', lambda: mx.gluon.nn.Flatten()), + TestDesc('MaxPool', lambda: mx.gluon.nn.MaxPool2D()), + TestDesc('AvgPool', lambda: mx.gluon.nn.AvgPool2D()), + TestDesc('GlobalMaxPool', lambda: mx.gluon.nn.GlobalMaxPool2D()), + TestDesc('GlobalAvgPool', lambda: mx.gluon.nn.GlobalAvgPool2D()), + TestDesc('ReflectionPad2D', lambda: mx.gluon.nn.ReflectionPad2D()), + TestDesc('BatchNorm', lambda: mx.gluon.nn.BatchNorm()), + TestDesc('InstanceNorm', lambda: mx.gluon.nn.InstanceNorm()), + TestDesc('LayerNorm', lambda: mx.gluon.nn.LayerNorm()), + TestDesc('LeakyReLU', lambda: mx.gluon.nn.LeakyReLU(0.1)), + TestDesc('PReLU', lambda: mx.gluon.nn.PReLU()), + TestDesc('ELU', lambda: mx.gluon.nn.ELU()), + TestDesc('SELU', lambda: mx.gluon.nn.SELU()), + TestDesc('Swish', lambda: mx.gluon.nn.Swish()), + ] + + N = 10 + + with environment({'MXNET_ENABLE_CUDA_GRAPHS': '1', + 'MXNET_USE_FUSION': '0'}): + for test_desc in tested_ops: + print("Testing ", test_desc.name) + inputs = test_desc.generate_inputs() + inputsg = [i.copy() for i in inputs] + for i in inputsg: + i.attach_grad() + seed = random.randint(0, 10000) + net = GraphTester(test_desc.f) + netg = GraphTester(test_desc.f) + + # initialize parameters + net.initialize() + netg.initialize() + + net(*inputs) + + for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): + p2.set_data(p1.data()) + + netg.hybridize(static_alloc=True, static_shape=True) + + print("Testing inference mode") + with random_seed(seed): + for _ in range(N): + assert_almost_equal(net(*inputs), netg(*inputsg)) + + mx.nd.waitall() + print("Testing training mode") + for _ in range(N): + with random_seed(seed): + with mx.autograd.record(): + out = net(*inputs) + out.backward() + + with random_seed(seed): + with mx.autograd.record(): + outg = netg(*inputsg) + outg.backward() + + assert_almost_equal(out, outg) + for i, ig in zip(inputs, inputsg): + assert_almost_equal(i.grad, ig.grad) + + for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): + assert_almost_equal(p1.data(), p2.data()) + if p1.grad_req != 'null': + assert_almost_equal(p1.grad(), p2.grad()) + mx.nd.waitall() + if __name__ == '__main__': import nose