diff --git a/docs/static_site/src/pages/api/faq/env_var.md b/docs/static_site/src/pages/api/faq/env_var.md
index e0b70a658b62..831f7ee3e043 100644
--- a/docs/static_site/src/pages/api/faq/env_var.md
+++ b/docs/static_site/src/pages/api/faq/env_var.md
@@ -134,6 +134,16 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0
 * MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_BWD
   - Values: Int ```(default=<limit_of_int>)```
   - The maximum number of nodes in the subgraph executed in bulk during training (not inference) in the backward pass.
+* MXNET_ENABLE_CUDA_GRAPHS
+  - Values: 0(false) or 1(true) ```(default=0)```
+  - If set to `1`, MXNet will utilize CUDA graphs when executing models on the GPU, whenever possible.
+  - CUDA graphs execution requires either a symbolic model or a Gluon model hybridized with the options `static_alloc` and `static_shape` set to `True`.
+* MXNET_CUDA_GRAPHS_VERBOSE
+  - Values: 0(false) or 1(true) ```(default=0)```
+  - If set to `1`, the CUDA graphs executor logs information about the graphs being captured and executed.
+* MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES
+  - Values: Int ```(default=0)```
+  - The maximum number of log messages generated by the CUDA graphs executor.
 
 ## Control the Data Communication
 
diff --git a/include/mxnet/op_attr_types.h b/include/mxnet/op_attr_types.h
index a0ac301b1818..88d21f23ea0c 100644
--- a/include/mxnet/op_attr_types.h
+++ b/include/mxnet/op_attr_types.h
@@ -362,6 +362,19 @@ using FNeedCalibrateInput = std::function<std::vector<int> (const NodeAttrs& att
  */
 using FNeedCalibrateOutput = std::function<std::vector<int> (const NodeAttrs& attrs)>;
 
+#if MXNET_USE_CUDA
+
+/*!
+ * \brief Register a function to determine if
+ *        the operator implementation is compatible
+ *        with CUDA graphs. This requires the execution
+ *        to stay the same as long as the shape and type
+ *        of input stays the same.
+ */ +using FIsCUDAGraphsCompatible = std::function; + +#endif + } // namespace mxnet #endif // MXNET_OP_ATTR_TYPES_H_ diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc index 8f47bc29db13..ccf536920fb0 100644 --- a/src/executor/attach_op_execs_pass.cc +++ b/src/executor/attach_op_execs_pass.cc @@ -45,8 +45,10 @@ namespace exec { // FComputeExecutor and FStatefulComputeExecutor inherit from this class class StorageFallbackOpExecutor : public OpExecutor { public: - explicit StorageFallbackOpExecutor(const std::vector &mutate_idx) - : mutate_idx_(mutate_idx) {} + explicit StorageFallbackOpExecutor(const NodeAttrs& attrs, + const DispatchMode& dispatch_mode, + const std::vector &mutate_idx) + : OpExecutor(attrs, dispatch_mode), mutate_idx_(mutate_idx) {} void Setup() override { init_ = false; @@ -136,11 +138,13 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor { return state_; } - explicit StatefulComputeExecutor(const OpStatePtr& state, + explicit StatefulComputeExecutor(const NodeAttrs& attrs, + const DispatchMode dispatch_mode, + const OpStatePtr& state, const FStatefulCompute& fcompute, ExecType exec_type, const std::vector &mutate_idx) - : StorageFallbackOpExecutor(mutate_idx), + : StorageFallbackOpExecutor(attrs, dispatch_mode, mutate_idx), state_(state), fcompute_(fcompute), exec_type_(exec_type) {} private: @@ -159,7 +163,7 @@ class StatefulComputeExExecutor : public OpExecutor { InvalidateOutputs(out_array, req); // TODO(alex): (MXNET-847) Remove this fallback feature after subgraph implemented const auto is_mkldnn = Op::GetAttr("TIsMKLDNN"); - if (!is_mkldnn.get(attrs_.op, false)) { + if (!is_mkldnn.get(attrs.op, false)) { CreateDefaultInputs(in_array, &in_array_fallback); fcompute_(state_, op_ctx, in_array_fallback, req, out_array); return; @@ -183,13 +187,14 @@ class StatefulComputeExExecutor : public OpExecutor { } explicit StatefulComputeExExecutor(const NodeAttrs& attrs, + const DispatchMode& dispatch_mode, const OpStatePtr& state, const FStatefulComputeEx& fcompute, ExecType exec_type) - : attrs_(attrs), state_(state), fcompute_(fcompute), exec_type_(exec_type) {} + : OpExecutor(attrs, dispatch_mode), state_(state), fcompute_(fcompute), + exec_type_(exec_type) {} private: - NodeAttrs attrs_; OpStatePtr state_; FStatefulComputeEx fcompute_; ExecType exec_type_; @@ -206,7 +211,7 @@ class FComputeExecutor : public StorageFallbackOpExecutor { InvalidateOutputs(out_array, req); #endif PreFCompute(is_gpu); - fcompute_(attrs_, op_ctx, in_data_, req, out_data_); + fcompute_(attrs, op_ctx, in_data_, req, out_data_); PostFCompute(is_gpu); } @@ -214,14 +219,14 @@ class FComputeExecutor : public StorageFallbackOpExecutor { return exec_type_; } - explicit FComputeExecutor(const NodeAttrs& attrs, FCompute fcompute, - ExecType exec_type, const std::vector &mutate_idx) - : StorageFallbackOpExecutor(mutate_idx), - attrs_(attrs), fcompute_(fcompute), exec_type_(exec_type) { + explicit FComputeExecutor(const NodeAttrs& attrs, const DispatchMode dispatch_mode, + FCompute fcompute, ExecType exec_type, + const std::vector &mutate_idx) + : StorageFallbackOpExecutor(attrs, dispatch_mode, mutate_idx), + fcompute_(fcompute), exec_type_(exec_type) { } private: - NodeAttrs attrs_; FCompute fcompute_; ExecType exec_type_; }; @@ -235,13 +240,13 @@ class FComputeExExecutor : public OpExecutor { InvalidateOutputs(out_array, req); // TODO(alex): (MXNET-847) Remove this fallback feature after subgraph implemented const auto is_mkldnn = 
Op::GetAttr("TIsMKLDNN"); - if (!is_mkldnn.get(attrs_.op, false)) { + if (!is_mkldnn.get(attrs.op, false)) { CreateDefaultInputs(in_array, &in_array_fallback); - fcompute_(attrs_, op_ctx, in_array_fallback, req, out_array); + fcompute_(attrs, op_ctx, in_array_fallback, req, out_array); return; } #endif - fcompute_(attrs_, op_ctx, in_array, req, out_array); + fcompute_(attrs, op_ctx, in_array, req, out_array); } void Setup() override {} @@ -250,13 +255,12 @@ class FComputeExExecutor : public OpExecutor { return exec_type_; } - explicit FComputeExExecutor(const NodeAttrs& attrs, FComputeEx fcompute, - ExecType exec_type) - : attrs_(attrs), fcompute_(fcompute), exec_type_(exec_type) { + explicit FComputeExExecutor(const NodeAttrs& attrs, const DispatchMode dispatch_mode, + FComputeEx fcompute, ExecType exec_type) + : OpExecutor(attrs, dispatch_mode), fcompute_(fcompute), exec_type_(exec_type) { } private: - NodeAttrs attrs_; FComputeEx fcompute_; ExecType exec_type_; }; @@ -310,7 +314,8 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, op, "FStatefulComputeEx", vctx[i]); // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) { - ret[i] = std::make_shared(inode.source->attrs, state, + ret[i] = std::make_shared(inode.source->attrs, + dispatch_modes[i], state, fcompute_ex, exec_type); } else { FStatefulCompute fcompute = common::GetFCompute( @@ -318,7 +323,9 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, CHECK(fcompute != nullptr) << "One of FStatefulCompute and FStatefulComputeEx must be registered " << "for stateful operator " << op->name; - ret[i] = std::make_shared(state, fcompute, + ret[i] = std::make_shared(inode.source->attrs, + dispatch_modes[i], + state, fcompute, exec_type, mutate_index); } } else if (is_layer_backward.get(op, false)) { @@ -331,26 +338,27 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) { ret[i] = std::make_shared( - inode.source->attrs, ret[fwd_id].get()->state(), fcompute_ex, - exec_type); + inode.source->attrs, dispatch_modes[i], ret[fwd_id].get()->state(), + fcompute_ex, exec_type); } else { FStatefulCompute fcompute = common::GetFCompute( op, "FStatefulCompute", vctx[i]); CHECK(fcompute != nullptr) << "One of FStatefulCompute and FStatefulComputeEx must be registered " << "for stateful operator " << op->name; - ret[i] = std::make_shared( - ret[fwd_id].get()->state(), fcompute, exec_type, mutate_index); + ret[i] = std::make_shared(inode.source->attrs, + dispatch_modes[i], ret[fwd_id].get()->state(), fcompute, exec_type, + mutate_index); } } else { FCompute fcompute = common::GetFCompute(op, "FCompute", vctx[i]); FComputeEx fcomp_ex = common::GetFCompute(op, "FComputeEx", vctx[i]); if (fcomp_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) { ret[i] = std::make_shared( - inode.source->attrs, fcomp_ex, exec_type); + inode.source->attrs, dispatch_modes[i], fcomp_ex, exec_type); } else if (fcompute != nullptr) { ret[i] = std::make_shared( - inode.source->attrs, fcompute, exec_type, mutate_index); + inode.source->attrs, dispatch_modes[i], fcompute, exec_type, mutate_index); } else { LOG(INFO) << "Neither FCompute nor FComputeEx registered " << op->name; } diff --git 
a/src/executor/cuda_graphs.h b/src/executor/cuda_graphs.h new file mode 100644 index 000000000000..e669d7d1d2e3 --- /dev/null +++ b/src/executor/cuda_graphs.h @@ -0,0 +1,547 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2020 by Contributors + * \file cuda_graphs.h + * \brief Wrappers for use of CUDA Graphs API + */ +#ifndef MXNET_EXECUTOR_CUDA_GRAPHS_H_ +#define MXNET_EXECUTOR_CUDA_GRAPHS_H_ + +#include +#include +#include +#include +#include + +#include "./exec_pass.h" +#include "../common/cuda_utils.h" + +#if MXNET_USE_CUDA +#define CUDA_GRAPHS_AVAILABLE (CUDA_VERSION >= 10020) +#else +#define CUDA_GRAPHS_AVAILABLE (0) +#endif + +#if CUDA_GRAPHS_AVAILABLE + +namespace mxnet { +namespace cuda_graphs { + +inline std::string CudaDim3ToString(const dim3& dims) { + std::stringstream ss; + if (dims.z != 1) + ss << "(" << dims.x << "," << dims.y << "," << dims.z << ")"; + else if (dims.y != 1) + ss << "(" << dims.x << "," << dims.y << ")"; + else + ss << "(" << dims.x << ")"; + return ss.str(); +} + +// Return the list of CUDA Graph nodes from a graph +inline std::vector GetCudaGraphNodes(cudaGraph_t cuda_graph) { + size_t numNodes; + CUDA_CALL(cudaGraphGetNodes(cuda_graph, static_cast(nullptr), &numNodes)); + if (numNodes == 0) + return std::vector(); + std::vector graphNodes(numNodes); + CUDA_CALL(cudaGraphGetNodes(cuda_graph, graphNodes.data(), &numNodes)); + return graphNodes; +} + +// It does not really involve RTC, but requires libcuda.so, +// which is linked only when RTC is enabled. +#if MXNET_ENABLE_CUDA_RTC + +inline std::string CudaGraphNodeToString(const cudaGraphNode_t node) { + std::stringstream ss; + + // The following introspection calls are made through the driver API in order to bypass + // problems that would arise if multiple statically-linked copies of the runtime exist. 
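As a side note, the node-enumeration helper `GetCudaGraphNodes` earlier in this header follows the standard two-call CUDA query pattern: ask for the count first, then fill a typed buffer. A minimal standalone sketch of that pattern (the helper name `ListGraphNodes` is illustrative, and error checking via `CUDA_CALL` is omitted):

```cpp
#include <cuda_runtime_api.h>
#include <vector>

// Sketch of the two-call query pattern: the first cudaGraphGetNodes call
// (null node buffer) only reports the node count; the second fills a vector
// sized to that count.
std::vector<cudaGraphNode_t> ListGraphNodes(cudaGraph_t graph) {
  size_t num_nodes = 0;
  cudaGraphGetNodes(graph, nullptr, &num_nodes);
  std::vector<cudaGraphNode_t> nodes(num_nodes);
  if (num_nodes > 0)
    cudaGraphGetNodes(graph, nodes.data(), &num_nodes);
  return nodes;
}
```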
+ + CUgraphNode cu_node = node; + CUgraphNodeType t; + CUDA_DRIVER_CALL(cuGraphNodeGetType(cu_node, &t)); + switch (t) { + case CU_GRAPH_NODE_TYPE_KERNEL: + { + CUDA_KERNEL_NODE_PARAMS kparams; + auto err = cuGraphKernelNodeGetParams(cu_node, &kparams); + if (err == CUDA_SUCCESS) { + ss << "GPUKernel@" << kparams.func; + dim3 gridDim(kparams.gridDimX, kparams.gridDimY, kparams.gridDimZ); + dim3 blockDim(kparams.blockDimX, kparams.blockDimY, kparams.blockDimZ); + ss << "<<>>"; + ss << "(..."; + if (kparams.sharedMemBytes != 0) + ss << ", dynSharedMemBytes=" << kparams.sharedMemBytes; + ss << ")"; + } else { + ss << "GPU Kernel: cuGraphKernelNodeGetParams() fails with " << err; + } + } + break; + case CU_GRAPH_NODE_TYPE_MEMCPY: + { + cudaMemcpy3DParms mparams = {}; + CUDA_CALL(cudaGraphMemcpyNodeGetParams(node, &mparams)); + // If memcpy is seen, return without setting up runnable executor + switch (mparams.kind) { + case cudaMemcpyHostToHost: ss << "Host->Host "; break; + case cudaMemcpyHostToDevice: ss << "Host->Device "; break; + case cudaMemcpyDeviceToHost: ss << "Device->Host "; break; + case cudaMemcpyDeviceToDevice: ss << "Device->Device "; break; + default: break; + } + ss << "Memcpy"; + } + break; + case CU_GRAPH_NODE_TYPE_MEMSET: + { + cudaMemsetParams mparams = {}; + CUDA_CALL(cudaGraphMemsetNodeGetParams(node, &mparams)); + if (mparams.height == 1 && mparams.elementSize == 1) { + ss << "cudaMemset(devPtr=" << mparams.dst << ", value=" << mparams.value + << ", count=" << mparams.width << ")"; + } else { + if (mparams.elementSize == 1) + ss << "cudaMemset2D"; + else + ss << "MemSet"; + ss << "(devPtr=" << mparams.dst << ", pitch=" << mparams.pitch + << ", value=" << mparams.value << ", width=" << mparams.width + << ", height=" << mparams.height << ")"; + } + } + break; + case CU_GRAPH_NODE_TYPE_HOST: ss << "Host (executable) node"; break; + case CU_GRAPH_NODE_TYPE_GRAPH: ss << "Node which executes an embedded graph"; break; + case CU_GRAPH_NODE_TYPE_EMPTY: ss << "Empty (no-op) node"; break; + default: ss << "Unknown/Invalid node type " << t; + } + return ss.str(); +} + +#endif // MXNET_ENABLE_CUDA_RTC + +// CUDA Graphs are managed in RAII fashion by smart pointers below. +// Function objects (preferred for readability) provide the deleter function. +class CudaGraphDeleter { + public: + void operator() (cudaGraph_t graph) { + if (graph != nullptr) + CUDA_CALL(cudaGraphDestroy(graph)); + } +}; + +// CUDA Graphs Executors are managed in RAII fashion by smart pointers below. +// Function objects (preferred for readability) provide the deleter function. +class CudaGraphExecDeleter { + public: + void operator() (cudaGraphExec_t graph_exec) { + if (graph_exec != nullptr) + CUDA_CALL(cudaGraphExecDestroy(graph_exec)); + } +}; + +// A CUDA Graphs executor for a portion of an Operator Segment (i.e. a 'SubSegment'), +// characterized by a starting index in the OpExecutor list and a number of ops. 
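Before the class itself, here is a condensed, hypothetical sketch of the CUDA Graphs lifecycle it wraps: capture work enqueued on a stream into a graph, instantiate the graph into an executable, then replay that executable. The function and parameter names are placeholders and error handling (the `CUDA_CALL` macro used in the real code) is omitted; the same runtime calls appear in `MakeGraph`/`MakeGraphExec` below.

```cpp
#include <cuda_runtime_api.h>
#include <functional>

// Sketch: capture whatever `launch_ops` enqueues on `stream` into a CUDA graph,
// instantiate it once, then replay it cheaply any number of times.
void CaptureAndReplay(cudaStream_t stream,
                      const std::function<void()>& launch_ops,
                      int num_replays) {
  cudaGraph_t graph = nullptr;
  cudaGraphExec_t graph_exec = nullptr;

  // 1. Capture: kernels launched on `stream` are recorded, not executed.
  cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal);
  launch_ops();
  cudaStreamEndCapture(stream, &graph);

  // 2. Instantiate the captured graph into an executable form.
  cudaGraphInstantiate(&graph_exec, graph, nullptr, nullptr, 0);

  // 3. Replay: one launch call replaces many individual kernel launches.
  for (int i = 0; i < num_replays; ++i)
    cudaGraphLaunch(graph_exec, stream);

  cudaGraphExecDestroy(graph_exec);
  cudaGraphDestroy(graph);
}
```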
+class CudaGraphsSubSegExec { + public: + CudaGraphsSubSegExec(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu, + bool verbose, + int from_op_idx, + int num_ops, + bool ops_are_cuda_graph_compatible = true) : + from_op_idx_(from_op_idx), + num_ops_(num_ops), + graph_(nullptr), + graph_exec_(nullptr) { + if (ops_are_cuda_graph_compatible) { + MakeGraph(exec_list, rctx, is_gpu, verbose, from_op_idx, num_ops); + MakeGraphExec(); + } + } + + void Update(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu, + bool verbose) { + // Current executor should be Runnable with the same parameters + CHECK(IsRunnable()); + MakeGraph(exec_list, rctx, is_gpu, verbose, from_op_idx_, num_ops_); + + cudaGraphExecUpdateResult update_result = cudaGraphExecUpdateError; + cudaGraphNode_t error_node; + CUDA_CALL(cudaGraphExecUpdate(graph_exec_.get(), graph_.get(), + &error_node, &update_result)); + // If update fails make a new executor, discarding old one. + if (update_result != cudaGraphExecUpdateSuccess) + MakeGraphExec(); + } + + void RunSubSeg(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu) { + if (IsRunnable()) { + auto s = rctx.get_stream(); + const cudaStream_t cu_s = mshadow::Stream::GetStream(s); + CUDA_CALL(cudaGraphLaunch(graph_exec_.get(), cu_s)); + } else { + // No CUDA Graph could be made for this portion of the OpSegment. Run conventionally. + for (int i = 0; i != num_ops_; ++i) + exec_list[from_op_idx_ + i]->Run(rctx, is_gpu); + } + } + + bool IsRunnable() { return graph_exec_ != nullptr; } + + private: + void MakeGraph(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu, + bool verbose, + int from_op_idx, + int num_ops) { + auto s = rctx.get_stream(); + const cudaStream_t cu_s = mshadow::Stream::GetStream(s); + // Create CUDA Graph + // Use of cudaStreamCaptureModeThreadLocal allows other threads like GPU Copy workers + // to sync their streams without disturbing this capture. + CUDA_CALL(cudaStreamBeginCapture(cu_s, cudaStreamCaptureModeThreadLocal)); + // Run those oprs in the sub segment while capturing- no actual GPU work is launched. 
+ for (int i = 0; i != num_ops; ++i) + exec_list[from_op_idx + i]->Run(rctx, is_gpu); + cudaGraph_t cuda_graph = nullptr; + CUDA_CALL(cudaStreamEndCapture(cu_s, &cuda_graph)); + graph_.reset(cuda_graph, CudaGraphDeleter()); + + if (verbose) { + std::vector graph_nodes = GetCudaGraphNodes(cuda_graph); + size_t num_nodes = graph_nodes.size(); + LOG(INFO) << " Graph has " << num_nodes << " nodes:"; +#if MXNET_ENABLE_CUDA_RTC + for (size_t i = 0; i != num_nodes; ++i) { + LOG(INFO) << " node " << i << " = " + << CudaGraphNodeToString(graph_nodes[i]); + } +#endif // MXNET_ENABLE_CUDA_RTC + } + } + + void MakeGraphExec() { + cudaGraphExec_t cuda_graph_exec; + cudaGraphNode_t error_node; + char log_buffer[1000]; + + CUDA_CALL(cudaGraphInstantiate(&cuda_graph_exec, graph_.get(), + &error_node, log_buffer, 1000)); + graph_exec_.reset(cuda_graph_exec, CudaGraphExecDeleter()); + + // At this point we have a CUDA Graph executor + static int num_graph_creations_logged = 0; + static int max_log_entries = dmlc::GetEnv("MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES", 0); + if (num_graph_creations_logged < max_log_entries) { + num_graph_creations_logged++; + LOG(INFO) << "Created CUDA graph " << num_graph_creations_logged; + if (num_graph_creations_logged == max_log_entries) + LOG(INFO) << "Further CUDA graph creation log messages are suppressed."; + } + } + + int from_op_idx_; + int num_ops_; + using cudaGraphStruct_t = typename std::remove_pointer::type; + using cudaGraphExecStruct_t = typename std::remove_pointer::type; + std::shared_ptr graph_; + std::shared_ptr graph_exec_; +}; + +// The CudaGraph executor and associated Tempspace ptrs for which it is valid. +struct CudaGraphInfo { + std::vector cuda_graph_subseg_execs; + bool has_been_run_conventionally = false; + std::vector tempspace_dptrs; +}; +// A CUDA graph is maintained for every combination of cudaStream_t (i.e. GPU Worker) and +// the state of the is_train flag of the OpContext. If the tempspace_dptrs change, we +// don't expect to ever see the old tempspace_dptrs config again, so we discard the CUDA graph. +struct CudaGraphCacheKey { + cudaStream_t cu_s; + bool is_train; + // overload '<' so CudaGraphCacheKey can be used as a std::map key + bool operator<(const CudaGraphCacheKey &other) const { + return cu_s < other.cu_s || (cu_s == other.cu_s && is_train < other.is_train); + } +}; +using CudaGraphCache = std::map; + +class CudaGraphsExec { + public: + CudaGraphsExec(const std::vector > &exec_list, + bool is_gpu, + const char *opr_names) : + verbose_(false), is_enabled_(false) { + opr_names_ = opr_names ? 
std::string(opr_names) : std::string(); + if (is_gpu) { + is_enabled_ = dmlc::GetEnv("MXNET_ENABLE_CUDA_GRAPHS", false); + verbose_ = dmlc::GetEnv("MXNET_CUDA_GRAPHS_VERBOSE", false); + SetTempSpaces(exec_list); + } + } + + void RunAll(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu) { + // If this a CPU op or CUDA Graphs use isn't possible, run normally and return + if (!is_gpu || !is_enabled_) { + // Run all opr in the sub-graph + exec::OpExecutor::RunAll(exec_list, rctx, is_gpu); + return; + } + + // Also if we're in a warm-up period where tempspace pointers are likely + // to change, run normally and return + auto s = rctx.get_stream(); + const cudaStream_t cu_s = mshadow::Stream::GetStream(s); + // All the ops in the bulked segment will have the same setting of is_train as the first op + const bool is_train = exec_list.size() > 0 && exec_list[0]->op_ctx.is_train; + const CudaGraphCacheKey key = {cu_s, is_train}; + // Look-up the CUDA Graph info for this combo of stream and is_train setting + // This may create a default-initialized new entry. + auto &cuda_graph_info = cache_[key]; + if (!cuda_graph_info.has_been_run_conventionally) { + // Run all opr in the sub-graph + exec::OpExecutor::RunAll(exec_list, rctx, is_gpu); + cuda_graph_info.has_been_run_conventionally = true; + return; + } + + // At this point we will launch one or more CUDA Graphs through CUDA Graphs 'executors' + // (there might be more than one executor if some ops in the segment are not capturable) + auto before_exec_tempspace_ptrs = GetGPUTempspacePtrs(s); + + // Executors exist, but the tempspace pts have changed, so update them in-place via 'recapture'. + if (cuda_graph_info.cuda_graph_subseg_execs.size() > 0 && + cuda_graph_info.tempspace_dptrs != before_exec_tempspace_ptrs) { + // Update all runnable executors. Non-runnable executors launch their ops conventionally. + for (auto &subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) { + if (subseg_exec.IsRunnable()) + subseg_exec.Update(exec_list, rctx, is_gpu, verbose_); + } + } else if (cuda_graph_info.cuda_graph_subseg_execs.size() == 0) { + // No executors exist yet, so create them. + if (verbose_) + LOG(INFO) << "Capturing CUDA graph of op segment " << opr_names_; + // Make one or more CUDA Graphs, avoiding ops that are not compatible. + for (size_t first_op_idx = 0; first_op_idx != exec_list.size();) { + int num_good_ops = 0; + for (size_t last_op_idx = first_op_idx; last_op_idx != exec_list.size(); ++last_op_idx) { + if (OpOK(exec_list[last_op_idx])) + num_good_ops++; + else + break; + } + if (num_good_ops > 0) { + CreateSubExecOverRegion(exec_list, rctx, is_gpu, + first_op_idx, + first_op_idx + num_good_ops, + &cuda_graph_info.cuda_graph_subseg_execs); + first_op_idx += num_good_ops; + } + if (first_op_idx != exec_list.size()) { + // We had to have hit an op that was not OK. + if (verbose_) { + LOG(INFO) << "Bypassing notOK op segment[" << first_op_idx << "," << first_op_idx << "]" + << " of op segment " << opr_names_; + } + CudaGraphsSubSegExec notOK_opseg(exec_list, rctx, is_gpu, false, first_op_idx, 1, false); + cuda_graph_info.cuda_graph_subseg_execs.push_back(notOK_opseg); + first_op_idx++; + } + } + // During graph capture, the ops may be asking for the tempworkspace. This should + // not alter the base pointers, since this op seg has been executed before on this + // stream (i.e. on this gpu worker). Safest to double-check this though. 
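The double-check mentioned here relies on being able to read a temp space's base pointer cheaply. A simplified illustration of that trick, assuming MXNet's `Resource::get_space_typed` API (the real helper is `GetGPUTempspacePtrs` further below): requesting a minimal one-element allocation returns the start of the backing buffer without growing it, and a captured CUDA graph remains valid only while these base pointers are unchanged.

```cpp
#include <mxnet/resource.h>
#include <mshadow/tensor.h>

// Simplified sketch (not the verbatim helper): read the base pointer of a
// requested temp space by asking for a 1-element allocation, which does not
// enlarge the underlying buffer.
void* TempspaceBasePtr(const mxnet::Resource* resource,
                       mshadow::Stream<mxnet::gpu>* s) {
  auto tensor = resource->get_space_typed<mxnet::gpu, 1, char>(mshadow::Shape1(1), s);
  return static_cast<void*>(tensor.dptr_);
}
```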
+ auto after_capture_tempspace_ptrs = GetGPUTempspacePtrs(s); + if (before_exec_tempspace_ptrs != after_capture_tempspace_ptrs) + LOG(FATAL) << "Internal error: saw change in TempSpace ptrs during CUDA graph use."; + cuda_graph_info.tempspace_dptrs = before_exec_tempspace_ptrs; + } + // Now execute the CUDA Graph that we either just created or looked-up in the cache. + if (verbose_) { + int runnable_execs = 0; + int bypassed_ops = 0; + for (auto &subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) { + if (subseg_exec.IsRunnable()) + runnable_execs++; + else + bypassed_ops++; + } + LOG(INFO) << "Launching " << runnable_execs + << " captured CUDA graph(s) for op segment " << opr_names_; + if (bypassed_ops > 0) + LOG(INFO) << " (bypassing " << bypassed_ops << " un-capturable ops)"; + } + for (auto &subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) + subseg_exec.RunSubSeg(exec_list, rctx, is_gpu); + } + + private: + // Make a CUDA Graph of the region of ops [from_op_idx, upto_op_idx). If such a graph + // is not runnable, e.g. if it includes memcpys from unpinned cpu memory, then make a + // number of smaller graphs that avoid those ops with the memcpys. + void CreateSubExecOverRegion(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu, + size_t from_op_idx, + size_t upto_op_idx, + std::vector *cuda_graph_subseg_execs) { + // Optimistically try to create a CUDA Graph of the entire op segment region + + int num_ops = upto_op_idx - from_op_idx; + CudaGraphsSubSegExec full_opseg(exec_list, rctx, is_gpu, verbose_, from_op_idx, num_ops); + if (full_opseg.IsRunnable()) { + cuda_graph_subseg_execs->push_back(full_opseg); + } else { + if (verbose_) + LOG(INFO) << " Graph was not runnable- creating op sub-segments..."; + // Enter fall-back approach to making many sub-execs + for (size_t first_op_idx = from_op_idx; first_op_idx != upto_op_idx; ) { + int num_good_ops = 0; + for (size_t last_op_idx = first_op_idx; last_op_idx != upto_op_idx; ++last_op_idx) { + CudaGraphsSubSegExec single_opseg(exec_list, rctx, is_gpu, false, last_op_idx, 1); + if (single_opseg.IsRunnable()) + num_good_ops++; + // Is it time to create a subseg exec from accumulated good ops? + if (num_good_ops > 0 && + (last_op_idx == upto_op_idx - 1 || !single_opseg.IsRunnable())) { + if (verbose_) + LOG(INFO) << "Capturing CUDA graph of op sub segment[" + << first_op_idx << ":" << (first_op_idx + num_good_ops - 1) << "]" + << " of op segment " << opr_names_; + CudaGraphsSubSegExec good_opseg(exec_list, rctx, is_gpu, verbose_, + first_op_idx, num_good_ops); + CHECK(good_opseg.IsRunnable()) << "Unexpected issue with CUDA Graphs creation"; + cuda_graph_subseg_execs->push_back(good_opseg); + first_op_idx += num_good_ops; + } + // If the last single op was not runnable, use the exec to handle that op conventionally + if (!single_opseg.IsRunnable()) { + if (verbose_) { + LOG(INFO) << "Bypassing op sub segment[" << last_op_idx << "," << last_op_idx << "]" + << " of op segment " << opr_names_; + // Generate throw-away exec in order to produce a diagnostic listing of graph nodes + CudaGraphsSubSegExec dummy(exec_list, rctx, is_gpu, verbose_, last_op_idx, 1); + } + cuda_graph_subseg_execs->push_back(single_opseg); + first_op_idx++; + break; + } + } + } + } + } + + // Is the Op OK to make part of a CUDA Graph? 
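Before the check itself (`OpOK`, below), a sketch of how its verdicts are consumed by the segmentation logic above: the op segment is walked greedily, grouping consecutive compatible ops into one capturable sub-segment and emitting each incompatible op as a single, conventionally-run entry. The names here are illustrative only.

```cpp
#include <functional>
#include <vector>

// Sketch of the segmentation strategy: returns (start, length, capturable)
// triples describing alternating capturable runs and bypassed single ops.
struct SubSeg { size_t start; size_t len; bool capturable; };

std::vector<SubSeg> PartitionSegment(size_t num_ops,
                                     const std::function<bool(size_t)>& op_ok) {
  std::vector<SubSeg> result;
  for (size_t first = 0; first < num_ops;) {
    size_t good = 0;
    while (first + good < num_ops && op_ok(first + good))
      ++good;
    if (good > 0)
      result.push_back({first, good, true});   // captured as one CUDA graph
    first += good;
    if (first < num_ops) {
      result.push_back({first, 1, false});     // bypassed, run conventionally
      ++first;
    }
  }
  return result;
}
```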
+ bool OpOK(const std::shared_ptr &exec) { + static auto& fstateful = Op::GetAttr("FCreateOpState"); + static auto& fgraphcompatible = Op::GetAttr("FIsCUDAGraphsCompatible"); + static auto& fcompute_ex = Op::GetAttr("FComputeEx"); + const auto& attrs = exec->attrs; + if (attrs.op != nullptr) { + const auto f = fgraphcompatible.get(attrs.op, nullptr); + if (f != nullptr) { + return f(attrs, exec->op_ctx.is_train); + } + if (fstateful.get(attrs.op, nullptr) != nullptr) { + if (verbose_) { + LOG(INFO) << "Omitting stateful operator " << attrs.op->name << " from CUDA graph."; + } + return false; + } + if ((fcompute_ex.get(attrs.op, nullptr) != nullptr && + exec->dispatch_mode == DispatchMode::kFComputeEx) || + exec->dispatch_mode == DispatchMode::kFComputeFallback) { + if (verbose_) { + LOG(INFO) << "Omitting operator " << attrs.op->name + << " from CUDA graph due to dispatch mode " + << static_cast(exec->dispatch_mode); + } + return false; + } + } + for (auto& resource : exec->op_ctx.requested) { + if (!(resource.req.type == ResourceRequest::kTempSpace)) { + if (verbose_) { + LOG(INFO) << "Omitting operator " << attrs.op->name + << " from CUDA graph due to using the resource type " + << static_cast(resource.req.type); + } + return false; + } + } + return true; + } + + // Determine Tempspaces used by ops. Other resource uses disable CUDA Graphs. + void SetTempSpaces(const std::vector > &exec_list) { + // Gather info about the ops use of TempSpace. + if (is_enabled_) { + std::set tempspaces_set; + for (auto& exec : exec_list) { + for (auto& resource : exec->op_ctx.requested) { + if (resource.req.type == ResourceRequest::kTempSpace) { + tempspaces_set.insert(&resource); + } + } + } + tempspaces_.assign(tempspaces_set.begin(), tempspaces_set.end()); + } + } + + // Return the addresses of the gpu TempSpace areas + std::vector GetGPUTempspacePtrs(mshadow::Stream *s) { + std::vector ret; + for (const auto& resource : tempspaces_) { + // Ask for minimal allocation to get base pointer without increasing the size + auto *base_ptr = resource->get_space_typed(mshadow::Shape1(1), s).dptr_; + ret.push_back(static_cast(base_ptr)); + } + return ret; + } + + CudaGraphCache cache_; + std::vector tempspaces_; + std::string opr_names_; + bool verbose_; + bool is_enabled_; +}; + +} // namespace cuda_graphs +} // namespace mxnet + +#endif // CUDA_GRAPHS_AVAILABLE + +#endif // MXNET_EXECUTOR_CUDA_GRAPHS_H_ diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index e3d2fa459bc3..4552fa173fe4 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -86,6 +87,13 @@ class OpExecutor { std::vector req; /*! \brief runtime op context, contains allocated resources */ OpContext op_ctx; + /*! \brief attributes of the node */ + NodeAttrs attrs; + /*! \brief dispatch mode of the executor */ + DispatchMode dispatch_mode; + + explicit OpExecutor(NodeAttrs attrs, DispatchMode dispatch_mode) : + attrs(std::move(attrs)), dispatch_mode(dispatch_mode) {} /*! \brief virtual destructor */ virtual ~OpExecutor() {} /*! @@ -100,6 +108,16 @@ class OpExecutor { * \param rctx The runtime context passed in by environment. */ virtual void Run(RunContext rctx, bool is_gpu) = 0; + /*! + * \brief run the operators of a vector of execs, given runtime context on device. + * This function call does not synchronize the stream. + * \param rctx The runtime context passed in by environment. 
+ */ + static void RunAll(const std::vector > &execs, + RunContext rctx, bool is_gpu) { + for (auto &exec : execs) + exec->Run(rctx, is_gpu); + } /*! \return the execution type */ virtual ExecType exec_type() const = 0; /*! \return return engine variable for operator states */ diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 13bab2e544bf..c7febc4712f1 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -31,6 +31,7 @@ #include "./exec_pass.h" #include "./graph_executor.h" +#include "./cuda_graphs.h" #include "../profiler/profiler.h" #include "../common/utils.h" #include "../common/exec_utils.h" @@ -84,7 +85,8 @@ void GraphExecutor::Forward(bool is_train) { void GraphExecutor::PartialForward(bool is_train, int step, int *step_left) { size_t sstep = static_cast(step); if (sstep >= num_forward_nodes_) { - *step_left = 0; return; + *step_left = 0; + return; } RunOps(is_train, sstep, sstep + 1); *step_left = static_cast(num_forward_nodes_ - sstep - 1); @@ -166,11 +168,12 @@ void GraphExecutor::Backward(const std::vector& head_grads, bool is_tra } void GraphExecutor::Print(std::ostream &os) const { // NOLINT(*) - nnvm::Symbol s; s.outputs = graph_.outputs; + nnvm::Symbol s; + s.outputs = graph_.outputs; s.Print(os); // message to be backward compatible with the memonger size_t total_bytes = graph_.GetAttr("storage_allocated_bytes"); - os << "Total " << (total_bytes >> 20UL) <<" MB allocated\n"; + os << "Total " << (total_bytes >> 20UL) << " MB allocated\n"; os << "Total " << 11 << " TempSpace resource requested\n"; } @@ -337,7 +340,8 @@ nnvm::Graph GraphExecutor::InitFullGraph(nnvm::Symbol symbol, g = exec::EliminateCommonExpr(std::move(g)); need_grad_ = false; for (OpReqType req : grad_req_types) { - if (req != kNullOp) need_grad_ = true; + if (req != kNullOp) + need_grad_ = true; } if (!need_grad_) return g; for (size_t i = 0; i < g.outputs.size(); ++i) { @@ -1312,12 +1316,12 @@ void GraphExecutor::InitCachedOps() { // call on complete only if it is async op if (!is_async) { if (is_gpu) { - #if MXNET_USE_CUDA +#if MXNET_USE_CUDA // Wait GPU kernel to finish. ctx.get_stream()->Wait(); - #else +#else LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; - #endif +#endif } on_complete(); } @@ -1607,21 +1611,31 @@ GraphExecutor::CachedSegOpr GraphExecutor::CreateCachedSegOpr(size_t topo_start, opr_names += inode.source->op()->name + ","; } - if (pctx == nullptr) return ret; + if (pctx == nullptr) + return ret; ret.ctx = *pctx; Engine::Get()->DeduplicateVarHandle(&use_vars, &mutate_vars); bool is_gpu = pctx->dev_mask() == gpu::kDevMask; + +#if CUDA_GRAPHS_AVAILABLE + // Provide initialized `cuda_graphs_exec`, which when captured + // by exec_fun, acts like a static variable inside the mutable closure. + cuda_graphs::CudaGraphsExec cuda_graphs_exec(exec_list, is_gpu, opr_names.c_str()); + auto exec_fun = [cuda_graphs_exec, exec_list, is_gpu] ( + RunContext rctx, Engine::CallbackOnComplete on_complete) mutable { + // Run all opr in the sub-graph with CUDA graphs executor if possible + cuda_graphs_exec.RunAll(exec_list, rctx, is_gpu); +#else auto exec_fun = [exec_list, is_gpu] ( - RunContext ctx, Engine::CallbackOnComplete on_complete) { + RunContext rctx, Engine::CallbackOnComplete on_complete) { // Run all opr in the sub-graph - for (auto &exec : exec_list) { - exec->Run(ctx, is_gpu); - } + OpExecutor::RunAll(exec_list, rctx, is_gpu); +#endif if (is_gpu) { #if MXNET_USE_CUDA // Wait GPU kernel to finish. 
- ctx.get_stream()->Wait(); + rctx.get_stream()->Wait(); #else LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; #endif diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index 156013857d6a..498e868a2b39 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -26,6 +26,7 @@ #include #include #include "../executor/graph_executor.h" +#include "../executor/cuda_graphs.h" #include "../executor/exec_pass.h" #include "../c_api/c_api_common.h" #include "../common/utils.h" @@ -985,12 +986,25 @@ inline Engine::OprHandle CreateEngineOp( bool is_gpu = default_ctx.dev_mask() == gpu::kDevMask; bool is_async = execs.size() > 1 ? false : execs[0]->exec_type() == ExecType::kAsync; +#if CUDA_GRAPHS_AVAILABLE + // Provide initialized `cuda_graphs_exec`, which when captured + // by exec_fun, acts like a static variable inside the mutable closure. + cuda_graphs::CudaGraphsExec cuda_graphs_exec(execs, is_gpu, opr_names); + auto exec_fun = [cuda_graphs_exec, execs, is_async, is_gpu] ( + RunContext ctx, Engine::CallbackOnComplete on_complete) mutable { + if (is_async) { + execs[0]->op_ctx.async_on_complete = on_complete; + } + // Run all opr in the sub-graph with CUDA graphs executor if possible + cuda_graphs_exec.RunAll(execs, ctx, is_gpu); +#else auto exec_fun = [execs, is_async, is_gpu] ( RunContext ctx, Engine::CallbackOnComplete on_complete) { if (is_async) { execs[0]->op_ctx.async_on_complete = on_complete; } - for (const auto& exec : execs) exec->Run(ctx, is_gpu); + exec::OpExecutor::RunAll(execs, ctx, is_gpu); +#endif // call on complete only if it is async op if (!is_async) { if (is_gpu) { diff --git a/src/operator/contrib/adamw.cu b/src/operator/contrib/adamw.cu index 2b0040e5f6ac..1a81ba0fa99e 100644 --- a/src/operator/contrib/adamw.cu +++ b/src/operator/contrib/adamw.cu @@ -41,15 +41,31 @@ void GetScaleFloat(mshadow::Stream *s, const TBlob &scale_blob, float } NNVM_REGISTER_OP(_adamw_update) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", MPUpdate>); NNVM_REGISTER_OP(_mp_adamw_update) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", MPUpdate>); NNVM_REGISTER_OP(_multi_adamw_update) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", multiMPUpdate); NNVM_REGISTER_OP(_multi_mp_adamw_update) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", multiMPUpdate); } // namespace op diff --git a/src/operator/nn/dropout.cu b/src/operator/nn/dropout.cu index 20c5714dd904..f3a99eb431a4 100644 --- a/src/operator/nn/dropout.cu +++ b/src/operator/nn/dropout.cu @@ -30,6 +30,11 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(Dropout) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool is_train) { + // Dropout is just passthrough during inference + return !is_train; + }) .set_attr("FStatefulCompute", DropoutCompute); NNVM_REGISTER_OP(_backward_Dropout) diff --git a/src/operator/numpy/linalg/np_eig.cu b/src/operator/numpy/linalg/np_eig.cu index c0184ad221d5..ce4d5a173ced 100644 --- a/src/operator/numpy/linalg/np_eig.cu +++ b/src/operator/numpy/linalg/np_eig.cu @@ -29,11 +29,19 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_eig) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", EigOpForward); 
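The operator-level changes in this patch all follow the same shape: register an `FIsCUDAGraphsCompatible` lambda with the signature `bool(const NodeAttrs&, const bool is_train)` to opt an operator out of capture, either unconditionally or depending on the training flag (as Dropout does, since it is a pass-through during inference). A hypothetical registration fragment illustrating the pattern (the operator name is a placeholder, not part of this patch):

```cpp
// Hypothetical registration: allow capture only outside of training, e.g.
// because the kernel takes a data-dependent path when is_train is true.
// Return false unconditionally to always exclude the op from CUDA graphs.
NNVM_REGISTER_OP(_contrib_my_gpu_op)
.set_attr<FIsCUDAGraphsCompatible>("FIsCUDAGraphsCompatible",
    [](const NodeAttrs&, const bool is_train) {
      return !is_train;  // capturable only during inference
    });
```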
#if MXNET_USE_CUSOLVER == 1 NNVM_REGISTER_OP(_npi_eigh) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", EighOpForward); #endif diff --git a/src/operator/numpy/linalg/np_eigvals.cu b/src/operator/numpy/linalg/np_eigvals.cu index 974dedc6172e..0b21ffe24262 100644 --- a/src/operator/numpy/linalg/np_eigvals.cu +++ b/src/operator/numpy/linalg/np_eigvals.cu @@ -29,11 +29,19 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_eigvals) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", EigvalsOpForward); #if MXNET_USE_CUSOLVER == 1 NNVM_REGISTER_OP(_npi_eigvalsh) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", EigvalshOpForward); #endif diff --git a/src/operator/numpy/linalg/np_norm_backward.cu b/src/operator/numpy/linalg/np_norm_backward.cu index 09e85ab36f19..1936cf937419 100644 --- a/src/operator/numpy/linalg/np_norm_backward.cu +++ b/src/operator/numpy/linalg/np_norm_backward.cu @@ -27,6 +27,11 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_backward_npi_norm) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { + const NumpyNormParam& param = nnvm::get(attrs.parsed); + return param.axis.value().ndim() == 2; + }) .set_attr("FCompute", NumpyNormComputeBackward); } // namespace op diff --git a/src/operator/numpy/linalg/np_norm_forward.cu b/src/operator/numpy/linalg/np_norm_forward.cu index 6feecb09a09e..7f8673cd0ec9 100644 --- a/src/operator/numpy/linalg/np_norm_forward.cu +++ b/src/operator/numpy/linalg/np_norm_forward.cu @@ -27,6 +27,11 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_norm) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { + const NumpyNormParam& param = nnvm::get(attrs.parsed); + return param.axis.value().ndim() == 2; + }) .set_attr("FCompute", NumpyNormComputeForward); } // namespace op diff --git a/src/operator/numpy/np_boolean_mask_assign.cu b/src/operator/numpy/np_boolean_mask_assign.cu index 6fa59bea7710..909a7de63788 100644 --- a/src/operator/numpy/np_boolean_mask_assign.cu +++ b/src/operator/numpy/np_boolean_mask_assign.cu @@ -263,9 +263,17 @@ void NumpyBooleanAssignForwardGPU(const nnvm::NodeAttrs& attrs, } NNVM_REGISTER_OP(_npi_boolean_mask_assign_scalar) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyBooleanAssignForwardGPU); NNVM_REGISTER_OP(_npi_boolean_mask_assign_tensor) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyBooleanAssignForwardGPU); } // namespace op diff --git a/src/operator/numpy/np_constraint_check.cu b/src/operator/numpy/np_constraint_check.cu index f83fca0e5c33..60260affdef3 100644 --- a/src/operator/numpy/np_constraint_check.cu +++ b/src/operator/numpy/np_constraint_check.cu @@ -39,6 +39,10 @@ void GetReduceOutput(mshadow::Stream *s, const TBlob &output_blob, boo } NNVM_REGISTER_OP(_npx_constraint_check) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", ConstraintCheckForward); } // namespace op diff --git a/src/operator/numpy/np_percentile_op.cu b/src/operator/numpy/np_percentile_op.cu index b24c230d6216..09cdd2e30796 100644 --- a/src/operator/numpy/np_percentile_op.cu +++ b/src/operator/numpy/np_percentile_op.cu @@ -48,6 +48,10 @@ bool 
CheckInvalidInput(mshadow::Stream *s, const QType *data, } NNVM_REGISTER_OP(_npi_percentile) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyPercentileForward); } // namespace op diff --git a/src/operator/numpy/random/np_bernoulli_op.cu b/src/operator/numpy/random/np_bernoulli_op.cu index a73bf9929db3..5ac005f85db6 100644 --- a/src/operator/numpy/random/np_bernoulli_op.cu +++ b/src/operator/numpy/random/np_bernoulli_op.cu @@ -29,6 +29,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_bernoulli) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyBernoulliForward); } // namespace op diff --git a/src/operator/numpy/random/np_exponential_op.cu b/src/operator/numpy/random/np_exponential_op.cu index 4740b644a60c..4161a4bf7826 100644 --- a/src/operator/numpy/random/np_exponential_op.cu +++ b/src/operator/numpy/random/np_exponential_op.cu @@ -29,6 +29,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_exponential) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyExponentialForward); NNVM_REGISTER_OP(_backward_broadcast_exponential) diff --git a/src/operator/numpy/random/np_gamma_op.cu b/src/operator/numpy/random/np_gamma_op.cu index 5be15c7b9d13..8c3bca0ca44f 100644 --- a/src/operator/numpy/random/np_gamma_op.cu +++ b/src/operator/numpy/random/np_gamma_op.cu @@ -30,6 +30,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_gamma) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyGammaForward); } // namespace op diff --git a/src/operator/numpy/random/np_multinomial_op.cu b/src/operator/numpy/random/np_multinomial_op.cu index 132d67beeb6a..23de9a72bde6 100644 --- a/src/operator/numpy/random/np_multinomial_op.cu +++ b/src/operator/numpy/random/np_multinomial_op.cu @@ -43,6 +43,10 @@ void CheckPvalGPU(const OpContext& ctx, DType* input, int prob_length) { } NNVM_REGISTER_OP(_npi_multinomial) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyMultinomialForward); } // namespace op diff --git a/src/operator/numpy/random/np_normal_op.cu b/src/operator/numpy/random/np_normal_op.cu index d45bc2321bd7..dc5b73254b08 100644 --- a/src/operator/numpy/random/np_normal_op.cu +++ b/src/operator/numpy/random/np_normal_op.cu @@ -29,12 +29,20 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_normal) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyNormalForward); NNVM_REGISTER_OP(_backward_broadcast_normal) .set_attr("FCompute", NormalReparamBackward); NNVM_REGISTER_OP(_npi_normal_n) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyNormalForward); } // namespace op diff --git a/src/operator/numpy/random/np_pareto_op.cu b/src/operator/numpy/random/np_pareto_op.cu index d8a8a896e653..eecc2a8d44e8 100644 --- a/src/operator/numpy/random/np_pareto_op.cu +++ b/src/operator/numpy/random/np_pareto_op.cu @@ -29,6 +29,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_pareto) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyParetoForward); NNVM_REGISTER_OP(_backward_broadcast_pareto) diff --git 
a/src/operator/numpy/random/np_power_op.cu b/src/operator/numpy/random/np_power_op.cu index d5067f83bb02..154ef2e18f45 100644 --- a/src/operator/numpy/random/np_power_op.cu +++ b/src/operator/numpy/random/np_power_op.cu @@ -29,6 +29,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_powerd) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyPowerForward); } // namespace op diff --git a/src/operator/numpy/random/np_rayleigh_op.cu b/src/operator/numpy/random/np_rayleigh_op.cu index d35ac073fc4d..4eb6e390a922 100644 --- a/src/operator/numpy/random/np_rayleigh_op.cu +++ b/src/operator/numpy/random/np_rayleigh_op.cu @@ -29,6 +29,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_rayleigh) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyRayleighForward); NNVM_REGISTER_OP(_backward_broadcast_rayleigh) diff --git a/src/operator/numpy/random/np_weibull_op.cu b/src/operator/numpy/random/np_weibull_op.cu index 57d609d62768..645c8c4366d3 100644 --- a/src/operator/numpy/random/np_weibull_op.cu +++ b/src/operator/numpy/random/np_weibull_op.cu @@ -29,6 +29,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_weibull) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyWeibullForward); NNVM_REGISTER_OP(_backward_broadcast_weibull) diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu index 6904656b304b..44be673e8dcd 100644 --- a/src/operator/tensor/indexing_op.cu +++ b/src/operator/tensor/indexing_op.cu @@ -877,6 +877,10 @@ NNVM_REGISTER_OP(one_hot) .set_attr("FCompute", OneHotOpForward); NNVM_REGISTER_OP(gather_nd) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", GatherNDForwardGPU); NNVM_REGISTER_OP(scatter_nd) diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py index 60a90c9f5c0c..6b5b4fbbd764 100644 --- a/tests/python/gpu/test_gluon_gpu.py +++ b/tests/python/gpu/test_gluon_gpu.py @@ -20,6 +20,7 @@ import os import tempfile import time +import random import mxnet as mx import multiprocessing as mp from mxnet.test_utils import check_consistency, set_default_context, assert_almost_equal, rand_ndarray, environment @@ -30,7 +31,7 @@ curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.insert(0, os.path.join(curr_path, '../unittest')) -from common import setup_module, with_seed, teardown, assert_raises_cudnn_not_satisfied, run_in_spawned_process +from common import setup_module, with_seed, teardown, assert_raises_cudnn_not_satisfied, run_in_spawned_process, random_seed from test_gluon import * from test_loss import * from test_gluon_rnn import * @@ -643,6 +644,115 @@ def test_gemms_true_fp16(): assert_almost_equal(ref_results.asnumpy(), results_trueFP16.asnumpy(), atol=atol, rtol=rtol) +@with_seed() +def test_cuda_graphs(): + class GraphTester(gluon.HybridBlock): + def __init__(self, function_to_test, **kwargs): + super(GraphTester, self).__init__(**kwargs) + with self.name_scope(): + self.f = function_to_test() + + def hybrid_forward(self, F, *args): + # We need to isolate the operation to be fully inside the graph + # in order for graphs usage to be possible + copied_args = [F.identity(a) for a in args] + outputs = self.f(*copied_args) + if isinstance(outputs, (list, tuple)): + return [F.identity(o) for 
o in outputs] + else: + return F.identity(outputs) + + class TestDesc: + def __init__(self, name, f, num_inputs=1, input_dim=4): + self.name = name + self.f = f + self.num_inputs = num_inputs + self.input_dim = input_dim + + def generate_inputs(self): + shape = tuple(np.random.randint(4, 11, size=self.input_dim)) + ret = [mx.random.uniform(shape=shape) for _ in range(self.num_inputs)] + for r in ret: + r.attach_grad() + return ret + + tested_ops = [ + TestDesc('add', lambda: (lambda x, y: x + y), num_inputs = 2), + TestDesc('add_scalar', lambda: (lambda x: x + 0.5)), + TestDesc('Conv', lambda: mx.gluon.nn.Conv2D(channels=32, kernel_size=(1,1))), + TestDesc('ConvTranspose', lambda: mx.gluon.nn.Conv2DTranspose(channels=32, kernel_size=(1,1))), + TestDesc('Dense', lambda: mx.gluon.nn.Dense(units=128)), + TestDesc('Activation', lambda: mx.gluon.nn.Activation('tanh')), + #TestDesc('Dropout', lambda: mx.gluon.nn.Dropout(0.5)), + TestDesc('Flatten', lambda: mx.gluon.nn.Flatten()), + TestDesc('MaxPool', lambda: mx.gluon.nn.MaxPool2D()), + TestDesc('AvgPool', lambda: mx.gluon.nn.AvgPool2D()), + TestDesc('GlobalMaxPool', lambda: mx.gluon.nn.GlobalMaxPool2D()), + TestDesc('GlobalAvgPool', lambda: mx.gluon.nn.GlobalAvgPool2D()), + TestDesc('ReflectionPad2D', lambda: mx.gluon.nn.ReflectionPad2D()), + TestDesc('BatchNorm', lambda: mx.gluon.nn.BatchNorm()), + TestDesc('InstanceNorm', lambda: mx.gluon.nn.InstanceNorm()), + TestDesc('LayerNorm', lambda: mx.gluon.nn.LayerNorm()), + TestDesc('LeakyReLU', lambda: mx.gluon.nn.LeakyReLU(0.1)), + TestDesc('PReLU', lambda: mx.gluon.nn.PReLU()), + TestDesc('ELU', lambda: mx.gluon.nn.ELU()), + TestDesc('SELU', lambda: mx.gluon.nn.SELU()), + TestDesc('Swish', lambda: mx.gluon.nn.Swish()), + ] + + N = 10 + + with environment({'MXNET_ENABLE_CUDA_GRAPHS': '1', + 'MXNET_USE_FUSION': '0'}): + for test_desc in tested_ops: + print("Testing ", test_desc.name) + inputs = test_desc.generate_inputs() + inputsg = [i.copy() for i in inputs] + for i in inputsg: + i.attach_grad() + seed = random.randint(0, 10000) + net = GraphTester(test_desc.f) + netg = GraphTester(test_desc.f) + + # initialize parameters + net.initialize() + netg.initialize() + + net(*inputs) + + for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): + p2.set_data(p1.data()) + + netg.hybridize(static_alloc=True, static_shape=True) + + print("Testing inference mode") + with random_seed(seed): + for _ in range(N): + assert_almost_equal(net(*inputs), netg(*inputsg)) + + mx.nd.waitall() + print("Testing training mode") + for _ in range(N): + with random_seed(seed): + with mx.autograd.record(): + out = net(*inputs) + out.backward() + + with random_seed(seed): + with mx.autograd.record(): + outg = netg(*inputsg) + outg.backward() + + assert_almost_equal(out, outg) + for i, ig in zip(inputs, inputsg): + assert_almost_equal(i.grad, ig.grad) + + for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): + assert_almost_equal(p1.data(), p2.data()) + if p1.grad_req != 'null': + assert_almost_equal(p1.grad(), p2.grad()) + mx.nd.waitall() + if __name__ == '__main__': import nose