From 62d381a446512d6a5ba46b86dc9a1e9188132789 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 11 Sep 2020 13:33:36 -0700 Subject: [PATCH 01/13] Initial cherry-pick --- src/executor/cuda_graphs.h | 529 ++++++++++++++++++++++++++++++ src/executor/exec_pass.h | 10 + src/executor/graph_executor.cc | 40 ++- src/imperative/imperative_utils.h | 15 + 4 files changed, 581 insertions(+), 13 deletions(-) create mode 100644 src/executor/cuda_graphs.h diff --git a/src/executor/cuda_graphs.h b/src/executor/cuda_graphs.h new file mode 100644 index 000000000000..512efae0b73c --- /dev/null +++ b/src/executor/cuda_graphs.h @@ -0,0 +1,529 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2020 by Contributors + * \file cuda_graphs.h + * \brief Wrappers for use of CUDA Graphs API + */ +#ifndef MXNET_EXECUTOR_CUDA_GRAPHS_H_ +#define MXNET_EXECUTOR_CUDA_GRAPHS_H_ + +#include +#include +#include +#include +#include + +#include "./exec_pass.h" +#include "../common/cuda_utils.h" + +#if MXNET_USE_CUDA +#define CUDA_GRAPHS_AVAILABLE (CUDA_VERSION >= 10010) +#else +#define CUDA_GRAPHS_AVAILABLE (0) +#endif + +#if CUDA_GRAPHS_AVAILABLE + +namespace mxnet { +namespace cuda_graphs { + +inline std::string CudaDim3ToString(const dim3& dims) { + std::stringstream ss; + if (dims.z != 1) + ss << "(" << dims.x << "," << dims.y << "," << dims.z << ")"; + else if (dims.y != 1) + ss << "(" << dims.x << "," << dims.y << ")"; + else + ss << "(" << dims.x << ")"; + return ss.str(); +} + +#define CU_CALL(func) \ + { \ + CUresult e = (func); \ + if (e != CUDA_SUCCESS) { \ + std::cerr << "cuda driver failure code: " << e << std::endl; \ + exit(1); \ + } \ + } + +// Get the type of a CUDA Graph node (e.g. kernel launch, memcpy, etc.) +inline CUgraphNodeType CudaGraphNodeType(const cudaGraphNode_t node) { + CUgraphNode cu_node = node; + CUgraphNodeType t; + CU_CALL(cuGraphNodeGetType(cu_node, &t)); + return t; +} + +// Return the list of CUDA Graph nodes from a graph +inline std::vector GetCudaGraphNodes(cudaGraph_t cuda_graph) { + size_t numNodes; + CUDA_CALL(cudaGraphGetNodes(cuda_graph, static_cast(nullptr), &numNodes)); + if (numNodes == 0) + return std::vector(); + std::vector graphNodes(numNodes); + CUDA_CALL(cudaGraphGetNodes(cuda_graph, graphNodes.data(), &numNodes)); + return graphNodes; +} + +inline std::string CudaGraphNodeToString(const cudaGraphNode_t node) { + std::stringstream ss; + + // The following introspection calls are made through the driver API in order to bypass + // problems that would arise if multiple statically-linked copies of the runtime exist. 
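+  // Descriptive note on the flow below: the runtime handle is converted to a driver-API
+  // CUgraphNode, its type is queried via cuGraphNodeGetType(), and the switch statement
+  // pretty-prints the parameters of kernel, memcpy and memset nodes.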
+ + CUgraphNode cu_node = node; + CUgraphNodeType t; + CU_CALL(cuGraphNodeGetType(cu_node, &t)); + switch (t) { + case CU_GRAPH_NODE_TYPE_KERNEL: + { + CUDA_KERNEL_NODE_PARAMS kparams; + auto err = cuGraphKernelNodeGetParams(cu_node, &kparams); + if (err == CUDA_SUCCESS) { + ss << "GPUKernel@" << kparams.func; + dim3 gridDim(kparams.gridDimX, kparams.gridDimY, kparams.gridDimZ); + dim3 blockDim(kparams.blockDimX, kparams.blockDimY, kparams.blockDimZ); + ss << "<<>>"; + ss << "(..."; + if (kparams.sharedMemBytes != 0) + ss << ", dynSharedMemBytes=" << kparams.sharedMemBytes; + ss << ")"; + } else { + ss << "GPU Kernel: cuGraphKernelNodeGetParams() fails with " << err; + } + } + break; + case CU_GRAPH_NODE_TYPE_MEMCPY: + { + cudaMemcpy3DParms mparams = {}; + CUDA_CALL(cudaGraphMemcpyNodeGetParams(node, &mparams)); + // If memcpy is seen, return without setting up runnable executor + switch (mparams.kind) { + case cudaMemcpyHostToHost: ss << "Host->Host "; break; + case cudaMemcpyHostToDevice: ss << "Host->Device "; break; + case cudaMemcpyDeviceToHost: ss << "Device->Host "; break; + case cudaMemcpyDeviceToDevice: ss << "Device->Device "; break; + default: break; + } + ss << "Memcpy"; + } + break; + case CU_GRAPH_NODE_TYPE_MEMSET: + { + cudaMemsetParams mparams = {}; + CUDA_CALL(cudaGraphMemsetNodeGetParams(node, &mparams)); + if (mparams.height == 1 && mparams.elementSize == 1) { + ss << "cudaMemset(devPtr=" << mparams.dst << ", value=" << mparams.value + << ", count=" << mparams.width << ")"; + } else { + if (mparams.elementSize == 1) + ss << "cudaMemset2D"; + else + ss << "MemSet"; + ss << "(devPtr=" << mparams.dst << ", pitch=" << mparams.pitch + << ", value=" << mparams.value << ", width=" << mparams.width + << ", height=" << mparams.height << ")"; + } + } + break; + case CU_GRAPH_NODE_TYPE_HOST: ss << "Host (executable) node"; break; + case CU_GRAPH_NODE_TYPE_GRAPH: ss << "Node which executes an embedded graph"; break; + case CU_GRAPH_NODE_TYPE_EMPTY: ss << "Empty (no-op) node"; break; + default: ss << "Unknown/Invalid node type " << t; + } + return ss.str(); +} + + +// CUDA Graphs are managed in RAII fashion by smart pointers below. +// Function objects (preferred for readability) provide the deleter function. +class CudaGraphDeleter { + public: + void operator() (cudaGraph_t graph) { + if (graph != nullptr) + CUDA_CALL(cudaGraphDestroy(graph)); + } +}; + +// CUDA Graphs Executors are managed in RAII fashion by smart pointers below. +// Function objects (preferred for readability) provide the deleter function. +class CudaGraphExecDeleter { + public: + void operator() (cudaGraphExec_t graph_exec) { + if (graph_exec != nullptr) + CUDA_CALL(cudaGraphExecDestroy(graph_exec)); + } +}; + +// A CUDA Graphs executor for a portion of an Operator Segment (i.e. a 'SubSegment'), +// characterized by a starting index in the OpExecutor list and a number of ops. 
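+// Lifecycle: the constructor captures the ops into a cudaGraph_t and instantiates an
+// executable graph when the ops are flagged as graph-compatible; Update() re-captures
+// the ops and patches the existing executor in place (re-instantiating it if the update
+// fails); RunSubSeg() launches the instantiated graph, or runs the ops conventionally
+// when no executor could be built.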
+class CudaGraphsSubSegExec { + public: + CudaGraphsSubSegExec(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu, + bool verbose, + int from_op_idx, + int num_ops, + bool ops_are_cuda_graph_compatible = true) : + from_op_idx_(from_op_idx), + num_ops_(num_ops), + graph_(nullptr), + graph_exec_(nullptr) { + if (ops_are_cuda_graph_compatible) { + MakeGraph(exec_list, rctx, is_gpu, verbose, from_op_idx, num_ops); + MakeGraphExec(); + } + } + + void Update(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu, + bool verbose) { + // Current executor should be Runnable with the same parameters + CHECK(IsRunnable()); + MakeGraph(exec_list, rctx, is_gpu, verbose, from_op_idx_, num_ops_); + + cudaGraphExecUpdateResult update_result = cudaGraphExecUpdateError; + cudaGraphNode_t error_node; + CUDA_CALL(cudaGraphExecUpdate(graph_exec_.get(), graph_.get(), + &error_node, &update_result)); + // If update fails make a new executor, discarding old one. + if (update_result != cudaGraphExecUpdateSuccess) + MakeGraphExec(); + } + + void RunSubSeg(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu) { + if (IsRunnable()) { + auto s = rctx.get_stream(); + const cudaStream_t cu_s = mshadow::Stream::GetStream(s); + CUDA_CALL(cudaGraphLaunch(graph_exec_.get(), cu_s)); + } else { + // No CUDA Graph could be made for this portion of the OpSegment. Run conventionally. + for (int i = 0; i != num_ops_; ++i) + exec_list[from_op_idx_ + i]->Run(rctx, is_gpu); + } + } + + bool IsRunnable() { return graph_exec_ != nullptr; } + + private: + void MakeGraph(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu, + bool verbose, + int from_op_idx, + int num_ops) { + auto s = rctx.get_stream(); + const cudaStream_t cu_s = mshadow::Stream::GetStream(s); + // Create CUDA Graph + // Use of cudaStreamCaptureModeThreadLocal allows other threads like GPU Copy workers + // to sync their streams without disturbing this capture. + CUDA_CALL(cudaStreamBeginCapture(cu_s, cudaStreamCaptureModeThreadLocal)); + // Run those oprs in the sub segment while capturing- no actual GPU work is launched. 
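+    // Each GPU operation launched on cu_s between BeginCapture and EndCapture is recorded
+    // as a node of the resulting cudaGraph_t, whose ownership is then transferred to graph_.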
+ for (int i = 0; i != num_ops; ++i) + exec_list[from_op_idx + i]->Run(rctx, is_gpu); + cudaGraph_t cuda_graph = nullptr; + CUDA_CALL(cudaStreamEndCapture(cu_s, &cuda_graph)); + graph_.reset(cuda_graph, CudaGraphDeleter()); + + if (verbose) { + std::vector graph_nodes = GetCudaGraphNodes(cuda_graph); + size_t num_nodes = graph_nodes.size(); + LOG(INFO) << " Graph has " << num_nodes << " nodes:"; + for (size_t i = 0; i != num_nodes; ++i) { + LOG(INFO) << " node " << i << " = " + << CudaGraphNodeToString(graph_nodes[i]); + } + } + } + + void MakeGraphExec() { + cudaGraphExec_t cuda_graph_exec; + cudaGraphNode_t error_node; + char log_buffer[1000]; + + CUDA_CALL(cudaGraphInstantiate(&cuda_graph_exec, graph_.get(), + &error_node, log_buffer, 1000)); + graph_exec_.reset(cuda_graph_exec, CudaGraphExecDeleter()); + + // At this point we have a CUDA Graph executor + static int num_graph_creations_logged = 0; + static int max_log_entries = dmlc::GetEnv("MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES", 0); + if (num_graph_creations_logged < max_log_entries) { + num_graph_creations_logged++; + LOG(INFO) << "Created CUDA graph " << num_graph_creations_logged; + if (num_graph_creations_logged == max_log_entries) + LOG(INFO) << "Further CUDA graph creation log messages are suppressed."; + } + } + + int from_op_idx_; + int num_ops_; + using cudaGraphStruct_t = typename std::remove_pointer::type; + using cudaGraphExecStruct_t = typename std::remove_pointer::type; + std::shared_ptr graph_; + std::shared_ptr graph_exec_; +}; + +// The CudaGraph executor and associated Tempspace ptrs for which it is valid. +struct CudaGraphInfo { + std::vector cuda_graph_subseg_execs; + bool has_been_run_conventionally = false; + std::vector tempspace_dptrs; +}; +// A CUDA graph is maintained for every combination of cudaStream_t (i.e. GPU Worker) and +// the state of the is_train flag of the OpContext. If the tempspace_dptrs change, we +// don't expect to ever see the old tempspace_dptrs config again, so we discard the CUDA graph. +struct CudaGraphCacheKey { + cudaStream_t cu_s; + bool is_train; + // overload '<' so CudaGraphCacheKey can be used as a std::map key + bool operator<(const CudaGraphCacheKey &other) const { + return cu_s < other.cu_s || (cu_s == other.cu_s && is_train < other.is_train); + } +}; +using CudaGraphCache = std::map; + +class CudaGraphsExec { + public: + CudaGraphsExec(const std::vector > &exec_list, + bool is_gpu, + const char *opr_names) : + verbose_(false), is_enabled_(false) { + opr_names_ = opr_names ? 
std::string(opr_names) : std::string(); + if (is_gpu) { + is_enabled_ = dmlc::GetEnv("MXNET_ENABLE_CUDA_GRAPHS", false); + verbose_ = dmlc::GetEnv("MXNET_CUDA_GRAPHS_VERBOSE", false); + SetTempSpaces(exec_list); + } + } + + void RunAll(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu) { + // If this a CPU op or CUDA Graphs use isn't possible, run normally and return + if (!is_gpu || !is_enabled_) { + // Run all opr in the sub-graph + exec::OpExecutor::RunAll(exec_list, rctx, is_gpu); + return; + } + + // Also if we're in a warm-up period where tempspace pointers are likely + // to change, run normally and return + auto s = rctx.get_stream(); + const cudaStream_t cu_s = mshadow::Stream::GetStream(s); + // All the ops in the bulked segment will have the same setting of is_train as the first op + const bool is_train = exec_list.size() > 0 && exec_list[0]->op_ctx.is_train; + const CudaGraphCacheKey key = {cu_s, is_train}; + // Look-up the CUDA Graph info for this combo of stream and is_train setting + // This may create a default-initialized new entry. + auto &cuda_graph_info = cache_[key]; + if (!cuda_graph_info.has_been_run_conventionally) { + // Run all opr in the sub-graph + exec::OpExecutor::RunAll(exec_list, rctx, is_gpu); + cuda_graph_info.has_been_run_conventionally = true; + return; + } + + // At this point we will launch one or more CUDA Graphs through CUDA Graphs 'executors' + // (there might be more than one executor if some ops in the segment are not capturable) + auto before_exec_tempspace_ptrs = GetGPUTempspacePtrs(s); + + // Executors exist, but the tempspace pts have changed, so update them in-place via 'recapture'. + if (cuda_graph_info.cuda_graph_subseg_execs.size() > 0 && + cuda_graph_info.tempspace_dptrs != before_exec_tempspace_ptrs) { + // Update all runnable executors. Non-runnable executors launch their ops conventionally. + for (auto &subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) { + if (subseg_exec.IsRunnable()) + subseg_exec.Update(exec_list, rctx, is_gpu, verbose_); + } + } else if (cuda_graph_info.cuda_graph_subseg_execs.size() == 0) { + // No executors exist yet, so create them. + if (verbose_) + LOG(INFO) << "Capturing CUDA graph of op segment " << opr_names_; + // Make one or more CUDA Graphs, avoiding ops that are not compatible. + for (size_t first_op_idx = 0; first_op_idx != exec_list.size();) { + int num_good_ops = 0; + for (size_t last_op_idx = first_op_idx; last_op_idx != exec_list.size(); ++last_op_idx) { + if (OpOK(exec_list[last_op_idx])) + num_good_ops++; + else + break; + } + if (num_good_ops > 0) { + CreateSubExecOverRegion(exec_list, rctx, is_gpu, + first_op_idx, + first_op_idx + num_good_ops, + &cuda_graph_info.cuda_graph_subseg_execs); + first_op_idx += num_good_ops; + } + if (first_op_idx != exec_list.size()) { + // We had to have hit an op that was not OK. + if (verbose_) { + LOG(INFO) << "Bypassing notOK op segment[" << first_op_idx << "," << first_op_idx << "]" + << " of op segment " << opr_names_; + } + CudaGraphsSubSegExec notOK_opseg(exec_list, rctx, is_gpu, false, first_op_idx, 1, false); + cuda_graph_info.cuda_graph_subseg_execs.push_back(notOK_opseg); + first_op_idx++; + } + } + // During graph capture, the ops may be asking for the tempworkspace. This should + // not alter the base pointers, since this op seg has been executed before on this + // stream (i.e. on this gpu worker). Safest to double-check this though. 
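+      // If the base pointers had moved, the captured graphs would reference stale addresses
+      // when replayed, hence the hard failure below rather than risking silent corruption.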
+ auto after_capture_tempspace_ptrs = GetGPUTempspacePtrs(s); + if (before_exec_tempspace_ptrs != after_capture_tempspace_ptrs) + LOG(FATAL) << "Internal error: saw change in TempSpace ptrs during CUDA graph use."; + cuda_graph_info.tempspace_dptrs = before_exec_tempspace_ptrs; + } + // Now execute the CUDA Graph that we either just created or looked-up in the cache. + if (verbose_) { + int runnable_execs = 0; + int bypassed_ops = 0; + for (auto &subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) { + if (subseg_exec.IsRunnable()) + runnable_execs++; + else + bypassed_ops++; + } + LOG(INFO) << "Launching " << runnable_execs + << " captured CUDA graph(s) for op segment " << opr_names_; + if (bypassed_ops > 0) + LOG(INFO) << " (bypassing " << bypassed_ops << " un-capturable ops)"; + } + for (auto &subseg_exec : cuda_graph_info.cuda_graph_subseg_execs) + subseg_exec.RunSubSeg(exec_list, rctx, is_gpu); + } + + private: + // Make a CUDA Graph of the region of ops [from_op_idx, upto_op_idx). If such a graph + // is not runnable, e.g. if it includes memcpys from unpinned cpu memory, then make a + // number of smaller graphs that avoid those ops with the memcpys. + void CreateSubExecOverRegion(const std::vector > &exec_list, + const RunContext &rctx, + bool is_gpu, + size_t from_op_idx, + size_t upto_op_idx, + std::vector *cuda_graph_subseg_execs) { + // Optimistically try to create a CUDA Graph of the entire op segment region + + int num_ops = upto_op_idx - from_op_idx; + CudaGraphsSubSegExec full_opseg(exec_list, rctx, is_gpu, verbose_, from_op_idx, num_ops); + if (full_opseg.IsRunnable()) { + cuda_graph_subseg_execs->push_back(full_opseg); + } else { + if (verbose_) + LOG(INFO) << " Graph was not runnable- creating op sub-segments..."; + // Enter fall-back approach to making many sub-execs + for (size_t first_op_idx = from_op_idx; first_op_idx != upto_op_idx; ) { + int num_good_ops = 0; + for (size_t last_op_idx = first_op_idx; last_op_idx != upto_op_idx; ++last_op_idx) { + CudaGraphsSubSegExec single_opseg(exec_list, rctx, is_gpu, false, last_op_idx, 1); + if (single_opseg.IsRunnable()) + num_good_ops++; + // Is it time to create a subseg exec from accumulated good ops? + if (num_good_ops > 0 && + (last_op_idx == upto_op_idx - 1 || !single_opseg.IsRunnable())) { + if (verbose_) + LOG(INFO) << "Capturing CUDA graph of op sub segment[" + << first_op_idx << ":" << (first_op_idx + num_good_ops - 1) << "]" + << " of op segment " << opr_names_; + CudaGraphsSubSegExec good_opseg(exec_list, rctx, is_gpu, verbose_, + first_op_idx, num_good_ops); + CHECK(good_opseg.IsRunnable()) << "Unexpected issue with CUDA Graphs creation"; + cuda_graph_subseg_execs->push_back(good_opseg); + first_op_idx += num_good_ops; + } + // If the last single op was not runnable, use the exec to handle that op conventionally + if (!single_opseg.IsRunnable()) { + if (verbose_) { + LOG(INFO) << "Bypassing op sub segment[" << last_op_idx << "," << last_op_idx << "]" + << " of op segment " << opr_names_; + // Generate throw-away exec in order to produce a diagnostic listing of graph nodes + CudaGraphsSubSegExec dummy(exec_list, rctx, is_gpu, verbose_, last_op_idx, 1); + } + cuda_graph_subseg_execs->push_back(single_opseg); + first_op_idx++; + break; + } + } + } + } + } + + // Is the Op OK to make part of a CUDA Graph? 
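+  // In this initial version, an op qualifies only if every resource it requests is a
+  // TempSpace (or a cuDNN dropout descriptor when MXNET_CUDA_GRAPHS_ALLOW_DROPOUT is set);
+  // any other resource type is conservatively treated as incompatible with graph capture.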
+ bool OpOK(const std::shared_ptr &exec) { + static bool allow_dropout_resource = + dmlc::GetEnv("MXNET_CUDA_GRAPHS_ALLOW_DROPOUT", false); + for (auto& resource : exec->op_ctx.requested) { + if (!(resource.req.type == ResourceRequest::kTempSpace || + (resource.req.type == ResourceRequest::kCuDNNDropoutDesc && allow_dropout_resource))) { + return false; + } + } + return true; + } + + // Determine Tempspaces used by ops. Other resource uses disable CUDA Graphs. + void SetTempSpaces(const std::vector > &exec_list) { + // Gather info about the ops use of TempSpace. + if (is_enabled_) { + std::set tempspaces_set; + for (auto& exec : exec_list) { + for (auto& resource : exec->op_ctx.requested) { + if (resource.req.type == ResourceRequest::kTempSpace) { + tempspaces_set.insert(&resource); + } + } + } + tempspaces_.assign(tempspaces_set.begin(), tempspaces_set.end()); + } + } + + // Return the addresses of the gpu TempSpace areas + std::vector GetGPUTempspacePtrs(mshadow::Stream *s) { + std::vector ret; + for (const auto& resource : tempspaces_) { + // Ask for minimal allocation to get base pointer without increasing the size + auto *base_ptr = resource->get_space_typed(mshadow::Shape1(1), s).dptr_; + ret.push_back(static_cast(base_ptr)); + } + return ret; + } + + CudaGraphCache cache_; + std::vector tempspaces_; + std::string opr_names_; + bool verbose_; + bool is_enabled_; +}; + +#endif // CUDA_GRAPHS_AVAILABLE + + +} // namespace cuda_graphs +} // namespace mxnet +#endif // MXNET_EXECUTOR_CUDA_GRAPHS_H_ diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index e3d2fa459bc3..b5d3b2471b8d 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -100,6 +100,16 @@ class OpExecutor { * \param rctx The runtime context passed in by environment. */ virtual void Run(RunContext rctx, bool is_gpu) = 0; + /*! + * \brief run the operators of a vector of execs, given runtime context on device. + * This function call does not synchronize the stream. + * \param rctx The runtime context passed in by environment. + */ + static void RunAll(const std::vector > &execs, + RunContext rctx, bool is_gpu) { + for (auto &exec : execs) + exec->Run(rctx, is_gpu); + } /*! \return the execution type */ virtual ExecType exec_type() const = 0; /*! 
\return return engine variable for operator states */ diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 13bab2e544bf..c7febc4712f1 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -31,6 +31,7 @@ #include "./exec_pass.h" #include "./graph_executor.h" +#include "./cuda_graphs.h" #include "../profiler/profiler.h" #include "../common/utils.h" #include "../common/exec_utils.h" @@ -84,7 +85,8 @@ void GraphExecutor::Forward(bool is_train) { void GraphExecutor::PartialForward(bool is_train, int step, int *step_left) { size_t sstep = static_cast(step); if (sstep >= num_forward_nodes_) { - *step_left = 0; return; + *step_left = 0; + return; } RunOps(is_train, sstep, sstep + 1); *step_left = static_cast(num_forward_nodes_ - sstep - 1); @@ -166,11 +168,12 @@ void GraphExecutor::Backward(const std::vector& head_grads, bool is_tra } void GraphExecutor::Print(std::ostream &os) const { // NOLINT(*) - nnvm::Symbol s; s.outputs = graph_.outputs; + nnvm::Symbol s; + s.outputs = graph_.outputs; s.Print(os); // message to be backward compatible with the memonger size_t total_bytes = graph_.GetAttr("storage_allocated_bytes"); - os << "Total " << (total_bytes >> 20UL) <<" MB allocated\n"; + os << "Total " << (total_bytes >> 20UL) << " MB allocated\n"; os << "Total " << 11 << " TempSpace resource requested\n"; } @@ -337,7 +340,8 @@ nnvm::Graph GraphExecutor::InitFullGraph(nnvm::Symbol symbol, g = exec::EliminateCommonExpr(std::move(g)); need_grad_ = false; for (OpReqType req : grad_req_types) { - if (req != kNullOp) need_grad_ = true; + if (req != kNullOp) + need_grad_ = true; } if (!need_grad_) return g; for (size_t i = 0; i < g.outputs.size(); ++i) { @@ -1312,12 +1316,12 @@ void GraphExecutor::InitCachedOps() { // call on complete only if it is async op if (!is_async) { if (is_gpu) { - #if MXNET_USE_CUDA +#if MXNET_USE_CUDA // Wait GPU kernel to finish. ctx.get_stream()->Wait(); - #else +#else LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; - #endif +#endif } on_complete(); } @@ -1607,21 +1611,31 @@ GraphExecutor::CachedSegOpr GraphExecutor::CreateCachedSegOpr(size_t topo_start, opr_names += inode.source->op()->name + ","; } - if (pctx == nullptr) return ret; + if (pctx == nullptr) + return ret; ret.ctx = *pctx; Engine::Get()->DeduplicateVarHandle(&use_vars, &mutate_vars); bool is_gpu = pctx->dev_mask() == gpu::kDevMask; + +#if CUDA_GRAPHS_AVAILABLE + // Provide initialized `cuda_graphs_exec`, which when captured + // by exec_fun, acts like a static variable inside the mutable closure. + cuda_graphs::CudaGraphsExec cuda_graphs_exec(exec_list, is_gpu, opr_names.c_str()); + auto exec_fun = [cuda_graphs_exec, exec_list, is_gpu] ( + RunContext rctx, Engine::CallbackOnComplete on_complete) mutable { + // Run all opr in the sub-graph with CUDA graphs executor if possible + cuda_graphs_exec.RunAll(exec_list, rctx, is_gpu); +#else auto exec_fun = [exec_list, is_gpu] ( - RunContext ctx, Engine::CallbackOnComplete on_complete) { + RunContext rctx, Engine::CallbackOnComplete on_complete) { // Run all opr in the sub-graph - for (auto &exec : exec_list) { - exec->Run(ctx, is_gpu); - } + OpExecutor::RunAll(exec_list, rctx, is_gpu); +#endif if (is_gpu) { #if MXNET_USE_CUDA // Wait GPU kernel to finish. 
- ctx.get_stream()->Wait(); + rctx.get_stream()->Wait(); #else LOG(FATAL) << MXNET_GPU_NOT_ENABLED_ERROR; #endif diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index 156013857d6a..4933808ad892 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -26,6 +26,7 @@ #include #include #include "../executor/graph_executor.h" +#include "../executor/cuda_graphs.h" #include "../executor/exec_pass.h" #include "../c_api/c_api_common.h" #include "../common/utils.h" @@ -985,12 +986,26 @@ inline Engine::OprHandle CreateEngineOp( bool is_gpu = default_ctx.dev_mask() == gpu::kDevMask; bool is_async = execs.size() > 1 ? false : execs[0]->exec_type() == ExecType::kAsync; +#if CUDA_GRAPHS_AVAILABLE + // Provide initialized `cuda_graphs_exec`, which when captured + // by exec_fun, acts like a static variable inside the mutable closure. + cuda_graphs::CudaGraphsExec cuda_graphs_exec(execs, is_gpu, opr_names); + auto exec_fun = [cuda_graphs_exec, execs, is_async, is_gpu] ( + RunContext ctx, Engine::CallbackOnComplete on_complete) mutable { + if (is_async) { + execs[0]->op_ctx.async_on_complete = on_complete; + } + // Run all opr in the sub-graph with CUDA graphs executor if possible + cuda_graphs_exec.RunAll(execs, ctx, is_gpu); +#else auto exec_fun = [execs, is_async, is_gpu] ( RunContext ctx, Engine::CallbackOnComplete on_complete) { if (is_async) { execs[0]->op_ctx.async_on_complete = on_complete; } for (const auto& exec : execs) exec->Run(ctx, is_gpu); + OpExecutor::RunAll(execs, ctx, is_gpu); +#endif // call on complete only if it is async op if (!is_async) { if (is_gpu) { From c15357a378a33a0fc14e1207b7d256b23f447dbe Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 11 Sep 2020 14:07:27 -0700 Subject: [PATCH 02/13] Store NodeAttrs in OpExecutor --- src/executor/attach_op_execs_pass.cc | 36 ++++++++++++++-------------- src/executor/exec_pass.h | 4 ++++ 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc index 8f47bc29db13..8a32a249f9ac 100644 --- a/src/executor/attach_op_execs_pass.cc +++ b/src/executor/attach_op_execs_pass.cc @@ -45,8 +45,9 @@ namespace exec { // FComputeExecutor and FStatefulComputeExecutor inherit from this class class StorageFallbackOpExecutor : public OpExecutor { public: - explicit StorageFallbackOpExecutor(const std::vector &mutate_idx) - : mutate_idx_(mutate_idx) {} + explicit StorageFallbackOpExecutor(const NodeAttrs& attrs, + const std::vector &mutate_idx) + : OpExecutor(attrs), mutate_idx_(mutate_idx) {} void Setup() override { init_ = false; @@ -136,11 +137,12 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor { return state_; } - explicit StatefulComputeExecutor(const OpStatePtr& state, + explicit StatefulComputeExecutor(const NodeAttrs& attrs, + const OpStatePtr& state, const FStatefulCompute& fcompute, ExecType exec_type, const std::vector &mutate_idx) - : StorageFallbackOpExecutor(mutate_idx), + : StorageFallbackOpExecutor(attrs, mutate_idx), state_(state), fcompute_(fcompute), exec_type_(exec_type) {} private: @@ -159,7 +161,7 @@ class StatefulComputeExExecutor : public OpExecutor { InvalidateOutputs(out_array, req); // TODO(alex): (MXNET-847) Remove this fallback feature after subgraph implemented const auto is_mkldnn = Op::GetAttr("TIsMKLDNN"); - if (!is_mkldnn.get(attrs_.op, false)) { + if (!is_mkldnn.get(attrs.op, false)) { CreateDefaultInputs(in_array, &in_array_fallback); 
fcompute_(state_, op_ctx, in_array_fallback, req, out_array); return; @@ -186,10 +188,9 @@ class StatefulComputeExExecutor : public OpExecutor { const OpStatePtr& state, const FStatefulComputeEx& fcompute, ExecType exec_type) - : attrs_(attrs), state_(state), fcompute_(fcompute), exec_type_(exec_type) {} + : OpExecutor(attrs), state_(state), fcompute_(fcompute), exec_type_(exec_type) {} private: - NodeAttrs attrs_; OpStatePtr state_; FStatefulComputeEx fcompute_; ExecType exec_type_; @@ -206,7 +207,7 @@ class FComputeExecutor : public StorageFallbackOpExecutor { InvalidateOutputs(out_array, req); #endif PreFCompute(is_gpu); - fcompute_(attrs_, op_ctx, in_data_, req, out_data_); + fcompute_(attrs, op_ctx, in_data_, req, out_data_); PostFCompute(is_gpu); } @@ -216,12 +217,11 @@ class FComputeExecutor : public StorageFallbackOpExecutor { explicit FComputeExecutor(const NodeAttrs& attrs, FCompute fcompute, ExecType exec_type, const std::vector &mutate_idx) - : StorageFallbackOpExecutor(mutate_idx), - attrs_(attrs), fcompute_(fcompute), exec_type_(exec_type) { + : StorageFallbackOpExecutor(attrs, mutate_idx), + fcompute_(fcompute), exec_type_(exec_type) { } private: - NodeAttrs attrs_; FCompute fcompute_; ExecType exec_type_; }; @@ -235,13 +235,13 @@ class FComputeExExecutor : public OpExecutor { InvalidateOutputs(out_array, req); // TODO(alex): (MXNET-847) Remove this fallback feature after subgraph implemented const auto is_mkldnn = Op::GetAttr("TIsMKLDNN"); - if (!is_mkldnn.get(attrs_.op, false)) { + if (!is_mkldnn.get(attrs.op, false)) { CreateDefaultInputs(in_array, &in_array_fallback); - fcompute_(attrs_, op_ctx, in_array_fallback, req, out_array); + fcompute_(attrs, op_ctx, in_array_fallback, req, out_array); return; } #endif - fcompute_(attrs_, op_ctx, in_array, req, out_array); + fcompute_(attrs, op_ctx, in_array, req, out_array); } void Setup() override {} @@ -252,11 +252,10 @@ class FComputeExExecutor : public OpExecutor { explicit FComputeExExecutor(const NodeAttrs& attrs, FComputeEx fcompute, ExecType exec_type) - : attrs_(attrs), fcompute_(fcompute), exec_type_(exec_type) { + : OpExecutor(attrs), fcompute_(fcompute), exec_type_(exec_type) { } private: - NodeAttrs attrs_; FComputeEx fcompute_; ExecType exec_type_; }; @@ -318,7 +317,8 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, CHECK(fcompute != nullptr) << "One of FStatefulCompute and FStatefulComputeEx must be registered " << "for stateful operator " << op->name; - ret[i] = std::make_shared(state, fcompute, + ret[i] = std::make_shared(inode.source->attrs, + state, fcompute, exec_type, mutate_index); } } else if (is_layer_backward.get(op, false)) { @@ -339,7 +339,7 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, CHECK(fcompute != nullptr) << "One of FStatefulCompute and FStatefulComputeEx must be registered " << "for stateful operator " << op->name; - ret[i] = std::make_shared( + ret[i] = std::make_shared(inode.source->attrs, ret[fwd_id].get()->state(), fcompute, exec_type, mutate_index); } } else { diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index b5d3b2471b8d..e364187f2f9c 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include @@ -86,6 +87,9 @@ class OpExecutor { std::vector req; /*! \brief runtime op context, contains allocated resources */ OpContext op_ctx; + /*! 
\brief attributes of the node */ + NodeAttrs attrs; + explicit OpExecutor(NodeAttrs attrs) : attrs(std::move(attrs)) {} /*! \brief virtual destructor */ virtual ~OpExecutor() {} /*! From 3945895ba8f808114462726bcf7bf25a51875a14 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 11 Sep 2020 15:14:27 -0700 Subject: [PATCH 03/13] Do not allow stateful operations in CUDA graphs and provide mechanism for marking ops as safe --- include/mxnet/op_attr_types.h | 13 +++++++++++++ src/executor/cuda_graphs.h | 17 +++++++++++++---- src/operator/nn/dropout.cu | 5 +++++ 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/include/mxnet/op_attr_types.h b/include/mxnet/op_attr_types.h index a0ac301b1818..88d21f23ea0c 100644 --- a/include/mxnet/op_attr_types.h +++ b/include/mxnet/op_attr_types.h @@ -362,6 +362,19 @@ using FNeedCalibrateInput = std::function (const NodeAttrs& att */ using FNeedCalibrateOutput = std::function (const NodeAttrs& attrs)>; +#if MXNET_USE_CUDA + +/*! + * \brief Register a function to determine if + * the operator implementation is compatible + * with CUDA graphs. This requires the execution + * to stay the same as long as the shape and type + * of input stays the same. + */ +using FIsCUDAGraphsCompatible = std::function; + +#endif + } // namespace mxnet #endif // MXNET_OP_ATTR_TYPES_H_ diff --git a/src/executor/cuda_graphs.h b/src/executor/cuda_graphs.h index 512efae0b73c..105e91f9bf44 100644 --- a/src/executor/cuda_graphs.h +++ b/src/executor/cuda_graphs.h @@ -476,11 +476,20 @@ class CudaGraphsExec { // Is the Op OK to make part of a CUDA Graph? bool OpOK(const std::shared_ptr &exec) { - static bool allow_dropout_resource = - dmlc::GetEnv("MXNET_CUDA_GRAPHS_ALLOW_DROPOUT", false); + static auto& fstateful = Op::GetAttr("FCreateOpState"); + static auto& fgraphcompatible = Op::GetAttr("FIsCUDAGraphsCompatible"); + const auto& attrs = exec->attrs; + if (attrs.op != nullptr) { + const auto& f = fgraphcompatible.get(attrs.op, nullptr); + if (f != nullptr) { + return f(attrs, exec->op_ctx.is_train); + } + if (fstateful.get(attrs.op, nullptr) != nullptr) { + return false; + } + } for (auto& resource : exec->op_ctx.requested) { - if (!(resource.req.type == ResourceRequest::kTempSpace || - (resource.req.type == ResourceRequest::kCuDNNDropoutDesc && allow_dropout_resource))) { + if (!resource.req.type == ResourceRequest::kTempSpace) { return false; } } diff --git a/src/operator/nn/dropout.cu b/src/operator/nn/dropout.cu index 20c5714dd904..f3a99eb431a4 100644 --- a/src/operator/nn/dropout.cu +++ b/src/operator/nn/dropout.cu @@ -30,6 +30,11 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(Dropout) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool is_train) { + // Dropout is just passthrough during inference + return !is_train; + }) .set_attr("FStatefulCompute", DropoutCompute); NNVM_REGISTER_OP(_backward_Dropout) From 599986a8dbfbf24fb4f2c698149a431884cdc3ee Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 11 Sep 2020 18:35:47 -0700 Subject: [PATCH 04/13] Guard against using ops with synchronization --- src/executor/attach_op_execs_pass.cc | 38 +++++++++++-------- src/executor/cuda_graphs.h | 21 +++++++++- src/executor/exec_pass.h | 6 ++- src/operator/contrib/adamw.cu | 16 ++++++++ src/operator/numpy/linalg/np_eig.cu | 8 ++++ src/operator/numpy/linalg/np_eigvals.cu | 8 ++++ src/operator/numpy/linalg/np_norm_backward.cu | 5 +++ src/operator/numpy/linalg/np_norm_forward.cu | 5 +++ src/operator/numpy/np_boolean_mask_assign.cu | 8 ++++ 
src/operator/numpy/np_constraint_check.cu | 4 ++ src/operator/numpy/np_percentile_op.cu | 4 ++ src/operator/numpy/random/np_bernoulli_op.cu | 4 ++ .../numpy/random/np_exponential_op.cu | 4 ++ src/operator/numpy/random/np_gamma_op.cu | 4 ++ .../numpy/random/np_multinomial_op.cu | 4 ++ src/operator/numpy/random/np_normal_op.cu | 8 ++++ src/operator/numpy/random/np_pareto_op.cu | 4 ++ src/operator/numpy/random/np_power_op.cu | 4 ++ src/operator/numpy/random/np_rayleigh_op.cu | 4 ++ src/operator/numpy/random/np_weibull_op.cu | 4 ++ src/operator/tensor/indexing_op.cu | 4 ++ 21 files changed, 149 insertions(+), 18 deletions(-) diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc index 8a32a249f9ac..ccf536920fb0 100644 --- a/src/executor/attach_op_execs_pass.cc +++ b/src/executor/attach_op_execs_pass.cc @@ -46,8 +46,9 @@ namespace exec { class StorageFallbackOpExecutor : public OpExecutor { public: explicit StorageFallbackOpExecutor(const NodeAttrs& attrs, + const DispatchMode& dispatch_mode, const std::vector &mutate_idx) - : OpExecutor(attrs), mutate_idx_(mutate_idx) {} + : OpExecutor(attrs, dispatch_mode), mutate_idx_(mutate_idx) {} void Setup() override { init_ = false; @@ -138,11 +139,12 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor { } explicit StatefulComputeExecutor(const NodeAttrs& attrs, + const DispatchMode dispatch_mode, const OpStatePtr& state, const FStatefulCompute& fcompute, ExecType exec_type, const std::vector &mutate_idx) - : StorageFallbackOpExecutor(attrs, mutate_idx), + : StorageFallbackOpExecutor(attrs, dispatch_mode, mutate_idx), state_(state), fcompute_(fcompute), exec_type_(exec_type) {} private: @@ -185,10 +187,12 @@ class StatefulComputeExExecutor : public OpExecutor { } explicit StatefulComputeExExecutor(const NodeAttrs& attrs, + const DispatchMode& dispatch_mode, const OpStatePtr& state, const FStatefulComputeEx& fcompute, ExecType exec_type) - : OpExecutor(attrs), state_(state), fcompute_(fcompute), exec_type_(exec_type) {} + : OpExecutor(attrs, dispatch_mode), state_(state), fcompute_(fcompute), + exec_type_(exec_type) {} private: OpStatePtr state_; @@ -215,9 +219,10 @@ class FComputeExecutor : public StorageFallbackOpExecutor { return exec_type_; } - explicit FComputeExecutor(const NodeAttrs& attrs, FCompute fcompute, - ExecType exec_type, const std::vector &mutate_idx) - : StorageFallbackOpExecutor(attrs, mutate_idx), + explicit FComputeExecutor(const NodeAttrs& attrs, const DispatchMode dispatch_mode, + FCompute fcompute, ExecType exec_type, + const std::vector &mutate_idx) + : StorageFallbackOpExecutor(attrs, dispatch_mode, mutate_idx), fcompute_(fcompute), exec_type_(exec_type) { } @@ -250,9 +255,9 @@ class FComputeExExecutor : public OpExecutor { return exec_type_; } - explicit FComputeExExecutor(const NodeAttrs& attrs, FComputeEx fcompute, - ExecType exec_type) - : OpExecutor(attrs), fcompute_(fcompute), exec_type_(exec_type) { + explicit FComputeExExecutor(const NodeAttrs& attrs, const DispatchMode dispatch_mode, + FComputeEx fcompute, ExecType exec_type) + : OpExecutor(attrs, dispatch_mode), fcompute_(fcompute), exec_type_(exec_type) { } private: @@ -309,7 +314,8 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, op, "FStatefulComputeEx", vctx[i]); // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) { - ret[i] = std::make_shared(inode.source->attrs, 
state, + ret[i] = std::make_shared(inode.source->attrs, + dispatch_modes[i], state, fcompute_ex, exec_type); } else { FStatefulCompute fcompute = common::GetFCompute( @@ -318,6 +324,7 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, << "One of FStatefulCompute and FStatefulComputeEx must be registered " << "for stateful operator " << op->name; ret[i] = std::make_shared(inode.source->attrs, + dispatch_modes[i], state, fcompute, exec_type, mutate_index); } @@ -331,8 +338,8 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, // FStatefulComputeEx is dispatched only when dispatch_mode is DispatchMode::kFComputeEx if (fcompute_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) { ret[i] = std::make_shared( - inode.source->attrs, ret[fwd_id].get()->state(), fcompute_ex, - exec_type); + inode.source->attrs, dispatch_modes[i], ret[fwd_id].get()->state(), + fcompute_ex, exec_type); } else { FStatefulCompute fcompute = common::GetFCompute( op, "FStatefulCompute", vctx[i]); @@ -340,17 +347,18 @@ void CreateOpExecs(const Graph& g, OpExecVector* p_ret, OpStateVector* p_state, << "One of FStatefulCompute and FStatefulComputeEx must be registered " << "for stateful operator " << op->name; ret[i] = std::make_shared(inode.source->attrs, - ret[fwd_id].get()->state(), fcompute, exec_type, mutate_index); + dispatch_modes[i], ret[fwd_id].get()->state(), fcompute, exec_type, + mutate_index); } } else { FCompute fcompute = common::GetFCompute(op, "FCompute", vctx[i]); FComputeEx fcomp_ex = common::GetFCompute(op, "FComputeEx", vctx[i]); if (fcomp_ex != nullptr && dispatch_modes[i] == DispatchMode::kFComputeEx) { ret[i] = std::make_shared( - inode.source->attrs, fcomp_ex, exec_type); + inode.source->attrs, dispatch_modes[i], fcomp_ex, exec_type); } else if (fcompute != nullptr) { ret[i] = std::make_shared( - inode.source->attrs, fcompute, exec_type, mutate_index); + inode.source->attrs, dispatch_modes[i], fcompute, exec_type, mutate_index); } else { LOG(INFO) << "Neither FCompute nor FComputeEx registered " << op->name; } diff --git a/src/executor/cuda_graphs.h b/src/executor/cuda_graphs.h index 105e91f9bf44..482d6f9bfde4 100644 --- a/src/executor/cuda_graphs.h +++ b/src/executor/cuda_graphs.h @@ -480,16 +480,33 @@ class CudaGraphsExec { static auto& fgraphcompatible = Op::GetAttr("FIsCUDAGraphsCompatible"); const auto& attrs = exec->attrs; if (attrs.op != nullptr) { - const auto& f = fgraphcompatible.get(attrs.op, nullptr); + const auto f = fgraphcompatible.get(attrs.op, nullptr); if (f != nullptr) { return f(attrs, exec->op_ctx.is_train); } if (fstateful.get(attrs.op, nullptr) != nullptr) { + if (verbose_) { + LOG(INFO) << "Omitting stateful operator" << attrs.op->name << " from CUDA graph."; + } + return false; + } + if (exec->dispatch_mode == DispatchMode::kFComputeEx || + exec->dispatch_mode == DispatchMode::kFComputeFallback) { + if (verbose_) { + LOG(INFO) << "Omitting operator" << attrs.op->name + << " from CUDA graph due to dispatch mode " + << static_cast(exec->dispatch_mode); + } return false; } } for (auto& resource : exec->op_ctx.requested) { - if (!resource.req.type == ResourceRequest::kTempSpace) { + if (!(resource.req.type == ResourceRequest::kTempSpace)) { + if (verbose_) { + LOG(INFO) << "Omitting operator" << attrs.op->name + << " from CUDA graph due to using the resource type " + << static_cast(resource.req.type); + } return false; } } diff --git a/src/executor/exec_pass.h b/src/executor/exec_pass.h index 
e364187f2f9c..4552fa173fe4 100644 --- a/src/executor/exec_pass.h +++ b/src/executor/exec_pass.h @@ -89,7 +89,11 @@ class OpExecutor { OpContext op_ctx; /*! \brief attributes of the node */ NodeAttrs attrs; - explicit OpExecutor(NodeAttrs attrs) : attrs(std::move(attrs)) {} + /*! \brief dispatch mode of the executor */ + DispatchMode dispatch_mode; + + explicit OpExecutor(NodeAttrs attrs, DispatchMode dispatch_mode) : + attrs(std::move(attrs)), dispatch_mode(dispatch_mode) {} /*! \brief virtual destructor */ virtual ~OpExecutor() {} /*! diff --git a/src/operator/contrib/adamw.cu b/src/operator/contrib/adamw.cu index 2b0040e5f6ac..1a81ba0fa99e 100644 --- a/src/operator/contrib/adamw.cu +++ b/src/operator/contrib/adamw.cu @@ -41,15 +41,31 @@ void GetScaleFloat(mshadow::Stream *s, const TBlob &scale_blob, float } NNVM_REGISTER_OP(_adamw_update) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", MPUpdate>); NNVM_REGISTER_OP(_mp_adamw_update) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", MPUpdate>); NNVM_REGISTER_OP(_multi_adamw_update) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", multiMPUpdate); NNVM_REGISTER_OP(_multi_mp_adamw_update) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", multiMPUpdate); } // namespace op diff --git a/src/operator/numpy/linalg/np_eig.cu b/src/operator/numpy/linalg/np_eig.cu index c0184ad221d5..ce4d5a173ced 100644 --- a/src/operator/numpy/linalg/np_eig.cu +++ b/src/operator/numpy/linalg/np_eig.cu @@ -29,11 +29,19 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_eig) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", EigOpForward); #if MXNET_USE_CUSOLVER == 1 NNVM_REGISTER_OP(_npi_eigh) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", EighOpForward); #endif diff --git a/src/operator/numpy/linalg/np_eigvals.cu b/src/operator/numpy/linalg/np_eigvals.cu index 974dedc6172e..0b21ffe24262 100644 --- a/src/operator/numpy/linalg/np_eigvals.cu +++ b/src/operator/numpy/linalg/np_eigvals.cu @@ -29,11 +29,19 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_eigvals) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", EigvalsOpForward); #if MXNET_USE_CUSOLVER == 1 NNVM_REGISTER_OP(_npi_eigvalsh) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", EigvalshOpForward); #endif diff --git a/src/operator/numpy/linalg/np_norm_backward.cu b/src/operator/numpy/linalg/np_norm_backward.cu index 09e85ab36f19..1936cf937419 100644 --- a/src/operator/numpy/linalg/np_norm_backward.cu +++ b/src/operator/numpy/linalg/np_norm_backward.cu @@ -27,6 +27,11 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_backward_npi_norm) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { + const NumpyNormParam& param = nnvm::get(attrs.parsed); + return param.axis.value().ndim() == 2; + }) .set_attr("FCompute", NumpyNormComputeBackward); } // namespace op diff --git a/src/operator/numpy/linalg/np_norm_forward.cu b/src/operator/numpy/linalg/np_norm_forward.cu index 6feecb09a09e..7f8673cd0ec9 100644 --- 
a/src/operator/numpy/linalg/np_norm_forward.cu +++ b/src/operator/numpy/linalg/np_norm_forward.cu @@ -27,6 +27,11 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_norm) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs& attrs, const bool) { + const NumpyNormParam& param = nnvm::get(attrs.parsed); + return param.axis.value().ndim() == 2; + }) .set_attr("FCompute", NumpyNormComputeForward); } // namespace op diff --git a/src/operator/numpy/np_boolean_mask_assign.cu b/src/operator/numpy/np_boolean_mask_assign.cu index 6fa59bea7710..909a7de63788 100644 --- a/src/operator/numpy/np_boolean_mask_assign.cu +++ b/src/operator/numpy/np_boolean_mask_assign.cu @@ -263,9 +263,17 @@ void NumpyBooleanAssignForwardGPU(const nnvm::NodeAttrs& attrs, } NNVM_REGISTER_OP(_npi_boolean_mask_assign_scalar) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyBooleanAssignForwardGPU); NNVM_REGISTER_OP(_npi_boolean_mask_assign_tensor) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyBooleanAssignForwardGPU); } // namespace op diff --git a/src/operator/numpy/np_constraint_check.cu b/src/operator/numpy/np_constraint_check.cu index f83fca0e5c33..60260affdef3 100644 --- a/src/operator/numpy/np_constraint_check.cu +++ b/src/operator/numpy/np_constraint_check.cu @@ -39,6 +39,10 @@ void GetReduceOutput(mshadow::Stream *s, const TBlob &output_blob, boo } NNVM_REGISTER_OP(_npx_constraint_check) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", ConstraintCheckForward); } // namespace op diff --git a/src/operator/numpy/np_percentile_op.cu b/src/operator/numpy/np_percentile_op.cu index b24c230d6216..09cdd2e30796 100644 --- a/src/operator/numpy/np_percentile_op.cu +++ b/src/operator/numpy/np_percentile_op.cu @@ -48,6 +48,10 @@ bool CheckInvalidInput(mshadow::Stream *s, const QType *data, } NNVM_REGISTER_OP(_npi_percentile) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyPercentileForward); } // namespace op diff --git a/src/operator/numpy/random/np_bernoulli_op.cu b/src/operator/numpy/random/np_bernoulli_op.cu index a73bf9929db3..5ac005f85db6 100644 --- a/src/operator/numpy/random/np_bernoulli_op.cu +++ b/src/operator/numpy/random/np_bernoulli_op.cu @@ -29,6 +29,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_bernoulli) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyBernoulliForward); } // namespace op diff --git a/src/operator/numpy/random/np_exponential_op.cu b/src/operator/numpy/random/np_exponential_op.cu index 4740b644a60c..4161a4bf7826 100644 --- a/src/operator/numpy/random/np_exponential_op.cu +++ b/src/operator/numpy/random/np_exponential_op.cu @@ -29,6 +29,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_exponential) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyExponentialForward); NNVM_REGISTER_OP(_backward_broadcast_exponential) diff --git a/src/operator/numpy/random/np_gamma_op.cu b/src/operator/numpy/random/np_gamma_op.cu index 5be15c7b9d13..8c3bca0ca44f 100644 --- a/src/operator/numpy/random/np_gamma_op.cu +++ b/src/operator/numpy/random/np_gamma_op.cu @@ -30,6 +30,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_gamma) 
+.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyGammaForward); } // namespace op diff --git a/src/operator/numpy/random/np_multinomial_op.cu b/src/operator/numpy/random/np_multinomial_op.cu index 132d67beeb6a..23de9a72bde6 100644 --- a/src/operator/numpy/random/np_multinomial_op.cu +++ b/src/operator/numpy/random/np_multinomial_op.cu @@ -43,6 +43,10 @@ void CheckPvalGPU(const OpContext& ctx, DType* input, int prob_length) { } NNVM_REGISTER_OP(_npi_multinomial) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyMultinomialForward); } // namespace op diff --git a/src/operator/numpy/random/np_normal_op.cu b/src/operator/numpy/random/np_normal_op.cu index d45bc2321bd7..dc5b73254b08 100644 --- a/src/operator/numpy/random/np_normal_op.cu +++ b/src/operator/numpy/random/np_normal_op.cu @@ -29,12 +29,20 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_normal) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyNormalForward); NNVM_REGISTER_OP(_backward_broadcast_normal) .set_attr("FCompute", NormalReparamBackward); NNVM_REGISTER_OP(_npi_normal_n) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyNormalForward); } // namespace op diff --git a/src/operator/numpy/random/np_pareto_op.cu b/src/operator/numpy/random/np_pareto_op.cu index d8a8a896e653..eecc2a8d44e8 100644 --- a/src/operator/numpy/random/np_pareto_op.cu +++ b/src/operator/numpy/random/np_pareto_op.cu @@ -29,6 +29,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_pareto) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyParetoForward); NNVM_REGISTER_OP(_backward_broadcast_pareto) diff --git a/src/operator/numpy/random/np_power_op.cu b/src/operator/numpy/random/np_power_op.cu index d5067f83bb02..154ef2e18f45 100644 --- a/src/operator/numpy/random/np_power_op.cu +++ b/src/operator/numpy/random/np_power_op.cu @@ -29,6 +29,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_powerd) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyPowerForward); } // namespace op diff --git a/src/operator/numpy/random/np_rayleigh_op.cu b/src/operator/numpy/random/np_rayleigh_op.cu index d35ac073fc4d..4eb6e390a922 100644 --- a/src/operator/numpy/random/np_rayleigh_op.cu +++ b/src/operator/numpy/random/np_rayleigh_op.cu @@ -29,6 +29,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_rayleigh) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyRayleighForward); NNVM_REGISTER_OP(_backward_broadcast_rayleigh) diff --git a/src/operator/numpy/random/np_weibull_op.cu b/src/operator/numpy/random/np_weibull_op.cu index 57d609d62768..645c8c4366d3 100644 --- a/src/operator/numpy/random/np_weibull_op.cu +++ b/src/operator/numpy/random/np_weibull_op.cu @@ -29,6 +29,10 @@ namespace mxnet { namespace op { NNVM_REGISTER_OP(_npi_weibull) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", NumpyWeibullForward); NNVM_REGISTER_OP(_backward_broadcast_weibull) diff --git a/src/operator/tensor/indexing_op.cu b/src/operator/tensor/indexing_op.cu index 6904656b304b..44be673e8dcd 
100644 --- a/src/operator/tensor/indexing_op.cu +++ b/src/operator/tensor/indexing_op.cu @@ -877,6 +877,10 @@ NNVM_REGISTER_OP(one_hot) .set_attr("FCompute", OneHotOpForward); NNVM_REGISTER_OP(gather_nd) +.set_attr("FIsCUDAGraphsCompatible", + [](const NodeAttrs&, const bool) { + return false; + }) .set_attr("FCompute", GatherNDForwardGPU); NNVM_REGISTER_OP(scatter_nd) From f7aa6545b6ef161ed43ace557bcfd1d945df659f Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 14 Sep 2020 13:12:53 -0700 Subject: [PATCH 05/13] Cleaning --- src/executor/cuda_graphs.h | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/src/executor/cuda_graphs.h b/src/executor/cuda_graphs.h index 482d6f9bfde4..704100bfcd7d 100644 --- a/src/executor/cuda_graphs.h +++ b/src/executor/cuda_graphs.h @@ -56,20 +56,11 @@ inline std::string CudaDim3ToString(const dim3& dims) { return ss.str(); } -#define CU_CALL(func) \ - { \ - CUresult e = (func); \ - if (e != CUDA_SUCCESS) { \ - std::cerr << "cuda driver failure code: " << e << std::endl; \ - exit(1); \ - } \ - } - // Get the type of a CUDA Graph node (e.g. kernel launch, memcpy, etc.) inline CUgraphNodeType CudaGraphNodeType(const cudaGraphNode_t node) { CUgraphNode cu_node = node; CUgraphNodeType t; - CU_CALL(cuGraphNodeGetType(cu_node, &t)); + CUDA_DRIVER_CALL(cuGraphNodeGetType(cu_node, &t)); return t; } @@ -92,7 +83,7 @@ inline std::string CudaGraphNodeToString(const cudaGraphNode_t node) { CUgraphNode cu_node = node; CUgraphNodeType t; - CU_CALL(cuGraphNodeGetType(cu_node, &t)); + CUDA_DRIVER_CALL(cuGraphNodeGetType(cu_node, &t)); switch (t) { case CU_GRAPH_NODE_TYPE_KERNEL: { From 5d507107f6732a78f3dd2e93261a4e4e69c43229 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 14 Sep 2020 14:31:38 -0700 Subject: [PATCH 06/13] Properly guard graphs --- src/executor/cuda_graphs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/executor/cuda_graphs.h b/src/executor/cuda_graphs.h index 704100bfcd7d..be5ea111e3c9 100644 --- a/src/executor/cuda_graphs.h +++ b/src/executor/cuda_graphs.h @@ -538,9 +538,9 @@ class CudaGraphsExec { bool is_enabled_; }; -#endif // CUDA_GRAPHS_AVAILABLE - - } // namespace cuda_graphs } // namespace mxnet + +#endif // CUDA_GRAPHS_AVAILABLE + #endif // MXNET_EXECUTOR_CUDA_GRAPHS_H_ From bb8740e064fe228325cc2d0078b861f5bfdf55bb Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 14 Sep 2020 15:12:47 -0700 Subject: [PATCH 07/13] Limit graphs to CUDA 10.2+ --- src/executor/cuda_graphs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/executor/cuda_graphs.h b/src/executor/cuda_graphs.h index be5ea111e3c9..d2e1ba659c2f 100644 --- a/src/executor/cuda_graphs.h +++ b/src/executor/cuda_graphs.h @@ -35,7 +35,7 @@ #include "../common/cuda_utils.h" #if MXNET_USE_CUDA -#define CUDA_GRAPHS_AVAILABLE (CUDA_VERSION >= 10010) +#define CUDA_GRAPHS_AVAILABLE (CUDA_VERSION >= 10020) #else #define CUDA_GRAPHS_AVAILABLE (0) #endif From 1dcb30e8f14740622441dff71e675795283df002 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 15 Sep 2020 09:27:16 -0700 Subject: [PATCH 08/13] Fix the compilation when graphs are not available --- src/imperative/imperative_utils.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index 4933808ad892..498e868a2b39 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -1003,8 +1003,7 @@ inline Engine::OprHandle CreateEngineOp( if 
(is_async) { execs[0]->op_ctx.async_on_complete = on_complete; } - for (const auto& exec : execs) exec->Run(ctx, is_gpu); - OpExecutor::RunAll(execs, ctx, is_gpu); + exec::OpExecutor::RunAll(execs, ctx, is_gpu); #endif // call on complete only if it is async op if (!is_async) { From 3746543dd814d21b56f964a0bf3f93cb5624e56a Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 15 Sep 2020 11:54:57 -0700 Subject: [PATCH 09/13] Guarding the libcuda.so usage behind RTC compilation flag --- src/executor/cuda_graphs.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/executor/cuda_graphs.h b/src/executor/cuda_graphs.h index d2e1ba659c2f..cd39ce024903 100644 --- a/src/executor/cuda_graphs.h +++ b/src/executor/cuda_graphs.h @@ -56,14 +56,6 @@ inline std::string CudaDim3ToString(const dim3& dims) { return ss.str(); } -// Get the type of a CUDA Graph node (e.g. kernel launch, memcpy, etc.) -inline CUgraphNodeType CudaGraphNodeType(const cudaGraphNode_t node) { - CUgraphNode cu_node = node; - CUgraphNodeType t; - CUDA_DRIVER_CALL(cuGraphNodeGetType(cu_node, &t)); - return t; -} - // Return the list of CUDA Graph nodes from a graph inline std::vector GetCudaGraphNodes(cudaGraph_t cuda_graph) { size_t numNodes; @@ -75,6 +67,10 @@ inline std::vector GetCudaGraphNodes(cudaGraph_t cuda_graph) { return graphNodes; } +// It does not really involve RTC, but requires libcuda.so, +// which is linked only when RTC is enabled. +#if MXNET_ENABLE_CUDA_RTC + inline std::string CudaGraphNodeToString(const cudaGraphNode_t node) { std::stringstream ss; @@ -145,6 +141,7 @@ inline std::string CudaGraphNodeToString(const cudaGraphNode_t node) { return ss.str(); } +#endif // MXNET_ENABLE_CUDA_RTC // CUDA Graphs are managed in RAII fashion by smart pointers below. // Function objects (preferred for readability) provide the deleter function. @@ -244,10 +241,12 @@ class CudaGraphsSubSegExec { std::vector graph_nodes = GetCudaGraphNodes(cuda_graph); size_t num_nodes = graph_nodes.size(); LOG(INFO) << " Graph has " << num_nodes << " nodes:"; +#if MXNET_ENABLE_CUDA_RTC for (size_t i = 0; i != num_nodes; ++i) { LOG(INFO) << " node " << i << " = " << CudaGraphNodeToString(graph_nodes[i]); } +#endif // MXNET_ENABLE_CUDA_RTC } } From a925cebc9b96cd85e7a61e98fb1f123d8f8f74a4 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Tue, 15 Sep 2020 20:37:52 -0700 Subject: [PATCH 10/13] Document the env variables --- docs/static_site/src/pages/api/faq/env_var.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/static_site/src/pages/api/faq/env_var.md b/docs/static_site/src/pages/api/faq/env_var.md index e0b70a658b62..831f7ee3e043 100644 --- a/docs/static_site/src/pages/api/faq/env_var.md +++ b/docs/static_site/src/pages/api/faq/env_var.md @@ -134,6 +134,16 @@ $env:MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0 * MXNET_EXEC_BULK_EXEC_MAX_NODE_TRAIN_BWD - Values: Int ```(default=)``` - The maximum number of nodes in the subgraph executed in bulk during training (not inference) in the backward pass. +* MXNET_ENABLE_CUDA_GRAPHS + - Values: 0(false) or 1(true) ```(default=0)``` + - If set to `1`, MXNet will utilize CUDA graphs when executing models on the GPU when possible. + - For CUDA graphs execution, one needs to use either symbolic model or Gluon model hybridized with options `static_alloc` and `static_shape` set to True. 
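+  - Example (illustrative, with a hypothetical `train.py` script): set `MXNET_ENABLE_CUDA_GRAPHS=1` in the environment, call `net.hybridize(static_alloc=True, static_shape=True)` on the Gluon model, and run the usual loop; the first execution of each op segment runs conventionally and capture happens on a later one.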
+* MXNET_CUDA_GRAPHS_VERBOSE + - Values: 0(false) or 1(true) ```(default=0)``` + - If set to `1`, CUDA graphs executor will provide information about the graph being captured and executed. +* MXNET_CUDA_GRAPHS_MAX_LOG_ENTRIES + - Values: Int ```(default=0)``` + - The maximum number of log messages generated by CUDA graphs executor. ## Control the Data Communication From ae4ef9a7ababcacf6050b51482c65454b877900e Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 17 Sep 2020 13:49:00 -0700 Subject: [PATCH 11/13] Add test --- tests/python/gpu/test_gluon_gpu.py | 74 +++++++++++++++++++++++++++++- 1 file changed, 73 insertions(+), 1 deletion(-) diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py index 42a2424c7d9b..54d912eb60b1 100644 --- a/tests/python/gpu/test_gluon_gpu.py +++ b/tests/python/gpu/test_gluon_gpu.py @@ -22,7 +22,7 @@ import time import mxnet as mx import multiprocessing as mp -from mxnet.test_utils import check_consistency, set_default_context, assert_almost_equal, rand_ndarray +from mxnet.test_utils import check_consistency, set_default_context, assert_almost_equal, rand_ndarray, rand_shape_nd import mxnet.ndarray as nd import numpy as np import math @@ -635,6 +635,78 @@ def test_gemms_true_fp16(): atol=atol, rtol=rtol) os.environ["MXNET_FC_TRUE_FP16"] = "0" +@with_seed() +def test_cuda_graphs(): + class GraphTester(gluon.HybridBlock): + def __init__(self, function_to_test, **kwargs) + super(GraphTester, self).__init__(**kwargs) + with self.name_scope(): + self.f = function_to_test() + + def hybrid_forward(self, F, *args): + # We need to isolate the operation to be fully inside the graph + # in order for graphs usage to be possible + copied_args = [F.identity(a) for a in args] + outputs = self.f(copied_args) + if isinstance(outputs, (list, tuple)): + return [F.identity(o) for o in outputs] + else: + return F.identity(outputs) + + class TestDesc: + def __init__(self, name, f, num_inputs=1, input_dim=4): + self.name = name + self.f = f + self.num_inputs = num_inputs + self.input_dim = input_dim + + def generate_inputs(self): + shape = rand_shape_nd(self.input_dim) + return [mx.random.uniform(shape=shape) for _ in num_inputs] + + tested_ops = [ + TestDesc('add', lambda: (lambda x, y: x + y), num_inputs = 2), + TestDesc('add_scalar', lambda: (lambda x: x + 0.5)), + TestDesc('Conv', lambda: mx.gluon.nn.Conv2D(channels=32, kernel_size=(3,3))), + TestDesc('Dense', lambda: mx.gluon.nn.Dense(units=128)), + TestDesc('Activation', lambda: mx.gluon.nn.Activation(act_type='tanh')), + ] + + N = 5 + + graph_env = 'MXNET_ENABLE_CUDA_GRAPHS' + if graph_env in os.environ: + old_env_value = os.environ[graph_env] + else: + old_env_value = None + + os.environ[graph_env] = '1' + + for test_desc in tested_ops: + print("Testing ", test_desc.name) + inputs = test_desc.generate_inputs() + net = GraphTester(test_desc.f) + netg = GraphTester(test_desc.f) + + # initialize parameters + net.initialize() + netg.initialize() + + net(inputs) + + for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): + p2.set_data(p1.data()) + + netg.hybridize(static_alloc=True, static_shape=True) + + for _ in range(N): + assert_almost_equal(net(inputs), netg(inputs)) + + + if old_env_value is not None: + os.environ[graph_env] = old_env_value + else: + del(os.environ[graph_env]) if __name__ == '__main__': import nose From adae98a4c6b2ca1a723cb6ffe143a0ac3b899ba3 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 17 Sep 2020 14:09:20 -0700 Subject: [PATCH 
12/13] Fix the test --- src/executor/cuda_graphs.h | 10 ++-- tests/python/gpu/test_gluon_gpu.py | 85 ++++++++++++++++++++++-------- 2 files changed, 68 insertions(+), 27 deletions(-) diff --git a/src/executor/cuda_graphs.h b/src/executor/cuda_graphs.h index cd39ce024903..e669d7d1d2e3 100644 --- a/src/executor/cuda_graphs.h +++ b/src/executor/cuda_graphs.h @@ -468,6 +468,7 @@ class CudaGraphsExec { bool OpOK(const std::shared_ptr &exec) { static auto& fstateful = Op::GetAttr("FCreateOpState"); static auto& fgraphcompatible = Op::GetAttr("FIsCUDAGraphsCompatible"); + static auto& fcompute_ex = Op::GetAttr("FComputeEx"); const auto& attrs = exec->attrs; if (attrs.op != nullptr) { const auto f = fgraphcompatible.get(attrs.op, nullptr); @@ -476,14 +477,15 @@ class CudaGraphsExec { } if (fstateful.get(attrs.op, nullptr) != nullptr) { if (verbose_) { - LOG(INFO) << "Omitting stateful operator" << attrs.op->name << " from CUDA graph."; + LOG(INFO) << "Omitting stateful operator " << attrs.op->name << " from CUDA graph."; } return false; } - if (exec->dispatch_mode == DispatchMode::kFComputeEx || + if ((fcompute_ex.get(attrs.op, nullptr) != nullptr && + exec->dispatch_mode == DispatchMode::kFComputeEx) || exec->dispatch_mode == DispatchMode::kFComputeFallback) { if (verbose_) { - LOG(INFO) << "Omitting operator" << attrs.op->name + LOG(INFO) << "Omitting operator " << attrs.op->name << " from CUDA graph due to dispatch mode " << static_cast(exec->dispatch_mode); } @@ -493,7 +495,7 @@ class CudaGraphsExec { for (auto& resource : exec->op_ctx.requested) { if (!(resource.req.type == ResourceRequest::kTempSpace)) { if (verbose_) { - LOG(INFO) << "Omitting operator" << attrs.op->name + LOG(INFO) << "Omitting operator " << attrs.op->name << " from CUDA graph due to using the resource type " << static_cast(resource.req.type); } diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py index 54d912eb60b1..87aaf5a5566b 100644 --- a/tests/python/gpu/test_gluon_gpu.py +++ b/tests/python/gpu/test_gluon_gpu.py @@ -20,9 +20,10 @@ import os import tempfile import time +import random import mxnet as mx import multiprocessing as mp -from mxnet.test_utils import check_consistency, set_default_context, assert_almost_equal, rand_ndarray, rand_shape_nd +from mxnet.test_utils import check_consistency, set_default_context, assert_almost_equal, rand_ndarray import mxnet.ndarray as nd import numpy as np import math @@ -30,7 +31,7 @@ curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) sys.path.insert(0, os.path.join(curr_path, '../unittest')) -from common import setup_module, with_seed, teardown, assert_raises_cudnn_not_satisfied, run_in_spawned_process +from common import setup_module, with_seed, teardown, assert_raises_cudnn_not_satisfied, run_in_spawned_process, random_seed from test_gluon import * from test_loss import * from test_gluon_rnn import * @@ -638,7 +639,7 @@ def test_gemms_true_fp16(): @with_seed() def test_cuda_graphs(): class GraphTester(gluon.HybridBlock): - def __init__(self, function_to_test, **kwargs) + def __init__(self, function_to_test, **kwargs): super(GraphTester, self).__init__(**kwargs) with self.name_scope(): self.f = function_to_test() @@ -647,7 +648,7 @@ def hybrid_forward(self, F, *args): # We need to isolate the operation to be fully inside the graph # in order for graphs usage to be possible copied_args = [F.identity(a) for a in args] - outputs = self.f(copied_args) + outputs = self.f(*copied_args) if isinstance(outputs, (list, tuple)): 
return [F.identity(o) for o in outputs] else: @@ -661,30 +662,48 @@ def __init__(self, name, f, num_inputs=1, input_dim=4): self.input_dim = input_dim def generate_inputs(self): - shape = rand_shape_nd(self.input_dim) - return [mx.random.uniform(shape=shape) for _ in num_inputs] + shape = tuple(np.random.randint(4, 11, size=self.input_dim)) + ret = [mx.random.uniform(shape=shape) for _ in range(self.num_inputs)] + for r in ret: + r.attach_grad() + return ret tested_ops = [ TestDesc('add', lambda: (lambda x, y: x + y), num_inputs = 2), TestDesc('add_scalar', lambda: (lambda x: x + 0.5)), - TestDesc('Conv', lambda: mx.gluon.nn.Conv2D(channels=32, kernel_size=(3,3))), + TestDesc('Conv', lambda: mx.gluon.nn.Conv2D(channels=32, kernel_size=(1,1))), + TestDesc('ConvTranspose', lambda: mx.gluon.nn.Conv2DTranspose(channels=32, kernel_size=(1,1))), TestDesc('Dense', lambda: mx.gluon.nn.Dense(units=128)), - TestDesc('Activation', lambda: mx.gluon.nn.Activation(act_type='tanh')), + TestDesc('Activation', lambda: mx.gluon.nn.Activation('tanh')), + #TestDesc('Dropout', lambda: mx.gluon.nn.Dropout(0.5)), + TestDesc('Flatten', lambda: mx.gluon.nn.Flatten()), + TestDesc('MaxPool', lambda: mx.gluon.nn.MaxPool2D()), + TestDesc('AvgPool', lambda: mx.gluon.nn.AvgPool2D()), + TestDesc('GlobalMaxPool', lambda: mx.gluon.nn.GlobalMaxPool2D()), + TestDesc('GlobalAvgPool', lambda: mx.gluon.nn.GlobalAvgPool2D()), + TestDesc('ReflectionPad2D', lambda: mx.gluon.nn.ReflectionPad2D()), + TestDesc('BatchNorm', lambda: mx.gluon.nn.BatchNorm()), + TestDesc('InstanceNorm', lambda: mx.gluon.nn.InstanceNorm()), + TestDesc('LayerNorm', lambda: mx.gluon.nn.LayerNorm()), + TestDesc('LeakyReLU', lambda: mx.gluon.nn.LeakyReLU(0.1)), + TestDesc('PReLU', lambda: mx.gluon.nn.PReLU()), + TestDesc('ELU', lambda: mx.gluon.nn.ELU()), + TestDesc('SELU', lambda: mx.gluon.nn.SELU()), + TestDesc('Swish', lambda: mx.gluon.nn.Swish()), ] - N = 5 + N = 10 - graph_env = 'MXNET_ENABLE_CUDA_GRAPHS' - if graph_env in os.environ: - old_env_value = os.environ[graph_env] - else: - old_env_value = None - - os.environ[graph_env] = '1' + os.environ['MXNET_ENABLE_CUDA_GRAPHS'] = '1' + os.environ['MXNET_USE_FUSION'] = '0' for test_desc in tested_ops: print("Testing ", test_desc.name) inputs = test_desc.generate_inputs() + inputsg = [i.copy() for i in inputs] + for i in inputsg: + i.attach_grad() + seed = random.randint(0, 10000) net = GraphTester(test_desc.f) netg = GraphTester(test_desc.f) @@ -692,21 +711,41 @@ def generate_inputs(self): net.initialize() netg.initialize() - net(inputs) + net(*inputs) for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): p2.set_data(p1.data()) netg.hybridize(static_alloc=True, static_shape=True) - for _ in range(N): - assert_almost_equal(net(inputs), netg(inputs)) + print("Testing inference mode") + with random_seed(seed): + for _ in range(N): + assert_almost_equal(net(*inputs), netg(*inputsg)) + mx.nd.waitall() + print("Testing training mode") + for _ in range(N): + with random_seed(seed): + with mx.autograd.record(): + out = net(*inputs) + out.backward() + + with random_seed(seed): + with mx.autograd.record(): + outg = netg(*inputsg) + outg.backward() + + assert_almost_equal(out, outg) + for i, ig in zip(inputs, inputsg): + assert_almost_equal(i.grad, ig.grad) + + for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): + assert_almost_equal(p1.data(), p2.data()) + if p1.grad_req != 'null': + assert_almost_equal(p1.grad(), p2.grad()) + mx.nd.waitall() - if 
old_env_value is not None: - os.environ[graph_env] = old_env_value - else: - del(os.environ[graph_env]) if __name__ == '__main__': import nose From 83b0b41a22e3e0cdbd30d6fb72eba222b6754e82 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 18 Sep 2020 11:46:54 -0700 Subject: [PATCH 13/13] Use with_environment --- tests/python/gpu/test_gluon_gpu.py | 91 +++++++++++++++--------------- 1 file changed, 45 insertions(+), 46 deletions(-) diff --git a/tests/python/gpu/test_gluon_gpu.py b/tests/python/gpu/test_gluon_gpu.py index 28eca456a0c9..6b5b4fbbd764 100644 --- a/tests/python/gpu/test_gluon_gpu.py +++ b/tests/python/gpu/test_gluon_gpu.py @@ -702,57 +702,56 @@ def generate_inputs(self): N = 10 - os.environ['MXNET_ENABLE_CUDA_GRAPHS'] = '1' - os.environ['MXNET_USE_FUSION'] = '0' - - for test_desc in tested_ops: - print("Testing ", test_desc.name) - inputs = test_desc.generate_inputs() - inputsg = [i.copy() for i in inputs] - for i in inputsg: - i.attach_grad() - seed = random.randint(0, 10000) - net = GraphTester(test_desc.f) - netg = GraphTester(test_desc.f) - - # initialize parameters - net.initialize() - netg.initialize() - - net(*inputs) - - for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): - p2.set_data(p1.data()) + with environment({'MXNET_ENABLE_CUDA_GRAPHS': '1', + 'MXNET_USE_FUSION': '0'}): + for test_desc in tested_ops: + print("Testing ", test_desc.name) + inputs = test_desc.generate_inputs() + inputsg = [i.copy() for i in inputs] + for i in inputsg: + i.attach_grad() + seed = random.randint(0, 10000) + net = GraphTester(test_desc.f) + netg = GraphTester(test_desc.f) + + # initialize parameters + net.initialize() + netg.initialize() + + net(*inputs) - netg.hybridize(static_alloc=True, static_shape=True) - - print("Testing inference mode") - with random_seed(seed): - for _ in range(N): - assert_almost_equal(net(*inputs), netg(*inputsg)) + for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): + p2.set_data(p1.data()) - mx.nd.waitall() - print("Testing training mode") - for _ in range(N): - with random_seed(seed): - with mx.autograd.record(): - out = net(*inputs) - out.backward() + netg.hybridize(static_alloc=True, static_shape=True) + print("Testing inference mode") with random_seed(seed): - with mx.autograd.record(): - outg = netg(*inputsg) - outg.backward() - - assert_almost_equal(out, outg) - for i, ig in zip(inputs, inputsg): - assert_almost_equal(i.grad, ig.grad) + for _ in range(N): + assert_almost_equal(net(*inputs), netg(*inputsg)) - for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): - assert_almost_equal(p1.data(), p2.data()) - if p1.grad_req != 'null': - assert_almost_equal(p1.grad(), p2.grad()) - mx.nd.waitall() + mx.nd.waitall() + print("Testing training mode") + for _ in range(N): + with random_seed(seed): + with mx.autograd.record(): + out = net(*inputs) + out.backward() + + with random_seed(seed): + with mx.autograd.record(): + outg = netg(*inputsg) + outg.backward() + + assert_almost_equal(out, outg) + for i, ig in zip(inputs, inputsg): + assert_almost_equal(i.grad, ig.grad) + + for p1, p2 in zip(net.collect_params().values(), netg.collect_params().values()): + assert_almost_equal(p1.data(), p2.data()) + if p1.grad_req != 'null': + assert_almost_equal(p1.grad(), p2.grad()) + mx.nd.waitall() if __name__ == '__main__':
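The final patch swaps the manual save/set/delete handling of `MXNET_ENABLE_CUDA_GRAPHS` for a scoped `environment` context manager, so both variables are restored even when an assertion inside the block fails. A minimal sketch of that pattern follows; it assumes nothing about the real helper shipped with the test utilities beyond the save-and-restore behaviour visible in the diff above.

import os
from contextlib import contextmanager

@contextmanager
def environment(env_vars):
    """Temporarily set environment variables, restoring the previous values on exit."""
    saved = {name: os.environ.get(name) for name in env_vars}
    try:
        for name, value in env_vars.items():
            os.environ[name] = value
        yield
    finally:
        for name, old_value in saved.items():
            if old_value is None:
                os.environ.pop(name, None)
            else:
                os.environ[name] = old_value

# Usage mirroring the test: the variables are reset even if an assert fires inside.
with environment({'MXNET_ENABLE_CUDA_GRAPHS': '1', 'MXNET_USE_FUSION': '0'}):
    pass  # run the CUDA graphs consistency checks here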