From f0318794779581419b0162e24e6923ac3711cfd6 Mon Sep 17 00:00:00 2001 From: kaiJIN Date: Tue, 26 Mar 2019 18:35:58 +0800 Subject: [PATCH 1/2] Support for running on arbitrary CUDA device. (#537) * support for any one cuda device * Revert "support for any one cuda device" This reverts commit 0197e4e2ef18ec41cc155f3ae2a0face5b77e1e9. * support runnning for anyone cuda device * using safe CUDAGuard rather than intrinsic CUDASetDevice * supplement a header dependency (test passed) * Support for arbitrary GPU device. * Support for arbitrary GPU device. * add docs for two method to control devices --- README.md | 21 +++++++++++++++++++ maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu | 4 ++++ maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu | 4 ++++ .../csrc/cuda/SigmoidFocalLoss_cuda.cu | 7 ++++++- maskrcnn_benchmark/csrc/cuda/nms.cu | 3 +++ 5 files changed, 38 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 780722ed8..c5acea6fa 100644 --- a/README.md +++ b/README.md @@ -68,6 +68,27 @@ image = ... predictions = coco_demo.run_on_opencv_image(image) ``` +### Use it on an arbitrary GPU device +For some cases, while multi-GPU devices are installed in a machine, a possible situation is that +we only have accesse to a specified GPU device (e.g. CUDA:1 or CUDA:2) for inference, testing or training. +Here, the repository currently supports two methods to control devices. + +#### 1. using CUDA_VISIBLE_DEVICES environment variable (Recommend) +Here is an example for Mask R-CNN R-50 FPN quick on the second device (CUDA:1): +```bash +export CUDA_VISIBLE_DEVICES=1 +python tools/train_net.py --config-file=configs/quick_schedules/e2e_mask_rcnn_R_50_FPN_quick.yaml +``` +Now, the session will be totally loaded on the second GPU device (CUDA:1). + +#### 2. using MODEL.DEVICE flag +In addition, the program could run on a sepcific GPU device by setting `MODEL.DEVICE` flag. +```bash +python tools/train_net.py --config-file=configs/quick_schedules/e2e_mask_rcnn_R_50_FPN_quick.yaml MODEL.DEVICE cuda:1 +``` +Where, we add a `MODEL.DEVICE cuda:1` flag to configure the target device. +*Pay attention, there is still a small part of memory stored in `cuda:0` for some reasons.* + ## Perform training on COCO dataset For the following examples to work, you need to first install `maskrcnn_benchmark`. diff --git a/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu b/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu index 170771aa8..29e7ac6ed 100644 --- a/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu +++ b/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu @@ -1,6 +1,7 @@ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #include #include +#include #include #include @@ -263,6 +264,8 @@ at::Tensor ROIAlign_forward_cuda(const at::Tensor& input, AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); + at::cuda::CUDAGuard device_guard(input.device()); + auto num_rois = rois.size(0); auto channels = input.size(1); auto height = input.size(2); @@ -311,6 +314,7 @@ at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad, const int sampling_ratio) { AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor"); AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); + at::cuda::CUDAGuard device_guard(grad.device()); auto num_rois = rois.size(0); auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options()); diff --git a/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu b/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu index cef3beaa4..f79bb71dc 100644 --- a/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu +++ b/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu @@ -1,6 +1,7 @@ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #include #include +#include #include #include @@ -115,6 +116,8 @@ std::tuple ROIPool_forward_cuda(const at::Tensor& input, AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); + at::cuda::CUDAGuard device_guard(input.device()); + auto num_rois = rois.size(0); auto channels = input.size(1); auto height = input.size(2); @@ -167,6 +170,7 @@ at::Tensor ROIPool_backward_cuda(const at::Tensor& grad, AT_ASSERTM(grad.type().is_cuda(), "grad must be a CUDA tensor"); AT_ASSERTM(rois.type().is_cuda(), "rois must be a CUDA tensor"); // TODO add more checks + at::cuda::CUDAGuard device_guard(grad.device()); auto num_rois = rois.size(0); auto grad_input = at::zeros({batch_size, channels, height, width}, grad.options()); diff --git a/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu b/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu index cd9b4c96b..e25424e00 100644 --- a/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu +++ b/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu @@ -4,6 +4,7 @@ // cyfu@cs.unc.edu #include #include +#include #include #include @@ -111,6 +112,8 @@ at::Tensor SigmoidFocalLoss_forward_cuda( AT_ASSERTM(targets.type().is_cuda(), "targets must be a CUDA tensor"); AT_ASSERTM(logits.dim() == 2, "logits should be NxClass"); + at::cuda::CUDAGuard device_guard(logits.device()); + const int num_samples = logits.size(0); auto losses = at::empty({num_samples, logits.size(1)}, logits.options()); @@ -156,7 +159,9 @@ at::Tensor SigmoidFocalLoss_backward_cuda( const int num_samples = logits.size(0); AT_ASSERTM(logits.size(1) == num_classes, "logits.size(1) should be num_classes"); - + + at::cuda::CUDAGuard device_guard(logits.device()); + auto d_logits = at::zeros({num_samples, num_classes}, logits.options()); auto d_logits_size = num_samples * logits.size(1); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); diff --git a/maskrcnn_benchmark/csrc/cuda/nms.cu b/maskrcnn_benchmark/csrc/cuda/nms.cu index 833d8523a..7bb0e50a1 100644 --- a/maskrcnn_benchmark/csrc/cuda/nms.cu +++ b/maskrcnn_benchmark/csrc/cuda/nms.cu @@ -1,6 +1,7 @@ // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. #include #include +#include #include #include @@ -70,6 +71,8 @@ __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) { using scalar_t = float; AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor"); + at::cuda::CUDAGuard device_guard(boxes.device()); + auto scores = boxes.select(1, 4); auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); auto boxes_sorted = boxes.index_select(0, order_t); From 4a7dcc4da980b1f6681a1dcefdc0938b1adbe623 Mon Sep 17 00:00:00 2001 From: Miguel Varela Ramos Date: Tue, 26 Mar 2019 11:36:55 +0100 Subject: [PATCH 2/2] Rollback dispatch patch (#603) * Merge branch 'master' of /home/braincreator/projects/maskrcnn-benchmark with conflicts. * rolls back the breaking AT dispatch changes (#555) * revert accidental docker changes * revert accidental docker changes (2) --- maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp | 2 +- maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp | 2 +- maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu | 4 ++-- maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu | 4 ++-- maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu | 4 ++-- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp b/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp index cd9fde2ae..d35aedf27 100644 --- a/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp +++ b/maskrcnn_benchmark/csrc/cpu/ROIAlign_cpu.cpp @@ -239,7 +239,7 @@ at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, return output; } - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] { + AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] { ROIAlignForward_cpu_kernel( output_size, input.data(), diff --git a/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp b/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp index 639ca472e..1153dea04 100644 --- a/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp +++ b/maskrcnn_benchmark/csrc/cpu/nms_cpu.cpp @@ -68,7 +68,7 @@ at::Tensor nms_cpu(const at::Tensor& dets, const at::Tensor& scores, const float threshold) { at::Tensor result; - AT_DISPATCH_FLOATING_TYPES(dets.scalar_type(), "nms", [&] { + AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] { result = nms_cpu_kernel(dets, scores, threshold); }); return result; diff --git a/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu b/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu index 29e7ac6ed..2ff36adee 100644 --- a/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu +++ b/maskrcnn_benchmark/csrc/cuda/ROIAlign_cuda.cu @@ -283,7 +283,7 @@ at::Tensor ROIAlign_forward_cuda(const at::Tensor& input, return output; } - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIAlign_forward", [&] { + AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIAlign_forward", [&] { RoIAlignForward<<>>( output_size, input.contiguous().data(), @@ -330,7 +330,7 @@ at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad, return grad_input; } - AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIAlign_backward", [&] { + AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIAlign_backward", [&] { RoIAlignBackwardFeature<<>>( grad.numel(), grad.contiguous().data(), diff --git a/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu b/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu index f79bb71dc..0b2b1758c 100644 --- a/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu +++ b/maskrcnn_benchmark/csrc/cuda/ROIPool_cuda.cu @@ -137,7 +137,7 @@ std::tuple ROIPool_forward_cuda(const at::Tensor& input, return std::make_tuple(output, argmax); } - AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "ROIPool_forward", [&] { + AT_DISPATCH_FLOATING_TYPES(input.type(), "ROIPool_forward", [&] { RoIPoolFForward<<>>( output_size, input.contiguous().data(), @@ -186,7 +186,7 @@ at::Tensor ROIPool_backward_cuda(const at::Tensor& grad, return grad_input; } - AT_DISPATCH_FLOATING_TYPES(grad.scalar_type(), "ROIPool_backward", [&] { + AT_DISPATCH_FLOATING_TYPES(grad.type(), "ROIPool_backward", [&] { RoIPoolFBackward<<>>( grad.numel(), grad.contiguous().data(), diff --git a/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu b/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu index e25424e00..e0e7d3bff 100644 --- a/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu +++ b/maskrcnn_benchmark/csrc/cuda/SigmoidFocalLoss_cuda.cu @@ -128,7 +128,7 @@ at::Tensor SigmoidFocalLoss_forward_cuda( return losses; } - AT_DISPATCH_FLOATING_TYPES(logits.scalar_type(), "SigmoidFocalLoss_forward", [&] { + AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_forward", [&] { SigmoidFocalLossForward<<>>( losses_size, logits.contiguous().data(), @@ -174,7 +174,7 @@ at::Tensor SigmoidFocalLoss_backward_cuda( return d_logits; } - AT_DISPATCH_FLOATING_TYPES(logits.scalar_type(), "SigmoidFocalLoss_backward", [&] { + AT_DISPATCH_FLOATING_TYPES(logits.type(), "SigmoidFocalLoss_backward", [&] { SigmoidFocalLossBackward<<>>( d_logits_size, logits.contiguous().data(),