This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

Many cudaMemcpy in operators #16583

Closed
eric-haibin-lin opened this issue Oct 22, 2019 · 1 comment

Comments

@eric-haibin-lin
Member

src/kvstore/kvstore_utils.cu:  CUDA_CALL(cudaMemcpy(sort_output_ptr, dptr, sort_output_bytes,
src/kvstore/kvstore_utils.cu:  CUDA_CALL(cudaMemcpy(&num_selected_out, num_selected_ptr, num_selected_bytes,
src/ndarray/ndarray_function.cu:      CUDA_CALL(cudaMemcpy(&nnr_out, &row_flg[num_rows-1], sizeof(dim_t),
src/operator/contrib/adamw.cu:    CUDA_CALL(cudaMemcpy(&scale, scale_blob.dptr<DType>(), sizeof(DType),
src/operator/contrib/boolean_mask.cu:  CUDA_CALL(cudaMemcpy(&valid_num, &prefix_sum[idx_size - 1], sizeof(int32_t),
src/operator/contrib/index_array.cu:    CUDA_CALL(cudaMemcpy(workspace.dptr_, cpu_workspace.data(), sizeof(int64_t) * (2 * naxes),
src/operator/contrib/index_array.cu:    CUDA_CALL(cudaMemcpy(workspace.dptr_, inshape.data(), sizeof(dim_t) * ndim,
src/operator/contrib/multi_proposal.cu:  FRCNN_CUDA_CHECK(cudaMemcpy(&mask_host[0],
src/operator/contrib/multi_proposal.cu:    FRCNN_CUDA_CHECK(cudaMemcpy(workspace_proposals.dptr_, &anchors[0],
src/operator/contrib/multi_proposal.cu:        FRCNN_CUDA_CHECK(cudaMemcpy(keep, &_keep[0], sizeof(int) * _keep.size(),
src/operator/contrib/proposal.cu:  FRCNN_CUDA_CHECK(cudaMemcpy(&mask_host[0],
src/operator/contrib/proposal.cu:    FRCNN_CUDA_CHECK(cudaMemcpy(workspace_proposals.dptr_,
src/operator/contrib/proposal.cu:    FRCNN_CUDA_CHECK(cudaMemcpy(&cpu_im_info[0], im_info.dptr_,
src/operator/contrib/proposal.cu:    FRCNN_CUDA_CHECK(cudaMemcpy(keep, &_keep[0], sizeof(int) * _keep.size(),
src/operator/numpy/np_boolean_mask_assign.cu:    CUDA_CALL(cudaMemcpy(&valid_num, &prefix_sum[mask_size], sizeof(size_t),
src/operator/numpy/np_nonzero_op.cu:  CUDA_CALL(cudaMemcpy(&valid_num, &prefix_sum[in_size - 1], sizeof(int32_t),
src/operator/numpy/np_nonzero_op.cu:      CUDA_CALL(cudaMemcpy(out.data().dptr<int64_t>(), &temp, sizeof(int64_t),
src/operator/numpy/np_unique_op.cu:    CUDA_CALL(cudaMemcpy(&valid_num, thrust::raw_pointer_cast(&prefix_sum[input_size - 1]),
src/operator/numpy/np_unique_op.cu:    CUDA_CALL(cudaMemcpy(&valid_num, thrust::raw_pointer_cast(&prefix_sum[temp_shape[0] - 1]),
src/operator/numpy/np_unique_op.cu:      CUDA_CALL(cudaMemcpy(outputs[0].data().dptr<DType>(), inputs[0].data().dptr<DType>(),
src/operator/numpy/random/dist_common.cu:CUDA_CALL(cudaMemcpy(dst, src, sizeof(float), cudaMemcpyDeviceToHost));
src/operator/numpy/random/dist_common.cu:CUDA_CALL(cudaMemcpy(dst, src, sizeof(double), cudaMemcpyDeviceToHost));
src/operator/numpy/random/np_multinomial_op.cu:  CUDA_CALL(cudaMemcpy(&pvals_[0], input, sizeof(DType) * prob_length,
src/operator/rnn-inl.h:      CUDA_CALL(cudaMemcpy(sequence_length_cpu_itype,  sequence_length_ptr_gpu,
src/operator/tensor/cast_storage-inl.cuh:  CUDA_CALL(cudaMemcpy(&nnr, &row_flg[num_rows - 1], sizeof(dim_t), cudaMemcpyDeviceToHost));
src/operator/tensor/cast_storage-inl.cuh:        CUDA_CALL(cudaMemcpy(&nnz, &(indptr[num_rows]), sizeof(IType), cudaMemcpyDeviceToHost));
src/operator/tensor/dot-inl.cuh:          CUDA_CALL(cudaMemcpy(&nnr, nnr_ptr, nnr_bytes, cudaMemcpyDeviceToHost));
src/operator/tensor/dot-inl.cuh:            CUDA_CALL(cudaMemcpy(&nnr_out, &row_flg_out[num_cols_l-1], sizeof(dim_t),
src/operator/tensor/elemwise_binary_op_basic.cu:        CUDA_CALL(cudaMemcpy(&nnr_out, &common_row_table[num_rows-1], sizeof(nnvm::dim_t),
src/operator/tensor/indexing_op.cu:  CUDA_CALL(cudaMemcpy(&is_valid, is_valid_ptr, sizeof(char),
src/operator/tensor/indexing_op.cu:  CUDA_CALL(cudaMemcpy(&nnr, grad_row_idx + data_size, sizeof(RType),
src/operator/tensor/indexing_op.cu:        CUDA_CALL(cudaMemcpy(&nnr, &prefix_sum[num_rows-1], sizeof(dim_t),
src/operator/tensor/matrix_op.cu:        CUDA_CALL(cudaMemcpy(&nnr, &out_indptr[indptr_len-1], sizeof(RType),
src/operator/tensor/square_sum.cu:    CUDA_CALL(cudaMemcpy(&is_diff, is_diff_ptr, sizeof(int32_t), cudaMemcpyDeviceToHost));

There are lots of cudaMemcpy calls in operators. We should replace them with cudaMemcpyAsync followed by cudaStreamSynchronize, so each copy is issued on the operator's stream instead of blocking on the legacy default stream.
#16532 (comment)
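For reference, a minimal sketch of the kind of change being proposed. The buffer names and the CHECK_CUDA macro below are illustrative, not taken from the codebase (MXNet already has CUDA_CALL for error checking); in an operator the stream would typically come from something like mshadow::Stream<gpu>::GetStream(s).

```cpp
#include <cuda_runtime.h>
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Minimal error check for this sketch; in MXNet this role is played by CUDA_CALL.
#define CHECK_CUDA(expr)                                                   \
  do {                                                                     \
    cudaError_t e = (expr);                                                \
    if (e != cudaSuccess) {                                                \
      std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(e));     \
    }                                                                      \
  } while (0)

// Before: cudaMemcpy is a blocking copy on the default stream; the host
// stalls until the copy has finished.
void read_count_blocking(const int32_t* d_prefix_sum, size_t idx_size,
                         int32_t* out) {
  CHECK_CUDA(cudaMemcpy(out, d_prefix_sum + idx_size - 1, sizeof(int32_t),
                        cudaMemcpyDeviceToHost));
}

// After: enqueue the copy on the operator's stream with cudaMemcpyAsync and
// wait only on that stream before the host reads the value.
void read_count_on_stream(const int32_t* d_prefix_sum, size_t idx_size,
                          int32_t* out, cudaStream_t stream) {
  CHECK_CUDA(cudaMemcpyAsync(out, d_prefix_sum + idx_size - 1, sizeof(int32_t),
                             cudaMemcpyDeviceToHost, stream));
  CHECK_CUDA(cudaStreamSynchronize(stream));
}
```

The host still has to wait for the scalar before it can use it, but the copy is now ordered on the same stream as the operator's kernels and the wait is scoped to that stream rather than the legacy default stream.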

@haojin2
Contributor

haojin2 commented Nov 13, 2019

Solved in #16790, closing.

haojin2 closed this as completed Nov 13, 2019