Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve dot #61

Merged
merged 8 commits into from
Jun 2, 2017
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions src/operator/mxnet_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#ifndef MXNET_OPERATOR_MXNET_OP_H_
#define MXNET_OPERATOR_MXNET_OP_H_

#include <dmlc/omp.h>
#include <mxnet/base.h>
#include <algorithm>

Expand All @@ -22,6 +23,8 @@ const float PI = 3.14159265358979323846;
using std::isnan;
#endif

template<typename xpu>
int get_num_threads(const int N);

#ifdef __CUDACC__
#define CUDA_KERNEL_LOOP(i, n) \
Expand All @@ -37,8 +40,18 @@ inline int cuda_get_num_blocks(const int N) {
using namespace mshadow::cuda;
return std::min(kMaxGridNum, (N + kBaseThreadNum - 1) / kBaseThreadNum);
}

template<>
inline int get_num_threads<gpu>(const int N) {
using namespace mshadow::cuda;
return kBaseThreadNum * cuda_get_num_blocks(N);
}
#endif // __CUDACC__

template<>
inline int get_num_threads<cpu>(const int N) {
return omp_get_num_threads();
}

/*! \brief operator request type switch */
#define MXNET_ASSIGN_REQ_SWITCH(req, ReqType, ...) \
Expand Down
71 changes: 33 additions & 38 deletions src/operator/tensor/matrix_op-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -538,48 +538,32 @@ struct DotCsrDnsDns<false, false, req> {
/*!
* \brief Kernel of dot(csr.T(), dns1) = dns2
*/
template<int req>
struct DotCsrDnsDns<true, false, req> {
struct DotCsrTransDnsDns {
/*!
* \brief This function represents performing an inner product between a column of lhs
* and a column of rhs and then assigning the value to out[i].
* \param i i-th element in out 1D view
* \param out output matrix
* \param data_l csr values of lhs
* \param indptr_l csr indptr of lhs
* \param col_idx_l csr col_idx of lhs
* \param data_r dense data of rhs
* \param num_rows_l number of rows of lhs
* \param num_cols number of columns of outputs
* \brief
* \param i the i-th thread
*/
template<typename DType, typename IType, typename CType>
MSHADOW_XINLINE static void Map(int i, DType* out, const DType* data_l, const IType* indptr_l,
const CType* col_idx_l, const DType* data_r, const int num_rows_l,
const int num_cols) {
const int irow = i / num_cols; // col id of the lhs
const int icol = i % num_cols; // col id of the rhs
DType sum = 0;
for (int k = 0; k < num_rows_l; ++k) {
const IType low = indptr_l[k];
const IType high = indptr_l[k+1];
if (low == high || irow < col_idx_l[low] || irow > col_idx_l[high-1]) continue;
int j = -1, l = low, r = high - 1;
while (l <= r) {
int m = l + (r - l) / 2;
if (col_idx_l[m] == irow) {
j = m; break;
const CType* col_idx_l, const DType* data_r, const size_t seg_len,
const size_t num_rows_l, const size_t num_rows,
const size_t num_cols) {
const size_t seg_start = i * seg_len;
if (seg_start >= num_rows) return;
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this check necessary? since you did rounding at line 599

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

num_threads could be so big that seg_len=1, and more threads than num_rows are launched. For those extra threads, we don't need to do anything, right?

const size_t seg_end = (i + 1) * seg_len;
for (size_t j = 0; j < num_rows_l; ++j) {
if (indptr_l[j] == indptr_l[j+1]) continue;
const size_t offset_r = j * num_cols;
for (auto k = indptr_l[j]; k < indptr_l[j+1]; ++k) {
const auto col_idx = col_idx_l[k];
if (col_idx < seg_start || col_idx >= seg_end) continue;
const size_t offset_out = col_idx * num_cols;
const auto val = data_l[k];
for (size_t l = 0; l < num_cols; ++l) {
out[offset_out+l] += data_r[offset_r+l] * val;
}
if (col_idx_l[m] < irow) {
l = m + 1;
} else {
r = m - 1;
}
}
if (j >= 0) {
sum += data_l[j] * data_r[k*num_cols+icol];
}
}
KERNEL_ASSIGN(out[i], req, sum);
}
};

Expand Down Expand Up @@ -608,11 +592,22 @@ void DotCsrDnsDnsImpl(const OpContext& ctx,
MSHADOW_INT_TYPE_SWITCH(col_idx_l.type_flag_, CType, { // col idx type
if (!lhs.storage_initialized()) return;
if (trans_lhs) {
mxnet_op::Kernel<DotCsrDnsDns<true, false, ReqType>, xpu>::Launch(s, data_out.Size(),
// parallelize by row blocks
int num_threads = mxnet_op::get_num_threads<xpu>(data_out.shape_[0]);
// divide output matrix row by num_threads
// each thread is responsible for seg_len rows
size_t seg_len = (data_out.shape_[0] + num_threads - 1) / num_threads;
if (kWriteTo == req) {
mxnet_op::Kernel<mxnet_op::set_zero, xpu>::Launch(
s, data_out.Size(), data_out.dptr<DType>());
}

mxnet_op::Kernel<DotCsrTransDnsDns, xpu>::Launch(s, num_threads,
data_out.dptr<DType>(), data_l.dptr<DType>(), indptr_l.dptr<IType>(),
col_idx_l.dptr<CType>(), data_r.dptr<DType>(), lhs.shape()[0],
rhs.shape()[1]);
col_idx_l.dptr<CType>(), data_r.dptr<DType>(), seg_len,
lhs.shape()[0], data_out.shape_[0], data_out.shape_[1]);
} else {
// TODO(junwu): implement parallelization by row blocks for cpu
mxnet_op::Kernel<DotCsrDnsDns<false, false, ReqType>, xpu>::Launch(s, data_out.Size(),
data_out.dptr<DType>(), data_l.dptr<DType>(), indptr_l.dptr<IType>(),
col_idx_l.dptr<CType>(), data_r.dptr<DType>(), rhs.shape()[1]);
Expand Down