Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove cutlass usage in row major input for euclidean exp/unexp, cosine and L1 distance matrix #3589

Merged
merged 10 commits into from
Mar 26, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 9 additions & 74 deletions cpp/src_prims/distance/cosine.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -34,45 +34,6 @@
namespace MLCommon {
namespace Distance {

/**
* @brief the cosine distance matrix calculation kernel
* @tparam DataT input data-type (for A and B matrices)
* @tparam AccT accumulation data-type
* @tparam OutT output data-type (for C and D matrices)
* @tparam IdxT index data-type
* @tparam Policy struct which tunes the Contraction kernel
* @tparam CoreLambda lambda which implements accumulation operation
* @tparam EpilogueLambda lambda which implements operation for calculating
final value.
* @tparam FinalLambda final lambda called on final distance value
*
* @param[in] x input matrix
* @param[in] y input matrix
* @param[in] xn row norms of input matrix A.
* @param[in] yn row norms of input matrix B.
* @param[in] m number of rows of A and C/D
* @param[in] n number of columns of B and C/D
* @param[in] k number of cols of A and rows of B
* @param[output] pD output matrix
* @param core_op the core lambda
* @param epilog_op the epilogue lambda
* @param fin_op the final gemm epilogue lambda
*/
template <typename DataT, typename AccT, typename OutT, typename IdxT,
typename Policy, typename CoreLambda, typename EpilogueLambda,
typename FinalLambda>
__global__ __launch_bounds__(Policy::Nthreads, 2) void cosineKernel(
const DataT *x, const DataT *y, const DataT *_xn, const DataT *_yn, IdxT m,
IdxT n, IdxT k, OutT *dOutput, CoreLambda core_op, EpilogueLambda epilog_op,
FinalLambda fin_op) {
extern __shared__ char smem[];

PairwiseDistances<DataT, AccT, OutT, IdxT, false, Policy, CoreLambda,
EpilogueLambda, FinalLambda>
obj(x, y, m, n, k, _xn, _yn, dOutput, smem, core_op, epilog_op, fin_op);
obj.run();
}

/**
* @brief the cosine distance matrix calculation implementer
* It computes the following equation:
Expand Down Expand Up @@ -111,33 +72,9 @@ void cosineImpl(const DataT *x, const DataT *y, const DataT *xn,
};

// epilogue operation lambda for final value calculation
auto epilog_lambda = [xn, yn, m, n, k] __device__(
auto epilog_lambda = [] __device__(
AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh],
DataT * sxNorm, DataT * syNorm) {
__syncthreads(); // so that we can safely reuse smem

// Load x & y norms required by this threadblock in shmem buffer
for (int i = threadIdx.x; i < Policy::Mblk; i += Policy::Nthreads) {
auto idx = blockIdx.x * Policy::Mblk + i;
sxNorm[i] = idx < m ? xn[idx] : 0;
}
for (int i = threadIdx.x; i < Policy::Nblk; i += Policy::Nthreads) {
auto idx = blockIdx.y * Policy::Nblk + i;
syNorm[i] = idx < n ? yn[idx] : 0;
}

__syncthreads();
DataT regxn[Policy::AccRowsPerTh], regyn[Policy::AccColsPerTh];
#pragma unroll
for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
regxn[i] =
sxNorm[i * Policy::AccThRows + (threadIdx.x / Policy::AccThCols)];
}
#pragma unroll
for (int i = 0; i < Policy::AccColsPerTh; ++i) {
regyn[i] =
syNorm[i * Policy::AccThCols + (threadIdx.x % Policy::AccThCols)];
}
DataT * regxn, DataT * regyn) {
#pragma unroll
for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
#pragma unroll
Expand All @@ -147,8 +84,9 @@ void cosineImpl(const DataT *x, const DataT *y, const DataT *xn,
}
};

cosineKernel<DataT, AccT, OutT, IdxT, Policy, decltype(core_lambda),
decltype(epilog_lambda), FinalLambda>
pairwiseDistanceMatKernel<raft::distance::DistanceType::CosineExpanded, DataT,
AccT, OutT, IdxT, Policy, decltype(core_lambda),
decltype(epilog_lambda), FinalLambda>
<<<grid, blk, Policy::SmemSize, stream>>>(
x, y, xn, yn, m, n, k, dOutput, core_lambda, epilog_lambda, fin_op);

Expand Down Expand Up @@ -211,13 +149,10 @@ void cosineAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA,

typedef std::is_same<OutType, bool> is_bool;

if (((pA != pB) && (worksize < (m + n) * sizeof(AccType))) ||
(worksize < m * sizeof(AccType))) {
THROW("workspace size error");
}
if (workspace == nullptr) {
THROW("workspace is null");
}
ASSERT(!(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) ||
(worksize < m * sizeof(AccType))),
"workspace size error");
ASSERT(workspace != nullptr, "workspace is null");

InType *col_vec = workspace;
InType *row_vec = workspace;
Expand Down
156 changes: 38 additions & 118 deletions cpp/src_prims/distance/euclidean.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -32,47 +32,6 @@
namespace MLCommon {
namespace Distance {

/**
* @brief the expanded euclidean distance matrix calculation kernel
* It computes the following equation: C = op(A^2 + B^2 - 2AB)
* @tparam DataT input data-type (for A and B matrices)
* @tparam AccT accumulation data-type
* @tparam OutT output data-type (for C and D matrices)
* @tparam IdxT index data-type
* @tparam sqrt if the square root is computed or not
* @tparam Policy struct which tunes the Contraction kernel
* @tparam CoreLambda lambda which implements accumulation operation
* @tparam EpilogueLambda lambda which implements operation for calculating
final value.
* @tparam FinalLambda final lambda called on final distance value
*
* @param[in] x input matrix
* @param[in] y input matrix
* @param[in] xn row norms of input matrix A.
* @param[in] yn row norms of input matrix B.
* @param[in] m number of rows of A and C/D
* @param[in] n number of columns of B and C/D
* @param[in] k number of cols of A and rows of B
* @param[output] pD output matrix
* @param core_op the core lambda
* @param epilog_op the epilogue lambda
* @param fin_op the final gemm epilogue lambda
*/
template <typename DataT, typename AccT, typename OutT, typename IdxT,
bool Sqrt, typename Policy, typename CoreLambda,
typename EpilogueLambda, typename FinalLambda>
__global__ __launch_bounds__(Policy::Nthreads, 2) void euclideanExpKernel(
const DataT *x, const DataT *y, const DataT *_xn, const DataT *_yn, IdxT m,
IdxT n, IdxT k, OutT *dOutput, CoreLambda core_op, EpilogueLambda epilog_op,
FinalLambda fin_op) {
extern __shared__ char smem[];

PairwiseDistances<DataT, AccT, OutT, IdxT, Sqrt, Policy, CoreLambda,
EpilogueLambda, FinalLambda>
obj(x, y, m, n, k, _xn, _yn, dOutput, smem, core_op, epilog_op, fin_op);
obj.run();
}

/**
* @brief the expanded euclidean distance matrix calculation implementer
* It computes the following equation: C = op(A^2 + B^2 - 2AB)
Expand Down Expand Up @@ -111,53 +70,32 @@ void euclideanExpImpl(const DataT *x, const DataT *y, const DataT *xn,
};

// epilogue operation lambda for final value calculation
auto epilog_lambda = [xn, yn, m, n, k] __device__(
auto epilog_lambda = [sqrt] __device__(
AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh],
DataT * sxNorm, DataT * syNorm) {
__syncthreads(); // so that we can safely reuse smem

// Load x & y norms required by this threadblock in shmem buffer
for (int i = threadIdx.x; i < Policy::Mblk; i += Policy::Nthreads) {
auto idx = blockIdx.x * Policy::Mblk + i;
sxNorm[i] = idx < m ? xn[idx] : 0;
}
for (int i = threadIdx.x; i < Policy::Nblk; i += Policy::Nthreads) {
auto idx = blockIdx.y * Policy::Nblk + i;
syNorm[i] = idx < n ? yn[idx] : 0;
}

__syncthreads();
DataT regxn[Policy::AccRowsPerTh], regyn[Policy::AccColsPerTh];
DataT * regxn, DataT * regyn) {
#pragma unroll
for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
regxn[i] =
sxNorm[i * Policy::AccThRows + (threadIdx.x / Policy::AccThCols)];
}
#pragma unroll
for (int i = 0; i < Policy::AccColsPerTh; ++i) {
regyn[i] =
syNorm[i * Policy::AccThCols + (threadIdx.x % Policy::AccThCols)];
for (int j = 0; j < Policy::AccColsPerTh; ++j) {
acc[i][j] = regxn[i] + regyn[j] - (DataT)2.0 * acc[i][j];
}
}
if (sqrt) {
#pragma unroll
for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
#pragma unroll
for (int j = 0; j < Policy::AccColsPerTh; ++j) {
acc[i][j] = regxn[i] + regyn[j] - (DataT)2.0 * acc[i][j];
for (int j = 0; j < Policy::AccColsPerTh; ++j) {
acc[i][j] = raft::mySqrt(acc[i][j]);
}
}
}
};

if (sqrt) {
euclideanExpKernel<DataT, AccT, OutT, IdxT, true, Policy,
decltype(core_lambda), decltype(epilog_lambda),
FinalLambda><<<grid, blk, Policy::SmemSize, stream>>>(
x, y, xn, yn, m, n, k, dOutput, core_lambda, epilog_lambda, fin_op);
} else {
euclideanExpKernel<DataT, AccT, OutT, IdxT, false, Policy,
decltype(core_lambda), decltype(epilog_lambda),
FinalLambda><<<grid, blk, Policy::SmemSize, stream>>>(
pairwiseDistanceMatKernel<raft::distance::DistanceType::L2Expanded, DataT,
AccT, OutT, IdxT, Policy, decltype(core_lambda),
decltype(epilog_lambda), FinalLambda>
<<<grid, blk, Policy::SmemSize, stream>>>(
x, y, xn, yn, m, n, k, dOutput, core_lambda, epilog_lambda, fin_op);
}

CUDA_CHECK(cudaGetLastError());
}
Expand Down Expand Up @@ -212,13 +150,10 @@ void euclideanAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA,

typedef std::is_same<OutType, bool> is_bool;

if (((pA != pB) && (worksize < (m + n) * sizeof(AccType))) ||
(worksize < m * sizeof(AccType))) {
THROW("workspace size error");
}
if (workspace == nullptr) {
THROW("workspace is null");
}
ASSERT(!(((pA != pB) && (worksize < (m + n) * sizeof(AccType))) ||
(worksize < m * sizeof(AccType))),
"workspace size error");
ASSERT(workspace != nullptr, "workspace is null");

InType *col_vec = workspace;
InType *row_vec = workspace;
Expand Down Expand Up @@ -302,43 +237,23 @@ void euclideanAlgo1(Index_ m, Index_ n, Index_ k, const InType *pA,
}

/**
* @brief the unexpanded euclidean distance matrix calculation kernel
* @brief the unexpanded euclidean distance matrix calculation
* It computes the following equation: cij = op((ai-bj)^2)
* @tparam DataT input data-type (for A and B matrices)
* @tparam AccT accumulation data-type
* @tparam OutT output data-type (for C and D matrices)
* @tparam IdxT index data-type
* @tparam sqrt if the square root is computed or not
* @tparam Policy struct which tunes the Contraction kernel
* @tparam CoreLambda lambda which implements accumulation operation
* @tparam EpilogueLambda lambda which implements operation for calculating
final value.
* @tparam FinalLambda final lambda called on final distance value
*
* @param[in] x input matrix
* @param[in] y input matrix
* @param[in] m number of rows of A and C/D
* @param[in] n number of columns of B and C/D
* @param[in] k number of cols of A and rows of B
* @param[in] sqrt if the square root is computed or not
* @param[output] pD output matrix
* @param core_op the core lambda
* @param epilog_op the epilogue lambda
* @param fin_op the final gemm epilogue lambda
*/
template <typename DataT, typename AccT, typename OutT, typename IdxT,
bool Sqrt, typename Policy, typename CoreLambda,
typename EpilogueLambda, typename FinalLambda>
__global__ __launch_bounds__(Policy::Nthreads, 2) void euclideanUnExpKernel(
const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, OutT *dOutput,
CoreLambda core_op, EpilogueLambda epilog_op, FinalLambda fin_op) {
extern __shared__ char smem[];

PairwiseDistances<DataT, AccT, OutT, IdxT, Sqrt, Policy, CoreLambda,
EpilogueLambda, FinalLambda>
obj(x, y, m, n, k, dOutput, smem, core_op, epilog_op, fin_op);
obj.run();
}

template <typename DataT, typename AccT, typename OutT, typename IdxT,
int VecLen, typename FinalLambda>
void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
Expand All @@ -356,21 +271,26 @@ void euclideanUnExpImpl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
};

// epilogue operation lambda for final value calculation
auto epilog_lambda = [] __device__(
auto epilog_lambda = [sqrt] __device__(
AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh],
DataT * sxNorm, DataT * syNorm) { return; };
DataT * regxn, DataT * regyn) {
if (sqrt) {
#pragma unroll
for (int i = 0; i < Policy::AccRowsPerTh; ++i) {
#pragma unroll
for (int j = 0; j < Policy::AccColsPerTh; ++j) {
acc[i][j] = raft::mySqrt(acc[i][j]);
}
}
}
};

if (sqrt) {
euclideanUnExpKernel<DataT, AccT, OutT, IdxT, true, Policy,
decltype(core_lambda), decltype(epilog_lambda),
FinalLambda><<<grid, blk, Policy::SmemSize, stream>>>(
x, y, m, n, k, dOutput, core_lambda, epilog_lambda, fin_op);
} else {
euclideanUnExpKernel<DataT, AccT, OutT, IdxT, false, Policy,
decltype(core_lambda), decltype(epilog_lambda),
FinalLambda><<<grid, blk, Policy::SmemSize, stream>>>(
x, y, m, n, k, dOutput, core_lambda, epilog_lambda, fin_op);
}
pairwiseDistanceMatKernel<raft::distance::DistanceType::L2Unexpanded, DataT,
AccT, OutT, IdxT, Policy, decltype(core_lambda),
decltype(epilog_lambda), FinalLambda>
<<<grid, blk, Policy::SmemSize, stream>>>(x, y, nullptr, nullptr, m, n, k,
dOutput, core_lambda,
epilog_lambda, fin_op);

CUDA_CHECK(cudaGetLastError());
}
Expand Down
37 changes: 10 additions & 27 deletions cpp/src_prims/distance/l1.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,13 @@ namespace MLCommon {
namespace Distance {

/**
* @brief the L1 distance matrix calculation kernel
* @brief the L1 distance matrix calculation implementer
* It computes the following equation: cij = op(ai-bj)
* @tparam DataT input data-type (for A and B matrices)
* @tparam AccT accumulation data-type
* @tparam OutT output data-type (for C and D matrices)
* @tparam IdxT index data-type
* @tparam Policy struct which tunes the Contraction kernel
* @tparam CoreLambda lambda which implements accumulation operation
* @tparam EpilogueLambda lambda which implements operation for calculating
final value.

* @tparam FinalLambda final lambda called on final distance value
*
* @param[in] x input matrix
Expand All @@ -44,24 +41,8 @@ namespace Distance {
* @param[in] n number of columns of B and C/D
* @param[in] k number of cols of A and rows of B
* @param[output] pD output matrix
* @param core_op the core lambda
* @param epilog_op the epilogue lambda
* @param fin_op the final gemm epilogue lambda
*/
template <typename DataT, typename AccT, typename OutT, typename IdxT,
typename Policy, typename CoreLambda, typename EpilogueLambda,
typename FinalLambda>
__global__ __launch_bounds__(Policy::Nthreads, 2) void l1Kernel(
const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k, OutT *dOutput,
CoreLambda core_op, EpilogueLambda epilog_op, FinalLambda fin_op) {
extern __shared__ char smem[];

PairwiseDistances<DataT, AccT, OutT, IdxT, false, Policy, CoreLambda,
EpilogueLambda, FinalLambda>
obj(x, y, m, n, k, dOutput, smem, core_op, epilog_op, fin_op);
obj.run();
}

template <typename DataT, typename AccT, typename OutT, typename IdxT,
int VecLen, typename FinalLambda>
static void l1Impl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
Expand All @@ -80,12 +61,14 @@ static void l1Impl(const DataT *x, const DataT *y, IdxT m, IdxT n, IdxT k,
// epilogue operation lambda for final value calculation
auto epilog_lambda = [] __device__(
AccT acc[Policy::AccRowsPerTh][Policy::AccColsPerTh],
DataT * sxNorm, DataT * syNorm) { return; };

l1Kernel<DataT, AccT, OutT, IdxT, Policy, decltype(core_lambda),
decltype(epilog_lambda), FinalLambda>
<<<grid, blk, Policy::SmemSize, stream>>>(
x, y, m, n, k, dOutput, core_lambda, epilog_lambda, fin_op);
DataT * regxn, DataT * regyn) { return; };

pairwiseDistanceMatKernel<raft::distance::DistanceType::L1, DataT, AccT, OutT,
IdxT, Policy, decltype(core_lambda),
decltype(epilog_lambda), FinalLambda>
<<<grid, blk, Policy::SmemSize, stream>>>(x, y, nullptr, nullptr, m, n, k,
dOutput, core_lambda,
epilog_lambda, fin_op);

CUDA_CHECK(cudaGetLastError());
}
Expand Down
Loading