Skip to content

Commit

Permalink
AVX/SSSE3/AVX512_MIC ISA cleanup (#2465)
Browse files Browse the repository at this point in the history
* cleanup avx512_mic

* cleanup ssse3

* cleanup avx

* fixing typos

* fixing / typos

* fixing make typos

* clang format

* Updating ISA in install.md

* replace AVX with AVX2 for intersect
  • Loading branch information
napetrov authored Aug 18, 2023
1 parent af5d732 commit eb6735e
Show file tree
Hide file tree
Showing 29 changed files with 138 additions and 629 deletions.
4 changes: 2 additions & 2 deletions INSTALL.md
Original file line number Diff line number Diff line change
Expand Up @@ -133,9 +133,9 @@ It is possible to build oneDAL libraries with selected set of algorithms and/or
make -f makefile daal PLAT=win32e CORE.ALGORITHMS.CUSTOM="linear_regression svm" -j16


- To build oneDAL with AVX2 and AVX CPU optimizations, run:
- To build oneDAL with AVX2 and AVX512 CPU optimizations, run:

make -f makefile daal PLAT=win32e REQCPU="avx2 avx" -j16
make -f makefile daal PLAT=win32e REQCPU="avx2 avx512" -j16


- To build oneDAL with Moments of Low Order algorithm and AVX2 CPU optimizations, run:
Expand Down
15 changes: 4 additions & 11 deletions cpp/daal/include/algorithms/algorithm_container_base_batch.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,21 +136,14 @@ class AlgorithmContainerImpl<batch> : public AlgorithmContainer<batch>
*
* \tparam mode Computation mode of the algorithm, \ref ComputeMode
* \tparam sse2Container Implementation for Intel(R) Streaming SIMD Extensions 2 (Intel(R) SSE2)
* \tparam ssse3Container Implementation for Supplemental Streaming SIMD Extensions 3 (SSSE3)
* \tparam sse42Container Implementation for Intel(R) Streaming SIMD Extensions 42 (Intel(R) SSE42)
* \tparam avxContainer Implementation for Intel(R) Advanced Vector Extensions (Intel(R) AVX)
* \tparam avx2Container Implementation for Intel(R) Advanced Vector Extensions 2 (Intel(R) AVX2)
* \tparam avx512_micContainer Implementation for Intel(R) Xeon Phi(TM) processors/coprocessors based on Intel(R) Advanced Vector
* Extensions 512 (Intel(R) AVX512)
* \tparam avx512Container Implementation for Intel(R) Xeon(R) processors based on Intel AVX-512
*/
template <typename sse2Container DAAL_KERNEL_SSSE3_ONLY(typename ssse3Container) DAAL_KERNEL_SSE42_ONLY(typename sse42Container)
DAAL_KERNEL_AVX_ONLY(typename avxContainer) DAAL_KERNEL_AVX2_ONLY(typename avx2Container)
DAAL_KERNEL_AVX512_MIC_ONLY(typename avx512_micContainer) DAAL_KERNEL_AVX512_ONLY(typename avx512Container)>
class DAAL_EXPORT AlgorithmDispatchContainer<batch, sse2Container DAAL_KERNEL_SSSE3_ONLY(ssse3Container) DAAL_KERNEL_SSE42_ONLY(sse42Container)
DAAL_KERNEL_AVX_ONLY(avxContainer) DAAL_KERNEL_AVX2_ONLY(avx2Container)
DAAL_KERNEL_AVX512_MIC_ONLY(avx512_micContainer) DAAL_KERNEL_AVX512_ONLY(avx512Container)>
: public AlgorithmContainerImpl<batch>
template <typename sse2Container DAAL_KERNEL_SSE42_ONLY(typename sse42Container) DAAL_KERNEL_AVX2_ONLY(typename avx2Container)
DAAL_KERNEL_AVX512_ONLY(typename avx512Container)>
class DAAL_EXPORT AlgorithmDispatchContainer<batch, sse2Container DAAL_KERNEL_SSE42_ONLY(sse42Container) DAAL_KERNEL_AVX2_ONLY(avx2Container)
DAAL_KERNEL_AVX512_ONLY(avx512Container)> : public AlgorithmContainerImpl<batch>
{
public:
/**
Expand Down
19 changes: 6 additions & 13 deletions cpp/daal/include/algorithms/algorithm_container_base_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,17 +49,12 @@ namespace interface1
*
* \tparam mode Computation mode of the algorithm, \ref ComputeMode
* \tparam sse2Container Implementation for Intel(R) Streaming SIMD Extensions 2 (Intel(R) SSE2)
* \tparam ssse3Container Implementation for Supplemental Streaming SIMD Extensions 3 (SSSE3)
* \tparam sse42Container Implementation for Intel(R) Streaming SIMD Extensions 42 (Intel(R) SSE42)
* \tparam avxContainer Implementation for Intel(R) Advanced Vector Extensions (Intel(R) AVX)
* \tparam avx2Container Implementation for Intel(R) Advanced Vector Extensions 2 (Intel(R) AVX2)
* \tparam avx512_micContainer Implementation for Intel(R) Xeon Phi(TM) processors/coprocessors based on Intel(R) Advanced Vector
* Extensions 512 (Intel(R) AVX512)
* \tparam avx512Container Implementation for Intel(R) Xeon(R) processors based on Intel AVX-512
*/
template <ComputeMode mode, typename sse2Container DAAL_KERNEL_SSSE3_ONLY(typename ssse3Container) DAAL_KERNEL_SSE42_ONLY(typename sse42Container)
DAAL_KERNEL_AVX_ONLY(typename avxContainer) DAAL_KERNEL_AVX2_ONLY(typename avx2Container)
DAAL_KERNEL_AVX512_MIC_ONLY(typename avx512_micContainer) DAAL_KERNEL_AVX512_ONLY(typename avx512Container)>
template <ComputeMode mode, typename sse2Container DAAL_KERNEL_SSE42_ONLY(typename sse42Container) DAAL_KERNEL_AVX2_ONLY(typename avx2Container)
DAAL_KERNEL_AVX512_ONLY(typename avx512Container)>
class DAAL_EXPORT AlgorithmDispatchContainer : public AlgorithmContainerImpl<mode>
{
public:
Expand Down Expand Up @@ -104,12 +99,10 @@ class DAAL_EXPORT AlgorithmDispatchContainer : public AlgorithmContainerImpl<mod
AlgorithmDispatchContainer & operator=(const AlgorithmDispatchContainer &);
};

#define __DAAL_ALGORITHM_CONTAINER(Mode, ContainerTemplate, ...) \
algorithms::AlgorithmDispatchContainer<Mode, ContainerTemplate<__VA_ARGS__, sse2> DAAL_KERNEL_SSSE3_CONTAINER(ContainerTemplate, __VA_ARGS__) \
DAAL_KERNEL_SSE42_CONTAINER(ContainerTemplate, __VA_ARGS__) DAAL_KERNEL_AVX_CONTAINER( \
ContainerTemplate, __VA_ARGS__) DAAL_KERNEL_AVX2_CONTAINER(ContainerTemplate, __VA_ARGS__) \
DAAL_KERNEL_AVX512_MIC_CONTAINER(ContainerTemplate, __VA_ARGS__) \
DAAL_KERNEL_AVX512_CONTAINER(ContainerTemplate, __VA_ARGS__)>
#define __DAAL_ALGORITHM_CONTAINER(Mode, ContainerTemplate, ...) \
algorithms::AlgorithmDispatchContainer<Mode, ContainerTemplate<__VA_ARGS__, sse2> DAAL_KERNEL_SSE42_CONTAINER(ContainerTemplate, __VA_ARGS__) \
DAAL_KERNEL_AVX2_CONTAINER(ContainerTemplate, __VA_ARGS__) \
DAAL_KERNEL_AVX512_CONTAINER(ContainerTemplate, __VA_ARGS__)>

/** @} */
} // namespace interface1
Expand Down
21 changes: 6 additions & 15 deletions cpp/daal/include/services/env_detect.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,16 +42,11 @@ namespace daal
*/
enum CpuType
{
sse2 = 0, /*!< Intel(R) Streaming SIMD Extensions 2 (Intel(R) SSE2) */
ssse3 = 1, /*!< Supplemental Streaming SIMD Extensions 3 (SSSE3) */
sse42 = 2, /*!< Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) */
avx = 3, /*!< Intel(R) Advanced Vector Extensions (Intel(R) AVX) */
avx2 = 4, /*!< Intel(R) Advanced Vector Extensions 2 (Intel(R) AVX2) */
avx512_mic = 5, /*!< Intel(R) Xeon Phi(TM) processors/coprocessors based on Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX-512) */
avx512 = 6, /*!< Intel(R) Xeon(R) processors based on Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX-512) */
avx512_mic_e1 =
7, /*!< Intel(R) Xeon Phi(TM) processors based on Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX-512) with support of AVX512_4FMAPS and AVX512_4VNNIW instruction groups. Should be used as parameter for setCpuId function only. Can`t be received as return value of setCpuId, getCpuId and enableInstructionsSet functions. */
lastCpuType = avx512_mic_e1
sse2 = 0, /*!< Intel(R) Streaming SIMD Extensions 2 (Intel(R) SSE2) */
sse42 = 2, /*!< Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2) */
avx2 = 4, /*!< Intel(R) Advanced Vector Extensions 2 (Intel(R) AVX2) */
avx512 = 6, /*!< Intel(R) Xeon(R) processors based on Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX-512) */
lastCpuType = avx512
};

namespace services
Expand Down Expand Up @@ -96,11 +91,7 @@ class DAAL_EXPORT Environment : public Base
enum CpuTypeEnable
{
cpu_default = 0, /*!< Default processor type */
avx512_mic =
1, /*!< Intel(R) Xeon Phi(TM) processors/coprocessors based on Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX-512) \DAAL_DEPRECATED */
avx512 = 2, /*!< Intel(R) Xeon(R) processors based on Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX-512) \DAAL_DEPRECATED */
avx512_mic_e1 =
4 /*!< Intel(R) Xeon Phi(TM) processors based on Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX-512) with support of AVX512_4FMAPS and AVX512_4VNNIW instruction groups */
avx512 = 2 /*!< Intel(R) Xeon(R) processors based on Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX-512) \DAAL_DEPRECATED */
};

/**
Expand Down
86 changes: 0 additions & 86 deletions cpp/daal/include/services/internal/daal_kernel_defines.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,35 +68,6 @@ case cpuType:
#define DAAL_KERNEL_SSE2_CONTAINER_CASE_SYCL(ContainerTemplate, ...)
#endif

#if defined(DAAL_KERNEL_SSSE3)
#undef DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID
#define DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID daal::ssse3
#define DAAL_KERNEL_SSSE3_ONLY(something) , something
#define DAAL_KERNEL_SSSE3_ONLY_CODE(...) __VA_ARGS__
#define DAAL_KERNEL_SSSE3_CONTAINER(ContainerTemplate, ...) , DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, ssse3, __VA_ARGS__)
#define DAAL_KERNEL_SSSE3_CONTAINER1(ContainerTemplate, ...) \
extern template class DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, ssse3, __VA_ARGS__);
#define DAAL_KERNEL_SSSE3_CONTAINER_CASE(ContainerTemplate, ...) \
case ssse3: \
_cntr = (new DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, sse2, __VA_ARGS__)(daalEnv)); \
break;
#define DAAL_KERNEL_SSSE3_CONTAINER_CASE_SYCL(ContainerTemplate, ...) \
case ssse3: \
{ \
using contTemplType = DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, sse2, __VA_ARGS__); \
static volatile daal::services::internal::GpuSupportRegistrar<contTemplType> registrar; \
_cntr = (new contTemplType(daalEnv)); \
break; \
}
#else
#define DAAL_KERNEL_SSSE3_ONLY(something)
#define DAAL_KERNEL_SSSE3_ONLY_CODE(...)
#define DAAL_KERNEL_SSSE3_CONTAINER(ContainerTemplate, ...)
#define DAAL_KERNEL_SSSE3_CONTAINER1(ContainerTemplate, ...)
#define DAAL_KERNEL_SSSE3_CONTAINER_CASE(ContainerTemplate, ...)
#define DAAL_KERNEL_SSSE3_CONTAINER_CASE_SYCL(ContainerTemplate, ...)
#endif

#if defined(DAAL_KERNEL_SSE42)
#undef DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID
#define DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID daal::sse42
Expand All @@ -116,34 +87,6 @@ case cpuType:
#define DAAL_KERNEL_SSE42_CONTAINER_CASE_SYCL(ContainerTemplate, ...)
#endif

#if defined(DAAL_KERNEL_AVX)
#undef DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID
#define DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID daal::avx
#define DAAL_KERNEL_AVX_ONLY(something) , something
#define DAAL_KERNEL_AVX_ONLY_CODE(...) __VA_ARGS__
#define DAAL_KERNEL_AVX_CONTAINER(ContainerTemplate, ...) , DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, avx, __VA_ARGS__)
#define DAAL_KERNEL_AVX_CONTAINER1(ContainerTemplate, ...) extern template class DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, avx, __VA_ARGS__);
#define DAAL_KERNEL_AVX_CONTAINER_CASE(ContainerTemplate, ...) \
case avx: \
_cntr = (new DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, sse42, __VA_ARGS__)(daalEnv)); \
break;
#define DAAL_KERNEL_AVX_CONTAINER_CASE_SYCL(ContainerTemplate, ...) \
case avx: \
{ \
using contTemplType = DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, sse42, __VA_ARGS__); \
static volatile daal::services::internal::GpuSupportRegistrar<contTemplType> registrar; \
_cntr = (new contTemplType(daalEnv)); \
break; \
}
#else
#define DAAL_KERNEL_AVX_ONLY(something)
#define DAAL_KERNEL_AVX_ONLY_CODE(...)
#define DAAL_KERNEL_AVX_CONTAINER(ContainerTemplate, ...)
#define DAAL_KERNEL_AVX_CONTAINER1(ContainerTemplate, ...)
#define DAAL_KERNEL_AVX_CONTAINER_CASE(ContainerTemplate, ...)
#define DAAL_KERNEL_AVX_CONTAINER_CASE_SYCL(ContainerTemplate, ...)
#endif

#if defined(DAAL_KERNEL_AVX2)
#undef DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID
#define DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID daal::avx2
Expand All @@ -163,35 +106,6 @@ case cpuType:
#define DAAL_KERNEL_AVX2_CONTAINER_CASE_SYCL(ContainerTemplate, ...)
#endif

#if defined(DAAL_KERNEL_AVX512_MIC)
#undef DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID
#define DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID daal::avx512_mic
#define DAAL_KERNEL_AVX512_MIC_ONLY(something) , something
#define DAAL_KERNEL_AVX512_MIC_ONLY_CODE(...) __VA_ARGS__
#define DAAL_KERNEL_AVX512_MIC_CONTAINER(ContainerTemplate, ...) , DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, avx512_mic, __VA_ARGS__)
#define DAAL_KERNEL_AVX512_MIC_CONTAINER1(ContainerTemplate, ...) \
extern template class DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, avx512_mic, __VA_ARGS__);
#define DAAL_KERNEL_AVX512_MIC_CONTAINER_CASE(ContainerTemplate, ...) \
case avx512_mic: \
_cntr = (new DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, avx2, __VA_ARGS__)(daalEnv)); \
break;
#define DAAL_KERNEL_AVX512_MIC_CONTAINER_CASE_SYCL(ContainerTemplate, ...) \
case avx512_mic: \
{ \
using contTemplType = DAAL_KERNEL_CONTAINER_TEMPL(ContainerTemplate, avx2, __VA_ARGS__); \
static volatile daal::services::internal::GpuSupportRegistrar<contTemplType> registrar; \
_cntr = (new contTemplType(daalEnv)); \
break; \
}
#else
#define DAAL_KERNEL_AVX512_MIC_ONLY(something)
#define DAAL_KERNEL_AVX512_MIC_ONLY_CODE(...)
#define DAAL_KERNEL_AVX512_MIC_CONTAINER(ContainerTemplate, ...)
#define DAAL_KERNEL_AVX512_MIC_CONTAINER1(ContainerTemplate, ...)
#define DAAL_KERNEL_AVX512_MIC_CONTAINER_CASE(ContainerTemplate, ...)
#define DAAL_KERNEL_AVX512_MIC_CONTAINER_CASE_SYCL(ContainerTemplate, ...)
#endif

#if defined(DAAL_KERNEL_AVX512)
#undef DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID
#define DAAL_KERNEL_BUILD_MAX_INSTRUCTION_SET_ID daal::avx512
Expand Down
70 changes: 2 additions & 68 deletions cpp/daal/src/algorithms/dtrees/gbt/gbt_train_hist_kernel.i
Original file line number Diff line number Diff line change
Expand Up @@ -314,16 +314,10 @@ struct MergeGHSums
#define SSE42_ALL sse42
#endif

#if __CPUID__(DAAL_CPU) >= __avx512_mic__
#if __CPUID__(DAAL_CPU) >= __avx512__
#define AVX512_ALL DAAL_CPU
#else
#define AVX512_ALL avx512_mic
#endif

#if __CPUID__(DAAL_CPU) >= __avx__
#define AVX_ALL DAAL_CPU
#else
#define AVX_ALL avx
#define AVX512_ALL avx512
#endif

template <typename RowIndexType, typename BinIndexType>
Expand Down Expand Up @@ -387,66 +381,6 @@ struct ComputeGHSumByRows<RowIndexType, BinIndexType, float, SSE42_ALL>
}
};

template <typename RowIndexType, typename BinIndexType>
struct ComputeGHSumByRows<RowIndexType, BinIndexType, double, AVX_ALL>
{
static void run(double * aGHSumFP, const BinIndexType * indexedFeature, const RowIndexType * aIdx, double * pgh, size_t nFeatures, size_t iStart,
size_t iEnd, size_t nRows, size_t * UniquesArr)
{
const size_t cacheLineSize = 64; // bytes
const size_t prefetchOffset = 10; // heuristic, prefetch on 10 rows ahead
const size_t elementsInCacheLine = cacheLineSize / sizeof(IndexType);

const size_t noPrefetchSize = services::internal::min<AVX_ALL, size_t>(prefetchOffset + elementsInCacheLine, nRows);
const size_t iEndWithPrefetch = services::internal::min<AVX_ALL, size_t>(nRows - noPrefetchSize, iEnd);
const size_t nCacheLinesToPrefetchOneRow = nFeatures / elementsInCacheLine + !!(nFeatures % elementsInCacheLine);

__m256d adds;
double * addsPtr = (double *)(&adds);
addsPtr[2] = 1;
addsPtr[3] = 0;

RowIndexType i = iStart;
PRAGMA_IVDEP
for (; i < iEndWithPrefetch; ++i)
{
DAAL_PREFETCH_READ_T0(pgh + 2 * aIdx[i + prefetchOffset]);
const BinIndexType * ptr = indexedFeature + aIdx[i + prefetchOffset] * nFeatures;
for (IndexType j = 0; j < nCacheLinesToPrefetchOneRow; j++) DAAL_PREFETCH_READ_T0(ptr + elementsInCacheLine * j);

const BinIndexType * featIdx = indexedFeature + aIdx[i] * nFeatures;
addsPtr[0] = pgh[2 * aIdx[i]];
addsPtr[1] = pgh[2 * aIdx[i] + 1];

PRAGMA_IVDEP
for (IndexType j = 0; j < nFeatures; j++)
{
const size_t idx = 4 * (UniquesArr[j] + (size_t)featIdx[j]);
__m256d hist1 = _mm256_load_pd(aGHSumFP + idx);
__m256d newHist1 = _mm256_add_pd(adds, hist1);
_mm256_store_pd(aGHSumFP + idx, newHist1);
}
}

PRAGMA_IVDEP
for (; i < iEnd; ++i)
{
const BinIndexType * featIdx = indexedFeature + aIdx[i] * nFeatures;
addsPtr[0] = pgh[2 * aIdx[i]];
addsPtr[1] = pgh[2 * aIdx[i] + 1];

PRAGMA_IVDEP
for (IndexType j = 0; j < nFeatures; j++)
{
const size_t idx = 4 * (UniquesArr[j] + (size_t)featIdx[j]);
__m256d hist1 = _mm256_load_pd(aGHSumFP + idx);
__m256d newHist1 = _mm256_add_pd(adds, hist1);
_mm256_store_pd(aGHSumFP + idx, newHist1);
}
}
}
};

template <typename algorithmFPType, typename RowIndexType, typename BinIndexType>
struct MergeGHSums<algorithmFPType, RowIndexType, BinIndexType, AVX512_ALL>
{
Expand Down
Loading

0 comments on commit eb6735e

Please sign in to comment.