[API] Export layer_norm and rms_norm BF16 APIs. #31

Merged
2 changes: 1 addition & 1 deletion .gitignore
@@ -25,4 +25,4 @@ dist/
/3rdparty/onednn
/3rdparty/cmdline
/3rdparty/sentencepiece
/3rdparty/ig
/3rdparty/xdnn
12 changes: 6 additions & 6 deletions CMakeLists.txt
@@ -56,14 +56,14 @@ endif()

include("cmake/mklml.cmake")
include("cmake/onednn.cmake")
include("cmake/ig.cmake")
include("cmake/xdnn.cmake")
include(GNUInstallDirs)

include_directories(${CMAKE_SOURCE_DIR}/3rdparty/)
include_directories(${CMAKE_SOURCE_DIR}/3rdparty/mklml/include)
include_directories(${CMAKE_SOURCE_DIR}/3rdparty/onednn/include)
include_directories(${CMAKE_SOURCE_DIR}/3rdparty/onednn/build/include)
include_directories(${CMAKE_SOURCE_DIR}/3rdparty/ig)
include_directories(${CMAKE_SOURCE_DIR}/3rdparty/xdnn)
include_directories(${CMAKE_SOURCE_DIR}/include)
include_directories(${CMAKE_SOURCE_DIR}/src/kernels)
include_directories(${CMAKE_SOURCE_DIR}/src/layers)
@@ -75,18 +75,18 @@ include_directories(${CMAKE_SOURCE_DIR}/src/common)
link_directories(${CMAKE_SOURCE_DIR}/src/kernels)
link_directories(${CMAKE_SOURCE_DIR}/3rdparty/mklml/lib)
link_directories(${CMAKE_SOURCE_DIR}/3rdparty/onednn/build/src)
link_directories(${CMAKE_SOURCE_DIR}/3rdparty/ig)
link_directories(${CMAKE_SOURCE_DIR}/3rdparty/xdnn)

set(3RDPART_LIB_LIST "MPI::MPI_CXX" "ccl" "dnnl" "numa")
set(DEPEND_LIST "onednn" "mklml" "ig_lib")
set(DEPEND_LIST "onednn" "mklml" "xdnn_lib")

option(BUILD_WITH_SHARED_LIBS "Build with shared libraries" OFF)
if(BUILD_WITH_SHARED_LIBS)
message("Building with shared libraries.")
list(APPEND 3RDPART_LIB_LIST "ig")
list(APPEND 3RDPART_LIB_LIST "xdnn")
else()
message("Building with static libraries.")
list(APPEND 3RDPART_LIB_LIST "ig_static")
list(APPEND 3RDPART_LIB_LIST "xdnn_static")
endif()

# Enable AVX512_FP16 optimization
8 changes: 4 additions & 4 deletions cmake/ig.cmake → cmake/xdnn.cmake
@@ -25,11 +25,11 @@ project(dependency NONE)
include(ExternalProject)

# cmake-format: off
ExternalProject_Add(ig_lib
URL https://github.com/intel/xFasterTransformer/releases/download/IntrinsicGemm/ig_v1.1.tar.gz
URL_HASH MD5=47e5a2cd021caad2b1367c0b71dff2e7
ExternalProject_Add(xdnn_lib
URL https://github.com/intel/xFasterTransformer/releases/download/IntrinsicGemm/xdnn_v1.1.tar.gz
URL_HASH MD5=b49bf8808d66ea75cfba80a406c9a587
TIMEOUT 60
SOURCE_DIR ${CMAKE_SOURCE_DIR}/3rdparty/ig
SOURCE_DIR ${CMAKE_SOURCE_DIR}/3rdparty/xdnn
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
8 changes: 5 additions & 3 deletions include/layers_norm.h
@@ -14,13 +14,15 @@
// ============================================================================
#pragma once

#include "dtype.h"

namespace xft {

void invokeLayerNorm(float *output, const float *input, const float *gamma, const float *beta, const int rows,
void invokeLayerNorm(DataType dt, void *output, const void *input, const void *gamma, const void *beta, const int rows,
const int size, int iStride = -1, int oStride = -1, const float epsilon = 1e-5);

void invokeRmsNorm(float *output, const float *input, const float *weight, int rows, int cols, int iStride = -1,
int oStride = -1, float epsilon = 1e-6);
void invokeRmsNorm(DataType dt, void *output, const void *input, const void *weight, int rows, int cols,
int iStride = -1, int oStride = -1, float epsilon = 1e-6);

// Layer normalization: only support the norm along last dimension
class LayerNorm {
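For callers, a minimal usage sketch of the new type-erased entry points (assuming DataType from dtype.h is reachable as xft::DataType and that bfloat16_t is default-constructible; buffer and function names below are illustrative, not part of the patch):

#include <vector>
#include "bfloat16.h"
#include "layers_norm.h"

void normalize_bf16(int rows, int cols) {
    std::vector<bfloat16_t> in(rows * cols), out(rows * cols);
    std::vector<bfloat16_t> gamma(cols), beta(cols), weight(cols);
    // ... fill in, gamma, beta, weight ...

    // BF16 layer norm: iStride/oStride default to cols, epsilon to 1e-5.
    xft::invokeLayerNorm(xft::DataType::bf16, out.data(), in.data(),
            gamma.data(), beta.data(), rows, cols);

    // BF16 RMS norm: epsilon defaults to 1e-6.
    xft::invokeRmsNorm(xft::DataType::bf16, out.data(), in.data(),
            weight.data(), rows, cols);
}

Note that, as written in this patch, the dispatch bodies only handle DataType::bf16; other tags fall through without touching the output.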
72 changes: 66 additions & 6 deletions src/kernels/layernorm_kernels.cpp
@@ -15,17 +15,14 @@
#include <immintrin.h>

#include "bfloat16.h"
#include "dtype.h"
#include "float16.h"
#include "intrinsic_ext.h"
#include "layernorm_kernels.h"
#include "my_types.h"

namespace xft {

template <typename T>
struct LayerNormWeight {
const T *gamma = nullptr;
const T *beta = nullptr;
};

void invokeLayerNorm(float *output, const float *input, const float *gamma, const float *beta, const int rows,
const int size, int iStride, int oStride, const float epsilon) {

@@ -79,4 +76,67 @@ void invokeLayerNorm(float *output, const float *input, const float *gamma, cons
}
}
}

void invokeLayerNorm(bfloat16_t *output, const bfloat16_t *input, const bfloat16_t *gamma, const bfloat16_t *beta,
const int rows, const int size, int iStride, int oStride, const float epsilon) {

if (iStride == -1) iStride = size;
if (oStride == -1) oStride = size;

#pragma omp parallel for
for (int r = 0; r < rows; ++r) {
const bfloat16_t *px = input + r * iStride;
bfloat16_t *py = output + r * oStride;

float sum = 0;
float squareSum = 0;

__m512 vsum = _mm512_set1_ps(0);
__m512 vsqare = _mm512_set1_ps(0);

for (int col = 0; col < size; col += 16) {
int remain = size - col;
__mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1);

// SUM(x)
__m512 vx = _mm512_maskz_loadu_pbh(mask, px + col);
vsum = _mm512_add_ps(vsum, vx);

// SUM(x*x)
__m512 tmp = _mm512_mul_ps(vx, vx);
vsqare = _mm512_add_ps(vsqare, tmp);
}

sum = _mm512_reduce_add_ps(vsum);
squareSum = _mm512_reduce_add_ps(vsqare);

// Mean
float mean = sum / size;
__m512 vmean = _mm512_set1_ps(mean);

// Variance
float var = 1 / sqrt(squareSum / size - mean * mean + epsilon);
__m512 vvar = _mm512_set1_ps(var);

for (int col = 0; col < size; col += 16) {
int remain = size - col;
__mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1);

__m512 vx = _mm512_maskz_loadu_pbh(mask, px + col);
__m512 vgamma = _mm512_maskz_loadu_pbh(mask, gamma + col);
__m512 vbeta = _mm512_maskz_loadu_pbh(mask, beta + col);
__m512 vy = (vx - vmean) * vgamma * vvar + vbeta;
_mm512_mask_storeu_pbh(py + col, mask, vy);
}
}
}

void invokeLayerNorm(DataType dt, void *output, const void *input, const void *gamma, const void *beta, const int rows,
const int size, int iStride, int oStride, const float epsilon) {
if (dt == DataType::bf16) {
invokeLayerNorm((bfloat16_t *)output, (const bfloat16_t *)input, (const bfloat16_t *)gamma,
(const bfloat16_t *)beta, rows, size, iStride, oStride, epsilon);
}
}

} // namespace xft
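For reference, the BF16 kernel above vectorizes the standard two-pass layer-norm computation; a plain scalar sketch of the same per-row math (float accumulation, illustrative only, not part of the patch):

#include <cmath>

// y = (x - mean) * gamma / sqrt(E[x^2] - mean^2 + eps) + beta, per row,
// matching the kernel above, which builds the variance from the running
// sums of x and x*x before the second pass scales and shifts.
void layer_norm_row_ref(float *y, const float *x, const float *gamma,
        const float *beta, int size, float epsilon = 1e-5f) {
    float sum = 0.f, squareSum = 0.f;
    for (int i = 0; i < size; ++i) {
        sum += x[i];
        squareSum += x[i] * x[i];
    }
    float mean = sum / size;
    float rvar = 1.f / std::sqrt(squareSum / size - mean * mean + epsilon);
    for (int i = 0; i < size; ++i)
        y[i] = (x[i] - mean) * rvar * gamma[i] + beta[i];
}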
38 changes: 38 additions & 0 deletions src/kernels/layernorm_kernels.h
@@ -0,0 +1,38 @@
// Copyright (c) 2023 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ============================================================================
#pragma once

#include <immintrin.h>

#include "bfloat16.h"
#include "dtype.h"
#include "float16.h"
#include "my_types.h"

namespace xft {

template <typename T>
struct LayerNormWeight {
const T *gamma = nullptr;
const T *beta = nullptr;
};

void invokeLayerNorm(float *output, const float *input, const float *gamma, const float *beta, const int rows,
const int size, int iStride = -1, int oStride = -1, const float epsilon = 1e-5);

void invokeLayerNorm(bfloat16_t *output, const bfloat16_t *input, const bfloat16_t *gamma, const bfloat16_t *beta,
const int rows, const int size, int iStride = -1, int oStride = -1, const float epsilon = 1e-5);

} // namespace xft
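One detail shared by the BF16 kernels declared here and defined in layernorm_kernels.cpp above: rows whose width is not a multiple of 16 are finished with a masked load/store, where the mask carries one bit per remaining column. A tiny standalone sketch of that mask arithmetic (illustrative only):

#include <cstdint>
#include <cstdio>

// Low `remain` bits set, as the kernels compute with
// (remain >= 16 ? 0xffff : (1 << remain) - 1) before _mm512_maskz_loadu_pbh.
static uint16_t tail_mask(int remain) {
    return remain >= 16 ? 0xffff : static_cast<uint16_t>((1u << remain) - 1);
}

int main() {
    std::printf("%#x\n", tail_mask(16)); // 0xffff: all 16 lanes active
    std::printf("%#x\n", tail_mask(5));  // 0x1f: 5 active lanes for the tail
    return 0;
}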
64 changes: 64 additions & 0 deletions src/kernels/rmsnorm_kernels.cpp
@@ -15,8 +15,11 @@
#include <immintrin.h>

#include "bfloat16.h"
#include "dtype.h"
#include "float16.h"
#include "intrinsic_ext.h"
#include "my_types.h"
#include "rmsnorm_kernels.h"

namespace xft {

@@ -71,4 +74,65 @@ void invokeRmsNorm(float *output, const float *input, const float *weight, int r
}
} // end for rows
}

void invokeRmsNorm(bfloat16_t *output, const bfloat16_t *input, const bfloat16_t *weight, int rows, int cols,
int iStride, int oStride, float epsilon) {
int size = cols;

if (iStride == -1) iStride = cols;
if (oStride == -1) oStride = cols;

#pragma omp parallel for
for (int r = 0; r < rows; ++r) {
const bfloat16_t *px = input + r * iStride;
bfloat16_t *py = output + r * oStride;

float squareSum = 0;

__m512 vsqare = _mm512_set1_ps(0);

int col = 0;
for (; col + 15 < size; col += 16) {
// SUM(x*x)
__m512 vx = _mm512_loadu_pbh(px + col);
__m512 tmp = _mm512_mul_ps(vx, vx);
vsqare = _mm512_add_ps(vsqare, tmp);
}
if (col < size) {
__mmask16 mask = (1 << (size - col)) - 1;
__m512 vx = _mm512_maskz_loadu_pbh(mask, px + col);
__m512 tmp = _mm512_mul_ps(vx, vx);
vsqare = _mm512_add_ps(vsqare, tmp);
}

squareSum = _mm512_reduce_add_ps(vsqare);

// Variance
float var = 1 / sqrt(squareSum / size + epsilon);
__m512 vvar = _mm512_set1_ps(var);

for (col = 0; col + 15 < size; col += 16) {
__m512 vx = _mm512_loadu_pbh(px + col);
__m512 vw = _mm512_loadu_pbh(weight + col);
__m512 vy = vx * vvar * vw;
_mm512_storeu_pbh(py + col, vy);
}
if (col < size) {
__mmask16 mask = (1 << (size - col)) - 1;
__m512 vx = _mm512_maskz_loadu_pbh(mask, px + col);
__m512 vw = _mm512_maskz_loadu_pbh(mask, weight + col);
__m512 vy = vx * vvar * vw;
_mm512_mask_storeu_pbh(py + col, mask, vy);
}
} // end for rows
}

void invokeRmsNorm(DataType dt, void *output, const void *input, const void *weight, int rows, int cols, int iStride,
int oStride, float epsilon) {
if (dt == DataType::bf16) {
invokeRmsNorm((bfloat16_t *)output, (const bfloat16_t *)input, (const bfloat16_t *)weight, rows, cols, iStride,
oStride, epsilon);
}
}

} // namespace xft
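As with layer norm, the BF16 RMS-norm kernel above is a vectorized form of a simple per-row formula; a scalar sketch for reference (float accumulation, illustrative only, not part of the patch):

#include <cmath>

// y = x * weight / sqrt(mean(x^2) + eps), per row -- the same math the
// masked and unmasked AVX-512 loops above implement for bfloat16_t data.
void rms_norm_row_ref(float *y, const float *x, const float *weight,
        int size, float epsilon = 1e-6f) {
    float squareSum = 0.f;
    for (int i = 0; i < size; ++i)
        squareSum += x[i] * x[i];
    float rnorm = 1.f / std::sqrt(squareSum / size + epsilon);
    for (int i = 0; i < size; ++i)
        y[i] = x[i] * rnorm * weight[i];
}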
31 changes: 31 additions & 0 deletions src/kernels/rmsnorm_kernels.h
@@ -0,0 +1,31 @@
// Copyright (c) 2023 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ============================================================================
#pragma once

#include <immintrin.h>

#include "bfloat16.h"
#include "float16.h"
#include "my_types.h"

namespace xft {

void invokeRmsNorm(float *output, const float *input, const float *weight, int rows, int cols, int iStride = -1,
int oStride = -1, float epsilon = 1e-6);

void invokeRmsNorm(bfloat16_t *output, const bfloat16_t *input, const bfloat16_t *weight, int rows, int cols,
int iStride = -1, int oStride = -1, float epsilon = 1e-6);

} // namespace xft
4 changes: 2 additions & 2 deletions src/layers/attention.h
@@ -496,9 +496,9 @@ class Attention {
C = result.Row(b * ctx->inputSeqLen + startSeq) + i * ctx->attHeadSize;

if constexpr (std::is_same_v<KVCacheT, float>) {
ig_sgemm_single_thread(false, false, m, n, k, 1.0f, A, lda, B, ldb, 0.0f, C, ldc);
xdnn_sgemm_single_thread(false, false, m, n, k, 1.0f, A, lda, B, ldb, 0.0f, C, ldc);
} else if constexpr (std::is_same_v<KVCacheT, float16_t>) {
ig_sgemm_f32f16f32_single_thread(false, false, m, n, k, 1.0f, A, lda, B, ldb, 0.0f, C, ldc);
xdnn_sgemm_f32f16f32_single_thread(false, false, m, n, k, 1.0f, A, lda, (const XDNN_FP16 *)B, ldb, 0.0f, C, ldc);
}

#ifdef DEBUG
7 changes: 4 additions & 3 deletions src/layers/layer_norm.cpp
@@ -12,11 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// ============================================================================
#pragma once
#include <immintrin.h>

#include <cstdlib>
#include <cstring>
#include <immintrin.h>

#include "layernorm_kernels.h"
#include "layers_norm.h"
#include "timeline.h"

@@ -45,7 +46,7 @@ void LayerNorm::forward(const float *input, float *output, int rows, int iStride
TimeLine t("LayerNorm.forward");
const float *pgamma = weights;
const float *pbeta = weights + normSize;
xft::invokeLayerNorm(output, input, pgamma, pbeta, rows, normSize, iStride, oStride);
invokeLayerNorm(output, input, pgamma, pbeta, rows, normSize, iStride, oStride);
}

} // namespace xft
4 changes: 2 additions & 2 deletions src/layers/rms_norm.cpp
@@ -12,13 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// ============================================================================
#pragma once
#include <immintrin.h>

#include <cstdlib>
#include <cstring>

#include "layers_norm.h"
#include "rmsnorm_kernels.h"
#include "timeline.h"

namespace xft {
@@ -41,7 +41,7 @@ void RmsNorm::setWeight(const float *w, const float *, int size) {
// input and output are in shape of (rows, normSize)
void RmsNorm::forward(const float *input, float *output, int rows, int iStride, int oStride, float epsilon) {
TimeLine t("RmsNorm.forward");
xft::invokeRmsNorm(output, input, weight, rows, normSize, iStride, oStride, epsilon);
invokeRmsNorm(output, input, weight, rows, normSize, iStride, oStride, epsilon);
}

} // namespace xft
2 changes: 1 addition & 1 deletion src/utils/decoder_util.h
@@ -542,7 +542,7 @@ class DecoderUtil {

// C = A * B
// bTranspose: B need to be transposed or not
// ig_sgemm_single_thread(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
// xdnn_sgemm_single_thread(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
static void sgemm(const float* A, const float* B, float* C, int m, int n, int k,
bool transa, bool transb) {
int lda = (transa ? m : k);