[API] Export layer_norm and rms_norm BF16 APIs. #31

Merged
2 changes: 1 addition & 1 deletion .gitignore
@@ -25,4 +25,4 @@ dist/
/3rdparty/onednn
/3rdparty/cmdline
/3rdparty/sentencepiece
/3rdparty/ig
/3rdparty/xdnn
12 changes: 6 additions & 6 deletions CMakeLists.txt
@@ -56,14 +56,14 @@ endif()

include("cmake/mklml.cmake")
include("cmake/onednn.cmake")
include("cmake/ig.cmake")
include("cmake/xdnn.cmake")
include(GNUInstallDirs)

include_directories(${CMAKE_SOURCE_DIR}/3rdparty/)
include_directories(${CMAKE_SOURCE_DIR}/3rdparty/mklml/include)
include_directories(${CMAKE_SOURCE_DIR}/3rdparty/onednn/include)
include_directories(${CMAKE_SOURCE_DIR}/3rdparty/onednn/build/include)
include_directories(${CMAKE_SOURCE_DIR}/3rdparty/ig)
include_directories(${CMAKE_SOURCE_DIR}/3rdparty/xdnn)
include_directories(${CMAKE_SOURCE_DIR}/include)
include_directories(${CMAKE_SOURCE_DIR}/src/kernels)
include_directories(${CMAKE_SOURCE_DIR}/src/layers)
@@ -75,18 +75,18 @@ include_directories(${CMAKE_SOURCE_DIR}/src/common)
link_directories(${CMAKE_SOURCE_DIR}/src/kernels)
link_directories(${CMAKE_SOURCE_DIR}/3rdparty/mklml/lib)
link_directories(${CMAKE_SOURCE_DIR}/3rdparty/onednn/build/src)
link_directories(${CMAKE_SOURCE_DIR}/3rdparty/ig)
link_directories(${CMAKE_SOURCE_DIR}/3rdparty/xdnn)

set(3RDPART_LIB_LIST "MPI::MPI_CXX" "ccl" "dnnl" "numa")
set(DEPEND_LIST "onednn" "mklml" "ig_lib")
set(DEPEND_LIST "onednn" "mklml" "xdnn_lib")

option(BUILD_WITH_SHARED_LIBS "Build with shared libraries" OFF)
if(BUILD_WITH_SHARED_LIBS)
message("Building with shared libraries.")
list(APPEND 3RDPART_LIB_LIST "ig")
list(APPEND 3RDPART_LIB_LIST "xdnn")
else()
message("Building with static libraries.")
list(APPEND 3RDPART_LIB_LIST "ig_static")
list(APPEND 3RDPART_LIB_LIST "xdnn_static")
endif()

# Enable AVX512_FP16 optimization
8 changes: 4 additions & 4 deletions cmake/ig.cmake → cmake/xdnn.cmake
@@ -25,11 +25,11 @@ project(dependency NONE)
include(ExternalProject)

# cmake-format: off
ExternalProject_Add(ig_lib
URL https://github.com/intel/xFasterTransformer/releases/download/IntrinsicGemm/ig_v1.1.tar.gz
URL_HASH MD5=47e5a2cd021caad2b1367c0b71dff2e7
ExternalProject_Add(xdnn_lib
URL https://github.com/intel/xFasterTransformer/releases/download/IntrinsicGemm/xdnn_v1.1.tar.gz
URL_HASH MD5=b49bf8808d66ea75cfba80a406c9a587
TIMEOUT 60
SOURCE_DIR ${CMAKE_SOURCE_DIR}/3rdparty/ig
SOURCE_DIR ${CMAKE_SOURCE_DIR}/3rdparty/xdnn
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
8 changes: 5 additions & 3 deletions include/layers_norm.h
@@ -14,13 +14,15 @@
// ============================================================================
#pragma once

#include "dtype.h"

namespace xft {

void invokeLayerNorm(float *output, const float *input, const float *gamma, const float *beta, const int rows,
void invokeLayerNorm(DataType dt, void *output, const void *input, const void *gamma, const void *beta, const int rows,
const int size, int iStride = -1, int oStride = -1, const float epsilon = 1e-5);

void invokeRmsNorm(float *output, const float *input, const float *weight, int rows, int cols, int iStride = -1,
int oStride = -1, float epsilon = 1e-6);
void invokeRmsNorm(DataType dt, void *output, const void *input, const void *weight, int rows, int cols,
int iStride = -1, int oStride = -1, float epsilon = 1e-6);

// Layer normalization: only support the norm along last dimension
class LayerNorm {
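For callers, a minimal usage sketch of the new type-erased entry points (assuming DataType from dtype.h is reachable as xft::DataType and that bfloat16_t is default-constructible; buffer and function names below are illustrative, not part of the patch):

#include <vector>
#include "bfloat16.h"
#include "layers_norm.h"

void normalize_bf16(int rows, int cols) {
    std::vector<bfloat16_t> in(rows * cols), out(rows * cols);
    std::vector<bfloat16_t> gamma(cols), beta(cols), weight(cols);
    // ... fill in, gamma, beta, weight ...

    // BF16 layer norm: iStride/oStride default to cols, epsilon to 1e-5.
    xft::invokeLayerNorm(xft::DataType::bf16, out.data(), in.data(),
            gamma.data(), beta.data(), rows, cols);

    // BF16 RMS norm: epsilon defaults to 1e-6.
    xft::invokeRmsNorm(xft::DataType::bf16, out.data(), in.data(),
            weight.data(), rows, cols);
}

Note that, as written in this patch, the dispatch bodies only handle DataType::bf16; other tags fall through without touching the output.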
72 changes: 66 additions & 6 deletions src/kernels/layernorm_kernels.cpp
@@ -15,17 +15,14 @@
#include <immintrin.h>

#include "bfloat16.h"
#include "dtype.h"
#include "float16.h"
#include "intrinsic_ext.h"
#include "layernorm_kernels.h"
#include "my_types.h"

namespace xft {

template <typename T>
struct LayerNormWeight {
const T *gamma = nullptr;
const T *beta = nullptr;
};

void invokeLayerNorm(float *output, const float *input, const float *gamma, const float *beta, const int rows,
const int size, int iStride, int oStride, const float epsilon) {

@@ -79,4 +76,67 @@ void invokeLayerNorm(float *output, const float *input, const float *gamma, cons
}
}
}

void invokeLayerNorm(bfloat16_t *output, const bfloat16_t *input, const bfloat16_t *gamma, const bfloat16_t *beta,
const int rows, const int size, int iStride, int oStride, const float epsilon) {

if (iStride == -1) iStride = size;
if (oStride == -1) oStride = size;

#pragma omp parallel for
for (int r = 0; r < rows; ++r) {
const bfloat16_t *px = input + r * iStride;
bfloat16_t *py = output + r * oStride;

float sum = 0;
float squareSum = 0;

__m512 vsum = _mm512_set1_ps(0);
__m512 vsqare = _mm512_set1_ps(0);

for (int col = 0; col < size; col += 16) {
int remain = size - col;
__mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1);

// SUM(x)
__m512 vx = _mm512_maskz_loadu_pbh(mask, px + col);
vsum = _mm512_add_ps(vsum, vx);

// SUM(x*x)
__m512 tmp = _mm512_mul_ps(vx, vx);
vsqare = _mm512_add_ps(vsqare, tmp);
}

sum = _mm512_reduce_add_ps(vsum);
squareSum = _mm512_reduce_add_ps(vsqare);

// Mean
float mean = sum / size;
__m512 vmean = _mm512_set1_ps(mean);

// Variance
float var = 1 / sqrt(squareSum / size - mean * mean + epsilon);
__m512 vvar = _mm512_set1_ps(var);

for (int col = 0; col < size; col += 16) {
int remain = size - col;
__mmask16 mask = (remain >= 16 ? 0xffff : (1 << remain) - 1);

__m512 vx = _mm512_maskz_loadu_pbh(mask, px + col);
__m512 vgamma = _mm512_maskz_loadu_pbh(mask, gamma + col);
__m512 vbeta = _mm512_maskz_loadu_pbh(mask, beta + col);
__m512 vy = (vx - vmean) * vgamma * vvar + vbeta;
_mm512_mask_storeu_pbh(py + col, mask, vy);
}
}
}

void invokeLayerNorm(DataType dt, void *output, const void *input, const void *gamma, const void *beta, const int rows,
const int size, int iStride, int oStride, const float epsilon) {
if (dt == DataType::bf16) {
invokeLayerNorm((bfloat16_t *)output, (const bfloat16_t *)input, (const bfloat16_t *)gamma,
(const bfloat16_t *)beta, rows, size, iStride, oStride, epsilon);
}
}

} // namespace xft
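For reference, the BF16 kernel above vectorizes the standard two-pass layer-norm computation; a plain scalar sketch of the same per-row math (float accumulation, illustrative only, not part of the patch):

#include <cmath>

// y = (x - mean) * gamma / sqrt(E[x^2] - mean^2 + eps) + beta, per row,
// matching the kernel above, which builds the variance from the running
// sums of x and x*x before the second pass scales and shifts.
void layer_norm_row_ref(float *y, const float *x, const float *gamma,
        const float *beta, int size, float epsilon = 1e-5f) {
    float sum = 0.f, squareSum = 0.f;
    for (int i = 0; i < size; ++i) {
        sum += x[i];
        squareSum += x[i] * x[i];
    }
    float mean = sum / size;
    float rvar = 1.f / std::sqrt(squareSum / size - mean * mean + epsilon);
    for (int i = 0; i < size; ++i)
        y[i] = (x[i] - mean) * rvar * gamma[i] + beta[i];
}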
38 changes: 38 additions & 0 deletions src/kernels/layernorm_kernels.h
@@ -0,0 +1,38 @@
// Copyright (c) 2023 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ============================================================================
#pragma once

#include <immintrin.h>

#include "bfloat16.h"
#include "dtype.h"
#include "float16.h"
#include "my_types.h"

namespace xft {

template <typename T>
struct LayerNormWeight {
const T *gamma = nullptr;
const T *beta = nullptr;
};

void invokeLayerNorm(float *output, const float *input, const float *gamma, const float *beta, const int rows,
const int size, int iStride = -1, int oStride = -1, const float epsilon = 1e-5);

void invokeLayerNorm(bfloat16_t *output, const bfloat16_t *input, const bfloat16_t *gamma, const bfloat16_t *beta,
const int rows, const int size, int iStride = -1, int oStride = -1, const float epsilon = 1e-5);

} // namespace xft
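One detail shared by the BF16 kernels declared here and defined in layernorm_kernels.cpp above: rows whose width is not a multiple of 16 are finished with a masked load/store, where the mask carries one bit per remaining column. A tiny standalone sketch of that mask arithmetic (illustrative only):

#include <cstdint>
#include <cstdio>

// Low `remain` bits set, as the kernels compute with
// (remain >= 16 ? 0xffff : (1 << remain) - 1) before _mm512_maskz_loadu_pbh.
static uint16_t tail_mask(int remain) {
    return remain >= 16 ? 0xffff : static_cast<uint16_t>((1u << remain) - 1);
}

int main() {
    std::printf("%#x\n", tail_mask(16)); // 0xffff: all 16 lanes active
    std::printf("%#x\n", tail_mask(5));  // 0x1f: 5 active lanes for the tail
    return 0;
}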
64 changes: 64 additions & 0 deletions src/kernels/rmsnorm_kernels.cpp
@@ -15,8 +15,11 @@
#include <immintrin.h>

#include "bfloat16.h"
#include "dtype.h"
#include "float16.h"
#include "intrinsic_ext.h"
#include "my_types.h"
#include "rmsnorm_kernels.h"

namespace xft {

@@ -71,4 +74,65 @@ void invokeRmsNorm(float *output, const float *input, const float *weight, int r
}
} // end for rows
}

void invokeRmsNorm(bfloat16_t *output, const bfloat16_t *input, const bfloat16_t *weight, int rows, int cols,
int iStride, int oStride, float epsilon) {
int size = cols;

if (iStride == -1) iStride = cols;
if (oStride == -1) oStride = cols;

#pragma omp parallel for
for (int r = 0; r < rows; ++r) {
const bfloat16_t *px = input + r * iStride;
bfloat16_t *py = output + r * oStride;

float squareSum = 0;

__m512 vsqare = _mm512_set1_ps(0);

int col = 0;
for (; col + 15 < size; col += 16) {
// SUM(x*x)
__m512 vx = _mm512_loadu_pbh(px + col);
__m512 tmp = _mm512_mul_ps(vx, vx);
vsqare = _mm512_add_ps(vsqare, tmp);
}
if (col < size) {
__mmask16 mask = (1 << (size - col)) - 1;
__m512 vx = _mm512_maskz_loadu_pbh(mask, px + col);
__m512 tmp = _mm512_mul_ps(vx, vx);
vsqare = _mm512_add_ps(vsqare, tmp);
}

squareSum = _mm512_reduce_add_ps(vsqare);

// Variance
float var = 1 / sqrt(squareSum / size + epsilon);
__m512 vvar = _mm512_set1_ps(var);

for (col = 0; col + 15 < size; col += 16) {
__m512 vx = _mm512_loadu_pbh(px + col);
__m512 vw = _mm512_loadu_pbh(weight + col);
__m512 vy = vx * vvar * vw;
_mm512_storeu_pbh(py + col, vy);
}
if (col < size) {
__mmask16 mask = (1 << (size - col)) - 1;
__m512 vx = _mm512_maskz_loadu_pbh(mask, px + col);
__m512 vw = _mm512_maskz_loadu_pbh(mask, weight + col);
__m512 vy = vx * vvar * vw;
_mm512_mask_storeu_pbh(py + col, mask, vy);
}
} // end for rows
}

void invokeRmsNorm(DataType dt, void *output, const void *input, const void *weight, int rows, int cols, int iStride,
int oStride, float epsilon) {
if (dt == DataType::bf16) {
invokeRmsNorm((bfloat16_t *)output, (const bfloat16_t *)input, (const bfloat16_t *)weight, rows, cols, iStride,
oStride, epsilon);
}
}

} // namespace xft
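As with layer norm, the BF16 RMS-norm kernel above is a vectorized form of a simple per-row formula; a scalar sketch for reference (float accumulation, illustrative only, not part of the patch):

#include <cmath>

// y = x * weight / sqrt(mean(x^2) + eps), per row -- the same math the
// masked and unmasked AVX-512 loops above implement for bfloat16_t data.
void rms_norm_row_ref(float *y, const float *x, const float *weight,
        int size, float epsilon = 1e-6f) {
    float squareSum = 0.f;
    for (int i = 0; i < size; ++i)
        squareSum += x[i] * x[i];
    float rnorm = 1.f / std::sqrt(squareSum / size + epsilon);
    for (int i = 0; i < size; ++i)
        y[i] = x[i] * rnorm * weight[i];
}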
31 changes: 31 additions & 0 deletions src/kernels/rmsnorm_kernels.h
@@ -0,0 +1,31 @@
// Copyright (c) 2023 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// ============================================================================
#pragma once

#include <immintrin.h>

#include "bfloat16.h"
#include "float16.h"
#include "my_types.h"

namespace xft {

void invokeRmsNorm(float *output, const float *input, const float *weight, int rows, int cols, int iStride = -1,
int oStride = -1, float epsilon = 1e-6);

void invokeRmsNorm(bfloat16_t *output, const bfloat16_t *input, const bfloat16_t *weight, int rows, int cols,
int iStride = -1, int oStride = -1, float epsilon = 1e-6);

} // namespace xft
4 changes: 2 additions & 2 deletions src/layers/attention.h
@@ -496,9 +496,9 @@ class Attention {
C = result.Row(b * ctx->inputSeqLen + startSeq) + i * ctx->attHeadSize;

if constexpr (std::is_same_v<KVCacheT, float>) {
ig_sgemm_single_thread(false, false, m, n, k, 1.0f, A, lda, B, ldb, 0.0f, C, ldc);
xdnn_sgemm_single_thread(false, false, m, n, k, 1.0f, A, lda, B, ldb, 0.0f, C, ldc);
} else if constexpr (std::is_same_v<KVCacheT, float16_t>) {
ig_sgemm_f32f16f32_single_thread(false, false, m, n, k, 1.0f, A, lda, B, ldb, 0.0f, C, ldc);
xdnn_sgemm_f32f16f32_single_thread(false, false, m, n, k, 1.0f, A, lda, (const XDNN_FP16 *)B, ldb, 0.0f, C, ldc);
}

#ifdef DEBUG
7 changes: 4 additions & 3 deletions src/layers/layer_norm.cpp
@@ -12,11 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// ============================================================================
#pragma once
#include <immintrin.h>

#include <cstdlib>
#include <cstring>
#include <immintrin.h>

#include "layernorm_kernels.h"
#include "layers_norm.h"
#include "timeline.h"

@@ -45,7 +46,7 @@ void LayerNorm::forward(const float *input, float *output, int rows, int iStride
TimeLine t("LayerNorm.forward");
const float *pgamma = weights;
const float *pbeta = weights + normSize;
xft::invokeLayerNorm(output, input, pgamma, pbeta, rows, normSize, iStride, oStride);
invokeLayerNorm(output, input, pgamma, pbeta, rows, normSize, iStride, oStride);
}

} // namespace xft
4 changes: 2 additions & 2 deletions src/layers/rms_norm.cpp
@@ -12,13 +12,13 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// ============================================================================
#pragma once
#include <immintrin.h>

#include <cstdlib>
#include <cstring>

#include "layers_norm.h"
#include "rmsnorm_kernels.h"
#include "timeline.h"

namespace xft {
@@ -41,7 +41,7 @@ void RmsNorm::setWeight(const float *w, const float *, int size) {
// input and output are in shape of (rows, normSize)
void RmsNorm::forward(const float *input, float *output, int rows, int iStride, int oStride, float epsilon) {
TimeLine t("RmsNorm.forward");
xft::invokeRmsNorm(output, input, weight, rows, normSize, iStride, oStride, epsilon);
invokeRmsNorm(output, input, weight, rows, normSize, iStride, oStride, epsilon);
}

} // namespace xft
2 changes: 1 addition & 1 deletion src/utils/decoder_util.h
@@ -542,7 +542,7 @@ class DecoderUtil {

// C = A * B
// bTranspose: B need to be transposed or not
// ig_sgemm_single_thread(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
// xdnn_sgemm_single_thread(transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc);
static void sgemm(const float* A, const float* B, float* C, int m, int n, int k,
bool transa, bool transb) {
int lda = (transa ? m : k);