diff --git a/src/inline-thirdparty/fp16/fp16.h b/src/inline-thirdparty/fp16/fp16.h index 9d7366e997da..4d9ff120d399 100644 --- a/src/inline-thirdparty/fp16/fp16.h +++ b/src/inline-thirdparty/fp16/fp16.h @@ -4,8 +4,10 @@ #include -#if defined(PSIMD_H) -#include -#endif - #endif /* FP16_H */ + +// This file is part of the fp16 inline third-party dependency of YugabyteDB. +// Git repo: https://github.com/Maratyszcza/FP16/ +// Git commit: 98b0a46bce017382a6351a19577ec43a715b6835 +// +// See also src/inline-thirdparty/README.md. diff --git a/src/inline-thirdparty/fp16/fp16/bitcasts.h b/src/inline-thirdparty/fp16/fp16/bitcasts.h index 86a4e22c48b2..5baca96660c9 100644 --- a/src/inline-thirdparty/fp16/fp16/bitcasts.h +++ b/src/inline-thirdparty/fp16/fp16/bitcasts.h @@ -12,7 +12,7 @@ #include #endif -#if defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64)) +#if defined(_MSC_VER) && !defined(__clang__) && (defined(_M_ARM) || defined(_M_ARM64)) #include #endif @@ -22,9 +22,9 @@ static inline float fp32_from_bits(uint32_t w) { return as_float(w); #elif defined(__CUDA_ARCH__) return __uint_as_float((unsigned int) w); -#elif defined(__INTEL_COMPILER) +#elif defined(__INTEL_COMPILER) || defined(_MSC_VER) && (_MSC_VER >= 1932) && (defined(_M_IX86) || defined(_M_X64)) return _castu32_f32(w); -#elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64)) +#elif defined(_MSC_VER) && !defined(__clang__) && (defined(_M_ARM) || defined(_M_ARM64)) return _CopyFloatFromInt32((__int32) w); #else union { @@ -40,9 +40,9 @@ static inline uint32_t fp32_to_bits(float f) { return as_uint(f); #elif defined(__CUDA_ARCH__) return (uint32_t) __float_as_uint(f); -#elif defined(__INTEL_COMPILER) +#elif defined(__INTEL_COMPILER) || defined(_MSC_VER) && (_MSC_VER >= 1932) && (defined(_M_IX86) || defined(_M_X64)) return _castf32_u32(f); -#elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64)) +#elif defined(_MSC_VER) && !defined(__clang__) && (defined(_M_ARM) || defined(_M_ARM64)) return (uint32_t) _CopyInt32FromFloat(f); #else union { @@ -58,9 +58,9 @@ static inline double fp64_from_bits(uint64_t w) { return as_double(w); #elif defined(__CUDA_ARCH__) return __longlong_as_double((long long) w); -#elif defined(__INTEL_COMPILER) +#elif defined(__INTEL_COMPILER) || defined(_MSC_VER) && (_MSC_VER >= 1932) && (defined(_M_IX86) || defined(_M_X64)) return _castu64_f64(w); -#elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64)) +#elif defined(_MSC_VER) && !defined(__clang__) && (defined(_M_ARM) || defined(_M_ARM64)) return _CopyDoubleFromInt64((__int64) w); #else union { @@ -76,9 +76,9 @@ static inline uint64_t fp64_to_bits(double f) { return as_ulong(f); #elif defined(__CUDA_ARCH__) return (uint64_t) __double_as_longlong(f); -#elif defined(__INTEL_COMPILER) +#elif defined(__INTEL_COMPILER) || defined(_MSC_VER) && (_MSC_VER >= 1932) && (defined(_M_IX86) || defined(_M_X64)) return _castf64_u64(f); -#elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64)) +#elif defined(_MSC_VER) && !defined(__clang__) && (defined(_M_ARM) || defined(_M_ARM64)) return (uint64_t) _CopyInt64FromDouble(f); #else union { @@ -90,3 +90,9 @@ static inline uint64_t fp64_to_bits(double f) { } #endif /* FP16_BITCASTS_H */ + +// This file is part of the fp16 inline third-party dependency of YugabyteDB. +// Git repo: https://github.com/Maratyszcza/FP16/ +// Git commit: 98b0a46bce017382a6351a19577ec43a715b6835 +// +// See also src/inline-thirdparty/README.md. diff --git a/src/inline-thirdparty/fp16/fp16/fp16.h b/src/inline-thirdparty/fp16/fp16/fp16.h index 2b61fff5c1b9..31ef86e2a8a8 100644 --- a/src/inline-thirdparty/fp16/fp16/fp16.h +++ b/src/inline-thirdparty/fp16/fp16/fp16.h @@ -10,11 +10,18 @@ #include #endif -#ifdef _MSC_VER +#include +#include + +#if defined(_MSC_VER) #include #endif - -#include +#if defined(__F16C__) && FP16_USE_NATIVE_CONVERSION && !FP16_USE_FLOAT16_TYPE && !FP16_USE_FP16_TYPE + #include +#endif +#if (defined(__aarch64__) || defined(_M_ARM64)) && FP16_USE_NATIVE_CONVERSION && !FP16_USE_FLOAT16_TYPE && !FP16_USE_FP16_TYPE + #include +#endif /* @@ -106,6 +113,31 @@ static inline uint32_t fp16_ieee_to_fp32_bits(uint16_t h) { * floating-point operations and bitcasts between integer and floating-point variables. */ static inline float fp16_ieee_to_fp32_value(uint16_t h) { +#if FP16_USE_NATIVE_CONVERSION + #if FP16_USE_FLOAT16_TYPE + union { + uint16_t as_bits; + _Float16 as_value; + } fp16 = { h }; + return (float) fp16.as_value; + #elif FP16_USE_FP16_TYPE + union { + uint16_t as_bits; + __fp16 as_value; + } fp16 = { h }; + return (float) fp16.as_value; + #else + #if (defined(__INTEL_COMPILER) || defined(__GNUC__)) && defined(__F16C__) + return _cvtsh_ss((unsigned short) h); + #elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && defined(__AVX2__) + return _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128((int) (unsigned int) h))); + #elif defined(_M_ARM64) || defined(__aarch64__) + return vgetq_lane_f32(vcvt_f32_f16(vreinterpret_f16_u16(vdup_n_u16(h))), 0); + #else + #error "Archtecture- or compiler-specific implementation required" + #endif + #endif +#else /* * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word: * +---+-----+------------+-------------------+ @@ -211,6 +243,7 @@ static inline float fp16_ieee_to_fp32_value(uint16_t h) { const uint32_t result = sign | (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); return fp32_from_bits(result); +#endif } /* @@ -221,6 +254,31 @@ static inline float fp16_ieee_to_fp32_value(uint16_t h) { * floating-point operations and bitcasts between integer and floating-point variables. */ static inline uint16_t fp16_ieee_from_fp32_value(float f) { +#if FP16_USE_NATIVE_CONVERSION + #if FP16_USE_FLOAT16_TYPE + union { + _Float16 as_value; + uint16_t as_bits; + } fp16 = { (_Float16) f }; + return fp16.as_bits; + #elif FP16_USE_FP16_TYPE + union { + __fp16 as_value; + uint16_t as_bits; + } fp16 = { (__fp16) f }; + return fp16.as_bits; + #else + #if (defined(__INTEL_COMPILER) || defined(__GNUC__)) && defined(__F16C__) + return _cvtss_sh(f, _MM_FROUND_CUR_DIRECTION); + #elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && defined(__AVX2__) + return (uint16_t) _mm_cvtsi128_si32(_mm_cvtps_ph(_mm_set_ss(f), _MM_FROUND_CUR_DIRECTION)); + #elif defined(_M_ARM64) || defined(__aarch64__) + return vget_lane_u16(vcvt_f16_f32(vdupq_n_f32(f)), 0); + #else + #error "Archtecture- or compiler-specific implementation required" + #endif + #endif +#else #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) const float scale_to_inf = 0x1.0p+112f; const float scale_to_zero = 0x1.0p-110f; @@ -228,7 +286,12 @@ static inline uint16_t fp16_ieee_from_fp32_value(float f) { const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); #endif - float base = (fabsf(f) * scale_to_inf) * scale_to_zero; +#if defined(_MSC_VER) && defined(_M_IX86_FP) && (_M_IX86_FP == 0) || defined(__GNUC__) && defined(__FLT_EVAL_METHOD__) && (__FLT_EVAL_METHOD__ != 0) + const volatile float saturated_f = fabsf(f) * scale_to_inf; +#else + const float saturated_f = fabsf(f) * scale_to_inf; +#endif + float base = saturated_f * scale_to_zero; const uint32_t w = fp32_to_bits(f); const uint32_t shl1_w = w + w; @@ -244,6 +307,7 @@ static inline uint16_t fp16_ieee_from_fp32_value(float f) { const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); const uint32_t nonsign = exp_bits + mantissa_bits; return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); +#endif } /* @@ -449,3 +513,9 @@ static inline uint16_t fp16_alt_from_fp32_value(float f) { } #endif /* FP16_FP16_H */ + +// This file is part of the fp16 inline third-party dependency of YugabyteDB. +// Git repo: https://github.com/Maratyszcza/FP16/ +// Git commit: 98b0a46bce017382a6351a19577ec43a715b6835 +// +// See also src/inline-thirdparty/README.md. diff --git a/src/inline-thirdparty/fp16/fp16/macros.h b/src/inline-thirdparty/fp16/fp16/macros.h new file mode 100644 index 000000000000..601944c96749 --- /dev/null +++ b/src/inline-thirdparty/fp16/fp16/macros.h @@ -0,0 +1,52 @@ +#pragma once +#ifndef FP16_MACROS_H +#define FP16_MACROS_H + +#ifndef FP16_USE_NATIVE_CONVERSION + #if (defined(__INTEL_COMPILER) || defined(__GNUC__)) && defined(__F16C__) + #define FP16_USE_NATIVE_CONVERSION 1 + #elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)) && defined(__AVX2__) + #define FP16_USE_NATIVE_CONVERSION 1 + #elif defined(_MSC_VER) && defined(_M_ARM64) + #define FP16_USE_NATIVE_CONVERSION 1 + #elif defined(__GNUC__) && defined(__aarch64__) + #define FP16_USE_NATIVE_CONVERSION 1 + #endif + #if !defined(FP16_USE_NATIVE_CONVERSION) + #define FP16_USE_NATIVE_CONVERSION 0 + #endif // !defined(FP16_USE_NATIVE_CONVERSION) +#endif // !define(FP16_USE_NATIVE_CONVERSION) + +#ifndef FP16_USE_FLOAT16_TYPE + #if !defined(__clang__) && !defined(__INTEL_COMPILER) && defined(__GNUC__) && (__GNUC__ >= 12) + #if defined(__F16C__) + #define FP16_USE_FLOAT16_TYPE 1 + #endif + #endif + #if !defined(FP16_USE_FLOAT16_TYPE) + #define FP16_USE_FLOAT16_TYPE 0 + #endif // !defined(FP16_USE_FLOAT16_TYPE) +#endif // !defined(FP16_USE_FLOAT16_TYPE) + +#ifndef FP16_USE_FP16_TYPE + #if defined(__clang__) + #if defined(__F16C__) || defined(__aarch64__) + #define FP16_USE_FP16_TYPE 1 + #endif + #elif defined(__GNUC__) + #if defined(__aarch64__) + #define FP16_USE_FP16_TYPE 1 + #endif + #endif + #if !defined(FP16_USE_FP16_TYPE) + #define FP16_USE_FP16_TYPE 0 + #endif // !defined(FP16_USE_FP16_TYPE) +#endif // !defined(FP16_USE_FP16_TYPE) + +#endif /* FP16_MACROS_H */ + +// This file is part of the fp16 inline third-party dependency of YugabyteDB. +// Git repo: https://github.com/Maratyszcza/FP16/ +// Git commit: 98b0a46bce017382a6351a19577ec43a715b6835 +// +// See also src/inline-thirdparty/README.md. diff --git a/src/inline-thirdparty/fp16/fp16/psimd.h b/src/inline-thirdparty/fp16/fp16/psimd.h deleted file mode 100644 index 428ab0651de9..000000000000 --- a/src/inline-thirdparty/fp16/fp16/psimd.h +++ /dev/null @@ -1,131 +0,0 @@ -#pragma once -#ifndef FP16_PSIMD_H -#define FP16_PSIMD_H - -#if defined(__cplusplus) && (__cplusplus >= 201103L) - #include -#elif !defined(__OPENCL_VERSION__) - #include -#endif - -#include - - -PSIMD_INTRINSIC psimd_f32 fp16_ieee_to_fp32_psimd(psimd_u16 half) { - const psimd_u32 word = (psimd_u32) psimd_interleave_lo_u16(psimd_zero_u16(), half); - - const psimd_u32 sign = word & psimd_splat_u32(UINT32_C(0x80000000)); - const psimd_u32 shr3_nonsign = (word + word) >> psimd_splat_u32(4); - - const psimd_u32 exp_offset = psimd_splat_u32(UINT32_C(0x70000000)); -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) - const psimd_f32 exp_scale = psimd_splat_f32(0x1.0p-112f); -#else - const psimd_f32 exp_scale = psimd_splat_f32(fp32_from_bits(UINT32_C(0x7800000))); -#endif - const psimd_f32 norm_nonsign = psimd_mul_f32((psimd_f32) (shr3_nonsign + exp_offset), exp_scale); - - const psimd_u16 magic_mask = psimd_splat_u16(UINT16_C(0x3E80)); - const psimd_f32 magic_bias = psimd_splat_f32(0.25f); - const psimd_f32 denorm_nonsign = psimd_sub_f32((psimd_f32) psimd_interleave_lo_u16(half + half, magic_mask), magic_bias); - - const psimd_s32 denorm_cutoff = psimd_splat_s32(INT32_C(0x00800000)); - const psimd_s32 denorm_mask = (psimd_s32) shr3_nonsign < denorm_cutoff; - return (psimd_f32) (sign | (psimd_s32) psimd_blend_f32(denorm_mask, denorm_nonsign, norm_nonsign)); -} - -PSIMD_INTRINSIC psimd_f32x2 fp16_ieee_to_fp32x2_psimd(psimd_u16 half) { - const psimd_u32 word_lo = (psimd_u32) psimd_interleave_lo_u16(psimd_zero_u16(), half); - const psimd_u32 word_hi = (psimd_u32) psimd_interleave_hi_u16(psimd_zero_u16(), half); - - const psimd_u32 sign_mask = psimd_splat_u32(UINT32_C(0x80000000)); - const psimd_u32 sign_lo = word_lo & sign_mask; - const psimd_u32 sign_hi = word_hi & sign_mask; - const psimd_u32 shr3_nonsign_lo = (word_lo + word_lo) >> psimd_splat_u32(4); - const psimd_u32 shr3_nonsign_hi = (word_hi + word_hi) >> psimd_splat_u32(4); - - const psimd_u32 exp_offset = psimd_splat_u32(UINT32_C(0x70000000)); -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) - const psimd_f32 exp_scale = psimd_splat_f32(0x1.0p-112f); -#else - const psimd_f32 exp_scale = psimd_splat_f32(fp32_from_bits(UINT32_C(0x7800000))); -#endif - const psimd_f32 norm_nonsign_lo = psimd_mul_f32((psimd_f32) (shr3_nonsign_lo + exp_offset), exp_scale); - const psimd_f32 norm_nonsign_hi = psimd_mul_f32((psimd_f32) (shr3_nonsign_hi + exp_offset), exp_scale); - - const psimd_u16 magic_mask = psimd_splat_u16(UINT16_C(0x3E80)); - const psimd_u16 shl1_half = half + half; - const psimd_f32 magic_bias = psimd_splat_f32(0.25f); - const psimd_f32 denorm_nonsign_lo = psimd_sub_f32((psimd_f32) psimd_interleave_lo_u16(shl1_half, magic_mask), magic_bias); - const psimd_f32 denorm_nonsign_hi = psimd_sub_f32((psimd_f32) psimd_interleave_hi_u16(shl1_half, magic_mask), magic_bias); - - const psimd_s32 denorm_cutoff = psimd_splat_s32(INT32_C(0x00800000)); - const psimd_s32 denorm_mask_lo = (psimd_s32) shr3_nonsign_lo < denorm_cutoff; - const psimd_s32 denorm_mask_hi = (psimd_s32) shr3_nonsign_hi < denorm_cutoff; - - psimd_f32x2 result; - result.lo = (psimd_f32) (sign_lo | (psimd_s32) psimd_blend_f32(denorm_mask_lo, denorm_nonsign_lo, norm_nonsign_lo)); - result.hi = (psimd_f32) (sign_hi | (psimd_s32) psimd_blend_f32(denorm_mask_hi, denorm_nonsign_hi, norm_nonsign_hi)); - return result; -} - -PSIMD_INTRINSIC psimd_f32 fp16_alt_to_fp32_psimd(psimd_u16 half) { - const psimd_u32 word = (psimd_u32) psimd_interleave_lo_u16(psimd_zero_u16(), half); - - const psimd_u32 sign = word & psimd_splat_u32(INT32_C(0x80000000)); - const psimd_u32 shr3_nonsign = (word + word) >> psimd_splat_u32(4); - -#if 0 - const psimd_s32 exp112_offset = psimd_splat_s32(INT32_C(0x38000000)); - const psimd_s32 nonsign_bits = (psimd_s32) shr3_nonsign + exp112_offset; - const psimd_s32 exp1_offset = psimd_splat_s32(INT32_C(0x00800000)); - const psimd_f32 two_nonsign = (psimd_f32) (nonsign_bits + exp1_offset); - const psimd_s32 exp113_offset = exp112_offset | exp1_offset; - return (psimd_f32) (sign | (psimd_s32) psimd_sub_f32(two_nonsign, (psimd_f32) psimd_max_s32(nonsign_bits, exp113_offset))); -#else - const psimd_u32 exp_offset = psimd_splat_u32(UINT32_C(0x38000000)); - const psimd_f32 nonsign = (psimd_f32) (shr3_nonsign + exp_offset); -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) - const psimd_f32 denorm_bias = psimd_splat_f32(0x1.0p-14f); -#else - const psimd_f32 denorm_bias = psimd_splat_f32(fp32_from_bits(UINT32_C(0x38800000))); -#endif - return (psimd_f32) (sign | (psimd_s32) psimd_sub_f32(psimd_add_f32(nonsign, nonsign), psimd_max_f32(nonsign, denorm_bias))); -#endif -} - -PSIMD_INTRINSIC psimd_f32x2 fp16_alt_to_fp32x2_psimd(psimd_u16 half) { - const psimd_u32 word_lo = (psimd_u32) psimd_interleave_lo_u16(psimd_zero_u16(), half); - const psimd_u32 word_hi = (psimd_u32) psimd_interleave_hi_u16(psimd_zero_u16(), half); - - const psimd_u32 sign_mask = psimd_splat_u32(UINT32_C(0x80000000)); - const psimd_u32 sign_lo = word_lo & sign_mask; - const psimd_u32 sign_hi = word_hi & sign_mask; - const psimd_u32 shr3_nonsign_lo = (word_lo + word_lo) >> psimd_splat_u32(4); - const psimd_u32 shr3_nonsign_hi = (word_hi + word_hi) >> psimd_splat_u32(4); - -#if 1 - const psimd_s32 exp112_offset = psimd_splat_s32(INT32_C(0x38000000)); - const psimd_s32 nonsign_bits_lo = (psimd_s32) shr3_nonsign_lo + exp112_offset; - const psimd_s32 nonsign_bits_hi = (psimd_s32) shr3_nonsign_hi + exp112_offset; - const psimd_s32 exp1_offset = psimd_splat_s32(INT32_C(0x00800000)); - const psimd_f32 two_nonsign_lo = (psimd_f32) (nonsign_bits_lo + exp1_offset); - const psimd_f32 two_nonsign_hi = (psimd_f32) (nonsign_bits_hi + exp1_offset); - const psimd_s32 exp113_offset = exp1_offset | exp112_offset; - psimd_f32x2 result; - result.lo = (psimd_f32) (sign_lo | (psimd_s32) psimd_sub_f32(two_nonsign_lo, (psimd_f32) psimd_max_s32(nonsign_bits_lo, exp113_offset))); - result.hi = (psimd_f32) (sign_hi | (psimd_s32) psimd_sub_f32(two_nonsign_hi, (psimd_f32) psimd_max_s32(nonsign_bits_hi, exp113_offset))); - return result; -#else - const psimd_u32 exp_offset = psimd_splat_u32(UINT32_C(0x38000000)); - const psimd_f32 nonsign_lo = (psimd_f32) (shr3_nonsign_lo + exp_offset); - const psimd_f32 nonsign_hi = (psimd_f32) (shr3_nonsign_hi + exp_offset); - const psimd_f32 denorm_bias = psimd_splat_f32(0x1.0p-14f); - psimd_f32x2 result; - result.lo = (psimd_f32) (sign_lo | (psimd_s32) psimd_sub_f32(psimd_add_f32(nonsign_lo, nonsign_lo), psimd_max_f32(nonsign_lo, denorm_bias))); - result.hi = (psimd_f32) (sign_hi | (psimd_s32) psimd_sub_f32(psimd_add_f32(nonsign_hi, nonsign_hi), psimd_max_f32(nonsign_hi, denorm_bias))); - return result; -#endif -} - -#endif /* FP16_PSIMD_H */