From b5fcb59a851a794a870e91435d75169915cc7141 Mon Sep 17 00:00:00 2001
From: Justine Tunney <jtunney@gmail.com>
Date: Fri, 13 Sep 2024 05:06:34 -0700
Subject: [PATCH] Implement more bf16/fp16 compiler runtimes

Fixes #1259
---
 libc/integral/c.inc                           |  20 +++
 libc/intrin/{truncsfbf2.c => brain16.c}       |  67 ++++++++-
 libc/intrin/extendbfsf2.c                     |  39 -----
 .../compiler_rt => libc/intrin}/extendsftf2.c |   4 +-
 libc/intrin/float16.c                         | 137 ++++++++++++++++--
 libc/intrin/truncdfbf2.c                      |  24 ---
 .../compiler_rt => libc/intrin}/trunctfsf2.c  |   4 +-
 third_party/compiler_rt/extendhfdf2.c         |  17 ---
 third_party/compiler_rt/extendhfsf2.c         |  27 ----
 third_party/compiler_rt/truncdfhf2.c          |  21 ---
 third_party/compiler_rt/truncsfhf2.c          |  27 ----
 11 files changed, 209 insertions(+), 178 deletions(-)
 rename libc/intrin/{truncsfbf2.c => brain16.c} (68%)
 delete mode 100644 libc/intrin/extendbfsf2.c
 rename {third_party/compiler_rt => libc/intrin}/extendsftf2.c (89%)
 delete mode 100644 libc/intrin/truncdfbf2.c
 rename {third_party/compiler_rt => libc/intrin}/trunctfsf2.c (89%)
 delete mode 100644 third_party/compiler_rt/extendhfdf2.c
 delete mode 100644 third_party/compiler_rt/extendhfsf2.c
 delete mode 100644 third_party/compiler_rt/truncdfhf2.c
 delete mode 100644 third_party/compiler_rt/truncsfhf2.c

diff --git a/libc/integral/c.inc b/libc/integral/c.inc
index 0f29ff5f05f..04aeb22294d 100644
--- a/libc/integral/c.inc
+++ b/libc/integral/c.inc
@@ -65,6 +65,26 @@ typedef __UINT64_TYPE__ uint64_t;
 typedef __INTMAX_TYPE__ intmax_t;
 typedef __UINTMAX_TYPE__ uintmax_t;
 
+/* TODO(jart): re-import compiler-rt once they have it */
+#if defined(__x86_64__) && defined(__FLT128_MAX_10_EXP__)
+#undef __FLT128_MAX_10_EXP__
+#undef __FLT128_DENORM_MIN__
+#undef __FLT128_MIN_EXP__
+#undef __FLT128_MIN_10_EXP__
+#undef __FLT128_MANT_DIG__
+#undef __FLT128_HAS_INFINITY__
+#undef __FLT128_EPSILON__
+#undef __FLT128_MAX_EXP__
+#undef __FLT128_HAS_DENORM__
+#undef __FLT128_DIG__
+#undef __FLT128_MIN__
+#undef __FLT128_MAX__
+#undef __FLT128_NORM_MAX__
+#undef __FLT128_HAS_QUIET_NAN__
+#undef __FLT128_IS_IEC_60559__
+#undef __FLT128_DECIMAL_DIG__
+#endif
+
 #define __DEFINED_max_align_t
 typedef long double max_align_t;
 
diff --git a/libc/intrin/truncsfbf2.c b/libc/intrin/brain16.c
similarity index 68%
rename from libc/intrin/truncsfbf2.c
rename to libc/intrin/brain16.c
index b2d12e33d74..95b0050b8bc 100644
--- a/libc/intrin/truncsfbf2.c
+++ b/libc/intrin/brain16.c
@@ -17,12 +17,53 @@
 │ PERFORMANCE OF THIS SOFTWARE.                                                │
 ╚─────────────────────────────────────────────────────────────────────────────*/
 
-__bf16 __truncsfbf2(float f) {
+/**
+ * @fileoverview bf16 compiler runtime
+ */
+
+_Float32 __extendbfsf2(__bf16 f) {  // widens brain16 to _Float32
+  union {
+    __bf16 f;
+    uint16_t i;
+  } ub = {f};  // ub.i exposes the 16 raw bits of the argument
+
+  // convert brain16 to binary32: the 16 bits become the high half of x
+  uint32_t x = (uint32_t)ub.i << 16;
+
+  // force nan to quiet: anything above the infinity pattern gets bit 22 set
+  if ((x & 0x7fffffff) > 0x7f800000)
+    x |= 0x00400000;
+
+  // pun to _Float32
+  union {
+    uint32_t i;
+    _Float32 f;
+  } uf = {x};
+  return uf.f;  // x reinterpreted as a _Float32
+}
+
+_Float64 __extendbfdf2(__bf16 f) {
+  return __extendbfsf2(f);  // result converts implicitly to _Float64
+}
+
+#ifdef __x86_64__
+__float80 __extendbfxf2(__bf16 f) {
+  return __extendbfsf2(f);  // result converts implicitly to __float80
+}
+#endif
+
+#ifdef __aarch64__
+_Float128 __extendbftf2(__bf16 f) {
+  return __extendbfsf2(f);  // result converts implicitly to _Float128
+}
+#endif
+
+__bf16 __truncsfbf2(_Float32 f) {
   union {
-    float f;
-    unsigned i;
+    _Float32 f;
+    uint32_t i;
   } uf = {f};
-  unsigned x = uf.i;
+  uint32_t x = uf.i;
 
   if ((x & 0x7fffffff) > 0x7f800000)
     // force nan to quiet
@@ -33,8 +74,24 @@ __bf16 __truncsfbf2(float f) {
 
   // pun to bf16
   union {
-    unsigned short i;
+    uint16_t i;
     __bf16 f;
   } ub = {x};
   return ub.f;
 }
+
+__bf16 __truncdfbf2(_Float64 f) {
+  return __truncsfbf2(f);  // NOTE(review): goes via _Float32; double rounding?
+}
+
+#ifdef __x86_64__
+__bf16 __truncxfbf2(__float80 f) {
+  return __truncsfbf2(f);  // NOTE(review): goes via _Float32; double rounding?
+}
+#endif
+
+#ifdef __aarch64__
+__bf16 __trunctfbf2(_Float128 f) {
+  return __truncsfbf2(f);  // NOTE(review): goes via _Float32; double rounding?
+}
+#endif
diff --git a/libc/intrin/extendbfsf2.c b/libc/intrin/extendbfsf2.c
deleted file mode 100644
index 1773bac676c..00000000000
--- a/libc/intrin/extendbfsf2.c
+++ /dev/null
@@ -1,39 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-
-float __extendbfsf2(__bf16 f) {
-  union {
-    __bf16 f;
-    unsigned short i;
-  } ub = {f};
-
-  // convert brain16 to binary32
-  unsigned x = (unsigned)ub.i << 16;
-
-  // force nan to quiet
-  if ((x & 0x7fffffff) > 0x7f800000)
-    x |= 0x00400000;
-
-  // pun to float
-  union {
-    unsigned i;
-    float f;
-  } uf = {x};
-  return uf.f;
-}
diff --git a/third_party/compiler_rt/extendsftf2.c b/libc/intrin/extendsftf2.c
similarity index 89%
rename from third_party/compiler_rt/extendsftf2.c
rename to libc/intrin/extendsftf2.c
index 1509b45e4ce..444140e1a7a 100644
--- a/third_party/compiler_rt/extendsftf2.c
+++ b/libc/intrin/extendsftf2.c
@@ -8,8 +8,6 @@
 //===----------------------------------------------------------------------===//
 //
 
-__static_yoink("huge_compiler_rt_license");
-
 #define QUAD_PRECISION
 #include "third_party/compiler_rt/fp_lib.inc"
 
@@ -19,7 +17,7 @@ __static_yoink("huge_compiler_rt_license");
 #include "third_party/compiler_rt/fp_extend_impl.inc"
 
 COMPILER_RT_ABI long double __extendsftf2(float a) {
-    return __extendXfYf2__(a);
+  return __extendXfYf2__(a);
 }
 
 #endif
diff --git a/libc/intrin/float16.c b/libc/intrin/float16.c
index 476a2f6c942..434f0cafd7a 100644
--- a/libc/intrin/float16.c
+++ b/libc/intrin/float16.c
@@ -21,22 +21,135 @@
  * @fileoverview fp16 compiler runtime
  */
 
-#define asint(x) ((union pun){x}).i
-#define isnan(x) (((x) & 0x7fff) > 0x7c00)
+#define isnan16(x) (((x) & 0x7fff) > 0x7c00)  // sign ignored; above infinity
 
-union pun {
-  _Float16 f;
-  unsigned short i;
-};
+static inline _Float16 tofloat16(int x) {  // reinterprets bits as _Float16
+  union {
+    uint16_t i;
+    _Float16 f;
+  } u = {x};  // x is converted to uint16_t, the union's first member
+  return u.f;
+}
+
+static inline int fromfloat16(_Float16 x) {  // raw bits of x as an int
+  union {
+    _Float16 f;
+    uint16_t i;
+  } u = {x};
+  return u.i;  // the uint16_t view of x, returned as int
+}
+
+static inline _Float32 tofloat32(uint32_t w) {  // reinterprets w as _Float32
+  union {
+    uint32_t as_bits;
+    _Float32 as_value;
+  } fp32;
+  fp32.as_bits = w;      // store through the integer member...
+  return fp32.as_value;  // ...and read back through the float member
+}
+
+static inline uint32_t fromfloat32(_Float32 f) {  // raw bits of f
+  union {
+    _Float32 as_value;
+    uint32_t as_bits;
+  } fp32;
+  fp32.as_value = f;    // store through the float member...
+  return fp32.as_bits;  // ...and read back through the integer member
+}
+
+static inline _Float32 fabs32(_Float32 x) {  // x with its top bit cleared
+  return tofloat32(fromfloat32(x) & 0x7fffffffu);
+}
 
 int __eqhf2(_Float16 fx, _Float16 fy) {
-  int x = asint(fx);
-  int y = asint(fy);
-  return (x == y) & !isnan(x) & !isnan(y);
+  int x = fromfloat16(fx);  // libgcc contract: return zero iff fx == fy,
+  int y = fromfloat16(fy);  // where nan never compares equal and +0 == -0
+  return !(x == y || !((x | y) & 0x7fff)) | isnan16(x) | isnan16(y);
 }
 
 int __nehf2(_Float16 fx, _Float16 fy) {
-  int x = asint(fx);
-  int y = asint(fy);
-  return (x != y) & !isnan(x) & !isnan(y);
+  int x = fromfloat16(fx);  // libgcc contract: return nonzero iff fx != fy,
+  int y = fromfloat16(fy);  // where nan compares unequal and +0 == -0
+  return !(x == y || !((x | y) & 0x7fff)) | isnan16(x) | isnan16(y);
+}
+
+_Float32 __extendhfsf2(_Float16 f) {  // widens binary16 to _Float32
+  uint16_t h = fromfloat16(f);
+  const uint32_t w = (uint32_t)h << 16;  // half bits in the top 16 of a word
+  const uint32_t sign = w & 0x80000000u;
+  const uint32_t two_w = w + w;  // shifts the sign bit out (wraps mod 2^32)
+  const uint32_t exp_offset = 0xE0u << 23;  // 224 in the exponent field
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || \
+    defined(__GNUC__) && !defined(__STRICT_ANSI__)
+  const _Float32 exp_scale = 0x1.0p-112f;
+#else
+  const _Float32 exp_scale = tofloat32(0x7800000u);  // 2^-112 as raw bits
+#endif
+  const _Float32 normalized_value =
+      tofloat32((two_w >> 4) + exp_offset) * exp_scale;
+  const uint32_t magic_mask = 126u << 23;  // bit pattern of 0.5f
+  const _Float32 magic_bias = 0.5f;
+  const _Float32 denormalized_value =
+      tofloat32((two_w >> 17) | magic_mask) - magic_bias;
+  const uint32_t denormalized_cutoff = 1u << 27;  // two_w < this: exp is 0
+  const uint32_t result =
+      sign | (two_w < denormalized_cutoff ? fromfloat32(denormalized_value)
+                                          : fromfloat32(normalized_value));
+  return tofloat32(result);
+}
+
+_Float64 __extendhfdf2(_Float16 f) {
+  return __extendhfsf2(f);  // result converts implicitly to _Float64
+}
+
+#ifdef __x86_64__
+__float80 __extendhfxf2(_Float16 f) {
+  return __extendhfsf2(f);  // result converts implicitly to __float80
+}
+#endif
+
+#ifdef __aarch64__
+_Float128 __extendhftf2(_Float16 f) {
+  return __extendhfsf2(f);  // result converts implicitly to _Float128
+}
+#endif
+
+_Float16 __truncsfhf2(_Float32 f) {  // narrows _Float32 to binary16
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || \
+    defined(__GNUC__) && !defined(__STRICT_ANSI__)
+  const _Float32 scale_to_inf = 0x1.0p+112f;
+  const _Float32 scale_to_zero = 0x1.0p-110f;
+#else
+  const _Float32 scale_to_inf = tofloat32(0x77800000u);   // 2^112
+  const _Float32 scale_to_zero = tofloat32(0x08800000u);  // 2^-110
+#endif
+  _Float32 base = (fabs32(f) * scale_to_inf) * scale_to_zero;
+  const uint32_t w = fromfloat32(f);
+  const uint32_t shl1_w = w + w;  // shifts the sign bit out (wraps mod 2^32)
+  const uint32_t sign = w & 0x80000000u;
+  uint32_t bias = shl1_w & 0xFF000000u;  // the 8 exponent bits of f
+  if (bias < 0x71000000u)
+    bias = 0x71000000u;  // clamp the exponent from below
+  base = tofloat32((bias >> 1) + 0x07800000u) + base;
+  const uint32_t bits = fromfloat32(base);
+  const uint32_t exp_bits = (bits >> 13) & 0x00007C00u;
+  const uint32_t mantissa_bits = bits & 0x00000FFFu;
+  const uint32_t nonsign = exp_bits + mantissa_bits;  // unused when f is nan
+  return tofloat16((sign >> 16) | (shl1_w > 0xFF000000u ? 0x7E00u : nonsign));
+}
+
+_Float16 __truncdfhf2(_Float64 f) {
+  return __truncsfhf2(f);  // NOTE(review): goes via _Float32; double rounding?
+}
+
+#ifdef __x86_64__
+_Float16 __truncxfhf2(__float80 f) {
+  return __truncsfhf2(f);  // NOTE(review): goes via _Float32; double rounding?
+}
+#endif
+
+#ifdef __aarch64__
+_Float16 __trunctfhf2(_Float128 f) {
+  return __truncsfhf2(f);  // NOTE(review): goes via _Float32; double rounding?
 }
+#endif
diff --git a/libc/intrin/truncdfbf2.c b/libc/intrin/truncdfbf2.c
deleted file mode 100644
index 65dfff08c73..00000000000
--- a/libc/intrin/truncdfbf2.c
+++ /dev/null
@@ -1,24 +0,0 @@
-/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
-│ vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8                               :vi │
-╞══════════════════════════════════════════════════════════════════════════════╡
-│ Copyright 2024 Justine Alexandra Roberts Tunney                              │
-│                                                                              │
-│ Permission to use, copy, modify, and/or distribute this software for         │
-│ any purpose with or without fee is hereby granted, provided that the         │
-│ above copyright notice and this permission notice appear in all copies.      │
-│                                                                              │
-│ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL                │
-│ WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED                │
-│ WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE             │
-│ AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL         │
-│ DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR        │
-│ PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER               │
-│ TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR             │
-│ PERFORMANCE OF THIS SOFTWARE.                                                │
-╚─────────────────────────────────────────────────────────────────────────────*/
-
-__bf16 __truncsfbf2(float);
-__bf16 __truncdfbf2(double f) {
-  // TODO(jart): What else are we supposed to do here?
-  return __truncsfbf2(f);
-}
diff --git a/third_party/compiler_rt/trunctfsf2.c b/libc/intrin/trunctfsf2.c
similarity index 89%
rename from third_party/compiler_rt/trunctfsf2.c
rename to libc/intrin/trunctfsf2.c
index 3ebda815138..bbb961dfe7b 100644
--- a/third_party/compiler_rt/trunctfsf2.c
+++ b/libc/intrin/trunctfsf2.c
@@ -7,8 +7,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-__static_yoink("huge_compiler_rt_license");
-
 #define QUAD_PRECISION
 #include "third_party/compiler_rt/fp_lib.inc"
 
@@ -18,7 +16,7 @@ __static_yoink("huge_compiler_rt_license");
 #include "third_party/compiler_rt/fp_trunc_impl.inc"
 
 COMPILER_RT_ABI float __trunctfsf2(long double a) {
-    return __truncXfYf2__(a);
+  return __truncXfYf2__(a);
 }
 
 #endif
diff --git a/third_party/compiler_rt/extendhfdf2.c b/third_party/compiler_rt/extendhfdf2.c
deleted file mode 100644
index 729eb04c1a2..00000000000
--- a/third_party/compiler_rt/extendhfdf2.c
+++ /dev/null
@@ -1,17 +0,0 @@
-//===-- lib/extendhfdf2.c - half -> dubble conversion -------------*- C -*-===//
-//
-//                The Cosmopolitan Compiler Infrastructure
-//
-// This file is dual licensed under the MIT and the University of Illinois Open
-// Source Licenses. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-
-#define SRC_HALF
-#define DST_DOUBLE
-#include "third_party/compiler_rt/fp16_extend_impl.inc"
-
-COMPILER_RT_ABI dst_t __extendhfdf2(src_t a) {
-    return __extendXfYf2__(a);
-}
diff --git a/third_party/compiler_rt/extendhfsf2.c b/third_party/compiler_rt/extendhfsf2.c
deleted file mode 100644
index f891d95420e..00000000000
--- a/third_party/compiler_rt/extendhfsf2.c
+++ /dev/null
@@ -1,27 +0,0 @@
-//===-- lib/extendhfsf2.c - half -> single conversion -------------*- C -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#define SRC_HALF
-#define DST_SINGLE
-#include "fp16_extend_impl.inc"
-
-// Use a forwarding definition and noinline to implement a poor man's alias,
-// as there isn't a good cross-platform way of defining one.
-COMPILER_RT_ABI NOINLINE float __extendhfsf2(src_t a) {
-  return __extendXfYf2__(a);
-}
-
-COMPILER_RT_ABI float __gnu_h2f_ieee(src_t a) { return __extendhfsf2(a); }
-
-#if defined(__ARM_EABI__)
-#if defined(COMPILER_RT_ARMHF_TARGET)
-AEABI_RTABI float __aeabi_h2f(src_t a) { return __extendhfsf2(a); }
-#else
-COMPILER_RT_ALIAS(__extendhfsf2, __aeabi_h2f)
-#endif
-#endif
diff --git a/third_party/compiler_rt/truncdfhf2.c b/third_party/compiler_rt/truncdfhf2.c
deleted file mode 100644
index 9a01e2c2e1e..00000000000
--- a/third_party/compiler_rt/truncdfhf2.c
+++ /dev/null
@@ -1,21 +0,0 @@
-//===-- lib/truncdfhf2.c - double -> half conversion --------------*- C -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#define SRC_DOUBLE
-#define DST_HALF
-#include "fp16_trunc_impl.inc"
-
-COMPILER_RT_ABI dst_t __truncdfhf2(double a) { return __truncXfYf2__(a); }
-
-#if defined(__ARM_EABI__)
-#if defined(COMPILER_RT_ARMHF_TARGET)
-AEABI_RTABI dst_t __aeabi_d2h(double a) { return __truncdfhf2(a); }
-#else
-COMPILER_RT_ALIAS(__truncdfhf2, __aeabi_d2h)
-#endif
-#endif
diff --git a/third_party/compiler_rt/truncsfhf2.c b/third_party/compiler_rt/truncsfhf2.c
deleted file mode 100644
index d15e1884f23..00000000000
--- a/third_party/compiler_rt/truncsfhf2.c
+++ /dev/null
@@ -1,27 +0,0 @@
-//===-- lib/truncsfhf2.c - single -> half conversion --------------*- C -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#define SRC_SINGLE
-#define DST_HALF
-#include "fp16_trunc_impl.inc"
-
-// Use a forwarding definition and noinline to implement a poor man's alias,
-// as there isn't a good cross-platform way of defining one.
-COMPILER_RT_ABI NOINLINE dst_t __truncsfhf2(float a) {
-  return __truncXfYf2__(a);
-}
-
-COMPILER_RT_ABI dst_t __gnu_f2h_ieee(float a) { return __truncsfhf2(a); }
-
-#if defined(__ARM_EABI__)
-#if defined(COMPILER_RT_ARMHF_TARGET)
-AEABI_RTABI dst_t __aeabi_f2h(float a) { return __truncsfhf2(a); }
-#else
-COMPILER_RT_ALIAS(__truncsfhf2, __aeabi_f2h)
-#endif
-#endif