Skip to content

Commit

Permalink
Check in ruler summation experiments
Browse files Browse the repository at this point in the history
  • Loading branch information
jart committed Jul 30, 2024
1 parent 3dab207 commit 8cdb3e1
Show file tree
Hide file tree
Showing 3 changed files with 474 additions and 54 deletions.
22 changes: 17 additions & 5 deletions test/libc/tinymath/BUILD.mk
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,17 @@

PKGS += TEST_LIBC_TINYMATH

TEST_LIBC_TINYMATH_SRCS := $(wildcard test/libc/tinymath/*.c)
TEST_LIBC_TINYMATH_SRCS_C := $(wildcard test/libc/tinymath/*.c)
TEST_LIBC_TINYMATH_SRCS_CC := $(wildcard test/libc/tinymath/*.cc)
TEST_LIBC_TINYMATH_SRCS_TEST = $(filter %_test.c,$(TEST_LIBC_TINYMATH_SRCS))

TEST_LIBC_TINYMATH_SRCS = \
$(TEST_LIBC_TINYMATH_SRCS_C:%.c=o/$(MODE)/%.o) \
$(TEST_LIBC_TINYMATH_SRCS_CC:%.cc=o/$(MODE)/%.o)

TEST_LIBC_TINYMATH_OBJS = \
$(TEST_LIBC_TINYMATH_SRCS:%.c=o/$(MODE)/%.o)
$(TEST_LIBC_TINYMATH_SRCS_C:%.c=o/$(MODE)/%.o) \
$(TEST_LIBC_TINYMATH_SRCS_CC:%.cc=o/$(MODE)/%.o)

TEST_LIBC_TINYMATH_COMS = \
$(TEST_LIBC_TINYMATH_SRCS:%.c=o/$(MODE)/%)
Expand All @@ -26,19 +32,21 @@ TEST_LIBC_TINYMATH_DIRECTDEPS = \
LIBC_CALLS \
LIBC_FMT \
LIBC_INTRIN \
LIBC_LOG \
LIBC_MEM \
LIBC_NEXGEN32E \
LIBC_STDIO \
LIBC_RUNTIME \
LIBC_STDIO \
LIBC_STR \
LIBC_SYSV \
LIBC_TESTLIB \
LIBC_TINYMATH \
LIBC_X \
THIRD_PARTY_COMPILER_RT \
THIRD_PARTY_GDTOA \
THIRD_PARTY_COMPILER_RT \
THIRD_PARTY_DOUBLECONVERSION
THIRD_PARTY_DOUBLECONVERSION \
THIRD_PARTY_GDTOA \
THIRD_PARTY_LIBCXX \

TEST_LIBC_TINYMATH_DEPS := \
$(call uniq,$(foreach x,$(TEST_LIBC_TINYMATH_DIRECTDEPS),$($(x))))
Expand All @@ -60,6 +68,10 @@ $(TEST_LIBC_TINYMATH_OBJS): private \
CFLAGS += \
-fno-builtin

$(TEST_LIBC_TINYMATH_OBJS): private \
CXXFLAGS += \
#-ffast-math

.PHONY: o/$(MODE)/test/libc/tinymath
o/$(MODE)/test/libc/tinymath: \
$(TEST_LIBC_TINYMATH_BINS) \
Expand Down
289 changes: 289 additions & 0 deletions test/libc/tinymath/fdot_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,289 @@
#include "libc/assert.h"
#include "libc/calls/struct/timespec.h"
#include "libc/intrin/bsr.h"
#include "libc/macros.internal.h"
#include "libc/math.h"
#include "libc/mem/gc.h"
#include "libc/mem/leaks.h"
#include "libc/mem/mem.h"
#include "libc/runtime/runtime.h"
#include "libc/stdio/stdio.h"
#include "libc/x/xasprintf.h"

#define EXPENSIVE_TESTS 0

#define CHUNK 8

#define FASTMATH __attribute__((__optimize__("-O3,-ffast-math")))
#define PORTABLE __target_clones("avx512f,avx")

static unsigned long long lcg = 1;

int rand32(void) {
/* Knuth, D.E., "The Art of Computer Programming," Vol 2,
Seminumerical Algorithms, Third Edition, Addison-Wesley, 1998,
p. 106 (line 26) & p. 108 */
lcg *= 6364136223846793005;
lcg += 1442695040888963407;
return lcg >> 32;
}

float float01(unsigned x) { // (0,1)
return 1.f / 8388608 * ((x >> 9) + .5f);
}

float numba(void) { // (-1,1)
return float01(rand32()) * 2 - 1;
}

PORTABLE float fdotf_dubble(const float *A, const float *B, size_t n) {
double s = 0;
for (size_t i = 0; i < n; ++i)
s = fma(A[i], B[i], s);
return s;
}

float fdotf_kahan(const float *A, const float *B, size_t n) {
size_t i;
float err, sum, t, y;
sum = err = 0;
for (i = 0; i < n; ++i) {
y = A[i] * B[i] - err;
t = sum + y;
err = (t - sum) - y;
sum = t;
}
return sum;
}

float fdotf_naive(const float *A, const float *B, size_t n) {
float s = 0;
for (size_t i = 0; i < n; ++i)
s = fmaf(A[i], B[i], s);
return s;
}

#define fdotf_naive_tester(A, B, n, tol) \
do { \
float err = fabsf(fdotf_naive(A, B, n) - fdotf_dubble(A, B, n)); \
if (err > tol) { \
printf("%s:%d: error: n=%zu failed %g\n", __FILE__, __LINE__, (size_t)n, \
err); \
exit(1); \
} \
} while (0)

void test_fdotf_naive(void) {
float *A = new float[2 * 1024 * 1024 + 1];
float *B = new float[2 * 1024 * 1024 + 1];
for (size_t i = 0; i < 2 * 1024 * 1024 + 1; ++i) {
A[i] = numba();
B[i] = numba();
}
for (size_t n = 0; n < 1024; ++n)
fdotf_naive_tester(A, B, n, 1e-4);
#if EXPENSIVE_TESTS
fdotf_naive_tester(A, B, 128 * 1024, 1e-2);
fdotf_naive_tester(A, B, 256 * 1024, 1e-2);
fdotf_naive_tester(A, B, 1024 * 1024, 1e-1);
fdotf_naive_tester(A, B, 1024 * 1024 - 1, 1e-1);
fdotf_naive_tester(A, B, 1024 * 1024 + 1, 1e-1);
fdotf_naive_tester(A, B, 2 * 1024 * 1024, 1e-1);
fdotf_naive_tester(A, B, 2 * 1024 * 1024 - 1, 1e-1);
fdotf_naive_tester(A, B, 2 * 1024 * 1024 + 1, 1e-1);
#endif
delete[] B;
delete[] A;
}

template <int N>
forceinline float hdot(const float *A, const float *B) {
return hdot<N / 2>(A, B) + hdot<N / 2>(A + N / 2, B + N / 2);
}

template <>
forceinline float hdot<1>(const float *A, const float *B) {
return A[0] * B[0];
}

float fdotf_recursive(const float *A, const float *B, size_t n) {
if (n > 32) {
float x, y;
x = fdotf_recursive(A, B, n / 2);
y = fdotf_recursive(A + n / 2, B + n / 2, n - n / 2);
return x + y;
} else {
float s;
size_t i;
for (s = i = 0; i < n; ++i)
s = fmaf(A[i], B[i], s);
return s;
}
}

FASTMATH float fdotf_ruler(const float *A, const float *B, size_t n) {
int rule, step = 2;
size_t chunk, sp = 0;
float stack[bsr(n / CHUNK + 1) + 1];
for (chunk = 0; chunk + CHUNK * 4 <= n; chunk += CHUNK * 4, step += 2) {
float sum = 0;
for (size_t elem = 0; elem < CHUNK * 4; ++elem)
sum += A[chunk + elem] * B[chunk + elem];
for (rule = bsr(step & -step); --rule;)
sum += stack[--sp];
stack[sp++] = sum;
}
float res = 0;
while (sp)
res += stack[--sp];
for (; chunk < n; ++chunk)
res += A[chunk] * B[chunk];
return res;
}

#define fdotf_ruler_tester(A, B, n, tol) \
do { \
float err = fabsf(fdotf_ruler(A, B, n) - fdotf_dubble(A, B, n)); \
if (err > tol) { \
printf("%s:%d: error: n=%zu failed %g\n", __FILE__, __LINE__, (size_t)n, \
err); \
exit(1); \
} \
} while (0)

void test_fdotf_ruler(void) {
float *A = new float[10 * 1024 * 1024 + 1];
float *B = new float[10 * 1024 * 1024 + 1];
for (size_t i = 0; i < 10 * 1024 * 1024 + 1; ++i) {
A[i] = numba();
B[i] = numba();
}
fdotf_ruler_tester(A, B, 96, 1e-6);
for (size_t n = 0; n < 4096; ++n)
fdotf_ruler_tester(A, B, n, 1e-5);
#if EXPENSIVE_TESTS
fdotf_ruler_tester(A, B, 128 * 1024, 1e-4);
fdotf_ruler_tester(A, B, 256 * 1024, 1e-4);
fdotf_ruler_tester(A, B, 1024 * 1024, 1e-3);
fdotf_ruler_tester(A, B, 1024 * 1024 - 1, 1e-3);
fdotf_ruler_tester(A, B, 1024 * 1024 + 1, 1e-3);
fdotf_ruler_tester(A, B, 2 * 1024 * 1024, 1e-3);
fdotf_ruler_tester(A, B, 2 * 1024 * 1024 - 1, 1e-3);
fdotf_ruler_tester(A, B, 2 * 1024 * 1024 + 1, 1e-3);
fdotf_ruler_tester(A, B, 8 * 1024 * 1024, 1e-3);
fdotf_ruler_tester(A, B, 10 * 1024 * 1024, 1e-3);
#endif
delete[] B;
delete[] A;
}

PORTABLE float fdotf_hefty(const float *A, const float *B, size_t n) {
unsigned i, par, len = 0;
float sum, res[n / CHUNK + 1];
for (res[0] = i = 0; i + CHUNK <= n; i += CHUNK)
res[len++] = hdot<CHUNK>(A + i, B + i);
if (i < n) {
for (sum = 0; i < n; i++)
sum = fmaf(A[i], B[i], sum);
res[len++] = sum;
}
for (par = len >> 1; par; par >>= 1, len >>= 1) {
for (i = 0; i < par; ++i)
res[i] += res[par + i];
if (len & 1)
res[par - 1] += res[len - 1];
}
return res[0];
}

#define fdotf_hefty_tester(A, B, n, tol) \
do { \
float err = fabsf(fdotf_hefty(A, B, n) - fdotf_dubble(A, B, n)); \
if (err > tol) { \
printf("%s:%d: error: n=%zu failed %g\n", __FILE__, __LINE__, (size_t)n, \
err); \
exit(1); \
} \
} while (0)

void test_fdotf_hefty(void) {
float *A = new float[10 * 1024 * 1024 + 1];
float *B = new float[10 * 1024 * 1024 + 1];
for (size_t i = 0; i < 10 * 1024 * 1024 + 1; ++i) {
A[i] = numba();
B[i] = numba();
}
for (size_t n = 0; n < 1024; ++n)
fdotf_hefty_tester(A, B, n, 1e-5);
#if EXPENSIVE_TESTS
fdotf_hefty_tester(A, B, 128 * 1024, 1e-4);
fdotf_hefty_tester(A, B, 256 * 1024, 1e-4);
fdotf_hefty_tester(A, B, 1024 * 1024, 1e-3);
fdotf_hefty_tester(A, B, 1024 * 1024 - 1, 1e-3);
fdotf_hefty_tester(A, B, 1024 * 1024 + 1, 1e-3);
fdotf_hefty_tester(A, B, 2 * 1024 * 1024, 1e-3);
fdotf_hefty_tester(A, B, 2 * 1024 * 1024 - 1, 1e-3);
fdotf_hefty_tester(A, B, 2 * 1024 * 1024 + 1, 1e-3);
fdotf_hefty_tester(A, B, 8 * 1024 * 1024, 1e-3);
fdotf_hefty_tester(A, B, 10 * 1024 * 1024, 1e-3);
#endif
delete[] B;
delete[] A;
}

float nothing(float x) {
return x;
}

float (*barrier)(float) = nothing;

#define BENCH(ITERATIONS, WORK_PER_RUN, CODE) \
do { \
struct timespec start = timespec_real(); \
for (int __i = 0; __i < ITERATIONS; ++__i) { \
asm volatile("" ::: "memory"); \
CODE; \
} \
long long work = (WORK_PER_RUN) * (ITERATIONS); \
long nanos = \
(timespec_tonanos(timespec_sub(timespec_real(), start)) + work - 1) / \
(double)work; \
printf("%8ld ns %2dx %s\n", nanos, (ITERATIONS), #CODE); \
} while (0)

int main() {
ShowCrashReports();

#if EXPENSIVE_TESTS
size_t n = 512 * 1024;
#else
size_t n = 1024;
#endif

float *A = new float[n];
float *B = new float[n];
for (size_t i = 0; i < n; ++i) {
A[i] = numba();
B[i] = numba();
}
float kahan, naive, dubble, recursive, hefty, ruler;
test_fdotf_naive();
test_fdotf_hefty();
test_fdotf_ruler();
BENCH(20, 1, (kahan = barrier(fdotf_kahan(A, B, n))));
BENCH(20, 1, (dubble = barrier(fdotf_dubble(A, B, n))));
BENCH(20, 1, (naive = barrier(fdotf_naive(A, B, n))));
BENCH(20, 1, (recursive = barrier(fdotf_recursive(A, B, n))));
BENCH(20, 1, (ruler = barrier(fdotf_ruler(A, B, n))));
BENCH(20, 1, (hefty = barrier(fdotf_hefty(A, B, n))));
printf("dubble = %f (%g)\n", dubble, fabs(dubble - dubble));
printf("kahan = %f (%g)\n", kahan, fabs(kahan - dubble));
printf("naive = %f (%g)\n", naive, fabs(naive - dubble));
printf("recursive = %f (%g)\n", recursive, fabs(recursive - dubble));
printf("ruler = %f (%g)\n", ruler, fabs(ruler - dubble));
printf("hefty = %f (%g)\n", hefty, fabs(hefty - dubble));
delete[] B;
delete[] A;

CheckForMemoryLeaks();
}
Loading

0 comments on commit 8cdb3e1

Please sign in to comment.