Skip to content

Commit

Permalink
Avoid legacy instruction penalties on x86
Browse files Browse the repository at this point in the history
  • Loading branch information
jart committed Jul 31, 2024
1 parent 1fba310 commit 8d8aecb
Show file tree
Hide file tree
Showing 16 changed files with 199 additions and 158 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -540,7 +540,7 @@ COSMOCC_HDRS = \
$(foreach x,$(COSMOCC_PKGS),$($(x)_HDRS)) \
$(foreach x,$(COSMOCC_PKGS),$($(x)_INCS))

o/cosmocc.h.txt: Makefile
o/cosmocc.h.txt: Makefile libc $(MAKEFILES) $(call uniq,$(foreach x,$(HDRS) $(INCS),$(dir $(x)))) $(HDRS) $(INCS)
$(file >$@, $(call uniq,$(COSMOCC_HDRS)))

COSMOPOLITAN_H_ROOT_HDRS = \
Expand Down
60 changes: 60 additions & 0 deletions libc/calls/BUILD.mk
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,66 @@ o/$(MODE)/libc/calls/sigcrashsig.o: private \
CFLAGS += \
-Os

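# avoid legacy sse decoding penalty on avx systems: the objects below
# are compiled with -mgeneral-regs-only, which restricts the compiler
# to general-purpose registers so it emits no sse code for them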
o//libc/calls/cfmakeraw.o \
o//libc/calls/clock_gettime-xnu.o \
o//libc/calls/CPU_AND.o \
o//libc/calls/CPU_OR.o \
o//libc/calls/CPU_XOR.o \
o//libc/calls/dl_iterate_phdr.o \
o//libc/calls/dup-nt.o \
o//libc/calls/fcntl-nt.o \
o//libc/calls/flock-nt.o \
o//libc/calls/fstatfs-nt.o \
o//libc/calls/fstat-nt.o \
o//libc/calls/futimesat.o \
o//libc/calls/futimes.o \
o//libc/calls/getrlimit.o \
o//libc/calls/gettimeofday.o \
o//libc/calls/ioctl.o \
o//libc/calls/lutimes.o \
o//libc/calls/metaflock.o \
o//libc/calls/ntaccesscheck.o \
o//libc/calls/ntspawn.o \
o//libc/calls/open-nt.o \
o//libc/calls/pledge-linux.o \
o//libc/calls/ppoll.o \
o//libc/calls/preadv.o \
o//libc/calls/pselect.o \
o//libc/calls/pwritev.o \
o//libc/calls/read-nt.o \
o//libc/calls/readv.o \
o//libc/calls/readwrite-nt.o \
o//libc/calls/releasefd.o \
o//libc/calls/select.o \
o//libc/calls/sigaction.o \
o//libc/calls/sigenter-freebsd.o \
o//libc/calls/sigenter-netbsd.o \
o//libc/calls/sigenter-openbsd.o \
o//libc/calls/sigenter-xnu.o \
o//libc/calls/sigignore.o \
o//libc/calls/siginfo2cosmo.o \
o//libc/calls/signal.o \
o//libc/calls/sig.o \
o//libc/calls/sigtimedwait.o \
o//libc/calls/stat2cosmo.o \
o//libc/calls/statfs2cosmo.o \
o//libc/calls/statfs2statvfs.o \
o//libc/calls/tcgetattr-nt.o \
o//libc/calls/tcgetattr.o \
o//libc/calls/tcgetwinsize-nt.o \
o//libc/calls/tcsetattr-nt.o \
o//libc/calls/tcsetwinsize-nt.o \
o//libc/calls/termios2host.o \
o//libc/calls/timespec_sleep.o \
o//libc/calls/uname.o \
o//libc/calls/utimensat-old.o \
o//libc/calls/utimes.o \
o//libc/calls/winexec.o \
o//libc/calls/writev.o: private \
COPTS += \
-mgeneral-regs-only

# these assembly files are safe to build on aarch64
o/$(MODE)/libc/calls/getcontext.o: libc/calls/getcontext.S
@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
Expand Down
8 changes: 8 additions & 0 deletions libc/intrin/BUILD.mk
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,14 @@ o/$(MODE)/libc/intrin/x86.o: private \
-fpatchable-function-entry=0 \
-Os

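# avoid the legacy sse decoding penalty on avx systems: the objects
# below are compiled with -mgeneral-regs-only, which restricts the
# compiler to general-purpose registers so it emits no sse code for them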
o//libc/intrin/dll.o \
o//libc/intrin/fds.o \
o//libc/intrin/mmap.o \
o//libc/intrin/demangle.o: private \
CFLAGS += \
-mgeneral-regs-only

# these assembly files are safe to build on aarch64
o/$(MODE)/libc/intrin/aarch64/%.o: libc/intrin/aarch64/%.S
@$(COMPILE) -AOBJECTIFY.S $(OBJECTIFY.S) $(OUTPUT_OPTION) -c $<
Expand Down
25 changes: 0 additions & 25 deletions libc/str/uselocale.c

This file was deleted.

1 change: 1 addition & 0 deletions libc/testlib/BUILD.mk
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ LIBC_TESTLIB_A_ASSETS = \
LIBC_TESTLIB_A_HDRS = \
libc/testlib/aspect.internal.h \
libc/testlib/bench.h \
libc/testlib/benchmark.h \
libc/testlib/blocktronics.h \
libc/testlib/ezbench.h \
libc/testlib/fastrandomstring.h \
Expand Down
26 changes: 26 additions & 0 deletions libc/testlib/benchmark.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#ifndef COSMOPOLITAN_LIBC_TESTLIB_BENCHMARK_H_
#define COSMOPOLITAN_LIBC_TESTLIB_BENCHMARK_H_
#include "libc/calls/struct/timespec.h"
#include "libc/stdio/stdio.h"
COSMOPOLITAN_C_START_

/**
 * Times a snippet of code and prints its average cost to stdout.
 *
 * CODE is run ITERATIONS times back to back. An empty asm statement
 * with a "memory" clobber is issued before each run, so the compiler
 * can't keep memory values cached in registers from one run to the
 * next. The elapsed wall time is then divided by the total work, i.e.
 * WORK_PER_RUN * ITERATIONS, where a WORK_PER_RUN of zero counts as
 * one. A total that still isn't positive (e.g. zero ITERATIONS) is
 * clamped to one so the divisor can never be zero.
 *
 * Results under 1000 ns are printed with `%g`; anything larger is
 * printed as a whole number of nanoseconds. Note that `work - 1` is
 * added to the elapsed time before dividing, which nudges the
 * reported figure upward by just under one nanosecond.
 *
 * All locals carry a double underscore prefix so they can't capture
 * identifiers appearing in the macro arguments, e.g. a caller passing
 * a variable named `work` or `start`.
 *
 * @param ITERATIONS is number of times CODE is run; it's evaluated
 *     exactly once and converted to int
 * @param WORK_PER_RUN is how many units of work (e.g. bytes) a single
 *     run performs; it's evaluated exactly once, after the loop
 * @param CODE is statement or expression being measured; its source
 *     text is printed as the label of the result line
 */
#define BENCHMARK(ITERATIONS, WORK_PER_RUN, CODE)                            \
  do {                                                                       \
    int __n = (ITERATIONS);                                                  \
    struct timespec __start = timespec_real();                               \
    for (int __i = 0; __i < __n; ++__i) {                                    \
      __asm__ __volatile__("" ::: "memory");                                 \
      CODE;                                                                  \
    }                                                                        \
    long long __per = (WORK_PER_RUN);                                        \
    long long __work = (__per ? __per : 1) * __n;                            \
    if (__work < 1) {                                                        \
      __work = 1;                                                            \
    }                                                                        \
    double __nanos =                                                         \
        (timespec_tonanos(timespec_sub(timespec_real(), __start)) + __work - \
         1) /                                                                \
        (double)__work;                                                      \
    if (__nanos < 1000) {                                                    \
      printf("%10g ns %2dx %s\n", __nanos, __n, #CODE);                      \
    } else {                                                                 \
      printf("%10lld ns %2dx %s\n", (long long)__nanos, __n, #CODE);         \
    }                                                                        \
  } while (0)

COSMOPOLITAN_C_END_
#endif /* COSMOPOLITAN_LIBC_TESTLIB_BENCHMARK_H_ */
24 changes: 5 additions & 19 deletions test/ctl/set_bench.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,26 +22,12 @@
#include "libc/mem/leaks.h"
#include "libc/stdio/stdio.h"
#include "libc/sysv/consts/rusage.h"
#include "libc/testlib/benchmark.h"

// #include <set>
// #define ctl std
// #define check() size()

// Runs CODE ITERATIONS times and prints the average nanoseconds per
// unit of work, where total work is WORK_PER_RUN * ITERATIONS. An
// empty asm statement with a "memory" clobber precedes every run.
//
// Caveats visible in the expansion:
// - the total work is used as a divisor, so WORK_PER_RUN and
//   ITERATIONS must both be nonzero
// - ITERATIONS is re-evaluated on every loop pass and once more for
//   printf, so it should be free of side effects
// - the locals `start`, `work` and `nanos` are declared inside the
//   block, so arguments shouldn't refer to outer variables that have
//   those names
#define BENCH(ITERATIONS, WORK_PER_RUN, CODE) \
do { \
struct timespec start = timespec_real(); \
for (int __i = 0; __i < ITERATIONS; ++__i) { \
asm volatile("" ::: "memory"); \
CODE; \
} \
long long work = (WORK_PER_RUN) * (ITERATIONS); \
double nanos = \
(timespec_tonanos(timespec_sub(timespec_real(), start)) + work - \
1) / \
(double)work; \
printf("%10g ns %2dx %s\n", nanos, (ITERATIONS), #CODE); \
} while (0)

int
rand32(void)
{
Expand All @@ -68,19 +54,19 @@ main()
{
long x = 0;
ctl::set<long> s;
BENCH(1000000, 1, s.insert(rand32() % 1000000));
BENCHMARK(1000000, 1, s.insert(rand32() % 1000000));
// s.check();
BENCH(1000000, 1, {
BENCHMARK(1000000, 1, {
auto i = s.find(rand32() % 1000000);
if (i != s.end())
x += *i;
});
BENCH(1000000, 1, {
BENCHMARK(1000000, 1, {
auto i = s.lower_bound(rand32() % 1000000);
if (i != s.end())
x += *i;
});
BENCH(1000000, 1, s.erase(rand32() % 1000000));
BENCHMARK(1000000, 1, s.erase(rand32() % 1000000));
eat(x);
}

Expand Down
56 changes: 21 additions & 35 deletions test/ctl/string_bench.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,27 +20,13 @@
#include "ctl/utility.h"
#include "libc/dce.h"
#include "libc/mem/leaks.h"
#include "libc/testlib/benchmark.h"

#include "libc/calls/struct/timespec.h"
#include "libc/runtime/runtime.h"
#include "libc/stdio/stdio.h"
#include "libc/str/str.h"

// Runs CODE ITERATIONS times and prints the average nanoseconds per
// unit of work, where total work is WORK_PER_RUN * ITERATIONS. An
// empty asm statement with a "memory" clobber precedes every run.
//
// Caveats visible in the expansion:
// - the total work is used as a divisor, so WORK_PER_RUN and
//   ITERATIONS must both be nonzero
// - ITERATIONS is re-evaluated on every loop pass and once more for
//   printf, so it should be free of side effects
// - the locals `start`, `work` and `nanos` are declared inside the
//   block, so arguments shouldn't refer to outer variables that have
//   those names
#define BENCH(ITERATIONS, WORK_PER_RUN, CODE) \
do { \
struct timespec start = timespec_real(); \
for (int __i = 0; __i < ITERATIONS; ++__i) { \
asm volatile("" ::: "memory"); \
CODE; \
} \
long long work = (WORK_PER_RUN) * (ITERATIONS); \
double nanos = \
(timespec_tonanos(timespec_sub(timespec_real(), start)) + work - \
1) / \
(double)work; \
printf("%10g ns %2dx %s\n", nanos, (ITERATIONS), #CODE); \
} while (0)

const char* big_c = "aaaaaaaaaaaaaaaaaaaaaaaa";
const char* small_c = "aaaaaaaaaaaaaaaaaaaaaaa";

Expand All @@ -55,98 +41,98 @@ main()
{
const ctl::string_view big(big_c), small(small_c);

BENCH(ITERATIONS * 10, 1, {
BENCHMARK(ITERATIONS * 10, 1, {
ctl::string s;
s.append("hello ");
s.append("world");
});

BENCH(ITERATIONS, 8, {
BENCHMARK(ITERATIONS, 8, {
ctl::string s;
for (int i = 0; i < 8; ++i) {
s.append('a');
}
});

BENCH(ITERATIONS, 16, {
BENCHMARK(ITERATIONS, 16, {
ctl::string s;
for (int i = 0; i < 16; ++i) {
s.append('a');
}
});

BENCH(ITERATIONS, 23, {
BENCHMARK(ITERATIONS, 23, {
ctl::string s;
for (int i = 0; i < 23; ++i) {
s.append('a');
}
});

BENCH(ITERATIONS, 24, {
BENCHMARK(ITERATIONS, 24, {
ctl::string s;
for (int i = 0; i < 24; ++i) {
s.append('a');
}
});

BENCH(ITERATIONS, 32, {
BENCHMARK(ITERATIONS, 32, {
ctl::string s;
for (int i = 0; i < 32; ++i) {
s.append('a');
}
});

BENCH(ITERATIONS, 1, { ctl::string s(small_c); });
BENCHMARK(ITERATIONS, 1, { ctl::string s(small_c); });

BENCH(ITERATIONS, 1, { ctl::string s(small); });
BENCHMARK(ITERATIONS, 1, { ctl::string s(small); });

{
ctl::string small_copy("hello world");
BENCH(ITERATIONS, 1, { ctl::string s2(small_copy); });
BENCHMARK(ITERATIONS, 1, { ctl::string s2(small_copy); });
}

BENCH(ITERATIONS, 1, {
BENCHMARK(ITERATIONS, 1, {
ctl::string s(small);
ctl::string s2(ctl::move(s));
});

BENCH(ITERATIONS, 1, {
BENCHMARK(ITERATIONS, 1, {
ctl::string s(small);
ctl::string s2(s);
});

BENCH(ITERATIONS, 1, { ctl::string s(big_c); });
BENCHMARK(ITERATIONS, 1, { ctl::string s(big_c); });

BENCH(ITERATIONS, 1, { ctl::string s(big); });
BENCHMARK(ITERATIONS, 1, { ctl::string s(big); });

{
ctl::string big_copy(big);
BENCH(ITERATIONS, 1, { ctl::string s2(big_copy); });
BENCHMARK(ITERATIONS, 1, { ctl::string s2(big_copy); });
}

BENCH(ITERATIONS, 1, {
BENCHMARK(ITERATIONS, 1, {
ctl::string s(big);
ctl::string s2(ctl::move(s));
});

BENCH(ITERATIONS, 1, {
BENCHMARK(ITERATIONS, 1, {
ctl::string s(big);
ctl::string s2(s);
});

BENCH(ITERATIONS, 1, { ctl::string s(23, 'a'); });
BENCHMARK(ITERATIONS, 1, { ctl::string s(23, 'a'); });

BENCH(ITERATIONS, 1, { ctl::string s(24, 'a'); });
BENCHMARK(ITERATIONS, 1, { ctl::string s(24, 'a'); });

{
ctl::string s(5, 'a');
BENCH(ITERATIONS, 1, { ctl::string_view s2(s); });
BENCHMARK(ITERATIONS, 1, { ctl::string_view s2(s); });
}

{
ctl::string big_trunc(48, 'a');
big_trunc.resize(4);
BENCH(ITERATIONS, 1, { ctl::string s(big_trunc); });
BENCHMARK(ITERATIONS, 1, { ctl::string s(big_trunc); });
}

CheckForMemoryLeaks();
Expand Down
26 changes: 14 additions & 12 deletions test/libc/str/blake2_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,13 @@
╚─────────────────────────────────────────────────────────────────────────────*/
#include "libc/str/blake2.h"
#include "libc/assert.h"
#include "libc/calls/struct/timespec.h"
#include "libc/mem/mem.h"
#include "libc/stdio/rand.h"
#include "libc/stdio/stdio.h"
#include "libc/str/str.h"
#include "libc/str/tab.internal.h"
#include "libc/testlib/ezbench.h"
#include "libc/testlib/benchmark.h"
#include "libc/testlib/hyperion.h"
#include "libc/testlib/testlib.h"

Expand Down Expand Up @@ -90,17 +91,18 @@ TEST(BLAKE2B256Test, vectors) {
free(line);
}

BENCH(blake2, bench) {
BENCH(blake2, benchmark) {
char fun[256];
rngset(fun, 256, _rand64, -1);
EZBENCH_N("blake2b256", 0, EZBLAKE2B256(0, 0));
EZBENCH_N("blake2b256", 8, EZBLAKE2B256("helloooo", 8));
EZBENCH_N("blake2b256", 31, EZBLAKE2B256(fun, 31));
EZBENCH_N("blake2b256", 32, EZBLAKE2B256(fun, 32));
EZBENCH_N("blake2b256", 63, EZBLAKE2B256(fun, 63));
EZBENCH_N("blake2b256", 64, EZBLAKE2B256(fun, 64));
EZBENCH_N("blake2b256", 128, EZBLAKE2B256(fun, 128));
EZBENCH_N("blake2b256", 256, EZBLAKE2B256(fun, 256));
EZBENCH_N("blake2b256", kHyperionSize,
EZBLAKE2B256(kHyperion, kHyperionSize));
BENCHMARK(100, 0, __expropriate(EZBLAKE2B256(0, 0)));
BENCHMARK(100, 1, __expropriate(EZBLAKE2B256("h", 1)));
BENCHMARK(100, 8, __expropriate(EZBLAKE2B256("helloooo", 8)));
BENCHMARK(100, 31, __expropriate(EZBLAKE2B256(fun, 31)));
BENCHMARK(100, 32, __expropriate(EZBLAKE2B256(fun, 32)));
BENCHMARK(100, 63, __expropriate(EZBLAKE2B256(fun, 63)));
BENCHMARK(100, 64, __expropriate(EZBLAKE2B256(fun, 64)));
BENCHMARK(100, 128, __expropriate(EZBLAKE2B256(fun, 128)));
BENCHMARK(100, 256, __expropriate(EZBLAKE2B256(fun, 256)));
BENCHMARK(100, kHyperionSize,
__expropriate(EZBLAKE2B256(kHyperion, kHyperionSize)));
}
Loading

0 comments on commit 8d8aecb

Please sign in to comment.