Skip to content

Commit

Permalink
i#1568 Mac64 TLS: Use pthread TLS slots for DR and clients (#3832)
Browse files Browse the repository at this point in the history
Uses pthread_key_create() to allocate enough contiguous and aligned TLS
slots to fit our os_local_state_t struct.  This makes it easier to share
Linux code for Mac64.

Keeps the scheme from ce8e803 of storing a pointer to the base of
os_local_state_t in TLS slot 6.  This is an indirection we no longer need now
that the entire os_local_state_t struct lives in TLS.  However, it is not
clear that we can take that many TLS slots from large applications, so I'm
leaving this mixture in place until we're sure which direction to go in.

Disables the options -mangle_app_seg and -safe_read_tls_init for Mac64.

Issue: #1568, #1979
  • Loading branch information
derekbruening authored Sep 14, 2019
1 parent aa69d08 commit 30a8d5a
Show file tree
Hide file tree
Showing 7 changed files with 195 additions and 22 deletions.
6 changes: 6 additions & 0 deletions core/lib/globals_shared.h
Original file line number Diff line number Diff line change
Expand Up @@ -517,6 +517,12 @@ typedef struct _instr_t instr_t;
# define IF_MACOS_(x)
#endif

/* IF_MACOS64(x) expands to x in MACOS64 builds and to nothing in all others. */
#if defined(MACOS64)
#    define IF_MACOS64(x) x
#else
#    define IF_MACOS64(x)
#endif

#ifdef HAVE_MEMINFO_QUERY
# define IF_MEMQUERY(x) x
# define IF_MEMQUERY_(x) x,
Expand Down
29 changes: 16 additions & 13 deletions core/optionsx.h
Original file line number Diff line number Diff line change
Expand Up @@ -520,7 +520,7 @@ OPTION_DEFAULT(bool, opt_jit, false, "optimize translation of dynamically genera
* It cannot be used with DGC_DIAGNOSTICS.
*/
OPTION_DEFAULT_INTERNAL(bool, mangle_app_seg,
IF_WINDOWS_ELSE(false, true),
IF_WINDOWS_ELSE(false, IF_LINUX_ELSE(true, false)),
"mangle application's segment usage.")
#endif /* X86 */
#ifdef X64
Expand Down Expand Up @@ -1631,6 +1631,20 @@ OPTION_DEFAULT(uint, early_inject_location, 4 /* INJECT_LOCATION_LdrDefault */,
#endif

/* These should be made internal when sufficiently tested */
#if defined(WINDOWS) || defined(MACOS64)
/* Preferred alignment of the TLS slots DR reserves for itself.
 * We mark as pcache-affecting though we have other explicit checks.
 */
PC_OPTION_DEFAULT(uint, tls_align,
IF_WINDOWS_ELSE(1 /* case 6770: for disabling alignment */, 0),
/* 0 - use processor cache line (the non-Windows, i.e. Mac64, default) */
/* 1, 2, 4 - no alignment
* 32 - Pentium III, Pentium M cache line
* 64 - Pentium 4 cache line
*/
/* XXX: if we ever change our Windows -tls_align default from 1 we should
* consider implications on platform-independence of persisted caches
*/
"TLS slots preferred alignment")
#endif
#ifdef WINDOWS
/* FIXME There's gotta be a better name for this. */
OPTION_DEFAULT(bool, ignore_syscalls_follow_sysenter, true,
Expand All @@ -1657,17 +1671,6 @@ OPTION_DEFAULT(uint, early_inject_location, 4 /* INJECT_LOCATION_LdrDefault */,
"use ignorable syscall classification for shared_syscalls")

/* We mark as pcache-affecting though we have other explicit checks */
PC_OPTION_DEFAULT(uint, tls_align, 1, /* case 6770: for disabling alignment */
/* 0 - use processor cache line */
/* 1, 2, 4 - no alignment
* 32 - Pentium III, Pentium M cache line
* 64 - Pentium 4 cache line
*/
/* FIXME: if we ever change our -tls_align default from 1 we should
* consider implications on platform-independence of persisted caches
*/
"TLS slots preferred alignment")
/* We mark as pcache-affecting though we have other explicit checks */
PC_OPTION_DEFAULT(uint, tls_flags, 1|2 /* TLS_FLAG_BITMAP_TOP_DOWN |
* TLS_FLAG_CACHE_LINE_START */,
"TLS allocation choices")
Expand All @@ -1679,7 +1682,7 @@ OPTION_DEFAULT(uint, early_inject_location, 4 /* INJECT_LOCATION_LdrDefault */,
* whether a thread's TLS is initialized yet, on x86.
* XXX: we plan to remove this once we're sure it's stable.
*/
OPTION_DEFAULT_INTERNAL(bool, safe_read_tls_init, true,
OPTION_DEFAULT_INTERNAL(bool, safe_read_tls_init, IF_LINUX_ELSE(true, false),
"use a safe read to identify uninit TLS")

OPTION_DEFAULT(bool, guard_pages, true, "add guard pages to our heap units")
Expand Down
37 changes: 31 additions & 6 deletions core/unix/os.c
Original file line number Diff line number Diff line change
Expand Up @@ -925,6 +925,9 @@ d_r_os_init(void)
if (!standalone_library)
d_r_rseq_init();
#endif
#ifdef MACOS64
tls_process_init();
#endif
}

/* called before any logfiles are opened */
Expand Down Expand Up @@ -1267,6 +1270,9 @@ find_stack_bottom()
void
os_slow_exit(void)
{
#ifdef MACOS64
tls_process_exit();
#endif
#ifdef LINUX
if (!standalone_library)
d_r_rseq_exit();
Expand Down Expand Up @@ -1395,6 +1401,10 @@ os_timeout(int time_in_milliseconds)
* glibc comments on THREAD_SELF.
*/
#ifdef MACOS64
/* For now we have both a directly-addressable os_local_state_t and a pointer to
* it in slot 6. If we settle on always doing the full os_local_state_t in slots,
* we would probably get rid of the indirection here and directly access slot fields.
*/
# define WRITE_TLS_SLOT_IMM(imm, var) \
IF_NOT_HAVE_TLS(ASSERT_NOT_REACHED()); \
ASSERT(sizeof(var) == sizeof(void *)); \
Expand Down Expand Up @@ -1532,6 +1542,11 @@ static bool
is_thread_tls_initialized(void)
{
#ifdef MACOS64
/* For now we have both a directly-addressable os_local_state_t and a pointer to
* it in slot 6. If we settle on always doing the full os_local_state_t in slots,
* we would probably get rid of the indirection here and directly read the magic
* field from its slot.
*/
byte **tls_swap_slot;
tls_swap_slot = (byte **)get_app_tls_swap_slot_addr();
if (tls_swap_slot == NULL || *tls_swap_slot == NULL ||
Expand Down Expand Up @@ -1666,7 +1681,7 @@ os_tls_offset(ushort tls_offs)
/* no ushort truncation issues b/c TLS_LOCAL_STATE_OFFSET is 0 */
IF_NOT_HAVE_TLS(ASSERT_NOT_REACHED());
ASSERT(TLS_LOCAL_STATE_OFFSET == 0);
return (TLS_LOCAL_STATE_OFFSET + tls_offs);
return (TLS_LOCAL_STATE_OFFSET + tls_offs IF_MACOS64(+tls_get_dr_offs()));
}

/* converts a segment offset to a local_state_t offset */
Expand All @@ -1676,7 +1691,7 @@ os_local_state_offset(ushort seg_offs)
/* no ushort truncation issues b/c TLS_LOCAL_STATE_OFFSET is 0 */
IF_NOT_HAVE_TLS(ASSERT_NOT_REACHED());
ASSERT(TLS_LOCAL_STATE_OFFSET == 0);
return (seg_offs - TLS_LOCAL_STATE_OFFSET);
return (seg_offs - TLS_LOCAL_STATE_OFFSET IF_MACOS64(-tls_get_dr_offs()));
}

/* XXX: Will return NULL if called before os_thread_init(), which sets
Expand Down Expand Up @@ -1827,7 +1842,8 @@ byte *
get_segment_base(uint seg)
{
#ifdef MACOS64
return (byte *)read_thread_register(seg);
ptr_uint_t *pthread_self = (ptr_uint_t *)read_thread_register(seg);
return (byte *)&pthread_self[SEG_TLS_BASE_OFFSET];
#elif defined(X86)
if (seg == SEG_CS || seg == SEG_SS || seg == SEG_DS || seg == SEG_ES)
return NULL;
Expand Down Expand Up @@ -2028,7 +2044,14 @@ os_tls_init(void)
* FIXME PR 205276: this whole scheme currently does not check if app is using
* segments need to watch modify_ldt syscall
*/
# ifdef MACOS64
/* Today we're allocating enough contiguous TLS slots to hold os_local_state_t.
* We also store a pointer to it in TLS slot 6.
*/
byte *segment = tls_get_dr_addr();
# else
byte *segment = heap_mmap(PAGE_SIZE, MEMPROT_READ | MEMPROT_WRITE, VMM_SPECIAL_MMAP);
# endif
os_local_state_t *os_tls = (os_local_state_t *)segment;

LOG(GLOBAL, LOG_THREADS, 1, "os_tls_init for thread " TIDFMT "\n",
Expand Down Expand Up @@ -2144,9 +2167,6 @@ os_tls_exit(local_state_t *local_state, bool other_thread)
static const ptr_uint_t zero = 0;
# endif /* X86 */
/* We can't read from fs: as we can be called from other threads */
/* ASSUMPTION: local_state_t is laid out at same start as local_state_extended_t */
os_local_state_t *os_tls =
(os_local_state_t *)(((byte *)local_state) - offsetof(os_local_state_t, state));
# if defined(X86) && !defined(MACOS64)
/* If the MSR is in use, writing to the reg faults. We rely on it being 0
* to indicate that.
Expand All @@ -2164,8 +2184,13 @@ os_tls_exit(local_state_t *local_state, bool other_thread)
if (!other_thread)
os_tls_thread_exit(local_state);

# ifndef MACOS64
/* We can't free prior to tls_thread_free() in case that routine refs os_tls */
/* ASSUMPTION: local_state_t is laid out at same start as local_state_extended_t */
os_local_state_t *os_tls =
(os_local_state_t *)(((byte *)local_state) - offsetof(os_local_state_t, state));
heap_munmap(os_tls->self, PAGE_SIZE, VMM_SPECIAL_MMAP);
# endif
#else
global_heap_free(tls_table, MAX_THREADS * sizeof(tls_slot_t) HEAPACCT(ACCT_OTHER));
DELETE_LOCK(tls_lock);
Expand Down
8 changes: 5 additions & 3 deletions core/unix/os_exports.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@
*/
#ifdef X86
# if defined(MACOS64)
# define SEG_TLS SEG_FS /* XXX: no way to set on MacOS 64-bit */
# define SEG_TLS SEG_GS /* DR is sharing the app's segment. */
# define LIB_SEG_TLS SEG_GS /* libc+loader tls */
# elif defined(X64)
# define SEG_TLS SEG_GS
Expand Down Expand Up @@ -122,8 +122,10 @@
* limited interoperability w/ code targeting the Windows x64 ABI. We steal slot 6
* for our own use.
*/
# define DR_TLS_BASE_OFFSET 34 /* offset from pthread_t struct to slot 6 */
# define DR_TLS_BASE_SLOT 6 /* the TLS slot for DR's TLS base */
# define SEG_TLS_BASE_OFFSET 28 /* offset from pthread_t struct to segment base */
# define DR_TLS_BASE_SLOT 6 /* the TLS slot for DR's TLS base */
/* offset from pthread_t struct to slot 6 */
# define DR_TLS_BASE_OFFSET (SEG_TLS_BASE_OFFSET + DR_TLS_BASE_SLOT)
#endif

#ifdef AARCHXX
Expand Down
12 changes: 12 additions & 0 deletions core/unix/tls.h
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,18 @@ get_dr_tls_base_addr(void);
#endif

#ifdef MACOS64
/* Process-wide TLS setup: reserves a contiguous, aligned run of pthread TLS
 * keys large enough to hold os_local_state_t.  Reports a fatal error and
 * exits if the keys cannot be obtained.
 */
void
tls_process_init(void);

/* Process-wide TLS teardown: deletes the pthread TLS keys reserved by
 * tls_process_init().
 */
void
tls_process_exit(void);

/* Returns the byte offset, relative to the address returned by
 * get_segment_base(TLS_REG_LIB), of the first TLS slot reserved by
 * tls_process_init().
 */
int
tls_get_dr_offs(void);

/* Returns the address of the first TLS slot reserved by tls_process_init(),
 * computed from the calling thread's segment base.
 */
byte *
tls_get_dr_addr(void);

byte **
get_app_tls_swap_slot_addr(void);
#endif
Expand Down
117 changes: 117 additions & 0 deletions core/unix/tls_macos.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
#include "tls.h"
#include <architecture/i386/table.h>
#include <i386/user_ldt.h>
#include <pthread.h>

#ifndef MACOS
# error Mac-only
Expand All @@ -64,6 +65,116 @@
static uint tls_app_index;

#ifdef X64
/* First key of the contiguous run of pthread TLS keys that holds
 * os_local_state_t.  Assigned in tls_process_init(); the byte offset of the
 * run from the TLS segment base is keys_start * sizeof(void *).
 */
static pthread_key_t keys_start;

/* Creates a single pthread TLS key with no destructor and returns it.
 * If pthread_key_create() fails, a fatal FAILED_TO_ALLOCATE_TLS error is
 * reported and the process exits.
 */
static pthread_key_t
tls_alloc_key(void)
{
    pthread_key_t new_key;
    int status = pthread_key_create(&new_key, NULL);
    if (status == 0)
        return new_key;
    REPORT_FATAL_ERROR_AND_EXIT(FAILED_TO_ALLOCATE_TLS, 3, get_application_name(),
                                get_application_pid(),
                                "System is out of slots or out of memory.");
    ASSERT_NOT_REACHED();
    return new_key;
}

void
tls_process_init(void)
{
/* Our strategy is to rely on libpthread and allocate directly-addressable
* slots using pthread_key_create(). Our initial implementation allocates
* enough to fit our entire os_local_state_t struct, to make Mac64 behave
* like Linux. If this proves to be too many slots taken from the app,
* we'll want to shift to a strategy like Windows where we only put
* local_state_extended_t in slots and have a separate DR allocation for our
* other data, pointed at by a TLS slot (one of these, or slot 6).
*/
int num_slots_needed = sizeof(os_local_state_t) / sizeof(void *);
byte *seg_base = get_segment_base(TLS_REG_LIB);
uint alignment;
if (DYNAMO_OPTION(tls_align) == 0) {
IF_X64(ASSERT_TRUNCATE(alignment, uint, proc_get_cache_line_size()));
alignment = (uint)proc_get_cache_line_size();
} else {
alignment = DYNAMO_OPTION(tls_align);
}
int i;
pthread_key_t delete_start = 0, delete_end = 0;
for (i = 0; i < alignment / sizeof(void *); i++) {
pthread_key_t key = tls_alloc_key();
if (ALIGNED(seg_base + key * sizeof(void *), alignment)) {
keys_start = key;
break;
}
if (i == 0)
delete_start = key;
delete_end = key;
}
if (keys_start == 0) {
REPORT_FATAL_ERROR_AND_EXIT(FAILED_TO_ALLOCATE_TLS, 3, get_application_name(),
get_application_pid(),
"Failed to find aligned slot.");
ASSERT_NOT_REACHED();
}
for (i = 1; i < num_slots_needed; i++) {
pthread_key_t key = tls_alloc_key();
if (key != keys_start + i) {
/* TODO i#1979: To support attach we'll need to keep looking for a
* contiguous range elsewhere in the TLS space, like we do on Windows,
* instead of assuming the first free set is big enough.
*/
REPORT_FATAL_ERROR_AND_EXIT(FAILED_TO_ALLOCATE_TLS, 3, get_application_name(),
get_application_pid(),
"Slots are not contiguous.");
ASSERT_NOT_REACHED();
}
}
if (delete_start > 0) {
for (pthread_key_t key = delete_start; key <= delete_end; key++) {
int res = pthread_key_delete(key);
ASSERT(res == 0); /* Can only fail with an invalid key. */
}
}
LOG(GLOBAL, LOG_THREADS, 1, "Reserved TLS keys %d-%d from base " PFX "\n", keys_start,
keys_start + num_slots_needed - 1, get_segment_base(TLS_REG_LIB));
/* Sanity check that the key is just an offset from the segment base. */
DODEBUG({
int seg_offs = keys_start * sizeof(void *);
ASSERT((ptr_int_t)pthread_getspecific(keys_start) == 0);
ASSERT(*(ptr_int_t *)(seg_base + seg_offs) == 0);
# define MAGIC_VALUE 0xdeadbeef12345678UL
int res = pthread_setspecific(keys_start, (void *)MAGIC_VALUE);
ASSERT(res == 0);
ASSERT((ptr_int_t)pthread_getspecific(keys_start) == MAGIC_VALUE);
ASSERT(*(ptr_int_t *)(seg_base + seg_offs) == MAGIC_VALUE);
});
}

/* Releases every pthread TLS key in the run reserved by tls_process_init(),
 * starting at keys_start and covering sizeof(os_local_state_t) bytes of slots.
 */
void
tls_process_exit(void)
{
    const int slot_count = sizeof(os_local_state_t) / sizeof(void *);
    int idx = 0;
    while (idx < slot_count) {
        int status = pthread_key_delete(keys_start + idx);
        ASSERT(status == 0); /* Can only fail with an invalid key. */
        idx++;
    }
}

/* Returns the byte offset of the first reserved TLS slot: each pthread key
 * indexes one pointer-sized slot, so the offset is the key scaled by the
 * pointer size.
 */
int
tls_get_dr_offs(void)
{
    return sizeof(void *) * keys_start;
}

/* Returns the address of the first reserved TLS slot: the TLS_REG_LIB segment
 * base of the calling thread advanced by keys_start pointer-sized slots.
 */
byte *
tls_get_dr_addr(void)
{
    return &get_segment_base(TLS_REG_LIB)[keys_start * sizeof(void *)];
}

byte **
get_app_tls_swap_slot_addr(void)
{
Expand All @@ -79,6 +190,10 @@ void
tls_thread_init(os_local_state_t *os_tls, byte *segment)
{
#ifdef X64
/* For now we have both a directly-addressable os_local_state_t and a pointer to
* it in slot 6. If we settle on always doing the full os_local_state_t in slots,
* we would probably get rid of the use of slot 6.
*/
byte **tls_swap_slot;
ASSERT((byte *)(os_tls->self) == segment);
tls_swap_slot = get_app_tls_swap_slot_addr();
Expand Down Expand Up @@ -178,6 +293,8 @@ tls_get_fs_gs_segment_base(uint seg)
byte *base;
int res;

IF_X64(ASSERT_NOT_REACHED()); /* Not used for x64. */

if (seg != SEG_FS && seg != SEG_GS)
return (byte *)POINTER_MAX;

Expand Down
8 changes: 8 additions & 0 deletions core/win32/events.mc
Original file line number Diff line number Diff line change
Expand Up @@ -678,4 +678,12 @@ Language=English
Application %1!s! (%2!s!). Restartable sequence behavior is not supported: %3!s!.
.
MessageId =
Severity = Error
Facility = DRCore
SymbolicName = MSG_FAILED_TO_ALLOCATE_TLS
Language=English
Application %1!s! (%2!s!). Unable to allocate TLS slots. %3!s!
.
;// ADD NEW MESSAGES HERE

0 comments on commit 30a8d5a

Please sign in to comment.