Skip to content

Commit

Permalink
[mono] Optimize startup vtable setup (dotnet#101312)
Browse files Browse the repository at this point in the history
* Add new [ptr, ptr] -> ptr simdhash variant for caching
* Cache mono_class_implement_interface_slow because we perform many redundant calls to it during application startup
* Verify cache in checked builds
  • Loading branch information
kg authored and michaelgsharp committed May 8, 2024
1 parent 6c90aae commit 485ed8e
Show file tree
Hide file tree
Showing 8 changed files with 196 additions and 9 deletions.
3 changes: 2 additions & 1 deletion src/mono/mono/metadata/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@ endif()
set(imported_native_sources
../../../native/containers/dn-simdhash.c
../../../native/containers/dn-simdhash-string-ptr.c
../../../native/containers/dn-simdhash-u32-ptr.c)
../../../native/containers/dn-simdhash-u32-ptr.c
../../../native/containers/dn-simdhash-ptrpair-ptr.c)

set(metadata_common_sources
appdomain.c
Expand Down
9 changes: 8 additions & 1 deletion src/mono/mono/metadata/class-setup-vtable.c
Original file line number Diff line number Diff line change
Expand Up @@ -773,6 +773,13 @@ mono_method_get_method_definition (MonoMethod *method)
static gboolean
verify_class_overrides (MonoClass *klass, MonoMethod **overrides, int onum)
{
// on windows and arm, we define NDEBUG for release builds
// on browser and wasi, we define DEBUG for debug builds
#ifdef ENABLE_CHECKED_BUILD
if (klass->image == mono_defaults.corlib)
return TRUE;
#endif

int i;

for (i = 0; i < onum; ++i) {
Expand Down Expand Up @@ -1760,7 +1767,7 @@ mono_class_setup_vtable_general (MonoClass *klass, MonoMethod **overrides, int o
MonoMethod *override = iface_overrides [i*2 + 1];
if (mono_class_is_gtd (override->klass)) {
override = mono_class_inflate_generic_method_full_checked (override, ic, mono_class_get_context (ic), error);
}
}
// there used to be code here to inflate decl if decl->is_inflated, but in https://github.com/dotnet/runtime/pull/64102#discussion_r790019545 we
// think that this does not correspond to any real code.
if (!apply_override (klass, ic, vtable, decl, override, &override_map, &override_class_map, &conflict_map))
Expand Down
116 changes: 111 additions & 5 deletions src/mono/mono/metadata/class.c
Original file line number Diff line number Diff line change
Expand Up @@ -4331,12 +4331,16 @@ mono_class_is_variant_compatible_slow (MonoClass *klass, MonoClass *oklass)
}
return TRUE;
}
/*Check if @candidate implements the interface @target*/

static gboolean
mono_class_implement_interface_slow (MonoClass *target, MonoClass *candidate)
mono_class_implement_interface_slow_cached (MonoClass *target, MonoClass *candidate, dn_simdhash_ptrpair_ptr_t *cache);

static gboolean
mono_class_implement_interface_slow_uncached (MonoClass *target, MonoClass *candidate, dn_simdhash_ptrpair_ptr_t *cache)
{
ERROR_DECL (error);
int i;

gboolean is_variant = mono_class_has_variant_generic_params (target);

if (is_variant && MONO_CLASS_IS_INTERFACE_INTERNAL (candidate)) {
Expand Down Expand Up @@ -4365,7 +4369,7 @@ mono_class_implement_interface_slow (MonoClass *target, MonoClass *candidate)
return TRUE;
if (is_variant && mono_class_is_variant_compatible_slow (target, iface_class))
return TRUE;
if (mono_class_implement_interface_slow (target, iface_class))
if (mono_class_implement_interface_slow_cached (target, iface_class, cache))
return TRUE;
}
}
Expand All @@ -4390,7 +4394,7 @@ mono_class_implement_interface_slow (MonoClass *target, MonoClass *candidate)
if (is_variant && mono_class_is_variant_compatible_slow (target, candidate_interfaces [i]))
return TRUE;

if (mono_class_implement_interface_slow (target, candidate_interfaces [i]))
if (mono_class_implement_interface_slow_cached (target, candidate_interfaces [i], cache))
return TRUE;
}
}
Expand All @@ -4400,6 +4404,107 @@ mono_class_implement_interface_slow (MonoClass *target, MonoClass *candidate)
return FALSE;
}

// Uncomment to collect and periodically print hit/miss statistics for the
// implement_interface scratch cache. Diagnostic aid only.
// #define LOG_INTERFACE_CACHE_HITS 1

#if LOG_INTERFACE_CACHE_HITS
// Running totals of cache lookups that were answered from the cache (hits)
// and lookups that had to be computed (misses).
static gint64 implement_interface_hits = 0, implement_interface_misses = 0;

// Prints the current hit rate and the cache's overflow count once every
// 500 recorded lookups; does nothing on all other calls.
static void
log_hit_rate (dn_simdhash_ptrpair_ptr_t *cache)
{
	gint64 total_calls = implement_interface_hits + implement_interface_misses;
	// Only report on every 500th lookup. Zero is also a multiple of 500, so
	// reject it explicitly to avoid computing a 0/0 hit rate.
	if (total_calls == 0 || (total_calls % 500) != 0)
		return;
	double hit_rate = implement_interface_hits * 100.0 / total_calls;
	// gint64 is not guaranteed to be 'long long' on every target, so convert
	// explicitly to the type that %lld expects.
	g_printf ("implement_interface cache hit rate: %f (%lld total calls). Overflow count: %u\n", hit_rate, (long long)total_calls, dn_simdhash_overflow_count (cache));
}
#endif

/*
 * Memoizing wrapper around mono_class_implement_interface_slow_uncached.
 * Results are stored in @cache keyed by the (target, candidate) pair; a
 * non-NULL stored value means TRUE and a NULL stored value means FALSE.
 * In checked builds a cached answer is always recomputed and compared against
 * the fresh result instead of being returned directly.
 */
static gboolean
mono_class_implement_interface_slow_cached (MonoClass *target, MonoClass *candidate, dn_simdhash_ptrpair_ptr_t *cache)
{
	// Skip the caching logic for exact matches
	if (target == candidate)
		return TRUE;

	dn_ptrpair_t lookup_key = { target, candidate };
	gpointer stored_value = NULL;
	gboolean cached_answer = 0;
	gboolean was_cached = dn_simdhash_ptrpair_ptr_try_get_value (cache, lookup_key, &stored_value);

	if (was_cached) {
		// Testing shows a cache hit rate of 60% on S.R.Tests and S.T.J.Tests,
		// and 40-50% for small app startup. Near-zero overflow count.
#if LOG_INTERFACE_CACHE_HITS
		implement_interface_hits++;
		log_hit_rate (cache);
#endif
		cached_answer = (stored_value != NULL);
#ifndef ENABLE_CHECKED_BUILD
		// Regular builds trust the cache and stop here
		return cached_answer;
#endif
	}

	gboolean computed_answer = mono_class_implement_interface_slow_uncached (target, candidate, cache);

	if (was_cached) {
		// Only reachable in checked builds: cross-check the cached answer
#ifdef ENABLE_CHECKED_BUILD
		if (cached_answer != computed_answer)
			g_print (
				"Cache mismatch for %s.%s and %s.%s: cached=%d, uncached=%d\n",
				m_class_get_name_space (target), m_class_get_name (target),
				m_class_get_name_space (candidate), m_class_get_name (candidate),
				cached_answer, computed_answer
			);
		g_assert (cached_answer == computed_answer);
#endif
	} else {
#if LOG_INTERFACE_CACHE_HITS
		implement_interface_misses++;
		log_hit_rate (cache);
#endif
		// Remember the fresh answer; a failed insert is simply ignored
		dn_simdhash_ptrpair_ptr_try_add (cache, lookup_key, computed_answer ? GUINT_TO_POINTER(1) : NULL);
	}

	return computed_answer;
}

// Shared scratch cache slot. A caller borrows the cache by swapping the slot
// to NULL and tries to put it back once it is finished.
static dn_simdhash_ptrpair_ptr_t *implement_interface_scratch_cache = NULL;

/*Check if @candidate implements the interface @target*/
static gboolean
mono_class_implement_interface_slow (MonoClass *target, MonoClass *candidate)
{
	volatile gpointer *slot = (volatile gpointer *)&implement_interface_scratch_cache;

	// Take whatever cache is currently parked in the slot, leaving NULL behind
	dn_simdhash_ptrpair_ptr_t *scratch = (dn_simdhash_ptrpair_ptr_t *)mono_atomic_xchg_ptr (slot, NULL);

	if (scratch == NULL) {
		// Roughly 64KB of memory usage and big enough to have fast lookups
		// Smaller is viable but makes the hit rate worse
		scratch = dn_simdhash_ptrpair_ptr_new (2048, NULL);
	} else if (dn_simdhash_count (scratch) >= 2250) {
		// FIXME: 2250 is arbitrary (roughly 256 11-item buckets w/load factor)
		// One step down reduces hit rate by approximately 2-4%
		// HACK: Only clear the scratch cache once it gets too big.
		// The pattern is that (especially during startup), we have lots
		// of mono_class_implement_interface_slow calls back to back that
		// perform similar checks, so keeping the cache data around between
		// sequential calls will potentially optimize them a lot.
		dn_simdhash_clear (scratch);
	}

	gboolean implements = mono_class_implement_interface_slow_cached (target, candidate, scratch);

	// Under most circumstances we won't have multiple threads competing to run implement_interface_slow,
	// so it's not worth making this thread-local and potentially keeping a cache instance around per-thread.
	// Park our cache in the slot if it is still empty; a non-NULL return is
	// treated as "slot already occupied", in which case our copy is discarded.
	if (mono_atomic_cas_ptr (slot, scratch, NULL) != NULL)
		dn_simdhash_free (scratch);

	return implements;
}

/*
* Check if @oklass can be assigned to @klass.
* This function does the same as mono_class_is_assignable_from_internal but is safe to be used from mono_class_init_internal context.
Expand All @@ -4416,8 +4521,9 @@ mono_class_is_assignable_from_slow (MonoClass *target, MonoClass *candidate)
return TRUE;

/*If target is not an interface there is no need to check them.*/
if (MONO_CLASS_IS_INTERFACE_INTERNAL (target))
if (MONO_CLASS_IS_INTERFACE_INTERNAL (target)) {
return mono_class_implement_interface_slow (target, candidate);
}

if (m_class_is_delegate (target) && mono_class_has_variant_generic_params (target))
return mono_class_is_variant_compatible (target, candidate, FALSE);
Expand Down
2 changes: 2 additions & 0 deletions src/native/containers/containers.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ list(APPEND SHARED_CONTAINER_SOURCES
# dn-simdhash-string-ptr.c
# dn-simdhash-u32-ptr.c
# dn-simdhash-ptr-ptr.c
# dn-simdhash-ght-compatible.c
# dn-simdhash-ptrpair-ptr.c
)

list(APPEND SHARED_CONTAINER_HEADERS
Expand Down
39 changes: 39 additions & 0 deletions src/native/containers/dn-simdhash-ptrpair-ptr.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

#include <config.h>
#include "dn-simdhash.h"

#include "dn-simdhash-utils.h"

// Composite key made of two raw pointers; used as the key type of the
// dn_simdhash_ptrpair_ptr specialization defined in this file.
typedef struct dn_ptrpair_t {
	void *first, *second;
} dn_ptrpair_t;

// Computes the 32-bit hash of a pointer pair: each half is hashed separately
// with MurmurHash3_32_ptr (second argument 0 for `first`, 1 for `second` --
// presumably a seed, confirm in dn-simdhash-utils.h) and the two hashes are
// combined with XOR.
static inline uint32_t
dn_ptrpair_t_hash (dn_ptrpair_t key)
{
	uint32_t first_hash = MurmurHash3_32_ptr(key.first, 0);
	uint32_t second_hash = MurmurHash3_32_ptr(key.second, 1);
	return first_hash ^ second_hash;
}

// Key equality for pointer pairs: returns 1 when both `first` and `second`
// are the same pointers in lhs and rhs, otherwise 0. Order matters, so
// {a, b} is not equal to {b, a} unless a == b.
static inline uint8_t
dn_ptrpair_t_equals (dn_ptrpair_t lhs, dn_ptrpair_t rhs)
{
	if (lhs.first != rhs.first)
		return 0;
	return lhs.second == rhs.second;
}

// Configuration for the [ptr, ptr] -> ptr table. These macros are consumed by
// dn-simdhash-specialization.h, which is included at the end of this block.
// Name of the specialization, pointer-pair keys, and opaque pointer values:
#define DN_SIMDHASH_T dn_simdhash_ptrpair_ptr
#define DN_SIMDHASH_KEY_T dn_ptrpair_t
#define DN_SIMDHASH_VALUE_T void *
// Hashing and equality are delegated to the dn_ptrpair_t helpers; the `hash`
// (table) argument is not used by either of them.
#define DN_SIMDHASH_KEY_HASHER(hash, key) dn_ptrpair_t_hash(key)
#define DN_SIMDHASH_KEY_EQUALS(hash, lhs, rhs) dn_ptrpair_t_equals(lhs, rhs)
// Bucket capacity depends on pointer size because a key is two pointers wide.
#if SIZEOF_VOID_P == 8
// Keys are 16 bytes each here.
// 192 bytes holds 12 16-byte blocks, so 11 keys and one suffix table
#define DN_SIMDHASH_BUCKET_CAPACITY 11
#else
// Keys are 8 bytes each here.
// 128 bytes holds 16 8-byte blocks, so 14 keys and one suffix table
// NOTE(review): that leaves two 8-byte blocks, presumably both taken by the suffix table -- confirm.
#define DN_SIMDHASH_BUCKET_CAPACITY 14
#endif

// Instantiates the specialization using the macros defined above.
#include "dn-simdhash-specialization.h"
15 changes: 15 additions & 0 deletions src/native/containers/dn-simdhash-specializations.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,4 +59,19 @@ typedef struct dn_simdhash_str_key dn_simdhash_str_key;

#include "dn-simdhash-ght-compatible.h"


// Key type for the dn_simdhash_ptrpair_ptr specialization: a pair of raw pointers.
// NOTE: dn-simdhash-ptrpair-ptr.c carries its own definition of this struct;
// the two must be kept identical.
typedef struct dn_ptrpair_t {
void *first, *second;
} dn_ptrpair_t;

// Parameters for the [ptr, ptr] -> ptr table: specialization name, key type, value type.
#define DN_SIMDHASH_T dn_simdhash_ptrpair_ptr
#define DN_SIMDHASH_KEY_T dn_ptrpair_t
#define DN_SIMDHASH_VALUE_T void *

// Presumably emits the typed declarations for this specialization
// (e.g. dn_simdhash_ptrpair_ptr_new / _try_get_value / _try_add) -- confirm in that header.
#include "dn-simdhash-specialization-declarations.h"

// Clear the parameters so they do not leak into code that follows this block.
#undef DN_SIMDHASH_T
#undef DN_SIMDHASH_KEY_T
#undef DN_SIMDHASH_VALUE_T

#endif
16 changes: 14 additions & 2 deletions src/native/containers/dn-simdhash.c
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,7 @@ dn_simdhash_clear (dn_simdhash_t *hash)
if (hash->vtable.destroy_all)
hash->vtable.destroy_all(hash);
hash->count = 0;
// TODO: Scan through buckets sequentially and only erase ones with data in them
// Maybe skip erasing the key slots too?
// TODO: Implement a fast clear algorithm that scans buckets and only clears ones w/nonzero count
memset(hash->buffers.buckets, 0, hash->buffers.buckets_length * hash->meta->bucket_size_bytes);
// Skip this for performance; memset is especially slow in wasm
// memset(hash->buffers.values, 0, hash->buffers.values_length * hash->meta->value_size);
Expand All @@ -140,6 +139,19 @@ dn_simdhash_count (dn_simdhash_t *hash)
return hash->count;
}

// Adds up the cascade counter (the byte at DN_SIMDHASH_CASCADED_SLOT) of every
// bucket in the table and returns the sum. Visits all buckets, so the cost
// grows with the size of the table.
uint32_t
dn_simdhash_overflow_count (dn_simdhash_t *hash)
{
	assert(hash);

	uint8_t *bucket_bytes = (uint8_t *)hash->buffers.buckets;
	uint32_t total = 0;
	uint32_t index = 0;

	while (index < hash->buffers.buckets_length) {
		// Each bucket starts with its suffix bytes, which hold the cascade counter
		uint8_t *bucket_suffixes = bucket_bytes + (index * hash->meta->bucket_size_bytes);
		uint8_t cascaded = bucket_suffixes[DN_SIMDHASH_CASCADED_SLOT];
		total += cascaded;
		index++;
	}

	return total;
}

void
dn_simdhash_ensure_capacity (dn_simdhash_t *hash, uint32_t capacity)
{
Expand Down
5 changes: 5 additions & 0 deletions src/native/containers/dn-simdhash.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,11 @@ dn_simdhash_capacity (dn_simdhash_t *hash);
uint32_t
dn_simdhash_count (dn_simdhash_t *hash);

// Returns the estimated number of items that have overflowed out of a bucket.
// The estimate is the sum of the cascade counters of all buckets in the table.
// WARNING: This is expensive to calculate, because every bucket is visited.
// hash must not be NULL.
uint32_t
dn_simdhash_overflow_count (dn_simdhash_t *hash);

// Automatically resizes the table if it is too small to hold the requested number
// of items. Will not shrink the table if it is already bigger.
void
Expand Down

0 comments on commit 485ed8e

Please sign in to comment.