From 5bc15f8e167942503e2ed69481b41d7916e7978e Mon Sep 17 00:00:00 2001
From: Eduardo Manuel Velarde Polar <evelardepola@microsoft.com>
Date: Fri, 2 Aug 2024 17:52:36 -0700
Subject: [PATCH] Revert "Port yield normalization from CoreCLR to Native AOT
 (#103675)"

This reverts commit d35f3021b91d67eeac232a0370c6efb6c256f060.
---
 src/coreclr/gc/env/gcenv.os.h                 |   6 +
 src/coreclr/inc/yieldprocessornormalized.h    |  39 +-
 src/coreclr/nativeaot/Runtime/Crst.h          |   1 +
 .../nativeaot/Runtime/FinalizerHelpers.cpp    |   8 +-
 .../eventpipe/gen-eventing-event-inc.lst      |   1 -
 src/coreclr/nativeaot/Runtime/startup.cpp     |   2 +
 .../Runtime/windows/PalRedhawkInline.h        |  20 -
 .../Runtime/yieldprocessornormalized.cpp      | 102 +++++-
 .../Runtime/yieldprocessornormalized.h        | 228 +++++++++++-
 .../utilcode/yieldprocessornormalized.cpp     |   1 +
 src/coreclr/vm/yieldprocessornormalized.cpp   | 294 ++++++++++++++-
 .../vm/yieldprocessornormalizedshared.cpp     | 341 ------------------
 12 files changed, 653 insertions(+), 390 deletions(-)
 delete mode 100644 src/coreclr/vm/yieldprocessornormalizedshared.cpp

diff --git a/src/coreclr/gc/env/gcenv.os.h b/src/coreclr/gc/env/gcenv.os.h
index aa7223850eaa9b..01ed27dac3e59b 100644
--- a/src/coreclr/gc/env/gcenv.os.h
+++ b/src/coreclr/gc/env/gcenv.os.h
@@ -6,6 +6,12 @@
 #ifndef __GCENV_OS_H__
 #define __GCENV_OS_H__
 
+#ifdef HAS_SYSTEM_YIELDPROCESSOR
+// YieldProcessor is defined to Dont_Use_YieldProcessor. Restore it to the system-default implementation for the GC.
+#undef YieldProcessor
+#define YieldProcessor System_YieldProcessor
+#endif
+
 #define NUMA_NODE_UNDEFINED UINT16_MAX
 
 bool ParseIndexOrRange(const char** config_string, size_t* start_index, size_t* end_index);
diff --git a/src/coreclr/inc/yieldprocessornormalized.h b/src/coreclr/inc/yieldprocessornormalized.h
index e37bf79f0c5089..121e60b033356d 100644
--- a/src/coreclr/inc/yieldprocessornormalized.h
+++ b/src/coreclr/inc/yieldprocessornormalized.h
@@ -3,11 +3,14 @@
 
 #pragma once
 
-#ifdef FEATURE_NATIVEAOT
-FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); }
-#else
+// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where
+// the intention is to use the system-default implementation of YieldProcessor().
+#define HAS_SYSTEM_YIELDPROCESSOR
 FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
+#ifdef YieldProcessor
+#undef YieldProcessor
 #endif
+#define YieldProcessor Dont_Use_YieldProcessor
 
 #define DISABLE_COPY(T) \
     T(const T &) = delete; \
@@ -141,9 +144,9 @@ FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo
 {
     _ASSERTE(count != 0);
 
-    if (sizeof(size_t) <= sizeof(unsigned int))
+    if (sizeof(SIZE_T) <= sizeof(unsigned int))
     {
-        // On platforms with a small size_t, prevent overflow on the multiply below
+        // On platforms with a small SIZE_T, prevent overflow on the multiply below
         const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
         if (count > MaxCount)
         {
@@ -151,7 +154,7 @@ FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo
         }
     }
 
-    size_t n = (size_t)count * normalizationInfo.yieldsPerNormalizedYield;
+    SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield;
     _ASSERTE(n != 0);
     do
     {
@@ -186,9 +189,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
 {
     _ASSERTE(preSkylakeCount != 0);
 
-    if (sizeof(size_t) <= sizeof(unsigned int))
+    if (sizeof(SIZE_T) <= sizeof(unsigned int))
     {
-        // On platforms with a small size_t, prevent overflow on the multiply below
+        // On platforms with a small SIZE_T, prevent overflow on the multiply below
         const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
         if (preSkylakeCount > MaxCount)
         {
@@ -197,7 +200,7 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
     }
 
     const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
-    size_t n = (size_t)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
+    SIZE_T n = (SIZE_T)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
     if (n == 0)
     {
         n = 1;
@@ -224,9 +227,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkyl
 
     _ASSERTE(preSkylakeCount != 0);
 
-    if (sizeof(size_t) <= sizeof(unsigned int))
+    if (sizeof(SIZE_T) <= sizeof(unsigned int))
     {
-        // On platforms with a small size_t, prevent overflow on the multiply below
+        // On platforms with a small SIZE_T, prevent overflow on the multiply below
         const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
         if (preSkylakeCount > MaxCount)
         {
@@ -235,8 +238,8 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkyl
     }
 
     const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
-    size_t n =
-        (size_t)preSkylakeCount *
+    SIZE_T n =
+        (SIZE_T)preSkylakeCount *
         YieldProcessorNormalization::s_yieldsPerNormalizedYield /
         PreSkylakeCountToSkylakeCountDivisor;
     if (n == 0)
@@ -265,11 +268,11 @@ FORCEINLINE void YieldProcessorWithBackOffNormalized(
     unsigned int spinIteration)
 {
     // This shift value should be adjusted based on the asserted conditions below
-    const uint8_t MaxShift = 3;
-    static_assert(
-        ((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");
-    static_assert(
-        ((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");
+    const UINT8 MaxShift = 3;
+    static_assert_no_msg(
+        ((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
+    static_assert_no_msg(
+        ((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
 
     unsigned int n;
     if (spinIteration <= MaxShift &&
diff --git a/src/coreclr/nativeaot/Runtime/Crst.h b/src/coreclr/nativeaot/Runtime/Crst.h
index 4ab9db08e0f5e3..31bf8fde9eec8a 100644
--- a/src/coreclr/nativeaot/Runtime/Crst.h
+++ b/src/coreclr/nativeaot/Runtime/Crst.h
@@ -20,6 +20,7 @@ enum CrstType
     CrstRestrictedCallouts,
     CrstGcStressControl,
     CrstThreadStore,
+    CrstYieldProcessorNormalized,
     CrstEventPipe,
     CrstEventPipeConfig,
     CrstGcEvent,
diff --git a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
index 8fa60538189697..dd9f1e096842fb 100644
--- a/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
+++ b/src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
@@ -48,6 +48,9 @@ uint32_t WINAPI FinalizerStart(void* pContext)
 
     g_pFinalizerThread = PTR_Thread(pThread);
 
+    // We have some time until the first finalization request - use the time to calibrate normalized waits.
+    EnsureYieldProcessorNormalizedInitialized();
+
     // Wait for a finalization request.
     uint32_t uResult = PalWaitForSingleObjectEx(hFinalizerEvent, INFINITE, FALSE);
     ASSERT(uResult == WAIT_OBJECT_0);
@@ -181,11 +184,6 @@ EXTERN_C void QCALLTYPE RhpSignalFinalizationComplete(uint32_t fcount)
 {
     FireEtwGCFinalizersEnd_V1(fcount, GetClrInstanceId());
     g_FinalizerDoneEvent.Set();
-
-    if (YieldProcessorNormalization::IsMeasurementScheduled())
-    {
-        YieldProcessorNormalization::PerformMeasurement();
-    }
 }
 
 //
diff --git a/src/coreclr/nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst b/src/coreclr/nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst
index 0f4c932719a399..901af659ff84b6 100644
--- a/src/coreclr/nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst
+++ b/src/coreclr/nativeaot/Runtime/eventpipe/gen-eventing-event-inc.lst
@@ -113,4 +113,3 @@ ThreadPoolWorkingThreadCount
 ThreadRunning
 WaitHandleWaitStart
 WaitHandleWaitStop
-YieldProcessorMeasurement
diff --git a/src/coreclr/nativeaot/Runtime/startup.cpp b/src/coreclr/nativeaot/Runtime/startup.cpp
index af835018e1823a..db2802dcb115ef 100644
--- a/src/coreclr/nativeaot/Runtime/startup.cpp
+++ b/src/coreclr/nativeaot/Runtime/startup.cpp
@@ -133,6 +133,8 @@ static bool InitDLL(HANDLE hPalInstance)
 #endif
 #endif // !USE_PORTABLE_HELPERS
 
+    InitializeYieldProcessorNormalizedCrst();
+
 #ifdef STRESS_LOG
     uint32_t dwTotalStressLogSize = (uint32_t)g_pRhConfig->GetTotalStressLogSize();
     uint32_t dwStressLogLevel = (uint32_t)g_pRhConfig->GetStressLogLevel();
diff --git a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
index 1f2a74dcd15100..187ad26fb8bf11 100644
--- a/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
+++ b/src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
@@ -56,26 +56,6 @@ FORCEINLINE int64_t PalInterlockedCompareExchange64(_Inout_ int64_t volatile *pD
     return _InterlockedCompareExchange64(pDst, iValue, iComparand);
 }
 
-#ifdef HOST_X86
-FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue)
-{
-    int64_t iOldValue;
-    do {
-        iOldValue = *pDst;
-    } while (PalInterlockedCompareExchange64(pDst,
-                                          iValue,
-                                          iOldValue) != iOldValue);
-    return iOldValue;
-}
-#else // HOST_X86
-EXTERN_C int64_t _InterlockedExchange64(int64_t volatile *, int64_t);
-#pragma intrinsic(_InterlockedExchange64)
-FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue)
-{
-    return _InterlockedExchange64(pDst, iValue);
-}
-#endif // HOST_X86
-
 #if defined(HOST_AMD64) || defined(HOST_ARM64)
 EXTERN_C uint8_t _InterlockedCompareExchange128(int64_t volatile *, int64_t, int64_t, int64_t *);
 #pragma intrinsic(_InterlockedCompareExchange128)
diff --git a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp
index efaf4e8bb20704..444d52b0114c03 100644
--- a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp
+++ b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp
@@ -15,6 +15,104 @@
 #include "volatile.h"
 #include "yieldprocessornormalized.h"
 
-#include "../../utilcode/yieldprocessornormalized.cpp"
+#define ULONGLONG int64_t
 
-#include "../../vm/yieldprocessornormalizedshared.cpp"
+static Volatile<bool> s_isYieldProcessorNormalizedInitialized = false;
+static CrstStatic s_initializeYieldProcessorNormalizedCrst;
+
+// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are
+// tuned for Skylake processors
+unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~8 for pre-Skylake
+unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7;
+
+void InitializeYieldProcessorNormalizedCrst() // Initializes the Crst that InitializeYieldProcessorNormalized() takes before calibrating.
+{
+    WRAPPER_NO_CONTRACT;
+    s_initializeYieldProcessorNormalizedCrst.Init(CrstYieldProcessorNormalized);
+}
+
+static void InitializeYieldProcessorNormalized() // One-time calibration of g_yieldsPerNormalizedYield and g_optimalMaxNormalizedYieldsPerSpinIteration.
+{
+    WRAPPER_NO_CONTRACT;
+
+    CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst); // taken before re-checking the flag (assumes CrstHolder releases on scope exit - confirm)
+
+    if (s_isYieldProcessorNormalizedInitialized) // an earlier caller already calibrated or accepted the defaults
+    {
+        return;
+    }
+
+    // Intel pre-Skylake processor: measured typically 14-17 cycles per yield
+    // Intel post-Skylake processor: measured typically 125-150 cycles per yield
+    const int MeasureDurationMs = 10;
+    const int NsPerSecond = 1000 * 1000 * 1000;
+
+    ULONGLONG ticksPerSecond = PalQueryPerformanceFrequency();
+
+    if (ticksPerSecond < 1000 / MeasureDurationMs) // i.e. measureDurationTicks below would round down to 0
+    {
+        // High precision clock not available or clock resolution is too low, resort to defaults
+        s_isYieldProcessorNormalizedInitialized = true;
+        return;
+    }
+
+    // Measure the nanosecond delay per yield
+    ULONGLONG measureDurationTicks = ticksPerSecond / (1000 / MeasureDurationMs); // MeasureDurationMs expressed in counter ticks
+    unsigned int yieldCount = 0;
+      ULONGLONG startTicks = PalQueryPerformanceCounter();
+    ULONGLONG elapsedTicks;
+    do
+    {
+        // On some systems, querying the high performance counter has relatively significant overhead. Do enough yields to mask
+        // the timing overhead. Assuming one yield has a delay of MinNsPerNormalizedYield, 1000 yields would have a delay in the
+        // low microsecond range.
+        for (int i = 0; i < 1000; ++i)
+        {
+            System_YieldProcessor();
+        }
+        yieldCount += 1000;
+
+        ULONGLONG nowTicks = PalQueryPerformanceCounter();
+        elapsedTicks = nowTicks - startTicks;
+    } while (elapsedTicks < measureDurationTicks); // keep adding batches of 1000 yields until the measure duration has elapsed
+    double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond); // elapsed ns / yields issued
+    if (nsPerYield < 1)
+    {
+        nsPerYield = 1; // lower bound relied on by the limit described in the comment below
+    }
+
+    // Calculate the number of yields required to span the duration of a normalized yield. Since nsPerYield is at least 1, this
+    // value is naturally limited to MinNsPerNormalizedYield.
+    int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5); // + 0.5 rounds to nearest before truncation
+    if (yieldsPerNormalizedYield < 1)
+    {
+        yieldsPerNormalizedYield = 1;
+    }
+    _ASSERTE(yieldsPerNormalizedYield <= (int)MinNsPerNormalizedYield);
+
+    // Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
+    // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
+    // better job of allowing other work to run.
+    int optimalMaxNormalizedYieldsPerSpinIteration =
+        (int)(NsPerOptimalMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5); // divisor = ns per normalized yield
+    if (optimalMaxNormalizedYieldsPerSpinIteration < 1)
+    {
+        optimalMaxNormalizedYieldsPerSpinIteration = 1;
+    }
+
+    g_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
+    g_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration;
+    s_isYieldProcessorNormalizedInitialized = true; // written only after both globals above have been assigned
+
+    GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield); // pass the same factor to the GC heap
+}
+
+void EnsureYieldProcessorNormalizedInitialized() // Runs the calibration if the initialized flag has not been set yet.
+{
+    WRAPPER_NO_CONTRACT;
+
+    if (!s_isYieldProcessorNormalizedInitialized) // check made without the Crst; InitializeYieldProcessorNormalized() re-checks after taking it
+    {
+        InitializeYieldProcessorNormalized();
+    }
+}
diff --git a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h
index 5539ebf90561bc..8c74bf3cfe3002 100644
--- a/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h
+++ b/src/coreclr/nativeaot/Runtime/yieldprocessornormalized.h
@@ -1,5 +1,229 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
-#include "PalRedhawk.h"
-#include "../../inc/yieldprocessornormalized.h"
+#pragma once
+
+#include <limits.h>
+
+// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where
+// the intention is to use the system-default implementation of YieldProcessor().
+#define HAS_SYSTEM_YIELDPROCESSOR
+FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); }
+#ifdef YieldProcessor
+#undef YieldProcessor
+#endif
+#define YieldProcessor Dont_Use_YieldProcessor
+#ifdef PalYieldProcessor
+#undef PalYieldProcessor
+#endif
+#define PalYieldProcessor Dont_Use_PalYieldProcessor
+
+#define SIZE_T uintptr_t
+
+const unsigned int MinNsPerNormalizedYield = 37; // measured typically 37-46 on post-Skylake
+const unsigned int NsPerOptimalMaxSpinIterationDuration = 272; // approx. 900 cycles, measured 281 on pre-Skylake, 263 on post-Skylake
+
+extern unsigned int g_yieldsPerNormalizedYield;
+extern unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration;
+
+void InitializeYieldProcessorNormalizedCrst();
+void EnsureYieldProcessorNormalizedInitialized();
+
+class YieldProcessorNormalizationInfo // Copy of the global normalization values, taken at construction, for use across a spin loop.
+{
+private:
+    unsigned int yieldsPerNormalizedYield; // copied from g_yieldsPerNormalizedYield
+    unsigned int optimalMaxNormalizedYieldsPerSpinIteration; // copied from g_optimalMaxNormalizedYieldsPerSpinIteration
+    unsigned int optimalMaxYieldsPerSpinIteration; // product of the two fields above
+
+public:
+    YieldProcessorNormalizationInfo()
+        : yieldsPerNormalizedYield(g_yieldsPerNormalizedYield),
+        optimalMaxNormalizedYieldsPerSpinIteration(g_optimalMaxNormalizedYieldsPerSpinIteration),
+        optimalMaxYieldsPerSpinIteration(yieldsPerNormalizedYield * optimalMaxNormalizedYieldsPerSpinIteration) // uses the members initialized above; they are declared first
+    {
+    }
+
+    friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &); // friends: the yield helpers read the private fields directly
+    friend void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
+    friend void YieldProcessorNormalizedForPreSkylakeCount(const YieldProcessorNormalizationInfo &, unsigned int);
+    friend void YieldProcessorWithBackOffNormalized(const YieldProcessorNormalizationInfo &, unsigned int);
+};
+
+// See YieldProcessorNormalized() for preliminary info. Typical usage:
+//     if (!condition)
+//     {
+//         YieldProcessorNormalizationInfo normalizationInfo;
+//         do
+//         {
+//             YieldProcessorNormalized(normalizationInfo);
+//         } while (!condition);
+//     }
+FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo)
+{
+    unsigned int n = normalizationInfo.yieldsPerNormalizedYield; // raw yields that make up one normalized yield
+    _ASSERTE(n != 0); // the do-while below decrements before testing, so n == 0 would wrap around
+    do
+    {
+        System_YieldProcessor();
+    } while (--n != 0);
+}
+
+// Delays execution of the current thread for a short duration. Unlike YieldProcessor(), an effort is made to normalize the
+// delay across processors. The actual delay may be meaningful in several ways, including but not limited to the following:
+//   - The delay should be long enough that a tiny spin-wait like the following has a decent likelihood of observing a new value
+//     for the condition (when changed by a different thread) on each iteration, otherwise it may unnecessarily increase CPU usage
+//     and decrease scalability of the operation.
+//         while(!condition)
+//         {
+//             YieldProcessorNormalized();
+//         }
+//   - The delay should be short enough that a tiny spin-wait like above would not miss multiple cross-thread changes to the
+//     condition, otherwise it may unnecessarily increase latency of the operation
+//   - In reasonably short spin-waits, the actual delay may not matter much. In unreasonably long spin-waits that progress in
+//     yield count per iteration for each failed check of the condition, the progression can significantly magnify the second
+//     issue above on later iterations.
+//   - This function and variants are intended to provide a decent balance between the above issues, as ideal solutions to each
+//     issue have trade-offs between them. If latency of the operation is far more important in the scenario, consider using
+//     System_YieldProcessor() instead, which would issue a delay that is typically <= the delay issued by this method.
+FORCEINLINE void YieldProcessorNormalized()
+{
+    YieldProcessorNormalized(YieldProcessorNormalizationInfo()); // the temporary copies the current global values on every call
+}
+
+// See YieldProcessorNormalized(count) for preliminary info. Typical usage:
+//     if (!moreExpensiveCondition)
+//     {
+//         YieldProcessorNormalizationInfo normalizationInfo;
+//         do
+//         {
+//             YieldProcessorNormalized(normalizationInfo, 2);
+//         } while (!moreExpensiveCondition);
+//     }
+FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo &normalizationInfo, unsigned int count)
+{
+    _ASSERTE(count != 0);
+
+    if (sizeof(SIZE_T) <= sizeof(unsigned int)) // both operands are compile-time constants
+    {
+        // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
+        // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
+        const unsigned int MaxCount = UINT_MAX / MinNsPerNormalizedYield;
+        if (count > MaxCount)
+        {
+            count = MaxCount; // silently clamp rather than overflow
+        }
+    }
+
+    SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield; // total raw yields; count is cast to SIZE_T before the multiply
+    _ASSERTE(n != 0);
+    do
+    {
+        System_YieldProcessor();
+    } while (--n != 0);
+}
+
+// See YieldProcessorNormalized() for preliminary info. This function repeats the delay 'count' times. This overload is
+// preferred over the single-count overload when multiple yields are desired per spin-wait iteration. Typical usage:
+//     while(!moreExpensiveCondition)
+//     {
+//         YieldProcessorNormalized(2);
+//     }
+FORCEINLINE void YieldProcessorNormalized(unsigned int count)
+{
+    YieldProcessorNormalized(YieldProcessorNormalizationInfo(), count); // the temporary copies the current global values on every call
+}
+
+// Please DO NOT use this function in new code! See YieldProcessorNormalizedForPreSkylakeCount(preSkylakeCount) for preliminary
+// info. Typical usage:
+//     if (!condition)
+//     {
+//         YieldProcessorNormalizationInfo normalizationInfo;
+//         do
+//         {
+//             YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, 100);
+//         } while (!condition);
+//     }
+FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
+    const YieldProcessorNormalizationInfo &normalizationInfo,
+    unsigned int preSkylakeCount)
+{
+    _ASSERTE(preSkylakeCount != 0);
+
+    if (sizeof(SIZE_T) <= sizeof(unsigned int)) // both operands are compile-time constants
+    {
+        // On platforms with a small SIZE_T, prevent overflow on the multiply below. normalizationInfo.yieldsPerNormalizedYield
+        // is limited to MinNsPerNormalizedYield by InitializeYieldProcessorNormalized().
+        const unsigned int MaxCount = UINT_MAX / MinNsPerNormalizedYield;
+        if (preSkylakeCount > MaxCount)
+        {
+            preSkylakeCount = MaxCount; // silently clamp rather than overflow
+        }
+    }
+
+    const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
+    SIZE_T n = (SIZE_T)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor; // multiply first, then integer-divide by 8
+    if (n == 0)
+    {
+        n = 1; // the integer division can round down to 0; always issue at least one yield
+    }
+    do
+    {
+        System_YieldProcessor();
+    } while (--n != 0);
+}
+
+// Please DO NOT use this function in new code! This function is to be used for old spin-wait loops that have not been retuned
+// for recent processors, and especially where the yield count may be unreasonably high. The function scales the yield count in
+// an attempt to normalize the total delay across processors, to approximately the total delay that would be issued on a
+// pre-Skylake processor. New code should be tuned with YieldProcessorNormalized() or variants instead. Typical usage:
+//     while(!condition)
+//     {
+//         YieldProcessorNormalizedForPreSkylakeCount(100);
+//     }
+FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkylakeCount)
+{
+    YieldProcessorNormalizedForPreSkylakeCount(YieldProcessorNormalizationInfo(), preSkylakeCount); // the temporary copies the current global values on every call
+}
+
+// See YieldProcessorNormalized() for preliminary info. This function is to be used when there is a decent possibility that the
+// condition would not be satisfied within a short duration. The current implementation increases the delay per spin-wait
+// iteration exponentially up to a limit. Typical usage:
+//     if (!conditionThatMayNotBeSatisfiedSoon)
+//     {
+//         YieldProcessorNormalizationInfo normalizationInfo;
+//         do
+//         {
+//             YieldProcessorWithBackOffNormalized(normalizationInfo); // maybe Sleep(0) occasionally
+//         } while (!conditionThatMayNotBeSatisfiedSoon);
+//     }
+FORCEINLINE void YieldProcessorWithBackOffNormalized(
+    const YieldProcessorNormalizationInfo &normalizationInfo,
+    unsigned int spinIteration)
+{
+    // normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration cannot exceed the value below based on calculations done in
+    // InitializeYieldProcessorNormalized()
+    const unsigned int MaxOptimalMaxNormalizedYieldsPerSpinIteration =
+        NsPerOptimalMaxSpinIterationDuration * 3 / (MinNsPerNormalizedYield * 2) + 1; // 272 * 3 / (37 * 2) + 1 == 12 with the current constants
+    _ASSERTE(normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
+
+    // This shift value should be adjusted based on the asserted condition below
+    const uint8_t MaxShift = 3;
+    static_assert(((unsigned int)1 << (MaxShift + 1)) >= MaxOptimalMaxNormalizedYieldsPerSpinIteration, ""); // 16 >= 12 with the current constants
+
+    unsigned int n;
+    if (spinIteration <= MaxShift &&
+        ((unsigned int)1 << spinIteration) < normalizationInfo.optimalMaxNormalizedYieldsPerSpinIteration)
+    {
+        n = ((unsigned int)1 << spinIteration) * normalizationInfo.yieldsPerNormalizedYield; // 2^spinIteration normalized yields
+    }
+    else
+    {
+        n = normalizationInfo.optimalMaxYieldsPerSpinIteration; // cap reached: use the precomputed maximum
+    }
+    _ASSERTE(n != 0);
+    do
+    {
+        System_YieldProcessor();
+    } while (--n != 0);
+}
diff --git a/src/coreclr/utilcode/yieldprocessornormalized.cpp b/src/coreclr/utilcode/yieldprocessornormalized.cpp
index c6aaaa19557fa7..020d8d7cc79e4e 100644
--- a/src/coreclr/utilcode/yieldprocessornormalized.cpp
+++ b/src/coreclr/utilcode/yieldprocessornormalized.cpp
@@ -1,6 +1,7 @@
 // Licensed to the .NET Foundation under one or more agreements.
 // The .NET Foundation licenses this file to you under the MIT license.
 
+#include "stdafx.h"
 #include "yieldprocessornormalized.h"
 
 bool YieldProcessorNormalization::s_isMeasurementScheduled;
diff --git a/src/coreclr/vm/yieldprocessornormalized.cpp b/src/coreclr/vm/yieldprocessornormalized.cpp
index 258e30d634c7ce..14166de34dd641 100644
--- a/src/coreclr/vm/yieldprocessornormalized.cpp
+++ b/src/coreclr/vm/yieldprocessornormalized.cpp
@@ -7,4 +7,296 @@
 
 #include "finalizerthread.h"
 
-#include "yieldprocessornormalizedshared.cpp"
+enum class NormalizationState : UINT8
+{
+    Uninitialized, // initial value of s_normalizationState
+    Initialized, // set at the end of a completed PerformMeasurement()
+    Failed // set when QueryPerformanceFrequency fails or reports fewer than 1000 * 1000 ticks per second
+};
+
+static const int NsPerYieldMeasurementCount = 8;
+static const unsigned int MeasurementPeriodMs = 4000;
+
+static const unsigned int NsPerS = 1000 * 1000 * 1000;
+
+static NormalizationState s_normalizationState = NormalizationState::Uninitialized;
+static unsigned int s_previousNormalizationTimeMs;
+
+static UINT64 s_performanceCounterTicksPerS;
+static double s_nsPerYieldMeasurements[NsPerYieldMeasurementCount];
+static int s_nextMeasurementIndex;
+static double s_establishedNsPerYield = YieldProcessorNormalization::TargetNsPerNormalizedYield;
+
+static unsigned int DetermineMeasureDurationUs() // Returns the measurement duration in microseconds: 1, or 4 when counter queries are slow.
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+        MODE_PREEMPTIVE;
+    }
+    CONTRACTL_END;
+
+    _ASSERTE(s_normalizationState != NormalizationState::Failed);
+
+    // On some systems, querying the high performance counter has relatively significant overhead. Increase the measure duration
+    // if the overhead seems high relative to the measure duration.
+    unsigned int measureDurationUs = 1;
+    LARGE_INTEGER li;
+    QueryPerformanceCounter(&li);
+    UINT64 startTicks = li.QuadPart;
+    QueryPerformanceCounter(&li); // second query issued immediately after the first
+    UINT64 elapsedTicks = li.QuadPart - startTicks; // ticks elapsed between the two back-to-back queries
+    if (elapsedTicks >= s_performanceCounterTicksPerS * measureDurationUs * (1000 / 4) / NsPerS) // elapsed >= 1/4 of the measure duration
+    {
+        measureDurationUs *= 4;
+    }
+    return measureDurationUs;
+}
+
+static double MeasureNsPerYield(unsigned int measureDurationUs) // Times a run of yields lasting at least measureDurationUs; returns clamped ns per yield.
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+        MODE_PREEMPTIVE;
+    }
+    CONTRACTL_END;
+
+    _ASSERTE(s_normalizationState != NormalizationState::Failed);
+
+    int yieldCount = (int)(measureDurationUs * 1000 / s_establishedNsPerYield) + 1; // first batch: duration in ns / established ns per yield, plus one
+    UINT64 ticksPerS = s_performanceCounterTicksPerS;
+    UINT64 measureDurationTicks = ticksPerS * measureDurationUs / (1000 * 1000); // duration converted from microseconds to ticks
+
+    LARGE_INTEGER li;
+    QueryPerformanceCounter(&li);
+    UINT64 startTicks = li.QuadPart;
+
+    for (int i = 0; i < yieldCount; ++i)
+    {
+        System_YieldProcessor();
+    }
+
+    QueryPerformanceCounter(&li);
+    UINT64 elapsedTicks = li.QuadPart - startTicks;
+    while (elapsedTicks < measureDurationTicks) // the batches so far finished early; keep yielding until the duration is covered
+    {
+        int nextYieldCount =
+            Max(4, // never fewer than 4 additional yields
+                elapsedTicks == 0
+                    ? yieldCount / 4 // no ticks observed yet, so there is no rate to extrapolate from (also avoids dividing by 0)
+                    : (int)(yieldCount * (measureDurationTicks - elapsedTicks) / (double)elapsedTicks) + 1); // scale by remaining / elapsed ticks
+        for (int i = 0; i < nextYieldCount; ++i)
+        {
+            System_YieldProcessor();
+        }
+
+        QueryPerformanceCounter(&li);
+        elapsedTicks = li.QuadPart - startTicks; // total since startTicks, not just the latest batch
+        yieldCount += nextYieldCount;
+    }
+
+    // Limit the minimum to a reasonable value considering that on some systems a yield may be implemented as a no-op
+    const double MinNsPerYield = 0.1;
+
+    // Measured values higher than this don't affect values calculated for normalization, and it's very unlikely for a yield to
+    // really take this long. Limit the maximum to keep the recorded values reasonable.
+    const double MaxNsPerYield = YieldProcessorNormalization::TargetMaxNsPerSpinIteration / 1.5 + 1;
+
+    return Max(MinNsPerYield, Min((double)elapsedTicks * NsPerS / ((double)yieldCount * ticksPerS), MaxNsPerYield)); // elapsed ns / yields, clamped to [Min, Max]
+}
+
+void YieldProcessorNormalization::PerformMeasurement() // Takes new ns-per-yield measurement(s) and recomputes the normalization values.
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+        MODE_PREEMPTIVE;
+    }
+    CONTRACTL_END;
+
+    _ASSERTE(s_isMeasurementScheduled);
+
+    double latestNsPerYield;
+    if (s_normalizationState == NormalizationState::Initialized)
+    {
+        if (GetTickCount() - s_previousNormalizationTimeMs < MeasurementPeriodMs)
+        {
+            return; // NOTE(review): measured less than MeasurementPeriodMs ago; s_isMeasurementScheduled is left unchanged on this path
+        }
+
+        int nextMeasurementIndex = s_nextMeasurementIndex;
+        latestNsPerYield = MeasureNsPerYield(DetermineMeasureDurationUs());
+        AtomicStore(&s_nsPerYieldMeasurements[nextMeasurementIndex], latestNsPerYield); // one slot is overwritten per periodic measurement
+        if (++nextMeasurementIndex >= NsPerYieldMeasurementCount)
+        {
+            nextMeasurementIndex = 0; // wrap around to the first slot
+        }
+        s_nextMeasurementIndex = nextMeasurementIndex;
+    }
+    else if (s_normalizationState == NormalizationState::Uninitialized)
+    {
+        LARGE_INTEGER li;
+        if (!QueryPerformanceFrequency(&li) || li.QuadPart < 1000 * 1000) // requires at least one tick per microsecond
+        {
+            // High precision clock not available or clock resolution is too low, resort to defaults
+            s_normalizationState = NormalizationState::Failed;
+            return; // NOTE(review): s_isMeasurementScheduled is left unchanged on this path
+        }
+        s_performanceCounterTicksPerS = li.QuadPart;
+
+        unsigned int measureDurationUs = DetermineMeasureDurationUs();
+        for (int i = 0; i < NsPerYieldMeasurementCount; ++i) // first run: fill every measurement slot
+        {
+            latestNsPerYield = MeasureNsPerYield(measureDurationUs);
+            AtomicStore(&s_nsPerYieldMeasurements[i], latestNsPerYield);
+            if (i == 0 || latestNsPerYield < s_establishedNsPerYield) // running minimum; MeasureNsPerYield() reads it to size its first batch
+            {
+                AtomicStore(&s_establishedNsPerYield, latestNsPerYield);
+            }
+
+            if (i < NsPerYieldMeasurementCount - 1) // the event for the last measurement is fired after the loop, below
+            {
+                FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
+            }
+        }
+    }
+    else
+    {
+        _ASSERTE(s_normalizationState == NormalizationState::Failed);
+        return; // NOTE(review): s_isMeasurementScheduled is left unchanged on this path
+    }
+
+    double establishedNsPerYield = s_nsPerYieldMeasurements[0]; // the established value is the minimum across all slots
+    for (int i = 1; i < NsPerYieldMeasurementCount; ++i)
+    {
+        double nsPerYield = s_nsPerYieldMeasurements[i];
+        if (nsPerYield < establishedNsPerYield)
+        {
+            establishedNsPerYield = nsPerYield;
+        }
+    }
+    if (establishedNsPerYield != s_establishedNsPerYield) // skip the store when the minimum did not change
+    {
+        AtomicStore(&s_establishedNsPerYield, establishedNsPerYield);
+    }
+
+    FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
+
+    // Calculate the number of yields required to span the duration of a normalized yield
+    unsigned int yieldsPerNormalizedYield = Max(1u, (unsigned int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5)); // + 0.5 rounds to nearest; at least 1
+    _ASSERTE(yieldsPerNormalizedYield <= MaxYieldsPerNormalizedYield);
+    s_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
+
+    // Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
+    // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
+    // better job of allowing other work to run.
+    s_optimalMaxNormalizedYieldsPerSpinIteration =
+        Max(1u, (unsigned int)(TargetMaxNsPerSpinIteration / (yieldsPerNormalizedYield * establishedNsPerYield) + 0.5)); // divisor = ns per normalized yield
+    _ASSERTE(s_optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
+
+    GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield); // pass the same factor to the GC heap
+
+    s_previousNormalizationTimeMs = GetTickCount(); // start of the next MeasurementPeriodMs window
+    s_normalizationState = NormalizationState::Initialized;
+    s_isMeasurementScheduled = false; // cleared only when a measurement actually completed
+}
+
+// Marks a yield-normalization measurement as scheduled and enables finalization, unless one is not due or is already pending.
+void YieldProcessorNormalization::ScheduleMeasurementIfNecessary()
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+        MODE_ANY;
+    }
+    CONTRACTL_END;
+
+    NormalizationState normalizationState = VolatileLoadWithoutBarrier(&s_normalizationState);
+    if (normalizationState == NormalizationState::Initialized)
+    {
+        if (GetTickCount() - s_previousNormalizationTimeMs < MeasurementPeriodMs)
+        {
+            return; // the last normalization finished less than MeasurementPeriodMs ago
+        }
+    }
+    else if (normalizationState == NormalizationState::Uninitialized) // nothing measured yet: go on to schedule
+    {
+    }
+    else
+    {
+        _ASSERTE(normalizationState == NormalizationState::Failed);
+        return; // Failed state: PerformMeasurement() also does nothing in this state, so do not schedule
+    }
+
+    // Nothing to do if already scheduled. g_fEEStarted must be set for FinalizerThread::EnableFinalization() below.
+    if (s_isMeasurementScheduled || !g_fEEStarted)
+    {
+        return;
+    }
+
+    s_isMeasurementScheduled = true; // reset to false at the end of PerformMeasurement()
+    FinalizerThread::EnableFinalization();
+}
+
+// Fires one YieldProcessorMeasurement event per recorded (non-zero) measurement, provided that event is enabled.
+void YieldProcessorNormalization::FireMeasurementEvents()
+{
+    CONTRACTL
+    {
+        NOTHROW;
+        GC_NOTRIGGER;
+        MODE_ANY;
+    }
+    CONTRACTL_END;
+
+    if (!EventEnabledYieldProcessorMeasurement())
+    {
+        return;
+    }
+
+    // This function may be called at any time to fire events about recorded measurements. There is no synchronization for the
+    // recorded information, so try to enumerate the array with some care.
+    double establishedNsPerYield = AtomicLoad(&s_establishedNsPerYield);
+    int nextIndex = VolatileLoadWithoutBarrier(&s_nextMeasurementIndex); // the slot PerformMeasurement() writes next
+    for (int i = 0; i < NsPerYieldMeasurementCount; ++i) // visits every slot exactly once, starting at nextIndex
+    {
+        double nsPerYield = AtomicLoad(&s_nsPerYieldMeasurements[nextIndex]);
+        if (nsPerYield != 0) // the array may not be fully initialized yet
+        {
+            FireEtwYieldProcessorMeasurement(GetClrInstanceId(), nsPerYield, establishedNsPerYield);
+        }
+
+        if (++nextIndex >= NsPerYieldMeasurementCount)
+        {
+            nextIndex = 0; // wrap around to the start of the array
+        }
+    }
+}
+// Reads *valueRef with a volatile load on 64-bit targets and with an interlocked compare-exchange on other targets.
+double YieldProcessorNormalization::AtomicLoad(double *valueRef)
+{
+    WRAPPER_NO_CONTRACT;
+
+#ifdef TARGET_64BIT
+    return VolatileLoadWithoutBarrier(valueRef);
+#else
+    return InterlockedCompareExchangeT(valueRef, 0.0, 0.0); // at most swaps 0.0 for 0.0; called for its return value
+#endif
+}
+// Writes value to *valueRef with a plain store on 64-bit targets and with an interlocked exchange on other targets.
+void YieldProcessorNormalization::AtomicStore(double *valueRef, double value)
+{
+    WRAPPER_NO_CONTRACT;
+
+#ifdef TARGET_64BIT
+    *valueRef = value;
+#else
+    InterlockedExchangeT(valueRef, value); // the previous value returned by the exchange is not needed
+#endif
+}
+
diff --git a/src/coreclr/vm/yieldprocessornormalizedshared.cpp b/src/coreclr/vm/yieldprocessornormalizedshared.cpp
deleted file mode 100644
index 05daee21947376..00000000000000
--- a/src/coreclr/vm/yieldprocessornormalizedshared.cpp
+++ /dev/null
@@ -1,341 +0,0 @@
-// Licensed to the .NET Foundation under one or more agreements.
-// The .NET Foundation licenses this file to you under the MIT license.
-
-enum class NormalizationState : uint8_t
-{
-    Uninitialized,
-    Initialized,
-    Failed
-};
-
-static const int NsPerYieldMeasurementCount = 8;
-static const unsigned int MeasurementPeriodMs = 4000;
-
-static const unsigned int NsPerS = 1000 * 1000 * 1000;
-
-static NormalizationState s_normalizationState = NormalizationState::Uninitialized;
-static unsigned int s_previousNormalizationTimeMs;
-
-static uint64_t s_performanceCounterTicksPerS;
-static double s_nsPerYieldMeasurements[NsPerYieldMeasurementCount];
-static int s_nextMeasurementIndex;
-static double s_establishedNsPerYield = YieldProcessorNormalization::TargetNsPerNormalizedYield;
-
-void RhEnableFinalization();
-
-inline unsigned int GetTickCountPortable()
-{
-#ifdef FEATURE_NATIVEAOT
-    return (unsigned int)PalGetTickCount64();
-#else
-    return GetTickCount();
-#endif
-}
-
-static uint64_t GetPerformanceCounter()
-{
-#ifdef FEATURE_NATIVEAOT
-    return PalQueryPerformanceCounter();
-#else
-    LARGE_INTEGER li;
-    QueryPerformanceCounter(&li);
-    return li.QuadPart;
-#endif
-}
-
-static unsigned int DetermineMeasureDurationUs()
-{
-    CONTRACTL
-    {
-        NOTHROW;
-        GC_NOTRIGGER;
-#ifndef FEATURE_NATIVEAOT
-        MODE_PREEMPTIVE;
-#endif
-    }
-    CONTRACTL_END;
-
-    _ASSERTE(s_normalizationState != NormalizationState::Failed);
-
-    // On some systems, querying the high performance counter has relatively significant overhead. Increase the measure duration
-    // if the overhead seems high relative to the measure duration.
-    unsigned int measureDurationUs = 1;
-    uint64_t startTicks = GetPerformanceCounter();
-    uint64_t elapsedTicks = GetPerformanceCounter() - startTicks;
-    if (elapsedTicks >= s_performanceCounterTicksPerS * measureDurationUs * (1000 / 4) / NsPerS) // elapsed >= 1/4 of the measure duration
-    {
-        measureDurationUs *= 4;
-    }
-    return measureDurationUs;
-}
-
-static double MeasureNsPerYield(unsigned int measureDurationUs)
-{
-    CONTRACTL
-    {
-        NOTHROW;
-        GC_NOTRIGGER;
-#ifndef FEATURE_NATIVEAOT
-        MODE_PREEMPTIVE;
-#endif
-    }
-    CONTRACTL_END;
-
-    _ASSERTE(s_normalizationState != NormalizationState::Failed);
-
-    int yieldCount = (int)(measureDurationUs * 1000 / s_establishedNsPerYield) + 1;
-    uint64_t ticksPerS = s_performanceCounterTicksPerS;
-    uint64_t measureDurationTicks = ticksPerS * measureDurationUs / (1000 * 1000);
-
-    uint64_t startTicks = GetPerformanceCounter();
-
-    for (int i = 0; i < yieldCount; ++i)
-    {
-        System_YieldProcessor();
-    }
-
-    uint64_t elapsedTicks = GetPerformanceCounter() - startTicks;
-    while (elapsedTicks < measureDurationTicks)
-    {
-        int nextYieldCount =
-            max(4,
-                elapsedTicks == 0
-                    ? yieldCount / 4
-                    : (int)(yieldCount * (measureDurationTicks - elapsedTicks) / (double)elapsedTicks) + 1);
-        for (int i = 0; i < nextYieldCount; ++i)
-        {
-            System_YieldProcessor();
-        }
-
-        elapsedTicks = GetPerformanceCounter() - startTicks;
-        yieldCount += nextYieldCount;
-    }
-
-    // Limit the minimum to a reasonable value considering that on some systems a yield may be implemented as a no-op
-    const double MinNsPerYield = 0.1;
-
-    // Measured values higher than this don't affect values calculated for normalization, and it's very unlikely for a yield to
-    // really take this long. Limit the maximum to keep the recorded values reasonable.
-    const double MaxNsPerYield = YieldProcessorNormalization::TargetMaxNsPerSpinIteration / 1.5 + 1;
-
-    return max(MinNsPerYield, min((double)elapsedTicks * NsPerS / ((double)yieldCount * ticksPerS), MaxNsPerYield));
-}
-
-void YieldProcessorNormalization::PerformMeasurement()
-{
-    CONTRACTL
-    {
-        NOTHROW;
-        GC_NOTRIGGER;
-#ifndef FEATURE_NATIVEAOT
-        MODE_PREEMPTIVE;
-#endif
-    }
-    CONTRACTL_END;
-
-    _ASSERTE(s_isMeasurementScheduled);
-
-    double latestNsPerYield;
-    if (s_normalizationState == NormalizationState::Initialized)
-    {
-        if (GetTickCountPortable() - s_previousNormalizationTimeMs < MeasurementPeriodMs)
-        {
-            return;
-        }
-
-        int nextMeasurementIndex = s_nextMeasurementIndex;
-        latestNsPerYield = MeasureNsPerYield(DetermineMeasureDurationUs());
-        AtomicStore(&s_nsPerYieldMeasurements[nextMeasurementIndex], latestNsPerYield);
-        if (++nextMeasurementIndex >= NsPerYieldMeasurementCount)
-        {
-            nextMeasurementIndex = 0;
-        }
-        s_nextMeasurementIndex = nextMeasurementIndex;
-    }
-    else if (s_normalizationState == NormalizationState::Uninitialized)
-    {
-#ifdef FEATURE_NATIVEAOT
-        if ((s_performanceCounterTicksPerS = PalQueryPerformanceFrequency()) < 1000 * 1000)
-#else
-        LARGE_INTEGER li;
-        if (!QueryPerformanceFrequency(&li) || li.QuadPart < 1000 * 1000)
-#endif
-        {
-            // High precision clock not available or clock resolution is too low, resort to defaults
-            s_normalizationState = NormalizationState::Failed;
-            return;
-        }
-
-#ifndef FEATURE_NATIVEAOT
-        s_performanceCounterTicksPerS = li.QuadPart;
-#endif
-
-        unsigned int measureDurationUs = DetermineMeasureDurationUs();
-        for (int i = 0; i < NsPerYieldMeasurementCount; ++i)
-        {
-            latestNsPerYield = MeasureNsPerYield(measureDurationUs);
-            AtomicStore(&s_nsPerYieldMeasurements[i], latestNsPerYield);
-            if (i == 0 || latestNsPerYield < s_establishedNsPerYield)
-            {
-                AtomicStore(&s_establishedNsPerYield, latestNsPerYield);
-            }
-            if (i < NsPerYieldMeasurementCount - 1)
-            {
-                FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
-            }
-        }
-    }
-    else
-    {
-        _ASSERTE(s_normalizationState == NormalizationState::Failed);
-        return;
-    }
-
-    double establishedNsPerYield = s_nsPerYieldMeasurements[0];
-    for (int i = 1; i < NsPerYieldMeasurementCount; ++i)
-    {
-        double nsPerYield = s_nsPerYieldMeasurements[i];
-        if (nsPerYield < establishedNsPerYield)
-        {
-            establishedNsPerYield = nsPerYield;
-        }
-    }
-    if (establishedNsPerYield != s_establishedNsPerYield)
-    {
-        AtomicStore(&s_establishedNsPerYield, establishedNsPerYield);
-    }
-
-    FireEtwYieldProcessorMeasurement(GetClrInstanceId(), latestNsPerYield, s_establishedNsPerYield);
-
-    // Calculate the number of yields required to span the duration of a normalized yield
-    unsigned int yieldsPerNormalizedYield = max(1u, (unsigned int)(TargetNsPerNormalizedYield / establishedNsPerYield + 0.5));
-    _ASSERTE(yieldsPerNormalizedYield <= MaxYieldsPerNormalizedYield);
-    s_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
-
-    // Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
-    // spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
-    // better job of allowing other work to run.
-    s_optimalMaxNormalizedYieldsPerSpinIteration =
-        max(1u, (unsigned int)(TargetMaxNsPerSpinIteration / (yieldsPerNormalizedYield * establishedNsPerYield) + 0.5));
-    _ASSERTE(s_optimalMaxNormalizedYieldsPerSpinIteration <= MaxOptimalMaxNormalizedYieldsPerSpinIteration);
-
-    GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
-
-    s_previousNormalizationTimeMs = GetTickCountPortable();
-    s_normalizationState = NormalizationState::Initialized;
-    s_isMeasurementScheduled = false;
-}
-
-
-void YieldProcessorNormalization::ScheduleMeasurementIfNecessary()
-{
-    CONTRACTL
-    {
-        NOTHROW;
-        GC_NOTRIGGER;
-        MODE_ANY;
-    }
-    CONTRACTL_END;
-
-    NormalizationState normalizationState = VolatileLoadWithoutBarrier(&s_normalizationState);
-    if (normalizationState == NormalizationState::Initialized)
-    {
-        if (GetTickCountPortable() - s_previousNormalizationTimeMs < MeasurementPeriodMs)
-        {
-            return;
-        }
-    }
-    else if (normalizationState == NormalizationState::Uninitialized)
-    {
-    }
-    else
-    {
-        _ASSERTE(normalizationState == NormalizationState::Failed);
-        return;
-    }
-
-#ifdef FEATURE_NATIVEAOT
-    if (s_isMeasurementScheduled)
-#else
-    // !g_fEEStarted is required for FinalizerThread::EnableFinalization() below
-    if (s_isMeasurementScheduled || !g_fEEStarted)
-#endif
-    {
-        return;
-    }
-
-    s_isMeasurementScheduled = true;
-#ifdef FEATURE_NATIVEAOT
-    RhEnableFinalization();
-#else
-    FinalizerThread::EnableFinalization();
-#endif
-}
-
-void YieldProcessorNormalization::FireMeasurementEvents()
-{
-    CONTRACTL
-    {
-        NOTHROW;
-        GC_NOTRIGGER;
-        MODE_ANY;
-    }
-    CONTRACTL_END;
-
-    if (!EventEnabledYieldProcessorMeasurement())
-    {
-        return;
-    }
-
-    // This function may be called at any time to fire events about recorded measurements. There is no synchronization for the
-    // recorded information, so try to enumerate the array with some care.
-    double establishedNsPerYield = AtomicLoad(&s_establishedNsPerYield);
-    int nextIndex = VolatileLoadWithoutBarrier(&s_nextMeasurementIndex);
-    for (int i = 0; i < NsPerYieldMeasurementCount; ++i)
-    {
-        double nsPerYield = AtomicLoad(&s_nsPerYieldMeasurements[nextIndex]);
-        if (nsPerYield != 0) // the array may not be fully initialized yet
-        {
-            FireEtwYieldProcessorMeasurement(GetClrInstanceId(), nsPerYield, establishedNsPerYield);
-        }
-
-        if (++nextIndex >= NsPerYieldMeasurementCount)
-        {
-            nextIndex = 0;
-        }
-    }
-}
-
-double YieldProcessorNormalization::AtomicLoad(double *valueRef)
-{
-    WRAPPER_NO_CONTRACT;
-
-#ifdef TARGET_64BIT
-    return VolatileLoadWithoutBarrier(valueRef);
-#else
-#ifdef FEATURE_NATIVEAOT
-    static_assert(sizeof(int64_t) == sizeof(double), "");
-    int64_t intRes = PalInterlockedCompareExchange64((int64_t*)valueRef, 0, 0);
-    return *(double*)(int64_t*)(&intRes);
-#else
-    return InterlockedCompareExchangeT(valueRef, 0.0, 0.0);
-#endif
-#endif
-}
-
-void YieldProcessorNormalization::AtomicStore(double *valueRef, double value)
-{
-    WRAPPER_NO_CONTRACT;
-
-#ifdef TARGET_64BIT
-    *valueRef = value;
-#else
-#ifdef FEATURE_NATIVEAOT
-    static_assert(sizeof(int64_t) == sizeof(double), "");
-    PalInterlockedExchange64((int64_t *)valueRef, *(int64_t *)(double*)&value);
-#else
-    InterlockedExchangeT(valueRef, value);
-#endif
-#endif
-}
-