[Dispatcher] improve API, reduce overhead, improve performance for items > 1k #2083

Merged 3 commits on Jan 7, 2024
647 changes: 251 additions & 396 deletions sources/core/Stride.Core/Threading/Dispatcher.cs

Large diffs are not rendered by default.
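Since the Dispatcher.cs diff is not rendered, here is a minimal sketch of how the reworked batched API is consumed, inferred purely from the call sites further down in this PR; the exact overloads and parameter names in Dispatcher.cs are assumptions, and Item, items and Process are placeholder names.

// Old style: one delegate invocation per element.
Dispatcher.ForEach(items, item => Process(item));

// New batched style: the callback receives an index range and loops over it itself,
// which amortizes scheduling overhead when the collection holds thousands of items.
Dispatcher.ForBatched(items.Count, (from, toExclusive) =>
{
    for (int i = from; i < toExclusive; i++)
        Process(items[i]);
});

// Overload taking a state object plus a C# 9 static function pointer (requires an unsafe
// context), avoiding the closure allocation of the lambda form.
Dispatcher.ForBatched(items.Count, items, &ProcessBatch);

static void ProcessBatch(FastCollection<Item> items, int from, int toExclusive)
{
    for (int i = from; i < toExclusive; i++)
        Process(items[i]);
}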

8 changes: 5 additions & 3 deletions sources/core/Stride.Core/Threading/ThreadPool.SemaphoreW.cs
@@ -15,7 +15,7 @@ public sealed partial class ThreadPool
/// <summary>
/// Mostly lifted from dotnet's LowLevelLifoSemaphore
/// </summary>
private class SemaphoreW
private class SemaphoreW : ISemaphore
{
private const int SpinSleep0Threshold = 10;

@@ -60,7 +60,9 @@ public SemaphoreW(int spinCountParam)

public void Wait(int timeout = -1) => internals.Wait(spinCount, lifoSemaphore, timeout);

public void Release(int releaseCount) => internals.Release(releaseCount, lifoSemaphore);
public void Release(int count) => internals.Release(count, lifoSemaphore);

public void Dispose() => lifoSemaphore?.Dispose();

[StructLayout(LayoutKind.Explicit)]
private struct Counts
@@ -367,4 +369,4 @@ private struct PaddingFalseSharing
#endif
}
}
}
}
132 changes: 116 additions & 16 deletions sources/core/Stride.Core/Threading/ThreadPool.cs
@@ -5,6 +5,8 @@
using Stride.Core.Diagnostics;
using System;
using System.Collections.Concurrent;
using System.Reflection;
using System.Runtime.InteropServices;
using System.Threading;

namespace Stride.Core.Threading
@@ -15,6 +17,8 @@ namespace Stride.Core.Threading
/// </summary>
public sealed partial class ThreadPool : IDisposable
{
private static readonly Logger Logger = GlobalLogger.GetLogger(nameof(ThreadPool));

/// <summary>
/// The default instance that the whole process shares, use this one to avoid wasting process memory.
/// </summary>
@@ -28,9 +32,9 @@ public sealed partial class ThreadPool : IDisposable

private static readonly ProfilingKey ProcessWorkItemKey = new ProfilingKey($"{nameof(ThreadPool)}.ProcessWorkItem");

private readonly ConcurrentQueue<Action> workItems = new ConcurrentQueue<Action>();
private readonly SemaphoreW semaphore;
private readonly ConcurrentQueue<Work> workItems = new ConcurrentQueue<Work>();
private readonly ISemaphore semaphore;

private long completionCounter;
private int workScheduled, threadsBusy;
private int disposing;
@@ -47,8 +51,30 @@ public sealed partial class ThreadPool : IDisposable

public ThreadPool(int? threadCount = null)
{
semaphore = new SemaphoreW(spinCountParam:70);

int spinCount = 70;

if(RuntimeInformation.ProcessArchitecture is Architecture.Arm or Architecture.Arm64)
{
// Dotnet:
// On systems with ARM processors, more spin-waiting seems to be necessary to avoid perf regressions from incurring
// the full wait when work becomes available soon enough. This is more noticeable after reducing the number of
// thread requests made to the thread pool because otherwise the extra thread requests cause threads to do more
// busy-waiting instead and adding to contention in trying to look for work items, which is less preferable.
spinCount *= 4;
}
try
{
semaphore = new DotnetLifoSemaphore(spinCount);
}
catch(Exception e)
{
// For net6+ this should not happen, logging instead of throwing as this is just a performance regression
if(Environment.Version.Major >= 6)
Logger.Warning($"Could not bind to dotnet's Lifo Semaphore, falling back to suboptimal semaphore:\n{e}");

semaphore = new SemaphoreW(spinCountParam:70);
}

WorkerThreadsCount = threadCount ?? (Environment.ProcessorCount == 1 ? 1 : Environment.ProcessorCount - 1);
leftToDispose = WorkerThreadsCount;
for (int i = 0; i < WorkerThreadsCount; i++)
@@ -66,7 +92,7 @@ static ThreadPool()
/// Queue an action to run on one of the available threads,
/// it is strongly recommended that the action takes less than a millisecond.
/// </summary>
public void QueueWorkItem([NotNull, Pooled] Action workItem, int amount = 1)
public unsafe void QueueWorkItem([NotNull, Pooled] Action workItem, int amount = 1)
{
// Throw right here to help debugging
if (workItem == null)
@@ -85,10 +111,55 @@ public void QueueWorkItem([NotNull, Pooled] Action workItem, int amount = 1)
}

Interlocked.Add(ref workScheduled, amount);
var work = new Work { WorkHandler = &ActionHandler, Data = workItem };
for (int i = 0; i < amount; i++)
{
PooledDelegateHelper.AddReference(workItem);
workItems.Enqueue(workItem);
workItems.Enqueue(work);
}
semaphore.Release(amount);
}

static void ActionHandler(object param)
{
Action action = (Action)param;
try
{
action();
}
finally
{
PooledDelegateHelper.Release(action);
}
}

/// <summary>
/// Queue some work item to run on one of the available threads,
/// it is strongly recommended that the action takes less than a millisecond.
/// Additionally, the parameter provided must be fixed from this call onward until the action has finished executing
/// </summary>
public unsafe void QueueUnsafeWorkItem(object parameter, delegate*<object, void> obj, int amount = 1)
{
if (parameter == null)
{
throw new NullReferenceException(nameof(parameter));
}

if (amount < 1)
{
throw new ArgumentOutOfRangeException(nameof(amount));
}

if (disposing > 0)
{
throw new ObjectDisposedException(ToString());
}

Interlocked.Add(ref workScheduled, amount);
var work = new Work { WorkHandler = obj, Data = parameter };
for (int i = 0; i < amount; i++)
{
workItems.Enqueue(work);
}
semaphore.Release(amount);
}
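
For context, a hedged usage sketch of the QueueUnsafeWorkItem overload added above; MyJobState, QueueExample and ExecuteJob are illustrative names, not part of the PR. The handler is a C# 9 static function pointer, so no closure or pooled delegate is allocated, and the state object must stay valid until every queued copy has run.

// Illustrative only; assumes a ThreadPool instance named pool.
sealed class MyJobState
{
    public int[] Values;
}

static unsafe void QueueExample(ThreadPool pool, MyJobState state)
{
    // Queue the same handler four times; each worker dequeues one Work entry
    // and invokes ExecuteJob(state) through the stored function pointer.
    pool.QueueUnsafeWorkItem(state, &ExecuteJob, amount: 4);
}

static void ExecuteJob(object parameter)
{
    var state = (MyJobState)parameter;
    // Do a short (ideally sub-millisecond) chunk of work against state.Values here.
}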
@@ -98,20 +169,19 @@ public void QueueWorkItem([NotNull, Pooled] Action workItem, int amount = 1)
/// If you absolutely have to block inside one of the threadpool's thread for whatever
/// reason do a busy loop over this function.
/// </summary>
public bool TryCooperate()
public unsafe bool TryCooperate()
{
if (workItems.TryDequeue(out var workItem))
{
Interlocked.Increment(ref threadsBusy);
Interlocked.Decrement(ref workScheduled);
try
{
using var _ = Profiler.Begin(ProcessWorkItemKey);
workItem.Invoke();
using (Profiler.Begin(ProcessWorkItemKey))
workItem.WorkHandler(workItem.Data);
}
finally
{
PooledDelegateHelper.Release(workItem);
Interlocked.Decrement(ref threadsBusy);
Interlocked.Increment(ref completionCounter);
}
@@ -172,14 +242,11 @@ public void Dispose()
{
return;
}

semaphore.Release(WorkerThreadsCount);
semaphore.Dispose();
while (Volatile.Read(ref leftToDispose) != 0)
{
if (semaphore.SignalCount == 0)
{
semaphore.Release(1);
}
Thread.Yield();
}

@@ -189,5 +256,38 @@ public void Dispose()

}
}

unsafe struct Work
{
public object Data;
public delegate*<object, void> WorkHandler;
}

private interface ISemaphore : IDisposable
{
public void Release(int count);
public void Wait(int timeout = -1);
}

private sealed class DotnetLifoSemaphore : ISemaphore
{
private readonly IDisposable semaphore;
private readonly Func<int, bool, bool> wait;
Member:
Curious, can C# 9.0 function pointers (and GetFunctionPointer()) be useful in this scenario?

(Not sure it would make an actual perf difference, but if it is usable enough for this use case it would mean I can probably use it in some other places.)

Collaborator (Author):
From my limited testing, yes, although I do not know the implications this has for the JIT; I do remember that the address you get when taking a static method's function pointer is not fixed. If the method is moved after the JIT has compiled it, we might have to retrieve the function pointer from its runtime method handle on every call to make sure we run the optimal version ...

private readonly Action<int> release;

public DotnetLifoSemaphore(int spinCount)
{
// The semaphore Dotnet uses for its own threadpool is more efficient than what's publicly available,
// but sadly it is internal - we'll hijack it through reflection
Type lifoType = Type.GetType("System.Threading.LowLevelLifoSemaphore");
Member:
Would be good to add a comment here that we use reflection to access an internal type

semaphore = Activator.CreateInstance(lifoType, new object[]{ 0, short.MaxValue, spinCount, new Action( () => {} ) }) as IDisposable;
wait = lifoType.GetMethod("Wait", BindingFlags.Instance | BindingFlags.Public).CreateDelegate<Func<int, bool, bool>>(semaphore);
release = lifoType.GetMethod("Release", BindingFlags.Instance | BindingFlags.Public).CreateDelegate<Action<int>>(semaphore);
}

public void Dispose() => semaphore.Dispose();
public void Release(int count) => release(count);
public void Wait(int timeout = -1) => wait(timeout, true);
}
}
}
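
On the function-pointer question raised in the review thread above, an untested, illustrative sketch of what binding LowLevelLifoSemaphore.Wait through RuntimeMethodHandle.GetFunctionPointer might look like instead of the CreateDelegate approach used by this PR. The cast to a managed function pointer with the receiver as the first argument is an assumption about the runtime's instance calling convention, and, as the author notes, tiered compilation may invalidate the cached pointer, so this is a sketch only.

using System;
using System.Reflection;

sealed unsafe class LifoSemaphoreViaFunctionPointer : IDisposable
{
    private readonly IDisposable semaphore;
    // Receiver-first managed function pointer standing in for Func<int, bool, bool>.
    private readonly delegate*<object, int, bool, bool> wait;

    public LifoSemaphoreViaFunctionPointer(int spinCount)
    {
        Type lifoType = Type.GetType("System.Threading.LowLevelLifoSemaphore");
        semaphore = Activator.CreateInstance(lifoType, 0, short.MaxValue, spinCount, new Action(() => { })) as IDisposable;
        MethodInfo waitMethod = lifoType.GetMethod("Wait", BindingFlags.Instance | BindingFlags.Public);
        // Not guaranteed to stay valid across JIT tiering; shown purely to illustrate the idea.
        wait = (delegate*<object, int, bool, bool>)(void*)waitMethod.MethodHandle.GetFunctionPointer();
    }

    public bool Wait(int timeout = -1) => wait(semaphore, timeout, true);
    public void Dispose() => semaphore.Dispose();
}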
@@ -93,9 +93,9 @@ protected override void OnEntityComponentRemoved(Entity entity, TransformCompone
}
}

internal void UpdateTransformations(FastCollection<TransformComponent> transformationComponents)
internal unsafe void UpdateTransformations(FastCollection<TransformComponent> transformationComponents)
{
Dispatcher.ForEach(transformationComponents, UpdateTransformationsRecursive);
Dispatcher.ForBatched(transformationComponents.Count, transformationComponents, &UpdateTransformationsRecursive);

// Re-update model node links to avoid one frame delay compared reference model (ideally entity should be sorted to avoid this in future).
if (ModelNodeLinkProcessor != null)
@@ -105,17 +105,18 @@ internal void UpdateTransformations(FastCollection<TransformComponent> transform
{
modelNodeLinkComponents.Add(modelNodeLink.Entity.Transform);
}
Dispatcher.ForEach(modelNodeLinkComponents, UpdateTransformationsRecursive);
Dispatcher.ForBatched(modelNodeLinkComponents.Count, modelNodeLinkComponents, &UpdateTransformationsRecursive);
}
}

private static void UpdateTransformationsRecursive(TransformComponent transform)
private static void UpdateTransformationsRecursive(FastCollection<TransformComponent> transforms, int from, int toExclusive)
{
transform.UpdateLocalMatrix();
transform.UpdateWorldMatrixInternal(false);
foreach (var child in transform.Children)
for (int i = from; i < toExclusive; i++)
{
UpdateTransformationsRecursive(child);
var transform = transforms[i];
transform.UpdateLocalMatrix();
transform.UpdateWorldMatrixInternal(false);
UpdateTransformationsRecursive(transform.Children, 0, transform.Children.Count);
}
}

@@ -280,31 +280,35 @@ public override void Prepare(RenderDrawContext context)
// Assign descriptor sets to each render node
var resourceGroupPool = ((RootEffectRenderFeature)RootRenderFeature).ResourceGroupPool;

Dispatcher.For(0, RootRenderFeature.RenderNodes.Count, () => context.RenderContext.GetThreadContext(), (renderNodeIndex, threadContext) =>
Dispatcher.ForBatched(RootRenderFeature.RenderNodes.Count, (from, toExclusive) =>
{
var renderNodeReference = new RenderNodeReference(renderNodeIndex);
var renderNode = RootRenderFeature.RenderNodes[renderNodeIndex];
var renderMesh = (RenderMesh)renderNode.RenderObject;
var threadContext = context.RenderContext.GetThreadContext();
for (int i = from; i < toExclusive; i++)
{
var renderNodeReference = new RenderNodeReference(i);
var renderNode = RootRenderFeature.RenderNodes[i];
var renderMesh = (RenderMesh)renderNode.RenderObject;

// Ignore fallback effects
if (renderNode.RenderEffect.State != RenderEffectState.Normal)
return;
// Ignore fallback effects
if (renderNode.RenderEffect.State != RenderEffectState.Normal)
continue;

// Collect materials and create associated MaterialInfo (includes reflection) first time
// TODO: We assume same material will generate same ResourceGroup (i.e. same resources declared in same order)
// Need to offer some protection if this invariant is violated (or support it if it can actually happen in real scenario)
var material = renderMesh.MaterialPass;
var materialInfo = renderMesh.MaterialInfo;
var materialParameters = material.Parameters;
// Collect materials and create associated MaterialInfo (includes reflection) first time
// TODO: We assume same material will generate same ResourceGroup (i.e. same resources declared in same order)
// Need to offer some protection if this invariant is violated (or support it if it can actually happen in real scenario)
var material = renderMesh.MaterialPass;
var materialInfo = renderMesh.MaterialInfo;
var materialParameters = material.Parameters;

// Register resources usage
Context.StreamingManager?.StreamResources(materialParameters);
// Register resources usage
Context.StreamingManager?.StreamResources(materialParameters);

if (!UpdateMaterial(RenderSystem, threadContext, materialInfo, perMaterialDescriptorSetSlot.Index, renderNode.RenderEffect, materialParameters))
return;
if (!UpdateMaterial(RenderSystem, threadContext, materialInfo, perMaterialDescriptorSetSlot.Index, renderNode.RenderEffect, materialParameters))
continue;

var descriptorSetPoolOffset = ((RootEffectRenderFeature)RootRenderFeature).ComputeResourceGroupOffset(renderNodeReference);
resourceGroupPool[descriptorSetPoolOffset + perMaterialDescriptorSetSlot.Index] = materialInfo.Resources;
var descriptorSetPoolOffset = ((RootEffectRenderFeature)RootRenderFeature).ComputeResourceGroupOffset(renderNodeReference);
resourceGroupPool[descriptorSetPoolOffset + perMaterialDescriptorSetSlot.Index] = materialInfo.Resources;
}
});
}

34 changes: 19 additions & 15 deletions sources/engine/Stride.Rendering/Rendering/SkinningRenderFeature.cs
@@ -56,7 +56,7 @@ public override void PrepareEffectPermutations(RenderDrawContext context)
int effectSlotCount = ((RootEffectRenderFeature)RootRenderFeature).EffectPermutationSlotCount;

//foreach (var objectNodeReference in RootRenderFeature.ObjectNodeReferences)
Dispatcher.ForEach(((RootEffectRenderFeature)RootRenderFeature).ObjectNodeReferences, objectNodeReference =>
Dispatcher.ForEach(RootRenderFeature.ObjectNodeReferences, objectNodeReference =>
{
var objectNode = RootRenderFeature.GetObjectNode(objectNodeReference);
var renderMesh = (RenderMesh)objectNode.RenderObject;
@@ -116,25 +116,29 @@ public override unsafe void Prepare(RenderDrawContext context)
{
var renderModelObjectInfoData = RootRenderFeature.RenderData.GetData(renderModelObjectInfoKey);

Dispatcher.ForEach(((RootEffectRenderFeature)RootRenderFeature).RenderNodes, (ref RenderNode renderNode) =>
Dispatcher.ForBatched(RootRenderFeature.RenderNodes.Count, (from, toExclusive) =>
{
var perDrawLayout = renderNode.RenderEffect.Reflection?.PerDrawLayout;
if (perDrawLayout == null)
return;
for (int i = from; i < toExclusive; i++)
{
var renderNode = RootRenderFeature.RenderNodes[i];
var perDrawLayout = renderNode.RenderEffect.Reflection?.PerDrawLayout;
if (perDrawLayout == null)
continue;

var blendMatricesOffset = perDrawLayout.GetConstantBufferOffset(blendMatrices);
if (blendMatricesOffset == -1)
return;
var blendMatricesOffset = perDrawLayout.GetConstantBufferOffset(blendMatrices);
if (blendMatricesOffset == -1)
continue;

var renderModelObjectInfo = renderModelObjectInfoData[renderNode.RenderObject.ObjectNode];
if (renderModelObjectInfo == null)
return;
var renderModelObjectInfo = renderModelObjectInfoData[renderNode.RenderObject.ObjectNode];
if (renderModelObjectInfo == null)
continue;

var mappedCB = (byte*)renderNode.Resources.ConstantBuffer.Data + blendMatricesOffset;
var mappedCB = (byte*)renderNode.Resources.ConstantBuffer.Data + blendMatricesOffset;

fixed (Matrix* blendMatricesPtr = renderModelObjectInfo)
{
Unsafe.CopyBlockUnaligned(mappedCB, blendMatricesPtr, (uint)renderModelObjectInfo.Length * (uint)sizeof(Matrix));
fixed (Matrix* blendMatricesPtr = renderModelObjectInfo)
{
Unsafe.CopyBlockUnaligned(mappedCB, blendMatricesPtr, (uint)renderModelObjectInfo.Length * (uint)sizeof(Matrix));
}
}
});
}