Skip to content

Commit

Permalink
Fixed critical issue in CPUAccelerator runtime related to Shared Memo…
Browse files Browse the repository at this point in the history
…ry allocations and Warp operations. (#836)
  • Loading branch information
m4rs-mt authored Sep 1, 2022
1 parent 474a32a commit dbc1cb0
Show file tree
Hide file tree
Showing 7 changed files with 290 additions and 89 deletions.
153 changes: 153 additions & 0 deletions Src/ILGPU.Algorithms.Tests/WarpExtensionTests.Find.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
// ---------------------------------------------------------------------------------------
// ILGPU Algorithms
// Copyright (c) 2022 ILGPU Project
// www.ilgpu.net
//
// File: WarpExtensionTests.Find.cs
//
// This file is part of ILGPU and is distributed under the University of Illinois Open
// Source License. See LICENSE.txt for details.
// ---------------------------------------------------------------------------------------

using ILGPU.Algorithms.ScanReduceOperations;
using ILGPU.AtomicOperations;
using ILGPU.Runtime;
using ILGPU.Tests;
using ILGPU.Util;
using System;
using System.Runtime.CompilerServices;
using Xunit;

#pragma warning disable xUnit1026
#pragma warning disable CA1815

namespace ILGPU.Algorithms.Tests
{
partial class WarpExtensionTests
{
/// <summary>
/// Pairs a distance with an associated index and implements the reduction,
/// atomic and compare-exchange operations on such pairs. Combining two entries
/// keeps the one with the strictly smaller distance.
/// </summary>
internal readonly struct LaneEntry :
    IScanReduceOperation<LaneEntry>,
    IAtomicOperation<LaneEntry>,
    ICompareExchangeOperation<LaneEntry>
{
    /// <summary>
    /// Creates a new entry.
    /// </summary>
    /// <param name="distance">The distance value.</param>
    /// <param name="laneIndex">The index associated with the distance.</param>
    public LaneEntry(int distance, int laneIndex)
    {
        Distance = distance;
        LaneIndex = laneIndex;
    }

    /// <summary>
    /// Returns an empty string.
    /// </summary>
    public string CLCommand
    {
        get { return string.Empty; }
    }

    /// <summary>
    /// Returns an entry whose distance and index are both
    /// <see cref="int.MaxValue"/>.
    /// </summary>
    public LaneEntry Identity
    {
        get { return new LaneEntry(int.MaxValue, int.MaxValue); }
    }

    // NOTE: Distance must stay declared before LaneIndex; CompareExchange
    // reinterprets the whole struct as a single long value.

    /// <summary>
    /// Returns the distance value.
    /// </summary>
    public int Distance { get; }

    /// <summary>
    /// Returns the index associated with the distance.
    /// </summary>
    public int LaneIndex { get; }

    /// <summary>
    /// Selects <paramref name="first"/> if its distance is strictly smaller
    /// than the distance of <paramref name="second"/>; otherwise
    /// <paramref name="second"/>.
    /// </summary>
    public LaneEntry Apply(LaneEntry first, LaneEntry second)
    {
        bool firstIsCloser = first.Distance < second.Distance;
        return Utilities.Select(firstIsCloser, first, second);
    }

    /// <summary>
    /// Forwards to <see cref="Apply(LaneEntry, LaneEntry)"/>.
    /// </summary>
    public LaneEntry Operation(LaneEntry current, LaneEntry value)
    {
        return Apply(current, value);
    }

    /// <summary>
    /// Applies <paramref name="value"/> to <paramref name="target"/> via
    /// <c>Atomic.MakeAtomic</c>, passing this instance for both of its
    /// operation arguments.
    /// </summary>
    public void AtomicApply(ref LaneEntry target, LaneEntry value)
    {
        Atomic.MakeAtomic(ref target, value, this, this);
    }

    /// <summary>
    /// Performs a compare-exchange on <paramref name="target"/> by
    /// reinterpreting all involved entries as <see cref="long"/> values.
    /// </summary>
    /// <returns>
    /// The value returned by <c>Atomic.CompareExchange</c>, reinterpreted as
    /// an entry.
    /// </returns>
    public LaneEntry CompareExchange(
        ref LaneEntry target,
        LaneEntry compare,
        LaneEntry value)
    {
        // Assumes that the two int properties make up exactly 8 bytes.
        ref long rawTarget = ref Unsafe.As<LaneEntry, long>(ref target);
        long rawCompare = Unsafe.As<LaneEntry, long>(ref compare);
        long rawValue = Unsafe.As<LaneEntry, long>(ref value);

        long previous = Atomic.CompareExchange(
            ref rawTarget,
            rawCompare,
            rawValue);
        return Unsafe.As<long, LaneEntry>(ref previous);
    }

    /// <summary>
    /// Returns true if both entries have the same distance and the same index.
    /// </summary>
    public bool IsSame(LaneEntry left, LaneEntry right)
    {
        bool sameDistance = left.Distance == right.Distance;
        bool sameIndex = left.LaneIndex == right.LaneIndex;

        // Non-short-circuit '&' on purpose (both comparisons are evaluated).
        return sameDistance & sameIndex;
    }

    /// <summary>
    /// Returns the string representation "index: distance" followed by 'd'.
    /// </summary>
    public override string ToString()
    {
        return $"{LaneIndex}: {Distance}d";
    }
}

/// <summary>
/// Test kernel that determines the smallest absolute distance between
/// <c>values[Grid.IdxX]</c> and all entries of <paramref name="origins"/> and
/// stores it in <c>results[Grid.IdxX]</c>.
/// </summary>
/// <param name="values">The source values (read at Grid.IdxX).</param>
/// <param name="origins">The origin values to measure distances to.</param>
/// <param name="results">The target view (written at Grid.IdxX).</param>
public static void FindKernel(
    ArrayView<int> values,
    ArrayView<int> origins,
    ArrayView<int> results)
{
    const int MaxGroupSize = 1024;

    int localDistance = int.MaxValue;
    ref var bestSharedDistance = ref SharedMemory.Allocate<int>();
    var sharedDistanceValues = SharedMemory.Allocate<int>(MaxGroupSize);

    // Initialize all slots of the shared buffer. The slot has to be addressed
    // via the loop variable; indexing with Group.IdxX would rewrite the same
    // slot in each iteration and leave all slots >= Group.DimX untouched.
    for (int i = Group.IdxX; i < MaxGroupSize; i += Group.DimX)
        sharedDistanceValues[i] = localDistance;
    Group.Barrier();

    int source = values[Grid.IdxX];
    // NOTE(review): this early exit skips the barriers below; presumably the
    // source value is the same for all threads of a group - confirm.
    if (source < 0)
        return;

    // Determine the smallest distance over all origins visited by this thread
    for (int i = Group.IdxX; i < origins.IntLength; i += Group.DimX)
    {
        int origin = origins[i];
        int result = Math.Abs(origin - source);

        localDistance = Utilities.Select(
            result < localDistance,
            result,
            localDistance);
    }

    // Commit changes to shared memory
    sharedDistanceValues[Group.IdxX] = localDistance;
    Group.Barrier();

    // Determine the best value in the first warp
    if (Warp.WarpIdx == 0)
    {
        LaneEntry entry = default(LaneEntry).Identity;
        for (int i = Warp.LaneIdx; i < Group.DimX; i += Warp.WarpSize)
        {
            var bestFit = WarpExtensions.Reduce<LaneEntry, LaneEntry>(
                new(sharedDistanceValues[i], i));
            entry = entry.Apply(entry, bestFit);
        }
        Warp.Barrier();

        // First lane contains the actual result
        if (Warp.IsFirstLane)
            bestSharedDistance = sharedDistanceValues[entry.LaneIndex];
    }
    Group.Barrier();

    // First thread should have all results
    if (Group.IsFirstThread)
        results[Grid.IdxX] = bestSharedDistance;
}

/// <summary>
/// Executes <see cref="FindKernel"/> with one group per source value and
/// verifies the smallest distance found for each of them.
/// </summary>
[Fact]
[KernelMethod(nameof(FindKernel))]
public void FindDistances()
{
    int[] sources = { 1, 2, 3 };
    int[] originValues = { 7, 5, 1, 4, 5, 1, 2, 4, 6, 7, 8 };
    int[] expected = { 0, 0, 1 };

    using var sourceBuffer = Accelerator.Allocate1D<int>(sources);
    using var originBuffer = Accelerator.Allocate1D<int>(originValues);
    using var outputBuffer = Accelerator.Allocate1D<int>(sources.Length);

    // One group per source value, using the maximum group size
    var config = new KernelConfig(
        sources.Length,
        Accelerator.MaxNumThreadsPerGroup);

    Execute(
        config,
        sourceBuffer.View.AsContiguous(),
        originBuffer.View.AsContiguous(),
        outputBuffer.View.AsContiguous());

    Verify(outputBuffer.View, expected);
}
}
}

#pragma warning restore xUnit1026
#pragma warning restore CA1815
22 changes: 11 additions & 11 deletions Src/ILGPU.Algorithms.Tests/WarpExtensionTests.tt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// ---------------------------------------------------------------------------------------
// ILGPU Algorithms
// Copyright (c) 2021 ILGPU Project
// Copyright (c) 2021-2022 ILGPU Project
// www.ilgpu.net
//
// File: WarpExtensionTests.tt/WarpExtensionTests.cs
Expand All @@ -22,7 +22,7 @@ using System.Linq;
using Xunit;
using Xunit.Abstractions;

#pragma warning disable xUnit1026
#pragma warning disable xUnit1026

namespace ILGPU.Algorithms.Tests
{
Expand Down Expand Up @@ -120,7 +120,7 @@ namespace ILGPU.Algorithms.Tests

var sequence = sequencer.ComputeSequence(start, stepSize, size);
input.CopyFromCPU(stream, sequence);

Execute<KernelConfig, T, TFunction>((1, size), input.View, output.View);

T expected = CalcValue(sequence, func);
Expand Down Expand Up @@ -151,9 +151,9 @@ namespace ILGPU.Algorithms.Tests

var sequence = sequencer.ComputeSequence(start, stepSize, size);
input.CopyFromCPU(stream, sequence);

Execute<KernelConfig, T, TFunction>((1, size), input.View, output.View);

var expected = Enumerable.Repeat(CalcValue(sequence, func), size).ToArray();
Verify(output.View, expected);
}
Expand All @@ -179,7 +179,7 @@ namespace ILGPU.Algorithms.Tests

var sequence = sequencer.ComputeSequence(start, stepSize, size);
input.CopyFromCPU(stream, sequence);

Execute<KernelConfig, T, TFunction>((1, size), input.View, output.View);

T[] expected = CalcValues(sequence, func, ScanKind.Exclusive);
Expand Down Expand Up @@ -208,7 +208,7 @@ namespace ILGPU.Algorithms.Tests

var sequence = sequencer.ComputeSequence(start, stepSize, size);
input.CopyFromCPU(stream, sequence);

Execute<KernelConfig, T, TFunction>((1, size), input.View, output.View);

T[] expected = CalcValues(sequence, func, ScanKind.Exclusive);
Expand Down Expand Up @@ -238,13 +238,13 @@ namespace ILGPU.Algorithms.Tests

var sequence = sequencer.ComputeSequence(start, stepSize, size);
input.CopyFromCPU(stream, sequence);

Execute<KernelConfig, T, TFunction>((1, size), input.View, output.View);

T[] expected = CalcValues(sequence, func, ScanKind.Inclusive);
Verify(output.View, expected);
}

#region Helper Methods

private static T CalcValue<T, TFunction>(T[] values, TFunction func)
Expand Down Expand Up @@ -272,7 +272,7 @@ namespace ILGPU.Algorithms.Tests

if (kind == ScanKind.Exclusive)
{
T[] zero = { default };
T[] zero = { default };
result = zero.Concat(result.Take(result.Length - 1)).ToArray();
}
return result;
Expand Down
5 changes: 4 additions & 1 deletion Src/ILGPU/Backends/IL/ILBackend.cs
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,10 @@ protected sealed override CompiledKernel Compile(
kernelMethod,
taskType,
taskConstructor,
taskArgumentMapping);
taskArgumentMapping,
backendContext.SharedAllocations.Length +
backendContext.DynamicSharedAllocations.Length,
backendContext.SharedMemorySpecification.StaticSize);
}

/// <summary>
Expand Down
24 changes: 22 additions & 2 deletions Src/ILGPU/Backends/IL/ILCompiledKernel.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// ---------------------------------------------------------------------------------------
// ILGPU
// Copyright (c) 2018-2021 ILGPU Project
// Copyright (c) 2018-2022 ILGPU Project
// www.ilgpu.net
//
// File: ILCompiledKernel.cs
Expand Down Expand Up @@ -35,13 +35,21 @@ public sealed class ILCompiledKernel : CompiledKernel
/// <param name="taskArgumentMapping">
/// Mapping of argument indices to fields.
/// </param>
/// <param name="numSharedMemoryAllocations">
/// The number of shared-memory allocations.
/// </param>
/// <param name="allocatedSharedMemorySize">
/// The amount of statically allocated bytes of shared memory.
/// </param>
internal ILCompiledKernel(
Context context,
EntryPoint entryPoint,
MethodInfo kernelMethod,
Type taskType,
ConstructorInfo taskConstructor,
ImmutableArray<FieldInfo> taskArgumentMapping)
ImmutableArray<FieldInfo> taskArgumentMapping,
int numSharedMemoryAllocations,
int allocatedSharedMemorySize)
: base(context, entryPoint, null)
{
KernelMethod = kernelMethod;
Expand All @@ -50,6 +58,8 @@ internal ILCompiledKernel(
TaskType = taskType;
TaskConstructor = taskConstructor;
TaskArgumentMapping = taskArgumentMapping;
NumSharedMemoryAllocations = numSharedMemoryAllocations;
AllocatedSharedMemorySize = allocatedSharedMemorySize;
}

#endregion
Expand Down Expand Up @@ -81,6 +91,16 @@ internal ILCompiledKernel(
/// </summary>
internal ImmutableArray<FieldInfo> TaskArgumentMapping { get; }

/// <summary>
/// Returns the number of shared-memory allocations.
/// </summary>
public int NumSharedMemoryAllocations { get; }

/// <summary>
/// Returns the size of statically allocated shared memory in bytes.
/// </summary>
public int AllocatedSharedMemorySize { get; }

#endregion
}
}
12 changes: 6 additions & 6 deletions Src/ILGPU/Runtime/CPU/CPURuntimeContext.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// ---------------------------------------------------------------------------------------
// ILGPU
// Copyright (c) 2021 ILGPU Project
// Copyright (c) 2021-2022 ILGPU Project
// www.ilgpu.net
//
// File: CPURuntimeContext.cs
Expand Down Expand Up @@ -171,19 +171,19 @@ protected CPURuntimeContext(CPUMultiprocessor multiprocessor)
/// <param name="operation">The operation to perform.</param>
/// <returns>The determined result value for all threads.</returns>
/// <remarks>
/// It internally acquires a lock using <see cref="AquireLock"/> and determines
/// It internally acquires a lock using <see cref="AcquireLock"/> and determines
/// a "main thread" that can execute the given operation in sync with all
/// other threads. Afterwards, all threads continue and query the result of
/// the synchronized operation and the main thread releases its lock.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
protected T PerformLocked<TParent, TOperation, T>(
protected T PerformLockStep<TParent, TOperation, T>(
TParent parent,
TOperation operation)
where TParent : IParent
where TOperation : ILockedOperation<T>
{
bool isMainThread = AquireLock();
bool isMainThread = AcquireLock();
if (isMainThread)
operation.ApplySyncInMainThread();
parent.Barrier();
Expand All @@ -198,7 +198,7 @@ protected T PerformLocked<TParent, TOperation, T>(
/// </summary>
/// <returns>True, if the current thread is the main thread.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
protected bool AquireLock() =>
protected bool AcquireLock() =>
Interlocked.CompareExchange(ref memoryLock, 1, 0) == 0;

/// <summary>
Expand Down Expand Up @@ -240,7 +240,7 @@ protected T Broadcast<TParent, T>(
where T : unmanaged
{
// Allocate a compatible view to perform the actual broadcast operation
var view = PerformLocked<
var view = PerformLockStep<
TParent,
GetBroadcastMemory<T>,
ArrayView<T>>(
Expand Down
Loading

0 comments on commit dbc1cb0

Please sign in to comment.