Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed critical issue in CPUAccelerator runtime related to Shared Memory allocations and Warp operations. #836

Merged
merged 3 commits into from
Sep 1, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
153 changes: 153 additions & 0 deletions Src/ILGPU.Algorithms.Tests/WarpExtensionTests.Find.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
// ---------------------------------------------------------------------------------------
// ILGPU Algorithms
// Copyright (c) 2022 ILGPU Project
// www.ilgpu.net
//
// File: WarpExtensionTests.Find.cs
//
// This file is part of ILGPU and is distributed under the University of Illinois Open
// Source License. See LICENSE.txt for details.
// ---------------------------------------------------------------------------------------

using ILGPU.Algorithms.ScanReduceOperations;
using ILGPU.AtomicOperations;
using ILGPU.Runtime;
using ILGPU.Tests;
using ILGPU.Util;
using System;
using System.Runtime.CompilerServices;
using Xunit;

#pragma warning disable xUnit1026
#pragma warning disable CA1815

namespace ILGPU.Algorithms.Tests
{
partial class WarpExtensionTests
{
/// <summary>
/// Pairs a distance with the index it was taken from and bundles the reduction,
/// atomic and compare-exchange operations over such pairs in a single type.
/// </summary>
internal readonly struct LaneEntry :
    IScanReduceOperation<LaneEntry>,
    IAtomicOperation<LaneEntry>,
    ICompareExchangeOperation<LaneEntry>
{
    /// <summary>
    /// Creates a new entry.
    /// </summary>
    /// <param name="distance">The distance value of this entry.</param>
    /// <param name="laneIndex">The index associated with the distance.</param>
    public LaneEntry(int distance, int laneIndex)
    {
        Distance = distance;
        LaneIndex = laneIndex;
    }

    /// <summary>
    /// Returns the distance value of this entry.
    /// </summary>
    public int Distance { get; }

    /// <summary>
    /// Returns the index associated with the distance.
    /// </summary>
    public int LaneIndex { get; }

    /// <summary>
    /// Returns an empty string.
    /// </summary>
    public string CLCommand => string.Empty;

    /// <summary>
    /// Returns the neutral element: both components are set to int.MaxValue, so
    /// every entry with a smaller distance wins against it in <see cref="Apply"/>.
    /// </summary>
    public LaneEntry Identity => new(int.MaxValue, int.MaxValue);

    /// <summary>
    /// Selects the entry with the strictly smaller distance; on equal distances
    /// the second entry is returned.
    /// </summary>
    public LaneEntry Apply(LaneEntry first, LaneEntry second)
    {
        bool firstIsCloser = first.Distance < second.Distance;
        return Utilities.Select(firstIsCloser, first, second);
    }

    /// <summary>
    /// Combines the current value with the given one by forwarding to
    /// <see cref="Apply"/>.
    /// </summary>
    public LaneEntry Operation(LaneEntry current, LaneEntry value)
    {
        return Apply(current, value);
    }

    /// <summary>
    /// Applies this operation to the target location via Atomic.MakeAtomic, using
    /// this instance as both the operation and the compare-exchange provider.
    /// </summary>
    public void AtomicApply(ref LaneEntry target, LaneEntry value)
    {
        Atomic.MakeAtomic(ref target, value, this, this);
    }

    /// <summary>
    /// Performs a compare-exchange on an entry by reinterpreting all involved
    /// entries as 64-bit integers.
    /// </summary>
    /// <remarks>
    /// NOTE(review): this presumes the struct occupies exactly 8 bytes (its two
    /// int properties) - confirm if members are ever added.
    /// </remarks>
    public LaneEntry CompareExchange(
        ref LaneEntry target,
        LaneEntry compare,
        LaneEntry value)
    {
        ref long rawTarget = ref Unsafe.As<LaneEntry, long>(ref target);
        long rawExpected = Unsafe.As<LaneEntry, long>(ref compare);
        long rawReplacement = Unsafe.As<LaneEntry, long>(ref value);

        long rawPrevious = Atomic.CompareExchange(
            ref rawTarget,
            rawExpected,
            rawReplacement);
        return Unsafe.As<long, LaneEntry>(ref rawPrevious);
    }

    /// <summary>
    /// Returns true if both entries agree in distance and index.
    /// </summary>
    public bool IsSame(LaneEntry left, LaneEntry right)
    {
        bool sameDistance = left.Distance == right.Distance;
        bool sameLane = left.LaneIndex == right.LaneIndex;

        // Non-short-circuit conjunction, exactly as in the comparison it mirrors
        return sameDistance & sameLane;
    }

    /// <summary>
    /// Formats the entry as "index: distance" followed by a 'd' suffix.
    /// </summary>
    public override string ToString() => $"{LaneIndex}: {Distance}d";
}

/// <summary>
/// Test kernel that determines the smallest absolute difference between
/// values[Grid.IdxX] and any element of origins, and stores it in
/// results[Grid.IdxX].
/// </summary>
/// <param name="values">
/// The source values, indexed by Grid.IdxX. Negative entries are skipped and
/// leave their result slot untouched.
/// </param>
/// <param name="origins">The candidate values to compare against.</param>
/// <param name="results">
/// Receives the smallest distance found, indexed by Grid.IdxX.
/// </param>
public static void FindKernel(
    ArrayView<int> values,
    ArrayView<int> origins,
    ArrayView<int> results)
{
    const int MaxGroupSize = 1024;

    int localDistance = int.MaxValue;
    ref var bestSharedDistance = ref SharedMemory.Allocate<int>();
    var sharedDistanceValues = SharedMemory.Allocate<int>(MaxGroupSize);

    // Initialize all slots of the shared buffer. The loop variable has to be used
    // as the index here: indexing with Group.IdxX would make every thread rewrite
    // its own slot over and over and leave all slots >= Group.DimX uninitialized.
    for (int i = Group.IdxX; i < MaxGroupSize; i += Group.DimX)
        sharedDistanceValues[i] = localDistance;
    Group.Barrier();

    // NOTE(review): the exit condition depends on Grid.IdxX only, so presumably
    // all threads of a group leave together and none is left waiting at the
    // barriers below - confirm.
    int source = values[Grid.IdxX];
    if (source < 0)
        return;

    // Each thread scans its share of origins (offset Group.IdxX, step Group.DimX)
    // and keeps the smallest distance it has seen
    for (int i = Group.IdxX; i < origins.IntLength; i += Group.DimX)
    {
        int origin = origins[i];
        int result = Math.Abs(origin - source);

        localDistance = Utilities.Select(
            result < localDistance,
            result,
            localDistance);
    }

    // Commit changes to shared memory
    sharedDistanceValues[Group.IdxX] = localDistance;
    Group.Barrier();

    // Determine the best value in the first warp
    if (Warp.WarpIdx == 0)
    {
        LaneEntry entry = default(LaneEntry).Identity;
        for (int i = Warp.LaneIdx; i < Group.DimX; i += Warp.WarpSize)
        {
            // Reduce one chunk of per-thread distances and fold it into the
            // running minimum of this lane
            var bestFit = WarpExtensions.Reduce<LaneEntry, LaneEntry>(
                new(sharedDistanceValues[i], i));
            entry = entry.Apply(entry, bestFit);
        }
        Warp.Barrier();

        // First lane contains the actual result
        if (Warp.IsFirstLane)
            bestSharedDistance = sharedDistanceValues[entry.LaneIndex];
    }
    Group.Barrier();

    // First thread should have all results
    if (Group.IsFirstThread)
        results[Grid.IdxX] = bestSharedDistance;
}

/// <summary>
/// Runs FindKernel with one group per source value and verifies that each result
/// is the smallest absolute difference between that value and all origins.
/// </summary>
[Fact]
[KernelMethod(nameof(FindKernel))]
public void FindDistances()
{
    int[] sources = { 1, 2, 3 };
    int[] candidates = { 7, 5, 1, 4, 5, 1, 2, 4, 6, 7, 8 };

    // 1 and 2 occur among the candidates (distance 0); 3 is one off from 2 and 4
    int[] expected = { 0, 0, 1 };

    using var sourceBuffer = Accelerator.Allocate1D<int>(sources);
    using var candidateBuffer = Accelerator.Allocate1D<int>(candidates);
    using var outputBuffer = Accelerator.Allocate1D<int>(sources.Length);

    var config = new KernelConfig(
        sources.Length,
        Accelerator.MaxNumThreadsPerGroup);
    Execute(
        config,
        sourceBuffer.View.AsContiguous(),
        candidateBuffer.View.AsContiguous(),
        outputBuffer.View.AsContiguous());

    Verify(outputBuffer.View, expected);
}
}
}

#pragma warning restore xUnit1026
#pragma warning restore CA1815
22 changes: 11 additions & 11 deletions Src/ILGPU.Algorithms.Tests/WarpExtensionTests.tt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// ---------------------------------------------------------------------------------------
// ILGPU Algorithms
// Copyright (c) 2021 ILGPU Project
// Copyright (c) 2021-2022 ILGPU Project
// www.ilgpu.net
//
// File: WarpExtensionTests.tt/WarpExtensionTests.cs
Expand All @@ -22,7 +22,7 @@ using System.Linq;
using Xunit;
using Xunit.Abstractions;

#pragma warning disable xUnit1026
#pragma warning disable xUnit1026

namespace ILGPU.Algorithms.Tests
{
Expand Down Expand Up @@ -120,7 +120,7 @@ namespace ILGPU.Algorithms.Tests

var sequence = sequencer.ComputeSequence(start, stepSize, size);
input.CopyFromCPU(stream, sequence);

Execute<KernelConfig, T, TFunction>((1, size), input.View, output.View);

T expected = CalcValue(sequence, func);
Expand Down Expand Up @@ -151,9 +151,9 @@ namespace ILGPU.Algorithms.Tests

var sequence = sequencer.ComputeSequence(start, stepSize, size);
input.CopyFromCPU(stream, sequence);

Execute<KernelConfig, T, TFunction>((1, size), input.View, output.View);

var expected = Enumerable.Repeat(CalcValue(sequence, func), size).ToArray();
Verify(output.View, expected);
}
Expand All @@ -179,7 +179,7 @@ namespace ILGPU.Algorithms.Tests

var sequence = sequencer.ComputeSequence(start, stepSize, size);
input.CopyFromCPU(stream, sequence);

Execute<KernelConfig, T, TFunction>((1, size), input.View, output.View);

T[] expected = CalcValues(sequence, func, ScanKind.Exclusive);
Expand Down Expand Up @@ -208,7 +208,7 @@ namespace ILGPU.Algorithms.Tests

var sequence = sequencer.ComputeSequence(start, stepSize, size);
input.CopyFromCPU(stream, sequence);

Execute<KernelConfig, T, TFunction>((1, size), input.View, output.View);

T[] expected = CalcValues(sequence, func, ScanKind.Exclusive);
Expand Down Expand Up @@ -238,13 +238,13 @@ namespace ILGPU.Algorithms.Tests

var sequence = sequencer.ComputeSequence(start, stepSize, size);
input.CopyFromCPU(stream, sequence);

Execute<KernelConfig, T, TFunction>((1, size), input.View, output.View);

T[] expected = CalcValues(sequence, func, ScanKind.Inclusive);
Verify(output.View, expected);
}

#region Helper Methods

private static T CalcValue<T, TFunction>(T[] values, TFunction func)
Expand Down Expand Up @@ -272,7 +272,7 @@ namespace ILGPU.Algorithms.Tests

if (kind == ScanKind.Exclusive)
{
T[] zero = { default };
T[] zero = { default };
result = zero.Concat(result.Take(result.Length - 1)).ToArray();
}
return result;
Expand Down
5 changes: 4 additions & 1 deletion Src/ILGPU/Backends/IL/ILBackend.cs
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,10 @@ protected sealed override CompiledKernel Compile(
kernelMethod,
taskType,
taskConstructor,
taskArgumentMapping);
taskArgumentMapping,
backendContext.SharedAllocations.Length +
backendContext.DynamicSharedAllocations.Length,
backendContext.SharedMemorySpecification.StaticSize);
}

/// <summary>
Expand Down
24 changes: 22 additions & 2 deletions Src/ILGPU/Backends/IL/ILCompiledKernel.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// ---------------------------------------------------------------------------------------
// ILGPU
// Copyright (c) 2018-2021 ILGPU Project
// Copyright (c) 2018-2022 ILGPU Project
// www.ilgpu.net
//
// File: ILCompiledKernel.cs
Expand Down Expand Up @@ -35,13 +35,21 @@ public sealed class ILCompiledKernel : CompiledKernel
/// <param name="taskArgumentMapping">
/// Mapping of argument indices to fields.
/// </param>
/// <param name="numSharedMemoryAllocations">
/// The number of shared-memory allocations.
/// </param>
/// <param name="allocatedSharedMemorySize">
/// The amount of statically allocated bytes of shared memory.
/// </param>
internal ILCompiledKernel(
Context context,
EntryPoint entryPoint,
MethodInfo kernelMethod,
Type taskType,
ConstructorInfo taskConstructor,
ImmutableArray<FieldInfo> taskArgumentMapping)
ImmutableArray<FieldInfo> taskArgumentMapping,
int numSharedMemoryAllocations,
int allocatedSharedMemorySize)
: base(context, entryPoint, null)
{
KernelMethod = kernelMethod;
Expand All @@ -50,6 +58,8 @@ internal ILCompiledKernel(
TaskType = taskType;
TaskConstructor = taskConstructor;
TaskArgumentMapping = taskArgumentMapping;
NumSharedMemoryAllocations = numSharedMemoryAllocations;
AllocatedSharedMemorySize = allocatedSharedMemorySize;
}

#endregion
Expand Down Expand Up @@ -81,6 +91,16 @@ internal ILCompiledKernel(
/// </summary>
internal ImmutableArray<FieldInfo> TaskArgumentMapping { get; }

/// <summary>
/// Returns the number of shared-memory allocations.
/// </summary>
public int NumSharedMemoryAllocations { get; }

/// <summary>
/// Returns the size of statically allocated shared memory in bytes.
/// </summary>
public int AllocatedSharedMemorySize { get; }

#endregion
}
}
12 changes: 6 additions & 6 deletions Src/ILGPU/Runtime/CPU/CPURuntimeContext.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// ---------------------------------------------------------------------------------------
// ILGPU
// Copyright (c) 2021 ILGPU Project
// Copyright (c) 2021-2022 ILGPU Project
// www.ilgpu.net
//
// File: CPURuntimeContext.cs
Expand Down Expand Up @@ -171,19 +171,19 @@ protected CPURuntimeContext(CPUMultiprocessor multiprocessor)
/// <param name="operation">The operation to perform.</param>
/// <returns>The determined result value for all threads.</returns>
/// <remarks>
/// It internally acquires a lock using <see cref="AquireLock"/> and determines
/// It internally acquires a lock using <see cref="AcquireLock"/> and determines
/// a "main thread" that can execute the given operation in sync with all
/// other threads. Afterwards, all threads continue and query the result of
/// the synchronized operation and the main thread releases its lock.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
protected T PerformLocked<TParent, TOperation, T>(
protected T PerformLockStep<TParent, TOperation, T>(
TParent parent,
TOperation operation)
where TParent : IParent
where TOperation : ILockedOperation<T>
{
bool isMainThread = AquireLock();
bool isMainThread = AcquireLock();
if (isMainThread)
operation.ApplySyncInMainThread();
parent.Barrier();
Expand All @@ -198,7 +198,7 @@ protected T PerformLocked<TParent, TOperation, T>(
/// </summary>
/// <returns>True, if the current thread is the main thread.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
protected bool AquireLock() =>
protected bool AcquireLock() =>
Interlocked.CompareExchange(ref memoryLock, 1, 0) == 0;

/// <summary>
Expand Down Expand Up @@ -240,7 +240,7 @@ protected T Broadcast<TParent, T>(
where T : unmanaged
{
// Allocate a compatible view to perform the actual broadcast operation
var view = PerformLocked<
var view = PerformLockStep<
TParent,
GetBroadcastMemory<T>,
ArrayView<T>>(
Expand Down
Loading