Skip to content
This repository has been archived by the owner on Sep 3, 2022. It is now read-only.

Commit

Permalink
Compile #2.
Browse files Browse the repository at this point in the history
  • Loading branch information
fire committed Feb 23, 2020
1 parent 88a7c48 commit 9c43d67
Show file tree
Hide file tree
Showing 9 changed files with 86 additions and 68 deletions.
8 changes: 4 additions & 4 deletions core/os/threaded_array_processor.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,10 @@
#include "core/os/thread.h"
#include "core/os/thread_safe.h"
#include "core/safe_refcount.h"
#include "thirdparty/FEMFXAsync/FEMFXAsyncThreading.h"
#include "thirdparty/FEMFXAsync/FEMFXCommon.h"
#include "thirdparty/FEMFXAsync/FEMFXParallelFor.h"
#include "thirdparty/FEMFXAsync/FEMFXTaskSystemInterface.h"
#include "thirdparty/fem_fx_async/FEMFXAsyncThreading.h"
#include "thirdparty/fem_fx_async/FEMFXCommon.h"
#include "thirdparty/fem_fx_async/FEMFXParallelFor.h"
#include "thirdparty/fem_fx_async/FEMFXTaskSystemInterface.h"

using namespace AMD;

Expand Down
12 changes: 12 additions & 0 deletions modules/fem_fx_async/SCsub
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/usr/bin/env python

Import('env')
Import('env_modules')

fem_fx_async = env_modules.Clone()

env_thirdparty = fem_fx_async.Clone()
env_thirdparty.add_source_files(env.modules_sources, Glob('#thirdparty/fem_fx_async/*.cpp'))

# Godot's own source files
fem_fx_async.add_source_files(env.modules_sources, "*.cpp")
5 changes: 5 additions & 0 deletions modules/fem_fx_async/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
def can_build(env, platform):
return True

def configure(env):
pass
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ namespace AMD
int32_t beginIndex;
int32_t endIndex;

FmTask() : func(NULL), data(NULL), beginIndex(0), endIndex(0) {}
FmTask() : func(nullptr), data(nullptr), beginIndex(0), endIndex(0) {}
};

// Async tasks need to be executed using this function, which starts a loop that will pick up subsequent tasks set by FmSetNextTask()
Expand All @@ -77,8 +77,8 @@ namespace AMD
{
nextIndex.store(0);
numTasksIncomplete.store(0);
followTask.func = NULL;
followTask.data = NULL;
followTask.func = nullptr;
followTask.data = nullptr;
followTask.beginIndex = 0;
followTask.endIndex = 1;
}
Expand Down Expand Up @@ -172,12 +172,12 @@ namespace AMD

bool TaskIsFinished()
{
return TaskIsFinished<void>(NULL);
return TaskIsFinished<void>(nullptr);
}

bool TasksAreFinished(uint32_t numTasks)
{
return TasksAreFinished<void>(numTasks, NULL);
return TasksAreFinished<void>(numTasks, nullptr);
}
};

Expand All @@ -190,7 +190,7 @@ namespace AMD
FmTask followTask; // Task to call following scope of this task data
FmAsyncTaskData* parentTaskData; // Pointer to parent task data for nested parallel operations

FmAsyncTaskData() : parentTaskData(NULL) { }
FmAsyncTaskData() : parentTaskData(nullptr) { }

virtual ~FmAsyncTaskData() {}
};
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ THE SOFTWARE.

#include "FEMFXParallelFor.h"
#include "FEMFXAsyncThreading.h"
#include "core/engine.h"
#include <atomic>

namespace AMD
{
Expand All @@ -50,15 +52,15 @@ namespace AMD

void SetNextTask(FmTaskFuncCallback inFunc, void* inData, int32_t inBeginIndex, int32_t inEndIndex)
{
FM_ASSERT(func == NULL); // should have run and cleared this
ERR_FAIL_COND(func == NULL); // should have run and cleared this
func = inFunc;
data = inData;
beginIndex = inBeginIndex;
endIndex = inEndIndex;
}
};

FM_THREAD_LOCAL_STORAGE FmTaskFuncLoopData gFEMFXTaskFuncLoopData;
thread_local FmTaskFuncLoopData gFEMFXTaskFuncLoopData;

// Loop while there is a non-NULL task function set in gFEMFXTaskFuncLoopData.
// Reduces use of stack with Async code.
Expand Down Expand Up @@ -95,19 +97,17 @@ namespace AMD
class FmParallelForDispatcherData
{
public:
FM_CLASS_NEW_DELETE(FmParallelForDispatcherData)

FmAtomicUint dispatcherIndexAtomic;
FmAtomicUint numDispatchersIncomplete;
std::atomic_uint32_t dispatcherIndexAtomic;
std::atomic_uint32_t numDispatchersIncomplete;

FmSubmitAsyncTaskCallback SubmitAsyncTask;
const char* taskName;
FmTaskFuncCallback TaskFunc;
FmTaskFuncCallback TaskFuncWrapped;
FmBatchingFuncCallback BatchingFunc;
void* taskData;
uint problemSize;
uint numDispatchers;
uint32_t problemSize;
uint32_t numDispatchers;

FmParallelForDispatcherData(
FmSubmitAsyncTaskCallback inSubmitAsyncTask,
Expand All @@ -116,8 +116,8 @@ namespace AMD
FmTaskFuncCallback inTaskFuncWrapped,
FmBatchingFuncCallback inBatchingFunc,
void* inTaskData,
uint inProblemSize,
uint inNumDispatchers)
uint32_t inProblemSize,
uint32_t inNumDispatchers)
{
SubmitAsyncTask = inSubmitAsyncTask;
taskName = inTaskName;
Expand All @@ -127,8 +127,8 @@ namespace AMD
taskData = inTaskData;
problemSize = inProblemSize;
numDispatchers = inNumDispatchers;
FmAtomicWrite(&dispatcherIndexAtomic.val, 0);
FmAtomicWrite(&numDispatchersIncomplete.val, numDispatchers);
dispatcherIndexAtomic.store(0);
numDispatchersIncomplete.store(numDispatchers);
}
};

Expand All @@ -142,14 +142,14 @@ namespace AMD

FmBatchingFuncCallback BatchingFunc = dispatcherData->BatchingFunc;

uint dispatcherIndex = (uint)inTaskBeginIndex;
dispatcherIndex = FmAtomicIncrement(&dispatcherData->dispatcherIndexAtomic.val) - 1;
uint32_t dispatcherIndex = (uint32_t)inTaskBeginIndex;
dispatcherIndex = dispatcherData->dispatcherIndexAtomic++ - 1;

uint numDispatchers = dispatcherData->numDispatchers;
uint problemSize = dispatcherData->problemSize;
uint32_t numDispatchers = dispatcherData->numDispatchers;
uint32_t problemSize = dispatcherData->problemSize;

// Compute range of parallel-for indices this dispatcher covers
uint beginIndex, endIndex;
uint32_t beginIndex, endIndex;
FmGetIndexRangeEvenDistribution(&beginIndex, &endIndex, dispatcherIndex, numDispatchers, problemSize);

if (BatchingFunc)
Expand All @@ -158,14 +158,14 @@ namespace AMD
void* taskData = dispatcherData->taskData;

// Save first batch to run on this thread
uint firstNumItems = (uint)BatchingFunc(taskData, beginIndex, endIndex);
uint firstBeginIndex = beginIndex;
uint32_t firstNumItems = (uint32_t)BatchingFunc(taskData, beginIndex, endIndex);
uint32_t firstBeginIndex = beginIndex;

beginIndex += firstNumItems;

while (beginIndex < endIndex)
{
uint numItems = (uint)BatchingFunc(taskData, beginIndex, endIndex);
uint32_t numItems = (uint32_t)BatchingFunc(taskData, beginIndex, endIndex);

// Submit task
dispatcherData->SubmitAsyncTask(dispatcherData->taskName, dispatcherData->TaskFuncWrapped, dispatcherData->taskData, beginIndex, beginIndex + numItems);
Expand All @@ -180,22 +180,22 @@ namespace AMD
}
else
{
uint numTasks = endIndex - beginIndex;
uint32_t numTasks = endIndex - beginIndex;

#define FM_STRIDED 1
#if FM_STRIDED
// Experiment to improve ordering of tasks, however depends on task system; also should have no effect if using an atomic counter to ensure order.
beginIndex = dispatcherIndex;
uint stride = numDispatchers;
uint32_t stride = numDispatchers;
#endif

// Run one task in-line but submit rest
for (uint i = 1; i < numTasks; i++)
for (uint32_t i = 1; i < numTasks; i++)
{
#if FM_STRIDED
uint taskIndex = beginIndex + stride * i;
uint32_t taskIndex = beginIndex + stride * i;
#else
uint taskIndex = beginIndex + i;
uint32_t taskIndex = beginIndex + i;
#endif
dispatcherData->SubmitAsyncTask(dispatcherData->taskName, dispatcherData->TaskFuncWrapped, dispatcherData->taskData, taskIndex, taskIndex + 1);
}
Expand All @@ -206,7 +206,7 @@ namespace AMD
}
}

uint numIncomplete = FmAtomicDecrement(&dispatcherData->numDispatchersIncomplete.val);
uint32_t numIncomplete = dispatcherData->numDispatchersIncomplete--;

if (numIncomplete == 0)
{
Expand All @@ -222,7 +222,7 @@ namespace AMD
// This is necessary if FmParallelForAsync is not called from FmExecuteTask(), or other FmParallelForAsync() calls may take place before returning to FmExecuteTask().
void FmParallelForAsync(const char* taskName,
FmTaskFuncCallback TaskFunc, FmTaskFuncCallback TaskFuncWrapped, FmBatchingFuncCallback BatchingFunc, void* taskData, int32_t taskCount,
FmSubmitAsyncTaskCallback SubmitAsyncTask, uint numThreads, bool runLoop)
FmSubmitAsyncTaskCallback SubmitAsyncTask, uint32_t numThreads, bool runLoop)
{
(void)taskName;

Expand All @@ -237,13 +237,13 @@ namespace AMD
const int32_t numSubmitsPerThread = 16;

// Get number of dispatchers needed
int numDispatchers = FmGetNumTasks((uint)taskCount, numSubmitsPerThread);
numDispatchers = FmMinUint(numThreads * 8, numDispatchers);
int numDispatchers = FmGetNumTasks((uint32_t)taskCount, numSubmitsPerThread);
numDispatchers = MIN(numThreads * 8, numDispatchers);

FmParallelForDispatcherData* dispatcherData = new FmParallelForDispatcherData(SubmitAsyncTask, taskName, TaskFunc, TaskFuncWrapped, BatchingFunc, taskData, (uint)taskCount, numDispatchers);
FmParallelForDispatcherData* dispatcherData = new FmParallelForDispatcherData(SubmitAsyncTask, taskName, TaskFunc, TaskFuncWrapped, BatchingFunc, taskData, (uint32_t)taskCount, numDispatchers);

// Submit other dispatchers
for (uint i = 1; i < dispatcherData->numDispatchers; i++)
for (uint32_t i = 1; i < dispatcherData->numDispatchers; i++)
{
dispatcherData->SubmitAsyncTask("FEMFXParallelForDispatcher", FmTaskFuncParallelForDispatcherWrapped, dispatcherData, i, i + 1);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ THE SOFTWARE.

#include "FEMFXAsyncThreading.h"
#include "FEMFXTaskSystemInterface.h"
#include "core/engine.h"

namespace AMD {

Expand All @@ -50,47 +51,47 @@ typedef int32_t (*FmBatchingFuncCallback)(void *taskData, int32_t taskBeginIndex
// This is necessary if FmParallelForAsync is not called from FmExecuteTask(), or other FmParallelForAsync() calls may take place before returning to FmExecuteTask().
void FmParallelForAsync(const char *taskName,
FmTaskFuncCallback TaskFunc, FmTaskFuncCallback TaskFuncWrapped, FmBatchingFuncCallback BatchingFunc, void *taskData, int32_t taskCount,
FmSubmitAsyncTaskCallback SubmitAsyncTask, uint numThreads, bool runLoop = false);
FmSubmitAsyncTaskCallback SubmitAsyncTask, uint32_t numThreads, bool runLoop = false);

// Get number of tasks assuming maximum batch size per task
static _FORCE_INLINE_ uint FmGetNumTasks(uint problemSize, uint maxTaskBatchSize) {
static _FORCE_INLINE_ uint32_t FmGetNumTasks(uint32_t problemSize, uint32_t maxTaskBatchSize) {
return (problemSize + maxTaskBatchSize - 1) / maxTaskBatchSize;
}

// Get number of tasks based on desired batch size per task, but limited to at most maxTasks
static _FORCE_INLINE_ uint FmGetNumTasksLimited(uint problemSize, uint taskBatchSize, uint maxTasks) {
uint numTasks = (problemSize + taskBatchSize - 1) / taskBatchSize;
static _FORCE_INLINE_ uint32_t FmGetNumTasksLimited(uint32_t problemSize, uint32_t taskBatchSize, uint32_t maxTasks) {
uint32_t numTasks = (problemSize + taskBatchSize - 1) / taskBatchSize;
numTasks = MIN(maxTasks, numTasks);
return numTasks;
}

// Get problem index range for the specified task index, assuming FmGetNumTasks() tasks
static _FORCE_INLINE_ void FmGetIndexRange(uint *beginIndex, uint *endIndex, uint taskIndex, uint maxTaskBatchSize, uint problemSize) {
uint begin = taskIndex * maxTaskBatchSize;
uint end = begin + maxTaskBatchSize;
static _FORCE_INLINE_ void FmGetIndexRange(uint32_t *beginIndex, uint32_t *endIndex, uint32_t taskIndex, uint32_t maxTaskBatchSize, uint32_t problemSize) {
uint32_t begin = taskIndex * maxTaskBatchSize;
uint32_t end = begin + maxTaskBatchSize;
begin = MIN(begin, problemSize);
end = MIN(end, problemSize);
*beginIndex = begin;
*endIndex = end;
}

// Get number of tasks for a minimum batch size per task, assuming remainder will be evenly distributed to all tasks.
static _FORCE_INLINE_ uint FmGetNumTasksMinBatchSize(uint problemSize, uint minTaskBatchSize) {
static _FORCE_INLINE_ uint32_t FmGetNumTasksMinBatchSize(uint32_t problemSize, uint32_t minTaskBatchSize) {
return MAX(problemSize / minTaskBatchSize, 1);
}

// Get problem index range for the specified task index, assuming problem is distributed to tasks as evenly as possible.
// NOTE: output range may be zero-sized if problemSize < taskCount
static _FORCE_INLINE_ void FmGetIndexRangeEvenDistribution(uint *beginIndex, uint *endIndex, uint taskIndex, uint taskCount, uint problemSize) {
uint taskBatchSize = problemSize / taskCount;
uint remainderBatchSize = problemSize % taskCount;
static _FORCE_INLINE_ void FmGetIndexRangeEvenDistribution(uint32_t *beginIndex, uint32_t *endIndex, uint32_t taskIndex, uint32_t taskCount, uint32_t problemSize) {
uint32_t taskBatchSize = problemSize / taskCount;
uint32_t remainderBatchSize = problemSize % taskCount;

uint taskExtra = remainderBatchSize / taskCount;
uint remainder = remainderBatchSize % taskCount;
uint32_t taskExtra = remainderBatchSize / taskCount;
uint32_t remainder = remainderBatchSize % taskCount;

taskBatchSize += taskExtra;

uint begin, end;
uint32_t begin, end;
if (taskIndex < remainder) {
taskBatchSize++;
begin = taskIndex * taskBatchSize;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,20 +111,20 @@ struct FmTaskSystemCallbacks {
#endif

FmTaskSystemCallbacks() {
GetTaskSystemNumThreads = NULL;
GetTaskSystemWorkerIndex = NULL;
SubmitAsyncTask = NULL;
CreateSyncEvent = NULL;
DestroySyncEvent = NULL;
WaitForSyncEvent = NULL;
TriggerSyncEvent = NULL;
GetTaskSystemNumThreads = nullptr;
GetTaskSystemWorkerIndex = nullptr;
SubmitAsyncTask = nullptr;
CreateSyncEvent = nullptr;
DestroySyncEvent = nullptr;
WaitForSyncEvent = nullptr;
TriggerSyncEvent = nullptr;
#if !FM_ASYNC_THREADING
GetTaskSystemNumThreads = NULL;
CreateTaskWaitCounter = NULL;
WaitForTaskWaitCounter = NULL;
DestroyTaskWaitCounter = NULL;
SubmitTask = NULL;
ParallelFor = NULL;
GetTaskSystemNumThreads = nullptr;
CreateTaskWaitCounter = nullptr;
WaitForTaskWaitCounter = nullptr;
DestroyTaskWaitCounter = nullptr;
SubmitTask = nullptr;
ParallelFor = nullptr;
#endif
}

Expand Down

0 comments on commit 9c43d67

Please sign in to comment.