diff --git a/core/os/threaded_array_processor.h b/core/os/threaded_array_processor.h index 055603247b0e..a3d8bf228c85 100644 --- a/core/os/threaded_array_processor.h +++ b/core/os/threaded_array_processor.h @@ -37,10 +37,10 @@ #include "core/os/thread.h" #include "core/os/thread_safe.h" #include "core/safe_refcount.h" -#include "thirdparty/FEMFXAsync/FEMFXAsyncThreading.h" -#include "thirdparty/FEMFXAsync/FEMFXCommon.h" -#include "thirdparty/FEMFXAsync/FEMFXParallelFor.h" -#include "thirdparty/FEMFXAsync/FEMFXTaskSystemInterface.h" +#include "thirdparty/fem_fx_async/FEMFXAsyncThreading.h" +#include "thirdparty/fem_fx_async/FEMFXCommon.h" +#include "thirdparty/fem_fx_async/FEMFXParallelFor.h" +#include "thirdparty/fem_fx_async/FEMFXTaskSystemInterface.h" using namespace AMD; diff --git a/modules/fem_fx_async/SCsub b/modules/fem_fx_async/SCsub new file mode 100644 index 000000000000..ecddeee0df7a --- /dev/null +++ b/modules/fem_fx_async/SCsub @@ -0,0 +1,12 @@ +#!/usr/bin/env python + +Import('env') +Import('env_modules') + +fem_fx_async = env_modules.Clone() + +env_thirdparty = fem_fx_async.Clone() +env_thirdparty.add_source_files(env.modules_sources, Glob('#thirdparty/fem_fx_async/*.cpp')) + +# Godot's own source files +fem_fx_async.add_source_files(env.modules_sources, "*.cpp") diff --git a/modules/fem_fx_async/config.py b/modules/fem_fx_async/config.py new file mode 100644 index 000000000000..1c8cd12a2dc0 --- /dev/null +++ b/modules/fem_fx_async/config.py @@ -0,0 +1,5 @@ +def can_build(env, platform): + return True + +def configure(env): + pass diff --git a/thirdparty/FEMFXAsync/FEMFXAsyncThreading.h b/thirdparty/fem_fx_async/FEMFXAsyncThreading.h similarity index 95% rename from thirdparty/FEMFXAsync/FEMFXAsyncThreading.h rename to thirdparty/fem_fx_async/FEMFXAsyncThreading.h index d319d179a3ef..2f9904730bc4 100644 --- a/thirdparty/FEMFXAsync/FEMFXAsyncThreading.h +++ b/thirdparty/fem_fx_async/FEMFXAsyncThreading.h @@ -54,7 +54,7 @@ namespace AMD int32_t beginIndex; int32_t endIndex; - FmTask() : func(NULL), data(NULL), beginIndex(0), endIndex(0) {} + FmTask() : func(nullptr), data(nullptr), beginIndex(0), endIndex(0) {} }; // Async tasks need to be executed using this function, which starts a loop that will pick up subsequent tasks set by FmSetNextTask() @@ -77,8 +77,8 @@ namespace AMD { nextIndex.store(0); numTasksIncomplete.store(0); - followTask.func = NULL; - followTask.data = NULL; + followTask.func = nullptr; + followTask.data = nullptr; followTask.beginIndex = 0; followTask.endIndex = 1; } @@ -172,12 +172,12 @@ namespace AMD bool TaskIsFinished() { - return TaskIsFinished(NULL); + return TaskIsFinished(nullptr); } bool TasksAreFinished(uint32_t numTasks) { - return TasksAreFinished(numTasks, NULL); + return TasksAreFinished(numTasks, nullptr); } }; @@ -190,7 +190,7 @@ namespace AMD FmTask followTask; // Task to call following scope of this task data FmAsyncTaskData* parentTaskData; // Pointer to parent task data for nested parallel operations - FmAsyncTaskData() : parentTaskData(NULL) { } + FmAsyncTaskData() : parentTaskData(nullptr) { } virtual ~FmAsyncTaskData() {} }; diff --git a/thirdparty/FEMFXAsync/FEMFXCommon.h b/thirdparty/fem_fx_async/FEMFXCommon.h similarity index 100% rename from thirdparty/FEMFXAsync/FEMFXCommon.h rename to thirdparty/fem_fx_async/FEMFXCommon.h diff --git a/thirdparty/FEMFXAsync/FEMFXParallelFor.cpp b/thirdparty/fem_fx_async/FEMFXParallelFor.cpp similarity index 82% rename from thirdparty/FEMFXAsync/FEMFXParallelFor.cpp rename to thirdparty/fem_fx_async/FEMFXParallelFor.cpp index d940767f61eb..68d0db43538d 100644 --- a/thirdparty/FEMFXAsync/FEMFXParallelFor.cpp +++ b/thirdparty/fem_fx_async/FEMFXParallelFor.cpp @@ -24,6 +24,8 @@ THE SOFTWARE. #include "FEMFXParallelFor.h" #include "FEMFXAsyncThreading.h" +#include "core/engine.h" +#include namespace AMD { @@ -50,7 +52,7 @@ namespace AMD void SetNextTask(FmTaskFuncCallback inFunc, void* inData, int32_t inBeginIndex, int32_t inEndIndex) { - FM_ASSERT(func == NULL); // should have run and cleared this + ERR_FAIL_COND(func == NULL); // should have run and cleared this func = inFunc; data = inData; beginIndex = inBeginIndex; @@ -58,7 +60,7 @@ namespace AMD } }; - FM_THREAD_LOCAL_STORAGE FmTaskFuncLoopData gFEMFXTaskFuncLoopData; + thread_local FmTaskFuncLoopData gFEMFXTaskFuncLoopData; // Loop while there is a non-NULL task function set in gFEMFXTaskFuncLoopData. // Reduces use of stack with Async code. @@ -95,10 +97,8 @@ namespace AMD class FmParallelForDispatcherData { public: - FM_CLASS_NEW_DELETE(FmParallelForDispatcherData) - - FmAtomicUint dispatcherIndexAtomic; - FmAtomicUint numDispatchersIncomplete; + std::atomic_uint32_t dispatcherIndexAtomic; + std::atomic_uint32_t numDispatchersIncomplete; FmSubmitAsyncTaskCallback SubmitAsyncTask; const char* taskName; @@ -106,8 +106,8 @@ namespace AMD FmTaskFuncCallback TaskFuncWrapped; FmBatchingFuncCallback BatchingFunc; void* taskData; - uint problemSize; - uint numDispatchers; + uint32_t problemSize; + uint32_t numDispatchers; FmParallelForDispatcherData( FmSubmitAsyncTaskCallback inSubmitAsyncTask, @@ -116,8 +116,8 @@ namespace AMD FmTaskFuncCallback inTaskFuncWrapped, FmBatchingFuncCallback inBatchingFunc, void* inTaskData, - uint inProblemSize, - uint inNumDispatchers) + uint32_t inProblemSize, + uint32_t inNumDispatchers) { SubmitAsyncTask = inSubmitAsyncTask; taskName = inTaskName; @@ -127,8 +127,8 @@ namespace AMD taskData = inTaskData; problemSize = inProblemSize; numDispatchers = inNumDispatchers; - FmAtomicWrite(&dispatcherIndexAtomic.val, 0); - FmAtomicWrite(&numDispatchersIncomplete.val, numDispatchers); + dispatcherIndexAtomic.store(0); + numDispatchersIncomplete.store(numDispatchers); } }; @@ -142,14 +142,14 @@ namespace AMD FmBatchingFuncCallback BatchingFunc = dispatcherData->BatchingFunc; - uint dispatcherIndex = (uint)inTaskBeginIndex; - dispatcherIndex = FmAtomicIncrement(&dispatcherData->dispatcherIndexAtomic.val) - 1; + uint32_t dispatcherIndex = (uint32_t)inTaskBeginIndex; + dispatcherIndex = dispatcherData->dispatcherIndexAtomic++ - 1; - uint numDispatchers = dispatcherData->numDispatchers; - uint problemSize = dispatcherData->problemSize; + uint32_t numDispatchers = dispatcherData->numDispatchers; + uint32_t problemSize = dispatcherData->problemSize; // Compute range of parallel-for indices this dispatcher covers - uint beginIndex, endIndex; + uint32_t beginIndex, endIndex; FmGetIndexRangeEvenDistribution(&beginIndex, &endIndex, dispatcherIndex, numDispatchers, problemSize); if (BatchingFunc) @@ -158,14 +158,14 @@ namespace AMD void* taskData = dispatcherData->taskData; // Save first batch to run on this thread - uint firstNumItems = (uint)BatchingFunc(taskData, beginIndex, endIndex); - uint firstBeginIndex = beginIndex; + uint32_t firstNumItems = (uint32_t)BatchingFunc(taskData, beginIndex, endIndex); + uint32_t firstBeginIndex = beginIndex; beginIndex += firstNumItems; while (beginIndex < endIndex) { - uint numItems = (uint)BatchingFunc(taskData, beginIndex, endIndex); + uint32_t numItems = (uint32_t)BatchingFunc(taskData, beginIndex, endIndex); // Submit task dispatcherData->SubmitAsyncTask(dispatcherData->taskName, dispatcherData->TaskFuncWrapped, dispatcherData->taskData, beginIndex, beginIndex + numItems); @@ -180,22 +180,22 @@ namespace AMD } else { - uint numTasks = endIndex - beginIndex; + uint32_t numTasks = endIndex - beginIndex; #define FM_STRIDED 1 #if FM_STRIDED // Experiment to improve ordering of tasks, however depends on task system; also should have no effect if using an atomic counter to ensure order. beginIndex = dispatcherIndex; - uint stride = numDispatchers; + uint32_t stride = numDispatchers; #endif // Run one task in-line but submit rest - for (uint i = 1; i < numTasks; i++) + for (uint32_t i = 1; i < numTasks; i++) { #if FM_STRIDED - uint taskIndex = beginIndex + stride * i; + uint32_t taskIndex = beginIndex + stride * i; #else - uint taskIndex = beginIndex + i; + uint32_t taskIndex = beginIndex + i; #endif dispatcherData->SubmitAsyncTask(dispatcherData->taskName, dispatcherData->TaskFuncWrapped, dispatcherData->taskData, taskIndex, taskIndex + 1); } @@ -206,7 +206,7 @@ namespace AMD } } - uint numIncomplete = FmAtomicDecrement(&dispatcherData->numDispatchersIncomplete.val); + uint32_t numIncomplete = dispatcherData->numDispatchersIncomplete--; if (numIncomplete == 0) { @@ -222,7 +222,7 @@ namespace AMD // This is necessary if FmParallelForAsync is not called from FmExecuteTask(), or other FmParallelForAsync() calls may take place before returning to FmExecuteTask(). void FmParallelForAsync(const char* taskName, FmTaskFuncCallback TaskFunc, FmTaskFuncCallback TaskFuncWrapped, FmBatchingFuncCallback BatchingFunc, void* taskData, int32_t taskCount, - FmSubmitAsyncTaskCallback SubmitAsyncTask, uint numThreads, bool runLoop) + FmSubmitAsyncTaskCallback SubmitAsyncTask, uint32_t numThreads, bool runLoop) { (void)taskName; @@ -237,13 +237,13 @@ namespace AMD const int32_t numSubmitsPerThread = 16; // Get number of dispatchers needed - int numDispatchers = FmGetNumTasks((uint)taskCount, numSubmitsPerThread); - numDispatchers = FmMinUint(numThreads * 8, numDispatchers); + int numDispatchers = FmGetNumTasks((uint32_t)taskCount, numSubmitsPerThread); + numDispatchers = MIN(numThreads * 8, numDispatchers); - FmParallelForDispatcherData* dispatcherData = new FmParallelForDispatcherData(SubmitAsyncTask, taskName, TaskFunc, TaskFuncWrapped, BatchingFunc, taskData, (uint)taskCount, numDispatchers); + FmParallelForDispatcherData* dispatcherData = new FmParallelForDispatcherData(SubmitAsyncTask, taskName, TaskFunc, TaskFuncWrapped, BatchingFunc, taskData, (uint32_t)taskCount, numDispatchers); // Submit other dispatchers - for (uint i = 1; i < dispatcherData->numDispatchers; i++) + for (uint32_t i = 1; i < dispatcherData->numDispatchers; i++) { dispatcherData->SubmitAsyncTask("FEMFXParallelForDispatcher", FmTaskFuncParallelForDispatcherWrapped, dispatcherData, i, i + 1); } diff --git a/thirdparty/FEMFXAsync/FEMFXParallelFor.h b/thirdparty/fem_fx_async/FEMFXParallelFor.h similarity index 77% rename from thirdparty/FEMFXAsync/FEMFXParallelFor.h rename to thirdparty/fem_fx_async/FEMFXParallelFor.h index ffe1a88aaaf2..b25c684f0046 100644 --- a/thirdparty/FEMFXAsync/FEMFXParallelFor.h +++ b/thirdparty/fem_fx_async/FEMFXParallelFor.h @@ -33,6 +33,7 @@ THE SOFTWARE. #include "FEMFXAsyncThreading.h" #include "FEMFXTaskSystemInterface.h" +#include "core/engine.h" namespace AMD { @@ -50,24 +51,24 @@ typedef int32_t (*FmBatchingFuncCallback)(void *taskData, int32_t taskBeginIndex // This is necessary if FmParallelForAsync is not called from FmExecuteTask(), or other FmParallelForAsync() calls may take place before returning to FmExecuteTask(). void FmParallelForAsync(const char *taskName, FmTaskFuncCallback TaskFunc, FmTaskFuncCallback TaskFuncWrapped, FmBatchingFuncCallback BatchingFunc, void *taskData, int32_t taskCount, - FmSubmitAsyncTaskCallback SubmitAsyncTask, uint numThreads, bool runLoop = false); + FmSubmitAsyncTaskCallback SubmitAsyncTask, uint32_t numThreads, bool runLoop = false); // Get number of tasks assuming maximum batch size per task -static _FORCE_INLINE_ uint FmGetNumTasks(uint problemSize, uint maxTaskBatchSize) { +static _FORCE_INLINE_ uint32_t FmGetNumTasks(uint32_t problemSize, uint32_t maxTaskBatchSize) { return (problemSize + maxTaskBatchSize - 1) / maxTaskBatchSize; } // Get number of tasks based on desired batch size per task, but limited to at most maxTasks -static _FORCE_INLINE_ uint FmGetNumTasksLimited(uint problemSize, uint taskBatchSize, uint maxTasks) { - uint numTasks = (problemSize + taskBatchSize - 1) / taskBatchSize; +static _FORCE_INLINE_ uint32_t FmGetNumTasksLimited(uint32_t problemSize, uint32_t taskBatchSize, uint32_t maxTasks) { + uint32_t numTasks = (problemSize + taskBatchSize - 1) / taskBatchSize; numTasks = MIN(maxTasks, numTasks); return numTasks; } // Get problem index range for the specified task index, assuming FmGetNumTasks() tasks -static _FORCE_INLINE_ void FmGetIndexRange(uint *beginIndex, uint *endIndex, uint taskIndex, uint maxTaskBatchSize, uint problemSize) { - uint begin = taskIndex * maxTaskBatchSize; - uint end = begin + maxTaskBatchSize; +static _FORCE_INLINE_ void FmGetIndexRange(uint32_t *beginIndex, uint32_t *endIndex, uint32_t taskIndex, uint32_t maxTaskBatchSize, uint32_t problemSize) { + uint32_t begin = taskIndex * maxTaskBatchSize; + uint32_t end = begin + maxTaskBatchSize; begin = MIN(begin, problemSize); end = MIN(end, problemSize); *beginIndex = begin; @@ -75,22 +76,22 @@ static _FORCE_INLINE_ void FmGetIndexRange(uint *beginIndex, uint *endIndex, uin } // Get number of tasks for a minimum batch size per task, assuming remainder will be evenly distributed to all tasks. -static _FORCE_INLINE_ uint FmGetNumTasksMinBatchSize(uint problemSize, uint minTaskBatchSize) { +static _FORCE_INLINE_ uint32_t FmGetNumTasksMinBatchSize(uint32_t problemSize, uint32_t minTaskBatchSize) { return MAX(problemSize / minTaskBatchSize, 1); } // Get problem index range for the specified task index, assuming problem is distributed to tasks as evenly as possible. // NOTE: output range may be zero-sized if problemSize < taskCount -static _FORCE_INLINE_ void FmGetIndexRangeEvenDistribution(uint *beginIndex, uint *endIndex, uint taskIndex, uint taskCount, uint problemSize) { - uint taskBatchSize = problemSize / taskCount; - uint remainderBatchSize = problemSize % taskCount; +static _FORCE_INLINE_ void FmGetIndexRangeEvenDistribution(uint32_t *beginIndex, uint32_t *endIndex, uint32_t taskIndex, uint32_t taskCount, uint32_t problemSize) { + uint32_t taskBatchSize = problemSize / taskCount; + uint32_t remainderBatchSize = problemSize % taskCount; - uint taskExtra = remainderBatchSize / taskCount; - uint remainder = remainderBatchSize % taskCount; + uint32_t taskExtra = remainderBatchSize / taskCount; + uint32_t remainder = remainderBatchSize % taskCount; taskBatchSize += taskExtra; - uint begin, end; + uint32_t begin, end; if (taskIndex < remainder) { taskBatchSize++; begin = taskIndex * taskBatchSize; diff --git a/thirdparty/FEMFXAsync/FEMFXTaskGraph.h b/thirdparty/fem_fx_async/FEMFXTaskGraph.h similarity index 100% rename from thirdparty/FEMFXAsync/FEMFXTaskGraph.h rename to thirdparty/fem_fx_async/FEMFXTaskGraph.h diff --git a/thirdparty/FEMFXAsync/FEMFXTaskSystemInterface.h b/thirdparty/fem_fx_async/FEMFXTaskSystemInterface.h similarity index 94% rename from thirdparty/FEMFXAsync/FEMFXTaskSystemInterface.h rename to thirdparty/fem_fx_async/FEMFXTaskSystemInterface.h index 2e74f596d7df..0ec6ca209ee1 100644 --- a/thirdparty/FEMFXAsync/FEMFXTaskSystemInterface.h +++ b/thirdparty/fem_fx_async/FEMFXTaskSystemInterface.h @@ -111,20 +111,20 @@ struct FmTaskSystemCallbacks { #endif FmTaskSystemCallbacks() { - GetTaskSystemNumThreads = NULL; - GetTaskSystemWorkerIndex = NULL; - SubmitAsyncTask = NULL; - CreateSyncEvent = NULL; - DestroySyncEvent = NULL; - WaitForSyncEvent = NULL; - TriggerSyncEvent = NULL; + GetTaskSystemNumThreads = nullptr; + GetTaskSystemWorkerIndex = nullptr; + SubmitAsyncTask = nullptr; + CreateSyncEvent = nullptr; + DestroySyncEvent = nullptr; + WaitForSyncEvent = nullptr; + TriggerSyncEvent = nullptr; #if !FM_ASYNC_THREADING - GetTaskSystemNumThreads = NULL; - CreateTaskWaitCounter = NULL; - WaitForTaskWaitCounter = NULL; - DestroyTaskWaitCounter = NULL; - SubmitTask = NULL; - ParallelFor = NULL; + GetTaskSystemNumThreads = nullptr; + CreateTaskWaitCounter = nullptr; + WaitForTaskWaitCounter = nullptr; + DestroyTaskWaitCounter = nullptr; + SubmitTask = nullptr; + ParallelFor = nullptr; #endif }