Compile #2.

fire · Feb 23, 2020 · 9c43d67 · 9c43d67
1 parent 88a7c48
commit 9c43d67
Show file tree

Hide file tree

Showing 9 changed files with 86 additions and 68 deletions.
diff --git a/core/os/threaded_array_processor.h b/core/os/threaded_array_processor.h
@@ -37,10 +37,10 @@
 #include "core/os/thread.h"
 #include "core/os/thread_safe.h"
 #include "core/safe_refcount.h"
-#include "thirdparty/FEMFXAsync/FEMFXAsyncThreading.h"
-#include "thirdparty/FEMFXAsync/FEMFXCommon.h"
-#include "thirdparty/FEMFXAsync/FEMFXParallelFor.h"
-#include "thirdparty/FEMFXAsync/FEMFXTaskSystemInterface.h"
+#include "thirdparty/fem_fx_async/FEMFXAsyncThreading.h"
+#include "thirdparty/fem_fx_async/FEMFXCommon.h"
+#include "thirdparty/fem_fx_async/FEMFXParallelFor.h"
+#include "thirdparty/fem_fx_async/FEMFXTaskSystemInterface.h"
 
 using namespace AMD;
 

diff --git a/modules/fem_fx_async/SCsub b/modules/fem_fx_async/SCsub
@@ -0,0 +1,12 @@
+#!/usr/bin/env python
+
+Import('env')
+Import('env_modules')
+
+fem_fx_async = env_modules.Clone()
+
+env_thirdparty = fem_fx_async.Clone()
+env_thirdparty.add_source_files(env.modules_sources, Glob('#thirdparty/fem_fx_async/*.cpp'))
+
+# Godot's own source files
+fem_fx_async.add_source_files(env.modules_sources, "*.cpp")
diff --git a/modules/fem_fx_async/config.py b/modules/fem_fx_async/config.py
@@ -0,0 +1,5 @@
+def can_build(env, platform):
+    return True
+
+def configure(env):
+    pass
diff --git a/thirdparty/FEMFXAsync/FEMFXAsyncThreading.h → ...dparty/fem_fx_async/FEMFXAsyncThreading.h b/thirdparty/FEMFXAsync/FEMFXAsyncThreading.h → ...dparty/fem_fx_async/FEMFXAsyncThreading.h
@@ -54,7 +54,7 @@ namespace AMD
         int32_t            beginIndex;
         int32_t            endIndex;
 
-        FmTask() : func(NULL), data(NULL), beginIndex(0), endIndex(0) {}
+        FmTask() : func(nullptr), data(nullptr), beginIndex(0), endIndex(0) {}
     };
 
     // Async tasks need to be executed using this function, which starts a loop that will pick up subsequent tasks set by FmSetNextTask()
@@ -77,8 +77,8 @@ namespace AMD
         {
             nextIndex.store(0);
             numTasksIncomplete.store(0);
-            followTask.func = NULL;
-            followTask.data = NULL;
+            followTask.func = nullptr;
+            followTask.data = nullptr;
             followTask.beginIndex = 0;
             followTask.endIndex = 1;
         }
@@ -172,12 +172,12 @@ namespace AMD
 
         bool TaskIsFinished()
         {
-            return TaskIsFinished<void>(NULL);
+            return TaskIsFinished<void>(nullptr);
         }
 
         bool TasksAreFinished(uint32_t numTasks)
         {
-            return TasksAreFinished<void>(numTasks, NULL);
+            return TasksAreFinished<void>(numTasks, nullptr);
         }
     };
 
@@ -190,7 +190,7 @@ namespace AMD
         FmTask               followTask;      // Task to call following scope of this task data
         FmAsyncTaskData*     parentTaskData;  // Pointer to parent task data for nested parallel operations
 
-        FmAsyncTaskData() : parentTaskData(NULL) { }
+        FmAsyncTaskData() : parentTaskData(nullptr) { }
 
         virtual ~FmAsyncTaskData() {}
     };

diff --git a/thirdparty/FEMFXAsync/FEMFXCommon.h → thirdparty/fem_fx_async/FEMFXCommon.h b/thirdparty/FEMFXAsync/FEMFXCommon.h → thirdparty/fem_fx_async/FEMFXCommon.h
diff --git a/thirdparty/FEMFXAsync/FEMFXParallelFor.cpp → thirdparty/fem_fx_async/FEMFXParallelFor.cpp b/thirdparty/FEMFXAsync/FEMFXParallelFor.cpp → thirdparty/fem_fx_async/FEMFXParallelFor.cpp
@@ -24,6 +24,8 @@ THE SOFTWARE.
 
 #include "FEMFXParallelFor.h"
 #include "FEMFXAsyncThreading.h"
+#include "core/engine.h"
+#include <atomic>
 
 namespace AMD
 {
@@ -50,15 +52,15 @@ namespace AMD
 
         void SetNextTask(FmTaskFuncCallback inFunc, void* inData, int32_t inBeginIndex, int32_t inEndIndex)
         {
-            FM_ASSERT(func == NULL); // should have run and cleared this
+            ERR_FAIL_COND(func == NULL); // should have run and cleared this
             func = inFunc;
             data = inData;
             beginIndex = inBeginIndex;
             endIndex = inEndIndex;
         }
     };
 
-    FM_THREAD_LOCAL_STORAGE FmTaskFuncLoopData gFEMFXTaskFuncLoopData;
+    thread_local FmTaskFuncLoopData gFEMFXTaskFuncLoopData;
 
     // Loop while there is a non-NULL task function set in gFEMFXTaskFuncLoopData.
     // Reduces use of stack with Async code.
@@ -95,19 +97,17 @@ namespace AMD
     class FmParallelForDispatcherData
     {
     public:
-        FM_CLASS_NEW_DELETE(FmParallelForDispatcherData)
-
-        FmAtomicUint dispatcherIndexAtomic;
-        FmAtomicUint numDispatchersIncomplete;
+        std::atomic_uint32_t dispatcherIndexAtomic;
+        std::atomic_uint32_t numDispatchersIncomplete;
 
         FmSubmitAsyncTaskCallback SubmitAsyncTask;
         const char* taskName;
         FmTaskFuncCallback TaskFunc;
         FmTaskFuncCallback TaskFuncWrapped;
         FmBatchingFuncCallback BatchingFunc;
         void* taskData;
-        uint problemSize;
-        uint numDispatchers;
+        uint32_t problemSize;
+        uint32_t numDispatchers;
 
         FmParallelForDispatcherData(
             FmSubmitAsyncTaskCallback inSubmitAsyncTask,
@@ -116,8 +116,8 @@ namespace AMD
             FmTaskFuncCallback inTaskFuncWrapped,
             FmBatchingFuncCallback inBatchingFunc,
             void* inTaskData,
-            uint inProblemSize,
-            uint inNumDispatchers)
+            uint32_t inProblemSize,
+            uint32_t inNumDispatchers)
         {
             SubmitAsyncTask = inSubmitAsyncTask;
             taskName = inTaskName;
@@ -127,8 +127,8 @@ namespace AMD
             taskData = inTaskData;
             problemSize = inProblemSize;
             numDispatchers = inNumDispatchers;
-            FmAtomicWrite(&dispatcherIndexAtomic.val, 0);
-            FmAtomicWrite(&numDispatchersIncomplete.val, numDispatchers);
+            dispatcherIndexAtomic.store(0);
+            numDispatchersIncomplete.store(numDispatchers);
         }
     };
 
@@ -142,14 +142,14 @@ namespace AMD
 
         FmBatchingFuncCallback BatchingFunc = dispatcherData->BatchingFunc;
 
-        uint dispatcherIndex = (uint)inTaskBeginIndex;
-        dispatcherIndex = FmAtomicIncrement(&dispatcherData->dispatcherIndexAtomic.val) - 1;
+        uint32_t dispatcherIndex = (uint32_t)inTaskBeginIndex;
+        dispatcherIndex = dispatcherData->dispatcherIndexAtomic++ - 1;
 
-        uint numDispatchers = dispatcherData->numDispatchers;
-        uint problemSize = dispatcherData->problemSize;
+        uint32_t numDispatchers = dispatcherData->numDispatchers;
+        uint32_t problemSize = dispatcherData->problemSize;
 
         // Compute range of parallel-for indices this dispatcher covers
-        uint beginIndex, endIndex;
+        uint32_t beginIndex, endIndex;
         FmGetIndexRangeEvenDistribution(&beginIndex, &endIndex, dispatcherIndex, numDispatchers, problemSize);
 
         if (BatchingFunc)
@@ -158,14 +158,14 @@ namespace AMD
             void* taskData = dispatcherData->taskData;
 
             // Save first batch to run on this thread
-            uint firstNumItems = (uint)BatchingFunc(taskData, beginIndex, endIndex);
-            uint firstBeginIndex = beginIndex;
+            uint32_t firstNumItems = (uint32_t)BatchingFunc(taskData, beginIndex, endIndex);
+            uint32_t firstBeginIndex = beginIndex;
 
             beginIndex += firstNumItems;
 
             while (beginIndex < endIndex)
             {
-                uint numItems = (uint)BatchingFunc(taskData, beginIndex, endIndex);
+                uint32_t numItems = (uint32_t)BatchingFunc(taskData, beginIndex, endIndex);
 
                 // Submit task
                 dispatcherData->SubmitAsyncTask(dispatcherData->taskName, dispatcherData->TaskFuncWrapped, dispatcherData->taskData, beginIndex, beginIndex + numItems);
@@ -180,22 +180,22 @@ namespace AMD
         }
         else
         {
-            uint numTasks = endIndex - beginIndex;
+            uint32_t numTasks = endIndex - beginIndex;
 
 #define FM_STRIDED 1
 #if FM_STRIDED
             // Experiment to improve ordering of tasks, however depends on task system; also should have no effect if using an atomic counter to ensure order.
             beginIndex = dispatcherIndex;
-            uint stride = numDispatchers;
+            uint32_t stride = numDispatchers;
 #endif
 
             // Run one task in-line but submit rest
-            for (uint i = 1; i < numTasks; i++)
+            for (uint32_t i = 1; i < numTasks; i++)
             {
 #if FM_STRIDED
-                uint taskIndex = beginIndex + stride * i;
+                uint32_t taskIndex = beginIndex + stride * i;
 #else
-                uint taskIndex = beginIndex + i;
+                uint32_t taskIndex = beginIndex + i;
 #endif
                 dispatcherData->SubmitAsyncTask(dispatcherData->taskName, dispatcherData->TaskFuncWrapped, dispatcherData->taskData, taskIndex, taskIndex + 1);
             }
@@ -206,7 +206,7 @@ namespace AMD
             }
         }
 
-        uint numIncomplete = FmAtomicDecrement(&dispatcherData->numDispatchersIncomplete.val);
+        uint32_t numIncomplete = dispatcherData->numDispatchersIncomplete--;
 
         if (numIncomplete == 0)
         {
@@ -222,7 +222,7 @@ namespace AMD
     // This is necessary if FmParallelForAsync is not called from FmExecuteTask(), or other FmParallelForAsync() calls may take place before returning to FmExecuteTask().
     void FmParallelForAsync(const char* taskName, 
         FmTaskFuncCallback TaskFunc, FmTaskFuncCallback TaskFuncWrapped, FmBatchingFuncCallback BatchingFunc, void* taskData, int32_t taskCount, 
-        FmSubmitAsyncTaskCallback SubmitAsyncTask, uint numThreads, bool runLoop)
+        FmSubmitAsyncTaskCallback SubmitAsyncTask, uint32_t numThreads, bool runLoop)
     {
         (void)taskName;
 
@@ -237,13 +237,13 @@ namespace AMD
         const int32_t numSubmitsPerThread = 16;
 
         // Get number of dispatchers needed
-        int numDispatchers = FmGetNumTasks((uint)taskCount, numSubmitsPerThread);
-        numDispatchers = FmMinUint(numThreads * 8, numDispatchers);
+        int numDispatchers = FmGetNumTasks((uint32_t)taskCount, numSubmitsPerThread);
+        numDispatchers = MIN(numThreads * 8, numDispatchers);
 
-        FmParallelForDispatcherData* dispatcherData = new FmParallelForDispatcherData(SubmitAsyncTask, taskName, TaskFunc, TaskFuncWrapped, BatchingFunc, taskData, (uint)taskCount, numDispatchers);
+        FmParallelForDispatcherData* dispatcherData = new FmParallelForDispatcherData(SubmitAsyncTask, taskName, TaskFunc, TaskFuncWrapped, BatchingFunc, taskData, (uint32_t)taskCount, numDispatchers);
 
         // Submit other dispatchers
-        for (uint i = 1; i < dispatcherData->numDispatchers; i++)
+        for (uint32_t i = 1; i < dispatcherData->numDispatchers; i++)
         {
             dispatcherData->SubmitAsyncTask("FEMFXParallelForDispatcher", FmTaskFuncParallelForDispatcherWrapped, dispatcherData, i, i + 1);
         }

diff --git a/thirdparty/FEMFXAsync/FEMFXParallelFor.h → thirdparty/fem_fx_async/FEMFXParallelFor.h b/thirdparty/FEMFXAsync/FEMFXParallelFor.h → thirdparty/fem_fx_async/FEMFXParallelFor.h
@@ -33,6 +33,7 @@ THE SOFTWARE.
 
 #include "FEMFXAsyncThreading.h"
 #include "FEMFXTaskSystemInterface.h"
+#include "core/engine.h"
 
 namespace AMD {
 
@@ -50,47 +51,47 @@ typedef int32_t (*FmBatchingFuncCallback)(void *taskData, int32_t taskBeginIndex
 // This is necessary if FmParallelForAsync is not called from FmExecuteTask(), or other FmParallelForAsync() calls may take place before returning to FmExecuteTask().
 void FmParallelForAsync(const char *taskName,
 		FmTaskFuncCallback TaskFunc, FmTaskFuncCallback TaskFuncWrapped, FmBatchingFuncCallback BatchingFunc, void *taskData, int32_t taskCount,
-		FmSubmitAsyncTaskCallback SubmitAsyncTask, uint numThreads, bool runLoop = false);
+		FmSubmitAsyncTaskCallback SubmitAsyncTask, uint32_t numThreads, bool runLoop = false);
 
 // Get number of tasks assuming maximum batch size per task
-static _FORCE_INLINE_ uint FmGetNumTasks(uint problemSize, uint maxTaskBatchSize) {
+static _FORCE_INLINE_ uint32_t FmGetNumTasks(uint32_t problemSize, uint32_t maxTaskBatchSize) {
 	return (problemSize + maxTaskBatchSize - 1) / maxTaskBatchSize;
 }
 
 // Get number of tasks based on desired batch size per task, but limited to at most maxTasks
-static _FORCE_INLINE_ uint FmGetNumTasksLimited(uint problemSize, uint taskBatchSize, uint maxTasks) {
-	uint numTasks = (problemSize + taskBatchSize - 1) / taskBatchSize;
+static _FORCE_INLINE_ uint32_t FmGetNumTasksLimited(uint32_t problemSize, uint32_t taskBatchSize, uint32_t maxTasks) {
+	uint32_t numTasks = (problemSize + taskBatchSize - 1) / taskBatchSize;
 	numTasks = MIN(maxTasks, numTasks);
 	return numTasks;
 }
 
 // Get problem index range for the specified task index, assuming FmGetNumTasks() tasks
-static _FORCE_INLINE_ void FmGetIndexRange(uint *beginIndex, uint *endIndex, uint taskIndex, uint maxTaskBatchSize, uint problemSize) {
-	uint begin = taskIndex * maxTaskBatchSize;
-	uint end = begin + maxTaskBatchSize;
+static _FORCE_INLINE_ void FmGetIndexRange(uint32_t *beginIndex, uint32_t *endIndex, uint32_t taskIndex, uint32_t maxTaskBatchSize, uint32_t problemSize) {
+	uint32_t begin = taskIndex * maxTaskBatchSize;
+	uint32_t end = begin + maxTaskBatchSize;
 	begin = MIN(begin, problemSize);
 	end = MIN(end, problemSize);
 	*beginIndex = begin;
 	*endIndex = end;
 }
 
 // Get number of tasks for a minimum batch size per task, assuming remainder will be evenly distributed to all tasks.
-static _FORCE_INLINE_ uint FmGetNumTasksMinBatchSize(uint problemSize, uint minTaskBatchSize) {
+static _FORCE_INLINE_ uint32_t FmGetNumTasksMinBatchSize(uint32_t problemSize, uint32_t minTaskBatchSize) {
 	return MAX(problemSize / minTaskBatchSize, 1);
 }
 
 // Get problem index range for the specified task index, assuming problem is distributed to tasks as evenly as possible.
 // NOTE: output range may be zero-sized if problemSize < taskCount
-static _FORCE_INLINE_ void FmGetIndexRangeEvenDistribution(uint *beginIndex, uint *endIndex, uint taskIndex, uint taskCount, uint problemSize) {
-	uint taskBatchSize = problemSize / taskCount;
-	uint remainderBatchSize = problemSize % taskCount;
+static _FORCE_INLINE_ void FmGetIndexRangeEvenDistribution(uint32_t *beginIndex, uint32_t *endIndex, uint32_t taskIndex, uint32_t taskCount, uint32_t problemSize) {
+	uint32_t taskBatchSize = problemSize / taskCount;
+	uint32_t remainderBatchSize = problemSize % taskCount;
 
-	uint taskExtra = remainderBatchSize / taskCount;
-	uint remainder = remainderBatchSize % taskCount;
+	uint32_t taskExtra = remainderBatchSize / taskCount;
+	uint32_t remainder = remainderBatchSize % taskCount;
 
 	taskBatchSize += taskExtra;
 
-	uint begin, end;
+	uint32_t begin, end;
 	if (taskIndex < remainder) {
 		taskBatchSize++;
 		begin = taskIndex * taskBatchSize;

diff --git a/thirdparty/FEMFXAsync/FEMFXTaskGraph.h → thirdparty/fem_fx_async/FEMFXTaskGraph.h b/thirdparty/FEMFXAsync/FEMFXTaskGraph.h → thirdparty/fem_fx_async/FEMFXTaskGraph.h
diff --git a/...rty/FEMFXAsync/FEMFXTaskSystemInterface.h → ...y/fem_fx_async/FEMFXTaskSystemInterface.h b/...rty/FEMFXAsync/FEMFXTaskSystemInterface.h → ...y/fem_fx_async/FEMFXTaskSystemInterface.h
@@ -111,20 +111,20 @@ struct FmTaskSystemCallbacks {
 #endif
 
 	FmTaskSystemCallbacks() {
-		GetTaskSystemNumThreads = NULL;
-		GetTaskSystemWorkerIndex = NULL;
-		SubmitAsyncTask = NULL;
-		CreateSyncEvent = NULL;
-		DestroySyncEvent = NULL;
-		WaitForSyncEvent = NULL;
-		TriggerSyncEvent = NULL;
+		GetTaskSystemNumThreads = nullptr;
+		GetTaskSystemWorkerIndex = nullptr;
+		SubmitAsyncTask = nullptr;
+		CreateSyncEvent = nullptr;
+		DestroySyncEvent = nullptr;
+		WaitForSyncEvent = nullptr;
+		TriggerSyncEvent = nullptr;
 #if !FM_ASYNC_THREADING
-		GetTaskSystemNumThreads = NULL;
-		CreateTaskWaitCounter = NULL;
-		WaitForTaskWaitCounter = NULL;
-		DestroyTaskWaitCounter = NULL;
-		SubmitTask = NULL;
-		ParallelFor = NULL;
+		GetTaskSystemNumThreads = nullptr;
+		CreateTaskWaitCounter = nullptr;
+		WaitForTaskWaitCounter = nullptr;
+		DestroyTaskWaitCounter = nullptr;
+		SubmitTask = nullptr;
+		ParallelFor = nullptr;
 #endif
 	}