From 3b442f3975209b7c89d1741f5f66a8f066ccf414 Mon Sep 17 00:00:00 2001 From: neworderofjamie Date: Tue, 3 Aug 2021 13:30:11 +0100 Subject: [PATCH 01/17] basic infrastructure for creating NCCL communicator --- include/genn/backends/cuda/backend.h | 3 ++ src/genn/backends/cuda/backend.cc | 53 +++++++++++++++++++++++----- 2 files changed, 48 insertions(+), 8 deletions(-) diff --git a/include/genn/backends/cuda/backend.h b/include/genn/backends/cuda/backend.h index f38814169e..b070046d8e 100644 --- a/include/genn/backends/cuda/backend.h +++ b/include/genn/backends/cuda/backend.h @@ -76,6 +76,9 @@ struct Preferences : public PreferencesBase //! it was optimized for. However if, for example, you are running on a cluser with NVML this is not desired behaviour. bool selectGPUByDeviceID = false; + //! Generate corresponding NCCL batch reductions + bool enableNCCLReductions = false; + //! How to select GPU device DeviceSelect deviceSelectMethod = DeviceSelect::OPTIMAL; diff --git a/src/genn/backends/cuda/backend.cc b/src/genn/backends/cuda/backend.cc index 065dbfb6ef..363bd3f463 100644 --- a/src/genn/backends/cuda/backend.cc +++ b/src/genn/backends/cuda/backend.cc @@ -968,6 +968,25 @@ void Backend::genDefinitionsInternalPreamble(CodeStream &os, const ModelSpecMerg if(getRuntimeVersion() >= 9000) { os <<"#include " << std::endl; } + + // If NCCL is enabled + if(getPreferences().enableNCCLReductions) { + // Include NCCL header + os << "#include " << std::endl; + os << std::endl; + // Define NCCL ID and communicator + os << "EXPORT_VAR ncclUniqueId ncclID;" << std::endl; + os << "EXPORT_VAR ncclComm_t ncclCommunicator;" << std::endl; + os << std::endl; + os << "// ------------------------------------------------------------------------" << std::endl; + os << "// Helper macro for error-checking NCCL calls" << std::endl; + os << "#define CHECK_NCCL_ERRORS(call) {\\" << std::endl; + os << " ncclResult_t error = call;\\" << std::endl; + os << " if (error != ncclSuccess) {\\" << std::endl; + os << " throw std::runtime_error(__FILE__\": \" + std::to_string(__LINE__) + \": nccl error \" + std::to_string(error) + \": \" + ncclGetErrorString(error));\\" << std::endl; + os << " }\\" << std::endl; + } + os << std::endl; os << "// ------------------------------------------------------------------------" << std::endl; os << "// Helper macro for error-checking CUDA calls" << std::endl; @@ -1170,10 +1189,13 @@ void Backend::genRunnerPreamble(CodeStream &os, const ModelSpecMerged&, const Me // **YUCK** on Windows, disable "function assumed not to throw an exception but does" warning // Setting /Ehs SHOULD solve this but CUDA rules don't give this option and it's not clear it gets through to the compiler anyway os << "#pragma warning(disable: 4297)" << std::endl; -#else - // Prevent unused parameter warning - (void)os; #endif + + // If NCCL is enabled, declare NCCL ID and communicator + if(getPreferences().enableNCCLReductions) { + os << "ncclUniqueId ncclID;" << std::endl; + os << "ncclComm_t ncclCommunicator;" << std::endl; + } } //-------------------------------------------------------------------------- void Backend::genAllocateMemPreamble(CodeStream &os, const ModelSpecMerged &modelMerged, const MemAlloc&) const @@ -1220,6 +1242,11 @@ void Backend::genAllocateMemPreamble(CodeStream &os, const ModelSpecMerged &mode os << "CHECK_CUDA_ERRORS(cudaSetDevice(deviceID));" << std::endl; } + // Initialise NCCL communicator + if(getPreferences().enableNCCLReductions) { + os << 
"CHECK_NCCL_ERRORS(ncclCommInitRank(&ncclCommunicator, numRanks, ncclID, rank));" << std::endl; + } + os << std::endl; } //-------------------------------------------------------------------------- @@ -1697,20 +1724,30 @@ void Backend::genMSBuildImportTarget(std::ostream &os) const std::string Backend::getAllocateMemParams(const ModelSpecMerged &) const { // If device should be selected at runtime + std::string params; if(getPreferences().deviceSelectMethod == DeviceSelect::MANUAL_RUNTIME) { // If devices should be delected by ID, add an integer parameter if(getPreferences().selectGPUByDeviceID) { - return "int deviceID"; + params += "int deviceID"; } // Otherwise, add a pci bus ID parameter else { - return "const char *pciBusID"; + params += "const char *pciBusID"; } } - // Othewise, no parameters are required - else { - return ""; + + // If NCCL reductions are enabled + if(getPreferences().enableNCCLReductions) { + // If there are existing parameters, add comma + if(!params.empty()) { + params += ", "; + } + + // Add num ranks and rank parameter + params += "int numRanks, int rank"; } + + return params; } //-------------------------------------------------------------------------- Backend::MemorySpaces Backend::getMergedGroupMemorySpaces(const ModelSpecMerged &modelMerged) const From 29525f35a48b013aa989023a5eaee9c43558fc26 Mon Sep 17 00:00:00 2001 From: neworderofjamie Date: Tue, 3 Aug 2021 13:38:25 +0100 Subject: [PATCH 02/17] export wrapper function around ncclGetUniqueId --- src/genn/backends/cuda/backend.cc | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/src/genn/backends/cuda/backend.cc b/src/genn/backends/cuda/backend.cc index 363bd3f463..dc45f7bb26 100644 --- a/src/genn/backends/cuda/backend.cc +++ b/src/genn/backends/cuda/backend.cc @@ -959,6 +959,11 @@ void Backend::genDefinitionsPreamble(CodeStream &os, const ModelSpecMerged &) co os << "// Standard C includes" << std::endl; os << "#include " << std::endl; os << "#include " << std::endl; + + // If NCCL is enabled, export ncclGetUniqueId function + if(getPreferences().enableNCCLReductions) { + os << "extern \"C\" EXPORT_FUNC void ncclGetUniqueId();" << std::endl; + } } //-------------------------------------------------------------------------- void Backend::genDefinitionsInternalPreamble(CodeStream &os, const ModelSpecMerged &) const @@ -1191,10 +1196,20 @@ void Backend::genRunnerPreamble(CodeStream &os, const ModelSpecMerged&, const Me os << "#pragma warning(disable: 4297)" << std::endl; #endif - // If NCCL is enabled, declare NCCL ID and communicator + // If NCCL is enabled if(getPreferences().enableNCCLReductions) { + // Define NCCL ID and communicator os << "ncclUniqueId ncclID;" << std::endl; os << "ncclComm_t ncclCommunicator;" << std::endl; + + // Define wrapper around ncclGetUniqueId function + os << std::endl; + os << "void ncclGetUniqueId()"; + { + CodeStream::Scope b(os); + os << "CHECK_NCCL_ERRORS(ncclGetUniqueId(&ncclID));" << std::endl; + } + os << std::endl; } } //-------------------------------------------------------------------------- From 245721df7c2d75cf1c0444f090155d9e15c46057 Mon Sep 17 00:00:00 2001 From: neworderofjamie Date: Tue, 3 Aug 2021 13:54:49 +0100 Subject: [PATCH 03/17] linker flags for NCCL and error on Windows --- src/genn/backends/cuda/backend.cc | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/genn/backends/cuda/backend.cc b/src/genn/backends/cuda/backend.cc index dc45f7bb26..404cf60c4c 100644 --- 
a/src/genn/backends/cuda/backend.cc +++ b/src/genn/backends/cuda/backend.cc @@ -222,6 +222,14 @@ Backend::Backend(const KernelBlockSize &kernelBlockSizes, const Preferences &pre LOGW << "Using automatic copy on pre-Pascal devices is supported but likely to be very slow - we recommend copying manually on these devices"; } +#ifdef _WIN32 + // If we're on Windows and NCCL is enabled, give error + // **NOTE** There are several NCCL Windows ports e.g. https://github.com/MyCaffe/NCCL but we don't have access to any suitable systems to test + if(getPreferences().enableNCCLReductions) { + throw std::runtime_error("GeNN doesn't currently support NCCL on Windows"); + } +#endif + // Add CUDA-specific types, additionally marking them as 'device types' innaccesible to host code addDeviceType("curandState", 44); addDeviceType("curandStatePhilox4_32_10_t", 64); @@ -1643,7 +1651,11 @@ void Backend::genMakefilePreamble(std::ostream &os) const { const std::string architecture = "sm_" + std::to_string(getChosenCUDADevice().major) + std::to_string(getChosenCUDADevice().minor); std::string linkFlags = "--shared -arch " + architecture; - + + // If NCCL reductions are enabled, link NCCL + if(getPreferences().enableNCCLReductions) { + linkFlags += " -lnccl"; + } // Write variables to preamble os << "CUDA_PATH ?=/usr/local/cuda" << std::endl; os << "NVCC := $(CUDA_PATH)/bin/nvcc" << std::endl; From 40798e66d564842c0f34da3826d7cc1e08a2193c Mon Sep 17 00:00:00 2001 From: neworderofjamie Date: Tue, 3 Aug 2021 14:41:46 +0100 Subject: [PATCH 04/17] code generation of merged host groups for host reductions --- include/genn/backends/cuda/backend.h | 4 + include/genn/backends/opencl/backend.h | 3 + .../backends/single_threaded_cpu/backend.h | 3 + .../genn/genn/code_generator/backendBase.h | 3 + .../genn/genn/code_generator/groupMerged.h | 80 +++++++++++++++++++ .../genn/code_generator/modelSpecMerged.h | 16 +++- src/genn/backends/cuda/backend.cc | 6 +- src/genn/backends/opencl/backend.cc | 2 +- .../backends/single_threaded_cpu/backend.cc | 2 +- .../genn/code_generator/generateRunner.cc | 12 +++ src/genn/genn/code_generator/groupMerged.cc | 38 +++++++++ .../genn/code_generator/modelSpecMerged.cc | 11 +++ .../features/custom_update_reduction/model.cc | 3 + 13 files changed, 177 insertions(+), 6 deletions(-) diff --git a/include/genn/backends/cuda/backend.h b/include/genn/backends/cuda/backend.h index b070046d8e..8e1f20c5d7 100644 --- a/include/genn/backends/cuda/backend.h +++ b/include/genn/backends/cuda/backend.h @@ -113,6 +113,7 @@ struct Preferences : public PreferencesBase Utils::updateHash(selectGPUByDeviceID, hash); Utils::updateHash(deviceSelectMethod, hash); Utils::updateHash(constantCacheOverhead, hash); + Utils::updateHash(enableNCCLReductions, hash); } }; @@ -272,6 +273,9 @@ class BACKEND_EXPORT Backend : public BackendSIMT //! Different backends seed RNGs in different ways. Does this one initialise population RNGS on device? virtual bool isPopulationRNGInitialisedOnDevice() const override { return true; } + //! Backends which support batch-parallelism might require an additional host reduction phase after reduction kernels + virtual bool isHostReductionRequired() const override { return getPreferences().enableNCCLReductions; } + //! 
How many bytes of memory does 'device' have virtual size_t getDeviceMemoryBytes() const override{ return m_ChosenDevice.totalGlobalMem; } diff --git a/include/genn/backends/opencl/backend.h b/include/genn/backends/opencl/backend.h index 7908bbcecc..eb6640f417 100644 --- a/include/genn/backends/opencl/backend.h +++ b/include/genn/backends/opencl/backend.h @@ -247,6 +247,9 @@ class BACKEND_EXPORT Backend : public BackendSIMT //! Different backends seed RNGs in different ways. Does this one initialise population RNGS on device? virtual bool isPopulationRNGInitialisedOnDevice() const override { return false; } + //! Backends which support batch-parallelism might require an additional host reduction phase after reduction kernels + virtual bool isHostReductionRequired() const override { return false; } + //! How many bytes of memory does 'device' have virtual size_t getDeviceMemoryBytes() const override { return m_ChosenDevice.getInfo(); } diff --git a/include/genn/backends/single_threaded_cpu/backend.h b/include/genn/backends/single_threaded_cpu/backend.h index 9f8ec684e5..d61983b3af 100644 --- a/include/genn/backends/single_threaded_cpu/backend.h +++ b/include/genn/backends/single_threaded_cpu/backend.h @@ -150,6 +150,9 @@ class BACKEND_EXPORT Backend : public BackendBase virtual bool isSynRemapRequired(const SynapseGroupInternal&) const override{ return false; } virtual bool isPostsynapticRemapRequired() const override{ return true; } + //! Backends which support batch-parallelism might require an additional host reduction phase after reduction kernels + virtual bool isHostReductionRequired() const override { return false; } + //! How many bytes of memory does 'device' have virtual size_t getDeviceMemoryBytes() const override{ return 0; } diff --git a/include/genn/genn/code_generator/backendBase.h b/include/genn/genn/code_generator/backendBase.h index d29a04ed9f..577e0df693 100644 --- a/include/genn/genn/code_generator/backendBase.h +++ b/include/genn/genn/code_generator/backendBase.h @@ -437,6 +437,9 @@ class GENN_EXPORT BackendBase //! Different backends may implement synaptic plasticity differently. Does this one require a postsynaptic remapping data structure? virtual bool isPostsynapticRemapRequired() const = 0; + //! Backends which support batch-parallelism might require an additional host reduction phase after reduction kernels + virtual bool isHostReductionRequired() const = 0; + //! 
How many bytes of memory does 'device' have virtual size_t getDeviceMemoryBytes() const = 0; diff --git a/include/genn/genn/code_generator/groupMerged.h b/include/genn/genn/code_generator/groupMerged.h index 296e96e644..fbdf055598 100644 --- a/include/genn/genn/code_generator/groupMerged.h +++ b/include/genn/genn/code_generator/groupMerged.h @@ -1788,4 +1788,84 @@ class GENN_EXPORT CustomWUUpdateSparseInitGroupMerged : public CustomUpdateInitG //---------------------------------------------------------------------------- static const std::string name; }; + +// ---------------------------------------------------------------------------- +// CustomUpdateHostReductionGroupMergedBase +//---------------------------------------------------------------------------- +template +class CustomUpdateHostReductionGroupMergedBase : public GroupMerged +{ +protected: + CustomUpdateHostReductionGroupMergedBase(size_t index, const std::string &precision, const BackendBase &backend, + const std::vector> &groups) + : GroupMerged(index, precision, groups) + { + // Loop through variables and add pointers if they are reduction targets + const CustomUpdateModels::Base *cm = this->getArchetype().getCustomUpdateModel(); + for(const auto &v : cm->getVars()) { + if(v.access & VarAccessModeAttribute::REDUCE) { + this->addPointerField(v.type, v.name, backend.getDeviceVarPrefix() + v.name); + } + } + + // Loop through variable references and add pointers if they are reduction targets + for(const auto &v : cm->getVarRefs()) { + if(v.access & VarAccessModeAttribute::REDUCE) { + this->addPointerField(v.type, v.name, backend.getDeviceVarPrefix() + v.name); + } + } + } +}; + +// ---------------------------------------------------------------------------- +// CustomUpdateHostReductionGroupMerged +//---------------------------------------------------------------------------- +class CustomUpdateHostReductionGroupMerged : public CustomUpdateHostReductionGroupMergedBase +{ +public: + CustomUpdateHostReductionGroupMerged(size_t index, const std::string &precision, const std::string &, const BackendBase &backend, + const std::vector> &groups); + + //------------------------------------------------------------------------ + // Public API + //------------------------------------------------------------------------ + void generateRunner(const BackendBase &backend, CodeStream &definitionsInternal, + CodeStream &definitionsInternalFunc, CodeStream &definitionsInternalVar, + CodeStream &runnerVarDecl, CodeStream &runnerMergedStructAlloc) const + { + generateRunnerBase(backend, definitionsInternal, definitionsInternalFunc, definitionsInternalVar, + runnerVarDecl, runnerMergedStructAlloc, name, true); + } + + //---------------------------------------------------------------------------- + // Static constants + //---------------------------------------------------------------------------- + static const std::string name; +}; + +// ---------------------------------------------------------------------------- +// CustomWUUpdateHostReductionGroupMerged +//---------------------------------------------------------------------------- +class CustomWUUpdateHostReductionGroupMerged : public CustomUpdateHostReductionGroupMergedBase +{ +public: + CustomWUUpdateHostReductionGroupMerged(size_t index, const std::string &precision, const std::string &, const BackendBase &backend, + const std::vector> &groups); + + //------------------------------------------------------------------------ + // Public API + 
//------------------------------------------------------------------------ + void generateRunner(const BackendBase &backend, CodeStream &definitionsInternal, + CodeStream &definitionsInternalFunc, CodeStream &definitionsInternalVar, + CodeStream &runnerVarDecl, CodeStream &runnerMergedStructAlloc) const + { + generateRunnerBase(backend, definitionsInternal, definitionsInternalFunc, definitionsInternalVar, + runnerVarDecl, runnerMergedStructAlloc, name, true); + } + + //---------------------------------------------------------------------------- + // Static constants + //---------------------------------------------------------------------------- + static const std::string name; +}; } // namespace CodeGenerator diff --git a/include/genn/genn/code_generator/modelSpecMerged.h b/include/genn/genn/code_generator/modelSpecMerged.h index 9e3df1bb03..2e2ea0a03c 100644 --- a/include/genn/genn/code_generator/modelSpecMerged.h +++ b/include/genn/genn/code_generator/modelSpecMerged.h @@ -150,6 +150,12 @@ class GENN_EXPORT ModelSpecMerged //! Get merged custom weight update groups where transpose needs to be calculated const std::vector &getMergedCustomUpdateTransposeWUGroups() const { return m_MergedCustomUpdateTransposeWUGroups; } + //! Get merged custom update groups where host reduction needs to be performed + const std::vector &getMergedCustomUpdateHostReductionGroups() const { return m_MergedCustomUpdateHostReductionGroups; } + + //! Get merged custom weight update groups where host reduction needs to be performed + const std::vector &getMergedCustomWUUpdateHostReductionGroups() const { return m_MergedCustomWUUpdateHostReductionGroups; } + void genMergedNeuronUpdateGroupStructs(CodeStream &os, const BackendBase &backend) const { genMergedStructures(os, backend, m_MergedNeuronUpdateGroups); } void genMergedPresynapticUpdateGroupStructs(CodeStream &os, const BackendBase &backend) const { genMergedStructures(os, backend, m_MergedPresynapticUpdateGroups); } void genMergedPostsynapticUpdateGroupStructs(CodeStream &os, const BackendBase &backend) const { genMergedStructures(os, backend, m_MergedPostsynapticUpdateGroups); } @@ -167,7 +173,9 @@ class GENN_EXPORT ModelSpecMerged void genMergedSynapseConnectivityHostInitStructs(CodeStream &os, const BackendBase &backend) const { genMergedStructures(os, backend, m_MergedSynapseConnectivityHostInitGroups); } void genMergedCustomUpdateStructs(CodeStream &os, const BackendBase &backend) const { genMergedStructures(os, backend, m_MergedCustomUpdateGroups); } void genMergedCustomUpdateWUStructs(CodeStream &os, const BackendBase &backend) const { genMergedStructures(os, backend, m_MergedCustomUpdateWUGroups); } - void gemMergedCustomUpdateTransposeWUStructs(CodeStream &os, const BackendBase &backend) const { genMergedStructures(os, backend, m_MergedCustomUpdateTransposeWUGroups); } + void genMergedCustomUpdateTransposeWUStructs(CodeStream &os, const BackendBase &backend) const { genMergedStructures(os, backend, m_MergedCustomUpdateTransposeWUGroups); } + void genMergedCustomUpdateHostReductionStructs(CodeStream &os, const BackendBase &backend) const { genMergedStructures(os, backend, m_MergedCustomUpdateHostReductionGroups); } + void genMergedCustomWUUpdateHostReductionStructs(CodeStream &os, const BackendBase &backend) const { genMergedStructures(os, backend, m_MergedCustomWUUpdateHostReductionGroups); } void genNeuronUpdateGroupSupportCode(CodeStream &os, bool supportsNamespace = true) const{ m_NeuronUpdateSupportCode.gen(os, getModel().getPrecision(), 
supportsNamespace); } void genPostsynapticDynamicsSupportCode(CodeStream &os, bool supportsNamespace = true) const{ m_PostsynapticDynamicsSupportCode.gen(os, getModel().getPrecision(), supportsNamespace); } @@ -409,6 +417,12 @@ class GENN_EXPORT ModelSpecMerged //! Merged custom weight update groups where transpose needs to be calculated std::vector m_MergedCustomUpdateTransposeWUGroups; + //! Get merged custom update groups where host reduction needs to be performed + std::vector m_MergedCustomUpdateHostReductionGroups; + + //! Get merged custom weight update groups where host reduction needs to be performed + std::vector m_MergedCustomWUUpdateHostReductionGroups; + //! Unique support code strings for neuron update SupportCodeMerged m_NeuronUpdateSupportCode; diff --git a/src/genn/backends/cuda/backend.cc b/src/genn/backends/cuda/backend.cc index 404cf60c4c..05b7abffa4 100644 --- a/src/genn/backends/cuda/backend.cc +++ b/src/genn/backends/cuda/backend.cc @@ -225,9 +225,9 @@ Backend::Backend(const KernelBlockSize &kernelBlockSizes, const Preferences &pre #ifdef _WIN32 // If we're on Windows and NCCL is enabled, give error // **NOTE** There are several NCCL Windows ports e.g. https://github.com/MyCaffe/NCCL but we don't have access to any suitable systems to test - if(getPreferences().enableNCCLReductions) { + /*if(getPreferences().enableNCCLReductions) { throw std::runtime_error("GeNN doesn't currently support NCCL on Windows"); - } + }*/ #endif // Add CUDA-specific types, additionally marking them as 'device types' innaccesible to host code @@ -616,7 +616,7 @@ void Backend::genCustomUpdate(CodeStream &os, const ModelSpecMerged &modelMerged // Generate struct definitions modelMerged.genMergedCustomUpdateStructs(os, *this); modelMerged.genMergedCustomUpdateWUStructs(os, *this); - modelMerged.gemMergedCustomUpdateTransposeWUStructs(os, *this); + modelMerged.genMergedCustomUpdateTransposeWUStructs(os, *this); // Generate arrays of merged structs and functions to push them genMergedStructArrayPush(os, modelMerged.getMergedCustomUpdateGroups()); diff --git a/src/genn/backends/opencl/backend.cc b/src/genn/backends/opencl/backend.cc index e88b3f35d9..d5a292c40d 100644 --- a/src/genn/backends/opencl/backend.cc +++ b/src/genn/backends/opencl/backend.cc @@ -875,7 +875,7 @@ void Backend::genCustomUpdate(CodeStream &os, const ModelSpecMerged &modelMerged // Generate struct definitions modelMerged.genMergedCustomUpdateStructs(customUpdateKernels, *this); modelMerged.genMergedCustomUpdateWUStructs(customUpdateKernels, *this); - modelMerged.gemMergedCustomUpdateTransposeWUStructs(customUpdateKernels, *this); + modelMerged.genMergedCustomUpdateTransposeWUStructs(customUpdateKernels, *this); // Generate data structure for accessing merged groups from within custom update kernels diff --git a/src/genn/backends/single_threaded_cpu/backend.cc b/src/genn/backends/single_threaded_cpu/backend.cc index b8f15efec9..9a362dca47 100644 --- a/src/genn/backends/single_threaded_cpu/backend.cc +++ b/src/genn/backends/single_threaded_cpu/backend.cc @@ -459,7 +459,7 @@ void Backend::genCustomUpdate(CodeStream &os, const ModelSpecMerged &modelMerged // Generate struct definitions modelMerged.genMergedCustomUpdateStructs(os, *this); modelMerged.genMergedCustomUpdateWUStructs(os, *this); - modelMerged.gemMergedCustomUpdateTransposeWUStructs(os, *this); + modelMerged.genMergedCustomUpdateTransposeWUStructs(os, *this); // Generate arrays of merged structs and functions to set them genMergedStructArrayPush(os, 
modelMerged.getMergedCustomUpdateGroups()); diff --git a/src/genn/genn/code_generator/generateRunner.cc b/src/genn/genn/code_generator/generateRunner.cc index b72c7b9899..37fc041a95 100644 --- a/src/genn/genn/code_generator/generateRunner.cc +++ b/src/genn/genn/code_generator/generateRunner.cc @@ -797,6 +797,18 @@ MemAlloc CodeGenerator::generateRunner(const filesystem::path &outputPath, const runnerVarDecl, runnerMergedStructAlloc); } + // Loop through custom update host reduction groups + for(const auto &m : modelMerged.getMergedCustomUpdateHostReductionGroups()) { + m.generateRunner(backend, definitionsInternal, definitionsInternalFunc, definitionsInternalVar, + runnerVarDecl, runnerMergedStructAlloc); + } + + // Loop through custom weight update host reduction groups + for(const auto &m : modelMerged.getMergedCustomWUUpdateHostReductionGroups()) { + m.generateRunner(backend, definitionsInternal, definitionsInternalFunc, definitionsInternalVar, + runnerVarDecl, runnerMergedStructAlloc); + } + allVarStreams << "// ------------------------------------------------------------------------" << std::endl; allVarStreams << "// local neuron groups" << std::endl; allVarStreams << "// ------------------------------------------------------------------------" << std::endl; diff --git a/src/genn/genn/code_generator/groupMerged.cc b/src/genn/genn/code_generator/groupMerged.cc index 2688346c59..711c0d8867 100644 --- a/src/genn/genn/code_generator/groupMerged.cc +++ b/src/genn/genn/code_generator/groupMerged.cc @@ -2212,3 +2212,41 @@ boost::uuids::detail::sha1::digest_type CustomWUUpdateSparseInitGroupMerged::get return hash.get_digest(); } + +// ---------------------------------------------------------------------------- +// CustomUpdateHostReductionGroupMerged +//---------------------------------------------------------------------------- +const std::string CustomUpdateHostReductionGroupMerged::name = "CustomUpdateHostReduction"; +//---------------------------------------------------------------------------- +CustomUpdateHostReductionGroupMerged::CustomUpdateHostReductionGroupMerged(size_t index, const std::string &precision, const std::string &, const BackendBase &backend, + const std::vector> &groups) +: CustomUpdateHostReductionGroupMergedBase(index, precision, backend, groups) +{ + addField("unsigned int", "size", + [](const CustomUpdateInternal &c, size_t) { return std::to_string(c.getSize()); }); + + // If some variables are delayed, add delay pointer + // **NOTE** this is HOST delay pointer + if(getArchetype().getDelayNeuronGroup() != nullptr) { + addField("unsigned int*", "spkQuePtr", + [&](const CustomUpdateInternal &cg, size_t) + { + return "spkQuePtr" + cg.getDelayNeuronGroup()->getName(); + }); + } +} + +// ---------------------------------------------------------------------------- +// CustomWUUpdateHostReductionGroupMerged +//---------------------------------------------------------------------------- +const std::string CustomWUUpdateHostReductionGroupMerged::name = "CustomWUUpdateHostReduction"; +//---------------------------------------------------------------------------- +CustomWUUpdateHostReductionGroupMerged::CustomWUUpdateHostReductionGroupMerged(size_t index, const std::string &precision, const std::string &, const BackendBase &backend, + const std::vector> &groups) +: CustomUpdateHostReductionGroupMergedBase(index, precision, backend, groups) +{ + addField("unsigned int", "rowStride", + [&backend](const CustomUpdateWUInternal &cg, size_t) { return 
std::to_string(cg.getSynapseGroup()->getMaxConnections()); });
+ addField("unsigned int", "numSrcNeurons",
+ [](const CustomUpdateWUInternal &cg, size_t) { return std::to_string(cg.getSynapseGroup()->getSrcNeuronGroup()->getNumNeurons()); });
+}
\ No newline at end of file
diff --git a/src/genn/genn/code_generator/modelSpecMerged.cc b/src/genn/genn/code_generator/modelSpecMerged.cc
index d09d441067..be66d8b54a 100644
--- a/src/genn/genn/code_generator/modelSpecMerged.cc
+++ b/src/genn/genn/code_generator/modelSpecMerged.cc
@@ -142,6 +142,17 @@ ModelSpecMerged::ModelSpecMerged(const ModelSpecInternal &model, const BackendBa
 [](const CustomUpdateWUInternal &cg) { return cg.isTransposeOperation(); },
 &CustomUpdateWUInternal::getHashDigest);
+ if(backend.isHostReductionRequired()) {
+ LOGD_CODE_GEN << "Merging custom update host reduction groups:";
+ createMergedGroupsHash(model, backend, model.getCustomUpdates(), m_MergedCustomUpdateHostReductionGroups,
+ [](const CustomUpdateInternal &cg) { return cg.isReduction(); },
+ &CustomUpdateInternal::getHashDigest);
+
+ LOGD_CODE_GEN << "Merging custom weight update host reduction groups:";
+ createMergedGroupsHash(model, backend, model.getCustomWUUpdates(), m_MergedCustomWUUpdateHostReductionGroups,
+ [](const CustomUpdateWUInternal &cg) { return cg.isReduction(); },
+ &CustomUpdateWUInternal::getHashDigest);
+ }
 // Get memory spaces available to this backend
 // **NOTE** Memory spaces are given out on a first-come, first-serve basis so subsequent groups are in preferential order
diff --git a/tests/features/custom_update_reduction/model.cc b/tests/features/custom_update_reduction/model.cc
index 1741e65180..c56f808f25 100644
--- a/tests/features/custom_update_reduction/model.cc
+++ b/tests/features/custom_update_reduction/model.cc
@@ -52,6 +52,9 @@ void modelDefinition(ModelSpec &model)
 GENN_PREFERENCES.manualPlatformID = std::atoi(std::getenv("OPENCL_PLATFORM"));
 }
#endif
+ GENN_PREFERENCES.deviceSelectMethod = DeviceSelect::MANUAL;
+ GENN_PREFERENCES.blockSizeSelectMethod = BlockSizeSelect::MANUAL;
+ GENN_PREFERENCES.enableNCCLReductions = true;
 model.setDT(1.0);
 model.setName("custom_update_reduction");
 model.setBatchSize(5);
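Between patches 04 and 05 it is worth spelling out what the new merged structs contain: for every variable or variable reference whose access mode has the REDUCE attribute, the struct carries the device pointer (prefixed with the backend's device variable prefix) plus a size field, and patch 05 below consumes exactly these fields. A rough sketch of the runner code that generation produces — the merged struct array name and the SUM-reduced variable "Sum" are hypothetical, while CHECK_NCCL_ERRORS and ncclCommunicator are the helpers generated in patch 01:

// Shape of the generated host reduction for one merged group: an in-place
// ncclAllReduce (send and receive buffers alias) leaves every rank holding
// the combined value of the reduced variable
for(unsigned int g = 0; g < 1; g++) {
    // Get reference to group
    const auto *group = &mergedCustomUpdateHostReductionGroup0[g];
    CHECK_NCCL_ERRORS(ncclAllReduce(group->Sum, group->Sum, group->size,
                                    ncclFloat32, ncclSum, ncclCommunicator, 0));
}

From e085a7a85e384be984f3c2d66b8940a3af87e61e Mon Sep 17 00:00:00 2001
From: neworderofjamie
Date: Tue, 3 Aug 2021 15:32:40 +0100
Subject: [PATCH 05/17] implement NCCL reductions

---
 .../genn/genn/code_generator/groupMerged.h | 4 +-
 src/genn/backends/cuda/backend.cc | 97 +++++++++++++++++++
 src/genn/genn/code_generator/groupMerged.cc | 13 +--
 3 files changed, 106 insertions(+), 8 deletions(-)

diff --git a/include/genn/genn/code_generator/groupMerged.h b/include/genn/genn/code_generator/groupMerged.h
index fbdf055598..95ead59e9e 100644
--- a/include/genn/genn/code_generator/groupMerged.h
+++ b/include/genn/genn/code_generator/groupMerged.h
@@ -1824,7 +1824,7 @@ class CustomUpdateHostReductionGroupMerged : public CustomUpdateHostReductionGro
 {
public:
 CustomUpdateHostReductionGroupMerged(size_t index, const std::string &precision, const std::string &, const BackendBase &backend,
- const std::vector> &groups);
+ const std::vector> &groups);
 //------------------------------------------------------------------------
 // Public API
@@ -1850,7 +1850,7 @@ class CustomWUUpdateHostReductionGroupMerged : public CustomUpdateHostReductionG
 {
public:
 CustomWUUpdateHostReductionGroupMerged(size_t index, const std::string &precision, const std::string &, const BackendBase &backend,
- const std::vector> &groups);
+ const std::vector> &groups);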
//------------------------------------------------------------------------ // Public API diff --git a/src/genn/backends/cuda/backend.cc b/src/genn/backends/cuda/backend.cc index 05b7abffa4..5d7dfae616 100644 --- a/src/genn/backends/cuda/backend.cc +++ b/src/genn/backends/cuda/backend.cc @@ -194,6 +194,84 @@ const std::vector &getFunctionTemplates(const s { return (precision == "double") ? cudaDoublePrecisionFunctions : cudaSinglePrecisionFunctions; } +//----------------------------------------------------------------------- +std::string getNCCLReductionType(VarAccessMode mode) +{ + // Convert GeNN reduction types to NCCL + if(mode & VarAccessModeAttribute::MAX) { + return "ncclMax"; + } + else if(mode & VarAccessModeAttribute::SUM) { + return "ncclSum"; + } + else { + throw std::runtime_error("Reduction type unsupported by NCCL"); + } +} +//----------------------------------------------------------------------- +std::string getNCCLType(const std::string &type, const std::string &precision) +{ + // Convert GeNN types to NCCL types + // **YUCK** GeNN really needs a better type system + if(type == "scalar") { + return (precision == "float") ? "ncclFloat32" : "ncclFloat64"; + } + else if(type == "char" || type == "signed char" || type == "int8_t") { + return "ncclInt8"; + } + else if(type == "unsigned char" || type == "uint8_t") { + return "ncclUint8"; + } + else if(type == "int" || type == "signed int" || type == "signed" || type == "int32_t") { + return "ncclInt32"; + } + else if(type == "unsigned" || type == "unsigned int" || type == "uint32_t") { + return "ncclUint32"; + } + else if(type == "half") { + return "ncclFloat16"; + } + else if(type == "float") { + return "ncclFloat32"; + } + else if(type == "double") { + return "ncclFloat64"; + } + else { + throw std::runtime_error("Data type '" + type + "' unsupported by NCCL"); + } +} +//----------------------------------------------------------------------- +template +void genNCCLReduction(CodeStream &os, const G &cg, const std::string &precision) +{ + CodeStream::Scope b(os); + os << "// merged custom update host reduction group " << cg.getIndex() << std::endl; + os << "for(unsigned int g = 0; g < " << cg.getGroups().size() << "; g++)"; + { + CodeStream::Scope b(os); + + // Get reference to group + os << "const auto *group = &merged" << G::name << "Group" << cg.getIndex() << "[g]; " << std::endl; + + // Loop through variables and add pointers if they are reduction targets + const CustomUpdateModels::Base *cm = cg.getArchetype().getCustomUpdateModel(); + for(const auto &v : cm->getVars()) { + if(v.access & VarAccessModeAttribute::REDUCE) { + os << "CHECK_NCCL_ERRORS(ncclAllReduce(group->" << v.name << ", group->" << v.name << ", group->size"; + os << ", " << getNCCLType(v.type, precision) << ", " << getNCCLReductionType(getVarAccessMode(v.access)) << ", ncclCommunicator, 0)); " << std::endl; + } + } + + // Loop through variable references and add pointers if they are reduction targets + for(const auto &v : cm->getVarRefs()) { + if(v.access & VarAccessModeAttribute::REDUCE) { + os << "CHECK_NCCL_ERRORS(ncclAllReduce(group->" << v.name << ", group->" << v.name << ", group->size"; + os << ", " << getNCCLType(v.type, precision) << ", " << getNCCLReductionType(v.access) << ", ncclCommunicator, 0));" << std::endl; + } + } + } +} } // Anonymous namespace //-------------------------------------------------------------------------- @@ -729,6 +807,25 @@ void Backend::genCustomUpdate(CodeStream &os, const ModelSpecMerged &modelMerged os << 
"CHECK_CUDA_ERRORS(cudaPeekAtLastError());" << std::endl; } + // If NCCL reductions are enabled + if(getPreferences().enableNCCLReductions) { + // Loop through custom update host reduction groups and + // generate reductions for those in this custom update group + for(const auto &cg : modelMerged.getMergedCustomUpdateHostReductionGroups()) { + if(cg.getArchetype().getUpdateGroupName() == g) { + genNCCLReduction(os, cg, model.getPrecision()); + } + } + + // Loop through custom update host reduction groups and + // generate reductions for those in this custom update group + for(const auto &cg : modelMerged.getMergedCustomWUUpdateHostReductionGroups()) { + if(cg.getArchetype().getUpdateGroupName() == g) { + genNCCLReduction(os, cg, model.getPrecision()); + } + } + } + // If timing is enabled if(model.isTimingEnabled()) { // Synchronise last event diff --git a/src/genn/genn/code_generator/groupMerged.cc b/src/genn/genn/code_generator/groupMerged.cc index 711c0d8867..1fe0417383 100644 --- a/src/genn/genn/code_generator/groupMerged.cc +++ b/src/genn/genn/code_generator/groupMerged.cc @@ -2219,7 +2219,7 @@ boost::uuids::detail::sha1::digest_type CustomWUUpdateSparseInitGroupMerged::get const std::string CustomUpdateHostReductionGroupMerged::name = "CustomUpdateHostReduction"; //---------------------------------------------------------------------------- CustomUpdateHostReductionGroupMerged::CustomUpdateHostReductionGroupMerged(size_t index, const std::string &precision, const std::string &, const BackendBase &backend, - const std::vector> &groups) + const std::vector> &groups) : CustomUpdateHostReductionGroupMergedBase(index, precision, backend, groups) { addField("unsigned int", "size", @@ -2242,11 +2242,12 @@ CustomUpdateHostReductionGroupMerged::CustomUpdateHostReductionGroupMerged(size_ const std::string CustomWUUpdateHostReductionGroupMerged::name = "CustomWUUpdateHostReduction"; //---------------------------------------------------------------------------- CustomWUUpdateHostReductionGroupMerged::CustomWUUpdateHostReductionGroupMerged(size_t index, const std::string &precision, const std::string &, const BackendBase &backend, - const std::vector> &groups) + const std::vector> &groups) : CustomUpdateHostReductionGroupMergedBase(index, precision, backend, groups) { - addField("unsigned int", "rowStride", - [&backend](const CustomUpdateWUInternal &cg, size_t) { return std::to_string(cg.getSynapseGroup()->getMaxConnections()); }); - addField("unsigned int", "numSrcNeurons", - [](const CustomUpdateWUInternal &cg, size_t) { return std::to_string(cg.getSynapseGroup()->getSrcNeuronGroup()->getNumNeurons()); }); + addField("unsigned int", "size", + [&backend](const CustomUpdateWUInternal &cg, size_t) + { + return std::to_string(cg.getSynapseGroup()->getMaxConnections() * (size_t)cg.getSynapseGroup()->getSrcNeuronGroup()->getNumNeurons()); + }); } \ No newline at end of file From e2ffaeecfbd35856b92a437a6225e63461a2d737 Mon Sep 17 00:00:00 2001 From: neworderofjamie Date: Tue, 3 Aug 2021 15:51:03 +0100 Subject: [PATCH 06/17] interface to expose ncclUniqueID data --- src/genn/backends/cuda/backend.cc | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/genn/backends/cuda/backend.cc b/src/genn/backends/cuda/backend.cc index 5d7dfae616..763454f864 100644 --- a/src/genn/backends/cuda/backend.cc +++ b/src/genn/backends/cuda/backend.cc @@ -1067,7 +1067,11 @@ void Backend::genDefinitionsPreamble(CodeStream &os, const ModelSpecMerged &) co // If NCCL is enabled, export 
ncclGetUniqueId function if(getPreferences().enableNCCLReductions) { - os << "extern \"C\" EXPORT_FUNC void ncclGetUniqueId();" << std::endl; + os << "extern \"C\" {" << std::endl; + os << "EXPORT_VAR const unsigned int ncclUniqueIDBytes;" << std::endl; + os << "EXPORT_FUNC void generateNCCLUniqueID();" << std::endl; + os << "EXPORT_FUNC const char *getNCCLUniqueID();" << std::endl; + os << "}" << std::endl; } } //-------------------------------------------------------------------------- @@ -1307,14 +1311,23 @@ void Backend::genRunnerPreamble(CodeStream &os, const ModelSpecMerged&, const Me os << "ncclUniqueId ncclID;" << std::endl; os << "ncclComm_t ncclCommunicator;" << std::endl; - // Define wrapper around ncclGetUniqueId function + // Define constant to expose NCCL_UNIQUE_ID_BYTES + os << "const unsigned int ncclUniqueIDBytes = NCCL_UNIQUE_ID_BYTES;" << std::endl; + + // Define wrapper to generate a unique NCCL ID os << std::endl; - os << "void ncclGetUniqueId()"; + os << "void generateNCCLUniqueID()"; { CodeStream::Scope b(os); os << "CHECK_NCCL_ERRORS(ncclGetUniqueId(&ncclID));" << std::endl; } os << std::endl; + os << "const char *getNCCLUniqueID()"; + { + CodeStream::Scope b(os); + os << "return reinterpret_cast(&ncclID);" << std::endl; + } + os << std::endl; } } //-------------------------------------------------------------------------- From 23731c45a485b6dfc1bdd2f4e16ffc0a7a644f52 Mon Sep 17 00:00:00 2001 From: James Knight Date: Tue, 3 Aug 2021 16:58:03 +0200 Subject: [PATCH 07/17] missing makefile --- tests/features/custom_update_reduction/Makefile | 1 + 1 file changed, 1 insertion(+) create mode 120000 tests/features/custom_update_reduction/Makefile diff --git a/tests/features/custom_update_reduction/Makefile b/tests/features/custom_update_reduction/Makefile new file mode 120000 index 0000000000..1302b13ca5 --- /dev/null +++ b/tests/features/custom_update_reduction/Makefile @@ -0,0 +1 @@ +../../utils/Makefile \ No newline at end of file From 012347cc6f3d350bc773d8936bf4833864ebefa7 Mon Sep 17 00:00:00 2001 From: neworderofjamie Date: Tue, 3 Aug 2021 16:08:40 +0100 Subject: [PATCH 08/17] missing bracket --- src/genn/backends/cuda/backend.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/genn/backends/cuda/backend.cc b/src/genn/backends/cuda/backend.cc index 763454f864..0ca5b84077 100644 --- a/src/genn/backends/cuda/backend.cc +++ b/src/genn/backends/cuda/backend.cc @@ -1099,6 +1099,7 @@ void Backend::genDefinitionsInternalPreamble(CodeStream &os, const ModelSpecMerg os << " if (error != ncclSuccess) {\\" << std::endl; os << " throw std::runtime_error(__FILE__\": \" + std::to_string(__LINE__) + \": nccl error \" + std::to_string(error) + \": \" + ncclGetErrorString(error));\\" << std::endl; os << " }\\" << std::endl; + os << "}" << std::endl; } os << std::endl; From 558508829ad7eec3348a23acce1b4007dac85f32 Mon Sep 17 00:00:00 2001 From: neworderofjamie Date: Tue, 3 Aug 2021 16:43:23 +0100 Subject: [PATCH 09/17] fixed windows linker issues --- include/genn/genn/code_generator/groupMerged.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/genn/genn/code_generator/groupMerged.h b/include/genn/genn/code_generator/groupMerged.h index 95ead59e9e..c02d9b1c1f 100644 --- a/include/genn/genn/code_generator/groupMerged.h +++ b/include/genn/genn/code_generator/groupMerged.h @@ -1820,7 +1820,7 @@ class CustomUpdateHostReductionGroupMergedBase : public GroupMerged // ---------------------------------------------------------------------------- // 
CustomUpdateHostReductionGroupMerged //---------------------------------------------------------------------------- -class CustomUpdateHostReductionGroupMerged : public CustomUpdateHostReductionGroupMergedBase +class GENN_EXPORT CustomUpdateHostReductionGroupMerged : public CustomUpdateHostReductionGroupMergedBase { public: CustomUpdateHostReductionGroupMerged(size_t index, const std::string &precision, const std::string &, const BackendBase &backend, @@ -1846,7 +1846,7 @@ class CustomUpdateHostReductionGroupMerged : public CustomUpdateHostReductionGro // ---------------------------------------------------------------------------- // CustomWUUpdateHostReductionGroupMerged //---------------------------------------------------------------------------- -class CustomWUUpdateHostReductionGroupMerged : public CustomUpdateHostReductionGroupMergedBase +class GENN_EXPORT CustomWUUpdateHostReductionGroupMerged : public CustomUpdateHostReductionGroupMergedBase { public: CustomWUUpdateHostReductionGroupMerged(size_t index, const std::string &precision, const std::string &, const BackendBase &backend, From 257cd546b0e296cc0442362bed01155b97fe804f Mon Sep 17 00:00:00 2001 From: neworderofjamie Date: Tue, 3 Aug 2021 17:27:57 +0100 Subject: [PATCH 10/17] Added functionality to backends to generate ``freeMem`` preamble and used to destroy NCCL communicator --- include/genn/backends/cuda/backend.h | 1 + include/genn/backends/opencl/backend.h | 1 + include/genn/backends/single_threaded_cpu/backend.h | 1 + include/genn/genn/code_generator/backendBase.h | 4 ++++ src/genn/backends/cuda/backend.cc | 8 ++++++++ src/genn/backends/opencl/backend.cc | 4 ++++ src/genn/backends/single_threaded_cpu/backend.cc | 4 ++++ src/genn/genn/code_generator/generateRunner.cc | 5 ++++- 8 files changed, 27 insertions(+), 1 deletion(-) diff --git a/include/genn/backends/cuda/backend.h b/include/genn/backends/cuda/backend.h index 8e1f20c5d7..c68b49035d 100644 --- a/include/genn/backends/cuda/backend.h +++ b/include/genn/backends/cuda/backend.h @@ -192,6 +192,7 @@ class BACKEND_EXPORT Backend : public BackendSIMT virtual void genDefinitionsInternalPreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const override; virtual void genRunnerPreamble(CodeStream &os, const ModelSpecMerged &modelMerged, const MemAlloc &memAlloc) const override; virtual void genAllocateMemPreamble(CodeStream &os, const ModelSpecMerged &modelMerged, const MemAlloc &memAlloc) const override; + virtual void genFreeMemPreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const override; virtual void genStepTimeFinalisePreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const override; virtual void genVariableDefinition(CodeStream &definitions, CodeStream &definitionsInternal, const std::string &type, const std::string &name, VarLocation loc) const override; diff --git a/include/genn/backends/opencl/backend.h b/include/genn/backends/opencl/backend.h index eb6640f417..3027ae2a67 100644 --- a/include/genn/backends/opencl/backend.h +++ b/include/genn/backends/opencl/backend.h @@ -161,6 +161,7 @@ class BACKEND_EXPORT Backend : public BackendSIMT virtual void genDefinitionsInternalPreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const override; virtual void genRunnerPreamble(CodeStream &os, const ModelSpecMerged &modelMerged, const MemAlloc &memAlloc) const override; virtual void genAllocateMemPreamble(CodeStream &os, const ModelSpecMerged &modelMerged, const MemAlloc &allocations) const override; + virtual void 
genFreeMemPreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const override;
 virtual void genStepTimeFinalisePreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const override;
 virtual void genVariableDefinition(CodeStream &definitions, CodeStream &definitionsInternal, const std::string &type, const std::string &name, VarLocation loc) const override;
diff --git a/include/genn/backends/single_threaded_cpu/backend.h b/include/genn/backends/single_threaded_cpu/backend.h
index d61983b3af..6b0e1ff708 100644
--- a/include/genn/backends/single_threaded_cpu/backend.h
+++ b/include/genn/backends/single_threaded_cpu/backend.h
@@ -69,6 +69,7 @@ class BACKEND_EXPORT Backend : public BackendBase
 virtual void genDefinitionsInternalPreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const override;
 virtual void genRunnerPreamble(CodeStream &os, const ModelSpecMerged &modelMerged, const MemAlloc &memAlloc) const override;
 virtual void genAllocateMemPreamble(CodeStream &os, const ModelSpecMerged &modelMerged, const MemAlloc &memAlloc) const override;
+ virtual void genFreeMemPreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const override;
 virtual void genStepTimeFinalisePreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const override;
 virtual void genVariableDefinition(CodeStream &definitions, CodeStream &definitionsInternal, const std::string &type, const std::string &name, VarLocation loc) const override;
diff --git a/include/genn/genn/code_generator/backendBase.h b/include/genn/genn/code_generator/backendBase.h
index 577e0df693..5de5ce95c7 100644
--- a/include/genn/genn/code_generator/backendBase.h
+++ b/include/genn/genn/code_generator/backendBase.h
@@ -299,6 +299,10 @@ class GENN_EXPORT BackendBase
 //! Therefore it's a good place for any global initialisation. This function generates a 'preamble' to this function.
 virtual void genAllocateMemPreamble(CodeStream &os, const ModelSpecMerged &modelMerged, const MemAlloc &memAlloc) const = 0;
+ //! Free memory is called by user code to free all memory allocated by GeNN and should only ever be called once.
+ //! This function generates a 'preamble' to this function, for example to free backend-specific objects
+ virtual void genFreeMemPreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const = 0;
+
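For context, this hook mirrors genAllocateMemPreamble at the other end of the model's lifetime: whatever a backend writes here lands at the top of the generated freeMem(), before the per-variable frees. A minimal sketch of the generated function once the CUDA backend emits its NCCL preamble (shown later in this patch); the individual frees are abbreviated:

// Sketch of generated freeMem() when NCCL reductions are enabled
void freeMem() {
    // Backend-specific preamble emitted via genFreeMemPreamble()
    CHECK_NCCL_ERRORS(ncclCommDestroy(ncclCommunicator));

    // ... cudaFreeHost()/cudaFree() calls for every variable follow ...
}

 //! 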
After all timestep logic is complete virtual void genStepTimeFinalisePreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const = 0; diff --git a/src/genn/backends/cuda/backend.cc b/src/genn/backends/cuda/backend.cc index 0ca5b84077..934120c7cf 100644 --- a/src/genn/backends/cuda/backend.cc +++ b/src/genn/backends/cuda/backend.cc @@ -1384,6 +1384,14 @@ void Backend::genAllocateMemPreamble(CodeStream &os, const ModelSpecMerged &mode os << std::endl; } //-------------------------------------------------------------------------- +void Backend::genFreeMemPreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const +{ + // Free NCCL communicator + if(getPreferences().enableNCCLReductions) { + os << "CHECK_NCCL_ERRORS(ncclCommDestroy(ncclCommunicator));" << std::endl; + } +} +//-------------------------------------------------------------------------- void Backend::genStepTimeFinalisePreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const { // Synchronise if automatic copying or zero-copy are in use diff --git a/src/genn/backends/opencl/backend.cc b/src/genn/backends/opencl/backend.cc index d5a292c40d..6a9d5c094e 100644 --- a/src/genn/backends/opencl/backend.cc +++ b/src/genn/backends/opencl/backend.cc @@ -1774,6 +1774,10 @@ void Backend::genAllocateMemPreamble(CodeStream &os, const ModelSpecMerged &mode } } //-------------------------------------------------------------------------- +void Backend::genFreeMemPreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const +{ +} +//-------------------------------------------------------------------------- void Backend::genStepTimeFinalisePreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const { // If timing is enabled, synchronise diff --git a/src/genn/backends/single_threaded_cpu/backend.cc b/src/genn/backends/single_threaded_cpu/backend.cc index 9a362dca47..5a8dd0eb86 100644 --- a/src/genn/backends/single_threaded_cpu/backend.cc +++ b/src/genn/backends/single_threaded_cpu/backend.cc @@ -1091,6 +1091,10 @@ void Backend::genAllocateMemPreamble(CodeStream&, const ModelSpecMerged&, const { } //-------------------------------------------------------------------------- +void Backend::genFreeMemPreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const +{ +} +//-------------------------------------------------------------------------- void Backend::genStepTimeFinalisePreamble(CodeStream &, const ModelSpecMerged &) const { } diff --git a/src/genn/genn/code_generator/generateRunner.cc b/src/genn/genn/code_generator/generateRunner.cc index 37fc041a95..0265fc9c6c 100644 --- a/src/genn/genn/code_generator/generateRunner.cc +++ b/src/genn/genn/code_generator/generateRunner.cc @@ -1569,7 +1569,7 @@ MemAlloc CodeGenerator::generateRunner(const filesystem::path &outputPath, const { CodeStream::Scope b(runner); - // Generate preamble -this is the first bit of generated code called by user simulations + // Generate preamble - this is the first bit of generated code called by user simulations // so global initialisation is often performed here backend.genAllocateMemPreamble(runner, modelMerged, mem); @@ -1587,6 +1587,9 @@ MemAlloc CodeGenerator::generateRunner(const filesystem::path &outputPath, const { CodeStream::Scope b(runner); + // Generate backend-specific preamble + backend.genFreeMemPreamble(runner, modelMerged); + // Write variable frees to runner runner << runnerVarFreeStream.str(); } From 9289eaba986644c72932441112b47332d92871f1 Mon Sep 17 00:00:00 2001 From: neworderofjamie Date: Tue, 3 Aug 2021 17:46:55 
+0100
Subject: [PATCH 11/17] Tweaks

* Adding allocateMem parameters is a bad idea as it breaks calling via sharedLibraryModel
* Exposed all NCCL functionality via separate generated functions

---
 generate_swig_interfaces.py | 6 ---
 .../include/sharedLibraryModelNumpy.h | 7 +++
 src/genn/backends/cuda/backend.cc | 43 ++++++++-----------
 userproject/include/sharedLibraryModel.h | 37 +++++++++++++++-
 4 files changed, 60 insertions(+), 33 deletions(-)

diff --git a/generate_swig_interfaces.py b/generate_swig_interfaces.py
index 0792a47672..ac04bbad5d 100644
--- a/generate_swig_interfaces.py
+++ b/generate_swig_interfaces.py
@@ -298,12 +298,6 @@ def generateNumpyApplyInArray1D( dataType, varName, sizeName ):
 '''Generates a line which applies numpy IN_ARRAY1 typemap to variable. IN_ARRAY1 is used to pass a numpy array as C array to C code'''
 return Template( '%apply ( ${data_t} IN_ARRAY1, int DIM1 ) {( ${data_t} ${varName}, int ${sizeName} )};\n').substitute( data_t=dataType, varName=varName, sizeName=sizeName )
-def generateBuiltInGetter( models ):
- return Template('''std::vector< std::string > getBuiltInModels() {
- return std::vector{"${MODELS}"};
-}
-''').substitute( MODELS='", "'.join( models ) )
-
 def generateSharedLibraryModelInterface( swigPath ):
 '''Generates SharedLibraryModelNumpy.i file'''
diff --git a/pygenn/genn_wrapper/include/sharedLibraryModelNumpy.h b/pygenn/genn_wrapper/include/sharedLibraryModelNumpy.h
index 7873d51833..2c8d0cc814 100644
--- a/pygenn/genn_wrapper/include/sharedLibraryModelNumpy.h
+++ b/pygenn/genn_wrapper/include/sharedLibraryModelNumpy.h
@@ -41,10 +41,17 @@ class SharedLibraryModelNumpy : public SharedLibraryModel
 *varPtr = this->template getScalar(varName);
 *n1 = 1;
 }
+
+ void ncclAssignExternalUniqueID(unsigned char** varPtr, int* n1)
+ {
+ *varPtr = ncclGetUniqueID();
+ *n1 = 128;
+ }
private:
 // Hide C++ based public API
 using SharedLibraryModel::getSymbol;
 using SharedLibraryModel::getArray;
 using SharedLibraryModel::getScalar;
+ using SharedLibraryModel::ncclGetUniqueID;
};
diff --git a/src/genn/backends/cuda/backend.cc b/src/genn/backends/cuda/backend.cc
index 934120c7cf..e24d5d62b7 100644
--- a/src/genn/backends/cuda/backend.cc
+++ b/src/genn/backends/cuda/backend.cc
@@ -1069,8 +1069,9 @@ void Backend::genDefinitionsPreamble(CodeStream &os, const ModelSpecMerged &) co
 if(getPreferences().enableNCCLReductions) {
 os << "extern \"C\" {" << std::endl;
 os << "EXPORT_VAR const unsigned int ncclUniqueIDBytes;" << std::endl;
- os << "EXPORT_FUNC void generateNCCLUniqueID();" << std::endl;
- os << "EXPORT_FUNC const char *getNCCLUniqueID();" << std::endl;
+ os << "EXPORT_FUNC void ncclGenerateUniqueID();" << std::endl;
+ os << "EXPORT_FUNC void ncclInitCommunicator(int rank, int numRanks);" << std::endl;
+ os << "EXPORT_FUNC unsigned char *ncclGetUniqueID();" << std::endl;
 os << "}" << std::endl;
 }
 }
//--------------------------------------------------------------------------
@@ -1317,16 +1318,22 @@ void Backend::genRunnerPreamble(CodeStream &os, const ModelSpecMerged&, const Me
 // Define wrapper to generate a unique NCCL ID
 os << std::endl;
- os << "void generateNCCLUniqueID()";
+ os << "void ncclGenerateUniqueID()";
 {
 CodeStream::Scope b(os);
 os << "CHECK_NCCL_ERRORS(ncclGetUniqueId(&ncclID));" << std::endl;
 }
 os << std::endl;
- os << "const char *getNCCLUniqueID()";
+ os << "unsigned char *ncclGetUniqueID()";
 {
 CodeStream::Scope b(os);
- os << "return reinterpret_cast<const char*>(&ncclID);" << std::endl;
+ os << "return reinterpret_cast<unsigned char*>(&ncclID);" << std::endl;
+ }
+ os << std::endl;
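The wrappers generated here deliberately split communicator setup into three exported calls - create the ID on one rank, move its raw bytes between processes, then collectively initialise - so the transport for the ID stays outside GeNN. A minimal sketch of the intended calling sequence, assuming MPI as that transport (MPI is an assumption, not part of this patch; any mechanism that copies ncclUniqueIDBytes bytes from rank 0 to every other rank works):

#include <mpi.h>

#include "definitions.h" // generated model code

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank, numRanks;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &numRanks);

    allocateMem();                          // also selects the CUDA device
    if(rank == 0) {
        ncclGenerateUniqueID();             // only the root creates an ID
    }

    // ncclGetUniqueID() exposes the raw bytes of the global ncclID, so this
    // both sends from rank 0 and receives into every other rank's copy
    MPI_Bcast(ncclGetUniqueID(), ncclUniqueIDBytes, MPI_BYTE, 0, MPI_COMM_WORLD);
    ncclInitCommunicator(rank, numRanks);   // collective - every rank must call

    initialize();
    // ... simulate; custom update reductions now all-reduce across ranks ...
    MPI_Finalize();
    return 0;
}

+ os << "void ncclInitCommunicator(int rank, int numRanks)";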
+ { + CodeStream::Scope b(os); + os << "CHECK_NCCL_ERRORS(ncclCommInitRank(&ncclCommunicator, numRanks, ncclID, rank));" << std::endl; } os << std::endl; } @@ -1375,12 +1382,6 @@ void Backend::genAllocateMemPreamble(CodeStream &os, const ModelSpecMerged &mode os << "));" << std::endl; os << "CHECK_CUDA_ERRORS(cudaSetDevice(deviceID));" << std::endl; } - - // Initialise NCCL communicator - if(getPreferences().enableNCCLReductions) { - os << "CHECK_NCCL_ERRORS(ncclCommInitRank(&ncclCommunicator, numRanks, ncclID, rank));" << std::endl; - } - os << std::endl; } //-------------------------------------------------------------------------- @@ -1870,30 +1871,20 @@ void Backend::genMSBuildImportTarget(std::ostream &os) const std::string Backend::getAllocateMemParams(const ModelSpecMerged &) const { // If device should be selected at runtime - std::string params; if(getPreferences().deviceSelectMethod == DeviceSelect::MANUAL_RUNTIME) { // If devices should be delected by ID, add an integer parameter if(getPreferences().selectGPUByDeviceID) { - params += "int deviceID"; + return "int deviceID"; } // Otherwise, add a pci bus ID parameter else { - params += "const char *pciBusID"; + return "const char *pciBusID"; } } - - // If NCCL reductions are enabled - if(getPreferences().enableNCCLReductions) { - // If there are existing parameters, add comma - if(!params.empty()) { - params += ", "; - } - - // Add num ranks and rank parameter - params += "int numRanks, int rank"; + // Othewise, no parameters are required + else { + return ""; } - - return params; } //-------------------------------------------------------------------------- Backend::MemorySpaces Backend::getMergedGroupMemorySpaces(const ModelSpecMerged &modelMerged) const diff --git a/userproject/include/sharedLibraryModel.h b/userproject/include/sharedLibraryModel.h index 69e40033d4..42c24e7cf4 100644 --- a/userproject/include/sharedLibraryModel.h +++ b/userproject/include/sharedLibraryModel.h @@ -34,7 +34,8 @@ class SharedLibraryModel SharedLibraryModel() : m_Library(nullptr), m_AllocateMem(nullptr), m_AllocateRecordingBuffers(nullptr), m_FreeMem(nullptr), m_Initialize(nullptr), m_InitializeSparse(nullptr), - m_StepTime(nullptr), m_PullRecordingBuffersFromDevice(nullptr) + m_StepTime(nullptr), m_PullRecordingBuffersFromDevice(nullptr), + m_NCCLGenerateUniqueID(nullptr), m_NCCLGetUniqueID(nullptr), m_NCCLInitCommunicator(nullptr) { } @@ -90,6 +91,10 @@ class SharedLibraryModel m_T = (scalar*)getSymbol("t"); m_Timestep = (unsigned long long*)getSymbol("iT"); + + m_NCCLGenerateUniqueID = (VoidFunction)getSymbol("ncclGenerateUniqueID", true); + m_NCCLGetUniqueID = (UCharPtrFunction)getSymbol("ncclGetUniqueID", true); + m_NCCLInitCommunicator = (NCCLInitCommunicatorFunction)getSymbol("ncclInitCommunicator", true); return true; } @@ -366,7 +371,31 @@ class SharedLibraryModel { return m_GetFreeDeviceMemBytes(); } + + void ncclGenerateUniqueID() + { + if(m_NCCLGenerateUniqueID == nullptr) { + throw std::runtime_error("Cannot generate NCCL unique ID - model may not have been built with NCCL support"); + } + m_NCCLGenerateUniqueID(); + } + + unsigned char *ncclGetUniqueID() + { + if(m_NCCLGetUniqueID == nullptr) { + throw std::runtime_error("Cannot get NCCL unique ID - model may not have been built with NCCL support"); + } + return m_NCCLGetUniqueID(); + } + void ncclInitCommunicator(int rank, int numRanks) + { + if(m_NCCLInitCommunicator == nullptr) { + throw std::runtime_error("Cannot initialise NCCL communicator - model may not have been built with 
From 6a2dbc1e0c9bd531c439c451fe31edb88a245cf6 Mon Sep 17 00:00:00 2001
From: neworderofjamie
Date: Wed, 11 Aug 2021 15:23:39 +0100
Subject: [PATCH 13/17] fixed warnings

---
 src/genn/backends/opencl/backend.cc              | 2 +-
 src/genn/backends/single_threaded_cpu/backend.cc | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/genn/backends/opencl/backend.cc b/src/genn/backends/opencl/backend.cc
index 6a9d5c094e..6f97e31b8c 100644
--- a/src/genn/backends/opencl/backend.cc
+++ b/src/genn/backends/opencl/backend.cc
@@ -1774,7 +1774,7 @@ void Backend::genAllocateMemPreamble(CodeStream &os, const ModelSpecMerged &mode
     }
 }
 //--------------------------------------------------------------------------
-void Backend::genFreeMemPreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const
+void Backend::genFreeMemPreamble(CodeStream&, const ModelSpecMerged&) const
 {
 }
 //--------------------------------------------------------------------------
diff --git a/src/genn/backends/single_threaded_cpu/backend.cc b/src/genn/backends/single_threaded_cpu/backend.cc
index b93c8f215c..443499f79d 100644
--- a/src/genn/backends/single_threaded_cpu/backend.cc
+++ b/src/genn/backends/single_threaded_cpu/backend.cc
@@ -1098,7 +1098,7 @@ void Backend::genAllocateMemPreamble(CodeStream&, const ModelSpecMerged&, const
 {
 }
 //--------------------------------------------------------------------------
-void Backend::genFreeMemPreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const
+void Backend::genFreeMemPreamble(CodeStream&, const ModelSpecMerged&) const
 {
 }
 //--------------------------------------------------------------------------
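Both warning fixes above use the same idiom: leave parameters unnamed in an override that deliberately ignores them. A toy illustration under hypothetical types (not GeNN's own classes):

    // An empty override that must match the interface compiles cleanly under
    // MSVC C4100 / -Wunused-parameter when the parameter names are omitted
    struct BackendBase {
        virtual void genFreeMemPreamble(int flags, double scale) const = 0;
        virtual ~BackendBase() = default;
    };

    struct NullBackend : BackendBase {
        void genFreeMemPreamble(int, double) const override {}
    };
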
From dda134fae38aff1419b340de50be6e7ce7df2d43 Mon Sep 17 00:00:00 2001
From: neworderofjamie
Date: Wed, 11 Aug 2021 15:33:33 +0100
Subject: [PATCH 14/17] implemented ncclGetUniqueIDBytes

---
 .../include/sharedLibraryModelNumpy.h    |  2 +-
 userproject/include/sharedLibraryModel.h | 22 +++++++++++++++----
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/pygenn/genn_wrapper/include/sharedLibraryModelNumpy.h b/pygenn/genn_wrapper/include/sharedLibraryModelNumpy.h
index 2c8d0cc814..40dc53608e 100644
--- a/pygenn/genn_wrapper/include/sharedLibraryModelNumpy.h
+++ b/pygenn/genn_wrapper/include/sharedLibraryModelNumpy.h
@@ -45,7 +45,7 @@ class SharedLibraryModelNumpy : public SharedLibraryModel<scalar>
     void ncclAssignExternalUniqueID(unsigned char** varPtr, int* n1)
     {
         *varPtr = ncclGetUniqueID();
-        *n1 = 128;
+        *n1 = ncclGetUniqueIDBytes();
     }
 
 private:
diff --git a/userproject/include/sharedLibraryModel.h b/userproject/include/sharedLibraryModel.h
index 42c24e7cf4..07bc53ce0d 100644
--- a/userproject/include/sharedLibraryModel.h
+++ b/userproject/include/sharedLibraryModel.h
@@ -35,7 +35,8 @@ class SharedLibraryModel
     :   m_Library(nullptr), m_AllocateMem(nullptr), m_AllocateRecordingBuffers(nullptr),
         m_FreeMem(nullptr), m_Initialize(nullptr), m_InitializeSparse(nullptr),
         m_StepTime(nullptr), m_PullRecordingBuffersFromDevice(nullptr),
-        m_NCCLGenerateUniqueID(nullptr), m_NCCLGetUniqueID(nullptr), m_NCCLInitCommunicator(nullptr)
+        m_NCCLGenerateUniqueID(nullptr), m_NCCLGetUniqueID(nullptr),
+        m_NCCLInitCommunicator(nullptr), m_NCCLUniqueIDBytes(nullptr)
     {
     }
@@ -95,7 +96,7 @@ class SharedLibraryModel
         m_NCCLGenerateUniqueID = (VoidFunction)getSymbol("ncclGenerateUniqueID", true);
         m_NCCLGetUniqueID = (UCharPtrFunction)getSymbol("ncclGetUniqueID", true);
         m_NCCLInitCommunicator = (NCCLInitCommunicatorFunction)getSymbol("ncclInitCommunicator", true);
-
+        m_NCCLUniqueIDBytes = (unsigned int*)getSymbol("ncclUniqueIDBytes", true);
         return true;
     }
     else {
@@ -387,6 +388,15 @@ class SharedLibraryModel
         }
         return m_NCCLGetUniqueID();
     }
+
+    unsigned int ncclGetUniqueIDBytes() const
+    {
+        if(m_NCCLUniqueIDBytes == nullptr) {
+            throw std::runtime_error("Cannot get NCCL unique ID bytes - model may not have been built with NCCL support");
+        }
+
+        return *m_NCCLUniqueIDBytes;
+    }
 
     void ncclInitCommunicator(int rank, int numRanks)
     {
@@ -395,6 +405,8 @@ class SharedLibraryModel
         }
         m_NCCLInitCommunicator(rank, numRanks);
     }
+
+
 
     void initialize()
     {
@@ -567,12 +579,14 @@ class SharedLibraryModel
     VoidFunction m_Initialize;
     VoidFunction m_InitializeSparse;
     VoidFunction m_StepTime;
+
+    PullFunction m_PullRecordingBuffersFromDevice;
+
     VoidFunction m_NCCLGenerateUniqueID;
     UCharPtrFunction m_NCCLGetUniqueID;
     NCCLInitCommunicatorFunction m_NCCLInitCommunicator;
+    const unsigned int *m_NCCLUniqueIDBytes;
 
-    PullFunction m_PullRecordingBuffersFromDevice;
-
     std::unordered_map m_PopulationVars;
     std::unordered_map m_PopulationEPGs;
     std::unordered_map m_CustomUpdates;
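With the ID size now exported, host code no longer needs to hard-code the 128-byte length. A sketch of the same ID handshake through the SharedLibraryModel wrapper (MPI assumed again; the float template argument reflects the wrapper's usual scalar parameter and is an assumption here):

    #include <mpi.h>
    #include "sharedLibraryModel.h"

    void initNCCL(SharedLibraryModel<float> &slm, MPI_Comm comm)
    {
        int rank, numRanks;
        MPI_Comm_rank(comm, &rank);
        MPI_Comm_size(comm, &numRanks);

        if(rank == 0) {
            slm.ncclGenerateUniqueID();
        }

        // ncclGetUniqueID() exposes the runner's internal ID buffer, so the
        // broadcast fills each rank's copy in place
        MPI_Bcast(slm.ncclGetUniqueID(), (int)slm.ncclGetUniqueIDBytes(),
                  MPI_BYTE, 0, comm);
        slm.ncclInitCommunicator(rank, numRanks);
    }
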
From 050926fba8e7128b8a29c155f7254e5ef6c2a7ba Mon Sep 17 00:00:00 2001
From: neworderofjamie
Date: Wed, 11 Aug 2021 15:37:21 +0100
Subject: [PATCH 15/17] removed NCCL from test

---
 tests/features/custom_update_reduction/model.cc | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/features/custom_update_reduction/model.cc b/tests/features/custom_update_reduction/model.cc
index c56f808f25..1741e65180 100644
--- a/tests/features/custom_update_reduction/model.cc
+++ b/tests/features/custom_update_reduction/model.cc
@@ -52,9 +52,6 @@ void modelDefinition(ModelSpec &model)
         GENN_PREFERENCES.manualPlatformID = std::atoi(std::getenv("OPENCL_PLATFORM"));
     }
 #endif
-    GENN_PREFERENCES.deviceSelectMethod = DeviceSelect::MANUAL;
-    GENN_PREFERENCES.blockSizeSelectMethod = BlockSizeSelect::MANUAL;
-    GENN_PREFERENCES.enableNCCLReductions = true;
     model.setDT(1.0);
     model.setName("custom_update_reduction");
     model.setBatchSize(5);

From fad06a6589e4544589ec0bdcc2443928f305809e Mon Sep 17 00:00:00 2001
From: neworderofjamie
Date: Wed, 11 Aug 2021 17:16:59 +0100
Subject: [PATCH 16/17] added this->

---
 pygenn/genn_wrapper/include/sharedLibraryModelNumpy.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pygenn/genn_wrapper/include/sharedLibraryModelNumpy.h b/pygenn/genn_wrapper/include/sharedLibraryModelNumpy.h
index 40dc53608e..65e694e8a9 100644
--- a/pygenn/genn_wrapper/include/sharedLibraryModelNumpy.h
+++ b/pygenn/genn_wrapper/include/sharedLibraryModelNumpy.h
@@ -45,7 +45,7 @@ class SharedLibraryModelNumpy : public SharedLibraryModel<scalar>
     void ncclAssignExternalUniqueID(unsigned char** varPtr, int* n1)
     {
         *varPtr = ncclGetUniqueID();
-        *n1 = ncclGetUniqueIDBytes();
+        *n1 = (int)this->ncclGetUniqueIDBytes();
     }
 
 private:
@@ -54,4 +54,5 @@ class SharedLibraryModelNumpy : public SharedLibraryModel<scalar>
     using SharedLibraryModel<scalar>::getArray;
     using SharedLibraryModel<scalar>::getScalar;
     using SharedLibraryModel<scalar>::ncclGetUniqueID;
+    using SharedLibraryModel<scalar>::ncclGetUniqueIDBytes;
 };

From e0d5e433913aea4766353efc15bd724cf15bd6a0 Mon Sep 17 00:00:00 2001
From: neworderofjamie
Date: Wed, 1 Sep 2021 09:18:31 +0100
Subject: [PATCH 17/17] fixed warnings

---
 src/genn/backends/cuda/backend.cc | 2 +-
 tests/unit/currentSource.cc       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/genn/backends/cuda/backend.cc b/src/genn/backends/cuda/backend.cc
index 67ae319625..48f2508bec 100644
--- a/src/genn/backends/cuda/backend.cc
+++ b/src/genn/backends/cuda/backend.cc
@@ -1385,7 +1385,7 @@ void Backend::genAllocateMemPreamble(CodeStream &os, const ModelSpecMerged &mode
     os << std::endl;
 }
 //--------------------------------------------------------------------------
-void Backend::genFreeMemPreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const
+void Backend::genFreeMemPreamble(CodeStream &os, const ModelSpecMerged&) const
 {
     // Free NCCL communicator
     if(getPreferences().enableNCCLReductions) {
diff --git a/tests/unit/currentSource.cc b/tests/unit/currentSource.cc
index f18b2f5647..6074ebabd4 100644
--- a/tests/unit/currentSource.cc
+++ b/tests/unit/currentSource.cc
@@ -94,7 +94,7 @@ TEST(CurrentSource, InvalidName)
     NeuronModels::Izhikevich::VarValues varVals(0.0, 0.0);
 
     ModelSpec model;
-    auto *pop = model.addNeuronPopulation<NeuronModels::Izhikevich>("Pop", 10, paramVals, varVals);
+    model.addNeuronPopulation<NeuronModels::Izhikevich>("Pop", 10, paramVals, varVals);
 
     try {
        model.addCurrentSource<CurrentSourceModels::DC>("CS-2", "Pop", {1.0}, {});
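Taken together, a runner built with enableNCCLReductions set ends the series exporting the following NCCL surface (reconstructed from the code-generation strings in patch 11; the real generated file may differ in formatting):

    extern "C" {
    EXPORT_VAR const unsigned int ncclUniqueIDBytes;
    EXPORT_FUNC void ncclGenerateUniqueID();
    EXPORT_FUNC void ncclInitCommunicator(int rank, int numRanks);
    EXPORT_FUNC unsigned char *ncclGetUniqueID();
    }

All of it is resolved lazily by SharedLibraryModel (the getSymbol calls pass true to tolerate missing symbols), so models built without NCCL support load normally and only fail if one of the nccl* wrappers is actually called.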