Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

NCCL multi-GPU reductions #449

Merged
merged 18 commits into from
Sep 6, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions generate_swig_interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -298,12 +298,6 @@ def generateNumpyApplyInArray1D( dataType, varName, sizeName ):
'''Generates a line which applies numpy IN_ARRAY1 typemap to variable. IN_ARRAY1 is used to pass a numpy array as C array to C code'''
return Template( '%apply ( ${data_t} IN_ARRAY1, int DIM1 ) {( ${data_t} ${varName}, int ${sizeName} )};\n').substitute( data_t=dataType, varName=varName, sizeName=sizeName )

def generateBuiltInGetter( models ):
    '''Generates C++ source for a getBuiltInModels() function which returns the names of all built-in models'''
    # Join the model names so they form the elements of a C++ string-vector initializer list
    quotedModels = '", "'.join( models )
    getterTemplate = Template('''std::vector< std::string > getBuiltInModels() {
return std::vector<std::string>{"${MODELS}"};
}
''')
    return getterTemplate.substitute( MODELS=quotedModels )


def generateSharedLibraryModelInterface( swigPath ):
'''Generates SharedLibraryModelNumpy.i file'''
Expand Down
8 changes: 8 additions & 0 deletions include/genn/backends/cuda/backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ struct Preferences : public PreferencesBase
//! it was optimized for. However if, for example, you are running on a cluster with NVML this is not desired behaviour.
bool selectGPUByDeviceID = false;

//! Generate corresponding NCCL batch reductions
bool enableNCCLReductions = false;

//! How to select GPU device
DeviceSelect deviceSelectMethod = DeviceSelect::OPTIMAL;

Expand Down Expand Up @@ -110,6 +113,7 @@ struct Preferences : public PreferencesBase
Utils::updateHash(selectGPUByDeviceID, hash);
Utils::updateHash(deviceSelectMethod, hash);
Utils::updateHash(constantCacheOverhead, hash);
Utils::updateHash(enableNCCLReductions, hash);
}
};

Expand Down Expand Up @@ -188,6 +192,7 @@ class BACKEND_EXPORT Backend : public BackendSIMT
virtual void genDefinitionsInternalPreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const override;
virtual void genRunnerPreamble(CodeStream &os, const ModelSpecMerged &modelMerged, const MemAlloc &memAlloc) const override;
virtual void genAllocateMemPreamble(CodeStream &os, const ModelSpecMerged &modelMerged, const MemAlloc &memAlloc) const override;
virtual void genFreeMemPreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const override;
virtual void genStepTimeFinalisePreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const override;

virtual void genVariableDefinition(CodeStream &definitions, CodeStream &definitionsInternal, const std::string &type, const std::string &name, VarLocation loc) const override;
Expand Down Expand Up @@ -269,6 +274,9 @@ class BACKEND_EXPORT Backend : public BackendSIMT
//! Different backends seed RNGs in different ways. Does this one initialise population RNGS on device?
virtual bool isPopulationRNGInitialisedOnDevice() const override { return true; }

//! Backends which support batch-parallelism might require an additional host reduction phase after reduction kernels
virtual bool isHostReductionRequired() const override { return getPreferences<Preferences>().enableNCCLReductions; }

//! How many bytes of memory does 'device' have
virtual size_t getDeviceMemoryBytes() const override{ return m_ChosenDevice.totalGlobalMem; }

Expand Down
4 changes: 4 additions & 0 deletions include/genn/backends/opencl/backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ class BACKEND_EXPORT Backend : public BackendSIMT
virtual void genDefinitionsInternalPreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const override;
virtual void genRunnerPreamble(CodeStream &os, const ModelSpecMerged &modelMerged, const MemAlloc &memAlloc) const override;
virtual void genAllocateMemPreamble(CodeStream &os, const ModelSpecMerged &modelMerged, const MemAlloc &allocations) const override;
virtual void genFreeMemPreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const override;
virtual void genStepTimeFinalisePreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const override;

virtual void genVariableDefinition(CodeStream &definitions, CodeStream &definitionsInternal, const std::string &type, const std::string &name, VarLocation loc) const override;
Expand Down Expand Up @@ -247,6 +248,9 @@ class BACKEND_EXPORT Backend : public BackendSIMT
//! Different backends seed RNGs in different ways. Does this one initialise population RNGS on device?
virtual bool isPopulationRNGInitialisedOnDevice() const override { return false; }

//! Backends which support batch-parallelism might require an additional host reduction phase after reduction kernels
virtual bool isHostReductionRequired() const override { return false; }

//! How many bytes of memory does 'device' have
virtual size_t getDeviceMemoryBytes() const override { return m_ChosenDevice.getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>(); }

Expand Down
4 changes: 4 additions & 0 deletions include/genn/backends/single_threaded_cpu/backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ class BACKEND_EXPORT Backend : public BackendBase
virtual void genDefinitionsInternalPreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const override;
virtual void genRunnerPreamble(CodeStream &os, const ModelSpecMerged &modelMerged, const MemAlloc &memAlloc) const override;
virtual void genAllocateMemPreamble(CodeStream &os, const ModelSpecMerged &modelMerged, const MemAlloc &memAlloc) const override;
virtual void genFreeMemPreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const override;
virtual void genStepTimeFinalisePreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const override;

virtual void genVariableDefinition(CodeStream &definitions, CodeStream &definitionsInternal, const std::string &type, const std::string &name, VarLocation loc) const override;
Expand Down Expand Up @@ -151,6 +152,9 @@ class BACKEND_EXPORT Backend : public BackendBase
virtual bool isSynRemapRequired(const SynapseGroupInternal&) const override{ return false; }
virtual bool isPostsynapticRemapRequired() const override{ return true; }

//! Backends which support batch-parallelism might require an additional host reduction phase after reduction kernels
virtual bool isHostReductionRequired() const override { return false; }

//! How many bytes of memory does 'device' have
virtual size_t getDeviceMemoryBytes() const override{ return 0; }

Expand Down
7 changes: 7 additions & 0 deletions include/genn/genn/code_generator/backendBase.h
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,10 @@ class GENN_EXPORT BackendBase
//! Therefore it's a good place for any global initialisation. This function generates a 'preamble' to this function.
virtual void genAllocateMemPreamble(CodeStream &os, const ModelSpecMerged &modelMerged, const MemAlloc &memAlloc) const = 0;

//! The free-memory function is called by user code to free all memory allocated by GeNN and should only ever be called once.
//! This function generates a 'preamble' to this function, for example to free backend-specific objects
virtual void genFreeMemPreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const = 0;

//! After all timestep logic is complete
virtual void genStepTimeFinalisePreamble(CodeStream &os, const ModelSpecMerged &modelMerged) const = 0;

Expand Down Expand Up @@ -437,6 +441,9 @@ class GENN_EXPORT BackendBase
//! Different backends may implement synaptic plasticity differently. Does this one require a postsynaptic remapping data structure?
virtual bool isPostsynapticRemapRequired() const = 0;

//! Backends which support batch-parallelism might require an additional host reduction phase after reduction kernels
virtual bool isHostReductionRequired() const = 0;

//! How many bytes of memory does 'device' have
virtual size_t getDeviceMemoryBytes() const = 0;

Expand Down
80 changes: 80 additions & 0 deletions include/genn/genn/code_generator/groupMerged.h
Original file line number Diff line number Diff line change
Expand Up @@ -1788,4 +1788,84 @@ class GENN_EXPORT CustomWUUpdateSparseInitGroupMerged : public CustomUpdateInitG
//----------------------------------------------------------------------------
static const std::string name;
};

// ----------------------------------------------------------------------------
// CustomUpdateHostReductionGroupMergedBase
//----------------------------------------------------------------------------
//! Base class for merged groups used to generate host-side reductions over custom-update
//! variables and variable references marked with the REDUCE access-mode attribute.
template<typename G>
class CustomUpdateHostReductionGroupMergedBase : public GroupMerged<G>
{
protected:
    CustomUpdateHostReductionGroupMergedBase(size_t index, const std::string &precision, const BackendBase &backend,
                                             const std::vector<std::reference_wrapper<const G>> &groups)
    :   GroupMerged<G>(index, precision, groups)
    {
        const auto *model = this->getArchetype().getCustomUpdateModel();

        // Add a pointer field to the merged struct for every state variable
        // which is a reduction target, prefixed with the backend's device variable prefix
        for(const auto &var : model->getVars()) {
            if(var.access & VarAccessModeAttribute::REDUCE) {
                this->addPointerField(var.type, var.name, backend.getDeviceVarPrefix() + var.name);
            }
        }

        // Likewise, add pointer fields for variable references which are reduction targets
        for(const auto &varRef : model->getVarRefs()) {
            if(varRef.access & VarAccessModeAttribute::REDUCE) {
                this->addPointerField(varRef.type, varRef.name, backend.getDeviceVarPrefix() + varRef.name);
            }
        }
    }
};

// ----------------------------------------------------------------------------
// CustomUpdateHostReductionGroupMerged
//----------------------------------------------------------------------------
//! Merged group of custom updates (non-weight) requiring an additional host-side
//! reduction phase after the device reduction kernels (e.g. NCCL batch reductions).
class GENN_EXPORT CustomUpdateHostReductionGroupMerged : public CustomUpdateHostReductionGroupMergedBase<CustomUpdateInternal>
{
public:
    // NOTE(review): the unnamed std::string parameter presumably keeps this signature
    // compatible with a generic merged-group factory (time precision) — confirm against the creation code.
    CustomUpdateHostReductionGroupMerged(size_t index, const std::string &precision, const std::string &, const BackendBase &backend,
                                         const std::vector<std::reference_wrapper<const CustomUpdateInternal>> &groups);

    //------------------------------------------------------------------------
    // Public API
    //------------------------------------------------------------------------
    //! Generate definitions, declarations and allocation code for this merged group's struct
    void generateRunner(const BackendBase &backend, CodeStream &definitionsInternal,
                        CodeStream &definitionsInternalFunc, CodeStream &definitionsInternalVar,
                        CodeStream &runnerVarDecl, CodeStream &runnerMergedStructAlloc) const
    {
        // NOTE(review): trailing 'true' appears to mark this as a host merged group —
        // confirm against generateRunnerBase's signature
        generateRunnerBase(backend, definitionsInternal, definitionsInternalFunc, definitionsInternalVar,
                           runnerVarDecl, runnerMergedStructAlloc, name, true);
    }

    //----------------------------------------------------------------------------
    // Static constants
    //----------------------------------------------------------------------------
    static const std::string name;
};

// ----------------------------------------------------------------------------
// CustomWUUpdateHostReductionGroupMerged
//----------------------------------------------------------------------------
//! Merged group of custom weight updates requiring an additional host-side
//! reduction phase after the device reduction kernels (e.g. NCCL batch reductions).
class GENN_EXPORT CustomWUUpdateHostReductionGroupMerged : public CustomUpdateHostReductionGroupMergedBase<CustomUpdateWUInternal>
{
public:
    // NOTE(review): the unnamed std::string parameter presumably keeps this signature
    // compatible with a generic merged-group factory (time precision) — confirm against the creation code.
    CustomWUUpdateHostReductionGroupMerged(size_t index, const std::string &precision, const std::string &, const BackendBase &backend,
                                           const std::vector<std::reference_wrapper<const CustomUpdateWUInternal>> &groups);

    //------------------------------------------------------------------------
    // Public API
    //------------------------------------------------------------------------
    //! Generate definitions, declarations and allocation code for this merged group's struct
    void generateRunner(const BackendBase &backend, CodeStream &definitionsInternal,
                        CodeStream &definitionsInternalFunc, CodeStream &definitionsInternalVar,
                        CodeStream &runnerVarDecl, CodeStream &runnerMergedStructAlloc) const
    {
        // NOTE(review): trailing 'true' appears to mark this as a host merged group —
        // confirm against generateRunnerBase's signature
        generateRunnerBase(backend, definitionsInternal, definitionsInternalFunc, definitionsInternalVar,
                           runnerVarDecl, runnerMergedStructAlloc, name, true);
    }

    //----------------------------------------------------------------------------
    // Static constants
    //----------------------------------------------------------------------------
    static const std::string name;
};
} // namespace CodeGenerator
16 changes: 15 additions & 1 deletion include/genn/genn/code_generator/modelSpecMerged.h
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,12 @@ class GENN_EXPORT ModelSpecMerged
//! Get merged custom weight update groups where transpose needs to be calculated
const std::vector<CustomUpdateTransposeWUGroupMerged> &getMergedCustomUpdateTransposeWUGroups() const { return m_MergedCustomUpdateTransposeWUGroups; }

//! Get merged custom update groups where host reduction needs to be performed
const std::vector<CustomUpdateHostReductionGroupMerged> &getMergedCustomUpdateHostReductionGroups() const { return m_MergedCustomUpdateHostReductionGroups; }

//! Get merged custom weight update groups where host reduction needs to be performed
const std::vector<CustomWUUpdateHostReductionGroupMerged> &getMergedCustomWUUpdateHostReductionGroups() const { return m_MergedCustomWUUpdateHostReductionGroups; }

void genMergedNeuronUpdateGroupStructs(CodeStream &os, const BackendBase &backend) const { genMergedStructures(os, backend, m_MergedNeuronUpdateGroups); }
void genMergedPresynapticUpdateGroupStructs(CodeStream &os, const BackendBase &backend) const { genMergedStructures(os, backend, m_MergedPresynapticUpdateGroups); }
void genMergedPostsynapticUpdateGroupStructs(CodeStream &os, const BackendBase &backend) const { genMergedStructures(os, backend, m_MergedPostsynapticUpdateGroups); }
Expand All @@ -167,7 +173,9 @@ class GENN_EXPORT ModelSpecMerged
void genMergedSynapseConnectivityHostInitStructs(CodeStream &os, const BackendBase &backend) const { genMergedStructures(os, backend, m_MergedSynapseConnectivityHostInitGroups); }
void genMergedCustomUpdateStructs(CodeStream &os, const BackendBase &backend) const { genMergedStructures(os, backend, m_MergedCustomUpdateGroups); }
void genMergedCustomUpdateWUStructs(CodeStream &os, const BackendBase &backend) const { genMergedStructures(os, backend, m_MergedCustomUpdateWUGroups); }
void gemMergedCustomUpdateTransposeWUStructs(CodeStream &os, const BackendBase &backend) const { genMergedStructures(os, backend, m_MergedCustomUpdateTransposeWUGroups); }
void genMergedCustomUpdateTransposeWUStructs(CodeStream &os, const BackendBase &backend) const { genMergedStructures(os, backend, m_MergedCustomUpdateTransposeWUGroups); }
void genMergedCustomUpdateHostReductionStructs(CodeStream &os, const BackendBase &backend) const { genMergedStructures(os, backend, m_MergedCustomUpdateHostReductionGroups); }
void genMergedCustomWUUpdateHostReductionStructs(CodeStream &os, const BackendBase &backend) const { genMergedStructures(os, backend, m_MergedCustomWUUpdateHostReductionGroups); }

void genNeuronUpdateGroupSupportCode(CodeStream &os, bool supportsNamespace = true) const{ m_NeuronUpdateSupportCode.gen(os, getModel().getPrecision(), supportsNamespace); }
void genPostsynapticDynamicsSupportCode(CodeStream &os, bool supportsNamespace = true) const{ m_PostsynapticDynamicsSupportCode.gen(os, getModel().getPrecision(), supportsNamespace); }
Expand Down Expand Up @@ -409,6 +417,12 @@ class GENN_EXPORT ModelSpecMerged
//! Merged custom weight update groups where transpose needs to be calculated
std::vector<CustomUpdateTransposeWUGroupMerged> m_MergedCustomUpdateTransposeWUGroups;

//! Merged custom update groups where host reduction needs to be performed
std::vector<CustomUpdateHostReductionGroupMerged> m_MergedCustomUpdateHostReductionGroups;

//! Merged custom weight update groups where host reduction needs to be performed
std::vector<CustomWUUpdateHostReductionGroupMerged> m_MergedCustomWUUpdateHostReductionGroups;

//! Unique support code strings for neuron update
SupportCodeMerged m_NeuronUpdateSupportCode;

Expand Down
8 changes: 8 additions & 0 deletions pygenn/genn_wrapper/include/sharedLibraryModelNumpy.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,18 @@ class SharedLibraryModelNumpy : public SharedLibraryModel<scalar>
*varPtr = this->template getScalar<T>(varName);
*n1 = 1;
}

    //! Expose the model's NCCL unique ID to Python as a raw byte view.
    //! Writes a pointer to the ID's bytes to varPtr and its size in bytes to n1
    //! (out-parameters matching a numpy ARGOUTVIEW-style SWIG typemap).
    // NOTE(review): despite the 'Assign' in the name, this READS the ID via
    // ncclGetUniqueID() — confirm the intended direction against SharedLibraryModel's NCCL API.
    void ncclAssignExternalUniqueID(unsigned char** varPtr, int* n1)
    {
        *varPtr = ncclGetUniqueID();
        // Cast to int as the SWIG typemap passes array sizes as int
        *n1 = (int)this->ncclGetUniqueIDBytes();
    }

private:
// Hide C++ based public API
using SharedLibraryModel<scalar>::getSymbol;
using SharedLibraryModel<scalar>::getArray;
using SharedLibraryModel<scalar>::getScalar;
using SharedLibraryModel<scalar>::ncclGetUniqueID;
using SharedLibraryModel<scalar>::ncclGetUniqueIDBytes;
};
Loading