Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mothership-facing changes to support hardware idle. #282

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
f5a5872
Add new configuration section 'mode', and field 'single_app_mode', to…
mvousden Aug 27, 2021
28c8c09
Deployment logic reads from the aforementioned config field, and lets…
mvousden Aug 27, 2021
026550e
Whitespace fixes.
mvousden Aug 27, 2021
10850e9
Define an `APP,EMPT` message, which is sent by root in single-app mod…
mvousden Aug 27, 2021
8ef3182
Add a preprocessor warning to make sure GMB defines these paths in fu…
mvousden Aug 27, 2021
02c286d
Put the Tinsel submodule back (whoops).
mvousden Aug 27, 2021
e5cac41
Merge branch 'development' into FEATURE-0242-HardwareIdle-Mothership
mvousden Oct 22, 2021
4011759
Merge branch 'FEATURE-0242-HardwareIdle' into FEATURE-0242-HardwareId…
heliosfa Oct 25, 2021
e9c983a
add logic to get dummy binaries
heliosfa Oct 25, 2021
7c7dd68
Merge branch 'FEATURE-0242-HardwareIdle' into FEATURE-0242-HardwareId…
heliosfa Oct 25, 2021
c4450fe
Add missing entry for APP,EMPT messages in the Mothership's MPI messa…
mvousden Oct 27, 2021
7fb7f9e
Fix bad decode message format.
mvousden Oct 27, 2021
2e85f79
Merge branch 'FEATURE-0242-HardwareIdle-Mothership' of github.com:POE…
mvousden Oct 27, 2021
acd5c84
Merge branch 'FEATURE-0242-HardwareIdle' into FEATURE-0242-HardwareId…
heliosfa Oct 27, 2021
09e91f3
Fix some misleading reporting in debug mode.
mvousden Oct 27, 2021
f8a6aca
Merge branch 'FEATURE-0242-HardwareIdle' into FEATURE-0242-HardwareId…
heliosfa Oct 27, 2021
96928e4
Merge branch 'FEATURE-0242-HardwareIdle-Mothership' of https://github…
heliosfa Oct 27, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions Config/Orchestrator.ocfg
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@

[Orchestrator_header]
// All pretty arbitrary; just copied in and stored
name = OrchestratorConfiguration
author = "MLV and GMB"
date = "2021-01-17"
version = "0.0.6"
name = "OrchestratorConfiguration"
author = "MLV"
date = "2021-08-27"
version = "0.1.0"

// All these may be overridden by the console "path" command
[default_paths]
Expand Down Expand Up @@ -58,6 +58,11 @@ hardware = "../Config/POETSHardwareOneBox.ocfg"
// Default flags for the cross-compiler
// build = "\oink -plop ++wheeee !. <?> "

// Alter the behaviour of the Orchestrator
[modes]
single_app_mode = "true" // Either 'true' or 'false'. Must be true to support
// hardware idle, for now.

// Elaboration messages passed out to the author for errors in processing THIS
// file. They are all classed as "Unrecoverable", not because they are, but
// because: If you're not a grown-up you shouldn't be mucking about with it
Expand Down
10 changes: 6 additions & 4 deletions Config/OrchestratorMessages.ocfg
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@
// Loading, typelinking

100(U) : "MLV: Allocated but as yet unused"
101(S) : "Decoder in %s (PID %s) has dropped a packet from %s(rank %s) to %s(rank %s) with key 0x%s"
101(S) : "Decoder in %s (rank %s) has dropped a packet from %s (rank %s) with key 0x%s."
102(I) : "Task graph default file path is ||%s||"
103(I) : "New path is ||%s||"
104(W) : "Filename %s does not parse correctly"
Expand Down Expand Up @@ -450,7 +450,7 @@
534(I) : "Mothership (rank %s): All devices on this Mothership for application '%s' have stopped."
535(I) : "Mothership (rank %s): Recalling application '%s'."
536(I) : "Mothership (rank %s): Application '%s' recalled. This Mothership has forgotten everything about this application."
537(U) : "MLV: Allocated but as yet unused"
537(E) : "Mothership: Error decoding MPI message with key '0x%s': Expected bool in field %s. Ignoring message."
538(U) : "MLV: Allocated but as yet unused"
539(U) : "MLV: Allocated but as yet unused"
540(U) : "GMB: Allocated but as yet unused"
Expand Down Expand Up @@ -489,8 +489,10 @@
573(U) : "MLV: Allocated but as yet unused"
574(U) : "MLV: Allocated but as yet unused"
575(U) : "MLV: Allocated but as yet unused"
578(U) : "MLV: Allocated but as yet unused"
579(U) : "MLV: Allocated but as yet unused"
576(U) : "MLV: Allocated but as yet unused"
577(I) : "Mothership: Calling backend->loadAll."
578(I) : "Mothership: Calling backend->go."
579(I) : "Mothership: Calling backend->startAll."
580(E) : "Mothership: Received a log packet with an invalid device index 0x%s."
581(I) : "Mothership: Consuming log packet from device address 0x%s with name %s."
582(I) : "Mothership: Received a message containing packets for a supervisor device for application '%s' that is not running (it may be in the process of stopping). Ignoring these packets."
Expand Down
7 changes: 2 additions & 5 deletions Source/Common/Decode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,9 @@ if (CommonBase::FnMap.find(pPkt->Key())!=CommonBase::FnMap.end()) {
// Nope. Kick.
// Pull out the unknown key and post what
// little we know to the LogServer
Post(101,Sderived,int2str(pPkt->Src()),pPmap->vPmap[pPkt->Src()].P_class,int2str(pPkt->Tgt()),
pPmap->vPmap[pPkt->Tgt()].P_class,hex2str(pPkt->Key()));
Post(101,Sderived,int2str(pPkt->Tgt()),pPmap->vPmap[pPkt->Src()].P_class,
int2str(pPkt->Src()),hex2str(pPkt->Key()));
return 0; // Return "keep going" value
}

//==============================================================================



3 changes: 2 additions & 1 deletion Source/Common/Pglobals.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,9 @@ const byte Q::SPEC = 0x50;
const byte Q::SUPD = 0x51;
const byte Q::INIT = 0x52;
const byte Q::CNC = 0x53;
const byte Q::ACKt = 0x54;
const byte Q::ACKt = 0x54;
const byte Q::SUPR = 0x55;
const byte Q::EMPT = 0x56;
// temporary use: for MPI testing ------------------------------------------
const byte Q::M0 = 0x60;
const byte Q::M1 = 0x61;
Expand Down
5 changes: 5 additions & 0 deletions Source/Common/Pglobals.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,12 @@ Mothership
----------
EXIT |- |- |- | (None)
SYST |KILL |- |- | (None)
APP |EMPT |- |- | (0:string)Code path to broadcast
| (1:string)Data path to broadcast
APP |SPEC |- |- | (0:string)Application name
(1:uint32_t)Number of expected distribution messages
(2:uint8_t)Application number
(3:bool)Hardware-idle application?
APP |DIST |- |- | (0:string)Application name
(1:string)Code path for this core
(2:string)Data path for this core
Expand Down Expand Up @@ -152,6 +156,7 @@ static const byte INIT;
static const byte CNC;
static const byte ACK;
static const byte SUPR;
static const byte EMPT;
// temporary use: for MPI testing ------------------------------------------
static const byte M0;
static const byte M1;
Expand Down
14 changes: 8 additions & 6 deletions Source/Mothership/AppDB.cpp
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
#include "AppDB.h"

/* Checks appInfos for an application of a given name. If it doesn't exist,
* AppDB creates it and returns a pointer to it (passing distCountExpected to
* it as an argument). If it already exists, returns a pointer to the existing
* application, and does not use distCountExpected. */
AppInfo* AppDB::check_create_app(std::string name, uint32_t distCountExpected)
* AppDB creates it and returns a pointer to it (passing distCountExpected and
* soloApp to it as arguments). If it already exists, returns a pointer to
* the existing application, does not use distCountExpected, but sets
* soloApp. */
AppInfo* AppDB::check_create_app(std::string name, uint32_t distCountExpected,
bool soloApp)
{
AppInfoIt appFinder = appInfos.find(name);

Expand All @@ -21,7 +23,7 @@ AppInfo* AppDB::check_create_app(std::string name, uint32_t distCountExpected)
else
{
appInfos.insert(std::pair<std::string, AppInfo>
(name, AppInfo(name, distCountExpected)));
(name, AppInfo(name, distCountExpected, soloApp)));
}

return &(appInfos.find(name)->second);
Expand All @@ -37,7 +39,7 @@ AppInfo* AppDB::check_create_app(std::string name, uint32_t distCountExpected)
/* Sub-synonym. */
AppInfo* AppDB::check_create_app(std::string name)
{
return check_create_app(name, 0);
return check_create_app(name, 0, false); /* soloApp argument not used. */
}

/* Checks appInfos for an application of a given name, returning true if such
Expand Down
2 changes: 1 addition & 1 deletion Source/Mothership/AppDB.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ class AppDB
std::map<uint32_t, uint32_t> threadToCoreAddr;
std::map<uint8_t, std::string> numberToApp;

AppInfo* check_create_app(std::string, uint32_t);
AppInfo* check_create_app(std::string, uint32_t, bool);
AppInfo* check_create_app(std::string);
bool check_defined_app(std::string);
void recall_app(AppInfo*);
Expand Down
5 changes: 3 additions & 2 deletions Source/Mothership/AppInfo.cpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
#include "AppInfo.h"

/* This constructor is used by SPEC messages. */
AppInfo::AppInfo(std::string name, uint32_t distCountExpected):
AppInfo::AppInfo(std::string name, uint32_t distCountExpected, bool soloApp):
name(name),
distCountExpected(distCountExpected)
distCountExpected(distCountExpected),
soloApp(soloApp)
{
pendingCommands = 0;
distCountCurrent = 0;
Expand Down
3 changes: 2 additions & 1 deletion Source/Mothership/AppInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,12 @@ enum AppState{UNDERDEFINED, /* We're still receiving DIST messages. */
class AppInfo
{
public:
AppInfo(std::string nameArg, uint32_t distCountExpected);
AppInfo(std::string nameArg, uint32_t distCountExpected, bool soloApp);
AppInfo(std::string nameArg);

std::string name;
uint32_t distCountExpected;
bool soloApp;
AppState state;
std::map<uint32_t, CoreInfo> coreInfos;
std::set<uint32_t> coresLoaded;
Expand Down
84 changes: 51 additions & 33 deletions Source/Mothership/AppTransitions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,44 +58,62 @@ void Mothership::initialise_application(AppInfo* app)
meshX, meshY, coreId);
}

/* 2: For each core, kick off the threads (mode=false).
* 3: For each core, start execution (mode=true). */
mode = false;
do
/* 2: For each core, kick off the threads.
* 3: For each core, start execution. */

/* In solo-app mode, use backend's available global start and go
* methods. */
if (app->soloApp)
{
for (coreIt = app->coreInfos.begin(); coreIt != app->coreInfos.end();
coreIt++)
{
backend->fromAddr(coreIt->first, &meshX, &meshY, &coreId,
&threadId);
debug_post(579, 0);
backend->startAll(); /* 2 */
debug_post(578, 0);
backend->go(); /* 3 */
}

if (!mode) /* 2 */
{
debug_post(
587, 4, hex2str(meshX).c_str(), hex2str(meshY).c_str(),
hex2str(coreId).c_str(),
uint2str(coreIt->second.threadsExpected.size()).c_str());
/* Note that startOne can hang for the Tinsel backend if the
* number of threads expected is greater than the number of
* threads that the core will start - this is because startOne
* waits for an acknowledgement message from the core that
* varies as a function of the number of threads. If you find
* the above 587 being the last message you see from the MPI
* CNC Resolver thread (for example), this is most likely your
* issue. */
backend->startOne(meshX, meshY, coreId,
coreIt->second.threadsExpected.size());
}
else /* 3 */
/* Otherwise, things get more complicated... */
else
{
mode = false; /* 2 when false, 3 when true. */
do /* Simple loop to reduce code duplication (see comment accompanying
* this function definition). */
{
for (coreIt = app->coreInfos.begin();
coreIt != app->coreInfos.end(); coreIt++)
{
debug_post(586, 3, hex2str(meshX).c_str(),
hex2str(meshY).c_str(), hex2str(coreId).c_str());
backend->goOne(meshX, meshY, coreId);
backend->fromAddr(coreIt->first, &meshX, &meshY, &coreId,
&threadId);

if (!mode) /* 2 */
{
debug_post(
587, 4, hex2str(meshX).c_str(), hex2str(meshY).c_str(),
hex2str(coreId).c_str(),
uint2str(coreIt->second.threadsExpected.size())
.c_str());
/* Note that startOne can hang for the Tinsel backend if
* the number of threads expected is greater than the
* number of threads that the core will start - this is
* because startOne waits for an acknowledgement message
* from the core that varies as a function of the number of
* threads. If you find the above 587 being the last
* message you see from the MPI CNC Resolver thread (for
* example), this is most likely your issue. */
backend->startOne(meshX, meshY, coreId,
coreIt->second.threadsExpected.size());
}
else /* 3 */
{
debug_post(586, 3, hex2str(meshX).c_str(),
hex2str(meshY).c_str(),
hex2str(coreId).c_str());
backend->goOne(meshX, meshY, coreId);
}
}
}

mode = !mode;
} while (mode);
mode = !mode;
} while (mode);
}

/* Good stuff. Now the cores will spin up and send BARRIER messages to the
* Mothership. */
Expand Down
36 changes: 30 additions & 6 deletions Source/Mothership/MPIHandlers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@ unsigned Mothership::handle_msg_cnc(PMsg_p* message)
{
#if ORCHESTRATOR_DEBUG
std::string key = "Unknown";
if (message->Key() == PMsg_p::KEY(Q::APP,Q::SPEC))
key = "Q::APP,Q::SPEC";
if (message->Key() == PMsg_p::KEY(Q::APP,Q::EMPT))
key = "Q::APP,Q::EMPT";
else if (message->Key() == PMsg_p::KEY(Q::APP,Q::SPEC))
key = "Q::APP,Q::SPEC";
else if (message->Key() == PMsg_p::KEY(Q::APP,Q::DIST))
Expand All @@ -79,6 +79,28 @@ unsigned Mothership::handle_msg_cnc(PMsg_p* message)
return 0;
}

/* Handler for (Q::APP,Q::EMPT) messages. Extracts the code path and data path
 * carried by the message and hands both to the backend's loadAll method. If
 * the message cannot be decoded, nothing is loaded. Returns zero in either
 * case. */
unsigned Mothership::handle_msg_app_empt(PMsg_p* message)
{
    /* Populated by the decoder on success. */
    std::string codePath;
    std::string dataPath;

    if (decode_app_empt_message(message, &codePath, &dataPath))
    {
        /* Report what was decoded... */
        debug_post(597, 3, "Q::APP,Q::EMPT", hex2str(message->Key()).c_str(),
                   dformat("codePath=%s, dataPath=%s",
                           codePath.c_str(), dataPath.c_str()).c_str());

        /* ...then announce and perform the load. */
        debug_post(577, 0);
        backend->loadAll(codePath.c_str(), dataPath.c_str());
    }
    else
    {
        /* Nothing usable in the message; report it and do no loading. */
        debug_post(597, 3, "Q::APP,Q::EMPT", hex2str(message->Key()).c_str(),
                   "Failed to decode.");
    }

    return 0;
}

unsigned Mothership::handle_msg_app_spec(PMsg_p* message)
{
AppInfo* appInfo;
Expand All @@ -87,21 +109,23 @@ unsigned Mothership::handle_msg_app_spec(PMsg_p* message)
std::string appName;
uint32_t distCount;
uint8_t appNumber;
bool soloApp;
if (!decode_app_spec_message(message, &appName, &distCount,
&appNumber))
&appNumber, &soloApp))
{
debug_post(597, 3, "Q::APP,Q::SPEC", hex2str(message->Key()).c_str(),
"Failed to decode.");
return 0;
}

debug_post(597, 3, "Q::APP,Q::SPEC", hex2str(message->Key()).c_str(),
dformat("appName=%s, distCount=%u, appNumber=%u",
appName.c_str(), distCount, appNumber).c_str());
dformat("appName=%s, distCount=%u, appNumber=%u, soloApp=%s",
appName.c_str(), distCount, appNumber,
soloApp ? "true" : "false").c_str());

/* Ensure application existence idempotently (it might have been created by
* an AppDist message). */
appInfo = appdb.check_create_app(appName, distCount);
appInfo = appdb.check_create_app(appName, distCount, soloApp);

/* If the application is not in the UNDERDEFINED state, post bossily and do
* nothing else. */
Expand Down
Loading