Bug MPI traces 1
Latest experiment:
- Nicolas's test set
  - reduced to a single constructed variable, in the scenario:
    - AnalysisSpec.PredictorsSpec.ConstructionSpec.MaxConstructedAttributeNumber 1
  - launched with the environment variable KhiopsParallelTrace=3
  - launched with 3 cores
- crashes on Nicolas's machine in the task "Run task Database slicer"
  - trace on stdout:
  ...
  [0]	2023-12-12 10:40:13.757	In MasterFinalize
  [0]	2023-12-12 10:40:13.758	Out MasterFinalize
  error : MPI driver : Other MPI error, error stack:
  internal_Comm_disconnect(81)...: MPI_Comm_disconnect(comm=0x555abbeef028) failed
  MPID_Comm_disconnect(493)......:
  MPIR_Comm_free_impl(809).......:
  MPIR_Comm_delete_internal(1224): Communicator (handle=84000003) being freed has 1 unmatched message(s)
  error : MPI driver : Other MPI error, error stack:
  internal_Comm_disconnect(81)...: MPI_Comm_disconnect(comm=0x556ac7f0a028) failed
  MPID_Comm_disconnect(493)......:
  MPIR_Comm_free_impl(809).......:
  MPIR_Comm_delete_internal(1224): Communicator (handle=84000003) being freed has 1 unmatched message(s)

Added traces in the StartFileServers and StopFileServers methods of PLMPITaskDriver to pinpoint the problem

Re-run to obtain the new trace on stdout
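
For context on the error above: MPICH reports "Communicator ... being freed has N unmatched message(s)" when a communicator is disconnected while some message sent on it was never matched by a receive. Below is a minimal standalone sketch of that failure mode (hypothetical, not part of this commit; assumes MPICH and at least 2 processes, e.g. mpirun -n 2):

// Rank 0 sends a zero-byte message that rank 1 never receives; disconnecting
// the communicator with the message still unmatched is what MPICH flags.
#include <mpi.h>

int main(int argc, char** argv)
{
	int nRank;
	MPI_Comm comm;

	MPI_Init(&argc, &argv);
	MPI_Comm_rank(MPI_COMM_WORLD, &nRank);

	// Communicator analogous to commProcesses in the diff below (built with MPI_Comm_split)
	MPI_Comm_split(MPI_COMM_WORLD, 1, nRank, &comm);

	// Rank 0 sends; rank 1 deliberately posts no matching MPI_Recv
	if (nRank == 0)
		MPI_Send(NULL, 0, MPI_CHAR, 1, 0, comm);

	// Erroneous per the MPI standard: disconnect requires all communication
	// to be complete and matched; MPICH reports it as an "unmatched message"
	MPI_Comm_disconnect(&comm);
	MPI_Finalize();
	return 0;
}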
marcboulle committed Dec 12, 2023
1 parent 468d37a commit d4d32f9
Showing 1 changed file with 26 additions and 1 deletion.
27 changes: 26 additions & 1 deletion src/Parallel/PLMPI/PLMPITaskDriver.cpp
@@ -93,6 +93,9 @@ void PLMPITaskDriver::StartFileServers()

	require(GetProcessId() == 0);

	// DDD
	cout << "StartFileServers\tBEGIN\t" << nFileServerStartNumber << endl;

	nFileServerStartNumber++;
	if (nFileServerStartNumber == 1)
	{
@@ -101,33 +104,43 @@ void PLMPITaskDriver::StartFileServers()
			require(not bFileServerOn);
			require(commTask == MPI_COMM_NULL);

			cout << "\tStartFileServers\tStep1" << endl;

			// File servers are activated only if there is more than one host,
			// or if bFileServerOnSingleHost is active AND khiops was launched with mpi
			bFileServerOn = RMResourceManager::GetResourceSystem()->GetHostNumber() > 1 or
					(bFileServerOnSingleHost and
					 RMResourceManager::GetResourceSystem()->GetLogicalProcessNumber() > 1);

			cout << "\tStartFileServers\tStep2\t" << bFileServerOn << endl;
			assert(not bFileServerOn or ivFileServers->GetSize() != 0);

			// Start the file servers
			int nSize;
			MPI_Comm_size(MPI_COMM_WORLD, &nSize);
			cout << "\tStartFileServers\tStep3" << endl;
			for (i = 1; i < nSize; i++)
			{
				if (GetTracerMPI()->GetActiveMode())
					cast(PLMPITracer*, GetTracerMPI())->AddSend(i, MASTER_LAUNCH_FILE_SERVERS);
				cout << "\tStartFileServers\tStep4\t" << i << endl;
				context.Send(MPI_COMM_WORLD, i, MASTER_LAUNCH_FILE_SERVERS);
				cout << "\tStartFileServers\tStep5\t" << i << endl;
				serializer.OpenForWrite(&context);
				serializer.PutIntVector(ivFileServers);
				serializer.Close();
				cout << "\tStartFileServers\tStep6\t" << i << endl;
			}

			cout << "\tStartFileServers\tStep7" << endl;
			// Build the master/slaves communicator
			MPI_Comm_split(MPI_COMM_WORLD, 1, GetProcessId(), &commProcesses);
			cout << "\tStartFileServers\tStep8" << endl;
			debug(; MPI_Comm_size(commProcesses, &nSize); ensure(
				nSize == RMResourceManager::GetLogicalProcessNumber() - ivFileServers->GetSize()););
		}
	}
	cout << "StartFileServers\tEND\t" << nFileServerStartNumber << endl;
}

void PLMPITaskDriver::StopFileServers()
@@ -137,33 +150,45 @@ void PLMPITaskDriver::StopFileServers()

	require(GetProcessId() == 0);

	// DDD
	cout << "StopFileServers\tBEGIN\t" << nFileServerStartNumber << endl;

	nFileServerStartNumber--;
	if (nFileServerStartNumber == 0)
	{
		if (not PLParallelTask::GetParallelSimulated())
		{
			require(not PLParallelTask::IsRunning());

			cout << "\tStopFileServers\tStep1" << endl;

			// Stop the file servers
			// Send to ALL processes: slaves (working or not) and file servers
			// This is not a Bcast because the reception is non-blocking (Iprobe)
			// Reception for the slaves is done in PLMPISlaveLauncher::IsFileServerEnd(),
			// and for the file servers in PLMPIFileServerSlave::Run
			MPI_Comm_size(MPI_COMM_WORLD, &nCommSize);
			cout << "\tStopFileServers\tStep2" << endl;
			for (i = 1; i < nCommSize; i++)
			{
				if (GetTracerMPI()->GetActiveMode())
					cast(PLMPITracer*, GetTracerMPI())->AddSend(i, MASTER_STOP_FILE_SERVERS);
				cout << "\tStopFileServers\tStep3\t" << i << endl;
				MPI_Send(NULL, 0, MPI_CHAR, i, MASTER_STOP_FILE_SERVERS, MPI_COMM_WORLD);
				cout << "\tStopFileServers\tStep4\t" << i << endl;
			}

			bFileServerOn = false;

			cout << "\tStopFileServers\tStep5" << endl;
			// The disconnect is blocking; it happens on the slaves and on the file servers
			MPI_Barrier(*PLMPITaskDriver::GetProcessComm()); // Required for MPICH
			cout << "\tStopFileServers\tStep6" << endl;
			MPI_Comm_disconnect(&commProcesses); // DISCONNECT COMM_PROCESS
			cout << "\tStopFileServers\tStep7" << endl;
		}
	}
	cout << "StopFileServers\tEND\t" << nFileServerStartNumber << endl;
}

void PLMPITaskDriver::BCastBlock(PLSerializer* serializer, PLMsgContext* context)
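The comments in StopFileServers above describe a non-blocking reception protocol: the master's MPI_Send is matched on the slave side by polling with MPI_Iprobe. Below is a minimal sketch of that reception pattern (hypothetical; the real code lives in PLMPISlaveLauncher::IsFileServerEnd() and PLMPIFileServerSlave::Run, which are not part of this diff, and MASTER_STOP_FILE_SERVERS stands in for the real tag value). If this receive is never posted, the later MPI_Comm_disconnect sees exactly the kind of unmatched message shown in the trace:

// Hypothetical sketch of the Iprobe-based reception described in StopFileServers
#include <mpi.h>

static const int MASTER_STOP_FILE_SERVERS = 12; // placeholder tag value

// Returns true once the stop message from the master (rank 0) has been consumed
bool IsFileServerEndSketch()
{
	int nFlag = 0;

	// Non-blocking probe: returns immediately, whether or not the message has arrived
	MPI_Iprobe(0, MASTER_STOP_FILE_SERVERS, MPI_COMM_WORLD, &nFlag, MPI_STATUS_IGNORE);
	if (nFlag)
	{
		// Match the master's zero-byte MPI_Send so that no message is left
		// unmatched when the communicator is later disconnected
		MPI_Recv(NULL, 0, MPI_CHAR, 0, MASTER_STOP_FILE_SERVERS, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
		return true;
	}
	return false;
}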
