Merge pull request #4621 from garlick/issue#4619
flux-shell: always use pmi=pershell by default
mergify[bot] authored Sep 27, 2022
2 parents d817d7e + 97b7ec3 commit 086372b
Showing 1 changed file with 9 additions and 37 deletions.
46 changes: 9 additions & 37 deletions src/shell/pmi/pmi.c
@@ -375,17 +375,12 @@ static void pmi_fd_cb (flux_shell_task_t *task,
     }
 }
 
-/* Query broker to see if instance mapping is known, then use that information
- * to select whether process mapping should be "none", "single", or "pershell".
- * If the instance mapping is unknown, use "pershell". The choice of default
- * was a process of trial and error:
+/* Generate 'PMI_process_mapping' key (see RFC 13) for MPI clique computation.
  *
  * PMI_process_mapping originated with MPICH, which uses it to determine
  * whether it can short circult the comms path between local ranks with shmem.
  * MPICH allows the key to be missing or its value to be empty, and in those
- * cases just skips the optimization. Based on this, one might assume that
- * either "none" or "pershell" would be valid defaults when the mapping is
- * unknown. However, note the following:
+ * cases just skips the optimization. However, note the following:
  *
  * - MVAPICH2 fails with an "Invalid tag" error in MPI_Init() if the key
  *   does not exist (flux-framework/flux-core#3592) and an even more obscure
@@ -395,35 +390,12 @@ static void pmi_fd_cb (flux_shell_task_t *task,
  *   that ranks are not co-located when they really are
  *   (flux-framework/flux-core#3551)
  *
- * Least worse choice seems to be "pershell", and then use the openmpi shell
- * plugin to cast the proper runes to avoid OpenMPI name collisions by
- * redirecting shmem paths to FLUX_JOB_TMPDIR. FLUX_JOB_TMPDIR includes
- * the jobid and the shell rank in its path, so works as a unique path
- * prefix even when there are multiple brokers/shells per node.
- */
-static const char *guess_clique_option (struct shell_pmi *pmi)
-{
-    const char *val;
-    struct pmi_map_block *blocks = NULL;
-    int nblocks;
-    const char *opt = "pershell";
-
-    if (pmi->shell->standalone)
-        goto done;
-    if (!(val = flux_attr_get (pmi->shell->h, "broker.mapping")))
-        goto done;
-    if (pmi_process_mapping_parse (val, &blocks, &nblocks) < 0)
-        goto done;
-    if (nblocks == 1 && blocks[0].nodes == 1)      // one node
-        opt = "single";
-    else if (nblocks == 1 && blocks[0].procs == 1) // one broker per node
-        opt = "pershell";
-done:
-    free (blocks);
-    return opt;
-}
-
-/* Generate 'PMI_process_mapping' key (see RFC 13) for MPI clique computation.
+ * Unless overridden, "pershell" mode is assumed. When multiple brokers
+ * are launched per node (e.g. in test), each job shell remains in an isolated
+ * clique as though the brokers represent independent nodes. The OpenMPI shell
+ * plugin avoids the above problem by setting MCA environment variables to
+ * locate shmem paths within FLUX_JOB_TMPDIR, which includes the jobid and
+ * the shell rank in its path, and thus works as a unique path prefix.
  */
 static int init_clique (struct shell_pmi *pmi, const char *opt)
 {
@@ -433,7 +405,7 @@ static int init_clique (struct shell_pmi *pmi, const char *opt)
     char val[SIMPLE_KVS_VAL_MAX];
 
     if (!opt)
-        opt = guess_clique_option (pmi);
+        opt = "pershell";
 
     /* pmi.clique=pershell (default): one clique per shell.
      * Create an array of pmi_map_block structures, sized for worst case
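
A note on the mapping format the code above generates: RFC 13 borrows MPICH's block-vector syntax for PMI_process_mapping, where each (starting node, number of nodes, tasks per node) block describes a run of "nodes" with the same task count, and in "pershell" mode each job shell counts as one node. The short C sketch below is illustrative only (it is not part of the commit), and its shell and task counts are assumptions for the example; it just shows what a pershell-style value looks like when every shell runs the same number of tasks.

    /* Illustrative sketch, not taken from pmi.c: format a "pershell"-style
     * PMI_process_mapping value in RFC 13 / MPICH block-vector syntax,
     * assuming a uniform number of tasks per shell.  With nshells=2 and
     * tasks_per_shell=4 it prints:
     *   PMI_process_mapping=(vector,(0,2,4))
     * i.e. one clique per shell, regardless of how shells map to real nodes.
     */
    #include <stdio.h>

    int main (void)
    {
        int nshells = 2;          /* assumed: one job shell per broker rank */
        int tasks_per_shell = 4;  /* assumed: tasks launched by each shell */
        char val[64];

        snprintf (val, sizeof (val), "(vector,(0,%d,%d))",
                  nshells, tasks_per_shell);
        printf ("PMI_process_mapping=%s\n", val);
        return 0;
    }

As the new comment in the diff notes, the mode can still be selected explicitly via the pmi.clique shell option (none, single, or pershell); this commit only changes what happens when no option is given.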

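The last paragraph of the new comment relies on the openmpi shell plugin steering OpenMPI's shared-memory backing files into FLUX_JOB_TMPDIR. A minimal sketch of that technique follows; it is not the plugin's actual source, and the specific MCA variable it sets is an assumption made for illustration, though flux_shell_getenv and flux_shell_setenvf are part of the public shell plugin API.

    /* Illustrative sketch only: redirect OpenMPI shmem backing files into
     * FLUX_JOB_TMPDIR so two shells on the same node cannot collide.
     * The MCA variable name is an assumption for the example, not a
     * statement of exactly what the real openmpi plugin sets.
     */
    #include <flux/shell.h>

    int redirect_shmem (flux_shell_t *shell)
    {
        const char *tmpdir = flux_shell_getenv (shell, "FLUX_JOB_TMPDIR");

        if (!tmpdir)
            return -1;
        /* FLUX_JOB_TMPDIR embeds the jobid and shell rank, so using it as
         * a path prefix keeps per-shell shmem files apart even when more
         * than one shell runs per node.
         */
        return flux_shell_setenvf (shell,
                                   1,
                                   "OMPI_MCA_btl_vader_backing_directory",
                                   "%s",
                                   tmpdir);
    }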