diff --git a/doc/user_manual/src/Howtorun.tex b/doc/user_manual/src/Howtorun.tex index 17d50aa2..e05b3928 100644 --- a/doc/user_manual/src/Howtorun.tex +++ b/doc/user_manual/src/Howtorun.tex @@ -25,3 +25,32 @@ \subsection{How to run} \end{lstlisting} Alternatively, you can use Python to run \texttt{HERON/src/main.py} with the HERON XML input as argument; however, this will bypass loading the \texttt{raven\_libraries} and other initialization. + +\subsection{Parallel Notes} + +HERON uses RAVEN's parallel tools. Since running on different clusters +can require somewhat different commands, HERON allows the commands +used for parallel running to be chosen based on the hostname. + +These commands are stored as XML templates in the directory \texttt{templates/parallel}. Example: + +\begin{lstlisting}[style=XML] + + + + mpi + + + + + ray + + +\end{lstlisting} + +The \texttt{hostregexp} attribute is a regular expression; the first template whose +expression matches the hostname will be used as the template for +running in parallel. If parallel running is requested, then the contents of the +\texttt{useParallel} section will be added to the RunInfo in the RAVEN +input. If the batch size is greater than one, then the contents of the +\texttt{outer} section will be added to the RunInfo as well. 
diff --git a/templates/parallel/bitterroot.xml b/templates/parallel/bitterroot.xml new file mode 100644 index 00000000..d8b282e8 --- /dev/null +++ b/templates/parallel/bitterroot.xml @@ -0,0 +1,11 @@ + + + slurm + + + + + dask + + +
diff --git a/templates/parallel/sawtooth.xml b/templates/parallel/sawtooth.xml new file mode 100644 index 00000000..eabc23dd --- /dev/null +++ b/templates/parallel/sawtooth.xml @@ -0,0 +1,11 @@ + + + + mpi + + + + + ray + +
diff --git a/templates/template_driver.py b/templates/template_driver.py index 54046104..8ae47e25 100644 --- a/templates/template_driver.py +++ b/templates/template_driver.py @@ -10,6 +10,9 @@ import shutil import xml.etree.ElementTree as ET import itertools as it +import socket +import glob +import re import numpy as np import dill as pk
@@ -237,6 +240,23 @@ def _modify_outer_mode(self, template, case, components, sources):
     elif case.get_mode() == 'opt':
       template.find('Samplers').remove(template.find(".//Grid[@name='grid']"))
 
+  def _get_parallel_xml(self, hostname):
+    """
+      Finds the xml file to go with the given hostname.
+      @ In, hostname, string with the hostname to search for
+      @ Out, xml, xml.etree.ElementTree.Element or None, root of the first template (in sorted filename order) whose hostregexp matches, otherwise None
+    """
+    # Should this allow loading from another directory (such as one
+    # next to the input file?)
+    path = os.path.join(os.path.dirname(__file__),"parallel","*.xml")
+    # glob order is arbitrary, so sort to make "first match" reproducible; files without hostregexp are skipped
+    for filename in sorted(glob.glob(path)):
+      cur_xml = ET.parse(filename).getroot()
+      regexp = cur_xml.attrib.get('hostregexp')
+      if regexp is not None and re.match(regexp, hostname):
+        return cur_xml
+    return None
+
   def _modify_outer_runinfo(self, template, case):
     """
       Defines modifications to the RunInfo of outer.xml RAVEN input file.
@@ -257,15 +277,34 @@ def _modify_outer_runinfo(self, template, case):
     elif case.get_mode() == 'opt':
       run_info.find('Sequence').text = 'optimize, plot'
     # parallel
+    # Should there be a way to override the hostname (such as if we are
+    # generating the files to run on a different computer?)
+    try:
+      hostname = socket.gethostbyaddr(socket.gethostname())[0]
+    except OSError:
+      # the reverse lookup fails on hosts without a resolvable name; use the bare hostname
+      hostname = socket.gethostname()
+    self.parallel_xml = self._get_parallel_xml(hostname)
+    #note, parallel_xml might be None
     if case.outerParallel:
-      # set outer batchsize and InternalParallel
-      batchSize = run_info.find('batchSize')
-      batchSize.text = f'{case.outerParallel}'
-      run_info.append(xmlUtils.newNode('internalParallel', text='True'))
+      self._modify_outer_parallel(template, case)
     if case.useParallel:
-      #XXX this doesn't handle non-mpi modes like torque or other custom ones
-      mode = xmlUtils.newNode('mode', text='mpi')
-      mode.append(xmlUtils.newNode('runQSUB'))
+      mode = None
+      if self.parallel_xml is not None:
+        use_parallel = self.parallel_xml.find('useParallel')
+        # find() returns None when the template has no useParallel section
+        for child in ([] if use_parallel is None else use_parallel):
+          if child.tag == 'mode':
+            mode = child
+          else:
+            run_info.append(child)
+      if mode is None:
+        #no template, or the template supplied no mode node: default to mpi.
+        #this doesn't handle non-mpi modes like torque or other custom ones
+        # so it is highly recommended that a parallel xml template be created
+        # for hosts that are using those.
+        mode = xmlUtils.newNode('mode', text='mpi')
+        mode.append(xmlUtils.newNode('runQSUB'))
       if 'memory' in case.parallelRunInfo:
         mode.append(xmlUtils.newNode('memory', text=case.parallelRunInfo.pop('memory')))
       for sub in case.parallelRunInfo:
@@ -274,6 +313,29 @@ def _modify_outer_runinfo(self, template, case):
     if case.innerParallel:
       run_info.append(xmlUtils.newNode('NumMPI', text=case.innerParallel))
 
+  def _modify_outer_parallel(self, template, case):
+    """
+      Sets the outer batch size in the RunInfo and appends the outer parallel
+      nodes to it.  This should only be called if
+      case.outerParallel > 0
+      @ In, template, xml.etree.ElementTree.Element, root of XML to modify
+      @ In, case, HERON Case, defining Case instance
+      @ Out, None
+    """
+    run_info = template.find('RunInfo')
+    # set outer batchsize and InternalParallel
+    batchSize = run_info.find('batchSize')
+    batchSize.text = f'{case.outerParallel}'
+    # use the 'outer' section of the host template when there is one; without a
+    # template, or with a template lacking that section, use internalParallel
+    outer = None if self.parallel_xml is None else self.parallel_xml.find('outer')
+    if outer is None:
+      run_info.append(xmlUtils.newNode('internalParallel', text='True'))
+    else:
+      #append all the children in the 'outer' element
+      for child in outer:
+        run_info.append(child)
+
   def _modify_outer_vargroups(self, template, case, components, sources):
     """
       Defines modifications to the VariableGroups of outer.xml RAVEN input file.
@@ -703,12 +765,8 @@ def _modify_outer_samplers(self, template, case, components): #XXX if we had a way to calculate this ahead of time, # this could be done in _modify_outer_runinfo #Need to update the outerParallel number - run_info = template.find('RunInfo') case.outerParallel = len(self.__sweep_vars) + 1 - #XXX duplicate of code in _modify_outer_runinfo - batchSize = run_info.find('batchSize') - batchSize.text = f'{case.outerParallel}' - run_info.append(xmlUtils.newNode('internalParallel', text='True')) + self._modify_outer_parallel(template, case) def _modify_outer_optimizers(self, template, case): """