diff --git a/osg_configure/configure_modules/gratia.py b/osg_configure/configure_modules/gratia.py index 7d8c1714..3b6fe4e0 100644 --- a/osg_configure/configure_modules/gratia.py +++ b/osg_configure/configure_modules/gratia.py @@ -5,6 +5,7 @@ import sys import logging import subprocess +import textwrap from xml.sax import saxutils from osg_configure.modules import exceptions @@ -12,23 +13,16 @@ from osg_configure.modules import validation from osg_configure.modules import configfile from osg_configure.modules.baseconfiguration import BaseConfiguration -from osg_configure.configure_modules.condor import CondorConfiguration -from osg_configure.configure_modules.sge import SGEConfiguration -from osg_configure.configure_modules.slurm import SlurmConfiguration __all__ = ['GratiaConfiguration'] GRATIA_CONFIG_FILES = { - 'condor': '/etc/gratia/condor/ProbeConfig', - 'sge': '/etc/gratia/sge/ProbeConfig', - 'lsf': '/etc/gratia/pbs-lsf/urCollector.conf', - 'pbs': '/etc/gratia/pbs-lsf/urCollector.conf', - 'slurm': '/etc/gratia/slurm/ProbeConfig', 'htcondor-ce': '/etc/gratia/htcondor-ce/ProbeConfig' } -CE_PROBE_RPMS = ['gratia-probe-condor', 'gratia-probe-pbs-lsf', 'gratia-probe-sge', - 'gratia-probe-slurm', 'gratia-probe-htcondor-ce'] +CE_PROBE_RPMS = ['gratia-probe-htcondor-ce'] + +CONDOR_CE_CONFIG_VAL = "/usr/bin/condor_ce_config_val" def requirements_are_installed(): @@ -71,7 +65,8 @@ def __init__(self, *args, **kwargs): self._production_defaults = {'probes': 'jobmanager:gratia-osg-prod.opensciencegrid.org:80'} - self._job_managers = ['pbs', 'sge', 'lsf', 'condor', 'slurm', 'htcondor-ce'] + self._job_managers = ['htcondor-ce'] + self._old_job_managers = ['pbs', 'sge', 'lsf', 'condor', 'slurm'] self._probe_config = {} self.grid_group = 'OSG' @@ -116,74 +111,8 @@ def parse_configuration(self, configuration): self._itb_defaults['probes'] # grab configuration information for various jobmanagers - probes_iter = self.get_installed_probe_config_files().keys() - for probe in probes_iter: - if probe == 'condor': - self._probe_config['condor'] = {'condor_location': - CondorConfiguration.get_condor_location(configuration), - 'condor_config': - CondorConfiguration.get_condor_config(configuration)} - elif probe == 'pbs': - if BaseConfiguration.section_disabled(configuration, 'PBS'): - # if the PBS jobmanager is disabled, the CE is probably using LSF - # in any case, setting up the pbs gratia probe is not useful - continue - log_option = configfile.Option(name='log_directory', - required=configfile.Option.OPTIONAL, - default_value='') - configfile.get_option(configuration, 'PBS', log_option) - self._probe_config['pbs'] = {'log_directory': log_option.value} - - accounting_log_option = configfile.Option(name='accounting_log_directory', - required=configfile.Option.OPTIONAL, - default_value='') - configfile.get_option(configuration, 'PBS', accounting_log_option) - self._probe_config['pbs'] = {'accounting_log_directory': accounting_log_option.value} - elif probe == 'lsf': - if BaseConfiguration.section_disabled(configuration, 'LSF'): - # if the LSF jobmanager is disabled, the CE is probably using PBS - # in any case, setting up the pbs gratia probe is not useful - continue - lsf_location = configfile.Option(name='lsf_location', - default_value='/usr/bin') - configfile.get_option(configuration, 'LSF', lsf_location) - self._probe_config['lsf'] = {'lsf_location': lsf_location.value} - - log_option = configfile.Option(name='log_directory', - required=configfile.Option.OPTIONAL, - default_value='') - configfile.get_option(configuration, 'LSF', log_option) - self._probe_config['lsf']['log_directory'] = log_option.value - elif probe == 'sge': - if BaseConfiguration.section_disabled(configuration, 'SGE'): - # if section is disabled then the following code won't work - # since the parse_configuration will short circuit, so - # give a warning and then move on - self.log("Skipping SGE gratia probe configuration since SGE is disabled", - level=logging.WARNING) - continue - sge_config = SGEConfiguration(logger=self.logger) - sge_config.parse_configuration(configuration) - self._probe_config['sge'] = {'sge_accounting_file': sge_config.get_accounting_file()} - elif probe == 'slurm': - if BaseConfiguration.section_disabled(configuration, 'SLURM'): - # if section is disabled then the following code won't work - # since the parse_configuration will short circuit, so - # give a warning and then move on - self.log("Skipping Slurm gratia probe configuration since Slurm is disabled", - level=logging.WARNING) - continue - slurm_config = SlurmConfiguration(logger=self.logger) - slurm_config.parse_configuration(configuration) - self._probe_config['slurm'] = {'db_host': slurm_config.get_db_host(), - 'db_port': slurm_config.get_db_port(), - 'db_user': slurm_config.get_db_user(), - 'db_pass': slurm_config.get_db_pass(), - 'db_name': slurm_config.get_db_name(), - 'cluster': slurm_config.get_slurm_cluster(), - 'location': slurm_config.get_location()} - elif probe == 'htcondor-ce': - self._probe_config['htcondor-ce'] = {} + if "htcondor-ce" in self.get_installed_probe_config_files_by_probe(): + self._probe_config['htcondor-ce'] = {} self.get_options(configuration, ignore_options=['itb-jobmanager-gratia', @@ -238,9 +167,8 @@ def configure(self, attributes): return False hostname = attributes['OSG_HOSTNAME'] - probe_config_files = self.get_installed_probe_config_files() - probes_iter = probe_config_files.keys() - for probe in probes_iter: + probe_config_files_by_probe = self.get_installed_probe_config_files_by_probe() + for probe in probe_config_files_by_probe: if probe in self._job_managers: if probe not in self._probe_config: # Probe is installed but we don't have configuration for it @@ -252,6 +180,8 @@ def configure(self, attributes): probe_host = self.enabled_probe_hosts['jobmanager'] else: continue + elif probe in self._old_job_managers: + continue else: if probe in self.enabled_probe_hosts: probe_host = self.enabled_probe_hosts[probe] @@ -260,30 +190,20 @@ def configure(self, attributes): self._subscribe_probe_to_remote_host( probe, - probe_config_files[probe], + probe_config_files_by_probe[probe], remote_host=probe_host, local_resource=self.options['resource'].value, local_host=hostname ) - if probe == 'condor': - self._configure_condor_probe() - elif probe == 'pbs': - self._configure_pbs_probe() - elif probe == 'lsf': - self._configure_lsf_probe() - elif probe == 'sge': - self._configure_sge_probe() - elif probe == 'slurm': - self._configure_slurm_probe() - elif probe == 'htcondor-ce': - self._configure_htcondor_ce_probe() + if "htcondor-ce" in probe_config_files_by_probe: + self._configure_htcondor_ce_probe() self.log("GratiaConfiguration.configure completed") return True # pylint: disable-msg=R0201 @staticmethod - def get_installed_probe_config_files(): + def get_installed_probe_config_files_by_probe(): """Return a mapping of probe name -> ProbeConfig file. Note that "pbs" and "lsf" have the same probe. """ @@ -321,7 +241,8 @@ def check_attributes(self, attributes): self.log("GratiaConfiguration.check_attributes completed") return True status = self._check_servers() - status &= self._verify_gratia_dirs_for_condor_probe() + if 'htcondor-ce' in self._probe_config: + status &= self._verify_gratia_dirs_for_htcondor_ce_probe() self.log("GratiaConfiguration.check_attributes completed") return status @@ -492,119 +413,6 @@ def _configure_default_ce(self, configuration): return True - def _configure_condor_probe(self): - """ - Do condor probe specific configuration - """ - - config_location = GRATIA_CONFIG_FILES['condor'] - buf = open(config_location, "r", encoding="latin-1").read() - settings = self._probe_config['condor'] - buf = self.replace_setting(buf, 'CondorLocation', settings['condor_location']) - buf = self.replace_setting(buf, 'CondorConfig', settings['condor_config']) - if not utilities.atomic_write(config_location, buf): - return False - return True - - def _configure_pbs_probe(self): - """ - Do pbs probe specific configuration - """ - if (self._probe_config['pbs']['accounting_log_directory'] is None or - self._probe_config['pbs']['accounting_log_directory'] == ''): - return True - accounting_dir = self._probe_config['pbs']['accounting_log_directory'] - if not validation.valid_directory(accounting_dir): - self.log("PBS accounting log not present, PBS gratia probe not configured", - level=logging.ERROR, - option='accounting_log_directory', - section='PBS') - return True - - config_location = GRATIA_CONFIG_FILES['pbs'] - buf = open(config_location, "r", encoding="latin-1").read() - buf = self.replace_setting(buf, 'pbsAcctLogDir', accounting_dir, xml_file=False) - buf = self.replace_setting(buf, 'lrmsType', 'pbs', xml_file=False) - if not utilities.atomic_write(config_location, buf): - return False - return True - - def _configure_lsf_probe(self): - """ - Do lsf probe specific configuration - """ - if (self._probe_config['lsf']['log_directory'] is None or - self._probe_config['lsf']['log_directory'] == ''): - self.log("LSF accounting log directory not given, LSF gratia probe not configured", - level=logging.ERROR, - option='log_directory', - section='LSF') - return True - log_directory = self._probe_config['lsf']['log_directory'] - if not validation.valid_directory(log_directory): - self.log("LSF accounting log not present, LSF gratia probe not configured", - level=logging.ERROR, - option='log_directory', - section='LSF') - return True - config_location = GRATIA_CONFIG_FILES['lsf'] - buf = open(config_location, "r", encoding="latin-1").read() - buf = self.replace_setting(buf, 'lsfAcctLogDir', log_directory, xml_file=False) - - # setup lsfBinDir - if (self._probe_config['lsf']['lsf_location'] is None or - self._probe_config['lsf']['lsf_location'] == ''): - self.log("LSF location not given, lsf gratia probe not configured", - level=logging.ERROR, - option='lsf_location', - section='LSF') - return True - lsf_bin_dir = os.path.join(self._probe_config['lsf']['lsf_location'], 'bin') - buf = self.replace_setting(buf, 'lsfBinDir', lsf_bin_dir, xml_file=False) - buf = self.replace_setting(buf, 'lrmsType', 'lsf', xml_file=False) - if not utilities.atomic_write(config_location, buf): - return False - return True - - def _configure_sge_probe(self): - """ - Do SGE probe specific configuration - """ - accounting_path = self._probe_config['sge']['sge_accounting_file'] - config_location = GRATIA_CONFIG_FILES['sge'] - buf = open(config_location, "r", encoding="latin-1").read() - buf = self.replace_setting(buf, 'SGEAccountingFile', accounting_path) - if not utilities.atomic_write(config_location, buf): - return False - return True - - def _configure_slurm_probe(self): - """ - Do SLURM probe specific configuration - """ - config_location = GRATIA_CONFIG_FILES['slurm'] - buf = open(config_location, "r", encoding="latin-1").read() - - settings = self._probe_config['slurm'] - if not validation.valid_file(settings['db_pass']): - self.log("Slurm DB password file not present", - level=logging.ERROR, - option='db_pass', - section='SLURM') - return True - - buf = self.replace_setting(buf, 'SlurmDbHost', settings['db_host']) - buf = self.replace_setting(buf, 'SlurmDbPort', settings['db_port']) - buf = self.replace_setting(buf, 'SlurmDbUser', settings['db_user']) - buf = self.replace_setting(buf, 'SlurmDbPasswordFile', settings['db_pass']) - buf = self.replace_setting(buf, 'SlurmDbName', settings['db_name']) - buf = self.replace_setting(buf, 'SlurmCluster', settings['cluster']) - buf = self.replace_setting(buf, 'SlurmLocation', settings['location']) - - if not utilities.atomic_write(config_location, buf): - return False - return True - def _configure_htcondor_ce_probe(self): """ Do HTCondor-CE probe specific configuration @@ -618,89 +426,72 @@ def _configure_htcondor_ce_probe(self): return False return True - - def _verify_gratia_dirs_for_condor_probe(self): + def _verify_gratia_dirs_for_htcondor_ce_probe(self) -> bool: """ - Verify that the condor per_job_history directory and the DataFolder - directory are the same and warn if admin if the two don't match + Verify that the HTCondor-CE PER_JOB_HISTORY_DIR and the DataFolder + directory are the same and warn the admin if the two don't match """ - valid = True - if 'condor' not in self._probe_config: - # Don't need this for non-condor probes - return valid - condor_config_val_bin = os.path.join(self._probe_config['condor']['condor_location'], - "bin", - "condor_config_val") - if not os.path.exists(condor_config_val_bin): - self.log("While checking gratia parameters: Unable to find condor_config_val binary (looked for %s).\n" - "In the [Condor] section of your configuration, set condor_location such that " - "(condor_location)/bin/condor_config_val is the location of the condor_config_val binary." - % condor_config_val_bin, - level=logging.ERROR) - return False + if not os.path.exists(CONDOR_CE_CONFIG_VAL): + raise exceptions.ConfigureError(f"{CONDOR_CE_CONFIG_VAL} missing") + + history_dir = self._get_condor_ce_history_dir() - config_location = GRATIA_CONFIG_FILES['condor'] - contents = open(config_location, "r", encoding="latin-1").read() + config_location = GRATIA_CONFIG_FILES['htcondor-ce'] + contents = utilities.read_file(config_location, default="") re_obj = re.compile(r'(?m)^\s*DataFolder\s*=(.*)\s*$') match = re_obj.search(contents) - if match is not None: - data_folder = match.group(1) - data_folder = data_folder.strip('" \t') - # PER_JOB_HISTORY_DIR comes from the schedd, so if condor's not - # running, we can't get a value (SOFTWARE-1564) - history_dir = self._get_condor_history_dir(condor_config_val_bin) + data_folder = match.group(1).strip('" \t') if match else None + + advice_on_error = ( + f"Make sure DataFolder in {config_location} ({data_folder or 'missing'})" + f" and PER_JOB_HISTORY_DIR in the HTCondor-CE config ({history_dir or 'missing'})" + f" exist and are the same, accessible directory." + ) + + try: + ok = True if not history_dir: - self.log("Could not verify DataFolder correctness: unable to get PER_JOB_HISTORY_DIR. " - "This may be caused by the condor schedd not running, or by PER_JOB_HISTORY_DIR " - "not being defined.", level=logging.WARNING) - else: - # os.path.samefile will die if the paths don't exist so check that explicitly (SOFTWARE-1735) - if not os.path.exists(data_folder): - self.log("DataFolder setting in %s (%s) points to a nonexistant location" % ( - config_location, data_folder), - level=logging.ERROR) - valid = False - elif not os.path.exists(history_dir): - self.log("Condor PER_JOB_HISTORY_DIR %s points to a nonexistant location" % history_dir, - level=logging.ERROR) - valid = False - else: - try: - if not os.path.samefile(data_folder, history_dir): - self.log("DataFolder setting in %s (%s) and condor PER_JOB_HISTORY_DIR %s " - "do not match, these settings must match!" % (config_location, - data_folder, - history_dir), - level=logging.ERROR) - valid = False - except OSError as e: - self.log("Error comparing DataFolder setting in %s (%s) and condor PER_JOB_HISTORY_DIR %s:\n%s" - % (config_location, data_folder, history_dir, e), - level=logging.ERROR) - valid = False - - # Per Gratia-126 DataFolder must end in / otherwise gratia won't find certinfo files - if not data_folder.endswith('/'): - self.log("DataFolder setting in %s must end in a /" % config_location, - level=logging.ERROR) - valid = False + self.logger.error("PER_JOB_HISTORY_DIR is not defined") + ok = False + elif not os.path.isdir(history_dir): + self.logger.error("PER_JOB_HISTORY_DIR does not point to a valid directory") + ok = False + + if not data_folder: + self.logger.error(f"DataFolder is not defined") + ok = False + elif not os.path.isdir(data_folder): + self.logger.error(f"DataFolder does not point to a valid directory") + ok = False + + if not ok: # can't do any more checking + self.logger.error(advice_on_error) + return False - return valid + if os.path.samefile(data_folder, history_dir): + return True + else: + self.logger.error("DataFolder and PER_JOB_HISTORY_DIR do not point to the same directory") + self.logger.error(advice_on_error) + return False + except OSError as e: + self.logger.error("Unexpected error checking DataFolder and PER_JOB_HISTORY_DIR: %s", e) + self.logger.error(advice_on_error) + return False - def _get_condor_history_dir(self, condor_config_val_bin): - cmd = [condor_config_val_bin, '-schedd', 'PER_JOB_HISTORY_DIR'] + def _get_condor_ce_history_dir(self): + cmd = [CONDOR_CE_CONFIG_VAL, '-subsystem', 'SCHEDD', 'PER_JOB_HISTORY_DIR'] try: process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, encoding="latin-1") - (history_dir, errtext) = process.communicate() + history_dir, errtext = process.communicate() if process.returncode != 0: - self.log("While checking gratia parameters: %s failed. Output follows:\n%s" % (condor_config_val_bin, - errtext), - level=logging.INFO) + self.logger.info("While checking gratia parameters: %s failed. Output follows:\n%s", + CONDOR_CE_CONFIG_VAL, errtext) return None except OSError as err: - self.log("While checking gratia parameters: Error running %s: %s" % (condor_config_val_bin, str(err)), - level=logging.INFO) + self.logger.info("While checking gratia parameters: Error running %s: %s", + CONDOR_CE_CONFIG_VAL, err) return None history_dir = history_dir.strip() if history_dir.startswith('Not defined'):