diff --git a/device/mellanox/x86_64-mlnx_msn2010-r0/system_health_monitoring_config.json b/device/mellanox/x86_64-mlnx_msn2010-r0/system_health_monitoring_config.json index 5574936ee3e7..a4c9e9b44e02 100644 --- a/device/mellanox/x86_64-mlnx_msn2010-r0/system_health_monitoring_config.json +++ b/device/mellanox/x86_64-mlnx_msn2010-r0/system_health_monitoring_config.json @@ -1,11 +1,11 @@ { "services_to_ignore": [], "devices_to_ignore": ["psu.voltage", "psu.temperature"], - "external_checkers": [], + "user_defined_checkers": [], "polling_interval": 60, "led_color": { - "fault": "orange", - "normal": "green", - "booting": "orange_blink" + "fault": "orange", + "normal": "green", + "booting": "orange_blink" } } diff --git a/device/mellanox/x86_64-mlnx_msn2700-r0/system_health_monitoring_config.json b/device/mellanox/x86_64-mlnx_msn2700-r0/system_health_monitoring_config.json index 4c7d53f19cf5..bff6ab4b38ee 100644 --- a/device/mellanox/x86_64-mlnx_msn2700-r0/system_health_monitoring_config.json +++ b/device/mellanox/x86_64-mlnx_msn2700-r0/system_health_monitoring_config.json @@ -1,11 +1,11 @@ { "services_to_ignore": [], "devices_to_ignore": ["psu.voltage"], - "external_checkers": [], + "user_defined_checkers": [], "polling_interval": 60, "led_color": { - "fault": "orange", - "normal": "green", - "booting": "orange_blink" + "fault": "orange", + "normal": "green", + "booting": "orange_blink" } } diff --git a/device/mellanox/x86_64-mlnx_msn2700_simx-r0/system_health_monitoring_config.json b/device/mellanox/x86_64-mlnx_msn2700_simx-r0/system_health_monitoring_config.json index e4ba06d7f46c..23cb74dd3f96 100644 --- a/device/mellanox/x86_64-mlnx_msn2700_simx-r0/system_health_monitoring_config.json +++ b/device/mellanox/x86_64-mlnx_msn2700_simx-r0/system_health_monitoring_config.json @@ -1,11 +1,11 @@ { "services_to_ignore": [], "devices_to_ignore": ["psu","asic","fan"], - "external_checkers": [], + "user_defined_checkers": [], "polling_interval": 60, "led_color": { - "fault": 
"orange", - "normal": "green", - "booting": "orange_blink" + "fault": "orange", + "normal": "green", + "booting": "orange_blink" } } diff --git a/files/build_templates/sonic_debian_extension.j2 b/files/build_templates/sonic_debian_extension.j2 index 4dd49d8e3df8..390f042c7b91 100644 --- a/files/build_templates/sonic_debian_extension.j2 +++ b/files/build_templates/sonic_debian_extension.j2 @@ -172,6 +172,12 @@ sudo cp {{platform_common_py2_wheel_path}} $FILESYSTEM_ROOT/$PLATFORM_COMMON_PY2 sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip install $PLATFORM_COMMON_PY2_WHEEL_NAME sudo rm -rf $FILESYSTEM_ROOT/$PLATFORM_COMMON_PY2_WHEEL_NAME +# Install system-health Python 2 package +SYSTEM_HEALTH_PY2_WHEEL_NAME=$(basename {{system_health_py2_wheel_path}}) +sudo cp {{system_health_py2_wheel_path}} $FILESYSTEM_ROOT/$SYSTEM_HEALTH_PY2_WHEEL_NAME +sudo https_proxy=$https_proxy LANG=C chroot $FILESYSTEM_ROOT pip install $SYSTEM_HEALTH_PY2_WHEEL_NAME +sudo rm -rf $FILESYSTEM_ROOT/$SYSTEM_HEALTH_PY2_WHEEL_NAME + # Install sonic-platform-common Python 3 package PLATFORM_COMMON_PY3_WHEEL_NAME=$(basename {{platform_common_py3_wheel_path}}) sudo cp {{platform_common_py3_wheel_path}} $FILESYSTEM_ROOT/$PLATFORM_COMMON_PY3_WHEEL_NAME @@ -283,6 +289,10 @@ sudo mkdir -p $FILESYSTEM_ROOT/etc/systemd/system/syslog.socket.d sudo cp $IMAGE_CONFIGS/syslog/override.conf $FILESYSTEM_ROOT/etc/systemd/system/syslog.socket.d/override.conf sudo cp $IMAGE_CONFIGS/syslog/host_umount.sh $FILESYSTEM_ROOT/usr/bin/ +# Copy system-health files +sudo LANG=C cp $IMAGE_CONFIGS/system-health/system-health.service $FILESYSTEM_ROOT_USR_LIB_SYSTEMD_SYSTEM +echo "system-health.service" | sudo tee -a $GENERATED_SERVICE_FILE + # Copy logrotate.d configuration files sudo cp -f $IMAGE_CONFIGS/logrotate/logrotate.d/* $FILESYSTEM_ROOT/etc/logrotate.d/ diff --git a/files/image_config/system-health/system-health.service b/files/image_config/system-health/system-health.service new file mode 100644 index 
000000000000..3de6a51584cb --- /dev/null +++ b/files/image_config/system-health/system-health.service @@ -0,0 +1,11 @@ +[Unit] +Description=SONiC system health monitor +Requires=database.service updategraph.service +After=database.service updategraph.service + +[Service] +ExecStart=/usr/local/bin/healthd +Restart=always + +[Install] +WantedBy=multi-user.target diff --git a/rules/system-health.dep b/rules/system-health.dep new file mode 100644 index 000000000000..31de25cb6d8c --- /dev/null +++ b/rules/system-health.dep @@ -0,0 +1,8 @@ +SPATH := $($(SYSTEM_HEALTH)_SRC_PATH) +DEP_FILES := $(SONIC_COMMON_FILES_LIST) rules/system-health.mk rules/system-health.dep +DEP_FILES += $(SONIC_COMMON_BASE_FILES_LIST) +DEP_FILES += $(shell git ls-files $(SPATH)) + +$(SYSTEM_HEALTH)_CACHE_MODE := GIT_CONTENT_SHA +$(SYSTEM_HEALTH)_DEP_FLAGS := $(SONIC_COMMON_FLAGS_LIST) +$(SYSTEM_HEALTH)_DEP_FILES := $(DEP_FILES) diff --git a/rules/system-health.mk b/rules/system-health.mk new file mode 100644 index 000000000000..16648508b9f4 --- /dev/null +++ b/rules/system-health.mk @@ -0,0 +1,9 @@ +# system health python2 wheel + +SYSTEM_HEALTH = system_health-1.0-py2-none-any.whl +$(SYSTEM_HEALTH)_SRC_PATH = $(SRC_PATH)/system-health +$(SYSTEM_HEALTH)_PYTHON_VERSION = 2 +$(SYSTEM_HEALTH)_DEPENDS = $(SONIC_PY_COMMON_PY2) $(SWSSSDK_PY2) $(SONIC_CONFIG_ENGINE) +SONIC_PYTHON_WHEELS += $(SYSTEM_HEALTH) + +export system_health_py2_wheel_path="$(addprefix $(PYTHON_WHEELS_PATH)/,$(SYSTEM_HEALTH))" diff --git a/slave.mk b/slave.mk index 9b0b10b3c0e7..3e200f9ee45d 100644 --- a/slave.mk +++ b/slave.mk @@ -819,7 +819,8 @@ $(addprefix $(TARGET_PATH)/, $(SONIC_INSTALLERS)) : $(TARGET_PATH)/% : \ $(addprefix $(PYTHON_WHEELS_PATH)/,$(REDIS_DUMP_LOAD_PY2)) \ $(addprefix $(PYTHON_WHEELS_PATH)/,$(SONIC_PLATFORM_API_PY2)) \ $(addprefix $(PYTHON_WHEELS_PATH)/,$(SONIC_YANG_MODELS_PY3)) \ - $(addprefix $(PYTHON_WHEELS_PATH)/,$(SONIC_YANG_MGMT_PY)) + $(addprefix $(PYTHON_WHEELS_PATH)/,$(SONIC_YANG_MGMT_PY)) \ + 
$(addprefix $(PYTHON_WHEELS_PATH)/,$(SYSTEM_HEALTH))
 	$(HEADER)
 	# Pass initramfs and linux kernel explicitly. They are used for all platforms
 	export debs_path="$(IMAGE_DISTRO_DEBS_PATH)"
diff --git a/src/system-health/.gitignore b/src/system-health/.gitignore
new file mode 100644
index 000000000000..843dd50ba52f
--- /dev/null
+++ b/src/system-health/.gitignore
@@ -0,0 +1,8 @@
+*/deb_dist/
+*/dist/
+*/build/
+*/*.tar.gz
+*/*.egg-info
+*/.cache/
+*.pyc
+*/__pycache__/
diff --git a/src/system-health/health_checker/__init__.py b/src/system-health/health_checker/__init__.py
new file mode 100644
index 000000000000..18c49c8d81a1
--- /dev/null
+++ b/src/system-health/health_checker/__init__.py
@@ -0,0 +1,2 @@
+from . import hardware_checker
+from . import service_checker
diff --git a/src/system-health/health_checker/config.py b/src/system-health/health_checker/config.py
new file mode 100644
index 000000000000..47b5f82b172b
--- /dev/null
+++ b/src/system-health/health_checker/config.py
@@ -0,0 +1,147 @@
+import json
+import os
+
+from sonic_py_common import device_info
+
+
+class Config(object):
+    """
+    Manage configuration of system health.
+    """
+
+    # Default system health check interval
+    DEFAULT_INTERVAL = 60
+
+    # Default boot up timeout. When reboot system, system health will wait a few seconds before starting to work.
+    DEFAULT_BOOTUP_TIMEOUT = 300
+
+    # Default LED configuration. Different platform has different LED capability. This configuration allow vendor to
+    # override the default behavior.
+    DEFAULT_LED_CONFIG = {
+        'fault': 'red',
+        'normal': 'green',
+        'booting': 'orange_blink'
+    }
+
+    # System health configuration file name
+    CONFIG_FILE = 'system_health_monitoring_config.json'
+
+    # Monit service configuration file path
+    MONIT_CONFIG_FILE = '/etc/monit/monitrc'
+
+    # Monit service start delay configuration entry
+    MONIT_START_DELAY_CONFIG = 'with start delay'
+
+    def __init__(self):
+        """
+        Constructor. Initialize all configuration entry to default value in case there is no configuration file.
+        """
+        self.platform_name = device_info.get_platform()
+        self._config_file = os.path.join('/usr/share/sonic/device/', self.platform_name, Config.CONFIG_FILE)
+        self._last_mtime = None
+        self.config_data = None
+        self.interval = Config.DEFAULT_INTERVAL
+        self.ignore_services = None
+        self.ignore_devices = None
+        self.user_defined_checkers = None
+
+    def config_file_exists(self):
+        """
+        Check whether the platform specific configuration file exists.
+        :return: True if the configuration file exists, else False
+        """
+        return os.path.exists(self._config_file)
+
+    def load_config(self):
+        """
+        Load the configuration file from disk.
+            1. If there is no configuration file, current config entries will reset to default value
+            2. Only read the configuration file if its modification time changes for better performance
+            3. If there is any format issues in configuration file, current config entries will reset to default value
+        :return:
+        """
+        if not self.config_file_exists():
+            if self._last_mtime is not None:
+                self._reset()
+            return
+
+        # Compare modification times only. A full os.stat() result also carries the access time, so comparing
+        # whole results can report a change when the file was merely read.
+        mtime = os.stat(self._config_file).st_mtime
+        if mtime != self._last_mtime:
+            try:
+                self._last_mtime = mtime
+                with open(self._config_file, 'r') as f:
+                    self.config_data = json.load(f)
+
+                self.interval = self.config_data.get('polling_interval', Config.DEFAULT_INTERVAL)
+                self.ignore_services = self._get_list_data('services_to_ignore')
+                self.ignore_devices = self._get_list_data('devices_to_ignore')
+                self.user_defined_checkers = self._get_list_data('user_defined_checkers')
+            except Exception:
+                self._reset()
+
+    def _reset(self):
+        """
+        Reset current configuration entry to default value
+        :return:
+        """
+        self._last_mtime = None
+        self.config_data = None
+        self.interval = Config.DEFAULT_INTERVAL
+        self.ignore_services = None
+        self.ignore_devices = None
+        self.user_defined_checkers = None
+
+    def get_led_color(self, status):
+        """
+        Get desired LED color according to the input status
+        :param status: System health status
+        :return: String LED color
+        """
+        if self.config_data and 'led_color' in self.config_data:
+            if status in self.config_data['led_color']:
+                return self.config_data['led_color'][status]
+
+        return self.DEFAULT_LED_CONFIG[status]
+
+    def get_bootup_timeout(self):
+        """
+        Get boot up timeout from monit configuration file.
+            1. If monit configuration file does not exist, return default value
+            2. If there is any exception while parsing monit config, return default value
+            3. If monit configuration has no start delay entry, return default value
+        :return: Integer timeout value
+        """
+        if not os.path.exists(Config.MONIT_CONFIG_FILE):
+            return self.DEFAULT_BOOTUP_TIMEOUT
+
+        try:
+            with open(Config.MONIT_CONFIG_FILE) as f:
+                for line in f:
+                    # Drop a trailing comment first. Slicing with the result of str.find('#') is wrong when there
+                    # is no '#': find() returns -1 and line[:-1] cuts the last character of the value.
+                    line = line.split('#', 1)[0].strip()
+                    if not line:
+                        continue
+
+                    pos = line.find(Config.MONIT_START_DELAY_CONFIG)
+                    if pos != -1:
+                        return int(line[pos + len(Config.MONIT_START_DELAY_CONFIG):].strip())
+        except Exception:
+            # Best effort: an unreadable or malformed monit configuration falls back to the default below
+            pass
+
+        return self.DEFAULT_BOOTUP_TIMEOUT
+
+    def _get_list_data(self, key):
+        """
+        Get list type configuration data by key and remove duplicate element.
+        :param key: Key of the configuration entry
+        :return: A set of configuration data if key exists and its value is a list, else None
+        """
+        if key in self.config_data:
+            data = self.config_data[key]
+            if isinstance(data, list):
+                return set(data)
+        return None
diff --git a/src/system-health/health_checker/hardware_checker.py b/src/system-health/health_checker/hardware_checker.py
new file mode 100644
index 000000000000..a04fe2abf43a
--- /dev/null
+++ b/src/system-health/health_checker/hardware_checker.py
@@ -0,0 +1,248 @@
+from natsort import natsorted
+from swsssdk import SonicV2Connector
+
+from .health_checker import HealthChecker
+
+
+class HardwareChecker(HealthChecker):
+    """
+    Check system hardware status. For now, it checks ASIC, PSU and fan status.
+    """
+    ASIC_TEMPERATURE_KEY = 'TEMPERATURE_INFO|ASIC'
+    FAN_TABLE_NAME = 'FAN_INFO'
+    PSU_TABLE_NAME = 'PSU_INFO'
+
+    def __init__(self):
+        HealthChecker.__init__(self)
+        self._db = SonicV2Connector(host="127.0.0.1")
+        self._db.connect(self._db.STATE_DB)
+
+    def get_category(self):
+        return 'Hardware'
+
+    def check(self, config):
+        self.reset()
+        self._check_asic_status(config)
+        self._check_fan_status(config)
+        self._check_psu_status(config)
+
+    def _check_asic_status(self, config):
+        """
+        Check if ASIC temperature is in valid range.
+        :param config: Health checker configuration
+        :return:
+        """
+        if config.ignore_devices and 'asic' in config.ignore_devices:
+            return
+
+        temperature = self._db.get(self._db.STATE_DB, HardwareChecker.ASIC_TEMPERATURE_KEY, 'temperature')
+        temperature_threshold = self._db.get(self._db.STATE_DB, HardwareChecker.ASIC_TEMPERATURE_KEY, 'high_threshold')
+        if not temperature:
+            self.set_object_not_ok('ASIC', 'ASIC', 'Failed to get ASIC temperature')
+        elif not temperature_threshold:
+            self.set_object_not_ok('ASIC', 'ASIC', 'Failed to get ASIC temperature threshold')
+        else:
+            try:
+                temperature = float(temperature)
+                temperature_threshold = float(temperature_threshold)
+                if temperature > temperature_threshold:
+                    self.set_object_not_ok('ASIC', 'ASIC',
+                                           'ASIC temperature is too hot, temperature={}, threshold={}'.format(
+                                               temperature,
+                                               temperature_threshold))
+                else:
+                    self.set_object_ok('ASIC', 'ASIC')
+            except ValueError:
+                self.set_object_not_ok('ASIC', 'ASIC',
+                                       'Invalid ASIC temperature data, temperature={}, threshold={}'.format(temperature,
+                                                                                                            temperature_threshold))
+
+    def _check_fan_status(self, config):
+        """
+        Check fan status including:
+            1. Check all fans are present
+            2. Check all fans are in good state
+            3. Check fan speed is in valid range
+        :param config: Health checker configuration
+        :return:
+        """
+        if config.ignore_devices and 'fan' in config.ignore_devices:
+            return
+
+        keys = self._db.keys(self._db.STATE_DB, HardwareChecker.FAN_TABLE_NAME + '*')
+        if not keys:
+            self.set_object_not_ok('Fan', 'Fan', 'Failed to get fan information')
+            return
+
+        for key in natsorted(keys):
+            key_list = key.split('|')
+            if len(key_list) != 2:  # error data in DB, log it and ignore
+                self.set_object_not_ok('Fan', key, 'Invalid key for FAN_INFO: {}'.format(key))
+                continue
+
+            name = key_list[1]
+            if config.ignore_devices and name in config.ignore_devices:
+                continue
+            data_dict = self._db.get_all(self._db.STATE_DB, key)
+            presence = data_dict.get('presence', 'false')
+            if presence.lower() != 'true':
+                self.set_object_not_ok('Fan', name, '{} is missing'.format(name))
+                continue
+
+            status = data_dict.get('status', 'false')
+            if status.lower() != 'true':
+                self.set_object_not_ok('Fan', name, '{} is broken'.format(name))
+                continue
+
+            if not self._ignore_check(config.ignore_devices, 'fan', name, 'speed'):
+                speed = data_dict.get('speed', None)
+                speed_target = data_dict.get('speed_target', None)
+                speed_tolerance = data_dict.get('speed_tolerance', None)
+                if not speed:
+                    self.set_object_not_ok('Fan', name, 'Failed to get actual speed data for {}'.format(name))
+                    continue
+                elif not speed_target:
+                    self.set_object_not_ok('Fan', name, 'Failed to get target speed data for {}'.format(name))
+                    continue
+                elif not speed_tolerance:
+                    self.set_object_not_ok('Fan', name, 'Failed to get speed tolerance for {}'.format(name))
+                    continue
+                else:
+                    try:
+                        speed = float(speed)
+                        speed_target = float(speed_target)
+                        speed_tolerance = float(speed_tolerance)
+                        speed_min_th = speed_target * (1 - speed_tolerance / 100)  # tolerance is a percentage
+                        speed_max_th = speed_target * (1 + speed_tolerance / 100)
+                        if speed < speed_min_th or speed > speed_max_th:
+                            self.set_object_not_ok('Fan', name,
+                                                   '{} speed is out of range, speed={}, range=[{},{}]'.format(name,
+                                                                                                              speed,
+                                                                                                              speed_min_th,
+                                                                                                              speed_max_th))
+                            continue
+                    except ValueError:
+                        self.set_object_not_ok('Fan', name,
+                                               'Invalid fan speed data for {}, speed={}, target={}, tolerance={}'.format(
+                                                   name,
+                                                   speed,
+                                                   speed_target,
+                                                   speed_tolerance))
+                        continue
+
+            self.set_object_ok('Fan', name)
+
+    def _check_psu_status(self, config):
+        """
+        Check PSU status including:
+            1. Check all PSUs are present
+            2. Check all PSUs are power on
+            3. Check PSU temperature is in valid range
+            4. Check PSU voltage is in valid range
+        :param config: Health checker configuration
+        :return:
+        """
+        if config.ignore_devices and 'psu' in config.ignore_devices:
+            return
+
+        keys = self._db.keys(self._db.STATE_DB, HardwareChecker.PSU_TABLE_NAME + '*')
+        if not keys:
+            self.set_object_not_ok('PSU', 'PSU', 'Failed to get PSU information')
+            return
+
+        for key in natsorted(keys):
+            key_list = key.split('|')
+            if len(key_list) != 2:  # error data in DB, log it and ignore
+                self.set_object_not_ok('PSU', key, 'Invalid key for PSU_INFO: {}'.format(key))
+                continue
+
+            name = key_list[1]
+            if config.ignore_devices and name in config.ignore_devices:
+                continue
+
+            data_dict = self._db.get_all(self._db.STATE_DB, key)
+            presence = data_dict.get('presence', 'false')
+            if presence.lower() != 'true':
+                self.set_object_not_ok('PSU', name, '{} is missing or not available'.format(name))
+                continue
+
+            status = data_dict.get('status', 'false')
+            if status.lower() != 'true':
+                self.set_object_not_ok('PSU', name, '{} is out of power'.format(name))
+                continue
+
+            if not self._ignore_check(config.ignore_devices, 'psu', name, 'temperature'):
+                temperature = data_dict.get('temp', None)
+                temperature_threshold = data_dict.get('temp_threshold', None)
+                if temperature is None:
+                    self.set_object_not_ok('PSU', name, 'Failed to get temperature data for {}'.format(name))
+                    continue
+                elif temperature_threshold is None:
+                    self.set_object_not_ok('PSU', name, 'Failed to get temperature threshold data for {}'.format(name))
+                    continue
+                else:
+                    try:
+                        temperature = float(temperature)
+                        temperature_threshold = float(temperature_threshold)
+                        if temperature > temperature_threshold:
+                            self.set_object_not_ok('PSU', name,
+                                                   '{} temperature is too hot, temperature={}, threshold={}'.format(
+                                                       name, temperature,
+                                                       temperature_threshold))
+                            continue
+                    except ValueError:
+                        self.set_object_not_ok('PSU', name,
+                                               'Invalid temperature data for {}, temperature={}, threshold={}'.format(
+                                                   name, temperature,
+                                                   temperature_threshold))
+                        continue
+
+            if not self._ignore_check(config.ignore_devices, 'psu', name, 'voltage'):
+                voltage = data_dict.get('voltage', None)
+                voltage_min_th = data_dict.get('voltage_min_threshold', None)
+                voltage_max_th = data_dict.get('voltage_max_threshold', None)
+                if voltage is None:
+                    self.set_object_not_ok('PSU', name, 'Failed to get voltage data for {}'.format(name))
+                    continue
+                elif voltage_min_th is None:
+                    self.set_object_not_ok('PSU', name,
+                                           'Failed to get voltage minimum threshold data for {}'.format(name))
+                    continue
+                elif voltage_max_th is None:
+                    self.set_object_not_ok('PSU', name,
+                                           'Failed to get voltage maximum threshold data for {}'.format(name))
+                    continue
+                else:
+                    try:
+                        voltage = float(voltage)
+                        voltage_min_th = float(voltage_min_th)
+                        voltage_max_th = float(voltage_max_th)
+                        if voltage < voltage_min_th or voltage > voltage_max_th:
+                            self.set_object_not_ok('PSU', name,
+                                                   '{} voltage is out of range, voltage={}, range=[{},{}]'.format(name,
+                                                                                                                  voltage,
+                                                                                                                  voltage_min_th,
+                                                                                                                  voltage_max_th))
+                            continue
+                    except ValueError:
+                        self.set_object_not_ok('PSU', name,
+                                               'Invalid voltage data for {}, voltage={}, range=[{},{}]'.format(name,
+                                                                                                               voltage,
+                                                                                                               voltage_min_th,
+                                                                                                               voltage_max_th))
+                        continue
+            self.set_object_ok('PSU', name)
+
+    def reset(self):
+        self._info = {}
+
+    @classmethod
+    def _ignore_check(cls, ignore_set, category, object_name, check_point):
+        if not ignore_set:
+            return False
+
+        if '{}.{}'.format(category, check_point) in ignore_set:
+            return True
+        elif '{}.{}'.format(object_name, check_point) in ignore_set:
+            return True
+        return False
diff --git a/src/system-health/health_checker/health_checker.py b/src/system-health/health_checker/health_checker.py
new file mode 100644
index 000000000000..59519d0a05c4
--- /dev/null
+++ b/src/system-health/health_checker/health_checker.py
@@ -0,0 +1,86 @@
+class HealthChecker(object):
+    """
+    Base class for health checker. A checker is an object that performs system health check for a particular category,
+    it collects and stores information after the check.
+    """
+    INFO_FIELD_OBJECT_TYPE = 'type'
+    INFO_FIELD_OBJECT_STATUS = 'status'
+    INFO_FIELD_OBJECT_MSG = 'message'
+
+    STATUS_OK = 'OK'
+    STATUS_NOT_OK = 'Not OK'
+
+    summary = STATUS_OK
+
+    def __init__(self):
+        self._info = {}
+
+    def reset(self):
+        """
+        Reset the status of the checker. Called every time before the check.
+        :return:
+        """
+        pass
+
+    def get_category(self):
+        """
+        Get category of the checker.
+        :return: String category
+        """
+        pass
+
+    def get_info(self):
+        """
+        Get information of the checker. A checker usually checks a few objects and each object status will be put to
+        self._info.
+        :return: Check result.
+        """
+        return self._info
+
+    def check(self, config):
+        """
+        Perform the check.
+        :param config: Health checker configuration.
+        :return:
+        """
+        pass
+
+    def __str__(self):
+        return self.__class__.__name__
+
+    def add_info(self, object_name, key, value):
+        """
+        Add check result for an object.
+        :param object_name: Object name.
+        :param key: Object attribute name.
+        :param value: Object attribute value.
+        :return:
+        """
+        if object_name not in self._info:
+            self._info[object_name] = {}
+
+        self._info[object_name][key] = value
+
+    def set_object_not_ok(self, object_type, object_name, message):
+        """
+        Set that an object is not OK.
+        :param object_type: Object type.
+        :param object_name: Object name.
+        :param message: A message to describe what is wrong with the object.
+        :return:
+        """
+        self.add_info(object_name, self.INFO_FIELD_OBJECT_TYPE, object_type)
+        self.add_info(object_name, self.INFO_FIELD_OBJECT_MSG, message)
+        self.add_info(object_name, self.INFO_FIELD_OBJECT_STATUS, self.STATUS_NOT_OK)
+        HealthChecker.summary = HealthChecker.STATUS_NOT_OK
+
+    def set_object_ok(self, object_type, object_name):
+        """
+        Set that an object is in good state.
+        :param object_type: Object type.
+        :param object_name: Object name.
+        :return:
+        """
+        self.add_info(object_name, self.INFO_FIELD_OBJECT_TYPE, object_type)
+        self.add_info(object_name, self.INFO_FIELD_OBJECT_MSG, '')
+        self.add_info(object_name, self.INFO_FIELD_OBJECT_STATUS, self.STATUS_OK)
diff --git a/src/system-health/health_checker/manager.py b/src/system-health/health_checker/manager.py
new file mode 100644
index 000000000000..933d6a9d543f
--- /dev/null
+++ b/src/system-health/health_checker/manager.py
@@ -0,0 +1,104 @@
+class HealthCheckerManager(object):
+    """
+    Manage all system health checkers and system health configuration.
+    """
+    STATE_BOOTING = 'booting'
+    STATE_RUNNING = 'running'
+    boot_timeout = None
+
+    def __init__(self):
+        self._checkers = []
+        self._state = self.STATE_BOOTING
+
+        from .config import Config
+        self.config = Config()
+        self.initialize()
+
+    def initialize(self):
+        """
+        Initialize the manager. Create service checker and hardware checker by default.
+        :return:
+        """
+        from .service_checker import ServiceChecker
+        from .hardware_checker import HardwareChecker
+        self._checkers.append(ServiceChecker())
+        self._checkers.append(HardwareChecker())
+
+    def check(self, chassis):
+        """
+        Load new configuration if any and perform the system health check for all existing checkers.
+        :param chassis: A chassis object.
+        :return: A tuple. The first element indicate the status of the checker; the second element is a dictionary that
+                 contains the status for all objects that was checked.
+        """
+        from .health_checker import HealthChecker
+        HealthChecker.summary = HealthChecker.STATUS_OK
+        stats = {}
+        self.config.load_config()
+        # check state first to avoid user change boot timeout in configuration file
+        # after finishing system boot
+        if self._state == self.STATE_BOOTING and self._is_system_booting():
+            self._set_system_led(chassis, self.config, 'booting')
+            return self._state, stats
+
+        for checker in self._checkers:
+            self._do_check(checker, stats)
+
+        if self.config.user_defined_checkers:
+            from .user_defined_checker import UserDefinedChecker
+            for udc in self.config.user_defined_checkers:
+                checker = UserDefinedChecker(udc)
+                self._do_check(checker, stats)
+
+        led_status = 'normal' if HealthChecker.summary == HealthChecker.STATUS_OK else 'fault'
+        self._set_system_led(chassis, self.config, led_status)
+
+        return self._state, stats
+
+    def _do_check(self, checker, stats):
+        """
+        Do check for a particular checker and collect the check statistic.
+        :param checker: A checker object.
+        :param stats: Check statistic.
+        :return:
+        """
+        try:
+            checker.check(self.config)
+            category = checker.get_category()
+            info = checker.get_info()
+            if category not in stats:
+                stats[category] = info
+            else:
+                stats[category].update(info)
+        except Exception as e:
+            from .health_checker import HealthChecker
+            # A checker that crashes is a fault as well: flip the summary so that the LED color and the reported
+            # summary agree with the 'Internal' Not OK entry recorded below
+            HealthChecker.summary = HealthChecker.STATUS_NOT_OK
+            error_msg = 'Failed to perform health check for {} due to exception - {}'.format(checker, repr(e))
+            entry = {str(checker): {
+                HealthChecker.INFO_FIELD_OBJECT_STATUS: HealthChecker.STATUS_NOT_OK,
+                HealthChecker.INFO_FIELD_OBJECT_MSG: error_msg
+            }}
+            if 'Internal' not in stats:
+                stats['Internal'] = entry
+            else:
+                stats['Internal'].update(entry)
+
+    def _is_system_booting(self):
+        from .utils import get_uptime
+        uptime = get_uptime()
+        if not self.boot_timeout:
+            self.boot_timeout = self.config.get_bootup_timeout()
+        booting = uptime < self.boot_timeout
+        if not booting:
+            self._state = self.STATE_RUNNING
+        return booting
+
+    def _set_system_led(self, chassis, config, status):
+        try:
+            chassis.set_status_led(config.get_led_color(status))
+        except NotImplementedError:
+            print('chassis.set_status_led is not implemented')
+        except Exception as e:
+            print('Failed to set system led due to - {}'.format(repr(e)))
diff --git a/src/system-health/health_checker/service_checker.py b/src/system-health/health_checker/service_checker.py
new file mode 100644
index 000000000000..8f18a6d7245e
--- /dev/null
+++ b/src/system-health/health_checker/service_checker.py
@@ -0,0 +1,73 @@
+from .health_checker import HealthChecker
+from . import utils
+
+
+class ServiceChecker(HealthChecker):
+    """
+    Checker that checks critical system service status via monit service.
+    """
+
+    # Command to query the status of monit service.
+    CHECK_MONIT_SERVICE_CMD = 'systemctl is-active monit.service'
+
+    # Command to get summary of critical system service.
+    CHECK_CMD = 'monit summary -B'
+    MIN_CHECK_CMD_LINES = 3
+
+    # Expect status for different system service category.
+    EXPECT_STATUS_DICT = {
+        'System': 'Running',
+        'Process': 'Running',
+        'Filesystem': 'Accessible',
+        'Program': 'Status ok'
+    }
+
+    def __init__(self):
+        HealthChecker.__init__(self)
+
+    def reset(self):
+        self._info = {}
+
+    def get_category(self):
+        return 'Services'
+
+    def check(self, config):
+        """
+        Check critical system service status. Get and analyze the output of $CHECK_CMD, collect status for system,
+        process and file system.
+        :param config: Health checker configuration.
+        :return:
+        """
+        self.reset()
+        output = utils.run_command(ServiceChecker.CHECK_MONIT_SERVICE_CMD)
+        # utils.run_command returns None when the command could not be run; report that instead of crashing
+        if output is None or output.strip() != 'active':
+            self.set_object_not_ok('Service', 'monit', 'monit service is not running')
+            return
+
+        output = utils.run_command(ServiceChecker.CHECK_CMD)
+        lines = output.splitlines() if output else []
+        if len(lines) < ServiceChecker.MIN_CHECK_CMD_LINES:
+            self.set_object_not_ok('Service', 'monit', 'output of \"monit summary -B\" is invalid or incompatible')
+            return
+
+        status_begin = lines[1].find('Status')
+        type_begin = lines[1].find('Type')
+        if status_begin < 0 or type_begin < 0:
+            self.set_object_not_ok('Service', 'monit', 'output of \"monit summary -B\" is invalid or incompatible')
+            return
+
+        for line in lines[2:]:
+            name = line[0:status_begin].strip()
+            if config.ignore_services and name in config.ignore_services:
+                continue
+            status = line[status_begin:type_begin].strip()
+            service_type = line[type_begin:].strip()
+            if service_type not in ServiceChecker.EXPECT_STATUS_DICT:
+                continue
+            expect_status = ServiceChecker.EXPECT_STATUS_DICT[service_type]
+            if expect_status != status:
+                self.set_object_not_ok(service_type, name, '{} is not {}'.format(name, expect_status))
+            else:
+                self.set_object_ok(service_type, name)
+        return
diff --git a/src/system-health/health_checker/system_health_monitoring_config.json b/src/system-health/health_checker/system_health_monitoring_config.json
new file mode 100644
index 000000000000..0fc475e766e4
--- /dev/null
+++ b/src/system-health/health_checker/system_health_monitoring_config.json
@@ -0,0 +1,11 @@
+{
+    "services_to_ignore": [],
+    "devices_to_ignore": [],
+    "user_defined_checkers": [],
+    "polling_interval": 60,
+    "led_color": {
+        "fault": "amber",
+        "normal": "green",
+        "booting": "orange_blink"
+    }
+}
\ No newline at end of file
diff --git a/src/system-health/health_checker/user_defined_checker.py b/src/system-health/health_checker/user_defined_checker.py
new file mode 100644
index 000000000000..ed0cdce6194d
--- /dev/null
+++ b/src/system-health/health_checker/user_defined_checker.py
@@ -0,0 +1,88 @@
+from .health_checker import HealthChecker
+from . import utils
+
+
+class UserDefinedChecker(HealthChecker):
+    """
+    User could implement a script or program to perform customize check for particular system. In order to enable a
+    user defined checker:
+        1. Add an element to "user_defined_checkers" in the configuration file. The element must be a command string
+           that can be executed by shell. For example: "python my_checker.py".
+        2. The command output must match the following pattern:
+           ${UserDefineCategory}
+           ${Object1}:${ObjectStatusMessage1}
+           ${Object2}:${ObjectStatusMessage2}
+
+    An example of the command output:
+        MyCategory
+        Device1:OK
+        Device2:OK
+        Device3:Out of power
+    """
+    def __init__(self, cmd):
+        """
+        Constructor.
+        :param cmd: Command string of the user defined checker.
+        """
+        HealthChecker.__init__(self)
+        self._cmd = cmd
+        self._category = None
+
+    def reset(self):
+        self._category = 'UserDefine'
+        self._info = {}
+
+    def get_category(self):
+        return self._category
+
+    def check(self, config):
+        """
+        Execute the user defined command and parse the output.
+        :param config: Health checker configuration.
+ :return: + """ + self.reset() + + output = utils.run_command(self._cmd) + if not output: + self.set_object_not_ok('UserDefine', str(self), 'Failed to get output of command \"{}\"'.format(self._cmd)) + return + + output = output.strip() + if not output: + self.set_object_not_ok('UserDefine', str(self), 'Failed to get output of command \"{}\"'.format(self._cmd)) + return + + raw_lines = output.splitlines() + if not raw_lines: + self.set_object_not_ok('UserDefine', str(self), 'Invalid output of command \"{}\"'.format(self._cmd)) + return + + lines = [] + for line in raw_lines: + line = line.strip() + if not line: + continue + + lines.append(line) + + if not lines: + self.set_object_not_ok('UserDefine', str(self), 'Invalid output of command \"{}\"'.format(self._cmd)) + return + + self._category = lines[0] + if len(lines) > 1: + for line in lines[1:]: + pos = line.find(':') + if pos == -1: + continue + obj_name = line[:pos].strip() + msg = line[pos + 1:].strip() + if msg != 'OK': + self.set_object_not_ok('UserDefine', obj_name, msg) + else: + self.set_object_ok('UserDefine', obj_name) + return + + def __str__(self): + return 'UserDefinedChecker - {}'.format(self._cmd) diff --git a/src/system-health/health_checker/utils.py b/src/system-health/health_checker/utils.py new file mode 100644 index 000000000000..5da8a7346c2a --- /dev/null +++ b/src/system-health/health_checker/utils.py @@ -0,0 +1,25 @@ +import subprocess + + +def run_command(command): + """ + Utility function to run an shell command and return the output. + :param command: Shell command string. + :return: Output of the shell command. + """ + try: + process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE) + return process.communicate()[0].encode('utf-8') + except Exception: + return None + + +def get_uptime(): + """ + Utility to get the system up time. + :return: System up time in seconds. 
class HealthDaemon(DaemonBase):
    """
    A daemon that runs as a service and performs the system health check with a configurable
    interval. The check result is stored in the SYSTEM_HEALTH_INFO table of STATE_DB.

    NOTE(review): the system LED is not touched in this class; presumably it is driven from
    HealthCheckerManager.check(chassis) - confirm against the manager implementation.
    """
    SYSTEM_HEALTH_TABLE_NAME = 'SYSTEM_HEALTH_INFO'

    def __init__(self):
        """
        Constructor of HealthDaemon. Connects to STATE_DB on the local host and creates the event
        used to request the main loop to stop.
        """
        DaemonBase.__init__(self, SYSLOG_IDENTIFIER)
        self._db = SonicV2Connector(host="127.0.0.1")
        self._db.connect(self._db.STATE_DB)
        self.stop_event = threading.Event()

    def deinit(self):
        """
        Destructor. Remove all entries in $SYSTEM_HEALTH_TABLE_NAME table.
        :return:
        """
        self._clear_system_health_table()

    def _clear_system_health_table(self):
        """
        Delete the entries matching $SYSTEM_HEALTH_TABLE_NAME from STATE_DB.
        :return:
        """
        self._db.delete_all_by_pattern(self._db.STATE_DB, HealthDaemon.SYSTEM_HEALTH_TABLE_NAME)

    # Signal handler
    def signal_handler(self, sig, frame):
        """
        Signal handler. SIGHUP is ignored, SIGINT and SIGTERM request the main loop to stop, any
        other signal is only logged.
        :param sig: Signal number
        :param frame: not used
        :return:
        """
        if sig == signal.SIGHUP:
            self.log_notice("Caught SIGHUP - ignoring...")
        elif sig == signal.SIGINT:
            self.log_notice("Caught SIGINT - exiting...")
            self.stop_event.set()
        elif sig == signal.SIGTERM:
            self.log_notice("Caught SIGTERM - exiting...")
            self.stop_event.set()
        else:
            # sig is an integer: format it instead of concatenating it to a string, which
            # raised TypeError inside the handler.
            self.log_warning("Caught unhandled signal '{}'".format(sig))

    def run(self):
        """
        Check system health in a loop until a stop is requested. Returns immediately, without
        checking anything, when the health configuration file does not exist.
        :return:
        """
        self.log_notice("Starting up...")

        # Imported here rather than at module level, as in the original code.
        import sonic_platform.platform
        chassis = sonic_platform.platform.Platform().get_chassis()
        manager = HealthCheckerManager()
        if not manager.config.config_file_exists():
            self.log_warning("System health configuration file not found, exit...")
            return

        while True:
            state, stat = manager.check(chassis)
            # Results are only published while the manager reports the running state.
            if state == HealthCheckerManager.STATE_RUNNING:
                self._process_stat(chassis, manager.config, stat)

            # wait() doubles as the polling sleep; it returns True as soon as stop_event is set.
            if self.stop_event.wait(manager.config.interval):
                break

        self.deinit()

    def _process_stat(self, chassis, config, stat):
        """
        Replace the content of the health table with the latest check result: one field per
        object that is not OK (holding its message) plus a 'summary' field.
        :param chassis: not used
        :param config: not used
        :param stat: Check result, a dict of category -> {object name -> object data}.
        :return:
        """
        from health_checker.health_checker import HealthChecker
        self._clear_system_health_table()
        for info in stat.values():
            for obj_name, obj_data in info.items():
                if obj_data[HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK:
                    self._db.set(self._db.STATE_DB, HealthDaemon.SYSTEM_HEALTH_TABLE_NAME, obj_name,
                                 obj_data[HealthChecker.INFO_FIELD_OBJECT_MSG])

        self._db.set(self._db.STATE_DB, HealthDaemon.SYSTEM_HEALTH_TABLE_NAME, 'summary', HealthChecker.summary)
class MockConnector(object):
    """
    Minimal in-memory stand-in for the database connector used by the unit tests.

    All instances share the class-level ``data`` dict (key -> {field: value}); tests populate it
    directly through ``MockConnector.data``.
    """
    STATE_DB = None
    # Shared on purpose: tests fill this dict before the code under test creates its connector.
    data = {}

    def __init__(self, host):
        """
        :param host: Ignored, accepted for signature compatibility.
        """
        pass

    def connect(self, db_id):
        """
        No-op.
        :param db_id: Ignored.
        """
        pass

    def get(self, db_id, key, field):
        """
        Return one field of an entry.
        :param db_id: Ignored.
        :param key: Entry key.
        :param field: Field name.
        :return: The stored value, or None when the key or the field is missing.
        """
        # NOTE(review): None for a missing key/field is assumed to mirror the real connector - confirm.
        return MockConnector.data.get(key, {}).get(field)

    def keys(self, db_id, pattern):
        """
        Return the keys matching a glob-style pattern such as 'FAN_INFO*'.
        :param db_id: Ignored.
        :param pattern: Glob pattern, matched case-sensitively against the whole key.
        :return: List of matching keys.
        """
        # The mock module has no import section, so the import is kept local.
        from fnmatch import fnmatchcase

        # Match the full key against the full pattern. The previous substring test on the text
        # before the first '*' also returned keys that merely contained that text.
        return [key for key in MockConnector.data if fnmatchcase(key, pattern)]

    def get_all(self, db_id, key):
        """
        Return all fields of an entry.
        :param db_id: Ignored.
        :param key: Entry key.
        :return: The stored field dict, or an empty dict when the key is missing.
        """
        return MockConnector.data.get(key, {})
def test_user_defined_checker():
    """
    Verify UserDefinedChecker against mocked command output: empty output and blank-only output
    are reported as a failure of the checker itself, while well-formed output yields one entry
    per "<object>:<message>" line.

    utils.run_command is patched with patch.object so the real function is restored after each
    step and the mock does not leak into other tests.
    """
    checker = UserDefinedChecker('')

    # No output at all: the checker reports itself as not OK.
    with patch.object(utils, 'run_command', return_value=''):
        checker.check(None)
    assert checker._info[str(checker)][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK

    checker.reset()
    assert len(checker._info) == 0

    # Output made of blank lines only is handled like missing output.
    with patch.object(utils, 'run_command', return_value='\n\n\n'):
        checker.check(None)
    assert checker._info[str(checker)][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK

    # First line is the category, the following lines are "<object>:<message>".
    valid_output = 'MyCategory\nDevice1:OK\nDevice2:Device2 is broken\n'
    with patch.object(utils, 'run_command', return_value=valid_output):
        checker.check(None)
    assert checker.get_category() == 'MyCategory'
    assert 'Device1' in checker._info
    assert 'Device2' in checker._info
    assert checker._info['Device1'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_OK
    assert checker._info['Device2'][HealthChecker.INFO_FIELD_OBJECT_STATUS] == HealthChecker.STATUS_NOT_OK
def test_hardware_checker():
    """
    Verify HardwareChecker against mocked ASIC temperature, fan and PSU entries.

    The entries are installed with patch.dict(..., clear=True), so the shared MockConnector.data
    dict holds exactly this data set during the check and is restored afterwards.
    """
    state_db = {
        'TEMPERATURE_INFO|ASIC': {
            'temperature': '20',
            'high_threshold': '21'
        },
        # fan1 is the healthy reference entry; fan2-fan4 each differ from it in one field.
        'FAN_INFO|fan1': {
            'presence': 'True',
            'status': 'True',
            'speed': '60',
            'speed_target': '60',
            'speed_tolerance': '20'
        },
        'FAN_INFO|fan2': {
            'presence': 'False',
            'status': 'True',
            'speed': '60',
            'speed_target': '60',
            'speed_tolerance': '20'
        },
        'FAN_INFO|fan3': {
            'presence': 'True',
            'status': 'False',
            'speed': '60',
            'speed_target': '60',
            'speed_tolerance': '20'
        },
        'FAN_INFO|fan4': {
            'presence': 'True',
            'status': 'True',
            'speed': '20',
            'speed_target': '60',
            'speed_tolerance': '20'
        },
        # PSU 1 is the healthy reference entry; PSU 2-5 each differ from it in one field.
        'PSU_INFO|PSU 1': {
            'presence': 'True',
            'status': 'True',
            'temp': '55',
            'temp_threshold': '100',
            'voltage': '10',
            'voltage_min_threshold': '8',
            'voltage_max_threshold': '15',
        },
        'PSU_INFO|PSU 2': {
            'presence': 'False',
            'status': 'True',
            'temp': '55',
            'temp_threshold': '100',
            'voltage': '10',
            'voltage_min_threshold': '8',
            'voltage_max_threshold': '15',
        },
        'PSU_INFO|PSU 3': {
            'presence': 'True',
            'status': 'False',
            'temp': '55',
            'temp_threshold': '100',
            'voltage': '10',
            'voltage_min_threshold': '8',
            'voltage_max_threshold': '15',
        },
        'PSU_INFO|PSU 4': {
            'presence': 'True',
            'status': 'True',
            'temp': '101',
            'temp_threshold': '100',
            'voltage': '10',
            'voltage_min_threshold': '8',
            'voltage_max_threshold': '15',
        },
        'PSU_INFO|PSU 5': {
            'presence': 'True',
            'status': 'True',
            'temp': '55',
            'temp_threshold': '100',
            'voltage': '10',
            'voltage_min_threshold': '12',
            'voltage_max_threshold': '15',
        }
    }

    expected_status = {
        'ASIC': HealthChecker.STATUS_OK,
        'fan1': HealthChecker.STATUS_OK,
        'fan2': HealthChecker.STATUS_NOT_OK,   # presence is 'False'
        'fan3': HealthChecker.STATUS_NOT_OK,   # status is 'False'
        'fan4': HealthChecker.STATUS_NOT_OK,   # speed 20 against target 60, tolerance 20
        'PSU 1': HealthChecker.STATUS_OK,
        'PSU 2': HealthChecker.STATUS_NOT_OK,  # presence is 'False'
        'PSU 3': HealthChecker.STATUS_NOT_OK,  # status is 'False'
        'PSU 4': HealthChecker.STATUS_NOT_OK,  # temp 101 against threshold 100
        'PSU 5': HealthChecker.STATUS_NOT_OK,  # voltage 10 against minimum threshold 12
    }

    with patch.dict(MockConnector.data, state_db, clear=True):
        checker = HardwareChecker()
        config = Config()
        checker.check(config)

    for name, status in expected_status.items():
        assert name in checker._info, '{} is missing from the check result'.format(name)
        assert checker._info[name][HealthChecker.INFO_FIELD_OBJECT_STATUS] == status, \
            'unexpected status for {}'.format(name)