Skip to content

Commit

Permalink
[reboot-history] Add reboot history to state db (sonic-net#5933)
Browse files Browse the repository at this point in the history
- Why I did it
Add reboot history to the State DB so that it can be used by the telemetry service
- How I did it
Split the process-reboot-cause service into determine-reboot-cause and process-reboot-cause
determine-reboot-cause to determine the reboot cause
process-reboot-cause to parse the reboot cause files and put the reboot history to state db
Moved to sonic-host-service* packages
- How to verify it
Performed unit test and tested on DUT
  • Loading branch information
sujinmkang authored and santhosh-kt committed Feb 25, 2021
1 parent 95d0827 commit dbaf2c0
Show file tree
Hide file tree
Showing 9 changed files with 327 additions and 55 deletions.
10 changes: 0 additions & 10 deletions files/build_templates/sonic_debian_extension.j2
Original file line number Diff line number Diff line change
Expand Up @@ -428,16 +428,6 @@ sudo cp $IMAGE_CONFIGS/pcie-check/pcie-check.service $FILESYSTEM_ROOT_USR_LIB_SY
echo "pcie-check.service" | sudo tee -a $GENERATED_SERVICE_FILE
sudo cp $IMAGE_CONFIGS/pcie-check/pcie-check.sh $FILESYSTEM_ROOT/usr/bin/

# Copy systemd timer configuration
# It implements delayed start of services
sudo cp $BUILD_TEMPLATES/process-reboot-cause.timer $FILESYSTEM_ROOT_USR_LIB_SYSTEMD_SYSTEM
sudo LANG=C chroot $FILESYSTEM_ROOT systemctl enable process-reboot-cause.timer

# Copy process-reboot-cause service files
sudo cp $IMAGE_CONFIGS/process-reboot-cause/process-reboot-cause.service $FILESYSTEM_ROOT_USR_LIB_SYSTEMD_SYSTEM
echo "process-reboot-cause.service" | sudo tee -a $GENERATED_SERVICE_FILE
sudo cp $IMAGE_CONFIGS/process-reboot-cause/process-reboot-cause $FILESYSTEM_ROOT/usr/bin/

## Install package without starting service
## ref: https://wiki.debian.org/chroot
sudo tee -a $FILESYSTEM_ROOT/usr/sbin/policy-rc.d > /dev/null <<EOF
Expand Down
3 changes: 3 additions & 0 deletions src/sonic-host-services-data/debian/rules
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,6 @@ override_dh_installsystemd:
dh_installsystemd --no-start --name=caclmgrd
dh_installsystemd --no-start --name=hostcfgd
dh_installsystemd --no-start --name=procdockerstatsd
dh_installsystemd --no-start --name=determine-reboot-cause
dh_installsystemd --no-start --name=process-reboot-cause

Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[Unit]
Description=Reboot cause determination service
Requires=rc-local.service
After=rc-local.service

[Service]
Type=simple
ExecStart=/usr/local/bin/determine-reboot-cause

[Install]
WantedBy=multi-user.target
Original file line number Diff line number Diff line change
@@ -1,28 +1,32 @@
#!/usr/bin/env python
#!/usr/bin/env python3
#
# process-reboot-cause
# determine-reboot-cause
#
# Program designed to run once, soon after system boot, which will
# determine the cause of the previous reboot and store it to the disk.
#

try:
import datetime
import json
import os
import pwd
import re
import sys

from sonic_py_common import device_info, logger

except ImportError as err:
raise ImportError("%s - required module not found" % str(err))

VERSION = "1.0"

SYSLOG_IDENTIFIER = "process-reboot-cause"
SYSLOG_IDENTIFIER = "determine-reboot-cause"

REBOOT_CAUSE_DIR = "/host/reboot-cause/"
REBOOT_CAUSE_FILE = REBOOT_CAUSE_DIR + "reboot-cause.txt"
PREVIOUS_REBOOT_CAUSE_FILE = REBOOT_CAUSE_DIR + "previous-reboot-cause.txt"
REBOOT_CAUSE_HISTORY_DIR = "/host/reboot-cause/history/"
REBOOT_CAUSE_FILE = os.path.join(REBOOT_CAUSE_DIR, "reboot-cause.txt")
PREVIOUS_REBOOT_CAUSE_FILE = os.path.join(REBOOT_CAUSE_DIR, "previous-reboot-cause.json")
FIRST_BOOT_PLATFORM_FILE = "/tmp/notify_firstboot_to_platform"
REBOOT_TYPE_KEXEC_FILE = "/proc/cmdline"
# The following SONIC_BOOT_TYPEs come from the warm/fast reboot script which is in sonic-utilities
Expand All @@ -45,7 +49,7 @@ sonic_logger = logger.Logger(SYSLOG_IDENTIFIER)
# ============================= Functions =============================
def parse_warmfast_reboot_from_proc_cmdline():
if os.path.isfile(REBOOT_TYPE_KEXEC_FILE):
with open(REBOOT_TYPE_KEXEC_FILE, "r") as cause_file:
with open(REBOOT_TYPE_KEXEC_FILE) as cause_file:
cause_file_kexec = cause_file.readline()
m = re.search(REBOOT_TYPE_KEXEC_PATTERN_WARM, cause_file_kexec)
if m and m.group(1):
Expand All @@ -56,69 +60,100 @@ def parse_warmfast_reboot_from_proc_cmdline():
return None


def find_software_reboot_cause():
software_reboot_cause = REBOOT_CAUSE_UNKNOWN

def find_software_reboot_cause_from_reboot_cause_file():
software_reboot_cause = None
if os.path.isfile(REBOOT_CAUSE_FILE):
with open(REBOOT_CAUSE_FILE, "r") as cause_file:
with open(REBOOT_CAUSE_FILE) as cause_file:
software_reboot_cause = cause_file.readline().rstrip('\n')
sonic_logger.log_info("{} indicates the reboot cause: {}".format(REBOOT_CAUSE_FILE, software_reboot_cause))
else:
sonic_logger.log_info("Reboot cause file {} not found".format(REBOOT_CAUSE_FILE))
return software_reboot_cause

if os.path.isfile(FIRST_BOOT_PLATFORM_FILE):
if software_reboot_cause == REBOOT_CAUSE_UNKNOWN:
version_info = device_info.get_sonic_version_info()
build_version = version_info['build_version'] if version_info else "unknown"
software_reboot_cause += " (First boot of SONiC version {})".format(build_version)
os.remove(FIRST_BOOT_PLATFORM_FILE)

def find_first_boot_version():
    """Return the " (First boot of SONiC version X)" suffix for the reboot cause.

    X is the 'build_version' entry of device_info.get_sonic_version_info(),
    or "unknown" when that call returns nothing usable.
    """
    version_info = device_info.get_sonic_version_info()
    build_version = version_info['build_version'] if version_info else "unknown"
    return " (First boot of SONiC version {})".format(build_version)


def find_software_reboot_cause():
    """Return the software reboot cause, annotated on the first boot of an image.

    The cause is read from the reboot-cause file. Only when it equals
    REBOOT_CAUSE_UNKNOWN and the first-boot marker file exists is the
    first-boot suffix appended and the marker file removed.
    """
    cause = find_software_reboot_cause_from_reboot_cause_file()

    # A known cause is returned untouched.
    if cause != REBOOT_CAUSE_UNKNOWN:
        return cause

    # No first-boot marker: nothing to annotate.
    if not os.path.isfile(FIRST_BOOT_PLATFORM_FILE):
        return cause

    cause += find_first_boot_version()
    os.remove(FIRST_BOOT_PLATFORM_FILE)
    return cause


def find_proc_cmdline_reboot_cause():
    """Return the reboot type parsed from /proc/cmdline and log the outcome.

    The value is whatever parse_warmfast_reboot_from_proc_cmdline() returns;
    a falsy result is logged as "no reboot cause found".
    """
    reboot_type = parse_warmfast_reboot_from_proc_cmdline()

    if not reboot_type:
        sonic_logger.log_info("No reboot cause found from /proc/cmdline")
    else:
        sonic_logger.log_info("/proc/cmdline indicates reboot type: {}".format(reboot_type))

    return reboot_type


def find_hardware_reboot_cause():
hardware_reboot_cause = None
return proc_cmdline_reboot_cause

def get_reboot_cause_from_platform():
    """Query the platform API for the hardware reboot cause.

    Returns:
        The value of chassis.get_reboot_cause(), which the caller unpacks as
        a (major, minor) pair. When the sonic_platform package cannot be
        imported, ("Non-Hardware", None) is returned instead of falling
        through with an implicit None, so the caller's two-value unpacking
        keeps working and the reboot is treated as not hardware-related.
    """
    # Until all platform vendors have provided sonic_platform packages,
    # if there is no sonic_platform package installed, we only provide
    # software-related reboot causes.
    try:
        import sonic_platform
        platform = sonic_platform.platform.Platform()
        chassis = platform.get_chassis()
        return chassis.get_reboot_cause()
    except ImportError:
        sonic_logger.log_warning("sonic_platform package not installed. Unable to detect hardware reboot causes.")
        # "Non-Hardware" is the same text find_hardware_reboot_cause compares
        # against (its REBOOT_CAUSE_NON_HARDWARE) to mean "no hardware cause".
        return "Non-Hardware", None

platform = sonic_platform.platform.Platform()

chassis = platform.get_chassis()
def find_hardware_reboot_cause():
hardware_reboot_cause = None

hardware_reboot_cause_major, hardware_reboot_cause_minor = chassis.get_reboot_cause()
REBOOT_CAUSE_HARDWARE_OTHER = "Hardware - Other"
REBOOT_CAUSE_NON_HARDWARE = "Non-Hardware"

if hardware_reboot_cause_major == chassis.REBOOT_CAUSE_NON_HARDWARE:
# The reboot was not caused by hardware. If there is a REBOOT_CAUSE_FILE, it will
# contain any software-related reboot info. We will use it as the previous cause.
pass
elif hardware_reboot_cause_major == chassis.REBOOT_CAUSE_HARDWARE_OTHER:
hardware_reboot_cause = "{} ({})".format(hardware_reboot_cause_major, hardware_reboot_cause_minor)
else:
hardware_reboot_cause = hardware_reboot_cause_major
except ImportError as err:
sonic_logger.log_warning("sonic_platform package not installed. Unable to detect hardware reboot causes.")
hardware_reboot_cause_major, hardware_reboot_cause_minor = get_reboot_cause_from_platform()
sonic_logger.log_info("Platform api returns reboot cause {}, {}".format(hardware_reboot_cause_major, hardware_reboot_cause_minor))

if hardware_reboot_cause_major == REBOOT_CAUSE_NON_HARDWARE:
# The reboot was not caused by hardware. If there is a REBOOT_CAUSE_FILE, it will
# contain any software-related reboot info. We will use it as the previous cause.
pass
elif hardware_reboot_cause_major == REBOOT_CAUSE_HARDWARE_OTHER:
hardware_reboot_cause = "{} ({})".format(hardware_reboot_cause_major, hardware_reboot_cause_minor)
else:
hardware_reboot_cause = hardware_reboot_cause_major

if hardware_reboot_cause:
sonic_logger.log_info("Platform api indicates reboot cause {}".format(hardware_reboot_cause))
else:
sonic_logger.log_info("No reboot cause found from platform api")

return hardware_reboot_cause
return hardware_reboot_cause, hardware_reboot_cause_minor

def get_reboot_cause_dict(previous_reboot_cause, comment, gen_time):
    """Build the reboot-cause record that is stored as JSON in the history file.

    Args:
        previous_reboot_cause: Reboot cause text. A cause of the form
            "User issued '<cmd>' command [User: <user>, Time: <time>]" is
            split into its command, user and time parts. None is accepted
            and recorded as "Unknown" instead of raising TypeError.
        comment: Additional reboot information; None is stored as "N/A".
        gen_time: Timestamp string identifying when the record was generated.

    Returns:
        dict with the keys 'gen_time', 'cause', 'user', 'time' and 'comment'.
    """
    if previous_reboot_cause is None:
        # NOTE(review): "Unknown" mirrors REBOOT_CAUSE_UNKNOWN in
        # process-reboot-cause; confirm it matches this script's own constant.
        previous_reboot_cause = "Unknown"

    # resultant dictionary
    reboot_cause_dict = {
        'gen_time': gen_time,
        'cause': previous_reboot_cause,
        'user': "N/A",
        'time': "N/A",
        'comment': comment if comment is not None else "N/A",
    }

    # Match with "User issued '{}' command [User: {}, Time: {}]".
    # One search is enough: the pattern itself starts with "User issued".
    match = re.search(r'User issued \'(.*)\' command \[User: (.*), Time: (.*)\]', previous_reboot_cause)
    if match is not None:
        reboot_cause_dict['cause'] = match.group(1)
        reboot_cause_dict['user'] = match.group(2)
        reboot_cause_dict['time'] = match.group(3)

    return reboot_cause_dict


def main():
Expand All @@ -139,23 +174,23 @@ def main():
if os.path.exists(PREVIOUS_REBOOT_CAUSE_FILE):
os.remove(PREVIOUS_REBOOT_CAUSE_FILE)

# Set a default previous reboot cause
previous_reboot_cause = REBOOT_CAUSE_UNKNOWN
hardware_reboot_cause = None
additional_reboot_info = None

# 1. Check if the previous reboot was warm/fast reboot by testing whether there is "fast|fastfast|warm" in /proc/cmdline
proc_cmdline_reboot_cause = find_proc_cmdline_reboot_cause()

# 2. Check if the previous reboot was caused by hardware
# If yes, the hardware reboot cause will be treated as the reboot cause
hardware_reboot_cause = find_hardware_reboot_cause()
(hardware_reboot_cause, additional_reboot_info) = find_hardware_reboot_cause()

# 3. If there is a REBOOT_CAUSE_FILE, it will contain any software-related
# reboot info. We will use it as the previous cause.
software_reboot_cause = find_software_reboot_cause()

# The main decision logic of the reboot cause:
# If there is a reboot cause indicated by /proc/cmdline, it should be warmreboot/fastreboot
# the software_reboot_cause which is the content of /hosts/reboot-cause/reboot-cause.txt
# the software_reboot_cause which is the content of /hosts/reboot-cause/reboot-cause.txt
# will be treated as the reboot cause
# Elif there is a reboot cause indicated by platform API,
# the hardware_reboot_cause will be treated as the reboot cause
Expand All @@ -167,12 +202,26 @@ def main():
else:
previous_reboot_cause = software_reboot_cause

# Write the previous reboot cause to PREVIOUS_REBOOT_CAUSE_FILE
with open(PREVIOUS_REBOOT_CAUSE_FILE, "w") as prev_cause_file:
prev_cause_file.write(previous_reboot_cause)
# Current time
reboot_cause_gen_time = str(datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S'))

# Save the previous cause info into its history file as json format
reboot_cause_dict = get_reboot_cause_dict(previous_reboot_cause, additional_reboot_info, reboot_cause_gen_time)

# Create reboot-cause-#time#.json under history directory
REBOOT_CAUSE_HISTORY_FILE = os.path.join(REBOOT_CAUSE_HISTORY_DIR, "reboot-cause-{}.json".format(reboot_cause_gen_time))

# Create REBOOT_CAUSE_HISTORY_DIR if it doesn't exist
if not os.path.exists(REBOOT_CAUSE_HISTORY_DIR):
os.makedirs(REBOOT_CAUSE_HISTORY_DIR)

# Write the previous reboot cause to REBOOT_CAUSE_HISTORY_FILE as a JSON format
with open(REBOOT_CAUSE_HISTORY_FILE, "w") as reboot_cause_history_file:
json.dump(reboot_cause_dict, reboot_cause_history_file)

# Create a symbolic link to previous-reboot-cause.json file
os.symlink(REBOOT_CAUSE_HISTORY_FILE, PREVIOUS_REBOOT_CAUSE_FILE)

# Also log the previous reboot cause to the syslog
sonic_logger.log_info("Previous reboot cause: {}".format(previous_reboot_cause))

# Remove the old REBOOT_CAUSE_FILE
if os.path.exists(REBOOT_CAUSE_FILE):
Expand Down
100 changes: 100 additions & 0 deletions src/sonic-host-services/scripts/process-reboot-cause
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
#!/usr/bin/env python3
#
# process-reboot-cause
#
# Program designed to read the previous reboot-cause file and log the most recent reboot cause,
# then read the saved reboot-cause history files and save the reboot causes in the state-db.
#

try:
import json
import os
import pwd
import sys

import swsssdk
from sonic_py_common import logger
except ImportError as err:
raise ImportError("%s - required module not found" % str(err))

VERSION = "1.0"

# Identifier passed to the logger for this script's syslog messages
SYSLOG_IDENTIFIER = "process-reboot-cause"

# Locations of the reboot-cause files read by this script
REBOOT_CAUSE_DIR = "/host/reboot-cause/"
REBOOT_CAUSE_HISTORY_DIR = "/host/reboot-cause/history/"
PREVIOUS_REBOOT_CAUSE_FILE = os.path.join(REBOOT_CAUSE_DIR, "previous-reboot-cause.json")
# NOTE: despite its name this is a str.format() template (main() fills in
# cause, user and time), not a regular expression.
USER_ISSUED_REBOOT_CAUSE_REGEX ="User issued \'{}\' command [User: {}, Time: {}]"

# Default cause logged when no previous reboot cause file is found
REBOOT_CAUSE_UNKNOWN = "Unknown"
# Key prefix of the state-db entries, used as "REBOOT_CAUSE|<gen_time>"
REBOOT_CAUSE_TABLE_NAME = "REBOOT_CAUSE"

REDIS_HOSTIP = "127.0.0.1"
# Module-level placeholder; read_reboot_cause_files_and_save_state_db() binds
# its own local variable of the same name and does not update this one.
state_db = None

# Global logger class instance
sonic_logger = logger.Logger(SYSLOG_IDENTIFIER)


# ============================= Functions =============================
def read_reboot_cause_files_and_save_state_db():
    """Publish the newest reboot-cause history records to STATE_DB and prune the rest.

    Entries of REBOOT_CAUSE_HISTORY_DIR are ordered newest-first by
    modification time. For each of the newest 10 that is a regular file, the
    'cause', 'time', 'user' and 'comment' fields of its JSON content are
    written under the key "REBOOT_CAUSE|<gen_time>". Older entries are
    deleted from the history directory.

    A history file that cannot be read, is not valid JSON, or lacks an
    expected key is logged and skipped, so one bad file neither aborts the
    run nor leaves a partially written record in the database.
    """
    max_entries = 10
    fields = ('cause', 'time', 'user', 'comment')

    # Connect State DB
    state_db = swsssdk.SonicV2Connector(host=REDIS_HOSTIP)
    state_db.connect(state_db.STATE_DB)

    # Sort the previous reboot cause files by modification time, newest first
    history_files = [os.path.join(REBOOT_CAUSE_HISTORY_DIR, name) for name in os.listdir(REBOOT_CAUSE_HISTORY_DIR)]
    history_files.sort(key=os.path.getmtime, reverse=True)

    # Read each of the newest history files and update the state db with its reboot cause information
    for path in history_files[:max_entries]:
        if not os.path.isfile(path):
            continue
        try:
            with open(path, "r") as cause_file:
                data = json.load(cause_file)
            _hash = '{}|{}'.format(REBOOT_CAUSE_TABLE_NAME, data['gen_time'])
            # Look up every field before writing so a record is published
            # either completely or not at all.
            values = [(field, data[field]) for field in fields]
        except (OSError, ValueError, KeyError, TypeError) as err:
            sonic_logger.log_warning("Skipping unreadable reboot cause file {}: {}".format(path, err))
            continue

        for field, value in values:
            state_db.set(state_db.STATE_DB, _hash, field, value)

    # Remove everything older than the newest max_entries entries
    for path in history_files[max_entries:]:
        if not os.path.isfile(path):
            # os.remove() cannot delete directories; leave them in place.
            continue
        try:
            os.remove(path)
        except OSError as err:
            sonic_logger.log_warning("Failed to remove old reboot cause file {}: {}".format(path, err))


def main():
    """Log the most recent reboot cause and publish the reboot history to STATE_DB.

    Exits with an error message when not run as root. The previous reboot
    cause is read from PREVIOUS_REBOOT_CAUSE_FILE when it exists; it stays
    REBOOT_CAUSE_UNKNOWN when the file is absent or cannot be parsed. The
    history directory, when present, is then processed by
    read_reboot_cause_files_and_save_state_db().
    """
    # Configure logger to log all messages INFO level and higher
    sonic_logger.set_min_log_priority_info()

    sonic_logger.log_info("Starting up...")

    if os.geteuid() != 0:
        sonic_logger.log_error("User {} does not have permission to execute".format(pwd.getpwuid(os.getuid()).pw_name))
        sys.exit("This utility must be run as root")

    # Set a default previous reboot cause
    previous_reboot_cause = REBOOT_CAUSE_UNKNOWN

    # Read the most recent reboot cause file and log data to syslog
    if os.path.exists(PREVIOUS_REBOOT_CAUSE_FILE):
        try:
            with open(PREVIOUS_REBOOT_CAUSE_FILE, "r") as last_cause_file:
                data = json.load(last_cause_file)
            user = data['user']
            # determine-reboot-cause stores the placeholder "N/A" when the
            # reboot was not user-issued; "N/A" is truthy, so it has to be
            # excluded explicitly or every cause is logged as "User issued".
            if user and user != "N/A":
                previous_reboot_cause = USER_ISSUED_REBOOT_CAUSE_REGEX.format(data['cause'], user, data['time'])
            else:
                previous_reboot_cause = "{}".format(data['cause'])
        except (OSError, ValueError, KeyError, TypeError) as err:
            # A damaged file must not stop the history from being published.
            sonic_logger.log_warning("Unable to parse {}: {}".format(PREVIOUS_REBOOT_CAUSE_FILE, err))

    # Log the last reboot cause to the syslog
    sonic_logger.log_info("Previous reboot cause: {}".format(previous_reboot_cause))

    if os.path.exists(REBOOT_CAUSE_HISTORY_DIR):
        # Read the saved reboot-cause files and save up to 10 previous reboot cause entries to the state db
        read_reboot_cause_files_and_save_state_db()


if __name__ == "__main__":
    main()
2 changes: 2 additions & 0 deletions src/sonic-host-services/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
'scripts/caclmgrd',
'scripts/hostcfgd',
'scripts/procdockerstatsd',
'scripts/determine-reboot-cause',
'scripts/process-reboot-cause',
],
install_requires = [
'Jinja2>=2.10',
Expand Down
Loading

0 comments on commit dbaf2c0

Please sign in to comment.