Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Monit] Monitoring the running status of containers. #6251

Merged
merged 18 commits into from
Jan 8, 2021
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
2d963ee
[Monit] Use Monit to monitor the running status of each container and
yozhao101 Dec 18, 2020
75b924f
[Monit] Remove trailing space.
yozhao101 Dec 18, 2020
88b7e0f
[Monit] Use python3 to run this script and change the script name to
yozhao101 Jan 4, 2021
a4aa60c
[Monit] Use the updated script name in Monit configuration file.
yozhao101 Jan 4, 2021
7285539
[Monit] Organize the 'import's alphabetically.
yozhao101 Jan 4, 2021
3b461fb
[Monit] Simplify the logic to generate the alerting message.
yozhao101 Jan 5, 2021
5db9a8e
[Monit] Delete an extra blank line and add the 'import' statement.
yozhao101 Jan 5, 2021
b895ced
[Monit] Declare the logger to be a global variable.
yozhao101 Jan 5, 2021
a9df984
[Monit] Use 'print(...)' instead of Logger class to write the alerting
yozhao101 Jan 5, 2021
fe21185
[Monit] Add docstring for this script.
yozhao101 Jan 5, 2021
90e5e5e
[Monit] Add the missing keywords of Monit example in docstring.
yozhao101 Jan 5, 2021
8386802
[Monit] Add blank lines above and below the docstring.
yozhao101 Jan 5, 2021
e63ad78
[Monit] Fix the typo and write an log message showing unexpected running
yozhao101 Jan 5, 2021
d256748
[Monit] Delete the extra trailing space.
yozhao101 Jan 5, 2021
3175f1c
[Monit] Simplify the logic to get current running containers.
yozhao101 Jan 6, 2021
3eae57e
[Monit] Use .rstrip() to remove "\n" before doing the split.
yozhao101 Jan 6, 2021
0ba3ea6
[Monit] Skip the header line when parsing the command output.
yozhao101 Jan 6, 2021
623001d
[Monit] Chnage the syslog message for containers which were expected to
yozhao101 Jan 7, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions files/build_templates/sonic_debian_extension.j2
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,9 @@ sudo cp $IMAGE_CONFIGS/monit/conf.d/* $FILESYSTEM_ROOT/etc/monit/conf.d/
sudo chmod 600 $FILESYSTEM_ROOT/etc/monit/conf.d/*
sudo cp $IMAGE_CONFIGS/monit/process_checker $FILESYSTEM_ROOT/usr/bin/
sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/process_checker
sudo cp $IMAGE_CONFIGS/monit/container_checker $FILESYSTEM_ROOT/usr/bin/
sudo chmod 755 $FILESYSTEM_ROOT/usr/bin/container_checker


# Install custom-built openssh sshd
sudo dpkg --root=$FILESYSTEM_ROOT -i $debs_path/openssh-server_*.deb
Expand Down
2 changes: 2 additions & 0 deletions files/image_config/monit/conf.d/sonic-host
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,5 @@ check program routeCheck with path "/usr/local/bin/route_check.py"
every 5 cycles
if status != 0 for 3 cycle then alert repeat every 1 cycles

check program container_checker with path "/usr/bin/container_checker"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
116 changes: 116 additions & 0 deletions files/image_config/monit/container_checker
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#!/usr/bin/env python3

jleveque marked this conversation as resolved.
Show resolved Hide resolved
"""
container_checker

This script is intended to be run by Monit. It writes an alerting message into
syslog if it finds containers which are expected to run but are not running. It
also writes an alerting syslog message if it finds containers which are running
but are not expected to run. Note that any string written by a print(...) statement
in this script is appended to the Monit syslog messages.

The following is an example in Monit configuration file to show how Monit will run
this script:

check program container_checker with path "/usr/bin/container_checker"
if status != 0 for 5 times within 5 cycles then alert repeat every 1 cycles
"""

import subprocess
import sys

import swsssdk
from sonic_py_common import multi_asic


def get_command_result(command):
    """
    @summary: Run a command through the shell and collect its standard output.
              If the command cannot be launched, a message is printed and the
              script exits with code 2. If the command finishes with a non-zero
              return code, a message is printed and the script exits with code 1.
    @param command: The command line to execute, given as a single string.
    @return: A list of strings obtained by stripping trailing whitespace from the
             command's standard output and splitting it on newlines. Empty output
             therefore yields a list holding one empty string.
    """
    try:
        process = subprocess.Popen(command, shell=True, universal_newlines=True,
                                   stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # Standard error is captured as well, but its content is not used.
        output, _ = process.communicate()
    except (OSError, ValueError) as err:
        print("Failed to execute the command '{}'. Error: '{}'".format(command, err))
        sys.exit(2)

    if process.returncode != 0:
        print("Failed to execute the command '{}'. Return code: '{}'".format(
            command, process.returncode))
        sys.exit(1)

    return output.rstrip().split("\n")


def get_expected_running_containers():
    """
    @summary: Build the set of containers which are expected to be running, based
              on the 'FEATURE' table read from the configuration database.
              Entries whose 'state' field is 'disabled' are skipped. On a
              single-ASIC device every remaining entry contributes its own name.
              On a Multi-ASIC device an entry contributes its own name if its
              'has_global_scope' field is 'True', and one name per ASIC (the
              entry name followed by the ASIC index) if its 'has_per_asic_scope'
              field is 'True'.
    @return: A set which contains the names of the expected running containers.
    """
    db = swsssdk.ConfigDBConnector()
    db.connect()

    expected = set()
    for name, attributes in db.get_table("FEATURE").items():
        if attributes["state"] == "disabled":
            continue

        if not multi_asic.is_multi_asic():
            expected.add(name)
            continue

        if attributes["has_global_scope"] == "True":
            expected.add(name)
        if attributes["has_per_asic_scope"] == "True":
            asic_count = multi_asic.get_num_asics()
            expected.update(name + str(asic_id) for asic_id in range(asic_count))

    return expected


def get_current_running_containers():
    """
    @summary: Determine which containers are currently running by parsing the
              output of the command `docker ps`.
    @return: A set which contains the names of the current running containers.
    """
    output_lines = get_command_result("docker ps")

    # The first line is the column header. On every following line the last
    # whitespace-separated field is taken as the container name.
    return {line.split()[-1] for line in output_lines[1:]}


def main():
    """
    @summary: Compare the containers which are currently running against the
              containers which are expected to run, and print an alerting message
              for the first mismatch found.
              If expected containers are not running, their names are printed and
              the script exits with code 3. Otherwise, if containers are running
              which were not expected to, their names are printed and the script
              exits with code 4. When both conditions hold, only the first one is
              reported because the script exits right after printing it.
              If neither condition holds the function returns normally.
    """
    expected_running_containers = get_expected_running_containers()
    current_running_containers = get_current_running_containers()

    not_running_containers = expected_running_containers.difference(current_running_containers)
    if not_running_containers:
        # Sort the names so that the message text is the same on every run;
        # iteration order of a set of strings is not stable across processes.
        print("Containers not running: " + ", ".join(sorted(not_running_containers)))
        sys.exit(3)

    unexpected_running_containers = current_running_containers.difference(expected_running_containers)
    if unexpected_running_containers:
        print("Unexpected running containers: " + ", ".join(sorted(unexpected_running_containers)))
        sys.exit(4)
abdosi marked this conversation as resolved.
Show resolved Hide resolved


if __name__ == "__main__":
    # Perform the check only when this file is executed as a script,
    # not when it is imported as a module.
    main()