Skip to content

Commit

Permalink
adding fail hard to the main work loop
Browse files Browse the repository at this point in the history
  • Loading branch information
Naor Livne committed Jan 15, 2019
1 parent 89881f2 commit 453941e
Showing 1 changed file with 134 additions and 126 deletions.
260 changes: 134 additions & 126 deletions worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,136 +189,144 @@ def get_device_group_info(nebula_connection_object, device_group_to_get_info):

if __name__ == "__main__":

# read config file and config envvars at startup, order preference is envvar>config file>default value (if exists)
if os.path.exists("config/conf.json"):
print("reading config file")
auth_file = json.load(open("config/conf.json"))
else:
print("config file not found - skipping reading it and checking if needed params are given from envvars")
auth_file = {}
print("reading config variables")
nebula_manager_auth_user = get_conf_setting("nebula_manager_auth_user", auth_file, None)
nebula_manager_auth_password = get_conf_setting("nebula_manager_auth_password", auth_file, None)
nebula_manager_host = get_conf_setting("nebula_manager_host", auth_file, "127.0.0.1")
nebula_manager_port = int(get_conf_setting("nebula_manager_port", auth_file, "80"))
nebula_manager_protocol = get_conf_setting("nebula_manager_protocol", auth_file, "http")
nebula_manager_request_timeout = int(get_conf_setting("nebula_manager_request_timeout", auth_file, "60"))
nebula_manager_check_in_time = int(get_conf_setting("nebula_manager_check_in_time", auth_file, "30"))
registry_auth_user = get_conf_setting("registry_auth_user", auth_file, None)
registry_auth_password = get_conf_setting("registry_auth_password", auth_file, None)
registry_host = get_conf_setting("registry_host", auth_file, "https://index.docker.io/v1/")
max_restart_wait_in_seconds = int(get_conf_setting("max_restart_wait_in_seconds", auth_file, 0))
device_group = get_conf_setting("device_group", auth_file)

# get number of cpu cores on host
cpu_cores = get_number_of_cpu_cores()

# work against docker socket
docker_socket = DockerFunctions()

# ensure default "nebula" named network exists
docker_socket.create_docker_network("nebula", "bridge")

# login to the docker registry - if no registry login details are configured will just print a message stating that
docker_socket.registry_login(registry_host=registry_host, registry_user=registry_auth_user,
registry_pass=registry_auth_password)

# login to the nebula manager
nebula_connection = Nebula(username=nebula_manager_auth_user, password=nebula_manager_auth_password,
host=nebula_manager_host, port=nebula_manager_port, protocol=nebula_manager_protocol,
request_timeout=nebula_manager_request_timeout)

# make sure the nebula manager connects properly
try:
print("checking nebula manager connection")
api_check = nebula_connection.check_api()
if api_check["status_code"] == 200 and api_check["reply"]["api_available"] is True:
print("nebula manager connection ok")
# read config file/envvars at startup, order preference is envvar>config file>default value (if exists)
if os.path.exists("config/conf.json"):
print("reading config file")
auth_file = json.load(open("config/conf.json"))
else:
print("nebula manager initial connection check failure, dropping container")
except Exception as e:
print >> sys.stderr, e
print("error confirming connection to nebula manager - please check connection & authentication params and "
"that the manager is online")
os._exit(2)

# stop all nebula managed containers on start to ensure a clean slate to work on
print("stopping all preexisting nebula manager containers in order to ensure a clean slate on boot")
stop_containers({"app_name": ""})
print("config file not found - skipping reading it and checking if needed params are given from envvars")
auth_file = {}
print("reading config variables")
nebula_manager_auth_user = get_conf_setting("nebula_manager_auth_user", auth_file, None)
nebula_manager_auth_password = get_conf_setting("nebula_manager_auth_password", auth_file, None)
nebula_manager_host = get_conf_setting("nebula_manager_host", auth_file, "127.0.0.1")
nebula_manager_port = int(get_conf_setting("nebula_manager_port", auth_file, "80"))
nebula_manager_protocol = get_conf_setting("nebula_manager_protocol", auth_file, "http")
nebula_manager_request_timeout = int(get_conf_setting("nebula_manager_request_timeout", auth_file, "60"))
nebula_manager_check_in_time = int(get_conf_setting("nebula_manager_check_in_time", auth_file, "30"))
registry_auth_user = get_conf_setting("registry_auth_user", auth_file, None)
registry_auth_password = get_conf_setting("registry_auth_password", auth_file, None)
registry_host = get_conf_setting("registry_host", auth_file, "https://index.docker.io/v1/")
max_restart_wait_in_seconds = int(get_conf_setting("max_restart_wait_in_seconds", auth_file, 0))
device_group = get_conf_setting("device_group", auth_file)

# get number of cpu cores on host
cpu_cores = get_number_of_cpu_cores()

# work against docker socket
docker_socket = DockerFunctions()

# ensure default "nebula" named network exists
docker_socket.create_docker_network("nebula", "bridge")

# login to the docker registry - if no registry login details are configured will just print a message stating
# that
docker_socket.registry_login(registry_host=registry_host, registry_user=registry_auth_user,
registry_pass=registry_auth_password)

# login to the nebula manager
nebula_connection = Nebula(username=nebula_manager_auth_user, password=nebula_manager_auth_password,
host=nebula_manager_host, port=nebula_manager_port, protocol=nebula_manager_protocol,
request_timeout=nebula_manager_request_timeout)

# make sure the nebula manager connects properly
try:
print("checking nebula manager connection")
api_check = nebula_connection.check_api()
if api_check["status_code"] == 200 and api_check["reply"]["api_available"] is True:
print("nebula manager connection ok")
else:
print("nebula manager initial connection check failure, dropping container")
os._exit(2)
except Exception as e:
print >> sys.stderr, e
print("error confirming connection to nebula manager - please check connection & authentication params and "
"that the manager is online")
os._exit(2)

# stop all nebula managed containers on start to ensure a clean slate to work on
print("stopping all preexisting nebula manager containers in order to ensure a clean slate on boot")
stop_containers({"app_name": ""})

# get the initial device_group configuration and store it in memory
local_device_group_info = get_device_group_info(nebula_connection, device_group)

# get the initial device_group configuration and store it in memory
local_device_group_info = get_device_group_info(nebula_connection, device_group)
# make sure the device_group exists in the nebula cluster
while local_device_group_info["status_code"] == 403 and \
local_device_group_info["reply"]["device_group_exists"] is False:
print("device_group " + device_group + " doesn't exist in nebula cluster, waiting for it to be created")
local_device_group_info = get_device_group_info(nebula_connection, device_group)
time.sleep(nebula_manager_check_in_time)

# start all apps that are set to running on boot
for nebula_app in local_device_group_info["reply"]["apps"]:
if nebula_app["running"] is True:
print("initial start of " + nebula_app["app_name"] + " app")
start_containers(nebula_app)
print("completed initial start of " + nebula_app["app_name"] + " app")

# open a thread which is in charge of restarting any containers which healthcheck shows them as unhealthy
print("starting work container health checking thread")
Thread(target=restart_unhealthy_containers).start()

# loop forever
print("starting device_group " + device_group + " /info check loop, configured to check for changes every "
+ str(nebula_manager_check_in_time) + " seconds")
while True:

# make sure the device_group exists in the nebula cluster
while local_device_group_info["status_code"] == 403 and \
local_device_group_info["reply"]["device_group_exists"] is False:
print("device_group " + device_group + " doesn't exist in nebula cluster, waiting for it to be created")
local_device_group_info = get_device_group_info(nebula_connection, device_group)
time.sleep(nebula_manager_check_in_time)

# start all apps that are set to running on boot
for nebula_app in local_device_group_info["reply"]["apps"]:
if nebula_app["running"] is True:
print("initial start of " + nebula_app["app_name"] + " app")
start_containers(nebula_app)
print("completed initial start of " + nebula_app["app_name"] + " app")

# open a thread which is in charge of restarting any containers which healthcheck shows them as unhealthy
print("starting work container health checking thread")
Thread(target=restart_unhealthy_containers).start()

# loop forever
print("starting device_group " + device_group + " /info check loop, configured to check for changes every "
+ str(nebula_manager_check_in_time) + " seconds")
while True:

# wait the configurable time before checking the device_group info page again
time.sleep(nebula_manager_check_in_time)

monotonic_id_increase = False

# get the device_group configuration
remote_device_group_info = get_device_group_info(nebula_connection, device_group)

# logic that checks if the each app_id was increased and updates the app containers if the answer is yes
# the logic also starts containers of newly added apps to the device_group
for remote_nebula_app in remote_device_group_info["reply"]["apps"]:
if remote_nebula_app["app_name"] in local_device_group_info["reply"]["apps_list"]:
local_app_index = local_device_group_info["reply"]["apps_list"].index(remote_nebula_app["app_name"])
if remote_nebula_app["app_id"] > local_device_group_info["reply"]["apps"][local_app_index]["app_id"]:
# wait the configurable time before checking the device_group info page again
time.sleep(nebula_manager_check_in_time)

monotonic_id_increase = False

# get the device_group configuration
remote_device_group_info = get_device_group_info(nebula_connection, device_group)

# logic that checks if the each app_id was increased and updates the app containers if the answer is yes
# the logic also starts containers of newly added apps to the device_group
for remote_nebula_app in remote_device_group_info["reply"]["apps"]:
if remote_nebula_app["app_name"] in local_device_group_info["reply"]["apps_list"]:
local_app_index = local_device_group_info["reply"]["apps_list"].index(remote_nebula_app["app_name"])
if remote_nebula_app["app_id"] > local_device_group_info["reply"]["apps"][local_app_index]["app_id"]:
monotonic_id_increase = True
if remote_nebula_app["running"] is False:
print("stopping app " + remote_nebula_app["app_name"] +
" do to changes in the app configuration")
stop_containers(remote_nebula_app)
elif remote_nebula_app["rolling_restart"] is True and \
local_device_group_info["reply"]["apps"][local_app_index]["running"] is True:
print("rolling app " + remote_nebula_app["app_name"] +
" do to changes in the app configuration")
roll_containers(remote_nebula_app)
else:
print("restarting app " + remote_nebula_app["app_name"] +
" do to changes in the app configuration")
restart_containers(remote_nebula_app)
else:
print("restarting app " + remote_nebula_app["app_name"] + " do to changes in the app configuration")
monotonic_id_increase = True
if remote_nebula_app["running"] is False:
print("stopping app " + remote_nebula_app["app_name"] +
" do to changes in the app configuration")
stop_containers(remote_nebula_app)
elif remote_nebula_app["rolling_restart"] is True and \
local_device_group_info["reply"]["apps"][local_app_index]["running"] is True:
print("rolling app " + remote_nebula_app["app_name"] +
" do to changes in the app configuration")
roll_containers(remote_nebula_app)
else:
print("restarting app " + remote_nebula_app["app_name"] +
restart_containers(remote_nebula_app)

# logic that removes containers of apps that was removed from the device_group
if remote_device_group_info["reply"]["device_group_id"] > local_device_group_info["reply"]["device_group_id"]:
monotonic_id_increase = True
for local_nebula_app in local_device_group_info["reply"]["apps"]:
if local_nebula_app["app_name"] not in remote_device_group_info["reply"]["apps_list"]:
print("removing app " + local_nebula_app["app_name"] +
" do to changes in the app configuration")
restart_containers(remote_nebula_app)
else:
print("restarting app " + remote_nebula_app["app_name"] + " do to changes in the app configuration")
stop_containers(local_nebula_app)

# logic that runs image pruning if prune_id increased
if remote_device_group_info["reply"]["prune_id"] > local_device_group_info["reply"]["prune_id"]:
print("pruning images do to changes in the app configuration")
monotonic_id_increase = True
restart_containers(remote_nebula_app)

# logic that removes containers of apps that was removed from the device_group
if remote_device_group_info["reply"]["device_group_id"] > local_device_group_info["reply"]["device_group_id"]:
monotonic_id_increase = True
for local_nebula_app in local_device_group_info["reply"]["apps"]:
if local_nebula_app["app_name"] not in remote_device_group_info["reply"]["apps_list"]:
print("removing app " + local_nebula_app["app_name"] + " do to changes in the app configuration")
stop_containers(local_nebula_app)

# logic that runs image pruning if prune_id increased
if remote_device_group_info["reply"]["prune_id"] > local_device_group_info["reply"]["prune_id"]:
print("pruning images do to changes in the app configuration")
monotonic_id_increase = True
prune_images()

# set the in memory device_group info to be the one recently received if any id increased
if monotonic_id_increase is True:
local_device_group_info = remote_device_group_info
prune_images()

# set the in memory device_group info to be the one recently received if any id increased
if monotonic_id_increase is True:
local_device_group_info = remote_device_group_info
except Exception as e:
print >> sys.stderr, e
print("failed main loop - exiting")
os._exit(2)

0 comments on commit 453941e

Please sign in to comment.