adding fail hard to the main work loop

nebula-orchestrator · Jan 15, 2019 · 453941e · 453941e
1 parent 89881f2
commit 453941e
Showing 1 changed file with 134 additions and 126 deletions.
diff --git a/worker.py b/worker.py
@@ -189,136 +189,144 @@ def get_device_group_info(nebula_connection_object, device_group_to_get_info):
 
 if __name__ == "__main__":
 
-    # read config file and config envvars at startup, order preference is envvar>config file>default value (if exists)
-    if os.path.exists("config/conf.json"):
-        print("reading config file")
-        auth_file = json.load(open("config/conf.json"))
-    else:
-        print("config file not found - skipping reading it and checking if needed params are given from envvars")
-        auth_file = {}
-    print("reading config variables")
-    nebula_manager_auth_user = get_conf_setting("nebula_manager_auth_user", auth_file, None)
-    nebula_manager_auth_password = get_conf_setting("nebula_manager_auth_password", auth_file, None)
-    nebula_manager_host = get_conf_setting("nebula_manager_host", auth_file, "127.0.0.1")
-    nebula_manager_port = int(get_conf_setting("nebula_manager_port", auth_file, "80"))
-    nebula_manager_protocol = get_conf_setting("nebula_manager_protocol", auth_file, "http")
-    nebula_manager_request_timeout = int(get_conf_setting("nebula_manager_request_timeout", auth_file, "60"))
-    nebula_manager_check_in_time = int(get_conf_setting("nebula_manager_check_in_time", auth_file, "30"))
-    registry_auth_user = get_conf_setting("registry_auth_user", auth_file, None)
-    registry_auth_password = get_conf_setting("registry_auth_password", auth_file, None)
-    registry_host = get_conf_setting("registry_host", auth_file, "https://index.docker.io/v1/")
-    max_restart_wait_in_seconds = int(get_conf_setting("max_restart_wait_in_seconds", auth_file, 0))
-    device_group = get_conf_setting("device_group", auth_file)
-
-    # get number of cpu cores on host
-    cpu_cores = get_number_of_cpu_cores()
-
-    # work against docker socket
-    docker_socket = DockerFunctions()
-
-    # ensure default "nebula" named network exists
-    docker_socket.create_docker_network("nebula", "bridge")
-
-    # login to the docker registry - if no registry login details are configured will just print a message stating that
-    docker_socket.registry_login(registry_host=registry_host, registry_user=registry_auth_user,
-                                 registry_pass=registry_auth_password)
-
-    # login to the nebula manager
-    nebula_connection = Nebula(username=nebula_manager_auth_user, password=nebula_manager_auth_password,
-                               host=nebula_manager_host, port=nebula_manager_port, protocol=nebula_manager_protocol,
-                               request_timeout=nebula_manager_request_timeout)
-
-    # make sure the nebula manager connects properly
     try:
-        print("checking nebula manager connection")
-        api_check = nebula_connection.check_api()
-        if api_check["status_code"] == 200 and api_check["reply"]["api_available"] is True:
-            print("nebula manager connection ok")
+        # read config file/envvars at startup, order preference is envvar>config file>default value (if exists)
+        if os.path.exists("config/conf.json"):
+            print("reading config file")
+            auth_file = json.load(open("config/conf.json"))
         else:
-            print("nebula manager initial connection check failure, dropping container")
-    except Exception as e:
-        print >> sys.stderr, e
-        print("error confirming connection to nebula manager - please check connection & authentication params and "
-              "that the manager is online")
-        os._exit(2)
-
-    # stop all nebula managed containers on start to ensure a clean slate to work on
-    print("stopping all preexisting nebula manager containers in order to ensure a clean slate on boot")
-    stop_containers({"app_name": ""})
+            print("config file not found - skipping reading it and checking if needed params are given from envvars")
+            auth_file = {}
+        print("reading config variables")
+        nebula_manager_auth_user = get_conf_setting("nebula_manager_auth_user", auth_file, None)
+        nebula_manager_auth_password = get_conf_setting("nebula_manager_auth_password", auth_file, None)
+        nebula_manager_host = get_conf_setting("nebula_manager_host", auth_file, "127.0.0.1")
+        nebula_manager_port = int(get_conf_setting("nebula_manager_port", auth_file, "80"))
+        nebula_manager_protocol = get_conf_setting("nebula_manager_protocol", auth_file, "http")
+        nebula_manager_request_timeout = int(get_conf_setting("nebula_manager_request_timeout", auth_file, "60"))
+        nebula_manager_check_in_time = int(get_conf_setting("nebula_manager_check_in_time", auth_file, "30"))
+        registry_auth_user = get_conf_setting("registry_auth_user", auth_file, None)
+        registry_auth_password = get_conf_setting("registry_auth_password", auth_file, None)
+        registry_host = get_conf_setting("registry_host", auth_file, "https://index.docker.io/v1/")
+        max_restart_wait_in_seconds = int(get_conf_setting("max_restart_wait_in_seconds", auth_file, 0))
+        device_group = get_conf_setting("device_group", auth_file)
+
+        # get number of cpu cores on host
+        cpu_cores = get_number_of_cpu_cores()
+
+        # work against docker socket
+        docker_socket = DockerFunctions()
+
+        # ensure default "nebula" named network exists
+        docker_socket.create_docker_network("nebula", "bridge")
+
+        # login to the docker registry - if no registry login details are configured will just print a message stating
+        # that
+        docker_socket.registry_login(registry_host=registry_host, registry_user=registry_auth_user,
+                                     registry_pass=registry_auth_password)
+
+        # login to the nebula manager
+        nebula_connection = Nebula(username=nebula_manager_auth_user, password=nebula_manager_auth_password,
+                                   host=nebula_manager_host, port=nebula_manager_port, protocol=nebula_manager_protocol,
+                                   request_timeout=nebula_manager_request_timeout)
+
+        # make sure the nebula manager connects properly
+        try:
+            print("checking nebula manager connection")
+            api_check = nebula_connection.check_api()
+            if api_check["status_code"] == 200 and api_check["reply"]["api_available"] is True:
+                print("nebula manager connection ok")
+            else:
+                print("nebula manager initial connection check failure, dropping container")
+                os._exit(2)
+        except Exception as e:
+            print >> sys.stderr, e
+            print("error confirming connection to nebula manager - please check connection & authentication params and "
+                  "that the manager is online")
+            os._exit(2)
+
+        # stop all nebula managed containers on start to ensure a clean slate to work on
+        print("stopping all preexisting nebula manager containers in order to ensure a clean slate on boot")
+        stop_containers({"app_name": ""})
+
+        # get the initial device_group configuration and store it in memory
+        local_device_group_info = get_device_group_info(nebula_connection, device_group)
 
-    # get the initial device_group configuration and store it in memory
-    local_device_group_info = get_device_group_info(nebula_connection, device_group)
+        # make sure the device_group exists in the nebula cluster
+        while local_device_group_info["status_code"] == 403 and \
+                local_device_group_info["reply"]["device_group_exists"] is False:
+            print("device_group " + device_group + " doesn't exist in nebula cluster, waiting for it to be created")
+            local_device_group_info = get_device_group_info(nebula_connection, device_group)
+            time.sleep(nebula_manager_check_in_time)
+
+        # start all apps that are set to running on boot
+        for nebula_app in local_device_group_info["reply"]["apps"]:
+            if nebula_app["running"] is True:
+                print("initial start of " + nebula_app["app_name"] + " app")
+                start_containers(nebula_app)
+                print("completed initial start of " + nebula_app["app_name"] + " app")
+
+        # open a thread which is in charge of restarting any containers which healthcheck shows them as unhealthy
+        print("starting work container health checking thread")
+        Thread(target=restart_unhealthy_containers).start()
+
+        # loop forever
+        print("starting device_group " + device_group + " /info check loop, configured to check for changes every "
+              + str(nebula_manager_check_in_time) + " seconds")
+        while True:
 
-    # make sure the device_group exists in the nebula cluster
-    while local_device_group_info["status_code"] == 403 and \
-            local_device_group_info["reply"]["device_group_exists"] is False:
-        print("device_group " + device_group + " doesn't exist in nebula cluster, waiting for it to be created")
-        local_device_group_info = get_device_group_info(nebula_connection, device_group)
-        time.sleep(nebula_manager_check_in_time)
-
-    # start all apps that are set to running on boot
-    for nebula_app in local_device_group_info["reply"]["apps"]:
-        if nebula_app["running"] is True:
-            print("initial start of " + nebula_app["app_name"] + " app")
-            start_containers(nebula_app)
-            print("completed initial start of " + nebula_app["app_name"] + " app")
-
-    # open a thread which is in charge of restarting any containers which healthcheck shows them as unhealthy
-    print("starting work container health checking thread")
-    Thread(target=restart_unhealthy_containers).start()
-
-    # loop forever
-    print("starting device_group " + device_group + " /info check loop, configured to check for changes every "
-          + str(nebula_manager_check_in_time) + " seconds")
-    while True:
-
-        # wait the configurable time before checking the device_group info page again
-        time.sleep(nebula_manager_check_in_time)
-
-        monotonic_id_increase = False
-
-        # get the device_group configuration
-        remote_device_group_info = get_device_group_info(nebula_connection, device_group)
-
-        # logic that checks if the each app_id was increased and updates the app containers if the answer is yes
-        # the logic also starts containers of newly added apps to the device_group
-        for remote_nebula_app in remote_device_group_info["reply"]["apps"]:
-            if remote_nebula_app["app_name"] in local_device_group_info["reply"]["apps_list"]:
-                local_app_index = local_device_group_info["reply"]["apps_list"].index(remote_nebula_app["app_name"])
-                if remote_nebula_app["app_id"] > local_device_group_info["reply"]["apps"][local_app_index]["app_id"]:
+            # wait the configurable time before checking the device_group info page again
+            time.sleep(nebula_manager_check_in_time)
+
+            monotonic_id_increase = False
+
+            # get the device_group configuration
+            remote_device_group_info = get_device_group_info(nebula_connection, device_group)
+
+            # logic that checks if the each app_id was increased and updates the app containers if the answer is yes
+            # the logic also starts containers of newly added apps to the device_group
+            for remote_nebula_app in remote_device_group_info["reply"]["apps"]:
+                if remote_nebula_app["app_name"] in local_device_group_info["reply"]["apps_list"]:
+                    local_app_index = local_device_group_info["reply"]["apps_list"].index(remote_nebula_app["app_name"])
+                    if remote_nebula_app["app_id"] > local_device_group_info["reply"]["apps"][local_app_index]["app_id"]:
+                        monotonic_id_increase = True
+                        if remote_nebula_app["running"] is False:
+                            print("stopping app " + remote_nebula_app["app_name"] +
+                                  " do to changes in the app configuration")
+                            stop_containers(remote_nebula_app)
+                        elif remote_nebula_app["rolling_restart"] is True and \
+                                local_device_group_info["reply"]["apps"][local_app_index]["running"] is True:
+                            print("rolling app " + remote_nebula_app["app_name"] +
+                                  " do to changes in the app configuration")
+                            roll_containers(remote_nebula_app)
+                        else:
+                            print("restarting app " + remote_nebula_app["app_name"] +
+                                  " do to changes in the app configuration")
+                            restart_containers(remote_nebula_app)
+                else:
+                    print("restarting app " + remote_nebula_app["app_name"] + " do to changes in the app configuration")
                     monotonic_id_increase = True
-                    if remote_nebula_app["running"] is False:
-                        print("stopping app " + remote_nebula_app["app_name"] +
-                              " do to changes in the app configuration")
-                        stop_containers(remote_nebula_app)
-                    elif remote_nebula_app["rolling_restart"] is True and \
-                            local_device_group_info["reply"]["apps"][local_app_index]["running"] is True:
-                        print("rolling app " + remote_nebula_app["app_name"] +
-                              " do to changes in the app configuration")
-                        roll_containers(remote_nebula_app)
-                    else:
-                        print("restarting app " + remote_nebula_app["app_name"] +
+                    restart_containers(remote_nebula_app)
+
+            # logic that removes containers of apps that was removed from the device_group
+            if remote_device_group_info["reply"]["device_group_id"] > local_device_group_info["reply"]["device_group_id"]:
+                monotonic_id_increase = True
+                for local_nebula_app in local_device_group_info["reply"]["apps"]:
+                    if local_nebula_app["app_name"] not in remote_device_group_info["reply"]["apps_list"]:
+                        print("removing app " + local_nebula_app["app_name"] +
                               " do to changes in the app configuration")
-                        restart_containers(remote_nebula_app)
-            else:
-                print("restarting app " + remote_nebula_app["app_name"] + " do to changes in the app configuration")
+                        stop_containers(local_nebula_app)
+
+            # logic that runs image pruning if prune_id increased
+            if remote_device_group_info["reply"]["prune_id"] > local_device_group_info["reply"]["prune_id"]:
+                print("pruning images do to changes in the app configuration")
                 monotonic_id_increase = True
-                restart_containers(remote_nebula_app)
-
-        # logic that removes containers of apps that was removed from the device_group
-        if remote_device_group_info["reply"]["device_group_id"] > local_device_group_info["reply"]["device_group_id"]:
-            monotonic_id_increase = True
-            for local_nebula_app in local_device_group_info["reply"]["apps"]:
-                if local_nebula_app["app_name"] not in remote_device_group_info["reply"]["apps_list"]:
-                    print("removing app " + local_nebula_app["app_name"] + " do to changes in the app configuration")
-                    stop_containers(local_nebula_app)
-
-        # logic that runs image pruning if prune_id increased
-        if remote_device_group_info["reply"]["prune_id"] > local_device_group_info["reply"]["prune_id"]:
-            print("pruning images do to changes in the app configuration")
-            monotonic_id_increase = True
-            prune_images()
-
-        # set the in memory device_group info to be the one recently received if any id increased
-        if monotonic_id_increase is True:
-            local_device_group_info = remote_device_group_info
+                prune_images()
+
+            # set the in memory device_group info to be the one recently received if any id increased
+            if monotonic_id_increase is True:
+                local_device_group_info = remote_device_group_info
+    except Exception as e:
+        print >> sys.stderr, e
+        print("failed main loop - exiting")
+        os._exit(2)