From ae80bab975dc6c4e117e55a22e5a7e8c81e777fe Mon Sep 17 00:00:00 2001 From: mssonicbld <79238446+mssonicbld@users.noreply.github.com> Date: Sun, 4 Dec 2022 23:04:45 +0800 Subject: [PATCH] Improve feature mode switch process (#12188) (#12919) * Fix kube mode to local mode long duration issue * Remove IPV6 parameters which is not necessary * Fix read node labels bug * Tag the running image to latest if it's stable * Disable image_version_higher check * Change image_version_higher checker test case Signed-off-by: Yun Li Signed-off-by: Yun Li Co-authored-by: lixiaoyuner <35456895+lixiaoyuner@users.noreply.github.com> --- src/sonic-ctrmgrd/.gitignore | 1 + src/sonic-ctrmgrd/ctrmgr/container | 25 +++++-- src/sonic-ctrmgrd/ctrmgr/container_startup.py | 16 ++-- src/sonic-ctrmgrd/ctrmgr/ctrmgrd.py | 75 +++++++++++++++++-- src/sonic-ctrmgrd/ctrmgr/kube_commands.py | 8 +- .../ctrmgr/remote_ctr.config.json | 1 + .../tests/container_startup_test.py | 2 +- src/sonic-ctrmgrd/tests/container_test.py | 5 ++ src/sonic-ctrmgrd/tests/ctrmgrd_test.py | 49 +++++++++++- src/sonic-ctrmgrd/tests/kube_commands_test.py | 34 ++------- 10 files changed, 165 insertions(+), 51 deletions(-) diff --git a/src/sonic-ctrmgrd/.gitignore b/src/sonic-ctrmgrd/.gitignore index bdebd5e838cd..2f42f4d2c5ea 100644 --- a/src/sonic-ctrmgrd/.gitignore +++ b/src/sonic-ctrmgrd/.gitignore @@ -10,3 +10,4 @@ tests/__pycache__/ ctrmgr/__pycache__/ venv tests/.coverage* +.pytest_cache/ \ No newline at end of file diff --git a/src/sonic-ctrmgrd/ctrmgr/container b/src/sonic-ctrmgrd/ctrmgr/container index db6ded635ee9..ca2394b057bb 100755 --- a/src/sonic-ctrmgrd/ctrmgr/container +++ b/src/sonic-ctrmgrd/ctrmgr/container @@ -30,6 +30,10 @@ STATE = "state" KUBE_LABEL_TABLE = "KUBE_LABELS" KUBE_LABEL_SET_KEY = "SET" +SERVER_TABLE = "KUBERNETES_MASTER" +SERVER_KEY = "SERVER" +ST_SER_CONNECTED = "connected" +ST_SER_UPDATE_TS = "update_time" # Get seconds to wait for remote docker to start. # If not, revert to local @@ -75,8 +79,10 @@ def read_data(is_config, feature, fields): ret = [] db = cfg_db if is_config else state_db - - tbl = swsscommon.Table(db, FEATURE_TABLE) + if feature == SERVER_KEY: + tbl = swsscommon.Table(db, SERVER_TABLE) + else: + tbl = swsscommon.Table(db, FEATURE_TABLE) data = dict(tbl.get(feature)[1]) for (field, default) in fields: @@ -104,6 +110,13 @@ def read_state(feature): [(CURRENT_OWNER, "none"), (REMOTE_STATE, "none"), (CONTAINER_ID, "")]) +def read_server_state(): + """ Read requried feature state """ + + return read_data(False, SERVER_KEY, + [(ST_SER_CONNECTED, "false"), (ST_SER_UPDATE_TS, "")]) + + def docker_action(action, feature, **kwargs): """ Execute docker action """ try: @@ -192,9 +205,10 @@ def container_start(feature, **kwargs): set_owner, fallback, _ = read_config(feature) _, remote_state, _ = read_state(feature) + server_connected, _ = read_server_state() - debug_msg("{}: set_owner:{} fallback:{} remote_state:{}".format( - feature, set_owner, fallback, remote_state)) + debug_msg("{}: set_owner:{} fallback:{} remote_state:{} server_connected:{}".format( + feature, set_owner, fallback, remote_state, server_connected)) data = { SYSTEM_STATE: "up", @@ -207,8 +221,9 @@ def container_start(feature, **kwargs): start_val = START_LOCAL else: start_val = START_KUBE - if fallback and (remote_state == "none"): + if fallback and (remote_state == "none" or server_connected == "false"): start_val |= START_LOCAL + data[REMOTE_STATE] = "none" if start_val == START_LOCAL: # Implies *only* local. diff --git a/src/sonic-ctrmgrd/ctrmgr/container_startup.py b/src/sonic-ctrmgrd/ctrmgr/container_startup.py index c56160aa488d..0d836fe02300 100755 --- a/src/sonic-ctrmgrd/ctrmgr/container_startup.py +++ b/src/sonic-ctrmgrd/ctrmgr/container_startup.py @@ -229,14 +229,14 @@ def container_up(feature, owner, version): do_freeze(feature, "This version is marked disabled. Exiting ...") return - if not instance_higher(feature, state_data[VERSION], version): - # TODO: May Remove label __enabled - # Else kubelet will continue to re-deploy every 5 mins, until - # master removes the lable to un-deploy. - # - do_freeze(feature, "bail out as current deploy version {} is not higher". - format(version)) - return + # if not instance_higher(feature, state_data[VERSION], version): + # # TODO: May Remove label __enabled + # # Else kubelet will continue to re-deploy every 5 mins, until + # # master removes the lable to un-deploy. + # # + # do_freeze(feature, "bail out as current deploy version {} is not higher". + # format(version)) + # return update_data(state_db, feature, { VERSION: version }) diff --git a/src/sonic-ctrmgrd/ctrmgr/ctrmgrd.py b/src/sonic-ctrmgrd/ctrmgr/ctrmgrd.py index 2defb6e45387..d53ca0b42e3a 100755 --- a/src/sonic-ctrmgrd/ctrmgr/ctrmgrd.py +++ b/src/sonic-ctrmgrd/ctrmgr/ctrmgrd.py @@ -60,7 +60,7 @@ CFG_SER_IP: "", CFG_SER_PORT: "6443", CFG_SER_DISABLE: "false", - CFG_SER_INSECURE: "false" + CFG_SER_INSECURE: "true" } dflt_st_ser = { @@ -88,18 +88,20 @@ JOIN_LATENCY = "join_latency_on_boot_seconds" JOIN_RETRY = "retry_join_interval_seconds" LABEL_RETRY = "retry_labels_update_seconds" +TAG_IMAGE_LATEST = "tag_latest_image_on_wait_seconds" USE_K8S_PROXY = "use_k8s_as_http_proxy" remote_ctr_config = { JOIN_LATENCY: 10, JOIN_RETRY: 10, LABEL_RETRY: 2, + TAG_IMAGE_LATEST: 30, USE_K8S_PROXY: "" } def log_debug(m): msg = "{}: {}".format(inspect.stack()[1][3], m) - print(msg) + #print(msg) syslog.syslog(syslog.LOG_DEBUG, msg) @@ -148,6 +150,8 @@ def init(): with open(SONIC_CTR_CONFIG, "r") as s: d = json.load(s) remote_ctr_config.update(d) + if UNIT_TESTING: + remote_ctr_config[TAG_IMAGE_LATEST] = 0 class MainServer: @@ -172,11 +176,11 @@ def register_db(self, db_name): self.db_connectors[db_name] = swsscommon.DBConnector(db_name, 0) - def register_timer(self, ts, handler): + def register_timer(self, ts, handler, args=()): """ Register timer based handler. The handler will be called on/after give timestamp, ts """ - self.timer_handlers[ts].append(handler) + self.timer_handlers[ts].append((handler, args)) def register_handler(self, db_name, table_name, handler): @@ -235,7 +239,7 @@ def run(self): lst = self.timer_handlers[k] del self.timer_handlers[k] for fn in lst: - fn() + fn[0](*fn[1]) else: timeout = (k - ct_ts).seconds break @@ -426,6 +430,54 @@ def do_join(self, ip, port, insecure): format(remote_ctr_config[JOIN_RETRY], self.start_time)) +def tag_latest_image(server, feat, docker_id, image_ver): + res = 1 + if not UNIT_TESTING: + status = os.system("docker ps |grep {} >/dev/null".format(docker_id)) + if status: + syslog.syslog(syslog.LOG_ERR, + "Feature {}:{} is not stable".format(feat, image_ver)) + else: + image_item = os.popen("docker inspect {} |jq -r .[].Image".format(docker_id)).read().strip() + if image_item: + image_id = image_item.split(":")[1][:12] + image_info = os.popen("docker images |grep {}".format(image_id)).read().split() + if image_info: + image_rep = image_info[0] + res = os.system("docker tag {} {}:latest".format(image_id, image_rep)) + if res != 0: + syslog.syslog(syslog.LOG_ERR, + "Failed to tag {}:{} to latest".format(image_rep, image_ver)) + else: + syslog.syslog(syslog.LOG_INFO, + "Successfully tag {}:{} to latest".format(image_rep, image_ver)) + feat_status = os.popen("docker inspect {} |jq -r .[].State.Running".format(feat)).read().strip() + if feat_status: + if feat_status == 'true': + os.system("docker stop {}".format(feat)) + syslog.syslog(syslog.LOG_ERR, + "{} should not run, stop it".format(feat)) + os.system("docker rm {}".format(feat)) + syslog.syslog(syslog.LOG_INFO, + "Delete previous {} container".format(feat)) + else: + syslog.syslog(syslog.LOG_ERR, + "Failed to docker images |grep {} to get image repo".format(image_id)) + else: + syslog.syslog(syslog.LOG_ERR, + "Failed to inspect container:{} to get image id".format(docker_id)) + else: + server.mod_db_entry(STATE_DB_NAME, + FEATURE_TABLE, feat, {"tag_latest": "true"}) + res = 0 + if res: + log_debug("failed to tag {}:{} to latest".format(feat, image_ver)) + else: + log_debug("successfully tag {}:{} to latest".format(feat, image_ver)) + + return res + + # # Feature changes # @@ -523,6 +575,19 @@ def on_state_update(self, key, op, data): self.st_data[key] = _update_entry(dflt_st_feat, data) remote_state = self.st_data[key][ST_FEAT_REMOTE_STATE] + if (old_remote_state != remote_state) and (remote_state == "running"): + # Tag latest + start_time = datetime.datetime.now() + datetime.timedelta( + seconds=remote_ctr_config[TAG_IMAGE_LATEST]) + self.server.register_timer(start_time, tag_latest_image, ( + self.server, + key, + self.st_data[key][ST_FEAT_CTR_ID], + self.st_data[key][ST_FEAT_CTR_VER])) + + log_debug("try to tag latest label after {} seconds @{}".format( + remote_ctr_config[TAG_IMAGE_LATEST], start_time)) + if (not init) and ( (old_remote_state == remote_state) or (remote_state != "pending")): # no change or nothing to do. diff --git a/src/sonic-ctrmgrd/ctrmgr/kube_commands.py b/src/sonic-ctrmgrd/ctrmgr/kube_commands.py index 3adea36ef12c..91415390ccd5 100755 --- a/src/sonic-ctrmgrd/ctrmgr/kube_commands.py +++ b/src/sonic-ctrmgrd/ctrmgr/kube_commands.py @@ -84,7 +84,7 @@ def _run_command(cmd, timeout=5): def kube_read_labels(): """ Read current labels on node and return as dict. """ - KUBECTL_GET_CMD = "kubectl --kubeconfig {} get nodes {} --show-labels |tr -s ' ' | cut -f6 -d' '" + KUBECTL_GET_CMD = "kubectl --kubeconfig {} get nodes {} --show-labels --no-headers |tr -s ' ' | cut -f6 -d' '" labels = {} ret, out, _ = _run_command(KUBECTL_GET_CMD.format( @@ -332,12 +332,12 @@ def _do_reset(pending_join = False): def _do_join(server, port, insecure): - KUBEADM_JOIN_CMD = "kubeadm join --discovery-file {} --node-name {} --apiserver-advertise-address {}" + KUBEADM_JOIN_CMD = "kubeadm join --discovery-file {} --node-name {}" err = "" out = "" ret = 0 try: - local_ipv6 = _get_local_ipv6() + #local_ipv6 = _get_local_ipv6() #_download_file(server, port, insecure) _gen_cli_kubeconf(server, port, insecure) _do_reset(True) @@ -349,7 +349,7 @@ def _do_join(server, port, insecure): if ret == 0: (ret, out, err) = _run_command(KUBEADM_JOIN_CMD.format( - KUBE_ADMIN_CONF, get_device_name(), local_ipv6), timeout=60) + KUBE_ADMIN_CONF, get_device_name()), timeout=60) log_debug("ret = {}".format(ret)) except IOError as e: diff --git a/src/sonic-ctrmgrd/ctrmgr/remote_ctr.config.json b/src/sonic-ctrmgrd/ctrmgr/remote_ctr.config.json index 3fb0c20fddcf..0b91fde36473 100644 --- a/src/sonic-ctrmgrd/ctrmgr/remote_ctr.config.json +++ b/src/sonic-ctrmgrd/ctrmgr/remote_ctr.config.json @@ -3,6 +3,7 @@ "retry_join_interval_seconds": 30, "retry_labels_update_seconds": 5, "revert_to_local_on_wait_seconds": 60, + "tag_latest_image_on_wait_seconds": 600, "use_k8s_as_http_proxy": "n" } diff --git a/src/sonic-ctrmgrd/tests/container_startup_test.py b/src/sonic-ctrmgrd/tests/container_startup_test.py index bffffaadca94..b21fe855662c 100755 --- a/src/sonic-ctrmgrd/tests/container_startup_test.py +++ b/src/sonic-ctrmgrd/tests/container_startup_test.py @@ -169,7 +169,7 @@ common_test.FEATURE_TABLE: { "snmp": { "container_id": "no_change", - "container_version": "20201230.77", + "container_version": "20201230.11", "current_owner": "no_change", "remote_state": "no_change", "system_state": "up" diff --git a/src/sonic-ctrmgrd/tests/container_test.py b/src/sonic-ctrmgrd/tests/container_test.py index 4738597c72c4..4581111015e5 100755 --- a/src/sonic-ctrmgrd/tests/container_test.py +++ b/src/sonic-ctrmgrd/tests/container_test.py @@ -125,6 +125,11 @@ "current_owner": "none", "container_id": "" } + }, + common_test.SERVER_TABLE: { + "SERVER": { + "connected": "true" + } } } }, diff --git a/src/sonic-ctrmgrd/tests/ctrmgrd_test.py b/src/sonic-ctrmgrd/tests/ctrmgrd_test.py index 171534b5a8d1..842b935396d1 100755 --- a/src/sonic-ctrmgrd/tests/ctrmgrd_test.py +++ b/src/sonic-ctrmgrd/tests/ctrmgrd_test.py @@ -106,7 +106,7 @@ common_test.KUBE_JOIN: { "ip": "10.10.10.10", "port": "6443", - "insecure": "false" + "insecure": "true" } } }, @@ -151,7 +151,7 @@ common_test.KUBE_JOIN: { "ip": "10.10.10.10", "port": "6443", - "insecure": "false" + "insecure": "true" }, common_test.KUBE_RESET: { "flag": "true" @@ -276,6 +276,51 @@ } } } + }, + 3: { + common_test.DESCR: "Tag image latest when remote_state changes to running", + common_test.ARGS: "ctrmgrd", + common_test.PRE: { + common_test.CONFIG_DB_NO: { + common_test.FEATURE_TABLE: { + "snmp": { + "set_owner": "kube" + } + } + }, + common_test.STATE_DB_NO: { + common_test.FEATURE_TABLE: { + "snmp": { + "remote_state": "pending" + } + } + } + }, + common_test.UPD: { + common_test.CONFIG_DB_NO: { + common_test.FEATURE_TABLE: { + "snmp": { + "set_owner": "kube" + } + } + }, + common_test.STATE_DB_NO: { + common_test.FEATURE_TABLE: { + "snmp": { + "remote_state": "running" + } + } + } + }, + common_test.POST: { + common_test.STATE_DB_NO: { + common_test.FEATURE_TABLE: { + "snmp": { + "tag_latest": "true" + } + } + } + } } } diff --git a/src/sonic-ctrmgrd/tests/kube_commands_test.py b/src/sonic-ctrmgrd/tests/kube_commands_test.py index d8e0939efb85..60da7fd2c073 100755 --- a/src/sonic-ctrmgrd/tests/kube_commands_test.py +++ b/src/sonic-ctrmgrd/tests/kube_commands_test.py @@ -27,7 +27,7 @@ common_test.DESCR: "read labels", common_test.RETVAL: 0, common_test.PROC_CMD: ["\ -kubectl --kubeconfig {} get nodes none --show-labels |tr -s ' ' | cut -f6 -d' '".format(KUBE_ADMIN_CONF)], +kubectl --kubeconfig {} get nodes none --show-labels --no-headers |tr -s ' ' | cut -f6 -d' '".format(KUBE_ADMIN_CONF)], common_test.PROC_OUT: ["foo=bar,hello=world"], common_test.POST: { "foo": "bar", @@ -40,7 +40,7 @@ common_test.TRIGGER_THROW: True, common_test.RETVAL: -1, common_test.PROC_CMD: ["\ -kubectl --kubeconfig {} get nodes none --show-labels |tr -s ' ' | cut -f6 -d' '".format(KUBE_ADMIN_CONF)], +kubectl --kubeconfig {} get nodes none --show-labels --no-headers |tr -s ' ' | cut -f6 -d' '".format(KUBE_ADMIN_CONF)], common_test.POST: { }, common_test.PROC_KILLED: 1 @@ -49,7 +49,7 @@ common_test.DESCR: "read labels fail", common_test.RETVAL: -1, common_test.PROC_CMD: ["\ -kubectl --kubeconfig {} get nodes none --show-labels |tr -s ' ' | cut -f6 -d' '".format(KUBE_ADMIN_CONF)], +kubectl --kubeconfig {} get nodes none --show-labels --no-headers |tr -s ' ' | cut -f6 -d' '".format(KUBE_ADMIN_CONF)], common_test.PROC_OUT: [""], common_test.PROC_ERR: ["command failed"], common_test.POST: { @@ -64,7 +64,7 @@ common_test.RETVAL: 0, common_test.ARGS: { "foo": "bar", "hello": "World!", "test": "ok" }, common_test.PROC_CMD: [ -"kubectl --kubeconfig {} get nodes none --show-labels |tr -s ' ' | cut -f6 -d' '".format(KUBE_ADMIN_CONF), +"kubectl --kubeconfig {} get nodes none --show-labels --no-headers |tr -s ' ' | cut -f6 -d' '".format(KUBE_ADMIN_CONF), "kubectl --kubeconfig {} label --overwrite nodes none hello-".format( KUBE_ADMIN_CONF), "kubectl --kubeconfig {} label --overwrite nodes none hello=World! test=ok".format( @@ -77,7 +77,7 @@ common_test.RETVAL: 0, common_test.ARGS: { "foo": "bar", "hello": "world" }, common_test.PROC_CMD: [ -"kubectl --kubeconfig {} get nodes none --show-labels |tr -s ' ' | cut -f6 -d' '".format(KUBE_ADMIN_CONF) +"kubectl --kubeconfig {} get nodes none --show-labels --no-headers |tr -s ' ' | cut -f6 -d' '".format(KUBE_ADMIN_CONF) ], common_test.PROC_OUT: ["foo=bar,hello=world"] }, @@ -87,7 +87,7 @@ common_test.ARGS: { "any": "thing" }, common_test.RETVAL: -1, common_test.PROC_CMD: [ -"kubectl --kubeconfig {} get nodes none --show-labels |tr -s ' ' | cut -f6 -d' '".format(KUBE_ADMIN_CONF) +"kubectl --kubeconfig {} get nodes none --show-labels --no-headers |tr -s ' ' | cut -f6 -d' '".format(KUBE_ADMIN_CONF) ], common_test.PROC_ERR: ["read failed"] } @@ -110,19 +110,10 @@ "mkdir -p {}".format(CNI_DIR), "cp {} {}".format(FLANNEL_CONF_FILE, CNI_DIR), "systemctl start kubelet", - "kubeadm join --discovery-file {} --node-name none --apiserver-advertise-address FC00:2::32".format( + "kubeadm join --discovery-file {} --node-name none".format( KUBE_ADMIN_CONF) ], common_test.PROC_RUN: [True, True], - common_test.PRE: { - common_test.CONFIG_DB_NO: { - common_test.MGMT_INTERFACE_TABLE: { - "eth0|FC00:2::32/64": { - "gwaddr": "fc00:2::1" - } - } - } - }, common_test.REQ: { "data": {"ca.crt": "test"} } @@ -143,19 +134,10 @@ "mkdir -p {}".format(CNI_DIR), "cp {} {}".format(FLANNEL_CONF_FILE, CNI_DIR), "systemctl start kubelet", - "kubeadm join --discovery-file {} --node-name none --apiserver-advertise-address FC00:2::32".format( + "kubeadm join --discovery-file {} --node-name none".format( KUBE_ADMIN_CONF) ], common_test.PROC_RUN: [True, True], - common_test.PRE: { - common_test.CONFIG_DB_NO: { - common_test.MGMT_INTERFACE_TABLE: { - "eth0|FC00:2::32/64": { - "gwaddr": "fc00:2::1" - } - } - } - }, common_test.REQ: { "data": {"ca.crt": "test"} }