From 31e98c1badcf135503ae6da08f6b5251ab146629 Mon Sep 17 00:00:00 2001 From: gargsans-yb Date: Thu, 24 Oct 2024 12:10:49 +0000 Subject: [PATCH] [#23007]yugabyted: 2nd node not joining correctly Summary: When the 2nd node joins the cluster using a config file containing the `--join` flag, the 2nd node starts with a different `placement_uuid`. Added checks to have join_ip from config file verified. Jira: DB-11929 Test Plan: Jenkins Reviewers: nikhil Reviewed By: nikhil Subscribers: svc_phabricator, yugabyted-dev, sgarg-yb Differential Revision: https://phorge.dev.yugabyte.com/D39412 --- bin/yugabyted | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/bin/yugabyted b/bin/yugabyted index a302b1dac00..95efebbee70 100755 --- a/bin/yugabyted +++ b/bin/yugabyted @@ -6539,7 +6539,7 @@ class ControlScript(object): ": The directory mentioned in the --cert_dir value of the configs file " + "doen't exist. Please create the directory and copy the certs into that " + "directory.") - elif args.join: + elif (args.join or self.configs.saved_data.get("join")): # Case Scenario: When the user is trying to start and join a node # to a secure cluster but the generated certs are neither present # in the default directory nor given through --certs_dir flag. @@ -7584,19 +7584,22 @@ class ControlScript(object): args.background = "True" cluster_member = self.configs.saved_data.get("cluster_member") - if args.join is not None: - if not self.validate_hostname_ip(args.join): + join_ip = args.join if args.join else self.configs.saved_data.get("join") + if join_ip: + if not self.validate_hostname_ip(join_ip): Output.log_error_and_exit(Output.make_red("ERROR") + ": --join" + - " provided is not a valid address. Please try again with a valid IPV4, " + - "IPV6 or DNS.") + " provided is not a valid address. 
Please try again with a " + + "valid IPV4, IPV6 or DNS.") - Output.print_and_log("Fetching configs from join IP...") if not cluster_member: + # Node is starting for the first time. + Output.print_and_log("Fetching configs from join IP...") + # Check if tserver webserver at join_IP is reachable or not # Also get the leader master(used to get the info of all tservers) - master_leader = self.get_current_master_leader_from_api(args.join, + master_leader = self.get_current_master_leader_from_api(join_ip, tserver_webserver_port) - args.join = master_leader + join_ip = master_leader # Get info on all tservers master_leader_hostport = "{}:{}".format(master_leader, master_webserver_port) @@ -7607,7 +7610,7 @@ class ControlScript(object): for node in [node.split(":")[0] for node in list(nodes.keys())]: if args.advertise_address == node: Output.log_error_and_exit(Output.make_red("ERROR:") + " A node" + - " is already running on {}, please ".format(args.join) + + " is already running on {}, please ".format(join_ip) + "specify a valid address.") is_placement_uuid_set = False @@ -7674,8 +7677,8 @@ class ControlScript(object): ": --certs_dir flag needs to be accompanied with the --secure flag.") if args.insecure: - if args.join and not cluster_member: - master_hostport = "{}:{}".format(args.join, master_webserver_port) + if join_ip and not cluster_member: + master_hostport = "{}:{}".format(join_ip, master_webserver_port) if self.is_leader_master_secure(master_hostport): # Case Scenario: When a User starts the 1st node in secure mode and tries # to start the second node in insecure mode @@ -7684,8 +7687,8 @@ class ControlScript(object): "IP was provided in --join flag has SSL/TLS enabled. 
Cannot join a " + "secure and an insecure node.") elif args.secure: - if args.join and not cluster_member: - master_hostport = "{}:{}".format(args.join, master_webserver_port) + if join_ip and not cluster_member: + master_hostport = "{}:{}".format(join_ip, master_webserver_port) if not self.is_leader_master_secure(master_hostport): # Case Scenario: When the user starts the 1st node in insecure mode and # tries to start the second node in secure mode.