Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(cluster_mgr): Improvements to cluster_mgr.py #3118

Merged
merged 4 commits into from
Jun 3, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 69 additions & 17 deletions tools/cluster_mgr.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@
"""


def die_with_err(err):
    """Report a fatal error and abort the process.

    Prints *err* to stdout prefixed with '!!!' and terminates the
    script with exit status -1 (raises SystemExit).
    """
    banner = "!!!"
    print(banner, err)
    exit(-1)


class Node:
def __init__(self, host, port):
self.id = ""
Expand Down Expand Up @@ -166,8 +171,7 @@ def config_single_remote(args):

test = send_command(master.node, ["get", "x"], print_errors=False)
if type(test) is not Exception:
print("Node either not found or already configured")
exit(-1)
die_with_err("Node either not found or already configured")

config = build_config_from_list([master])
print(f"Pushing config:\n{config}\n")
Expand Down Expand Up @@ -204,33 +208,46 @@ def build_slots(slot_list):
return config


def find_master(config, host, port, die_if_not_found=True):
    """Find the shard in *config* whose master listens on host:port.

    Args:
        config: cluster config — a list of shard dicts, each with a
            "master" dict carrying "ip" and "port" keys.
        host: master IP/hostname to match.
        port: master port to match.
        die_if_not_found: when True (default), abort the process via
            die_with_err() if no matching master exists; when False,
            return None instead so the caller can fall back (e.g. to
            searching replicas).

    Returns:
        The matching shard dict, or None (only when die_if_not_found
        is False and no master matches).
    """
    new_owner = None
    for shard in config:
        if shard["master"]["ip"] == host and shard["master"]["port"] == port:
            new_owner = shard
            break

    # Only the caller-controlled flag decides whether a miss is fatal;
    # callers like detach() pass die_if_not_found=False to keep looking.
    if new_owner is None and die_if_not_found:
        die_with_err("Can't find master (hint: use flag --target_host / --target_port).")

    return new_owner


def attach(args):
print(f"Attaching remote Dragonfly {args.attach_host}:{args.attach_port} to cluster")
if args.attach_as_replica:
config = build_config_from_existing(args)
master_node = find_node(config, args.target_host, args.target_port)

newcomer = Node(args.attach_host, args.attach_port)
replica_resp = send_command(newcomer, ["info", "replication"])
if replica_resp["role"] != "replica":
die_with_err("Node is not in replica mode")
if (
replica_resp["master_host"] != args.target_host
or replica_resp["master_port"] != args.target_port
):
die_with_err("Node is not a replica of target")

newcomer.update_id()
newcomer_node = build_node(newcomer)

config = build_config_from_existing(args)
master_node = find_master(config, args.target_host, args.target_port)

master_node["replicas"].append(newcomer_node)
print(f"Pushing config:\n{config}\n")
push_config(config)
else:
newcomer = Master(args.attach_host, args.attach_port)
replica_resp = send_command(newcomer.node, ["info", "replication"])
if replica_resp["role"] != "master":
die_with_err("Node is not in master mode")
newcomer.node.update_id()

newcomer_config = build_config_from_list([newcomer])
Expand All @@ -241,9 +258,34 @@ def attach(args):
print()


def detach(args):
    """Detach the node at --target_host/--target_port from the cluster.

    If the target is a master it must own no slots and have no replicas;
    it is then dropped from the config. Otherwise the target is searched
    for among the replicas of every master and removed from its master's
    replica list. The updated config is pushed to the remaining nodes.
    Aborts via die_with_err() if the node cannot be found or a master
    still has slots/replicas.

    Note: the detached node itself is NOT notified — it keeps its old
    config and still believes it is part of the cluster.
    """
    print(f"Detaching remote Dragonfly {args.target_host}:{args.target_port} from cluster")
    print(
        "Important: detached node will not receive a new config! This means that the detached node will still 'think' that it belongs to the cluster"
    )
    config = build_config_from_existing(args)
    node = find_master(config, args.target_host, args.target_port, die_if_not_found=False)
    if node is None:
        # Target is not a master — look for it among the replicas.
        found = False
        for master in config:
            # Rebuild the replica list instead of calling remove() while
            # iterating it, which would skip elements after a removal.
            kept = [
                replica
                for replica in master["replicas"]
                if not (replica["ip"] == args.target_host and replica["port"] == args.target_port)
            ]
            if len(kept) != len(master["replicas"]):
                found = True
                master["replicas"] = kept
        if not found:
            die_with_err("Can't find target node")
    else:
        if len(node["slot_ranges"]) != 0:
            die_with_err("Can't detach a master with assigned slots")
        if len(node["replicas"]) != 0:
            die_with_err("Can't detach a master with replicas")
        config = [m for m in config if m != node]
    push_config(config)


def move(args):
config = build_config_from_existing(args)
new_owner = find_node(config, args.target_host, args.target_port)
new_owner = find_master(config, args.target_host, args.target_port)

def remove_slot(slot, from_range, from_shard):
if from_range["start"] == slot:
Expand Down Expand Up @@ -320,7 +362,7 @@ def pack(slot_ranges):

def migrate(args):
config = build_config_from_existing(args)
target = find_node(config, args.target_host, args.target_port)
target = find_master(config, args.target_host, args.target_port)
target_node = Node(target["master"]["ip"], target["master"]["port"])
target_node.update_id()

Expand All @@ -333,8 +375,7 @@ def migrate(args):
source = node
break
if source == None:
print("Unsupported slot range migration (currently only 1-node migration supported)")
exit(-1)
die_with_err("Unsupported slot range migration (currently only 1-node migration supported)")
source_node = Node(source["master"]["ip"], source["master"]["port"])
source_node.update_id()

Expand Down Expand Up @@ -410,20 +451,31 @@ def main():
Attach an existing Dragonfly server to an existing cluster (owning no slots):
./cluster_mgr.py --action=attach --attach_host=HOST --attach_port=PORT
This will connect to existing cluster present at localhost:6379 by default. Override with
`--target_host` and `--target_port`
`--target_host` and `--target_port`.
To attach node as a replica - use --attach_as_replica=True. In such case, the node will be a
replica of --target_host/--target_port.

To set up a new cluster - start the servers and then use
./cluster_mgr.py --action=config_single_remote ...
./cluster_mgr.py --action=attach ...
And repeat `--action=attach` for all servers.
Afterwards, distribute the slots between the servers as desired with `--action=move` or
`--action=migrate`
`--action=migrate`.

To detach (remove) a node from the cluster:
./cluster_mgr.py --action=detach --target_host=X --target_port=X
Notes:
- If the node is a master, it must not have any slots assigned to it.
- The node will not be notified that it's no longer in a cluster. It's a good idea to shut it down
after detaching it from the cluster.

Connect to cluster and move slots 10-20 to target:
./cluster_mgr.py --action=move --slot_start=10 --slot_end=20 --target_host=X --target_port=X
WARNING: This will NOT migrate existing data, i.e. data in slots 10-20 will be erased.

Migrate slots 10-20 to target:
./cluster_mgr.py --action=migrate --slot_start=10 --slot_end=20 --target_host=X --target_port=X
Unlike --action=move above, this will migrate the data to the new owner.

Connect to cluster and shutdown all nodes:
./cluster_mgr.py --action=shutdown
Expand Down Expand Up @@ -471,6 +523,7 @@ def main():
shutdown,
config_single_remote,
attach,
detach,
move,
print_config,
migrate,
Expand All @@ -481,8 +534,7 @@ def main():
if action:
action(args)
else:
print(f'Error - unknown action "{args.action}". See --help')
exit(-1)
die_with_err(f'Error - unknown action "{args.action}". See --help')


if __name__ == "__main__":
Expand Down
Loading