Auto-master detection for storages, rebalancer location selection, and bug fixes #439

Merged
25 commits
b2df47c
storage: fix too early drop of SENT buckets
Gerold103 Sep 26, 2023
1986fbd
storage: fix false-positive recovery of SENDING
Gerold103 Oct 20, 2023
6985ddc
replicaset: rename is_auto_master to is_master_auto
Gerold103 Oct 4, 2023
54c027e
storage: factor out check_is_master() function
Gerold103 Oct 4, 2023
1d5a824
rebalancer: return nil in rebalancer_apply_routes
Gerold103 Oct 4, 2023
ba75476
storage: use util.fiber_cond_wait everywhere
Gerold103 Oct 5, 2023
f4beb7e
storage: scatter some fiber cancellation points
Gerold103 Oct 5, 2023
3bf51b2
storage: move master management code
Gerold103 Oct 6, 2023
a56d360
storage: move rebalancer activation code
Gerold103 Oct 6, 2023
f976c0f
storage: refactor storage_cfg()
Gerold103 Aug 29, 2023
6de3cff
storage: move master and rebalancer role commit
Gerold103 Oct 9, 2023
9ef6c3b
storage: introduce is_master flag
Gerold103 Oct 9, 2023
f7126de
test: rename storage_2_test.lua
Gerold103 Oct 9, 2023
295b5dc
storage: introduce master='auto' for own rs
Gerold103 Oct 9, 2023
20ca1ca
storage: allow some internal funcs only on master
Gerold103 Sep 10, 2023
604b16b
storage: introduce recovery_bucket_stat()
Gerold103 Sep 11, 2023
dbc4a64
storage: introduce master_call() wrapper
Gerold103 Sep 10, 2023
760eb70
cfg: introduce extract_vshard and extract_box
Gerold103 Oct 13, 2023
0728c29
storage: add timeouts to all master_call()
Gerold103 Oct 13, 2023
0664937
replicaset: create missing conns in locate_master()
Gerold103 Oct 16, 2023
e1083fd
storage: introduce full master discovery
Gerold103 Oct 16, 2023
1cd2873
rebalancer: enable it on auto-master replicasets
Gerold103 Sep 15, 2023
a0d7ddc
storage: introduce no-activity timeout for conns
Gerold103 Sep 10, 2023
9380966
rebalancer: introduce rebalancer flag
Gerold103 Sep 28, 2023
201f1ee
rebalancer: introduce rebalancer_mode
Gerold103 Oct 3, 2023
77 changes: 62 additions & 15 deletions test/luatest_helpers/vtest.lua
@@ -151,6 +151,7 @@ local function cluster_new(g, cfg)
local all_servers = {}
local masters = {}
local replicas = {}
local master_map = {}
for replicaset_uuid, replicaset in pairs(cfg.sharding) do
-- Luatest depends on box.cfg being ready and listening. Need to
-- configure it before vshard.storage.cfg().
@@ -169,7 +170,21 @@
box_cfg.replicaset_uuid = replicaset_uuid
box_cfg.listen = helpers.instance_uri(replica.name)
-- Need to specify read-only explicitly to know who the master is.
box_cfg.read_only = not replica.master
local is_master
if replica.read_only ~= nil then
is_master = not replica.read_only
else
is_master = replica.master
end
if is_master then
local prev_uuid = master_map[replicaset_uuid]
if prev_uuid then
error('On bootstrap each replicaset has to have exactly '..
'one master')
end
master_map[replicaset_uuid] = replica_uuid
end
box_cfg.read_only = not is_master
box_cfg.memtx_use_mvcc_engine = cfg.memtx_use_mvcc_engine
local server = g.cluster:build_server({
alias = name,
@@ -184,7 +199,7 @@
g.cluster:add_server(server)

table.insert(all_servers, server)
if replica.master then
if is_master then
table.insert(masters, server)
else
table.insert(replicas, server)
@@ -332,20 +347,29 @@ local function cluster_bootstrap(g, cfg)
local masters = {}
local etalon_balance = {}
local replicaset_count = 0
for rs_uuid, rs in pairs(cfg.sharding) do
local is_master_found = false
for _, rep in pairs(rs.replicas) do
if rep.master then
t.assert(not is_master_found, 'only one master')
local server = g[rep.name]
t.assert_not_equals(server, nil, 'find master instance')
t.assert_equals(server:replicaset_uuid(), rs_uuid,
'replicaset uuid')
masters[rs_uuid] = server
is_master_found = true
end
local master_info, err = cluster_exec_each(g, function()
local info = box.info
return {
is_master = ivshard.storage.internal.is_master,
rs_uuid = ivutil.replicaset_uuid(info),
uuid = info.uuid,
}
end)
t.assert_equals(err, nil)
for name, info in pairs(master_info) do
if info.is_master then
local rs_uuid = info.rs_uuid
local server = g[name]
t.assert_not_equals(server, nil, 'find master instance')
t.assert_equals(masters[rs_uuid], nil, 'only one master')
local rs_cfg = cfg.sharding[rs_uuid]
t.assert_not_equals(rs_cfg, nil)
t.assert_not_equals(rs_cfg.replicas[info.uuid], nil)
masters[info.rs_uuid] = server
end
t.assert(is_master_found, 'found master')
end
for rs_uuid, rs in pairs(cfg.sharding) do
t.assert_not_equals(masters[rs_uuid], nil, 'found master')
local weight = rs.weight
if weight == nil then
weight = 1
@@ -420,6 +444,7 @@ end
local function cluster_rebalancer_enable(g)
local _, err = cluster_exec_each(g, function()
ivshard.storage.rebalancer_enable()
ivshard.storage.rebalancer_wakeup()
end)
t.assert_equals(err, nil, 'rebalancer enable')
end
@@ -527,6 +552,26 @@ local function cluster_wait_fullsync(g)
end
end

local function cluster_rebalancer_find_all(g)
local map, err = cluster_exec_each(g, function()
return ivshard.storage.internal.rebalancer_fiber ~= nil
end)
t.assert_equals(err, nil)
local names = {}
for name, res in pairs(map) do
if res then
table.insert(names, name)
end
end
return names
end

local function cluster_rebalancer_find(g)
local names = cluster_rebalancer_find_all(g)
t.assert_lt(#names, 2)
return names[1]
end

--
-- Stop data node. Wrapped into a one-line function in case in the future we
-- would want to do something more here.
@@ -768,6 +813,7 @@ return {
cluster_rebalancer_enable = cluster_rebalancer_enable,
cluster_wait_vclock_all = cluster_wait_vclock_all,
cluster_wait_fullsync = cluster_wait_fullsync,
cluster_rebalancer_find = cluster_rebalancer_find,
storage_first_bucket = storage_first_bucket,
storage_stop = storage_stop,
storage_start = storage_start,
@@ -782,6 +828,7 @@ return {
service_wait_for_new_ok = service_wait_for_new_ok,
service_wait_for_error = service_wait_for_error,
service_wait_for_new_error = service_wait_for_new_error,
service_wait_for_new_status = service_wait_for_new_status,
service_wait_for_activity = service_wait_for_activity,
wait_for_not_nil = wait_for_not_nil,
wait_for_nil = wait_for_nil,
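
The vtest.lua changes also export helpers for locating the rebalancer in a test cluster. A hypothetical luatest snippet using them could look as follows; it assumes the group has already been set up with vtest.cluster_new() in the style of the existing tests in this directory.

local t = require('luatest')
local vtest = require('test.luatest_helpers.vtest')

local test_group = t.group('rebalancer_location')

test_group.test_single_rebalancer = function(g)
    -- cluster_rebalancer_find() returns the alias of the only instance whose
    -- rebalancer fiber is running (nil if none) and fails if more than one
    -- instance runs it.
    local name = vtest.cluster_rebalancer_find(g)
    t.assert_not_equals(name, nil, 'exactly one rebalancer in the cluster')
end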
10 changes: 7 additions & 3 deletions test/rebalancer/bucket_ref.result
@@ -394,10 +394,14 @@ fiber_to_lock:cancel()
while not send_result do fiber.sleep(0.01) end
---
...
send_result
assert(not send_result[1])
---
- true
...
util.portable_error(send_result[2])
---
- - false
- fiber is cancelled
- type: FiberIsCancelled
message: fiber is cancelled
...
vshard.storage.buckets_info(1)
---
3 changes: 2 additions & 1 deletion test/rebalancer/bucket_ref.test.lua
@@ -149,7 +149,8 @@ while not vshard.storage.buckets_info(1)[1].rw_lock do fiber.sleep(0.01) end

fiber_to_lock:cancel()
while not send_result do fiber.sleep(0.01) end
send_result
assert(not send_result[1])
util.portable_error(send_result[2])
vshard.storage.buckets_info(1)

-- Cleanup after the test.
4 changes: 2 additions & 2 deletions test/rebalancer/rebalancer.result
@@ -507,10 +507,10 @@ switch_rs1_master()
vshard.storage.cfg(cfg, util.name_to_uuid.box_2_b)
---
...
while not test_run:grep_log('box_2_a', "rebalancer_f has been started") do fiber.sleep(0.1) end
while not test_run:grep_log('box_2_a', "Starting the rebalancer") do fiber.sleep(0.1) end
---
...
while not test_run:grep_log('box_1_a', "Rebalancer location has changed") do fiber.sleep(0.1) end
while not test_run:grep_log('box_1_a', "Stopping the rebalancer") do fiber.sleep(0.1) end
---
...
--
4 changes: 2 additions & 2 deletions test/rebalancer/rebalancer.test.lua
@@ -230,8 +230,8 @@ test_run:switch('box_2_b')
switch_rs1_master()
vshard.storage.cfg(cfg, util.name_to_uuid.box_2_b)

while not test_run:grep_log('box_2_a', "rebalancer_f has been started") do fiber.sleep(0.1) end
while not test_run:grep_log('box_1_a', "Rebalancer location has changed") do fiber.sleep(0.1) end
while not test_run:grep_log('box_2_a', "Starting the rebalancer") do fiber.sleep(0.1) end
while not test_run:grep_log('box_1_a', "Stopping the rebalancer") do fiber.sleep(0.1) end

--
-- gh-40: introduce custom replicaset weights. Weight allows to
35 changes: 35 additions & 0 deletions test/replicaset-luatest/replicaset_3_test.lua
@@ -207,3 +207,38 @@ test_group.test_map_call = function(g)
_G.test_sleep_is_called = nil
end)
end

test_group.test_locate_master_when_no_conn_object = function(g)
local new_cfg_template = table.deepcopy(cfg_template)
local rs_cfg = new_cfg_template.sharding[1]
rs_cfg.master = 'auto'
rs_cfg.replicas.replica_1_a.master = nil
local new_global_cfg = vtest.config_new(new_cfg_template)
local replicasets = vreplicaset.buildall(new_global_cfg)
local _, rs = next(replicasets)
t.assert_equals(rs.master, nil)
for _, r in pairs(rs.replicas) do
t.assert_equals(r.conn, nil)
end
t.assert(rs.is_master_auto)
--
-- First attempt to locate the masters only creates the connections, but
-- doesn't wait for their establishment. The call is supposed to be retried
-- later.
--
local is_all_done, is_all_nop, last_err =
vreplicaset.locate_masters(replicasets)
t.assert_equals(last_err, nil)
t.assert(not is_all_done)
t.assert(not is_all_nop)
for _, r in pairs(rs.replicas) do
t.assert_not_equals(r.conn, nil)
r.conn:wait_connected(vtest.wait_timeout)
end
is_all_done, is_all_nop, last_err =
vreplicaset.locate_masters(replicasets)
t.assert_equals(last_err, nil)
t.assert(is_all_done)
t.assert(not is_all_nop)
t.assert_equals(rs.master, rs.replicas[g.replica_1_a:instance_uuid()])
end
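
As the test above verifies, the first locate_masters() call on replicasets without connection objects only creates the connections, so callers are expected to retry. A rough sketch of such a retry loop; the polling policy (fixed sleep, deadline) is an assumption and not code from this PR.

local fiber = require('fiber')
local vreplicaset = require('vshard.replicaset')

local function wait_masters_located(replicasets, timeout)
    local deadline = fiber.clock() + timeout
    local last_err
    while fiber.clock() < deadline do
        local is_all_done, _, err = vreplicaset.locate_masters(replicasets)
        last_err = err
        if is_all_done then
            return true
        end
        -- The first pass may only have created the connections; retry after
        -- giving them a moment to get established.
        fiber.sleep(0.1)
    end
    return false, last_err
end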
8 changes: 4 additions & 4 deletions test/router/master_discovery.result
@@ -835,7 +835,7 @@ assert(rs.master.uuid == storage_a_uuid)
rs.master = nil
| ---
| ...
rs.is_auto_master = false
rs.is_master_auto = false
| ---
| ...

@@ -861,7 +861,7 @@ assert(not rs.master)

-- With auto-search and not known master it is not assigned if a new master is
-- not reported.
rs.is_auto_master = true
rs.is_master_auto = true
| ---
| ...
-- But update returns true, because it makes sense to try a next request later
@@ -908,7 +908,7 @@ assert(rs.master.uuid == storage_b_uuid)
-- It does not depend on auto-search. Still returns true, because if the master
-- was changed since the request was sent, it means it could be retried and
-- might succeed.
rs.is_auto_master = false
rs.is_master_auto = false
| ---
| ...
assert(rs:update_master(storage_a_uuid))
@@ -936,7 +936,7 @@ assert(rs.master.uuid == storage_b_uuid)
-- the current master should be reset, because it makes no sense to send more RW
-- requests to it. But update returns true, because the current request could
-- be retried after waiting for a new master discovery.
rs.is_auto_master = true
rs.is_master_auto = true
| ---
| ...
assert(rs:update_master(storage_b_uuid))
8 changes: 4 additions & 4 deletions test/router/master_discovery.test.lua
@@ -386,7 +386,7 @@ storage_b_uuid = util.name_to_uuid.storage_1_b

assert(rs.master.uuid == storage_a_uuid)
rs.master = nil
rs.is_auto_master = false
rs.is_master_auto = false

-- When auto-search is disabled and master is not known, nothing will make it
-- known. It is up to the config.
@@ -398,7 +398,7 @@ assert(not rs.master)

-- With auto-search and not known master it is not assigned if a new master is
-- not reported.
rs.is_auto_master = true
rs.is_master_auto = true
-- But update returns true, because it makes sense to try a next request later
-- when the master is found.
assert(rs:update_master(storage_a_uuid))
@@ -419,7 +419,7 @@ assert(rs.master.uuid == storage_b_uuid)
-- It does not depend on auto-search. Still returns true, because if the master
-- was changed since the request was sent, it means it could be retried and
-- might succeed.
rs.is_auto_master = false
rs.is_master_auto = false
assert(rs:update_master(storage_a_uuid))
assert(rs.master.uuid == storage_b_uuid)

@@ -433,7 +433,7 @@ assert(rs.master.uuid == storage_b_uuid)
-- the current master should be reset, because it makes no sense to send more RW
-- requests to it. But update returns true, because the current request could
-- be retried after waiting for a new master discovery.
rs.is_auto_master = true
rs.is_master_auto = true
assert(rs:update_master(storage_b_uuid))
assert(rs.master == nil)

1 change: 1 addition & 0 deletions test/router/router.result
@@ -1495,6 +1495,7 @@ error_messages
- Use replicaset:connect_master(...) instead of replicaset.connect_master(...)
- Use replicaset:connect_replica(...) instead of replicaset.connect_replica(...)
- Use replicaset:down_replica_priority(...) instead of replicaset.down_replica_priority(...)
- Use replicaset:locate_master(...) instead of replicaset.locate_master(...)
- Use replicaset:map_call(...) instead of replicaset.map_call(...)
- Use replicaset:up_replica_priority(...) instead of replicaset.up_replica_priority(...)
- Use replicaset:update_master(...) instead of replicaset.update_master(...)