From 7773a12525b57c0ab4446523362c81eec18b5fb8 Mon Sep 17 00:00:00 2001 From: Clarke Geunyeong Bak Date: Mon, 15 Apr 2024 09:49:14 +0900 Subject: [PATCH] =?UTF-8?q?[NES-55]=20Resharding=20=EC=9D=B4=ED=9B=84=20?= =?UTF-8?q?=EC=98=A4=EB=B8=8C=EC=A0=9D=ED=8A=B8=20=EC=8A=A4=ED=86=A0?= =?UTF-8?q?=EC=96=B4=20=EC=9D=B8=EB=8D=B1=EC=8A=A4=EA=B0=80=20=EB=B9=A0?= =?UTF-8?q?=EC=A7=80=EB=8A=94=20=EC=9D=B4=EC=8A=88=20=ED=95=B4=EA=B2=B0=20?= =?UTF-8?q?(#54)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * move the step removing old bucket instance to immediately after resharding and before unlocking reshard_lock * when the shard info query fails, retry it with the bucket id that may have been changed by resharding * add guard_reshard to RGWRados::Bucket::UpdateIndex::complete --- src/rgw/rgw_rados.cc | 41 +++++++++++++++++++++++++++-------------- src/rgw/rgw_reshard.cc | 17 +++++++++-------- 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/src/rgw/rgw_rados.cc b/src/rgw/rgw_rados.cc index 7c4b2280965c8..f863a4114a7e8 100644 --- a/src/rgw/rgw_rados.cc +++ b/src/rgw/rgw_rados.cc @@ -5988,7 +5988,22 @@ int RGWRados::Bucket::UpdateIndex::guard_reshard(BucketShard **pbs, std::functio int ret = get_bucket_shard(&bs); if (ret < 0) { ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl; - return ret; + + ret = store->try_refresh_bucket_info(target->bucket_info, nullptr); + if (ret < 0) { + ldout(store->ctx(), 1) << "ERROR: failed to refresh bucket info: " << ret << dendl; + return ret; + } + + ldout(store->ctx(), 10) << "retry to get BucketShard object with new bucket id! 
" + << "new_bucket_id=" << target->bucket_info.bucket.bucket_id << dendl; + + invalidate_bs(); + ret = get_bucket_shard(&bs); + if (ret < 0) { + ldout(store->ctx(), 5) << "failed to get BucketShard object with refreshed bucket id too: ret=" << ret << dendl; + return ret; + } } r = call(bs); if (r != -ERR_BUSY_RESHARDING) { @@ -6065,13 +6080,6 @@ int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid, uint64_t epoch, return 0; } RGWRados *store = target->get_store(); - BucketShard *bs; - - int ret = get_bucket_shard(&bs); - if (ret < 0) { - ldout(store->ctx(), 5) << "failed to get BucketShard object: ret=" << ret << dendl; - return ret; - } rgw_bucket_dir_entry ent; obj.key.get_index_key(&ent.key); @@ -6095,12 +6103,17 @@ int RGWRados::Bucket::UpdateIndex::complete(int64_t poolid, uint64_t epoch, ent.meta.content_type = content_type; ent.meta.appendable = appendable; - ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace); - - int r = store->svc.datalog_rados->add_entry(target->bucket_info, bs->shard_id); - if (r < 0) { - lderr(store->ctx()) << "ERROR: failed writing data log" << dendl; - } + // Follows up the bucket shards that may have changed during the data write. + // Wait for resharding that may be in progress. 
+ int ret = guard_reshard(nullptr, [&](BucketShard *bs) -> int { + int comp_add_ret = store->cls_obj_complete_add(*bs, obj, optag, poolid, epoch, ent, category, remove_objs, bilog_flags, zones_trace); + + int log_add_ret = store->svc.datalog_rados->add_entry(target->bucket_info, bs->shard_id); + if (log_add_ret < 0) { + lderr(store->ctx()) << "ERROR: failed writing data log" << dendl; + } + return comp_add_ret; + }); return ret; } diff --git a/src/rgw/rgw_reshard.cc b/src/rgw/rgw_reshard.cc index b5733b240f325..80f3db4fd2475 100644 --- a/src/rgw/rgw_reshard.cc +++ b/src/rgw/rgw_reshard.cc @@ -737,6 +737,15 @@ int RGWBucketReshard::execute(int num_shards, int max_op_entries, // at this point we've done the main work; we'll make a best-effort // to clean-up but will not indicate any errors encountered + ret = store->ctl()->bucket->remove_bucket_instance_info(bucket_info.bucket, + bucket_info, null_yield); + if (ret < 0) { + lderr(store->ctx()) << "Error: " << __func__ << + " failed to clean old bucket info object \"" << + bucket_info.bucket.get_key() << + "\"created after successful resharding with error " << ret << dendl; + } + reshard_lock.unlock(); // resharding successful, so remove old bucket index shards; use @@ -750,14 +759,6 @@ int RGWBucketReshard::execute(int num_shards, int max_op_entries, "RGWRados::clean_bucket_index returned " << ret << dendl; } - ret = store->ctl()->bucket->remove_bucket_instance_info(bucket_info.bucket, - bucket_info, null_yield); - if (ret < 0) { - lderr(store->ctx()) << "Error: " << __func__ << - " failed to clean old bucket info object \"" << - bucket_info.bucket.get_key() << - "\"created after successful resharding with error " << ret << dendl; - } ldout(store->ctx(), 1) << __func__ << " INFO: reshard of bucket \"" << bucket_info.bucket.name << "\" from \"" <<