
curvefs: support space deallocate for curvebs volume as backend
Signed-off-by: ilixiaocui <[email protected]>
ilixiaocui committed Jul 20, 2023
1 parent 3cda511 commit f011606
Showing 135 changed files with 5,882 additions and 907 deletions.
13 changes: 9 additions & 4 deletions curvefs/conf/client.conf
@@ -58,10 +58,6 @@ executorOpt.maxRetryTimesBeforeConsiderSuspend=20
# batch limit of get inode attr and xattr
executorOpt.batchInodeAttrLimit=10000

#### spaceserver
spaceServer.spaceAddr=127.0.0.1:19999 # __ANSIBLE_TEMPLATE__ {{ groups.space | join_peer(hostvars, "space_listen_port") }} __ANSIBLE_TEMPLATE__
spaceServer.rpcTimeoutMs=1000

#### bdev
# curve client's config file
bdev.confPath=/etc/curve/client.conf
@@ -177,6 +173,15 @@ volume.bitmapAllocator.smallAllocProportion=0.2
# number of block groups allocated at a time
volume.blockGroup.allocateOnce=4

## spaceserver
# a block group whose used space exceeds this proportion can be
# returned to the mds [0.8-1]
volume.space.useThreshold=0.95

# interval in seconds at which the background thread returns
# block groups to the mds
volume.space.releaseInterSec=300

#### s3
# this is for test. if s3.fakeS3=true, all data will be discarded
s3.fakeS3=false
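The two new volume.space keys describe a client-side reclaim loop: every volume.space.releaseInterSec seconds a background thread checks each block group, and any group whose used proportion has reached volume.space.useThreshold can be handed back to the MDS. A minimal C++ sketch of that loop, assuming hypothetical helpers ScanBlockGroups and ReturnToMds (the real implementation elsewhere in this commit may differ):

#include <chrono>
#include <cstdint>
#include <thread>
#include <vector>

// Sketch of the background release loop implied by volume.space.useThreshold
// and volume.space.releaseInterSec. BlockGroup, ScanBlockGroups() and
// ReturnToMds() are hypothetical placeholders, not names from this commit.
struct BlockGroup {
    uint64_t offset = 0;  // block group offset within the volume
    uint64_t size = 0;    // block group size in bytes
    uint64_t used = 0;    // bytes already allocated from this group
};

std::vector<BlockGroup> ScanBlockGroups();    // hypothetical
void ReturnToMds(uint64_t blockGroupOffset);  // hypothetical

void ReleaseLoop(double useThreshold, int releaseInterSec) {
    while (true) {
        std::this_thread::sleep_for(std::chrono::seconds(releaseInterSec));
        for (const BlockGroup& bg : ScanBlockGroups()) {
            double usedRatio = static_cast<double>(bg.used) / bg.size;
            // A nearly full block group is no longer useful for allocation,
            // so hand ownership back to the MDS.
            if (usedRatio >= useThreshold) {
                ReturnToMds(bg.offset);
            }
        }
    }
}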
191 changes: 191 additions & 0 deletions curvefs/conf/curvebs_client.conf
@@ -0,0 +1,191 @@
#
################### MDS-side configuration ##################
#

# MDS address; for an MDS cluster, separate addresses with commas
mds.listen.addr=127.0.0.1:6666

# Whether to register with the MDS during initialization; enabled by default
mds.registerToMDS=true

# RPC timeout for communication with the MDS
mds.rpcTimeoutMS=500

# Maximum RPC timeout for communication with the MDS; exponential backoff never exceeds this value
mds.maxRPCTimeoutMS=2000

# Total retry time for communication with the MDS
mds.maxRetryMS=8000

# Switch to another MDS once consecutive retries on the current MDS exceed this limit; the failure count includes timeout retries
mds.maxFailedTimesBeforeChangeMDS=2

# Number of lease renewals within one lease period with the MDS
mds.refreshTimesPerLease=4

# Sleep interval before each retry of an MDS RPC
mds.rpcRetryIntervalUS=100000

# Number of normal retries before triggering the wait strategy
mds.normalRetryTimesBeforeTriggerWait=3

# Max retry time for IO-Path request
mds.maxRetryMsInIOPath=86400000

# Sleep interval for wait
mds.waitSleepMs=10000

#
################# metacache configuration ################
#

# RPC timeout for getting the leader
metacache.getLeaderTimeOutMS=500

# Number of retries for getting the leader
metacache.getLeaderRetry=5

# Sleep interval before each retry of the get-leader RPC
metacache.rpcRetryIntervalUS=100000

#
############### Scheduling layer configuration #############
#

# Scheduling layer queue size; each file has its own queue
# The queue depth affects overall client throughput; the queue holds asynchronous IO tasks.
schedule.queueCapacity=1000000

# Number of worker threads for the queue
# A worker thread dequeues an IO task, sends it to the network, and then picks up the next task.
# From dequeue to finishing the rpc send takes roughly 20us-100us: about 20us in the normal case
# when no leader lookup is needed, and about 100us when the leader must be fetched during the send,
# so a single thread handles roughly 100k-500k tasks per second, which already meets the performance requirements
schedule.threadpoolSize=2

# Task queue introduced to isolate the qemu-side thread, since qemu has only one IO thread
# When qemu calls the aio interface, the call is pushed onto the task queue and returns immediately,
# so libcurve does not occupy the qemu thread or block its asynchronous calls
isolation.taskQueueCapacity=1000000

# Thread pool size of the task queue isolating the qemu thread; default is 1 thread
isolation.taskThreadPoolSize=1


#
################ Configuration for communication with chunkservers #############
#
# Sleep interval between retries of failed read/write OPs
chunkserver.opRetryIntervalUS=100000

# Number of retries for a failed OP
chunkserver.opMaxRetry=2500000

# RPC timeout for communication with chunkservers
chunkserver.rpcTimeoutMS=1000

# Enable applied-index-based reads as a performance optimization
chunkserver.enableAppliedIndexRead=1

# Maximum sleep time between retried requests
# When the network is congested or a chunkserver is overloaded, the sleep time is increased,
# up to a maximum of maxRetrySleepIntervalUS
chunkserver.maxRetrySleepIntervalUS=8000000

# Maximum RPC timeout for retried requests; the timeout follows an exponential backoff policy
# When timeouts are caused by network congestion, the RPC timeout is increased,
# up to a maximum of maxRPCTimeoutMS
chunkserver.maxRPCTimeoutMS=8000

# Maximum number of consecutive timeouts on the same chunkserver
# When exceeded, a health check is performed; if the check fails, the chunkserver is marked as unstable
chunkserver.maxStableTimeoutTimes=10
# Timeout of the health-check request issued after consecutive rpc timeouts on a chunkserver
chunkserver.checkHealthTimeoutMs=100
# When the number of unstable chunkservers on the same server exceeds this value,
# all chunkservers on that server are marked as unstable
chunkserver.serverStableThreshold=3

# Heavy load on the underlying chunkservers may also trigger the unstable state
# Because the copyset leader may change, the request timeout can be reset to the default value, causing IO hangs
# If the chunkserver is really down, requests complete after a certain number of retries;
# if retries keep going it is not a crash, and the timeout should still follow the exponential backoff logic
# Once a request has been retried more than this many times, its timeout always uses exponential backoff
chunkserver.minRetryTimesForceTimeoutBackoff=5

# When an rpc has been retried more than maxRetryTimesBeforeConsiderSuspend times,
# it is counted as a suspended IO and the metric raises an alarm
chunkserver.maxRetryTimesBeforeConsiderSuspend=20

#
################# File-level configuration #############
#
# Maximum number of outstanding rpcs allowed by libcurve's underlying rpc scheduling; inflight RPCs are counted per file
global.fileMaxInFlightRPCNum=128

# Maximum split size in KB for file IO issued to the underlying chunkservers
global.fileIOSplitMaxSizeKB=64

#
################# Log configuration ###############
#
# enable logging or not
global.logging.enable=True
#
# Log level: INFO=0/WARNING=1/ERROR=2/FATAL=3
global.logLevel=0
# Log path
global.logPath=/data/log/curve/ # __CURVEADM_TEMPLATE__ /curvebs/client/logs __CURVEADM_TEMPLATE__
# For unit tests:
# logpath=./runlog/

#
################# Source volume read configuration ###############
#
# Timeout (300s) after which an fd opened for reading the source volume is closed
closefd.timeout=300
# The background thread scans fdMap every 600s and closes timed-out fds opened for reading the source volume
closefd.timeInterval=600

#
############### Metric configuration #############
#
global.metricDummyServerStartPort=9000

# Whether to disable the health check: true = disabled, false = enabled
global.turnOffHealthCheck=true

#
### throttle config
#
throttle.enable=false

##### discard configurations #####
# enable/disable discard
discard.enable=true
# discard granularity
discard.granularity=4096
# discard cleanup task delay in milliseconds
discard.taskDelayMs=60000

##### alignment #####
# default alignment
global.alignment.commonVolume=512
# alignment for clone volume
# default is 4096, because lazy clone chunk bitmap granularity is 4096
global.alignment.cloneVolume=4096

##### chunkserver client option #####
# chunkserver client rpc timeout time
csClientOpt.rpcTimeoutMs=500
# chunkserver client rpc max try
csClientOpt.rpcMaxTry=86400000
# chunkserver client rpc retry interval
csClientOpt.rpcIntervalUs=100000
# chunkserver client rpc max timeout time
csClientOpt.rpcMaxTimeoutMs=8000

##### chunkserver broadcaster option #####
# maximum number of machines to broadcast to
csBroadCasterOpt.broadCastMaxNum=200
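
The chunkserver.* comments above all describe the same backoff scheme: the per-attempt RPC timeout starts at chunkserver.rpcTimeoutMS and grows exponentially up to chunkserver.maxRPCTimeoutMS, while the sleep between retries grows from chunkserver.opRetryIntervalUS up to chunkserver.maxRetrySleepIntervalUS. A minimal C++ sketch of that policy, for illustration only (not the libcurve implementation):

#include <algorithm>
#include <cstdint>

// Illustrative backoff helpers for the chunkserver.* retry settings above.
uint64_t NextRpcTimeoutMs(uint64_t baseTimeoutMs, uint64_t maxTimeoutMs,
                          uint32_t timeoutRetries) {
    uint32_t shift = std::min(timeoutRetries, 16u);          // avoid overflow
    return std::min(baseTimeoutMs << shift, maxTimeoutMs);   // cap at maxRPCTimeoutMS
}

uint64_t NextRetrySleepUs(uint64_t baseSleepUs, uint64_t maxSleepUs,
                          uint32_t retries) {
    uint32_t shift = std::min(retries, 16u);
    return std::min(baseSleepUs << shift, maxSleepUs);        // cap at maxRetrySleepIntervalUS
}

// Example: with rpcTimeoutMS=1000 and maxRPCTimeoutMS=8000 the per-attempt
// timeouts on consecutive failures are 1000, 2000, 4000, 8000, 8000, ... ms.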

3 changes: 3 additions & 0 deletions curvefs/conf/mds.conf
@@ -162,3 +162,6 @@ bs.mds.maxFailedTimesBeforeChangeMDS=2
bs.mds.normalRetryTimesBeforeTriggerWait=3
# sleep interval in ms for wait
bs.mds.waitSleepMs=1000

#### Options for volume space deallocation
mds.space.calIntervalSec=60
6 changes: 6 additions & 0 deletions curvefs/conf/metaserver.conf
@@ -325,3 +325,9 @@ metaCacheOpt.metacacheGetLeaderRetry=3
metaCacheOpt.metacacheRPCRetryIntervalUS=100000
# RPC timeout of get leader
metaCacheOpt.metacacheGetLeaderRPCTimeOutMS=1000

#### volume deallocate
volume.deallocate.enable=true
volume.deallocate.workerNum=5
volume.deallocate.batchClean=10
volume.sdk.confPath=conf/curvebs_client.conf # __CURVEADM_TEMPLATE__ /curvefs/metaserver/conf/curvebs_client.conf __CURVEADM_TEMPLATE__
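
A minimal C++ sketch of how the volume.deallocate options above could be wired together: a pool of volume.deallocate.workerNum threads, each cleaning up to volume.deallocate.batchClean block groups per pass through a curvebs client configured from volume.sdk.confPath. The option struct and function names are illustrative, not the identifiers used in this commit:

#include <cstdint>
#include <functional>
#include <string>
#include <thread>
#include <vector>

// Hypothetical option struct mirroring the keys above.
struct VolumeDeallocateOption {
    bool enable = true;            // volume.deallocate.enable
    uint32_t workerNum = 5;        // volume.deallocate.workerNum
    uint32_t batchClean = 10;      // volume.deallocate.batchClean
    std::string sdkConfPath;       // volume.sdk.confPath (curvebs client conf)
};

void Worker(const VolumeDeallocateOption& opt) {
    // Each worker repeatedly takes up to opt.batchClean deallocatable block
    // groups and releases the corresponding space on the curvebs volume
    // through the SDK configured by opt.sdkConfPath (details omitted).
}

void StartDeallocateWorkers(const VolumeDeallocateOption& opt) {
    if (!opt.enable) {
        return;
    }
    std::vector<std::thread> workers;
    for (uint32_t i = 0; i < opt.workerNum; ++i) {
        workers.emplace_back(Worker, std::cref(opt));
    }
    for (auto& t : workers) {
        t.join();
    }
}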
1 change: 1 addition & 0 deletions curvefs/proto/BUILD
@@ -113,6 +113,7 @@ proto_library(
name = "curvefs_heartbeat_proto",
srcs = ["heartbeat.proto"],
deps = [":curvefs_common_proto",
":metaserver_proto",
"//proto:heartbeat_proto"],
)

5 changes: 5 additions & 0 deletions curvefs/proto/common.proto
@@ -31,6 +31,11 @@ enum BitmapLocation {
AtEnd = 2;
}

message BlockGroupID {
required uint64 fsId = 1;
required uint64 offset = 2;
}

// When creating fs, `volumeSize` and `extendAlignment` are fetched from the bs cluster
message Volume {
optional uint64 volumeSize = 1;
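BlockGroupID pairs a filesystem id with the block group's byte offset in the backing volume, which is enough to name a block group uniquely. A short example with the generated C++ API, assuming the usual curvefs::common namespace for this package:

#include "curvefs/proto/common.pb.h"

// Hedged example: identify the block group at offset 128 MiB of fs 1.
curvefs::common::BlockGroupID MakeBlockGroupID() {
    curvefs::common::BlockGroupID id;
    id.set_fsid(1);
    id.set_offset(128ULL * 1024 * 1024);
    return id;
}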
16 changes: 16 additions & 0 deletions curvefs/proto/heartbeat.proto
@@ -16,6 +16,7 @@

syntax = "proto2";
import "curvefs/proto/common.proto";
import "curvefs/proto/metaserver.proto";
import "proto/heartbeat.proto";
package curvefs.mds.heartbeat;
option cc_generic_services = true;
@@ -27,6 +28,17 @@ option go_package = "curvefs/proto/heartbeat";
// required uint64 diskUsedByte = 2; // the disk this copyset used
// };

enum BlockGroupDeallcateStatusCode {
BGDP_PROCESSING = 0;
BGDP_DONE = 1;
}

message BlockGroupStatInfo {
required uint32 fsId = 1;
repeated metaserver.DeallocatableBlockGroup deallocatableBlockGroups = 2;
map<uint64, BlockGroupDeallcateStatusCode> blockGroupDeallocateStatus = 3;
}

message CopySetInfo {
required uint32 poolId = 1;
required uint32 copysetId = 2;
@@ -81,6 +93,7 @@ message MetaServerHeartbeatRequest {
required uint32 leaderCount = 7;
required uint32 copysetCount = 8;
required MetaServerSpaceStatus spaceStatus = 9;
repeated BlockGroupStatInfo blockGroupStatInfos = 10;
};

message CopySetConf {
@@ -107,11 +120,14 @@ enum HeartbeatStatusCode {
hbMetaServerIpPortNotMatch = 2;
hbMetaServerTokenNotMatch = 3;
hbAnalyseCopysetError = 4;
hbMetaServerFSUnkown = 5;
}

message MetaServerHeartbeatResponse {
required HeartbeatStatusCode statusCode = 1;
repeated CopySetConf needUpdateCopysets = 2;
// key is fsid, value is blockgroup offset
map<uint64, uint64> issuedBlockGroups = 3;
};

service HeartbeatService {
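A minimal C++ sketch of how the new heartbeat fields fit together: the metaserver reports per-fs deallocation progress via BlockGroupStatInfo, and the MDS answers with issuedBlockGroups, a map from fsId to the block group offset it wants deallocated. Only the generated protobuf accessors follow from the messages above; the call sites and the StartDeallocate hook are assumptions:

#include <cstdint>
#include "curvefs/proto/heartbeat.pb.h"

using curvefs::mds::heartbeat::BGDP_PROCESSING;
using curvefs::mds::heartbeat::BlockGroupStatInfo;
using curvefs::mds::heartbeat::MetaServerHeartbeatRequest;
using curvefs::mds::heartbeat::MetaServerHeartbeatResponse;

// Report one block group that is still being deallocated for fsId (sketch).
void FillBlockGroupStat(MetaServerHeartbeatRequest* request,
                        uint32_t fsId, uint64_t blockGroupOffset) {
    BlockGroupStatInfo* stat = request->add_blockgroupstatinfos();
    stat->set_fsid(fsId);
    // Key is the block group offset, value is the deallocation progress.
    (*stat->mutable_blockgroupdeallocatestatus())[blockGroupOffset] =
        BGDP_PROCESSING;
}

// Consume the reply: the MDS issues block groups it wants deallocated (sketch).
void HandleIssuedBlockGroups(const MetaServerHeartbeatResponse& response) {
    for (const auto& kv : response.issuedblockgroups()) {
        uint64_t fsId = kv.first;     // key is fsId
        uint64_t offset = kv.second;  // value is the block group offset
        // StartDeallocate(fsId, offset);  // hypothetical hook
        (void)fsId;
        (void)offset;
    }
}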
1 change: 1 addition & 0 deletions curvefs/proto/mds.proto
@@ -60,6 +60,7 @@ enum FSStatusCode {
INSERT_MANAGE_INODE_FAIL = 35;
DELETE_DENTRY_FAIL = 36;
UPDATE_FS_FAIL = 37;
SPACE_RELEASE_FAIL = 38;
}

// fs interface