
curvefs: support space deallocate for curvebs volume as backend
Signed-off-by: ilixiaocui <[email protected]>
ilixiaocui committed Jul 20, 2023
1 parent 3cda511 commit f011606
Showing 135 changed files with 5,882 additions and 907 deletions.
13 changes: 9 additions & 4 deletions curvefs/conf/client.conf
@@ -58,10 +58,6 @@ executorOpt.maxRetryTimesBeforeConsiderSuspend=20
# batch limit of get inode attr and xattr
executorOpt.batchInodeAttrLimit=10000

#### spaceserver
spaceServer.spaceAddr=127.0.0.1:19999 # __ANSIBLE_TEMPLATE__ {{ groups.space | join_peer(hostvars, "space_listen_port") }} __ANSIBLE_TEMPLATE__
spaceServer.rpcTimeoutMs=1000

#### bdev
# curve client's config file
bdev.confPath=/etc/curve/client.conf
@@ -177,6 +173,15 @@ volume.bitmapAllocator.smallAllocProportion=0.2
# number of block groups allocated at a time
volume.blockGroup.allocateOnce=4

## spaceserver
# a block group whose used space exceeds this proportion can be
# returned to the mds [0.8-1]
volume.space.useThreshold=0.95

# interval in seconds at which the background thread returns
# block groups to the mds
volume.space.releaseInterSec=300

#### s3
# this is for test. if s3.fakeS3=true, all data will be discarded
s3.fakeS3=false
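The two new volume.space keys describe a client-side reclaim loop: every volume.space.releaseInterSec seconds a background thread checks each block group, and any group whose used proportion has reached volume.space.useThreshold can be handed back to the MDS. A minimal C++ sketch of that loop, assuming hypothetical helpers ScanBlockGroups and ReturnToMds (the real implementation elsewhere in this commit may differ):

#include <chrono>
#include <cstdint>
#include <thread>
#include <vector>

// Sketch of the background release loop implied by volume.space.useThreshold
// and volume.space.releaseInterSec. BlockGroup, ScanBlockGroups() and
// ReturnToMds() are hypothetical placeholders, not names from this commit.
struct BlockGroup {
    uint64_t offset = 0;  // block group offset within the volume
    uint64_t size = 0;    // block group size in bytes
    uint64_t used = 0;    // bytes already allocated from this group
};

std::vector<BlockGroup> ScanBlockGroups();    // hypothetical
void ReturnToMds(uint64_t blockGroupOffset);  // hypothetical

void ReleaseLoop(double useThreshold, int releaseInterSec) {
    while (true) {
        std::this_thread::sleep_for(std::chrono::seconds(releaseInterSec));
        for (const BlockGroup& bg : ScanBlockGroups()) {
            double usedRatio = static_cast<double>(bg.used) / bg.size;
            // A nearly full block group is no longer useful for allocation,
            // so hand ownership back to the MDS.
            if (usedRatio >= useThreshold) {
                ReturnToMds(bg.offset);
            }
        }
    }
}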
191 changes: 191 additions & 0 deletions curvefs/conf/curvebs_client.conf
@@ -0,0 +1,191 @@
#
################### MDS-side configuration ##################
#

# MDS address; for an MDS cluster, separate addresses with commas
mds.listen.addr=127.0.0.1:6666

# Whether to register with the MDS during initialization; enabled by default
mds.registerToMDS=true

# RPC timeout for communication with the MDS
mds.rpcTimeoutMS=500

# Maximum RPC timeout for communication with the MDS; exponential backoff never exceeds this value
mds.maxRPCTimeoutMS=2000

# Total retry time for communication with the MDS
mds.maxRetryMS=8000

# Switch to another MDS once consecutive retries on the current MDS exceed this limit; the failure count includes timeout retries
mds.maxFailedTimesBeforeChangeMDS=2

# Number of lease renewals within one lease period with the MDS
mds.refreshTimesPerLease=4

# Sleep interval before each retry of an MDS RPC
mds.rpcRetryIntervalUS=100000

# Number of normal retries before triggering the wait strategy
mds.normalRetryTimesBeforeTriggerWait=3

# Max retry time for IO-Path request
mds.maxRetryMsInIOPath=86400000

# Sleep interval for wait
mds.waitSleepMs=10000

#
################# metacache configuration ################
#

# RPC timeout for getting the leader
metacache.getLeaderTimeOutMS=500

# Number of retries for getting the leader
metacache.getLeaderRetry=5

# Sleep interval before each retry of the get-leader RPC
metacache.rpcRetryIntervalUS=100000

#
############### Scheduling layer configuration #############
#

# Scheduling layer queue size; each file has its own queue
# The queue depth affects overall client throughput; the queue holds asynchronous IO tasks.
schedule.queueCapacity=1000000

# Number of worker threads for the queue
# A worker thread dequeues an IO task, sends it to the network, and then picks up the next task.
# From dequeue to finishing the rpc send takes roughly 20us-100us: about 20us in the normal case
# when no leader lookup is needed, and about 100us when the leader must be fetched during the send,
# so a single thread handles roughly 100k-500k tasks per second, which already meets the performance requirements
schedule.threadpoolSize=2

# Task queue introduced to isolate the qemu-side thread, since qemu has only one IO thread
# When qemu calls the aio interface, the call is pushed onto the task queue and returns immediately,
# so libcurve does not occupy the qemu thread or block its asynchronous calls
isolation.taskQueueCapacity=1000000

# Thread pool size of the task queue isolating the qemu thread; default is 1 thread
isolation.taskThreadPoolSize=1


#
################ Configuration for communication with chunkservers #############
#
# Sleep interval between retries of failed read/write OPs
chunkserver.opRetryIntervalUS=100000

# Number of retries for a failed OP
chunkserver.opMaxRetry=2500000

# RPC timeout for communication with chunkservers
chunkserver.rpcTimeoutMS=1000

# Enable applied-index-based reads as a performance optimization
chunkserver.enableAppliedIndexRead=1

# Maximum sleep time between retried requests
# When the network is congested or a chunkserver is overloaded, the sleep time is increased,
# up to a maximum of maxRetrySleepIntervalUS
chunkserver.maxRetrySleepIntervalUS=8000000

# Maximum RPC timeout for retried requests; the timeout follows an exponential backoff policy
# When timeouts are caused by network congestion, the RPC timeout is increased,
# up to a maximum of maxRPCTimeoutMS
chunkserver.maxRPCTimeoutMS=8000

# Maximum number of consecutive timeouts on the same chunkserver
# When exceeded, a health check is performed; if the check fails, the chunkserver is marked as unstable
chunkserver.maxStableTimeoutTimes=10
# Timeout of the health-check request issued after consecutive rpc timeouts on a chunkserver
chunkserver.checkHealthTimeoutMs=100
# When the number of unstable chunkservers on the same server exceeds this value,
# all chunkservers on that server are marked as unstable
chunkserver.serverStableThreshold=3

# Heavy load on the underlying chunkservers may also trigger the unstable state
# Because the copyset leader may change, the request timeout can be reset to the default value, causing IO hangs
# If the chunkserver is really down, requests complete after a certain number of retries;
# if retries keep going it is not a crash, and the timeout should still follow the exponential backoff logic
# Once a request has been retried more than this many times, its timeout always uses exponential backoff
chunkserver.minRetryTimesForceTimeoutBackoff=5

# When an rpc has been retried more than maxRetryTimesBeforeConsiderSuspend times,
# it is counted as a suspended IO and the metric raises an alarm
chunkserver.maxRetryTimesBeforeConsiderSuspend=20

#
################# File-level configuration #############
#
# Maximum number of outstanding rpcs allowed by libcurve's underlying rpc scheduling; inflight RPCs are counted per file
global.fileMaxInFlightRPCNum=128

# Maximum split size in KB for file IO issued to the underlying chunkservers
global.fileIOSplitMaxSizeKB=64

#
################# Log configuration ###############
#
# enable logging or not
global.logging.enable=True
#
# Log level: INFO=0/WARNING=1/ERROR=2/FATAL=3
global.logLevel=0
# Log path
global.logPath=/data/log/curve/ # __CURVEADM_TEMPLATE__ /curvebs/client/logs __CURVEADM_TEMPLATE__
# For unit tests:
# logpath=./runlog/

#
################# Source volume read configuration ###############
#
# Timeout (300s) after which an fd opened for reading the source volume is closed
closefd.timeout=300
# The background thread scans fdMap every 600s and closes timed-out fds opened for reading the source volume
closefd.timeInterval=600

#
############### Metric configuration #############
#
global.metricDummyServerStartPort=9000

# Whether to disable the health check: true = disabled, false = enabled
global.turnOffHealthCheck=true

#
### throttle config
#
throttle.enable=false

##### discard configurations #####
# enable/disable discard
discard.enable=true
# discard granularity
discard.granularity=4096
# discard cleanup task delay in milliseconds
discard.taskDelayMs=60000

##### alignment #####
# default alignment
global.alignment.commonVolume=512
# alignment for clone volume
# default is 4096, because lazy clone chunk bitmap granularity is 4096
global.alignment.cloneVolume=4096

##### chunkserver client option #####
# chunkserver client rpc timeout time
csClientOpt.rpcTimeoutMs=500
# chunkserver client rpc max try
csClientOpt.rpcMaxTry=86400000
# chunkserver client rpc retry interval
csClientOpt.rpcIntervalUs=100000
# chunkserver client rpc max timeout time
csClientOpt.rpcMaxTimeoutMs=8000

##### chunkserver broadcaster option #####
# maximum number of machines to broadcast to
csBroadCasterOpt.broadCastMaxNum=200
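
The chunkserver.* comments above all describe the same backoff scheme: the per-attempt RPC timeout starts at chunkserver.rpcTimeoutMS and grows exponentially up to chunkserver.maxRPCTimeoutMS, while the sleep between retries grows from chunkserver.opRetryIntervalUS up to chunkserver.maxRetrySleepIntervalUS. A minimal C++ sketch of that policy, for illustration only (not the libcurve implementation):

#include <algorithm>
#include <cstdint>

// Illustrative backoff helpers for the chunkserver.* retry settings above.
uint64_t NextRpcTimeoutMs(uint64_t baseTimeoutMs, uint64_t maxTimeoutMs,
                          uint32_t timeoutRetries) {
    uint32_t shift = std::min(timeoutRetries, 16u);          // avoid overflow
    return std::min(baseTimeoutMs << shift, maxTimeoutMs);   // cap at maxRPCTimeoutMS
}

uint64_t NextRetrySleepUs(uint64_t baseSleepUs, uint64_t maxSleepUs,
                          uint32_t retries) {
    uint32_t shift = std::min(retries, 16u);
    return std::min(baseSleepUs << shift, maxSleepUs);        // cap at maxRetrySleepIntervalUS
}

// Example: with rpcTimeoutMS=1000 and maxRPCTimeoutMS=8000 the per-attempt
// timeouts on consecutive failures are 1000, 2000, 4000, 8000, 8000, ... ms.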

3 changes: 3 additions & 0 deletions curvefs/conf/mds.conf
@@ -162,3 +162,6 @@ bs.mds.maxFailedTimesBeforeChangeMDS=2
bs.mds.normalRetryTimesBeforeTriggerWait=3
# sleep interval in ms for wait
bs.mds.waitSleepMs=1000

#### Options for volume space deallocation
mds.space.calIntervalSec=60
6 changes: 6 additions & 0 deletions curvefs/conf/metaserver.conf
@@ -325,3 +325,9 @@ metaCacheOpt.metacacheGetLeaderRetry=3
metaCacheOpt.metacacheRPCRetryIntervalUS=100000
# RPC timeout of get leader
metaCacheOpt.metacacheGetLeaderRPCTimeOutMS=1000

#### volume deallocate
volume.deallocate.enable=true
volume.deallocate.workerNum=5
volume.deallocate.batchClean=10
volume.sdk.confPath=conf/curvebs_client.conf # __CURVEADM_TEMPLATE__ /curvefs/metaserver/conf/curvebs_client.conf __CURVEADM_TEMPLATE__
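
A minimal C++ sketch of how the volume.deallocate options above could be wired together: a pool of volume.deallocate.workerNum threads, each cleaning up to volume.deallocate.batchClean block groups per pass through a curvebs client configured from volume.sdk.confPath. The option struct and function names are illustrative, not the identifiers used in this commit:

#include <cstdint>
#include <functional>
#include <string>
#include <thread>
#include <vector>

// Hypothetical option struct mirroring the keys above.
struct VolumeDeallocateOption {
    bool enable = true;            // volume.deallocate.enable
    uint32_t workerNum = 5;        // volume.deallocate.workerNum
    uint32_t batchClean = 10;      // volume.deallocate.batchClean
    std::string sdkConfPath;       // volume.sdk.confPath (curvebs client conf)
};

void Worker(const VolumeDeallocateOption& opt) {
    // Each worker repeatedly takes up to opt.batchClean deallocatable block
    // groups and releases the corresponding space on the curvebs volume
    // through the SDK configured by opt.sdkConfPath (details omitted).
}

void StartDeallocateWorkers(const VolumeDeallocateOption& opt) {
    if (!opt.enable) {
        return;
    }
    std::vector<std::thread> workers;
    for (uint32_t i = 0; i < opt.workerNum; ++i) {
        workers.emplace_back(Worker, std::cref(opt));
    }
    for (auto& t : workers) {
        t.join();
    }
}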
1 change: 1 addition & 0 deletions curvefs/proto/BUILD
@@ -113,6 +113,7 @@ proto_library(
name = "curvefs_heartbeat_proto",
srcs = ["heartbeat.proto"],
deps = [":curvefs_common_proto",
":metaserver_proto",
"//proto:heartbeat_proto"],
)

5 changes: 5 additions & 0 deletions curvefs/proto/common.proto
@@ -31,6 +31,11 @@ enum BitmapLocation {
AtEnd = 2;
}

message BlockGroupID {
required uint64 fsId = 1;
required uint64 offset = 2;
}

// When creating fs, `volumeSize` and `extendAlignment` are fetched from the bs cluster
message Volume {
optional uint64 volumeSize = 1;
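BlockGroupID pairs a filesystem id with the block group's byte offset in the backing volume, which is enough to name a block group uniquely. A short example with the generated C++ API, assuming the usual curvefs::common namespace for this package:

#include "curvefs/proto/common.pb.h"

// Hedged example: identify the block group at offset 128 MiB of fs 1.
curvefs::common::BlockGroupID MakeBlockGroupID() {
    curvefs::common::BlockGroupID id;
    id.set_fsid(1);
    id.set_offset(128ULL * 1024 * 1024);
    return id;
}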
16 changes: 16 additions & 0 deletions curvefs/proto/heartbeat.proto
@@ -16,6 +16,7 @@

syntax = "proto2";
import "curvefs/proto/common.proto";
import "curvefs/proto/metaserver.proto";
import "proto/heartbeat.proto";
package curvefs.mds.heartbeat;
option cc_generic_services = true;
@@ -27,6 +28,17 @@ option go_package = "curvefs/proto/heartbeat";
// required uint64 diskUsedByte = 2; // the disk this copyset used
// };

enum BlockGroupDeallcateStatusCode {
BGDP_PROCESSING = 0;
BGDP_DONE = 1;
}

message BlockGroupStatInfo {
required uint32 fsId = 1;
repeated metaserver.DeallocatableBlockGroup deallocatableBlockGroups = 2;
map<uint64, BlockGroupDeallcateStatusCode> blockGroupDeallocateStatus = 3;
}

message CopySetInfo {
required uint32 poolId = 1;
required uint32 copysetId = 2;
@@ -81,6 +93,7 @@ message MetaServerHeartbeatRequest {
required uint32 leaderCount = 7;
required uint32 copysetCount = 8;
required MetaServerSpaceStatus spaceStatus = 9;
repeated BlockGroupStatInfo blockGroupStatInfos = 10;
};

message CopySetConf {
@@ -107,11 +120,14 @@ enum HeartbeatStatusCode {
hbMetaServerIpPortNotMatch = 2;
hbMetaServerTokenNotMatch = 3;
hbAnalyseCopysetError = 4;
hbMetaServerFSUnkown = 5;
}

message MetaServerHeartbeatResponse {
required HeartbeatStatusCode statusCode = 1;
repeated CopySetConf needUpdateCopysets = 2;
// key is fsid, value is blockgroup offset
map<uint64, uint64> issuedBlockGroups = 3;
};

service HeartbeatService {
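A minimal C++ sketch of how the new heartbeat fields fit together: the metaserver reports per-fs deallocation progress via BlockGroupStatInfo, and the MDS answers with issuedBlockGroups, a map from fsId to the block group offset it wants deallocated. Only the generated protobuf accessors follow from the messages above; the call sites and the StartDeallocate hook are assumptions:

#include <cstdint>
#include "curvefs/proto/heartbeat.pb.h"

using curvefs::mds::heartbeat::BGDP_PROCESSING;
using curvefs::mds::heartbeat::BlockGroupStatInfo;
using curvefs::mds::heartbeat::MetaServerHeartbeatRequest;
using curvefs::mds::heartbeat::MetaServerHeartbeatResponse;

// Report one block group that is still being deallocated for fsId (sketch).
void FillBlockGroupStat(MetaServerHeartbeatRequest* request,
                        uint32_t fsId, uint64_t blockGroupOffset) {
    BlockGroupStatInfo* stat = request->add_blockgroupstatinfos();
    stat->set_fsid(fsId);
    // Key is the block group offset, value is the deallocation progress.
    (*stat->mutable_blockgroupdeallocatestatus())[blockGroupOffset] =
        BGDP_PROCESSING;
}

// Consume the reply: the MDS issues block groups it wants deallocated (sketch).
void HandleIssuedBlockGroups(const MetaServerHeartbeatResponse& response) {
    for (const auto& kv : response.issuedblockgroups()) {
        uint64_t fsId = kv.first;     // key is fsId
        uint64_t offset = kv.second;  // value is the block group offset
        // StartDeallocate(fsId, offset);  // hypothetical hook
        (void)fsId;
        (void)offset;
    }
}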
1 change: 1 addition & 0 deletions curvefs/proto/mds.proto
@@ -60,6 +60,7 @@ enum FSStatusCode {
INSERT_MANAGE_INODE_FAIL = 35;
DELETE_DENTRY_FAIL = 36;
UPDATE_FS_FAIL = 37;
SPACE_RELEASE_FAIL = 38;
}

// fs interface