#
# trash option
#
trash.scanPeriodSec=600
trash.expiredAfterSec=604800
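# for reference, with the defaults above: the trash is scanned every 600 s (10 minutes)
# and entries expire after 604800 s = 7 * 24 * 3600 s (7 days)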
# s3
# if s3.enableBatchDelete is set to True, batchsize limits the number of objects deleted per delete request
s3.batchsize=100
# if the s3 sdk supports batch deleting objects, set True; otherwise set False
s3.enableBatchDelete=False
# http = 0, https = 1
s3.http_scheme=0
s3.verify_SSL=False
s3.user_agent=S3 Browser
s3.region=us-east-1
s3.maxConnections=32
s3.connectTimeout=60000
s3.requestTimeout=10000
# Off = 0,Fatal = 1,Error = 2,Warn = 3,Info = 4,Debug = 5,Trace = 6
s3.logLevel=4
s3.logPrefix=/tmp/curvefs/metaserver/aws_
s3.asyncThreadNum=10
# throttle
s3.throttle.iopsTotalLimit=0
s3.throttle.iopsReadLimit=0
s3.throttle.iopsWriteLimit=0
s3.throttle.bpsTotalMB=0
s3.throttle.bpsReadMB=0
s3.throttle.bpsWriteMB=0
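# illustrative example: to cap aggregate S3 traffic at about 2000 IOPS and 100 MB/s
# (assuming a value of 0 leaves the corresponding limit disabled), one could set:
#   s3.throttle.iopsTotalLimit=2000
#   s3.throttle.bpsTotalMB=100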
s3.useVirtualAddressing=false
# s3 workqueue
s3compactwq.enable=True
s3compactwq.thread_num=2
s3compactwq.queue_size=5
# fragments threshold in an s3chunkinfolist
s3compactwq.fragment_threshold=20
# max chunks to process per compact task
s3compactwq.max_chunks_per_compact=10
# roughly controls the compaction frequency
s3compactwq.enqueue_sleep_ms=1000
s3compactwq.s3infocache_size=100
# workaround for read failures when diskcache is enabled
s3compactwq.s3_read_max_retry=5
s3compactwq.s3_read_retry_interval=5 # in seconds
# metaserver listen ip and port
# these two config items, ip and port, can be replaced by the startup options `-ip` and `-port`
global.ip=127.0.0.1 # __CURVEADM_TEMPLATE__ ${service_addr} __CURVEADM_TEMPLATE__ __ANSIBLE_TEMPLATE__ {{ curvefs_metaserver_listen_host }} __ANSIBLE_TEMPLATE__
global.port=16701 # __CURVEADM_TEMPLATE__ ${service_port} __CURVEADM_TEMPLATE__ __ANSIBLE_TEMPLATE__ {{ curvefs_metaserver_listen_port }} __ANSIBLE_TEMPLATE__
global.external_ip=127.0.0.1 # __CURVEADM_TEMPLATE__ ${service_external_addr} __CURVEADM_TEMPLATE__ __ANSIBLE_TEMPLATE__ {{ curvefs_metaserver_listen_host }} __ANSIBLE_TEMPLATE__
global.external_port=16701 # __CURVEADM_TEMPLATE__ ${service_external_port} __CURVEADM_TEMPLATE__
global.enable_external_server=false
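# example startup override using the options mentioned above
# (binary path and address values are illustrative only):
#   ./curvefs-metaserver -ip=10.0.0.3 -port=16701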
# metaserver log directory
# this config item can be replaced by the startup option `-log_dir`
metaserver.common.logDir=/tmp/curvefs/metaserver # __CURVEADM_TEMPLATE__ ${prefix}/logs __CURVEADM_TEMPLATE__ __ANSIBLE_TEMPLATE__ /tmp/{{ inventory_hostname }}/curvefs/metaserver __ANSIBLE_TEMPLATE__
# available loglevel values: {3,6,9}
# as the number increases, the logging becomes more and more detailed
metaserver.loglevel=0
# metaserver meta file path; every metaserver needs to persist MetaServerMetadata on its own disk
metaserver.meta_file_path=./0/metaserver.dat # __CURVEADM_TEMPLATE__ ${prefix}/data/metaserver.dat __CURVEADM_TEMPLATE__
# enable lease read, default value is true
# if value = true: read requests check the lease and, while it is valid, can be served without going through raft.
# if value = false: all requests, including read requests, are proposed to raft.
copyset.enable_lease_read=true
# copyset data uri
# all uri (data_uri/raft_log_uri/raft_meta_uri/raft_snapshot_uri/trash.uri) are ${protocol}://${path}
# e.g., when saving data to a local disk, the protocol is `local` and the path can be absolute or relative:
# local:///mnt/data or local://./data
# this config item can be replaced by the startup option `-dataUri`
copyset.data_uri=local://./0/copysets # __CURVEADM_TEMPLATE__ local://${prefix}/data/copysets __CURVEADM_TEMPLATE__ __ANSIBLE_TEMPLATE__ local://{{ curvefs_metaserver_data_root }}/copysets __ANSIBLE_TEMPLATE__
# copyset reload concurrency
# when the server restarts, it reloads copysets from `copyset.data_uri`
# if the value is 1, all copysets are loaded one by one, which may cause a long start-up time
# if the value is bigger than 1, at most |load_concurrency| copysets are loaded in parallel,
# but a larger value may cause higher cpu/memory/disk usage
copyset.load_concurrency=5
# if the difference between the applied_index of the current replica and the
# committed_index on the leader is less than |finishLoadMargin|, the copyset
# is considered to be fully loaded
copyset.finishload_margin=2000
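# worked example: if the leader's committed_index is 10500 and this replica's
# applied_index is 9000, the difference (1500) is below finishload_margin=2000,
# so the copyset is considered fully loaded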
# the maximum number of retries to check whether a copyset is fully loaded
copyset.check_retrytimes=3
# sleep time in milliseconds between successive checks of whether a copyset is loaded
copyset.check_loadmargin_interval_ms=1000
# raft election timeout in milliseconds
# follower would become a candidate if it doesn't receive any message
# from the leader in |election_timeout_ms| milliseconds
copyset.election_timeout_ms=1000
# raft snapshot interval in seconds
# snapshot saving is triggered every |snapshot_interval_s| seconds if this is set to a positive number
# if |snapshot_interval_s| <= 0, time-based snapshots are disabled
copyset.snapshot_interval_s=1800
# raft catchup margin
# regard an adding peer as caught up if the margin between
# the last_log_index of this peer and the last_log_index of the leader is less than |catchup_margin|
copyset.catchup_margin=1000
# raft-log storage uri
# this config item can be replaced by the startup option `-raftLogUri`
copyset.raft_log_uri=local://./0/copysets # __CURVEADM_TEMPLATE__ local://${prefix}/data/copysets __CURVEADM_TEMPLATE__ __ANSIBLE_TEMPLATE__ local://{{ curvefs_metaserver_data_root }}/copysets __ANSIBLE_TEMPLATE__
# raft-meta storage uri
# this config item can be replaced by the startup option `-raftMetaUri`
copyset.raft_meta_uri=local://./0/copysets # __CURVEADM_TEMPLATE__ local://${prefix}/data/copysets __CURVEADM_TEMPLATE__ __ANSIBLE_TEMPLATE__ local://{{ curvefs_metaserver_data_root }}/copysets __ANSIBLE_TEMPLATE__
# raft-snapshot storage uri
# this config item can be replaced by the startup option `-raftSnapshotUri`
copyset.raft_snapshot_uri=local://./0/copysets # __CURVEADM_TEMPLATE__ local://${prefix}/data/copysets __CURVEADM_TEMPLATE__ __ANSIBLE_TEMPLATE__ local://{{ curvefs_metaserver_data_root }}/copysets __ANSIBLE_TEMPLATE__
# trash-uri
# when a copyset is deleted, its data is first moved to the trash directory
# this config item can be replaced by the startup option `-trashUri`
copyset.trash.uri=local://./0/trash # __CURVEADM_TEMPLATE__ local://${prefix}/data/trash __CURVEADM_TEMPLATE__ __ANSIBLE_TEMPLATE__ local://{{ curvefs_metaserver_data_root }}/trash __ANSIBLE_TEMPLATE__
# after a copyset's data has been in the trash directory for more than |expired_aftersec| seconds,
# it will be deleted
copyset.trash.expired_aftersec=300
# background trash thread scan interval in seconds
copyset.trash.scan_periodsec=120
# maximum number of requests being processed concurrently
# this config item should be tuned according to cpu/memory/disk capacity
service.max_inflight_request=5000
#
# Concurrent apply queue
### concurrent apply queue options for each copyset
### the concurrent apply queue is used to isolate raft threads; each worker has its own queue
### when a task can be applied, it is pushed into the corresponding read/write worker queue according to certain rules
# worker_count: number of apply queue workers; each worker starts an independent thread
# queue_depth: apply queue depth for each copyset
# all tasks in the queue must be finished before a raft snapshot is taken, and raft apply and raft snapshot run in the same thread,
# so if the queue depth is too large, other tasks may wait too long to be applied
# write apply queue workers count
applyqueue.write_worker_count=3
# write apply queue depth
applyqueue.write_queue_depth=1
# read apply queue workers count
applyqueue.read_worker_count=2
# read apply queue depth
applyqueue.read_queue_depth=1
# number of worker threads created by brpc::Server
# if set to |auto|, the number of threads created by brpc::Server equals `getconf _NPROCESSORS_ONLN` + 1
# if set to a fixed value, it will create |worker_count| threads, and the valid range is [4, 1024]
# it is recommended to keep |auto| unless a fixed value brings a significant performance improvement
bthread.worker_count=auto
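# example: with |auto| on a 16-core machine, `getconf _NPROCESSORS_ONLN` returns 16,
# so brpc::Server starts 17 worker threads; a fixed setting such as
# bthread.worker_count=32 must stay within [4, 1024]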
# If a connection does not read or write, it's treated as "idle" and will be closed by the server soon.
# Default value is -1 which disables the feature.
server.idleTimeoutSec=-1
### Braft related flags
### These configurations are ignored if the command line startup options are set
# Call fsync when needed
# braft default is True. Setting it to False can greatly improve performance,
# but data may be lost if all replicas lose power at the same time.
# Choose according to your deployment scenario.
braft.raft_sync=False
# Sync log meta, snapshot meta and raft meta
# braft default is False
braft.raft_sync_meta=True
# Call fsync when a segment is closed
# braft default is False
braft.raft_sync_segments=True
# Use fsync rather than fdatasync to flush page cache
# braft default is True
braft.raft_use_fsync_rather_than_fdatasync=False
# Max num of install_snapshot tasks per disk at the same time
# braft default is 1000
braft.raft_max_install_snapshot_tasks_num=10
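# example override at startup (assuming the command line options carry the key
# names without the `braft.` prefix, i.e. braft's gflags; binary path illustrative):
#   ./curvefs-metaserver -raft_sync=true -raft_sync_meta=true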
#
# MDS settings
#
# supports multiple addresses, use ',' to separate them: 127.0.0.1:6700,127.0.0.1:6701
mds.listen.addr=127.0.0.1:6700 # __CURVEADM_TEMPLATE__ ${cluster_mds_addr} __CURVEADM_TEMPLATE__ __ANSIBLE_TEMPLATE__ {{ groups.mds | join_peer(hostvars, "mds_listen_port") }} __ANSIBLE_TEMPLATE__
# the max retry times for metaserver to register to mds
mds.register_retries=100
# the rpc timeout when the metaserver registers to mds, normally 1000ms
mds.register_timeoutMs=1000
# the interval at which the metaserver sends heartbeats to mds, normally 10s
mds.heartbeat_intervalSec=10
# the rpc timeout for the metaserver's heartbeat to mds, normally 1000ms
mds.heartbeat_timeoutMs=1000
#
# partition clean settings
#
# partition clean manager scans partitions every scanPeriodSec seconds
partition.clean.scanPeriodSec=10
# partition clean manager deletes an inode every inodeDeletePeriodMs milliseconds
partition.clean.inodeDeletePeriodMs=500
#### mdsOpt
# total RPC retry time with MDS (ms)
mdsOpt.mdsMaxRetryMS=16000
# The maximum timeout of RPC communicating with MDS.
# The timeout of exponential backoff cannot exceed this value
mdsOpt.rpcRetryOpt.maxRPCTimeoutMS=2000
# RPC timeout for a single communication with MDS
mdsOpt.rpcRetryOpt.rpcTimeoutMs=500
# RPC with mds needs to sleep for a period of time before each retry
mdsOpt.rpcRetryOpt.rpcRetryIntervalUS=50000
# Switch to another MDS if the number of consecutive failed retries on the current MDS exceeds this limit.
# The failure count includes timeout retries
mdsOpt.rpcRetryOpt.maxFailedTimesBeforeChangeAddr=2
# The number of normal retries before the wait strategy is triggered
mdsOpt.rpcRetryOpt.normalRetryTimesBeforeTriggerWait=3
# Sleep interval for wait
mdsOpt.rpcRetryOpt.waitSleepMs=1000
mdsOpt.rpcRetryOpt.addrs=127.0.0.1:6700,127.0.0.1:6701,127.0.0.1:6702 # __CURVEADM_TEMPLATE__ ${cluster_mds_addr} __CURVEADM_TEMPLATE__ __ANSIBLE_TEMPLATE__ {{ groups.mds | join_peer(hostvars, "mds_listen_port") }} __ANSIBLE_TEMPLATE__
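# worked example with the values above (roughly): a request starts with a 500 ms timeout
# and backs off exponentially (1000 ms, then capped at maxRPCTimeoutMS=2000), sleeping
# rpcRetryIntervalUS=50000 (50 ms) between attempts; after 2 consecutive failures on one
# MDS the client switches to the next address in rpcRetryOpt.addrs, and retrying stops
# once the total retry time reaches mdsMaxRetryMS=16000 (16 s)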
#
# storage settings
#
# storage type, "memory" or "rocksdb"
storage.type=rocksdb
# metaserver max memory quota bytes (default: 30GB)
storage.max_memory_quota_bytes=32212254720
# metaserver max disk quota bytes (default: 2TB)
storage.max_disk_quota_bytes=2199023255552
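# for reference: 32212254720 bytes = 30 * 1024^3 (30 GiB) and
# 2199023255552 bytes = 2 * 1024^4 (2 TiB); e.g. a 16 GiB memory quota would be
# storage.max_memory_quota_bytes=17179869184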
# whether to compress values for memory storage (default: False)
storage.memory.compression=False
# rocksdb block cache(LRU) capacity (default: 8GB)
storage.rocksdb.block_cache_capacity=8589934592
# rocksdb write buffer manager capacity (default: 6GB)
storage.rocksdb.write_buffer_manager_capacity=6442450944
# Control whether the write buffer manager charges its memory usage to the block cache
# If true, the total memory usage of rocksdb is limited by `block_cache_capacity`
storage.rocksdb.WBM_cost_block_cache=false
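# worked example with the values above: the block cache is 8 GiB and the write
# buffer manager is 6 GiB; with WBM_cost_block_cache=false they are accounted
# separately (roughly up to 14 GiB combined), while with true the write buffers
# are charged against the block cache, keeping both within the 8 GiB
# block_cache_capacity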
# Maximum number of concurrent background jobs (compactions and flushes)
storage.rocksdb.max_background_jobs=16
# Maximum number of threads that may perform a compaction job simultaneously (default: 4)
storage.rocksdb.max_subcompactions=4
# Number of files to trigger level-0 compaction (default: 1)
storage.rocksdb.level0_file_num_compaction_trigger=1
# Control maximum total data size for a level (default: 1GB)
storage.rocksdb.max_bytes_for_level_base=1073741824
# rocksdb column family's write_buffer_size
# for storing inodes excluding their s3chunkinfo lists (unit: bytes, default: 64MB)
storage.rocksdb.unordered_write_buffer_size=67108864
# rocksdb column family's max_write_buffer_number
# for storing inodes excluding their s3chunkinfo lists (default: 3)
storage.rocksdb.unordered_max_write_buffer_number=3
# rocksdb column family's write_buffer_size
# for storing dentries and inodes' s3chunkinfo lists (unit: bytes, default: 64MB)
storage.rocksdb.ordered_write_buffer_size=67108864
# rocksdb column family's max_write_buffer_number
# for storing dentries and inodes' s3chunkinfo lists (default: 3)
storage.rocksdb.ordered_max_write_buffer_number=3
# rocksdb column family's write_buffer_size
# for storing tx locks and writes (unit: bytes, default: 64MB)
storage.rocksdb.tx_cf_write_buffer_size=67108864
# rocksdb column family's max_write_buffer_number
# for storing tx locks and writes (default: 3)
storage.rocksdb.tx_cf_max_write_buffer_number=3
# The target number of write history bytes to hold in memory (default: 20MB)
storage.rocksdb.max_write_buffer_size_to_maintain=20971520
# rocksdb memtable prefix bloom size ratio (size=write_buffer_size*memtable_prefix_bloom_size_ratio)
storage.rocksdb.memtable_prefix_bloom_size_ratio=0.1
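# e.g. with write_buffer_size=67108864 (64 MiB) and a ratio of 0.1, each
# memtable's prefix bloom filter is about 6.4 MiB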
# dump rocksdb.stats to LOG every stats_dump_period_sec
storage.rocksdb.stats_dump_period_sec=180
# rocksdb perf level:
# 0: kDisable
# 1: kEnableCount
# 2: kEnableTimeAndCPUTimeExceptForMutex
# 3: kEnableTimeExceptForMutex
# 4: kEnableTime
# see also: https://github.com/facebook/rocksdb/wiki/Perf-Context-and-IO-Stats-Context#profile-levels-and-costs
storage.rocksdb.perf_level=0
# all rocksdb operations whose latency is greater than |perf_slow_us| microseconds
# will be considered slow operations
storage.rocksdb.perf_slow_us=100
# rocksdb perf sampling ratio
storage.rocksdb.perf_sampling_ratio=0
# if the number of an inode's s3chunkinfo entries exceeds limit_size,
# they will be sent with rpc streaming instead of
# being padded into the inode (default: 25000, about 25000 * 41 bytes = 1MB)
storage.s3_meta_inside_inode.limit_size=25000
# TTL(millisecond) for tx lock
storage.tx_lock_ttl_ms=5000
# recycle options
# period for the metaserver to scan the recycle bin, default 1h (3600 s)
recycle.manager.scanPeriodSec=3600
# limit on the number of dentries listed per scan by the recycle cleaner, default 1000
recycle.cleaner.scanLimit=1000
#### excutorOpt
# excutorOpt: options for rpc with metaserver
# rpc retry times with metaserver
excutorOpt.maxRetry=4294967295
# internal rpc retry times with metaserver
excutorOpt.maxInternalRetry=3
# Retry sleep time between failed RPCs
excutorOpt.retryIntervalUS=100000
# RPC timeout for communicating with metaserver
excutorOpt.rpcTimeoutMS=1000
# RPC stream idle timeout
excutorOpt.rpcStreamIdleTimeoutMS=500
# The maximum RPC timeout for retried requests.
# The timeout follows the exponential backoff policy.
# Because timeouts usually occur when the network is congested, the RPC timeout needs to be increased on retry
excutorOpt.maxRPCTimeoutMS=8000
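# worked example with the values above: the first attempt uses rpcTimeoutMS=1000;
# on timeouts the following attempts back off to 2000 ms, 4000 ms and finally
# 8000 ms, where maxRPCTimeoutMS caps the growth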
# Maximum sleep time between retry requests.
# when the network is congested or the metaserver is overloaded,
# it is necessary to increase the sleep time
excutorOpt.maxRetrySleepIntervalUS=8000000
excutorOpt.minRetryTimesForceTimeoutBackoff=5
excutorOpt.maxRetryTimesBeforeConsiderSuspend=20
# batch limit of get inode attr and xattr
excutorOpt.batchInodeAttrLimit=10000
excutorOpt.enableMultiMountPointRename=true
#### metaCacheOpt
# Number of retries for getting the leader
metaCacheOpt.metacacheGetLeaderRetry=3
# Sleep for a period of time before each get-leader retry
metaCacheOpt.metacacheRPCRetryIntervalUS=100000
# RPC timeout of get leader
metaCacheOpt.metacacheGetLeaderRPCTimeOutMS=1000
#### volume deallocate
volume.deallocate.enable=true
volume.deallocate.workerNum=5
volume.deallocate.batchClean=10
volume.sdk.confPath=conf/curvebs_client.conf # __CURVEADM_TEMPLATE__ /curvefs/metaserver/conf/curvebs_client.conf __CURVEADM_TEMPLATE__