Skip to content

Commit

Permalink
Btrfs: allow block group cache writeout outside critical section in c…
Browse files Browse the repository at this point in the history
…ommit

We loop through all of the dirty block groups during commit and write
the free space cache.  In order to make sure the cache is correct, we do
this while no other writers are allowed in the commit.

If a large number of block groups are dirty, this can introduce long
stalls during the final stages of the commit, which can block new procs
trying to change the filesystem.

This commit changes the block group cache writeout to take appropriate
locks and allow it to run earlier in the commit.  We'll still have to
redo some of the block groups, but it means we can get most of the work
out of the way without blocking the entire FS.

Signed-off-by: Chris Mason <[email protected]>
  • Loading branch information
masoncl committed Apr 10, 2015
1 parent 2b10826 commit 1bbc621
Show file tree
Hide file tree
Showing 9 changed files with 341 additions and 37 deletions.
8 changes: 8 additions & 0 deletions fs/btrfs/ctree.h
Original file line number Diff line number Diff line change
Expand Up @@ -1491,6 +1491,12 @@ struct btrfs_fs_info {
struct mutex chunk_mutex;
struct mutex volume_mutex;

/*
* this is taken to make sure we don't set block groups ro after
* the free space cache has been allocated on them
*/
struct mutex ro_block_group_mutex;

/* this is used during read/modify/write to make sure
* no two ios are trying to mod the same stripe at the same
* time
Expand Down Expand Up @@ -3407,6 +3413,8 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 root_objectid, u64 owner, u64 offset, int no_quota);

int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
Expand Down
1 change: 1 addition & 0 deletions fs/btrfs/disk-io.c
Original file line number Diff line number Diff line change
Expand Up @@ -2572,6 +2572,7 @@ int open_ctree(struct super_block *sb,
mutex_init(&fs_info->transaction_kthread_mutex);
mutex_init(&fs_info->cleaner_mutex);
mutex_init(&fs_info->volume_mutex);
mutex_init(&fs_info->ro_block_group_mutex);
init_rwsem(&fs_info->commit_root_sem);
init_rwsem(&fs_info->cleanup_work_sem);
init_rwsem(&fs_info->subvol_sem);
Expand Down
241 changes: 216 additions & 25 deletions fs/btrfs/extent-tree.c
Original file line number Diff line number Diff line change
Expand Up @@ -3298,7 +3298,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
if (ret)
goto out_put;

ret = btrfs_truncate_free_space_cache(root, trans, inode);
ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
if (ret)
goto out_put;
}
Expand Down Expand Up @@ -3382,20 +3382,156 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
return 0;
}

int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
/*
* transaction commit does final block group cache writeback during a
* critical section where nothing is allowed to change the FS. This is
* required in order for the cache to actually match the block group,
* but can introduce a lot of latency into the commit.
*
* So, btrfs_start_dirty_block_groups is here to kick off block group
* cache IO. There's a chance we'll have to redo some of it if the
* block group changes again during the commit, but it greatly reduces
* the commit latency by getting rid of the easy block groups while
* we're still allowing others to join the commit.
*/
int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_block_group_cache *cache;
struct btrfs_transaction *cur_trans = trans->transaction;
int ret = 0;
int should_put;
struct btrfs_path *path;
LIST_HEAD(io);
struct btrfs_path *path = NULL;
LIST_HEAD(dirty);
struct list_head *io = &cur_trans->io_bgs;
int num_started = 0;
int num_waited = 0;
int loops = 0;

spin_lock(&cur_trans->dirty_bgs_lock);
if (!list_empty(&cur_trans->dirty_bgs)) {
list_splice_init(&cur_trans->dirty_bgs, &dirty);
}
spin_unlock(&cur_trans->dirty_bgs_lock);

if (list_empty(&cur_trans->dirty_bgs))
again:
if (list_empty(&dirty)) {
btrfs_free_path(path);
return 0;
}

/*
* make sure all the block groups on our dirty list actually
* exist
*/
btrfs_create_pending_block_groups(trans, root);

if (!path) {
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
}

while (!list_empty(&dirty)) {
cache = list_first_entry(&dirty,
struct btrfs_block_group_cache,
dirty_list);

/*
* cache_write_mutex is here only to save us from balance
* deleting this block group while we are writing out the
* cache
*/
mutex_lock(&trans->transaction->cache_write_mutex);

/*
* this can happen if something re-dirties a block
* group that is already under IO. Just wait for it to
* finish and then do it all again
*/
if (!list_empty(&cache->io_list)) {
list_del_init(&cache->io_list);
btrfs_wait_cache_io(root, trans, cache,
&cache->io_ctl, path,
cache->key.objectid);
btrfs_put_block_group(cache);
}


/*
* btrfs_wait_cache_io uses the cache->dirty_list to decide
* if it should update the cache_state. Don't delete
* until after we wait.
*
* Since we're not running in the commit critical section
* we need the dirty_bgs_lock to protect from update_block_group
*/
spin_lock(&cur_trans->dirty_bgs_lock);
list_del_init(&cache->dirty_list);
spin_unlock(&cur_trans->dirty_bgs_lock);

should_put = 1;

cache_save_setup(cache, trans, path);

if (cache->disk_cache_state == BTRFS_DC_SETUP) {
cache->io_ctl.inode = NULL;
ret = btrfs_write_out_cache(root, trans, cache, path);
if (ret == 0 && cache->io_ctl.inode) {
num_started++;
should_put = 0;

/*
* the cache_write_mutex is protecting
* the io_list
*/
list_add_tail(&cache->io_list, io);
} else {
/*
* if we failed to write the cache, the
* generation will be bad and life goes on
*/
ret = 0;
}
}
if (!ret)
ret = write_one_cache_group(trans, root, path, cache);
mutex_unlock(&trans->transaction->cache_write_mutex);

/* if it's not on the io list, we need to put the block group */
if (should_put)
btrfs_put_block_group(cache);

if (ret)
break;
}

/*
* go through delayed refs for all the stuff we've just kicked off
* and then loop back (just once)
*/
ret = btrfs_run_delayed_refs(trans, root, 0);
if (!ret && loops == 0) {
loops++;
spin_lock(&cur_trans->dirty_bgs_lock);
list_splice_init(&cur_trans->dirty_bgs, &dirty);
spin_unlock(&cur_trans->dirty_bgs_lock);
goto again;
}

btrfs_free_path(path);
return ret;
}

int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_block_group_cache *cache;
struct btrfs_transaction *cur_trans = trans->transaction;
int ret = 0;
int should_put;
struct btrfs_path *path;
struct list_head *io = &cur_trans->io_bgs;
int num_started = 0;

path = btrfs_alloc_path();
if (!path)
Expand Down Expand Up @@ -3423,14 +3559,16 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
&cache->io_ctl, path,
cache->key.objectid);
btrfs_put_block_group(cache);
num_waited++;
}

/*
* don't remove from the dirty list until after we've waited
* on any pending IO
*/
list_del_init(&cache->dirty_list);
should_put = 1;

if (cache->disk_cache_state == BTRFS_DC_CLEAR)
cache_save_setup(cache, trans, path);
cache_save_setup(cache, trans, path);

if (!ret)
ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);
Expand All @@ -3441,7 +3579,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
if (ret == 0 && cache->io_ctl.inode) {
num_started++;
should_put = 0;
list_add_tail(&cache->io_list, &io);
list_add_tail(&cache->io_list, io);
} else {
/*
* if we failed to write the cache, the
Expand All @@ -3458,11 +3596,10 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
btrfs_put_block_group(cache);
}

while (!list_empty(&io)) {
cache = list_first_entry(&io, struct btrfs_block_group_cache,
while (!list_empty(io)) {
cache = list_first_entry(io, struct btrfs_block_group_cache,
io_list);
list_del_init(&cache->io_list);
num_waited++;
btrfs_wait_cache_io(root, trans, cache,
&cache->io_ctl, path, cache->key.objectid);
btrfs_put_block_group(cache);
Expand Down Expand Up @@ -5459,15 +5596,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
if (!alloc && cache->cached == BTRFS_CACHE_NO)
cache_block_group(cache, 1);

spin_lock(&trans->transaction->dirty_bgs_lock);
if (list_empty(&cache->dirty_list)) {
list_add_tail(&cache->dirty_list,
&trans->transaction->dirty_bgs);
trans->transaction->num_dirty_bgs++;
btrfs_get_block_group(cache);
}
spin_unlock(&trans->transaction->dirty_bgs_lock);

byte_in_group = bytenr - cache->key.objectid;
WARN_ON(byte_in_group > cache->key.offset);

Expand Down Expand Up @@ -5516,6 +5644,16 @@ static int update_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&info->unused_bgs_lock);
}
}

spin_lock(&trans->transaction->dirty_bgs_lock);
if (list_empty(&cache->dirty_list)) {
list_add_tail(&cache->dirty_list,
&trans->transaction->dirty_bgs);
trans->transaction->num_dirty_bgs++;
btrfs_get_block_group(cache);
}
spin_unlock(&trans->transaction->dirty_bgs_lock);

btrfs_put_block_group(cache);
total -= num_bytes;
bytenr += num_bytes;
Expand Down Expand Up @@ -8602,10 +8740,30 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,

BUG_ON(cache->ro);

again:
trans = btrfs_join_transaction(root);
if (IS_ERR(trans))
return PTR_ERR(trans);

/*
* we're not allowed to set block groups readonly after the dirty
* block groups cache has started writing. If it already started,
* back off and let this transaction commit
*/
mutex_lock(&root->fs_info->ro_block_group_mutex);
if (trans->transaction->dirty_bg_run) {
u64 transid = trans->transid;

mutex_unlock(&root->fs_info->ro_block_group_mutex);
btrfs_end_transaction(trans, root);

ret = btrfs_wait_for_commit(root, transid);
if (ret)
return ret;
goto again;
}


ret = set_block_group_ro(cache, 0);
if (!ret)
goto out;
Expand All @@ -8620,6 +8778,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
alloc_flags = update_block_group_flags(root, cache->flags);
check_system_chunk(trans, root, alloc_flags);
}
mutex_unlock(&root->fs_info->ro_block_group_mutex);

btrfs_end_transaction(trans, root);
return ret;
Expand Down Expand Up @@ -9425,7 +9584,38 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
goto out;
}

/*
* get the inode first so any iput calls done for the io_list
* aren't the final iput (no unlinks allowed now)
*/
inode = lookup_free_space_inode(tree_root, block_group, path);

mutex_lock(&trans->transaction->cache_write_mutex);
/*
* make sure our free space cache IO is done before removing the
* free space inode
*/
spin_lock(&trans->transaction->dirty_bgs_lock);
if (!list_empty(&block_group->io_list)) {
list_del_init(&block_group->io_list);

WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);

spin_unlock(&trans->transaction->dirty_bgs_lock);
btrfs_wait_cache_io(root, trans, block_group,
&block_group->io_ctl, path,
block_group->key.objectid);
btrfs_put_block_group(block_group);
spin_lock(&trans->transaction->dirty_bgs_lock);
}

if (!list_empty(&block_group->dirty_list)) {
list_del_init(&block_group->dirty_list);
btrfs_put_block_group(block_group);
}
spin_unlock(&trans->transaction->dirty_bgs_lock);
mutex_unlock(&trans->transaction->cache_write_mutex);

if (!IS_ERR(inode)) {
ret = btrfs_orphan_add(trans, inode);
if (ret) {
Expand Down Expand Up @@ -9518,11 +9708,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,

spin_lock(&trans->transaction->dirty_bgs_lock);
if (!list_empty(&block_group->dirty_list)) {
list_del_init(&block_group->dirty_list);
btrfs_put_block_group(block_group);
WARN_ON(1);
}
if (!list_empty(&block_group->io_list)) {
WARN_ON(1);
}
spin_unlock(&trans->transaction->dirty_bgs_lock);

btrfs_remove_free_space_cache(block_group);

spin_lock(&block_group->space_info->lock);
Expand Down
Loading

0 comments on commit 1bbc621

Please sign in to comment.