Btrfs: two stage dirty block group writeout
Block group cache writeout is currently waiting on the pages for each
block group cache before moving on to writing the next one.  This commit
switches things around to send down all the caches and then wait on them
in batches.

The end result is much faster, since we're keeping the disk pipeline
full.
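
In outline, the loop stops interleaving one write with one wait and instead
submits every cache before waiting on any of them. A minimal sketch of the
pattern, illustrative only: struct cache, dirty_bgs, submit_cache_write() and
wait_cache_write() are hypothetical stand-ins for
struct btrfs_block_group_cache, the transaction's dirty_bgs list,
btrfs_write_out_cache() and btrfs_wait_cache_io(); the real code is in the
diff below.

#include <linux/list.h>

struct cache {
	struct list_head dirty_list;
	struct list_head io_list;
};

/* before: write one cache and stall until its pages reach disk */
while (!list_empty(&dirty_bgs)) {
	cache = list_first_entry(&dirty_bgs, struct cache, dirty_list);
	list_del_init(&cache->dirty_list);
	submit_cache_write(cache);
	wait_cache_write(cache);	/* disk drains between groups */
}

/* after: first pass submits every cache... */
LIST_HEAD(io);
while (!list_empty(&dirty_bgs)) {
	cache = list_first_entry(&dirty_bgs, struct cache, dirty_list);
	list_del_init(&cache->dirty_list);
	submit_cache_write(cache);
	list_add_tail(&cache->io_list, &io);
}
/* ...second pass waits with all of the IO already in flight */
while (!list_empty(&io)) {
	cache = list_first_entry(&io, struct cache, io_list);
	list_del_init(&cache->io_list);
	wait_cache_write(cache);
}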

Signed-off-by: Chris Mason <[email protected]>
masoncl committed Apr 10, 2015
1 parent 4c6d1d8 commit c9dc4c6
Showing 4 changed files with 170 additions and 32 deletions.
6 changes: 6 additions & 0 deletions fs/btrfs/ctree.h
@@ -1261,9 +1261,12 @@ struct btrfs_io_ctl {
struct page *page;
struct page **pages;
struct btrfs_root *root;
struct inode *inode;
unsigned long size;
int index;
int num_pages;
int entries;
int bitmaps;
unsigned check_crcs:1;
};

@@ -1332,6 +1335,9 @@ struct btrfs_block_group_cache {

/* For dirty block groups */
struct list_head dirty_list;
struct list_head io_list;

struct btrfs_io_ctl io_ctl;
};

/* delayed seq elem */
57 changes: 53 additions & 4 deletions fs/btrfs/extent-tree.c
@@ -3388,7 +3388,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *cache;
struct btrfs_transaction *cur_trans = trans->transaction;
int ret = 0;
int should_put;
struct btrfs_path *path;
LIST_HEAD(io);
int num_started = 0;
int num_waited = 0;

if (list_empty(&cur_trans->dirty_bgs))
return 0;
@@ -3407,16 +3411,60 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
cache = list_first_entry(&cur_trans->dirty_bgs,
struct btrfs_block_group_cache,
dirty_list);

/*
* this can happen if cache_save_setup re-dirties a block
* group that is already under IO. Just wait for it to
* finish and then do it all again
*/
if (!list_empty(&cache->io_list)) {
list_del_init(&cache->io_list);
btrfs_wait_cache_io(root, trans, cache,
&cache->io_ctl, path,
cache->key.objectid);
btrfs_put_block_group(cache);
num_waited++;
}

list_del_init(&cache->dirty_list);
should_put = 1;

if (cache->disk_cache_state == BTRFS_DC_CLEAR)
cache_save_setup(cache, trans, path);

if (!ret)
- ret = btrfs_run_delayed_refs(trans, root,
- (unsigned long) -1);
- if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP)
- btrfs_write_out_cache(root, trans, cache, path);
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);

if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
cache->io_ctl.inode = NULL;
ret = btrfs_write_out_cache(root, trans, cache, path);
if (ret == 0 && cache->io_ctl.inode) {
num_started++;
should_put = 0;
list_add_tail(&cache->io_list, &io);
} else {
/*
* if we failed to write the cache, the
* generation will be bad and life goes on
*/
ret = 0;
}
}
if (!ret)
ret = write_one_cache_group(trans, root, path, cache);

/* if it's not on the io list, we need to put the block group */
if (should_put)
btrfs_put_block_group(cache);
}

while (!list_empty(&io)) {
cache = list_first_entry(&io, struct btrfs_block_group_cache,
io_list);
list_del_init(&cache->io_list);
num_waited++;
btrfs_wait_cache_io(root, trans, cache,
&cache->io_ctl, path, cache->key.objectid);
btrfs_put_block_group(cache);
}

@@ -9013,6 +9061,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
INIT_LIST_HEAD(&cache->bg_list);
INIT_LIST_HEAD(&cache->ro_list);
INIT_LIST_HEAD(&cache->dirty_list);
INIT_LIST_HEAD(&cache->io_list);
btrfs_init_free_space_ctl(cache);
atomic_set(&cache->trimming, 0);

131 changes: 104 additions & 27 deletions fs/btrfs/free-space-cache.c
@@ -170,13 +170,13 @@ static int __create_free_space_inode(struct btrfs_root *root,
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
key.offset = offset;
key.type = 0;

ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(struct btrfs_free_space_header));
if (ret < 0) {
btrfs_release_path(path);
return ret;
}

leaf = path->nodes[0];
header = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_free_space_header);
@@ -296,13 +296,15 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode,
io_ctl->num_pages = num_pages;
io_ctl->root = root;
io_ctl->check_crcs = check_crcs;
io_ctl->inode = inode;

return 0;
}

static void io_ctl_free(struct btrfs_io_ctl *io_ctl)
{
kfree(io_ctl->pages);
io_ctl->pages = NULL;
}

static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl)
@@ -1092,6 +1094,61 @@ cleanup_write_cache_enospc(struct inode *inode,
GFP_NOFS);
}

int btrfs_wait_cache_io(struct btrfs_root *root,
struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *block_group,
struct btrfs_io_ctl *io_ctl,
struct btrfs_path *path, u64 offset)
{
int ret;
struct inode *inode = io_ctl->inode;

root = root->fs_info->tree_root;

/* Flush the dirty pages in the cache file. */
ret = flush_dirty_cache(inode);
if (ret)
goto out;

/* Update the cache item to tell everyone this cache file is valid. */
ret = update_cache_item(trans, root, inode, path, offset,
io_ctl->entries, io_ctl->bitmaps);
out:
io_ctl_free(io_ctl);
if (ret) {
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
if (block_group) {
#ifdef DEBUG
btrfs_err(root->fs_info,
"failed to write free space cache for block group %llu",
block_group->key.objectid);
#endif
}
}
btrfs_update_inode(trans, root, inode);

if (block_group) {
spin_lock(&block_group->lock);

/*
* only mark this as written if we didn't get put back on
* the dirty list while waiting for IO.
*/
if (!ret && list_empty(&block_group->dirty_list))
block_group->disk_cache_state = BTRFS_DC_WRITTEN;
else if (ret)
block_group->disk_cache_state = BTRFS_DC_ERROR;

spin_unlock(&block_group->lock);
io_ctl->inode = NULL;
iput(inode);
}

return ret;

}

/**
* __btrfs_write_out_cache - write out cached info to an inode
* @root - the root the inode belongs to
@@ -1108,20 +1165,22 @@ cleanup_write_cache_enospc(struct inode *inode,
static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
struct btrfs_free_space_ctl *ctl,
struct btrfs_block_group_cache *block_group,
struct btrfs_io_ctl *io_ctl,
struct btrfs_trans_handle *trans,
struct btrfs_path *path, u64 offset)
{
struct extent_state *cached_state = NULL;
- struct btrfs_io_ctl io_ctl;
LIST_HEAD(bitmap_list);
int entries = 0;
int bitmaps = 0;
int ret;
int must_iput = 0;

if (!i_size_read(inode))
return -1;

- ret = io_ctl_init(&io_ctl, inode, root, 1);
+ WARN_ON(io_ctl->pages);
+ ret = io_ctl_init(io_ctl, inode, root, 1);
if (ret)
return -1;

@@ -1134,22 +1193,23 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
up_write(&block_group->data_rwsem);
BTRFS_I(inode)->generation = 0;
ret = 0;
must_iput = 1;
goto out;
}
spin_unlock(&block_group->lock);
}

/* Lock all pages first so we can lock the extent safely. */
- io_ctl_prepare_pages(&io_ctl, inode, 0);
+ io_ctl_prepare_pages(io_ctl, inode, 0);

lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
0, &cached_state);

- io_ctl_set_generation(&io_ctl, trans->transid);
+ io_ctl_set_generation(io_ctl, trans->transid);

mutex_lock(&ctl->cache_writeout_mutex);
/* Write out the extent entries in the free space cache */
- ret = write_cache_extent_entries(&io_ctl, ctl,
+ ret = write_cache_extent_entries(io_ctl, ctl,
block_group, &entries, &bitmaps,
&bitmap_list);
if (ret) {
@@ -1162,7 +1222,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
* they will be added into free space cache after the transaction is
* committed, we shouldn't lose them.
*/
- ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries);
+ ret = write_pinned_extent_entries(root, block_group, io_ctl, &entries);
if (ret) {
mutex_unlock(&ctl->cache_writeout_mutex);
goto out_nospc;
@@ -1173,16 +1233,16 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
* locked while doing it because a concurrent trim can be manipulating
* or freeing the bitmap.
*/
- ret = write_bitmap_entries(&io_ctl, &bitmap_list);
+ ret = write_bitmap_entries(io_ctl, &bitmap_list);
mutex_unlock(&ctl->cache_writeout_mutex);
if (ret)
goto out_nospc;

/* Zero out the rest of the pages just to make sure */
- io_ctl_zero_remaining_pages(&io_ctl);
+ io_ctl_zero_remaining_pages(io_ctl);

/* Everything is written out, now we dirty the pages in the file. */
- ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
+ ret = btrfs_dirty_pages(root, inode, io_ctl->pages, io_ctl->num_pages,
0, i_size_read(inode), &cached_state);
if (ret)
goto out_nospc;
@@ -1193,30 +1253,39 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
* Release the pages and unlock the extent, we will flush
* them out later
*/
- io_ctl_drop_pages(&io_ctl);
+ io_ctl_drop_pages(io_ctl);

unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
i_size_read(inode) - 1, &cached_state, GFP_NOFS);

- /* Flush the dirty pages in the cache file. */
- ret = flush_dirty_cache(inode);
+ /*
+ at this point the pages are under IO and we're happy. The caller
+ is responsible for waiting on them and updating the cache and
+ the inode
+ */
+ io_ctl->entries = entries;
+ io_ctl->bitmaps = bitmaps;
+
+ ret = btrfs_fdatawrite_range(inode, 0, (u64)-1);
if (ret)
goto out;

- /* Update the cache item to tell everyone this cache file is valid. */
- ret = update_cache_item(trans, root, inode, path, offset,
- entries, bitmaps);
+ return 0;

out:
- io_ctl_free(&io_ctl);
+ io_ctl->inode = NULL;
+ io_ctl_free(io_ctl);
if (ret) {
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
}
btrfs_update_inode(trans, root, inode);
if (must_iput)
iput(inode);
return ret;

out_nospc:
- cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list);
+ cleanup_write_cache_enospc(inode, io_ctl, &cached_state, &bitmap_list);

if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
up_write(&block_group->data_rwsem);
@@ -1232,7 +1301,6 @@ int btrfs_write_out_cache(struct btrfs_root *root,
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct inode *inode;
int ret = 0;
- enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN;

root = root->fs_info->tree_root;

@@ -1253,22 +1321,28 @@ int btrfs_write_out_cache(struct btrfs_root *root,
if (IS_ERR(inode))
return 0;

- ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
+ ret = __btrfs_write_out_cache(root, inode, ctl, block_group,
+ &block_group->io_ctl, trans,
path, block_group->key.objectid);
if (ret) {
- dcs = BTRFS_DC_ERROR;
+ ret = 0;
#ifdef DEBUG
btrfs_err(root->fs_info,
"failed to write free space cache for block group %llu",
block_group->key.objectid);
#endif
spin_lock(&block_group->lock);
block_group->disk_cache_state = BTRFS_DC_ERROR;
spin_unlock(&block_group->lock);

block_group->io_ctl.inode = NULL;
iput(inode);
}

- spin_lock(&block_group->lock);
- block_group->disk_cache_state = dcs;
- spin_unlock(&block_group->lock);
- iput(inode);
/*
* if ret == 0 the caller is expected to call btrfs_wait_cache_io
* to wait for IO and put the inode
*/

return ret;
}

@@ -3331,11 +3405,14 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
{
struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
int ret;
struct btrfs_io_ctl io_ctl;

if (!btrfs_test_opt(root, INODE_MAP_CACHE))
return 0;

- ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
+ ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl,
+ trans, path, 0) ||
+ btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0);
if (ret) {
btrfs_delalloc_release_metadata(inode, inode->i_size);
#ifdef DEBUG