diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a7db3f6f1b7b0..33b0a1a8dad69 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -16,6 +16,7 @@ #include "volumes.h" #include "qgroup.h" #include "tree-mod-log.h" +#include "tree-log.h" static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); @@ -112,6 +113,30 @@ noinline void btrfs_release_path(struct btrfs_path *p) } } +static void delete_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *eb, + u64 parent, + bool is_last_ref) +{ + const u64 root_id = btrfs_root_id(root); + + btrfs_free_tree_block(trans, root_id, eb, parent, is_last_ref); + /* + * If we are deleting a block from a log tree, then delete its range from + * the io tree that tracks the blocks. This is only to ensure that if a + * transaction abort happens, we are able to do proper cleanup of space + * reservations, because we may not be able to iterate over the log tree + * in case we had a writeback failure for a log tree node - so we rely on + * the io tree to figure out the range of each log tree block. We ignore + * any error from clear_extent_bits() because it's not common and it's + * not critical either. + */ + if (root_id == BTRFS_TREE_LOG_OBJECTID) + clear_extent_bits(&root->dirty_log_pages, eb->start, + eb->start + eb->len - 1, BTRFS_LOG_PAGES_BITS); +} + /* * safely gets a reference on the root node of a tree. 
A lock * is not taken, so a concurrent writer may put a different node @@ -463,8 +488,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, BUG_ON(ret < 0); rcu_assign_pointer(root->node, cow); - btrfs_free_tree_block(trans, btrfs_root_id(root), buf, - parent_start, last_ref); + delete_tree_block(trans, root, buf, parent_start, last_ref); free_extent_buffer(buf); add_root_to_dirty_list(root); } else { @@ -485,8 +509,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, return ret; } } - btrfs_free_tree_block(trans, btrfs_root_id(root), buf, - parent_start, last_ref); + delete_tree_block(trans, root, buf, parent_start, last_ref); } if (unlock_orig) btrfs_tree_unlock(buf); @@ -930,7 +953,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, free_extent_buffer(mid); root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); + delete_tree_block(trans, root, mid, 0, true); /* once for the root ptr */ free_extent_buffer_stale(mid); return 0; @@ -989,8 +1012,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_unlock(right); del_ptr(root, path, level + 1, pslot + 1); root_sub_used(root, right->len); - btrfs_free_tree_block(trans, btrfs_root_id(root), right, - 0, 1); + delete_tree_block(trans, root, right, 0, true); free_extent_buffer_stale(right); right = NULL; } else { @@ -1035,7 +1057,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_unlock(mid); del_ptr(root, path, level + 1, pslot); root_sub_used(root, mid->len); - btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); + delete_tree_block(trans, root, mid, 0, true); free_extent_buffer_stale(mid); mid = NULL; } else { @@ -4158,7 +4180,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, root_sub_used(root, leaf->len); atomic_inc(&leaf->refs); - btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1); + 
delete_tree_block(trans, root, leaf, 0, true); free_extent_buffer_stale(leaf); } /* diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 03de89b45f279..75342004351a0 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1083,7 +1083,8 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, * on all the pages and clear them from the dirty pages state tree */ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, - struct extent_io_tree *dirty_pages) + struct extent_io_tree *dirty_pages, + bool is_log_tree) { int err = 0; int werr = 0; @@ -1101,8 +1102,22 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, * after committing the log because the tree can be accessed * concurrently - we do it only at transaction commit time when * it's safe to do it (through extent_io_tree_release()). + * + * For a log tree, we convert the range bit so that we know + * about the range of log tree extent buffers even after they + * were written, so that if a transaction abort happens we + * know about the logical bytenr of the extents and can free + * them up, releasing reserved space in their block groups and + * in the metadata space_info. Ignore any errors in this case, + * we have no way to handle them, if they happen they are + * harmless, they only result in some warnings during unmount. 
*/ - err = clear_extent_bit(dirty_pages, start, end, + if (is_log_tree) + convert_extent_bit(dirty_pages, start, end, + EXTENT_UPTODATE, EXTENT_NEED_WAIT, + &cached_state); + else + err = clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, 0, 0, &cached_state); if (err == -ENOMEM) err = 0; @@ -1126,7 +1141,7 @@ static int btrfs_wait_extents(struct btrfs_fs_info *fs_info, bool errors = false; int err; - err = __btrfs_wait_marked_extents(fs_info, dirty_pages); + err = __btrfs_wait_marked_extents(fs_info, dirty_pages, false); if (test_and_clear_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags)) errors = true; @@ -1144,7 +1159,7 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark) ASSERT(log_root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); - err = __btrfs_wait_marked_extents(fs_info, dirty_pages); + err = __btrfs_wait_marked_extents(fs_info, dirty_pages, true); if ((mark & EXTENT_DIRTY) && test_and_clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags)) errors = true; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c1ddbe8008975..33ca6e58d0136 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2760,6 +2760,28 @@ static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) btrfs_put_block_group(cache); } +static int release_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct extent_buffer *eb) +{ + btrfs_tree_lock(eb); + btrfs_clean_tree_block(eb); + btrfs_wait_tree_block_writeback(eb); + btrfs_tree_unlock(eb); + + /* + * We ignore errors here on purpose. They should be rare and if they + * happen then nothing really serious happens, we just trigger some + * warnings on unmount due to releasing reserved space twice for an + * extent buffer - plus only in the transaction abort case, so it's + * very rare for this to happen. 
+ */ + clear_extent_bits(&log->dirty_log_pages, eb->start, + eb->start + eb->len - 1, BTRFS_LOG_PAGES_BITS); + + return btrfs_pin_reserved_extent(trans, eb->start, eb->len); +} + static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int *level, @@ -2770,7 +2792,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, u64 ptr_gen; struct extent_buffer *next; struct extent_buffer *cur; - u32 blocksize; int ret = 0; while (*level > 0) { @@ -2787,7 +2808,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(cur, path->slots[*level]); ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); - blocksize = fs_info->nodesize; next = btrfs_find_create_tree_block(fs_info, bytenr, btrfs_header_owner(cur), @@ -2812,24 +2832,12 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, return ret; } - if (trans) { - btrfs_tree_lock(next); - btrfs_clean_tree_block(next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); - ret = btrfs_pin_reserved_extent(trans, - bytenr, blocksize); - if (ret) { - free_extent_buffer(next); - return ret; - } - btrfs_redirty_list_add( - trans->transaction, next); - } else { - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) - clear_extent_buffer_dirty(next); - unaccount_log_buffer(fs_info, bytenr); + ret = release_tree_block(trans, root, next); + if (ret) { + free_extent_buffer(next); + return ret; } + btrfs_redirty_list_add(trans->transaction, next); } free_extent_buffer(next); continue; @@ -2858,7 +2866,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, struct btrfs_path *path, int *level, struct walk_control *wc) { - struct btrfs_fs_info *fs_info = root->fs_info; int i; int slot; int ret; @@ -2881,26 +2888,10 @@ static noinline int walk_up_log_tree(struct 
btrfs_trans_handle *trans, struct extent_buffer *next; next = path->nodes[*level]; - - if (trans) { - btrfs_tree_lock(next); - btrfs_clean_tree_block(next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); - ret = btrfs_pin_reserved_extent(trans, - path->nodes[*level]->start, - path->nodes[*level]->len); - if (ret) - return ret; - btrfs_redirty_list_add(trans->transaction, - next); - } else { - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) - clear_extent_buffer_dirty(next); - - unaccount_log_buffer(fs_info, - path->nodes[*level]->start); - } + ret = release_tree_block(trans, root, next); + if (ret) + return ret; + btrfs_redirty_list_add(trans->transaction, next); } free_extent_buffer(path->nodes[*level]); path->nodes[*level] = NULL; @@ -2918,13 +2909,14 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, static int walk_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct walk_control *wc) { - struct btrfs_fs_info *fs_info = log->fs_info; int ret = 0; int wret; int level; struct btrfs_path *path; int orig_level; + ASSERT(trans != NULL); + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2964,22 +2956,10 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, struct extent_buffer *next; next = path->nodes[orig_level]; - - if (trans) { - btrfs_tree_lock(next); - btrfs_clean_tree_block(next); - btrfs_wait_tree_block_writeback(next); - btrfs_tree_unlock(next); - ret = btrfs_pin_reserved_extent(trans, - next->start, next->len); - if (ret) - goto out; - btrfs_redirty_list_add(trans->transaction, next); - } else { - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) - clear_extent_buffer_dirty(next); - unaccount_log_buffer(fs_info, next->start); - } + ret = release_tree_block(trans, log, next); + if (ret) + goto out; + btrfs_redirty_list_add(trans->transaction, next); } } @@ -3402,35 +3382,93 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, return ret; } +/* + * When 
cleaning up an aborted transaction we can't iterate the log tree because + * in case writeback failed for one of its extent buffers, we won't be able to + * read it, we'll get -EIO from btrfs_read_buffer(). So we instead iterate over + * the log tree's ->dirty_log_pages io tree. + */ +static void unaccount_all_log_buffers(struct btrfs_root *log) +{ + u64 start = 0; + u64 end; + + while (!find_first_extent_bit(&log->dirty_log_pages, start, &start, &end, + BTRFS_LOG_PAGES_BITS, NULL)) { + struct btrfs_fs_info *fs_info = log->fs_info; + + for (; start < end; start += fs_info->nodesize) { + struct extent_buffer *eb; + + cond_resched(); + + unaccount_log_buffer(fs_info, start); + eb = find_extent_buffer(fs_info, start); + /* + * eb can be NULL in case it was already written and + * evicted from memory. + */ + if (!eb) + continue; + + wait_on_extent_buffer_writeback(eb); + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) + clear_extent_buffer_dirty(eb); + free_extent_buffer(eb); + } + + start = end + 1; + } +} + static void free_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *log) { - int ret; - struct walk_control wc = { - .free = 1, - .process_func = process_one_buffer - }; + if (trans && log->node) { + int ret; + struct walk_control wc = { + .free = 1, + .process_func = process_one_buffer + }; - if (log->node) { ret = walk_log_tree(trans, log, &wc); if (ret) { - if (trans) - btrfs_abort_transaction(trans, ret); - else - btrfs_handle_fs_error(log->fs_info, ret, NULL); + btrfs_abort_transaction(trans, ret); + /* + * We may have failed iterating the whole tree, so we + * fall back to the transaction abort cleanup path. + */ + unaccount_all_log_buffers(log); } + } else if (!trans) { + unaccount_all_log_buffers(log); } - clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, - EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); + /* + * Clean up every state record. 
When walking the log tree we clean the + * range for each extent buffer as we clean it up, but that might fail + * with -ENOMEM due to extent state record splits and we ignore such + * errors during the walk - it's rare but it could happen. By clearing + * the whole range here we don't need to have such state splits and + * allocations, so this way it's guaranteed to always succeed. So we + * almost always end up here with an empty io tree, except for the + * transaction abort case (when @trans is NULL). + */ + clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, BTRFS_LOG_PAGES_BITS); extent_io_tree_release(&log->log_csum_range); btrfs_put_root(log); } /* - * free all the extents used by the tree log. This should be called - * at commit time of the full transaction + * Free all the extents used by a log tree. + * + * @trans: A transaction handle or NULL. + * @root: The parent root of a log tree. + * + * This should be called either at commit time of the full transaction, or when + * cleaning up a transaction that was aborted. In the former case @trans is not + * NULL, while in the latter case @trans must be NULL. */ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) { diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index f6811c3df38a6..d11a16a97e32a 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -12,6 +12,13 @@ /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 +/* + * All the bits we can set for an extent buffer's range in a log root's + * ->dirty_log_pages io tree. + */ +#define BTRFS_LOG_PAGES_BITS (EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT | \ + EXTENT_UPTODATE) + struct btrfs_log_ctx { int log_ret; int log_transid;