diff options
author | Filipe Manana <fdmanana@suse.com> | 2016-05-12 14:53:36 +0200 |
---|---|---|
committer | Filipe Manana <fdmanana@suse.com> | 2016-05-13 02:59:36 +0200 |
commit | 5f9a8a51d8b95505d8de8b7191ae2ed8c504d4af (patch) | |
tree | d97a7f5d321694e09c3046e9027c23a02d6a5878 /fs/btrfs/inode.c | |
parent | Btrfs: fix race between block group relocation and nocow writes (diff) | |
download | linux-5f9a8a51d8b95505d8de8b7191ae2ed8c504d4af.tar.xz linux-5f9a8a51d8b95505d8de8b7191ae2ed8c504d4af.zip |
Btrfs: add semaphore to synchronize direct IO writes with fsync
Due to the optimization of lockless direct IO writes (the inode's i_mutex
is not held) introduced in commit 38851cc19adb ("Btrfs: implement unlocked
dio write"), we started having races between such writes with concurrent
fsync operations that use the fast fsync path. These races were addressed
in the patches titled "Btrfs: fix race between fsync and lockless direct
IO writes" and "Btrfs: fix race between fsync and direct IO writes for
prealloc extents". The races happened because the direct IO path, like
every other write path, does create extent maps followed by the
corresponding ordered extents while the fast fsync path collected first
ordered extents and then it collected extent maps. This made it possible
to log file extent items (based on the collected extent maps) without
waiting for the corresponding ordered extents to complete (get their IO
done). The two fixes mentioned before added a solution that consists of
making the direct IO path create first the ordered extents and then the
extent maps, while the fsync path attempts to collect any new ordered
extents once it collects the extent maps. This was simple and did not
require adding any synchonization primitive to any data structure (struct
btrfs_inode for example) but it makes things more fragile for future
development endeavours and adds an exceptional approach compared to the
other write paths.
This change adds a read-write semaphore to the btrfs inode structure and
makes the direct IO path create the extent maps and the ordered extents
while holding read access on that semaphore, while the fast fsync path
collects extent maps and ordered extents while holding write access on
that semaphore. The logic for direct IO write path is encapsulated in a
new helper function that is used both for cow and nocow direct IO writes.
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Josef Bacik <jbacik@fb.com>
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r-- | fs/btrfs/inode.c | 134 |
1 files changed, 53 insertions, 81 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ee9be4199e7c..c1ee4ade2d87 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7145,6 +7145,43 @@ out: return em; } +static struct extent_map *btrfs_create_dio_extent(struct inode *inode, + const u64 start, + const u64 len, + const u64 orig_start, + const u64 block_start, + const u64 block_len, + const u64 orig_block_len, + const u64 ram_bytes, + const int type) +{ + struct extent_map *em = NULL; + int ret; + + down_read(&BTRFS_I(inode)->dio_sem); + if (type != BTRFS_ORDERED_NOCOW) { + em = create_pinned_em(inode, start, len, orig_start, + block_start, block_len, orig_block_len, + ram_bytes, type); + if (IS_ERR(em)) + goto out; + } + ret = btrfs_add_ordered_extent_dio(inode, start, block_start, + len, block_len, type); + if (ret) { + if (em) { + free_extent_map(em); + btrfs_drop_extent_cache(inode, start, + start + len - 1, 0); + } + em = ERR_PTR(ret); + } + out: + up_read(&BTRFS_I(inode)->dio_sem); + + return em; +} + static struct extent_map *btrfs_new_extent_direct(struct inode *inode, u64 start, u64 len) { @@ -7160,43 +7197,13 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, if (ret) return ERR_PTR(ret); - /* - * Create the ordered extent before the extent map. This is to avoid - * races with the fast fsync path that would lead to it logging file - * extent items that point to disk extents that were not yet written to. - * The fast fsync path collects ordered extents into a local list and - * then collects all the new extent maps, so we must create the ordered - * extent first and make sure the fast fsync path collects any new - * ordered extents after collecting new extent maps as well. - * The fsync path simply can not rely on inode_dio_wait() because it - * causes deadlock with AIO. - */ - ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, - ins.offset, ins.offset, 0); - if (ret) { - btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); - return ERR_PTR(ret); - } - + em = btrfs_create_dio_extent(inode, start, ins.offset, start, + ins.objectid, ins.offset, ins.offset, + ins.offset, 0); btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); - - em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, - ins.offset, ins.offset, ins.offset, 0); - if (IS_ERR(em)) { - struct btrfs_ordered_extent *oe; - + if (IS_ERR(em)) btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); - oe = btrfs_lookup_ordered_extent(inode, start); - ASSERT(oe); - if (WARN_ON(!oe)) - return em; - set_bit(BTRFS_ORDERED_IOERR, &oe->flags); - set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags); - btrfs_remove_ordered_extent(inode, oe); - /* Once for our lookup and once for the ordered extents tree. */ - btrfs_put_ordered_extent(oe); - btrfs_put_ordered_extent(oe); - } + return em; } @@ -7670,57 +7677,21 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (can_nocow_extent(inode, start, &len, &orig_start, &orig_block_len, &ram_bytes) == 1 && btrfs_inc_nocow_writers(root->fs_info, block_start)) { + struct extent_map *em2; - /* - * Create the ordered extent before the extent map. This - * is to avoid races with the fast fsync path because it - * collects ordered extents into a local list and then - * collects all the new extent maps, so we must create - * the ordered extent first and make sure the fast fsync - * path collects any new ordered extents after - * collecting new extent maps as well. The fsync path - * simply can not rely on inode_dio_wait() because it - * causes deadlock with AIO. - */ - ret = btrfs_add_ordered_extent_dio(inode, start, - block_start, len, len, type); + em2 = btrfs_create_dio_extent(inode, start, len, + orig_start, block_start, + len, orig_block_len, + ram_bytes, type); btrfs_dec_nocow_writers(root->fs_info, block_start); - if (ret) { - free_extent_map(em); - goto unlock_err; - } - if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); - em = create_pinned_em(inode, start, len, - orig_start, - block_start, len, - orig_block_len, - ram_bytes, type); - if (IS_ERR(em)) { - struct btrfs_ordered_extent *oe; - - ret = PTR_ERR(em); - oe = btrfs_lookup_ordered_extent(inode, - start); - ASSERT(oe); - if (WARN_ON(!oe)) - goto unlock_err; - set_bit(BTRFS_ORDERED_IOERR, - &oe->flags); - set_bit(BTRFS_ORDERED_IO_DONE, - &oe->flags); - btrfs_remove_ordered_extent(inode, oe); - /* - * Once for our lookup and once for the - * ordered extents tree. - */ - btrfs_put_ordered_extent(oe); - btrfs_put_ordered_extent(oe); - goto unlock_err; - } + em = em2; + } + if (em2 && IS_ERR(em2)) { + ret = PTR_ERR(em2); + goto unlock_err; } - goto unlock; } } @@ -9281,6 +9252,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->delayed_iput); RB_CLEAR_NODE(&ei->rb_node); + init_rwsem(&ei->dio_sem); return inode; } |