From 1c6fdbd8f2465ddfb73a01ec620cbf3d14044e1a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 16 Mar 2017 22:18:50 -0800 Subject: bcachefs: Initial commit Initially forked from drivers/md/bcache, bcachefs is a new copy-on-write filesystem with every feature you could possibly want. Website: https://bcachefs.org Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.h | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) create mode 100644 fs/bcachefs/fs.h (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h new file mode 100644 index 000000000000..e8dd566285fc --- /dev/null +++ b/fs/bcachefs/fs.h @@ -0,0 +1,99 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_H +#define _BCACHEFS_FS_H + +#include "opts.h" +#include "str_hash.h" +#include "quota_types.h" + +#include +#include + +/* + * Two-state lock - can be taken for add or block - both states are shared, + * like read side of rwsem, but conflict with other state: + */ +struct pagecache_lock { + atomic_long_t v; + wait_queue_head_t wait; +}; + +static inline void pagecache_lock_init(struct pagecache_lock *lock) +{ + atomic_long_set(&lock->v, 0); + init_waitqueue_head(&lock->wait); +} + +void bch2_pagecache_add_put(struct pagecache_lock *); +void bch2_pagecache_add_get(struct pagecache_lock *); +void bch2_pagecache_block_put(struct pagecache_lock *); +void bch2_pagecache_block_get(struct pagecache_lock *); + +struct bch_inode_info { + struct inode v; + + struct mutex ei_update_lock; + u64 ei_journal_seq; + u64 ei_quota_reserved; + unsigned long ei_last_dirtied; + struct pagecache_lock ei_pagecache_lock; + + struct mutex ei_quota_lock; + struct bch_qid ei_qid; + + struct bch_hash_info ei_str_hash; + + /* copy of inode in btree: */ + struct bch_inode_unpacked ei_inode; +}; + +#define to_bch_ei(_inode) \ + container_of_or_null(_inode, struct bch_inode_info, v) + +static inline struct bch_inode_info *file_bch_inode(struct file 
*file) +{ + return to_bch_ei(file_inode(file)); +} + +static inline u8 mode_to_type(umode_t mode) +{ + return (mode >> 12) & 15; +} + +static inline unsigned nlink_bias(umode_t mode) +{ + return S_ISDIR(mode) ? 2 : 1; +} + +struct bch_inode_unpacked; + +#ifndef NO_BCACHEFS_FS + +/* returns 0 if we want to do the update, or error is passed up */ +typedef int (*inode_set_fn)(struct bch_inode_info *, + struct bch_inode_unpacked *, void *); + +void bch2_inode_update_after_write(struct bch_fs *, + struct bch_inode_info *, + struct bch_inode_unpacked *, + unsigned); +int __must_check bch2_write_inode_trans(struct btree_trans *, + struct bch_inode_info *, + struct bch_inode_unpacked *, + inode_set_fn, void *); +int __must_check __bch2_write_inode(struct bch_fs *, struct bch_inode_info *, + inode_set_fn, void *, unsigned); +int __must_check bch2_write_inode(struct bch_fs *, + struct bch_inode_info *); + +void bch2_vfs_exit(void); +int bch2_vfs_init(void); + +#else + +static inline void bch2_vfs_exit(void) {} +static inline int bch2_vfs_init(void) { return 0; } + +#endif /* NO_BCACHEFS_FS */ + +#endif /* _BCACHEFS_FS_H */ -- cgit v1.2.3 From 2ea9004864b918be34e742e38fb08d868600d020 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 17 Jul 2018 14:12:42 -0400 Subject: bcachefs: Fix mtime/ctime updates Also make inode flags consistent with how the rest of the inode is updated Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 3 +- fs/bcachefs/fs-io.c | 45 ++++++++++++++++-------- fs/bcachefs/fs-ioctl.c | 92 +++++--------------------------------------------- fs/bcachefs/fs-ioctl.h | 73 ++++++++++++++++++++++++++++++++++++++- fs/bcachefs/fs.c | 32 +++++++----------- fs/bcachefs/fs.h | 7 ++-- fs/bcachefs/xattr.c | 2 +- 7 files changed, 130 insertions(+), 124 deletions(-) (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index eaf5c8e138fb..7ee2022d9501 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -286,10 +286,9 @@ static 
int inode_update_for_set_acl_fn(struct bch_inode_info *inode, void *p) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct timespec64 now = current_time(&inode->v); umode_t mode = (unsigned long) p; - bi->bi_ctime = timespec_to_bch2_time(c, now); + bi->bi_ctime = bch2_current_time(c); bi->bi_mode = mode; return 0; } diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 29d289b0dfa5..33c379ecf5a1 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -177,23 +177,40 @@ static int bch2_quota_reservation_add(struct bch_fs *c, /* i_size updates: */ +struct inode_new_size { + loff_t new_size; + u64 now; + unsigned fields; +}; + static int inode_set_size(struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { - loff_t *new_i_size = p; + struct inode_new_size *s = p; - lockdep_assert_held(&inode->ei_update_lock); + bi->bi_size = s->new_size; + if (s->fields & ATTR_ATIME) + bi->bi_atime = s->now; + if (s->fields & ATTR_MTIME) + bi->bi_mtime = s->now; + if (s->fields & ATTR_CTIME) + bi->bi_ctime = s->now; - bi->bi_size = *new_i_size; return 0; } static int __must_check bch2_write_inode_size(struct bch_fs *c, struct bch_inode_info *inode, - loff_t new_size) + loff_t new_size, unsigned fields) { - return __bch2_write_inode(c, inode, inode_set_size, &new_size, 0); + struct inode_new_size s = { + .new_size = new_size, + .now = bch2_current_time(c), + .fields = fields, + }; + + return bch2_write_inode(c, inode, inode_set_size, &s, fields); } static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, @@ -241,6 +258,7 @@ static int i_sectors_dirty_finish_fn(struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { + struct bch_fs *c = inode->v.i_sb->s_fs_info; struct i_sectors_hook *h = p; if (h->new_i_size != U64_MAX && @@ -249,6 +267,7 @@ static int i_sectors_dirty_finish_fn(struct bch_inode_info *inode, bi->bi_size = h->new_i_size; bi->bi_sectors += h->sectors; bi->bi_flags &= ~h->flags; + bi->bi_mtime = 
bi->bi_ctime = bch2_current_time(c); return 0; } @@ -259,7 +278,7 @@ static int i_sectors_dirty_finish(struct bch_fs *c, struct i_sectors_hook *h) mutex_lock(&h->inode->ei_update_lock); i_sectors_acct(c, h->inode, &h->quota_res, h->sectors); - ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h, 0); + ret = bch2_write_inode(c, h->inode, i_sectors_dirty_finish_fn, h, 0); if (!ret && h->new_i_size != U64_MAX) i_size_write(&h->inode->v, h->new_i_size); @@ -289,7 +308,7 @@ static int i_sectors_dirty_start(struct bch_fs *c, struct i_sectors_hook *h) int ret; mutex_lock(&h->inode->ei_update_lock); - ret = __bch2_write_inode(c, h->inode, i_sectors_dirty_start_fn, h, 0); + ret = bch2_write_inode(c, h->inode, i_sectors_dirty_start_fn, h, 0); mutex_unlock(&h->inode->ei_update_lock); return ret; @@ -2223,9 +2242,8 @@ static int bch2_extend(struct bch_inode_info *inode, struct iattr *iattr) setattr_copy(NULL, &inode->v, iattr); mutex_lock(&inode->ei_update_lock); - inode_set_ctime_current(&inode->v); - inode->v.i_mtime = inode_get_ctime(&inode->v); - ret = bch2_write_inode_size(c, inode, inode->v.i_size); + ret = bch2_write_inode_size(c, inode, inode->v.i_size, + ATTR_MTIME|ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); return ret; @@ -2284,8 +2302,6 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) /* ATTR_MODE will never be set here, ns argument isn't needed: */ setattr_copy(NULL, &inode->v, iattr); - inode_set_ctime_current(&inode->v); - inode->v.i_mtime = inode_get_ctime(&inode->v); out: ret = i_sectors_dirty_finish(c, &i_sectors_hook) ?: ret; err_put_pagecache: @@ -2617,7 +2633,7 @@ btree_iter_err: i_size_write(&inode->v, end); mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode_size(c, inode, inode->v.i_size); + ret = bch2_write_inode_size(c, inode, inode->v.i_size, 0); mutex_unlock(&inode->ei_update_lock); } @@ -2633,7 +2649,8 @@ btree_iter_err: if (inode->ei_inode.bi_size != inode->v.i_size) { 
mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode_size(c, inode, inode->v.i_size); + ret = bch2_write_inode_size(c, inode, + inode->v.i_size, 0); mutex_unlock(&inode->ei_update_lock); } } diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 895ccc79e782..a89786f295cf 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -12,79 +12,6 @@ #define FS_IOC_GOINGDOWN _IOR('X', 125, __u32) -/* Inode flags: */ - -/* bcachefs inode flags -> vfs inode flags: */ -static const unsigned bch_flags_to_vfs[] = { - [__BCH_INODE_SYNC] = S_SYNC, - [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE, - [__BCH_INODE_APPEND] = S_APPEND, - [__BCH_INODE_NOATIME] = S_NOATIME, -}; - -/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ -static const unsigned bch_flags_to_uflags[] = { - [__BCH_INODE_SYNC] = FS_SYNC_FL, - [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL, - [__BCH_INODE_APPEND] = FS_APPEND_FL, - [__BCH_INODE_NODUMP] = FS_NODUMP_FL, - [__BCH_INODE_NOATIME] = FS_NOATIME_FL, -}; - -/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ -static const unsigned bch_flags_to_xflags[] = { - [__BCH_INODE_SYNC] = FS_XFLAG_SYNC, - [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE, - [__BCH_INODE_APPEND] = FS_XFLAG_APPEND, - [__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP, - [__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME, - //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT; -}; - -#define set_flags(_map, _in, _out) \ -do { \ - unsigned _i; \ - \ - for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ - if ((_in) & (1 << _i)) \ - (_out) |= _map[_i]; \ - else \ - (_out) &= ~_map[_i]; \ -} while (0) - -#define map_flags(_map, _in) \ -({ \ - unsigned _out = 0; \ - \ - set_flags(_map, _in, _out); \ - _out; \ -}) - -#define map_flags_rev(_map, _in) \ -({ \ - unsigned _i, _out = 0; \ - \ - for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ - if ((_in) & _map[_i]) { \ - (_out) |= 1 << _i; \ - (_in) &= ~_map[_i]; \ - } \ - (_out); \ -}) - -#define map_defined(_map) \ -({ \ - unsigned _in = ~0; \ - \ - map_flags_rev(_map, _in); 
\ -}) - -/* Set VFS inode flags from bcachefs inode: */ -void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) -{ - set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); -} - struct flags_set { unsigned mask; unsigned flags; @@ -96,6 +23,7 @@ static int bch2_inode_flags_set(struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { + struct bch_fs *c = inode->v.i_sb->s_fs_info; /* * We're relying on btree locking here for exclusion with other ioctl * calls - use the flags in the btree (@bi), not inode->i_flags: @@ -108,14 +36,15 @@ static int bch2_inode_flags_set(struct bch_inode_info *inode, !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; - if (!S_ISREG(inode->v.i_mode) && - !S_ISDIR(inode->v.i_mode) && + if (!S_ISREG(bi->bi_mode) && + !S_ISDIR(bi->bi_mode) && (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags) return -EINVAL; bi->bi_flags &= ~s->mask; bi->bi_flags |= newflags; - inode_set_ctime_current(&inode->v); + + bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v)); return 0; } @@ -153,10 +82,8 @@ static int bch2_ioc_setflags(struct bch_fs *c, } mutex_lock(&inode->ei_update_lock); - ret = __bch2_write_inode(c, inode, bch2_inode_flags_set, &s, 0); - - if (!ret) - bch2_inode_flags_to_vfs(inode); + ret = bch2_write_inode(c, inode, bch2_inode_flags_set, &s, + ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); setflags_out: @@ -242,9 +169,8 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, if (ret) goto err_unlock; - ret = __bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, 0); - if (!ret) - bch2_inode_flags_to_vfs(inode); + ret = bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, + ATTR_CTIME); err_unlock: mutex_unlock(&inode->ei_update_lock); err: diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h index 2d117ef80ab2..f201980ef2c3 100644 --- a/fs/bcachefs/fs-ioctl.h +++ b/fs/bcachefs/fs-ioctl.h @@ -2,7 +2,78 @@ #ifndef _BCACHEFS_FS_IOCTL_H #define _BCACHEFS_FS_IOCTL_H -void 
bch2_inode_flags_to_vfs(struct bch_inode_info *); +/* Inode flags: */ + +/* bcachefs inode flags -> vfs inode flags: */ +static const unsigned bch_flags_to_vfs[] = { + [__BCH_INODE_SYNC] = S_SYNC, + [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE, + [__BCH_INODE_APPEND] = S_APPEND, + [__BCH_INODE_NOATIME] = S_NOATIME, +}; + +/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ +static const unsigned bch_flags_to_uflags[] = { + [__BCH_INODE_SYNC] = FS_SYNC_FL, + [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL, + [__BCH_INODE_APPEND] = FS_APPEND_FL, + [__BCH_INODE_NODUMP] = FS_NODUMP_FL, + [__BCH_INODE_NOATIME] = FS_NOATIME_FL, +}; + +/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ +static const unsigned bch_flags_to_xflags[] = { + [__BCH_INODE_SYNC] = FS_XFLAG_SYNC, + [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE, + [__BCH_INODE_APPEND] = FS_XFLAG_APPEND, + [__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP, + [__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME, + //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT; +}; + +#define set_flags(_map, _in, _out) \ +do { \ + unsigned _i; \ + \ + for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ + if ((_in) & (1 << _i)) \ + (_out) |= _map[_i]; \ + else \ + (_out) &= ~_map[_i]; \ +} while (0) + +#define map_flags(_map, _in) \ +({ \ + unsigned _out = 0; \ + \ + set_flags(_map, _in, _out); \ + _out; \ +}) + +#define map_flags_rev(_map, _in) \ +({ \ + unsigned _i, _out = 0; \ + \ + for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ + if ((_in) & _map[_i]) { \ + (_out) |= 1 << _i; \ + (_in) &= ~_map[_i]; \ + } \ + (_out); \ +}) + +#define map_defined(_map) \ +({ \ + unsigned _in = ~0; \ + \ + map_flags_rev(_map, _in); \ +}) + +/* Set VFS inode flags from bcachefs inode: */ +static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) +{ + set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); +} long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long); long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long); diff --git a/fs/bcachefs/fs.c 
b/fs/bcachefs/fs.c index 53107d02cbb6..2e2a5acae0eb 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -147,6 +147,8 @@ void bch2_inode_update_after_write(struct bch_fs *c, inode->ei_inode = *bi; inode->ei_qid = bch_qid(bi); + + bch2_inode_flags_to_vfs(inode); } int __must_check bch2_write_inode_trans(struct btree_trans *trans, @@ -187,10 +189,10 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans, return 0; } -int __must_check __bch2_write_inode(struct bch_fs *c, - struct bch_inode_info *inode, - inode_set_fn set, - void *p, unsigned fields) +int __must_check bch2_write_inode(struct bch_fs *c, + struct bch_inode_info *inode, + inode_set_fn set, + void *p, unsigned fields) { struct btree_trans trans; struct bch_inode_unpacked inode_u; @@ -271,9 +273,8 @@ static int inode_update_for_create_fn(struct bch_inode_info *inode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_inode_unpacked *new_inode = p; - struct timespec64 now = current_time(&inode->v); - bi->bi_mtime = bi->bi_ctime = timespec_to_bch2_time(c, now); + bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); if (S_ISDIR(new_inode->bi_mode)) bi->bi_nlink++; @@ -469,9 +470,8 @@ static int inode_update_for_link_fn(struct bch_inode_info *inode, void *p) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct timespec64 now = current_time(&inode->v); - bi->bi_ctime = timespec_to_bch2_time(c, now); + bi->bi_ctime = bch2_current_time(c); if (bi->bi_flags & BCH_INODE_UNLINKED) bi->bi_flags &= ~BCH_INODE_UNLINKED; @@ -543,9 +543,8 @@ static int inode_update_dir_for_unlink_fn(struct bch_inode_info *inode, { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_inode_info *unlink_inode = p; - struct timespec64 now = current_time(&inode->v); - bi->bi_mtime = bi->bi_ctime = timespec_to_bch2_time(c, now); + bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); bi->bi_nlink -= S_ISDIR(unlink_inode->v.i_mode); @@ -557,9 +556,8 @@ static int inode_update_for_unlink_fn(struct bch_inode_info 
*inode, void *p) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct timespec64 now = current_time(&inode->v); - bi->bi_ctime = timespec_to_bch2_time(c, now); + bi->bi_ctime = bch2_current_time(c); if (bi->bi_nlink) bi->bi_nlink--; else @@ -740,8 +738,6 @@ static int bch2_rename2(struct mnt_idmap *idmap, { struct bch_fs *c = src_vdir->i_sb->s_fs_info; struct rename_info i = { - .now = timespec_to_bch2_time(c, - current_time(src_vdir)), .src_dir = to_bch_ei(src_vdir), .dst_dir = to_bch_ei(dst_vdir), .src_inode = to_bch_ei(src_dentry->d_inode), @@ -778,7 +774,7 @@ static int bch2_rename2(struct mnt_idmap *idmap, bch2_trans_init(&trans, c); retry: bch2_trans_begin(&trans); - i.now = timespec_to_bch2_time(c, current_time(src_vdir)), + i.now = bch2_current_time(c); ret = bch2_dirent_rename(&trans, i.src_dir, &src_dentry->d_name, @@ -1271,8 +1267,6 @@ static void bch2_vfs_inode_init(struct bch_fs *c, inode->ei_quota_reserved = 0; inode->ei_str_hash = bch2_hash_info_init(c, bi); - bch2_inode_flags_to_vfs(inode); - inode->v.i_mapping->a_ops = &bch_address_space_operations; switch (inode->v.i_mode & S_IFMT) { @@ -1346,8 +1340,8 @@ static int bch2_vfs_write_inode(struct inode *vinode, int ret; mutex_lock(&inode->ei_update_lock); - ret = __bch2_write_inode(c, inode, inode_update_times_fn, NULL, - ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); + ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, + ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); mutex_unlock(&inode->ei_update_lock); if (c->opts.journal_flush_disabled) diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index e8dd566285fc..4fdc11762cd7 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_FS_H #define _BCACHEFS_FS_H +#include "inode.h" #include "opts.h" #include "str_hash.h" #include "quota_types.h" @@ -81,10 +82,8 @@ int __must_check bch2_write_inode_trans(struct btree_trans *, struct bch_inode_info *, struct bch_inode_unpacked *, inode_set_fn, void *); -int __must_check 
__bch2_write_inode(struct bch_fs *, struct bch_inode_info *, - inode_set_fn, void *, unsigned); -int __must_check bch2_write_inode(struct bch_fs *, - struct bch_inode_info *); +int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, + inode_set_fn, void *, unsigned); void bch2_vfs_exit(void); int bch2_vfs_init(void); diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index f0440d12a031..cb84bdabb6ed 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -436,7 +436,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, } mutex_lock(&inode->ei_update_lock); - ret = __bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); + ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); mutex_unlock(&inode->ei_update_lock); if (value && -- cgit v1.2.3 From 0f5254aa98befa5187cc4d02584ab0f19d18ff68 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Dec 2018 05:43:00 -0500 Subject: bcachefs: bch2_fs_quota_transfer improve quota transfer locking & make ei_qid usage more consistent Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-ioctl.c | 15 +++-------- fs/bcachefs/fs.c | 67 ++++++++++++++++++++++++++++++++--------------- fs/bcachefs/fs.h | 6 +++++ fs/bcachefs/quota.c | 7 ++--- fs/bcachefs/quota.h | 11 +++----- fs/bcachefs/quota_types.h | 6 +++++ 6 files changed, 68 insertions(+), 44 deletions(-) (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index a89786f295cf..701882ce6024 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -108,21 +108,12 @@ static int bch2_set_projid(struct bch_fs *c, u32 projid) { struct bch_qid qid = inode->ei_qid; - int ret; - - if (projid == inode->ei_qid.q[QTYP_PRJ]) - return 0; qid.q[QTYP_PRJ] = projid; - return bch2_quota_transfer(c, 1 << QTYP_PRJ, qid, inode->ei_qid, - inode->v.i_blocks + - inode->ei_quota_reserved); - if (ret) - return ret; - - inode->ei_qid.q[QTYP_PRJ] = projid; - return 0; + return 
bch2_fs_quota_transfer(c, inode, qid, + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_PREALLOC); } static int fssetxattr_inode_update_fn(struct bch_inode_info *inode, diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 8f0b049aa1ec..d22b9e7e2082 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -170,7 +170,6 @@ void bch2_inode_update_after_write(struct bch_fs *c, inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime)); inode->ei_inode = *bi; - inode->ei_qid = bch_qid(bi); bch2_inode_flags_to_vfs(inode); } @@ -248,6 +247,41 @@ retry: return ret < 0 ? ret : 0; } +int bch2_fs_quota_transfer(struct bch_fs *c, + struct bch_inode_info *inode, + struct bch_qid new_qid, + unsigned qtypes, + enum quota_acct_mode mode) +{ + unsigned i; + int ret; + + qtypes &= enabled_qtypes(c); + + for (i = 0; i < QTYP_NR; i++) + if (new_qid.q[i] == inode->ei_qid.q[i]) + qtypes &= ~(1U << i); + + if (!qtypes) + return 0; + + mutex_lock(&inode->ei_quota_lock); + + ret = bch2_quota_transfer(c, qtypes, new_qid, + inode->ei_qid, + inode->v.i_blocks + + inode->ei_quota_reserved, + mode); + if (!ret) + for (i = 0; i < QTYP_NR; i++) + if (qtypes & (1 << i)) + inode->ei_qid.q[i] = new_qid.q[i]; + + mutex_unlock(&inode->ei_quota_lock); + + return ret; +} + static struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) { struct bch_inode_unpacked inode_u; @@ -913,37 +947,27 @@ static int bch2_setattr_nonsize(struct mnt_idmap *idmap, struct iattr *iattr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_qid qid = inode->ei_qid; + struct bch_qid qid; struct btree_trans trans; struct bch_inode_unpacked inode_u; struct posix_acl *acl = NULL; struct inode_write_setattr s = { iattr, idmap }; - unsigned qtypes = 0; int ret; mutex_lock(&inode->ei_update_lock); - if (c->opts.usrquota && - (iattr->ia_valid & ATTR_UID) && - !uid_eq(iattr->ia_uid, inode->v.i_uid)) { - qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), iattr->ia_uid), - qtypes |= 1 << QTYP_USR; - } + qid = inode->ei_qid; 
+ + if (iattr->ia_valid & ATTR_UID) + qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), iattr->ia_uid); - if (c->opts.grpquota && - (iattr->ia_valid & ATTR_GID) && - !gid_eq(iattr->ia_gid, inode->v.i_gid)) { + if (iattr->ia_valid & ATTR_GID) qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), iattr->ia_gid); - qtypes |= 1 << QTYP_GRP; - } - if (qtypes) { - ret = bch2_quota_transfer(c, qtypes, qid, inode->ei_qid, - inode->v.i_blocks + - inode->ei_quota_reserved); - if (ret) - goto err; - } + ret = bch2_fs_quota_transfer(c, inode, qid, ~0, + KEY_TYPE_QUOTA_PREALLOC); + if (ret) + goto err; bch2_trans_init(&trans, c); retry: @@ -1312,6 +1336,7 @@ static void bch2_vfs_inode_init(struct bch_fs *c, inode->ei_journal_seq = 0; inode->ei_quota_reserved = 0; inode->ei_str_hash = bch2_hash_info_init(c, bi); + inode->ei_qid = bch_qid(bi); inode->v.i_mapping->a_ops = &bch_address_space_operations; diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 4fdc11762cd7..fbb31976bc55 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -70,6 +70,12 @@ struct bch_inode_unpacked; #ifndef NO_BCACHEFS_FS +int bch2_fs_quota_transfer(struct bch_fs *, + struct bch_inode_info *, + struct bch_qid, + unsigned, + enum quota_acct_mode); + /* returns 0 if we want to do the update, or error is passed up */ typedef int (*inode_set_fn)(struct bch_inode_info *, struct bch_inode_unpacked *, void *); diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 7c38daac1cac..113a2ca88ffc 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -270,7 +270,8 @@ static void __bch2_quota_transfer(struct bch_memquota *src_q, int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, struct bch_qid dst, - struct bch_qid src, u64 space) + struct bch_qid src, u64 space, + enum quota_acct_mode mode) { struct bch_memquota_type *q; struct bch_memquota *src_q[3], *dst_q[3]; @@ -296,13 +297,13 @@ int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, 
Q_SPC, dst_q[i]->c[Q_SPC].v + space, - KEY_TYPE_QUOTA_PREALLOC); + mode); if (ret) goto err; ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO, dst_q[i]->c[Q_INO].v + 1, - KEY_TYPE_QUOTA_PREALLOC); + mode); if (ret) goto err; } diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h index 294a04db84bf..72b5ea0d77c5 100644 --- a/fs/bcachefs/quota.h +++ b/fs/bcachefs/quota.h @@ -15,12 +15,6 @@ void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); .val_to_text = bch2_quota_to_text, \ } -enum quota_acct_mode { - KEY_TYPE_QUOTA_PREALLOC, - KEY_TYPE_QUOTA_WARN, - KEY_TYPE_QUOTA_NOCHECK, -}; - static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) { return (struct bch_qid) { @@ -43,7 +37,7 @@ int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters, s64, enum quota_acct_mode); int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid, - struct bch_qid, u64); + struct bch_qid, u64, enum quota_acct_mode); void bch2_fs_quota_exit(struct bch_fs *); void bch2_fs_quota_init(struct bch_fs *); @@ -62,7 +56,8 @@ static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, struct bch_qid dst, - struct bch_qid src, u64 space) + struct bch_qid src, u64 space, + enum quota_acct_mode mode) { return 0; } diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h index 9eda6c363736..6a136083d389 100644 --- a/fs/bcachefs/quota_types.h +++ b/fs/bcachefs/quota_types.h @@ -8,6 +8,12 @@ struct bch_qid { u32 q[QTYP_NR]; }; +enum quota_acct_mode { + KEY_TYPE_QUOTA_PREALLOC, + KEY_TYPE_QUOTA_WARN, + KEY_TYPE_QUOTA_NOCHECK, +}; + struct memquota_counter { u64 v; u64 hardlimit; -- cgit v1.2.3 From 96012e143e699db1a7644e4c5903b63bdde33772 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Dec 2018 05:31:49 -0500 Subject: bcachefs: rename keeps inheritable inode opts consistent Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 84 
++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/fs.h | 25 +++++++++++++++++ 2 files changed, 109 insertions(+) (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index d22b9e7e2082..033582a87852 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -282,6 +282,32 @@ int bch2_fs_quota_transfer(struct bch_fs *c, return ret; } +int bch2_reinherit_attrs_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_inode_info *dir = p; + u64 src, dst; + unsigned id; + int ret = 1; + + for (id = 0; id < Inode_opt_nr; id++) { + if (bi->bi_fields_set & (1 << id)) + continue; + + src = bch2_inode_opt_get(&dir->ei_inode, id); + dst = bch2_inode_opt_get(bi, id); + + if (src == dst) + continue; + + bch2_inode_opt_set(bi, id, src); + ret = 0; + } + + return ret; +} + static struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) { struct bch_inode_unpacked inode_u; @@ -765,6 +791,7 @@ static int inode_update_for_rename_fn(struct bch_inode_info *inode, void *p) { struct rename_info *info = p; + int ret; if (inode == info->src_dir) { bi->bi_nlink -= S_ISDIR(info->src_inode->v.i_mode); @@ -779,6 +806,19 @@ static int inode_update_for_rename_fn(struct bch_inode_info *inode, S_ISDIR(info->dst_inode->v.i_mode); } + if (inode == info->src_inode) { + ret = bch2_reinherit_attrs_fn(inode, bi, info->dst_dir); + + BUG_ON(!ret && S_ISDIR(info->src_inode->v.i_mode)); + } + + if (inode == info->dst_inode && + info->mode == BCH_RENAME_EXCHANGE) { + ret = bch2_reinherit_attrs_fn(inode, bi, info->src_dir); + + BUG_ON(!ret && S_ISDIR(info->dst_inode->v.i_mode)); + } + if (inode == info->dst_inode && info->mode == BCH_RENAME_OVERWRITE) { BUG_ON(bi->bi_nlink && @@ -844,6 +884,39 @@ static int bch2_rename2(struct mnt_idmap *idmap, i.dst_inode); bch2_trans_init(&trans, c); + + if (S_ISDIR(i.src_inode->v.i_mode) && + inode_attrs_changing(i.dst_dir, i.src_inode)) { + ret = -EXDEV; + goto err; + } + + if (i.mode == 
BCH_RENAME_EXCHANGE && + S_ISDIR(i.dst_inode->v.i_mode) && + inode_attrs_changing(i.src_dir, i.dst_inode)) { + ret = -EXDEV; + goto err; + } + + if (inode_attr_changing(i.dst_dir, i.src_inode, Inode_opt_project)) { + ret = bch2_fs_quota_transfer(c, i.src_inode, + i.dst_dir->ei_qid, + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_PREALLOC); + if (ret) + goto err; + } + + if (i.mode == BCH_RENAME_EXCHANGE && + inode_attr_changing(i.src_dir, i.dst_inode, Inode_opt_project)) { + ret = bch2_fs_quota_transfer(c, i.dst_inode, + i.src_dir->ei_qid, + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_PREALLOC); + if (ret) + goto err; + } + retry: bch2_trans_begin(&trans); i.now = bch2_current_time(c); @@ -894,6 +967,17 @@ retry: ATTR_CTIME); err: bch2_trans_exit(&trans); + + bch2_fs_quota_transfer(c, i.src_inode, + bch_qid(&i.src_inode->ei_inode), + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_NOCHECK); + if (i.dst_inode) + bch2_fs_quota_transfer(c, i.dst_inode, + bch_qid(&i.dst_inode->ei_inode), + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_NOCHECK); + bch2_unlock_inodes(i.src_dir, i.dst_dir, i.src_inode, diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index fbb31976bc55..18e41609c89d 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -66,6 +66,27 @@ static inline unsigned nlink_bias(umode_t mode) return S_ISDIR(mode) ? 
2 : 1; } +static inline bool inode_attr_changing(struct bch_inode_info *dir, + struct bch_inode_info *inode, + enum inode_opt_id id) +{ + return !(inode->ei_inode.bi_fields_set & (1 << id)) && + bch2_inode_opt_get(&dir->ei_inode, id) != + bch2_inode_opt_get(&inode->ei_inode, id); +} + +static inline bool inode_attrs_changing(struct bch_inode_info *dir, + struct bch_inode_info *inode) +{ + unsigned id; + + for (id = 0; id < Inode_opt_nr; id++) + if (inode_attr_changing(dir, inode, id)) + return true; + + return false; +} + struct bch_inode_unpacked; #ifndef NO_BCACHEFS_FS @@ -91,6 +112,10 @@ int __must_check bch2_write_inode_trans(struct btree_trans *, int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, inode_set_fn, void *, unsigned); +int bch2_reinherit_attrs_fn(struct bch_inode_info *, + struct bch_inode_unpacked *, + void *); + void bch2_vfs_exit(void); int bch2_vfs_init(void); -- cgit v1.2.3 From 8095708fce72a911e20799078639e95c1a008176 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Dec 2018 06:11:14 -0500 Subject: bcachefs: bch2_ioc_reinherit_attrs() Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_ioctl.h | 2 ++ fs/bcachefs/fs-ioctl.c | 77 +++++++++++++++++++++++++++++++++++++++++++- fs/bcachefs/fs.c | 31 ++---------------- fs/bcachefs/fs.h | 26 +++++++++++++++ fs/bcachefs/inode.c | 3 +- 5 files changed, 109 insertions(+), 30 deletions(-) (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index c65104ed454a..2dca4bb0362b 100644 --- a/fs/bcachefs/bcachefs_ioctl.h +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -307,4 +307,6 @@ struct bch_ioctl_disk_resize { __u64 nbuckets; }; +#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 14, const char __user *) + #endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index d6563370bec4..92939befe507 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -3,6 +3,7 @@ #include "bcachefs.h" 
#include "chardev.h" +#include "dirent.h" #include "fs.h" #include "fs-ioctl.h" #include "quota.h" @@ -177,6 +178,75 @@ err: return ret; } +static int bch2_ioc_reinherit_attrs(struct bch_fs *c, + struct file *file, + struct bch_inode_info *src, + const char __user *name) +{ + struct bch_inode_info *dst; + struct inode *vinode = NULL; + char *kname = NULL; + struct qstr qstr; + int ret = 0; + u64 inum; + + kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); + if (!kname) + return -ENOMEM; + + ret = strncpy_from_user(kname, name, BCH_NAME_MAX); + if (unlikely(ret < 0)) + goto err1; + + qstr.hash_len = ret; + qstr.name = kname; + + ret = -ENOENT; + inum = bch2_dirent_lookup(c, src->v.i_ino, + &src->ei_str_hash, + &qstr); + if (!inum) + goto err1; + + vinode = bch2_vfs_inode_get(c, inum); + ret = PTR_ERR_OR_ZERO(vinode); + if (ret) + goto err1; + + dst = to_bch_ei(vinode); + + ret = mnt_want_write_file(file); + if (ret) + goto err2; + + bch2_lock_inodes(src, dst); + + if (inode_attr_changing(src, dst, Inode_opt_project)) { + ret = bch2_fs_quota_transfer(c, dst, + src->ei_qid, + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_PREALLOC); + if (ret) + goto err3; + } + + ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0); +err3: + bch2_unlock_inodes(src, dst); + + /* return true if we did work */ + if (ret >= 0) + ret = !ret; + + mnt_drop_write_file(file); +err2: + iput(vinode); +err1: + kfree(kname); + + return ret; +} + long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) { struct bch_inode_info *inode = file_bch_inode(file); @@ -193,7 +263,12 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) case FS_IOC_FSGETXATTR: return bch2_ioc_fsgetxattr(inode, (void __user *) arg); case FS_IOC_FSSETXATTR: - return bch2_ioc_fssetxattr(c, file, inode, (void __user *) arg); + return bch2_ioc_fssetxattr(c, file, inode, + (void __user *) arg); + + case BCHFS_IOC_REINHERIT_ATTRS: + return bch2_ioc_reinherit_attrs(c, file, inode, + (void __user 
*) arg); case FS_IOC_GETVERSION: return -ENOTTY; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 033582a87852..d23a82d94c5e 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -51,30 +51,6 @@ static void journal_seq_copy(struct bch_inode_info *dst, } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old); } -static inline int ptrcmp(void *l, void *r) -{ - return (l > r) - (l < r); -} - -#define __bch2_lock_inodes(_lock, ...) \ -do { \ - struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ - unsigned i; \ - \ - bubble_sort(&a[1], ARRAY_SIZE(a) - 1 , ptrcmp); \ - \ - for (i = ARRAY_SIZE(a) - 1; a[i]; --i) \ - if (a[i] != a[i - 1]) { \ - if (_lock) \ - mutex_lock_nested(&a[i]->ei_update_lock, i);\ - else \ - mutex_unlock(&a[i]->ei_update_lock); \ - } \ -} while (0) - -#define bch2_lock_inodes(...) __bch2_lock_inodes(true, __VA_ARGS__) -#define bch2_unlock_inodes(...) __bch2_lock_inodes(false, __VA_ARGS__) - static void __pagecache_lock_put(struct pagecache_lock *lock, long i) { BUG_ON(atomic_long_read(&lock->v) == 0); @@ -308,7 +284,7 @@ int bch2_reinherit_attrs_fn(struct bch_inode_info *inode, return ret; } -static struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) +struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) { struct bch_inode_unpacked inode_u; struct bch_inode_info *inode; @@ -393,14 +369,13 @@ __bch2_create(struct mnt_idmap *idmap, bch2_inode_init(c, &inode_u, 0, 0, 0, rdev, &dir->ei_inode); bch2_inode_init_owner(&inode_u, &dir->v, mode); - inode_u.bi_project = dir->ei_qid.q[QTYP_PRJ]; - hash_info = bch2_hash_info_init(c, &inode_u); if (tmpfile) inode_u.bi_flags |= BCH_INODE_UNLINKED; - ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, KEY_TYPE_QUOTA_PREALLOC); + ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, + KEY_TYPE_QUOTA_PREALLOC); if (ret) return ERR_PTR(ret); diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 18e41609c89d..4c584d3a27c3 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ 
-51,6 +51,30 @@ struct bch_inode_info { #define to_bch_ei(_inode) \ container_of_or_null(_inode, struct bch_inode_info, v) +static inline int ptrcmp(void *l, void *r) +{ + return (l > r) - (l < r); +} + +#define __bch2_lock_inodes(_lock, ...) \ +do { \ + struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ + unsigned i; \ + \ + bubble_sort(&a[1], ARRAY_SIZE(a) - 1 , ptrcmp); \ + \ + for (i = ARRAY_SIZE(a) - 1; a[i]; --i) \ + if (a[i] != a[i - 1]) { \ + if (_lock) \ + mutex_lock_nested(&a[i]->ei_update_lock, i);\ + else \ + mutex_unlock(&a[i]->ei_update_lock); \ + } \ +} while (0) + +#define bch2_lock_inodes(...) __bch2_lock_inodes(true, __VA_ARGS__) +#define bch2_unlock_inodes(...) __bch2_lock_inodes(false, __VA_ARGS__) + static inline struct bch_inode_info *file_bch_inode(struct file *file) { return to_bch_ei(file_inode(file)); @@ -97,6 +121,8 @@ int bch2_fs_quota_transfer(struct bch_fs *, unsigned, enum quota_acct_mode); +struct inode *bch2_vfs_inode_get(struct bch_fs *, u64); + /* returns 0 if we want to do the update, or error is passed up */ typedef int (*inode_set_fn)(struct bch_inode_info *, struct bch_inode_unpacked *, void *); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 23d3668b4567..6acb487312a8 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -258,7 +258,8 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, /* ick */ inode_u->bi_flags |= c->opts.str_hash << INODE_STR_HASH_OFFSET; - get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed)); + get_random_bytes(&inode_u->bi_hash_seed, + sizeof(inode_u->bi_hash_seed)); inode_u->bi_mode = mode; inode_u->bi_uid = uid; -- cgit v1.2.3 From 2fab25cdd70be6868936639dfb03eaa9fa0245c0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 19 Dec 2018 08:43:01 -0500 Subject: bcachefs: more project quota fixes Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-ioctl.c | 26 ++++++-------------------- fs/bcachefs/fs.h | 13 +++++++++++++ 
fs/bcachefs/quota.h | 2 +- fs/bcachefs/xattr.c | 7 +++++++ 4 files changed, 27 insertions(+), 21 deletions(-) (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 92939befe507..4925a127a335 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -104,19 +104,6 @@ static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, return copy_to_user(arg, &fa, sizeof(fa)); } -static int bch2_set_projid(struct bch_fs *c, - struct bch_inode_info *inode, - u32 projid) -{ - struct bch_qid qid = inode->ei_qid; - - qid.q[QTYP_PRJ] = projid; - - return bch2_fs_quota_transfer(c, inode, qid, - 1 << QTYP_PRJ, - KEY_TYPE_QUOTA_PREALLOC); -} - static int fssetxattr_inode_update_fn(struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) @@ -124,11 +111,7 @@ static int fssetxattr_inode_update_fn(struct bch_inode_info *inode, struct flags_set *s = p; if (s->projid != bi->bi_project) { - if (s->projid) - bi->bi_fields_set |= 1U << Inode_opt_project; - else - bi->bi_fields_set &= ~(1U << Inode_opt_project); - + bi->bi_fields_set |= 1U << Inode_opt_project; bi->bi_project = s->projid; } @@ -151,7 +134,10 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, if (fa.fsx_xflags) return -EOPNOTSUPP; - s.projid = fa.fsx_projid; + if (fa.fsx_projid >= U32_MAX) + return -EINVAL; + + s.projid = fa.fsx_projid + 1; ret = mnt_want_write_file(file); if (ret) @@ -164,7 +150,7 @@ static int bch2_ioc_fssetxattr(struct bch_fs *c, } mutex_lock(&inode->ei_update_lock); - ret = bch2_set_projid(c, inode, fa.fsx_projid); + ret = bch2_set_projid(c, inode, s.projid); if (ret) goto err_unlock; diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 4c584d3a27c3..f949cd0d2a68 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -121,6 +121,19 @@ int bch2_fs_quota_transfer(struct bch_fs *, unsigned, enum quota_acct_mode); +static inline int bch2_set_projid(struct bch_fs *c, + struct bch_inode_info *inode, + u32 projid) +{ + struct bch_qid qid = 
inode->ei_qid; + + qid.q[QTYP_PRJ] = projid; + + return bch2_fs_quota_transfer(c, inode, qid, + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_PREALLOC); +} + struct inode *bch2_vfs_inode_get(struct bch_fs *, u64); /* returns 0 if we want to do the update, or error is passed up */ diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h index 72b5ea0d77c5..51e4f9713ef0 100644 --- a/fs/bcachefs/quota.h +++ b/fs/bcachefs/quota.h @@ -20,7 +20,7 @@ static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) return (struct bch_qid) { .q[QTYP_USR] = u->bi_uid, .q[QTYP_GRP] = u->bi_gid, - .q[QTYP_PRJ] = u->bi_project, + .q[QTYP_PRJ] = u->bi_project ? u->bi_project - 1 : 0, }; } diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index dfb5c385e8c3..f31eec2f1fce 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -515,7 +515,14 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, } mutex_lock(&inode->ei_update_lock); + if (inode_opt_id == Inode_opt_project) { + ret = bch2_set_projid(c, inode, s.v); + if (ret) + goto err; + } + ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); +err: mutex_unlock(&inode->ei_update_lock); if (value && -- cgit v1.2.3 From 5154704b29e58a5fd9acd601b831d99298a76a6c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 20 Jul 2018 22:27:07 -0400 Subject: bcachefs: Use deferred btree updates for inode updates Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.h | 1 + fs/bcachefs/btree_locking.h | 2 -- fs/bcachefs/fs-io.c | 35 +++++++++++++++++++++-------------- fs/bcachefs/fs.c | 23 +++++++++++++++++++++-- fs/bcachefs/fs.h | 1 + 5 files changed, 44 insertions(+), 18 deletions(-) (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 52e0e003153b..a64ed6d32175 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -104,6 +104,7 @@ void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, unsigned, unsigned); int 
bch2_btree_iter_unlock(struct btree_iter *); +bool bch2_btree_iter_relock(struct btree_iter *); bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned); bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned); diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index 48b50e066186..c036cd0458a4 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -203,8 +203,6 @@ static inline bool bch2_btree_node_relock(struct btree_iter *iter, __bch2_btree_node_relock(iter, level); } -bool bch2_btree_iter_relock(struct btree_iter *); - void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *); void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 7681cfbc6bed..f8657baf0521 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -287,11 +287,11 @@ static int bch2_extent_update(struct btree_trans *trans, bool direct, s64 *total_delta) { - struct btree_iter *inode_iter = NULL; struct bch_inode_unpacked inode_u; struct bkey_inode_buf inode_p; bool allocating = false; bool extended = false; + bool inode_locked = false; s64 i_sectors_delta; int ret; @@ -314,16 +314,20 @@ static int bch2_extent_update(struct btree_trans *trans, /* XXX: inode->i_size locking */ if (i_sectors_delta || new_i_size > inode->ei_inode.bi_size) { - inode_iter = bch2_trans_get_iter(trans, - BTREE_ID_INODES, - POS(k->k.p.inode, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - if (IS_ERR(inode_iter)) - return PTR_ERR(inode_iter); + bch2_btree_iter_unlock(extent_iter); + mutex_lock(&inode->ei_update_lock); - ret = bch2_btree_iter_traverse(inode_iter); - if (ret) - goto err; + if (!bch2_btree_iter_relock(extent_iter)) { + mutex_unlock(&inode->ei_update_lock); + return -EINTR; + } + + inode_locked = true; + + if (!inode->ei_inode_update) + inode->ei_inode_update = + bch2_deferred_update_alloc(trans->c, + BTREE_ID_INODES, 64); inode_u = inode->ei_inode; 
inode_u.bi_sectors += i_sectors_delta; @@ -337,7 +341,8 @@ static int bch2_extent_update(struct btree_trans *trans, bch2_inode_pack(&inode_p, &inode_u); bch2_trans_update(trans, - BTREE_INSERT_ENTRY(inode_iter, &inode_p.inode.k_i)); + BTREE_INSERT_DEFERRED(inode->ei_inode_update, + &inode_p.inode.k_i)); } ret = bch2_trans_commit(trans, disk_res, @@ -371,13 +376,15 @@ static int bch2_extent_update(struct btree_trans *trans, if (total_delta) *total_delta += i_sectors_delta; err: - if (!IS_ERR_OR_NULL(inode_iter)) - bch2_trans_iter_put(trans, inode_iter); + if (inode_locked) + mutex_unlock(&inode->ei_update_lock); + return ret; } static int bchfs_write_index_update(struct bch_write_op *wop) { + struct bch_fs *c = wop->c; struct bchfs_write_op *op = container_of(wop, struct bchfs_write_op, op); struct quota_res *quota_res = op->is_dio @@ -392,7 +399,7 @@ static int bchfs_write_index_update(struct bch_write_op *wop) BUG_ON(k->k.p.inode != inode->v.i_ino); - bch2_trans_init(&trans, wop->c); + bch2_trans_init(&trans, c); bch2_trans_preload_iters(&trans); iter = bch2_trans_get_iter(&trans, diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 02c7543e40c8..5f93ea76785f 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -156,12 +156,18 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans, inode_set_fn set, void *p) { + struct bch_fs *c = trans->c; struct btree_iter *iter; struct bkey_inode_buf *inode_p; int ret; lockdep_assert_held(&inode->ei_update_lock); + /* XXX: Don't do this with btree locks held */ + if (!inode->ei_inode_update) + inode->ei_inode_update = + bch2_deferred_update_alloc(c, BTREE_ID_INODES, 64); +#if 0 iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(inode->v.i_ino, 0), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); @@ -172,7 +178,7 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans, ret = bch2_btree_iter_traverse(iter); if (ret) return ret; - +#endif *inode_u = inode->ei_inode; if (set) { @@ -186,7 +192,15 @@ int 
__must_check bch2_write_inode_trans(struct btree_trans *trans, return PTR_ERR(inode_p); bch2_inode_pack(inode_p, inode_u); - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &inode_p->inode.k_i)); + + if (!inode->ei_inode_update) + bch2_trans_update(trans, + BTREE_INSERT_ENTRY(iter, &inode_p->inode.k_i)); + else + bch2_trans_update(trans, + BTREE_INSERT_DEFERRED(inode->ei_inode_update, + &inode_p->inode.k_i)); + return 0; } @@ -1431,6 +1445,7 @@ static struct inode *bch2_alloc_inode(struct super_block *sb) mutex_init(&inode->ei_update_lock); pagecache_lock_init(&inode->ei_pagecache_lock); mutex_init(&inode->ei_quota_lock); + inode->ei_inode_update = NULL; inode->ei_journal_seq = 0; return &inode->v; @@ -1494,6 +1509,10 @@ static void bch2_evict_inode(struct inode *vinode) BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); + if (inode->ei_inode_update) + bch2_deferred_update_free(c, inode->ei_inode_update); + inode->ei_inode_update = NULL; + if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), KEY_TYPE_QUOTA_WARN); diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index f949cd0d2a68..b9a8a9bc3e90 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -34,6 +34,7 @@ struct bch_inode_info { struct inode v; struct mutex ei_update_lock; + struct deferred_update *ei_inode_update; u64 ei_journal_seq; u64 ei_quota_reserved; unsigned long ei_last_dirtied; -- cgit v1.2.3 From 3ea2b1e12898154d6fae49b22a3509521ba49d38 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Apr 2019 04:54:12 -0400 Subject: bcachefs: cmp_int() Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_background.c | 8 ++++---- fs/bcachefs/bkey.c | 2 +- fs/bcachefs/bkey.h | 4 ++-- fs/bcachefs/bset.h | 2 +- fs/bcachefs/btree_update_leaf.c | 2 +- fs/bcachefs/ec.c | 2 +- fs/bcachefs/fs.h | 2 +- fs/bcachefs/journal_seq_blacklist.c | 2 +- fs/bcachefs/movinggc.c | 4 ++-- fs/bcachefs/replicas.c | 2 +- fs/bcachefs/sysfs.c | 6 
+++--- fs/bcachefs/util.h | 2 ++ 12 files changed, 20 insertions(+), 18 deletions(-) (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index acd7be90fc47..b3a8ff0b1daa 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -687,16 +687,16 @@ static inline int bucket_alloc_cmp(alloc_heap *h, struct alloc_heap_entry l, struct alloc_heap_entry r) { - return (l.key > r.key) - (l.key < r.key) ?: - (l.nr < r.nr) - (l.nr > r.nr) ?: - (l.bucket > r.bucket) - (l.bucket < r.bucket); + return cmp_int(l.key, r.key) ?: + cmp_int(r.nr, l.nr) ?: + cmp_int(l.bucket, r.bucket); } static inline int bucket_idx_cmp(const void *_l, const void *_r) { const struct alloc_heap_entry *l = _l, *r = _r; - return (l->bucket > r->bucket) - (l->bucket < r->bucket); + return cmp_int(l->bucket, r->bucket); } static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 8a3295ff9631..8b3c9ae8d266 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -1024,7 +1024,7 @@ static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, r_v = *r; } - return (l_v > r_v) - (l_v < r_v); + return cmp_int(l_v, r_v); } #endif diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 44044fcd6f9f..45de61d492a4 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -217,8 +217,8 @@ void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); static __always_inline int bversion_cmp(struct bversion l, struct bversion r) { - return (l.hi > r.hi) - (l.hi < r.hi) ?: - (l.lo > r.lo) - (l.lo < r.lo); + return cmp_int(l.hi, r.hi) ?: + cmp_int(l.lo, r.lo); } #define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 329ffb0b6b3d..da3e41cc9757 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -465,7 +465,7 @@ static inline int bkey_iter_cmp(struct btree *b, { return 
bkey_cmp_packed(b, l, r) ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) - ?: (l > r) - (l < r); + ?: cmp_int(l, r); } static inline int btree_node_iter_cmp(struct btree *b, diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 8e686dc42f9d..48d3be517471 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -55,7 +55,7 @@ static void btree_trans_unlock_write(struct btree_trans *trans) static inline int btree_trans_cmp(struct btree_insert_entry l, struct btree_insert_entry r) { - return (l.deferred > r.deferred) - (l.deferred < r.deferred) ?: + return cmp_int(l.deferred, r.deferred) ?: btree_iter_cmp(l.iter, r.iter); } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index ea009f0ff829..6a357e5b652e 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -951,7 +951,7 @@ static int unsigned_cmp(const void *_l, const void *_r) unsigned l = *((const unsigned *) _l); unsigned r = *((const unsigned *) _r); - return (l > r) - (l < r); + return cmp_int(l, r); } /* pick most common bucket size: */ diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index b9a8a9bc3e90..e72d6a58b322 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -54,7 +54,7 @@ struct bch_inode_info { static inline int ptrcmp(void *l, void *r) { - return (l > r) - (l < r); + return cmp_int(l, r); } #define __bch2_lock_inodes(_lock, ...) 
\ diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 0df8dfccd5b5..ae64bf3248ef 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -136,7 +136,7 @@ static int journal_seq_blacklist_table_cmp(const void *_l, const struct journal_seq_blacklist_table_entry *l = _l; const struct journal_seq_blacklist_table_entry *r = _r; - return (l->start > r->start) - (l->start < r->start); + return cmp_int(l->start, r->start); } bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 78d9ca8bfc5e..aba13e6ea4ff 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -54,7 +54,7 @@ static inline int sectors_used_cmp(copygc_heap *heap, struct copygc_heap_entry l, struct copygc_heap_entry r) { - return (l.sectors > r.sectors) - (l.sectors < r.sectors); + return cmp_int(l.sectors, r.sectors); } static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) @@ -62,7 +62,7 @@ static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) const struct copygc_heap_entry *l = _l; const struct copygc_heap_entry *r = _r; - return (l->offset > r->offset) - (l->offset < r->offset); + return cmp_int(l->offset, r->offset); } static bool __copygc_pred(struct bch_dev *ca, diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index b66217989b71..b1df2c1ce4a4 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -12,7 +12,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, static inline int u8_cmp(u8 l, u8 r) { - return (l > r) - (l < r); + return cmp_int(l, r); } static void verify_replicas_entry_sorted(struct bch_replicas_entry *e) diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index db87a63b97cc..f4b70f66d0ac 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -751,10 +751,10 @@ static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca, 
static int unsigned_cmp(const void *_l, const void *_r) { - unsigned l = *((unsigned *) _l); - unsigned r = *((unsigned *) _r); + const unsigned *l = _l; + const unsigned *r = _r; - return (l > r) - (l < r); + return cmp_int(*l, *r); } static ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca, diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index dc40a52ac8c7..59c8a1dac7be 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -743,4 +743,6 @@ static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src, u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); +#define cmp_int(l, r) ((l > r) - (l < r)) + #endif /* _BCACHEFS_UTIL_H */ -- cgit v1.2.3 From 168f4c5fb375131bd0f5996b549c5e13cc2c2bb5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 24 Jun 2019 18:24:38 -0400 Subject: bcachefs: Improve bch2_lock_inodes() Can now be used for the two different types of locks we have so far Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-ioctl.c | 4 ++-- fs/bcachefs/fs.c | 10 ++++++---- fs/bcachefs/fs.h | 34 ++++++++++++++++++++++++++-------- 3 files changed, 34 insertions(+), 14 deletions(-) (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 4dca716217a6..0cf2621ec4fc 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -205,7 +205,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, if (ret) goto err2; - bch2_lock_inodes(src, dst); + bch2_lock_inodes(INODE_UPDATE_LOCK, src, dst); if (inode_attr_changing(src, dst, Inode_opt_project)) { ret = bch2_fs_quota_transfer(c, dst, @@ -218,7 +218,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0); err3: - bch2_unlock_inodes(src, dst); + bch2_unlock_inodes(INODE_UPDATE_LOCK, src, dst); /* return true if we did work */ if (ret >= 0) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index afe930532224..c806ebad9cde 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c 
@@ -657,7 +657,7 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) struct btree_trans trans; int ret; - bch2_lock_inodes(dir, inode); + bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); bch2_trans_init(&trans, c, 4, 1024); retry: bch2_trans_begin(&trans); @@ -690,7 +690,7 @@ retry: ATTR_MTIME); err: bch2_trans_exit(&trans); - bch2_unlock_inodes(dir, inode); + bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); return ret; } @@ -871,7 +871,8 @@ static int bch2_rename2(struct mnt_idmap *idmap, bch2_trans_init(&trans, c, 8, 2048); - bch2_lock_inodes(i.src_dir, + bch2_lock_inodes(INODE_UPDATE_LOCK, + i.src_dir, i.dst_dir, i.src_inode, i.dst_inode); @@ -969,7 +970,8 @@ err: 1 << QTYP_PRJ, KEY_TYPE_QUOTA_NOCHECK); - bch2_unlock_inodes(i.src_dir, + bch2_unlock_inodes(INODE_UPDATE_LOCK, + i.src_dir, i.dst_dir, i.src_inode, i.dst_inode); diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index e72d6a58b322..de07f0f1dd51 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -57,24 +57,42 @@ static inline int ptrcmp(void *l, void *r) return cmp_int(l, r); } -#define __bch2_lock_inodes(_lock, ...) \ +enum bch_inode_lock_op { + INODE_LOCK = (1U << 0), + INODE_UPDATE_LOCK = (1U << 1), +}; + +#define bch2_lock_inodes(_locks, ...) \ do { \ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ unsigned i; \ \ - bubble_sort(&a[1], ARRAY_SIZE(a) - 1 , ptrcmp); \ + bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ \ - for (i = ARRAY_SIZE(a) - 1; a[i]; --i) \ + for (i = 1; i < ARRAY_SIZE(a); i++) \ if (a[i] != a[i - 1]) { \ - if (_lock) \ + if (_locks & INODE_LOCK) \ + down_write_nested(&a[i]->v.i_rwsem, i); \ + if (_locks & INODE_UPDATE_LOCK) \ mutex_lock_nested(&a[i]->ei_update_lock, i);\ - else \ - mutex_unlock(&a[i]->ei_update_lock); \ } \ } while (0) -#define bch2_lock_inodes(...) __bch2_lock_inodes(true, __VA_ARGS__) -#define bch2_unlock_inodes(...) __bch2_lock_inodes(false, __VA_ARGS__) +#define bch2_unlock_inodes(_locks, ...) 
\ +do { \ + struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ + unsigned i; \ + \ + bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ + \ + for (i = 1; i < ARRAY_SIZE(a); i++) \ + if (a[i] != a[i - 1]) { \ + if (_locks & INODE_LOCK) \ + up_write(&a[i]->v.i_rwsem); \ + if (_locks & INODE_UPDATE_LOCK) \ + mutex_unlock(&a[i]->ei_update_lock); \ + } \ +} while (0) static inline struct bch_inode_info *file_bch_inode(struct file *file) { -- cgit v1.2.3 From 76426098e419c1732efc3f88166f3f3592c215c9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 16 Aug 2019 09:59:56 -0400 Subject: bcachefs: Reflink Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/bcachefs.h | 4 + fs/bcachefs/bcachefs_format.h | 26 +++- fs/bcachefs/bkey.h | 2 + fs/bcachefs/bkey_methods.c | 1 + fs/bcachefs/btree_types.h | 9 +- fs/bcachefs/btree_update_leaf.c | 3 +- fs/bcachefs/buckets.c | 100 +++++++++++++- fs/bcachefs/extents.c | 50 +++++-- fs/bcachefs/extents.h | 19 ++- fs/bcachefs/fs-io.c | 218 ++++++++++++++++++++++------- fs/bcachefs/fs-io.h | 19 +++ fs/bcachefs/fs.c | 42 +++++- fs/bcachefs/fs.h | 15 +- fs/bcachefs/io.c | 127 +++++++++++++---- fs/bcachefs/io.h | 3 + fs/bcachefs/migrate.c | 13 +- fs/bcachefs/move.c | 98 ++++++++----- fs/bcachefs/move.h | 3 +- fs/bcachefs/recovery.c | 18 +-- fs/bcachefs/reflink.c | 300 ++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/reflink.h | 32 +++++ fs/bcachefs/replicas.c | 1 + 23 files changed, 945 insertions(+), 159 deletions(-) create mode 100644 fs/bcachefs/reflink.c create mode 100644 fs/bcachefs/reflink.h (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index c29ccdb45965..4c2608409144 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -44,6 +44,7 @@ bcachefs-y := \ quota.o \ rebalance.o \ recovery.o \ + reflink.o \ replicas.o \ siphash.o \ six.o \ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 68e2d3b1a9a6..410fce3ed8d4 100644 --- 
a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -361,6 +361,7 @@ enum gc_phase { GC_PHASE_BTREE_XATTRS, GC_PHASE_BTREE_ALLOC, GC_PHASE_BTREE_QUOTAS, + GC_PHASE_BTREE_REFLINK, GC_PHASE_PENDING_DELETE, GC_PHASE_ALLOC, @@ -750,6 +751,9 @@ struct bch_fs { struct work_struct ec_stripe_delete_work; struct llist_head ec_stripe_delete_list; + /* REFLINK */ + u64 reflink_hint; + /* VFS IO PATH - fs-io.c */ struct bio_set writepage_bioset; struct bio_set dio_write_bioset; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index b8aafd2e283a..62afea1e7ec3 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -340,7 +340,9 @@ static inline void bkey_init(struct bkey *k) x(xattr, 11) \ x(alloc, 12) \ x(quota, 13) \ - x(stripe, 14) + x(stripe, 14) \ + x(reflink_p, 15) \ + x(reflink_v, 16) enum bch_bkey_type { #define x(name, nr) KEY_TYPE_##name = nr, @@ -895,6 +897,24 @@ struct bch_stripe { struct bch_extent_ptr ptrs[0]; } __attribute__((packed, aligned(8))); +/* Reflink: */ + +struct bch_reflink_p { + struct bch_val v; + __le64 idx; + + __le32 reservation_generation; + __u8 nr_replicas; + __u8 pad[3]; +}; + +struct bch_reflink_v { + struct bch_val v; + __le64 refcount; + union bch_extent_entry start[0]; + __u64 _data[0]; +}; + /* Optional/variable size superblock sections: */ struct bch_sb_field { @@ -1297,6 +1317,7 @@ enum bch_sb_features { BCH_FEATURE_ATOMIC_NLINK = 3, /* should have gone under compat */ BCH_FEATURE_EC = 4, BCH_FEATURE_JOURNAL_SEQ_BLACKLIST_V3 = 5, + BCH_FEATURE_REFLINK = 6, BCH_FEATURE_NR, }; @@ -1487,7 +1508,8 @@ LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); x(XATTRS, 3, "xattrs") \ x(ALLOC, 4, "alloc") \ x(QUOTAS, 5, "quotas") \ - x(EC, 6, "erasure_coding") + x(EC, 6, "erasure_coding") \ + x(REFLINK, 7, "reflink") enum btree_id { #define x(kwd, val, name) BTREE_ID_##kwd = val, diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index b3a08e52e6b3..321fe6fe0b55 100644 --- 
a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -560,6 +560,8 @@ BKEY_VAL_ACCESSORS(xattr); BKEY_VAL_ACCESSORS(alloc); BKEY_VAL_ACCESSORS(quota); BKEY_VAL_ACCESSORS(stripe); +BKEY_VAL_ACCESSORS(reflink_p); +BKEY_VAL_ACCESSORS(reflink_v); /* byte order helpers */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 8af16ca994e0..6fa6ac1fadc1 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -10,6 +10,7 @@ #include "extents.h" #include "inode.h" #include "quota.h" +#include "reflink.h" #include "xattr.h" const char * const bch2_bkey_types[] = { diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index ec14e2deecb7..621cbfa22fc9 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -464,7 +464,13 @@ static inline enum btree_node_type btree_node_type(struct btree *b) static inline bool btree_node_type_is_extents(enum btree_node_type type) { - return type == BKEY_TYPE_EXTENTS; + switch (type) { + case BKEY_TYPE_EXTENTS: + case BKEY_TYPE_REFLINK: + return true; + default: + return false; + } } static inline bool btree_node_is_extents(struct btree *b) @@ -480,6 +486,7 @@ static inline bool btree_node_type_needs_gc(enum btree_node_type type) case BKEY_TYPE_EXTENTS: case BKEY_TYPE_INODES: case BKEY_TYPE_EC: + case BKEY_TYPE_REFLINK: return true; default: return false; diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index 5f94b6e9cf28..443ffb5c709d 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -521,7 +521,8 @@ static inline bool update_triggers_transactional(struct btree_trans *trans, { return likely(!(trans->flags & BTREE_INSERT_MARK_INMEM)) && (i->iter->btree_id == BTREE_ID_EXTENTS || - i->iter->btree_id == BTREE_ID_INODES); + i->iter->btree_id == BTREE_ID_INODES || + i->iter->btree_id == BTREE_ID_REFLINK); } static inline bool update_has_triggers(struct btree_trans *trans, diff --git a/fs/bcachefs/buckets.c 
b/fs/bcachefs/buckets.c index baf9642d21ca..3d243f2d1095 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -972,7 +972,7 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c, spin_unlock(&c->ec_stripes_heap_lock); bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", (u64) p.idx); - return -1; + return -EIO; } BUG_ON(m->r.e.data_type != data_type); @@ -1144,6 +1144,7 @@ int bch2_mark_key_locked(struct bch_fs *c, fs_usage, journal_seq, flags); break; case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_USER, fs_usage, journal_seq, flags); break; @@ -1304,7 +1305,8 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, xchg(&warned_disk_usage, 1)) return; - pr_err("disk usage increased more than %llu sectors reserved", disk_res_sectors); + bch_err(c, "disk usage increased more than %llu sectors reserved", + disk_res_sectors); trans_for_each_update_iter(trans, i) { struct btree_iter *iter = i->iter; @@ -1319,7 +1321,7 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, node_iter = iter->l[0].iter; while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b, - KEY_TYPE_discard))) { + KEY_TYPE_discard))) { struct bkey unpacked; struct bkey_s_c k; @@ -1471,6 +1473,7 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, struct bch_extent_stripe_ptr p, s64 sectors, enum bch_data_type data_type) { + struct bch_fs *c = trans->c; struct bch_replicas_padded r; struct btree_iter *iter; struct bkey_i *new_k; @@ -1487,10 +1490,10 @@ static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, return ret; if (k.k->type != KEY_TYPE_stripe) { - bch_err_ratelimited(trans->c, - "pointer to nonexistent stripe %llu", - (u64) p.idx); - ret = -1; + bch2_fs_inconsistent(c, + "pointer to nonexistent stripe %llu", + (u64) p.idx); + ret = -EIO; goto out; } @@ -1578,6 +1581,84 @@ static int bch2_trans_mark_extent(struct btree_trans *trans, return 0; } +static int 
__bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 idx, unsigned sectors, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree_iter *iter; + struct bkey_i *new_k; + struct bkey_s_c k; + struct bkey_i_reflink_v *r_v; + s64 ret; + + ret = trans_get_key(trans, BTREE_ID_REFLINK, + POS(0, idx), &iter, &k); + if (ret) + return ret; + + if (k.k->type != KEY_TYPE_reflink_v) { + bch2_fs_inconsistent(c, + "%llu:%llu len %u points to nonexistent indirect extent %llu", + p.k->p.inode, p.k->p.offset, p.k->size, idx); + ret = -EIO; + goto err; + } + + if ((flags & BCH_BUCKET_MARK_OVERWRITE) && + (bkey_start_offset(k.k) < idx || + k.k->p.offset > idx + sectors)) + goto out; + + bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); + BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); + + new_k = trans_update_key(trans, iter, k.k->u64s); + ret = PTR_ERR_OR_ZERO(new_k); + if (ret) + goto err; + + bkey_reassemble(new_k, k); + r_v = bkey_i_to_reflink_v(new_k); + + le64_add_cpu(&r_v->v.refcount, + !(flags & BCH_BUCKET_MARK_OVERWRITE) ? 
1 : -1); + + if (!r_v->v.refcount) { + r_v->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&r_v->k, 0); + } +out: + ret = k.k->p.offset - idx; +err: + bch2_trans_iter_put(trans, iter); + return ret; +} + +static int bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, unsigned offset, + s64 sectors, unsigned flags) +{ + u64 idx = le64_to_cpu(p.v->idx) + offset; + s64 ret = 0; + + sectors = abs(sectors); + BUG_ON(offset + sectors > p.k->size); + + while (sectors) { + ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, flags); + if (ret < 0) + break; + + idx += ret; + sectors = max_t(s64, 0LL, sectors - ret); + ret = 0; + } + + return ret; +} + int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, unsigned offset, s64 sectors, unsigned flags) { @@ -1593,6 +1674,7 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, return bch2_trans_mark_extent(trans, k, offset, sectors, flags, BCH_DATA_BTREE); case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: return bch2_trans_mark_extent(trans, k, offset, sectors, flags, BCH_DATA_USER); case KEY_TYPE_inode: @@ -1616,6 +1698,10 @@ int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, d->fs_usage.persistent_reserved[replicas - 1] += sectors; return 0; } + case KEY_TYPE_reflink_p: + return bch2_trans_mark_reflink_p(trans, + bkey_s_c_to_reflink_p(k), + offset, sectors, flags); default: return 0; } diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 11defa3d99a5..81ec55526ce9 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -744,7 +744,8 @@ void __bch2_cut_front(struct bpos where, struct bkey_s k) case KEY_TYPE_error: case KEY_TYPE_cookie: break; - case KEY_TYPE_extent: { + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: { struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry; bool seen_crc = false; @@ -774,6 +775,12 @@ void __bch2_cut_front(struct bpos where, struct bkey_s k) break; } + case 
KEY_TYPE_reflink_p: { + struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); + + le64_add_cpu(&p.v->idx, sub); + break; + } case KEY_TYPE_reservation: break; default: @@ -968,6 +975,33 @@ static int __bch2_extent_atomic_end(struct btree_trans *trans, } break; + case KEY_TYPE_reflink_p: { + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + u64 idx = le64_to_cpu(p.v->idx); + unsigned sectors = end->offset - bkey_start_offset(p.k); + struct btree_iter *iter; + struct bkey_s_c r_k; + + for_each_btree_key(trans, iter, + BTREE_ID_REFLINK, POS(0, idx + offset), + BTREE_ITER_SLOTS, r_k, ret) { + if (bkey_cmp(bkey_start_pos(r_k.k), + POS(0, idx + sectors)) >= 0) + break; + + *nr_iters += 1; + if (*nr_iters >= max_iters) { + struct bpos pos = bkey_start_pos(k.k); + pos.offset += r_k.k->p.offset - idx; + + *end = bpos_min(*end, pos); + break; + } + } + + bch2_trans_iter_put(trans, iter); + break; + } } return ret; @@ -1561,17 +1595,17 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) return false; } -void bch2_extent_mark_replicas_cached(struct bch_fs *c, - struct bkey_s_extent e, - unsigned target, - unsigned nr_desired_replicas) +void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, + unsigned target, + unsigned nr_desired_replicas) { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); union bch_extent_entry *entry; struct extent_ptr_decoded p; - int extra = bch2_bkey_durability(c, e.s_c) - nr_desired_replicas; + int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas; if (target && extra > 0) - extent_for_each_ptr_decode(e, p, entry) { + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { int n = bch2_extent_ptr_durability(c, p); if (n && n <= extra && @@ -1582,7 +1616,7 @@ void bch2_extent_mark_replicas_cached(struct bch_fs *c, } if (extra > 0) - extent_for_each_ptr_decode(e, p, entry) { + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { int n = bch2_extent_ptr_durability(c, p); if (n && n <= extra) { diff --git a/fs/bcachefs/extents.h 
b/fs/bcachefs/extents.h index 156d8e37045a..cef93af25858 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -306,6 +306,14 @@ static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) to_entry(&s.v->ptrs[s.v->nr_blocks]), }; } + case KEY_TYPE_reflink_v: { + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + + return (struct bkey_ptrs_c) { + r.v->start, + bkey_val_end(r), + }; + } default: return (struct bkey_ptrs_c) { NULL, NULL }; } @@ -436,8 +444,8 @@ bch2_extent_can_insert(struct btree_trans *, struct btree_insert_entry *, void bch2_insert_fixup_extent(struct btree_trans *, struct btree_insert_entry *); -void bch2_extent_mark_replicas_cached(struct bch_fs *, struct bkey_s_extent, - unsigned, unsigned); +void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, + unsigned, unsigned); const struct bch_extent_ptr * bch2_extent_has_device(struct bkey_s_c_extent, unsigned); @@ -452,17 +460,24 @@ static inline bool bkey_extent_is_data(const struct bkey *k) switch (k->type) { case KEY_TYPE_btree_ptr: case KEY_TYPE_extent: + case KEY_TYPE_reflink_p: + case KEY_TYPE_reflink_v: return true; default: return false; } } +/* + * Should extent be counted under inode->i_sectors? 
+ */ static inline bool bkey_extent_is_allocation(const struct bkey *k) { switch (k->type) { case KEY_TYPE_extent: case KEY_TYPE_reservation: + case KEY_TYPE_reflink_p: + case KEY_TYPE_reflink_v: return true; default: return false; diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index ef94aecaa7cb..771fb111550d 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -16,6 +16,7 @@ #include "io.h" #include "keylist.h" #include "quota.h" +#include "reflink.h" #include "trace.h" #include @@ -201,9 +202,9 @@ static int inode_set_size(struct bch_inode_info *inode, return 0; } -static int __must_check bch2_write_inode_size(struct bch_fs *c, - struct bch_inode_info *inode, - loff_t new_size, unsigned fields) +int __must_check bch2_write_inode_size(struct bch_fs *c, + struct bch_inode_info *inode, + loff_t new_size, unsigned fields) { struct inode_new_size s = { .new_size = new_size, @@ -936,15 +937,12 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) { struct bvec_iter iter; struct bio_vec bv; - unsigned nr_ptrs = bch2_bkey_nr_ptrs_allocated(k); + unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v + ? 0 : bch2_bkey_nr_ptrs_allocated(k); unsigned state = k.k->type == KEY_TYPE_reservation ? 
SECTOR_RESERVED : SECTOR_ALLOCATED; - BUG_ON(bio->bi_iter.bi_sector < bkey_start_offset(k.k)); - BUG_ON(bio_end_sector(bio) > k.k->p.offset); - - bio_for_each_segment(bv, bio, iter) { struct bch_page_state *s = bch2_page_state(bv.bv_page); unsigned i; @@ -959,10 +957,11 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) } static void readpage_bio_extend(struct readpages_iter *iter, - struct bio *bio, u64 offset, + struct bio *bio, + unsigned sectors_this_extent, bool get_more) { - while (bio_end_sector(bio) < offset && + while (bio_sectors(bio) < sectors_this_extent && bio->bi_vcnt < bio->bi_max_vecs) { pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT; struct page *page = readpage_iter_next(iter); @@ -1012,35 +1011,39 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, struct bch_fs *c = trans->c; int flags = BCH_READ_RETRY_IF_STALE| BCH_READ_MAY_PROMOTE; + int ret = 0; rbio->c = c; rbio->start_time = local_clock(); - +retry: while (1) { BKEY_PADDED(k) tmp; struct bkey_s_c k; - unsigned bytes, offset_into_extent; + unsigned bytes, sectors, offset_into_extent; bch2_btree_iter_set_pos(iter, POS(inum, rbio->bio.bi_iter.bi_sector)); k = bch2_btree_iter_peek_slot(iter); - BUG_ON(!k.k); - - if (IS_ERR(k.k)) { - int ret = btree_iter_err(iter); - BUG_ON(!ret); - bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); - bio_endio(&rbio->bio); - return; - } + ret = bkey_err(k); + if (ret) + break; bkey_reassemble(&tmp.k, k); - bch2_trans_unlock(trans); k = bkey_i_to_s_c(&tmp.k); offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + ret = bch2_read_indirect_extent(trans, iter, + &offset_into_extent, &tmp.k); + if (ret) + break; + + sectors = min(sectors, k.k->size - offset_into_extent); + + bch2_trans_unlock(trans); if (readpages_iter) { bool want_full_extent = false; @@ -1055,13 +1058,11 @@ static void bchfs_read(struct btree_trans *trans, struct 
btree_iter *iter, (p.crc.compression_type != 0)); } - readpage_bio_extend(readpages_iter, - &rbio->bio, k.k->p.offset, - want_full_extent); + readpage_bio_extend(readpages_iter, &rbio->bio, + sectors, want_full_extent); } - bytes = min_t(unsigned, bio_sectors(&rbio->bio), - (k.k->size - offset_into_extent)) << 9; + bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; swap(rbio->bio.bi_iter.bi_size, bytes); if (rbio->bio.bi_iter.bi_size == bytes) @@ -1078,6 +1079,12 @@ static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, swap(rbio->bio.bi_iter.bi_size, bytes); bio_advance(&rbio->bio, bytes); } + + if (ret == -EINTR) + goto retry; + + bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); + bio_endio(&rbio->bio); } void bch2_readahead(struct readahead_control *ractl) @@ -2256,29 +2263,25 @@ out: /* truncate: */ -static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode, - u64 start_offset, u64 end_offset, u64 *journal_seq) +int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + struct bpos end, struct bch_inode_info *inode, + u64 new_i_size) { - struct bpos start = POS(inode->v.i_ino, start_offset); - struct bpos end = POS(inode->v.i_ino, end_offset); + struct bch_fs *c = trans->c; unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); - struct btree_trans trans; - struct btree_iter *iter; struct bkey_s_c k; - int ret = 0; - - bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); - - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, start, - BTREE_ITER_INTENT); + int ret = 0, ret2 = 0; while ((k = bch2_btree_iter_peek(iter)).k && - !(ret = bkey_err(k)) && bkey_cmp(iter->pos, end) < 0) { struct disk_reservation disk_res = bch2_disk_reservation_init(c, 0); struct bkey_i delete; + ret = bkey_err(k); + if (ret) + goto btree_err; + bkey_init(&delete.k); delete.k.p = iter->pos; @@ -2286,23 +2289,51 @@ static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode, bch2_key_resize(&delete.k, max_sectors); 
bch2_cut_back(end, &delete.k); - bch2_trans_begin_updates(&trans); + bch2_trans_begin_updates(trans); - ret = bch2_extent_update(&trans, inode, + ret = bch2_extent_update(trans, inode, &disk_res, NULL, iter, &delete, - 0, true, true, NULL); + new_i_size, false, true, NULL); bch2_disk_reservation_put(c, &disk_res); - - if (ret == -EINTR) +btree_err: + if (ret == -EINTR) { + ret2 = ret; ret = 0; + } if (ret) break; + } - bch2_trans_cond_resched(&trans); + if (bkey_cmp(iter->pos, end) > 0) { + bch2_btree_iter_set_pos(iter, end); + ret = bch2_btree_iter_traverse(iter); } + return ret ?: ret2; +} + +static int __bch2_fpunch(struct bch_fs *c, struct bch_inode_info *inode, + u64 start_offset, u64 end_offset) +{ + struct btree_trans trans; + struct btree_iter *iter; + int ret = 0; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS(inode->v.i_ino, start_offset), + BTREE_ITER_INTENT); + + ret = bch2_fpunch_at(&trans, iter, + POS(inode->v.i_ino, end_offset), + inode, 0); + bch2_trans_exit(&trans); + if (ret == -EINTR) + ret = 0; + return ret; } @@ -2510,7 +2541,7 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) ret = __bch2_fpunch(c, inode, round_up(iattr->ia_size, block_bytes(c)) >> 9, - U64_MAX, &inode->ei_journal_seq); + U64_MAX); if (unlikely(ret)) goto err; @@ -2557,8 +2588,7 @@ static long bch2_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) truncate_pagecache_range(&inode->v, offset, offset + len - 1); if (discard_start < discard_end) - ret = __bch2_fpunch(c, inode, discard_start, discard_end, - &inode->ei_journal_seq); + ret = __bch2_fpunch(c, inode, discard_start, discard_end); err: bch2_pagecache_block_put(&inode->ei_pagecache_lock); inode_unlock(&inode->v); @@ -2670,7 +2700,7 @@ bkey_err: ret = __bch2_fpunch(c, inode, round_up(new_size, block_bytes(c)) >> 9, - U64_MAX, &inode->ei_journal_seq); + U64_MAX); if (ret) goto err; @@ -2853,6 +2883,94 @@ long 
bch2_fallocate_dispatch(struct file *file, int mode, return -EOPNOTSUPP; } +static void mark_range_unallocated(struct bch_inode_info *inode, + loff_t start, loff_t end) +{ + pgoff_t index = start >> PAGE_SHIFT; + pgoff_t end_index = (end - 1) >> PAGE_SHIFT; + struct folio_batch fbatch; + unsigned i, j; + + folio_batch_init(&fbatch); + + while (filemap_get_folios(inode->v.i_mapping, + &index, end_index, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + struct bch_page_state *s; + + folio_lock(folio); + s = bch2_page_state(&folio->page); + + if (s) + for (j = 0; j < PAGE_SECTORS; j++) + s->s[j].nr_replicas = 0; + + folio_unlock(folio); + } + folio_batch_release(&fbatch); + cond_resched(); + } +} + +loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, + struct file *file_dst, loff_t pos_dst, + loff_t len, unsigned remap_flags) +{ + struct bch_inode_info *src = file_bch_inode(file_src); + struct bch_inode_info *dst = file_bch_inode(file_dst); + struct bch_fs *c = src->v.i_sb->s_fs_info; + loff_t ret = 0; + loff_t aligned_len; + + if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) + return -EINVAL; + + if (remap_flags & REMAP_FILE_DEDUP) + return -EOPNOTSUPP; + + if ((pos_src & (block_bytes(c) - 1)) || + (pos_dst & (block_bytes(c) - 1))) + return -EINVAL; + + if (src == dst && + abs(pos_src - pos_dst) < len) + return -EINVAL; + + bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); + + inode_dio_wait(&src->v); + inode_dio_wait(&dst->v); + + ret = generic_remap_file_range_prep(file_src, pos_src, + file_dst, pos_dst, + &len, remap_flags); + if (ret < 0 || len == 0) + goto out_unlock; + + aligned_len = round_up(len, block_bytes(c)); + + ret = write_invalidate_inode_pages_range(dst->v.i_mapping, + pos_dst, pos_dst + aligned_len); + if (ret) + goto out_unlock; + + mark_range_unallocated(src, pos_src, pos_src + aligned_len); + + ret = bch2_remap_range(c, dst, + POS(dst->v.i_ino, pos_dst 
>> 9), + POS(src->v.i_ino, pos_src >> 9), + aligned_len >> 9, + pos_dst + len); + if (ret > 0) + ret = min(ret << 9, len); + +out_unlock: + bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); + + return ret; +} + /* fseek: */ static int folio_data_offset(struct folio *folio, unsigned offset) diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h index e263b515e901..861ec25ab9ef 100644 --- a/fs/bcachefs/fs-io.h +++ b/fs/bcachefs/fs-io.h @@ -9,6 +9,22 @@ #include +struct quota_res; + +int bch2_extent_update(struct btree_trans *, + struct bch_inode_info *, + struct disk_reservation *, + struct quota_res *, + struct btree_iter *, + struct bkey_i *, + u64, bool, bool, s64 *); +int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, + struct bpos, struct bch_inode_info *, u64); + +int __must_check bch2_write_inode_size(struct bch_fs *, + struct bch_inode_info *, + loff_t, unsigned); + int bch2_writepage(struct page *, struct writeback_control *); int bch2_read_folio(struct file *, struct folio *); @@ -28,6 +44,9 @@ int bch2_fsync(struct file *, loff_t, loff_t, int); int bch2_truncate(struct bch_inode_info *, struct iattr *); long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); +loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, + loff_t, loff_t, unsigned); + loff_t bch2_llseek(struct file *, loff_t, int); vm_fault_t bch2_page_fault(struct vm_fault *); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 54e555fb4d5d..fad019d3c3f5 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1157,6 +1157,9 @@ static int bch2_fill_extent(struct bch_fs *c, struct extent_ptr_decoded p; int ret; + if (k.k->type == KEY_TYPE_reflink_v) + flags |= FIEMAP_EXTENT_SHARED; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { int flags2 = 0; u64 offset = p.ptr.offset; @@ -1200,6 +1203,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct btree_iter *iter; struct bkey_s_c k; BKEY_PADDED(k) cur, prev; + 
unsigned offset_into_extent, sectors; bool have_extent = false; int ret = 0; @@ -1212,15 +1216,36 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, - POS(ei->v.i_ino, start >> 9), 0, k, ret) { - if (bkey_cmp(bkey_start_pos(k.k), - POS(ei->v.i_ino, (start + len) >> 9)) >= 0) - break; + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS(ei->v.i_ino, start >> 9), + BTREE_ITER_SLOTS); + + while (bkey_cmp(iter->pos, POS(ei->v.i_ino, (start + len) >> 9)) < 0) { + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; bkey_reassemble(&cur.k, k); k = bkey_i_to_s_c(&cur.k); + offset_into_extent = iter->pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + ret = bch2_read_indirect_extent(&trans, iter, + &offset_into_extent, &cur.k); + if (ret) + break; + + sectors = min(sectors, k.k->size - offset_into_extent); + + bch2_cut_front(POS(k.k->p.inode, + bkey_start_offset(k.k) + offset_into_extent), + &cur.k); + bch2_key_resize(&cur.k.k, sectors); + cur.k.k.p.offset = iter->pos.offset + cur.k.k.size; + if (bkey_extent_is_data(k.k) || k.k->type == KEY_TYPE_reservation) { if (have_extent) { @@ -1233,12 +1258,16 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bkey_copy(&prev.k, &cur.k); have_extent = true; } + + bch2_btree_iter_set_pos(iter, + POS(iter->pos.inode, + iter->pos.offset + sectors)); } if (!ret && have_extent) ret = bch2_fill_extent(c, info, bkey_i_to_s_c(&prev.k), FIEMAP_EXTENT_LAST); - +err: ret = bch2_trans_exit(&trans) ?: ret; return ret < 0 ? 
ret : 0; } @@ -1286,6 +1315,7 @@ static const struct file_operations bch_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = bch2_compat_fs_ioctl, #endif + .remap_file_range = bch2_remap_file_range, }; static const struct inode_operations bch_file_inode_operations = { diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index de07f0f1dd51..6edf5dd803f0 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -59,7 +59,8 @@ static inline int ptrcmp(void *l, void *r) enum bch_inode_lock_op { INODE_LOCK = (1U << 0), - INODE_UPDATE_LOCK = (1U << 1), + INODE_PAGECACHE_BLOCK = (1U << 1), + INODE_UPDATE_LOCK = (1U << 2), }; #define bch2_lock_inodes(_locks, ...) \ @@ -71,9 +72,11 @@ do { \ \ for (i = 1; i < ARRAY_SIZE(a); i++) \ if (a[i] != a[i - 1]) { \ - if (_locks & INODE_LOCK) \ + if ((_locks) & INODE_LOCK) \ down_write_nested(&a[i]->v.i_rwsem, i); \ - if (_locks & INODE_UPDATE_LOCK) \ + if ((_locks) & INODE_PAGECACHE_BLOCK) \ + bch2_pagecache_block_get(&a[i]->ei_pagecache_lock);\ + if ((_locks) & INODE_UPDATE_LOCK) \ mutex_lock_nested(&a[i]->ei_update_lock, i);\ } \ } while (0) @@ -87,9 +90,11 @@ do { \ \ for (i = 1; i < ARRAY_SIZE(a); i++) \ if (a[i] != a[i - 1]) { \ - if (_locks & INODE_LOCK) \ + if ((_locks) & INODE_LOCK) \ up_write(&a[i]->v.i_rwsem); \ - if (_locks & INODE_UPDATE_LOCK) \ + if ((_locks) & INODE_PAGECACHE_BLOCK) \ + bch2_pagecache_block_put(&a[i]->ei_pagecache_lock);\ + if ((_locks) & INODE_UPDATE_LOCK) \ mutex_unlock(&a[i]->ei_update_lock); \ } \ } while (0) diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index ed84572a9e67..4d359931edb3 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -1041,6 +1041,7 @@ static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) noinline static struct promote_op *__promote_alloc(struct bch_fs *c, + enum btree_id btree_id, struct bpos pos, struct extent_ptr_decoded *pick, struct bch_io_opts opts, @@ -1097,6 +1098,7 @@ static struct promote_op *__promote_alloc(struct bch_fs *c, (struct 
data_opts) { .target = opts.promote_target }, + btree_id, bkey_s_c_null); BUG_ON(ret); @@ -1134,7 +1136,11 @@ static inline struct promote_op *promote_alloc(struct bch_fs *c, if (!should_promote(c, k, pos, opts, flags)) return NULL; - promote = __promote_alloc(c, pos, pick, opts, sectors, rbio); + promote = __promote_alloc(c, + k.k->type == KEY_TYPE_reflink_v + ? BTREE_ID_REFLINK + : BTREE_ID_EXTENTS, + pos, pick, opts, sectors, rbio); if (!promote) return NULL; @@ -1278,18 +1284,25 @@ retry: POS(inode, bvec_iter.bi_sector), BTREE_ITER_SLOTS, k, ret) { BKEY_PADDED(k) tmp; - unsigned bytes, offset_into_extent; + unsigned bytes, sectors, offset_into_extent; bkey_reassemble(&tmp.k, k); k = bkey_i_to_s_c(&tmp.k); - bch2_trans_unlock(&trans); - offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + ret = bch2_read_indirect_extent(&trans, iter, + &offset_into_extent, &tmp.k); + if (ret) + break; - bytes = min_t(unsigned, bvec_iter_sectors(bvec_iter), - (k.k->size - offset_into_extent)) << 9; + sectors = min(sectors, k.k->size - offset_into_extent); + + bch2_trans_unlock(&trans); + + bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; swap(bvec_iter.bi_size, bytes); ret = __bch2_read_extent(c, rbio, bvec_iter, k, @@ -1569,6 +1582,48 @@ static void bch2_read_endio(struct bio *bio) bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); } +int bch2_read_indirect_extent(struct btree_trans *trans, + struct btree_iter *extent_iter, + unsigned *offset_into_extent, + struct bkey_i *orig_k) +{ + struct btree_iter *iter; + struct bkey_s_c k; + u64 reflink_offset; + int ret; + + if (orig_k->k.type != KEY_TYPE_reflink_p) + return 0; + + reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k)->v.idx) + + *offset_into_extent; + + iter = __bch2_trans_get_iter(trans, BTREE_ID_REFLINK, + POS(0, reflink_offset), + BTREE_ITER_SLOTS, 1); + ret = PTR_ERR_OR_ZERO(iter); + if (ret) + return ret; + + k = 
bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_reflink_v) { + __bcache_io_error(trans->c, + "pointer to nonexistent indirect extent"); + ret = -EIO; + goto err; + } + + *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); + bkey_reassemble(orig_k, k); +err: + bch2_trans_iter_put(trans, iter); + return ret; +} + int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, struct bvec_iter iter, struct bkey_s_c k, unsigned offset_into_extent, @@ -1644,6 +1699,7 @@ int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, pos.offset += offset_into_extent; pick.ptr.offset += pick.crc.offset + offset_into_extent; + offset_into_extent = 0; pick.crc.compressed_size = bvec_iter_sectors(iter); pick.crc.uncompressed_size = bvec_iter_sectors(iter); pick.crc.offset = 0; @@ -1829,25 +1885,47 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) rbio->c = c; rbio->start_time = local_clock(); - for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, - POS(inode, rbio->bio.bi_iter.bi_sector), - BTREE_ITER_SLOTS, k, ret) { + iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + POS(inode, rbio->bio.bi_iter.bi_sector), + BTREE_ITER_SLOTS); + + while (1) { BKEY_PADDED(k) tmp; - unsigned bytes, offset_into_extent; + unsigned bytes, sectors, offset_into_extent; + + bch2_btree_iter_set_pos(iter, + POS(inode, rbio->bio.bi_iter.bi_sector)); + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; - /* - * Unlock the iterator while the btree node's lock is still in - * cache, before doing the IO: - */ bkey_reassemble(&tmp.k, k); k = bkey_i_to_s_c(&tmp.k); - bch2_trans_unlock(&trans); offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + ret = bch2_read_indirect_extent(&trans, iter, + &offset_into_extent, &tmp.k); + if (ret) + goto err; + + /* + * With indirect extents, the amount of data to read is the 
min + * of the original extent and the indirect extent: + */ + sectors = min(sectors, k.k->size - offset_into_extent); + + /* + * Unlock the iterator while the btree node's lock is still in + * cache, before doing the IO: + */ + bch2_trans_unlock(&trans); - bytes = min_t(unsigned, bio_sectors(&rbio->bio), - (k.k->size - offset_into_extent)) << 9; + bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; swap(rbio->bio.bi_iter.bi_size, bytes); if (rbio->bio.bi_iter.bi_size == bytes) @@ -1856,21 +1934,18 @@ void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) bch2_read_extent(c, rbio, k, offset_into_extent, flags); if (flags & BCH_READ_LAST_FRAGMENT) - return; + break; swap(rbio->bio.bi_iter.bi_size, bytes); bio_advance(&rbio->bio, bytes); } - - /* - * If we get here, it better have been because there was an error - * reading a btree node - */ - BUG_ON(!ret); - bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret); - +out: bch2_trans_exit(&trans); + return; +err: + bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret); bch2_rbio_done(rbio); + goto out; } void bch2_fs_io_exit(struct bch_fs *c) diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index aa437cb05fe7..a768ccc90f1f 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -99,6 +99,9 @@ struct bch_devs_mask; struct cache_promote_op; struct extent_ptr_decoded; +int bch2_read_indirect_extent(struct btree_trans *, struct btree_iter *, + unsigned *, struct bkey_i *); + enum bch_read_flags { BCH_READ_RETRY_IF_STALE = 1 << 0, BCH_READ_MAY_PROMOTE = 1 << 1, diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index 301cb72bd3e4..dc3b03d6e627 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -34,7 +34,8 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, return 0; } -static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) +static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags, + enum btree_id btree_id) { struct 
btree_trans trans; struct btree_iter *iter; @@ -44,8 +45,8 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, - POS_MIN, BTREE_ITER_PREFETCH); + iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, + BTREE_ITER_PREFETCH); while ((k = bch2_btree_iter_peek(iter)).k && !(ret = bkey_err(k))) { @@ -98,6 +99,12 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) return ret; } +static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) +{ + return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_EXTENTS) ?: + __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_REFLINK); +} + static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) { struct btree_trans trans; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index ffa0c2bbe290..05bb74a36230 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -63,13 +63,14 @@ static int bch2_migrate_index_update(struct bch_write_op *op) bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + iter = bch2_trans_get_iter(&trans, m->btree_id, bkey_start_pos(&bch2_keylist_front(keys)->k), BTREE_ITER_SLOTS|BTREE_ITER_INTENT); while (1) { struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); - struct bkey_i_extent *insert, *new = + struct bkey_i *insert; + struct bkey_i_extent *new = bkey_i_to_extent(bch2_keylist_front(keys)); BKEY_PADDED(k) _new, _insert; const union bch_extent_entry *entry; @@ -86,26 +87,25 @@ static int bch2_migrate_index_update(struct bch_write_op *op) goto nomatch; if (m->data_cmd == DATA_REWRITE && - !bch2_extent_has_device(bkey_s_c_to_extent(k), - m->data_opts.rewrite_dev)) + !bch2_bkey_has_device(k, m->data_opts.rewrite_dev)) goto nomatch; bkey_reassemble(&_insert.k, k); - insert = bkey_i_to_extent(&_insert.k); + insert = &_insert.k; bkey_copy(&_new.k, 
bch2_keylist_front(keys)); new = bkey_i_to_extent(&_new.k); - bch2_cut_front(iter->pos, &insert->k_i); + bch2_cut_front(iter->pos, insert); bch2_cut_back(new->k.p, &insert->k); bch2_cut_back(insert->k.p, &new->k); if (m->data_cmd == DATA_REWRITE) - bch2_bkey_drop_device(extent_i_to_s(insert).s, + bch2_bkey_drop_device(bkey_i_to_s(insert), m->data_opts.rewrite_dev); extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { - if (bch2_extent_has_device(extent_i_to_s_c(insert), p.ptr.dev)) { + if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) { /* * raced with another move op? extent already * has a pointer to the device we just wrote @@ -114,25 +114,25 @@ static int bch2_migrate_index_update(struct bch_write_op *op) continue; } - bch2_extent_ptr_decoded_append(&insert->k_i, &p); + bch2_extent_ptr_decoded_append(insert, &p); did_work = true; } if (!did_work) goto nomatch; - bch2_bkey_narrow_crcs(&insert->k_i, + bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 }); - bch2_extent_normalize(c, extent_i_to_s(insert).s); - bch2_extent_mark_replicas_cached(c, extent_i_to_s(insert), - op->opts.background_target, - op->opts.data_replicas); + bch2_extent_normalize(c, bkey_i_to_s(insert)); + bch2_bkey_mark_replicas_cached(c, bkey_i_to_s(insert), + op->opts.background_target, + op->opts.data_replicas); /* * If we're not fully overwriting @k, and it's compressed, we * need a reservation for all the pointers in @insert */ - nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(&insert->k_i)) - + nr = bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(insert)) - m->nr_ptrs_reserved; if (insert->k.size < k.k->size && @@ -148,7 +148,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) } bch2_trans_update(&trans, - BTREE_INSERT_ENTRY(iter, &insert->k_i)); + BTREE_INSERT_ENTRY(iter, insert)); ret = bch2_trans_commit(&trans, &op->res, op_journal_seq(op), @@ -213,10 +213,12 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, struct bch_io_opts 
io_opts, enum data_cmd data_cmd, struct data_opts data_opts, + enum btree_id btree_id, struct bkey_s_c k) { int ret; + m->btree_id = btree_id; m->data_cmd = data_cmd; m->data_opts = data_opts; m->nr_ptrs_reserved = 0; @@ -264,11 +266,12 @@ int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, break; } case DATA_REWRITE: { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; unsigned compressed_sectors = 0; - extent_for_each_ptr_decode(bkey_s_c_to_extent(k), p, entry) + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (!p.ptr.cached && p.crc.compression_type != BCH_COMPRESSION_NONE && bch2_dev_in_target(c, p.ptr.dev, data_opts.target)) @@ -391,6 +394,7 @@ static int bch2_move_extent(struct bch_fs *c, struct moving_context *ctxt, struct write_point_specifier wp, struct bch_io_opts io_opts, + enum btree_id btree_id, struct bkey_s_c k, enum data_cmd data_cmd, struct data_opts data_opts) @@ -443,7 +447,7 @@ static int bch2_move_extent(struct bch_fs *c, io->rbio.bio.bi_end_io = move_read_endio; ret = bch2_migrate_write_init(c, &io->write, wp, io_opts, - data_cmd, data_opts, k); + data_cmd, data_opts, btree_id, k); if (ret) goto err_free_pages; @@ -473,16 +477,17 @@ err: return ret; } -int bch2_move_data(struct bch_fs *c, - struct bch_ratelimit *rate, - struct write_point_specifier wp, - struct bpos start, - struct bpos end, - move_pred_fn pred, void *arg, - struct bch_move_stats *stats) +static int __bch2_move_data(struct bch_fs *c, + struct moving_context *ctxt, + struct bch_ratelimit *rate, + struct write_point_specifier wp, + struct bpos start, + struct bpos end, + move_pred_fn pred, void *arg, + struct bch_move_stats *stats, + enum btree_id btree_id) { bool kthread = (current->flags & PF_KTHREAD) != 0; - struct moving_context ctxt = { .stats = stats }; struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); BKEY_PADDED(k) tmp; struct btree_trans trans; @@ -493,17 +498,13 @@ int 
bch2_move_data(struct bch_fs *c, u64 delay, cur_inum = U64_MAX; int ret = 0, ret2; - closure_init_stack(&ctxt.cl); - INIT_LIST_HEAD(&ctxt.reads); - init_waitqueue_head(&ctxt.wait); - bch2_trans_init(&trans, c, 0, 0); stats->data_type = BCH_DATA_USER; - stats->btree_id = BTREE_ID_EXTENTS; + stats->btree_id = btree_id; stats->pos = POS_MIN; - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, start, + iter = bch2_trans_get_iter(&trans, btree_id, start, BTREE_ITER_PREFETCH); if (rate) @@ -528,7 +529,7 @@ int bch2_move_data(struct bch_fs *c, if (unlikely(freezing(current))) { bch2_trans_unlock(&trans); - move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); + move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); try_to_freeze(); } } while (delay); @@ -579,12 +580,12 @@ peek: k = bkey_i_to_s_c(&tmp.k); bch2_trans_unlock(&trans); - ret2 = bch2_move_extent(c, &ctxt, wp, io_opts, k, + ret2 = bch2_move_extent(c, ctxt, wp, io_opts, btree_id, k, data_cmd, data_opts); if (ret2) { if (ret2 == -ENOMEM) { /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(&ctxt); + bch2_move_ctxt_wait_for_io(ctxt); continue; } @@ -602,7 +603,32 @@ next_nondata: bch2_trans_cond_resched(&trans); } out: - bch2_trans_exit(&trans); + ret = bch2_trans_exit(&trans) ?: ret; + + return ret; +} + +int bch2_move_data(struct bch_fs *c, + struct bch_ratelimit *rate, + struct write_point_specifier wp, + struct bpos start, + struct bpos end, + move_pred_fn pred, void *arg, + struct bch_move_stats *stats) +{ + struct moving_context ctxt = { .stats = stats }; + int ret; + + closure_init_stack(&ctxt.cl); + INIT_LIST_HEAD(&ctxt.reads); + init_waitqueue_head(&ctxt.wait); + + stats->data_type = BCH_DATA_USER; + + ret = __bch2_move_data(c, &ctxt, rate, wp, start, end, + pred, arg, stats, BTREE_ID_EXTENTS) ?: + __bch2_move_data(c, &ctxt, rate, wp, start, end, + pred, arg, stats, BTREE_ID_REFLINK); move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); closure_sync(&ctxt.cl); diff 
--git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 71b3d2b2ddb6..0acd1720d4f8 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -25,6 +25,7 @@ struct data_opts { }; struct migrate_write { + enum btree_id btree_id; enum data_cmd data_cmd; struct data_opts data_opts; @@ -44,7 +45,7 @@ int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *, struct write_point_specifier, struct bch_io_opts, enum data_cmd, struct data_opts, - struct bkey_s_c); + enum btree_id, struct bkey_s_c); typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 3742b241807c..f2899ba9ad43 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -236,7 +236,8 @@ static void replay_now_at(struct journal *j, u64 seq) bch2_journal_pin_put(j, j->replay_journal_seq++); } -static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) +static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id, + struct bkey_i *k) { struct btree_trans trans; struct btree_iter *iter, *split_iter; @@ -255,7 +256,7 @@ static int bch2_extent_replay_key(struct bch_fs *c, struct bkey_i *k) retry: bch2_trans_begin(&trans); - iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, + iter = bch2_trans_get_iter(&trans, btree_id, bkey_start_pos(&k->k), BTREE_ITER_INTENT); @@ -341,22 +342,17 @@ static int bch2_journal_replay(struct bch_fs *c, for_each_journal_key(keys, i) { replay_now_at(j, keys.journal_seq_base + i->journal_seq); - switch (i->btree_id) { - case BTREE_ID_ALLOC: + if (i->btree_id == BTREE_ID_ALLOC) ret = bch2_alloc_replay_key(c, i->k); - break; - case BTREE_ID_EXTENTS: - ret = bch2_extent_replay_key(c, i->k); - break; - default: + else if (btree_node_type_is_extents(i->btree_id)) + ret = bch2_extent_replay_key(c, i->btree_id, i->k); + else ret = bch2_btree_insert(c, i->btree_id, i->k, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW| BTREE_INSERT_JOURNAL_REPLAY| 
BTREE_INSERT_NOMARK); - break; - } if (ret) { bch_err(c, "journal replay: error %d while replaying key", diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c new file mode 100644 index 000000000000..dcca9c1d0f47 --- /dev/null +++ b/fs/bcachefs/reflink.c @@ -0,0 +1,300 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "btree_update.h" +#include "extents.h" +#include "fs.h" +#include "fs-io.h" +#include "reflink.h" + +#include + +/* reflink pointers */ + +const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + + if (bkey_val_bytes(p.k) != sizeof(*p.v)) + return "incorrect value size"; + + return NULL; +} + +void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + + pr_buf(out, "idx %llu", le64_to_cpu(p.v->idx)); +} + +enum merge_result bch2_reflink_p_merge(struct bch_fs *c, + struct bkey_s _l, struct bkey_s _r) +{ + struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l); + struct bkey_s_reflink_p r = bkey_s_to_reflink_p(_r); + + if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) + return BCH_MERGE_NOMERGE; + + if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { + bch2_key_resize(l.k, KEY_SIZE_MAX); + __bch2_cut_front(l.k->p, _r); + return BCH_MERGE_PARTIAL; + } + + bch2_key_resize(l.k, l.k->size + r.k->size); + + return BCH_MERGE_MERGE; +} + +/* indirect extents */ + +const char *bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + + if (bkey_val_bytes(r.k) < sizeof(*r.v)) + return "incorrect value size"; + + return bch2_bkey_ptrs_invalid(c, k); +} + +void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + + pr_buf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); + + 
bch2_bkey_ptrs_to_text(out, c, k); +} + +/* + * bch2_remap_range() depends on bch2_extent_update(), which depends on various + * things tied to the linux vfs for inode updates, for now: + */ +#ifndef NO_BCACHEFS_FS + +static int bch2_make_extent_indirect(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct bkey_i_extent *e) +{ + struct bch_fs *c = trans->c; + struct btree_iter *reflink_iter; + struct bkey_s_c k; + struct bkey_i_reflink_v *r_v; + struct bkey_i_reflink_p *r_p; + int ret; + + for_each_btree_key(trans, reflink_iter, BTREE_ID_REFLINK, + POS(0, c->reflink_hint), + BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { + if (reflink_iter->pos.inode) { + bch2_btree_iter_set_pos(reflink_iter, POS_MIN); + continue; + } + + if (bkey_deleted(k.k) && e->k.size <= k.k->size) + break; + } + + if (ret) + goto err; + + /* rewind iter to start of hole, if necessary: */ + bch2_btree_iter_set_pos(reflink_iter, bkey_start_pos(k.k)); + + r_v = bch2_trans_kmalloc(trans, sizeof(*r_v) + bkey_val_bytes(&e->k)); + ret = PTR_ERR_OR_ZERO(r_v); + if (ret) + goto err; + + bkey_reflink_v_init(&r_v->k_i); + r_v->k.p = reflink_iter->pos; + bch2_key_resize(&r_v->k, e->k.size); + r_v->k.version = e->k.version; + + set_bkey_val_u64s(&r_v->k, bkey_val_u64s(&r_v->k) + + bkey_val_u64s(&e->k)); + r_v->v.refcount = 0; + memcpy(r_v->v.start, e->v.start, bkey_val_bytes(&e->k)); + + bch2_trans_update(trans, BTREE_INSERT_ENTRY(reflink_iter, &r_v->k_i)); + + r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); + if (IS_ERR(r_p)) + return PTR_ERR(r_p); + + e->k.type = KEY_TYPE_reflink_p; + r_p = bkey_i_to_reflink_p(&e->k_i); + set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); + r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); + + bch2_trans_update(trans, BTREE_INSERT_ENTRY(extent_iter, &r_p->k_i)); +err: + if (!IS_ERR(reflink_iter)) { + c->reflink_hint = reflink_iter->pos.offset; + bch2_trans_iter_put(trans, reflink_iter); + } + + return ret; +} + +static struct bkey_s_c get_next_src(struct 
btree_iter *iter, struct bpos end) +{ + struct bkey_s_c k = bch2_btree_iter_peek(iter); + + while (1) { + if (bkey_err(k)) + return k; + + if (bkey_cmp(iter->pos, end) >= 0) + return bkey_s_c_null; + + if (k.k->type == KEY_TYPE_extent || + k.k->type == KEY_TYPE_reflink_p) + return k; + + k = bch2_btree_iter_next(iter); + } +} + +s64 bch2_remap_range(struct bch_fs *c, + struct bch_inode_info *dst_inode, + struct bpos dst_start, struct bpos src_start, + u64 remap_sectors, u64 new_i_size) +{ + struct btree_trans trans; + struct btree_iter *dst_iter, *src_iter; + struct bkey_s_c src_k; + BKEY_PADDED(k) new_dst, new_src; + struct bpos dst_end = dst_start, src_end = src_start; + struct bpos dst_want, src_want; + u64 src_done, dst_done; + int ret = 0; + + if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) { + mutex_lock(&c->sb_lock); + if (!(c->sb.features & (1ULL << BCH_FEATURE_REFLINK))) { + c->disk_sb.sb->features[0] |= + cpu_to_le64(1ULL << BCH_FEATURE_REFLINK); + + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); + } + + dst_end.offset += remap_sectors; + src_end.offset += remap_sectors; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); + + src_iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start, + BTREE_ITER_INTENT, 1); + dst_iter = __bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start, + BTREE_ITER_INTENT, 2); + + while (1) { + bch2_trans_begin_updates(&trans); + trans.mem_top = 0; + + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto err; + } + + src_k = get_next_src(src_iter, src_end); + ret = bkey_err(src_k); + if (ret) + goto btree_err; + + src_done = bpos_min(src_iter->pos, src_end).offset - + src_start.offset; + dst_want = POS(dst_start.inode, dst_start.offset + src_done); + + if (bkey_cmp(dst_iter->pos, dst_want) < 0) { + ret = bch2_fpunch_at(&trans, dst_iter, dst_want, + dst_inode, new_i_size); + if (ret) + goto btree_err; + continue; + } + + BUG_ON(bkey_cmp(dst_iter->pos, dst_want)); + + if (!bkey_cmp(dst_iter->pos, 
dst_end)) + break; + + if (src_k.k->type == KEY_TYPE_extent) { + bkey_reassemble(&new_src.k, src_k); + src_k = bkey_i_to_s_c(&new_src.k); + + bch2_cut_front(src_iter->pos, &new_src.k); + bch2_cut_back(src_end, &new_src.k.k); + + ret = bch2_make_extent_indirect(&trans, src_iter, + bkey_i_to_extent(&new_src.k)); + if (ret) + goto btree_err; + + BUG_ON(src_k.k->type != KEY_TYPE_reflink_p); + } + + if (src_k.k->type == KEY_TYPE_reflink_p) { + struct bkey_s_c_reflink_p src_p = + bkey_s_c_to_reflink_p(src_k); + struct bkey_i_reflink_p *dst_p = + bkey_reflink_p_init(&new_dst.k); + + u64 offset = le64_to_cpu(src_p.v->idx) + + (src_iter->pos.offset - + bkey_start_offset(src_k.k)); + + dst_p->v.idx = cpu_to_le64(offset); + } else { + BUG(); + } + + new_dst.k.k.p = dst_iter->pos; + bch2_key_resize(&new_dst.k.k, + min(src_k.k->p.offset - src_iter->pos.offset, + dst_end.offset - dst_iter->pos.offset)); + + ret = bch2_extent_update(&trans, dst_inode, NULL, NULL, + dst_iter, &new_dst.k, + new_i_size, false, true, NULL); + if (ret) + goto btree_err; + + dst_done = dst_iter->pos.offset - dst_start.offset; + src_want = POS(src_start.inode, src_start.offset + dst_done); + bch2_btree_iter_set_pos(src_iter, src_want); +btree_err: + if (ret == -EINTR) + ret = 0; + if (ret) + goto err; + } + + BUG_ON(bkey_cmp(dst_iter->pos, dst_end)); +err: + BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0); + + dst_done = dst_iter->pos.offset - dst_start.offset; + new_i_size = min(dst_iter->pos.offset << 9, new_i_size); + + ret = bch2_trans_exit(&trans) ?: ret; + + mutex_lock(&dst_inode->ei_update_lock); + if (dst_inode->v.i_size < new_i_size) { + i_size_write(&dst_inode->v, new_i_size); + ret = bch2_write_inode_size(c, dst_inode, new_i_size, + ATTR_MTIME|ATTR_CTIME); + } + mutex_unlock(&dst_inode->ei_update_lock); + + return dst_done ?: ret; +} + +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h new file mode 100644 index 000000000000..327618c36d33 --- /dev/null +++ 
b/fs/bcachefs/reflink.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REFLINK_H +#define _BCACHEFS_REFLINK_H + +const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +enum merge_result bch2_reflink_p_merge(struct bch_fs *, + struct bkey_s, struct bkey_s); + +#define bch2_bkey_ops_reflink_p (struct bkey_ops) { \ + .key_invalid = bch2_reflink_p_invalid, \ + .val_to_text = bch2_reflink_p_to_text, \ + .key_merge = bch2_reflink_p_merge, \ +} + +const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c); +void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + + +#define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ + .key_invalid = bch2_reflink_v_invalid, \ + .val_to_text = bch2_reflink_v_to_text, \ +} + +#ifndef NO_BCACHEFS_FS +s64 bch2_remap_range(struct bch_fs *, struct bch_inode_info *, + struct bpos, struct bpos, u64, u64); +#endif /* NO_BCACHEFS_FS */ + +#endif /* _BCACHEFS_REFLINK_H */ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 7a9a7ec26c93..4fb142f3d39c 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -113,6 +113,7 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e, extent_to_replicas(k, e); break; case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: e->data_type = BCH_DATA_USER; extent_to_replicas(k, e); break; -- cgit v1.2.3 From b43a0f60a61e8e0adea6b1b9adc9a97600fc2f00 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 25 Sep 2019 16:19:52 -0400 Subject: bcachefs: Cleanup i_nlink handling Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 22 ++++------------------ fs/bcachefs/fs.h | 5 ----- fs/bcachefs/fsck.c | 13 ++----------- fs/bcachefs/inode.h | 43 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 34 deletions(-) (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/fs.c 
b/fs/bcachefs/fs.c index 0ba498505b07..b9a20bb19b58 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -131,9 +131,7 @@ void bch2_inode_update_after_write(struct bch_fs *c, struct bch_inode_unpacked *bi, unsigned fields) { - set_nlink(&inode->v, bi->bi_flags & BCH_INODE_UNLINKED - ? 0 - : bi->bi_nlink + nlink_bias(inode->v.i_mode)); + set_nlink(&inode->v, bch2_inode_nlink_get(bi)); i_uid_write(&inode->v, bi->bi_uid); i_gid_write(&inode->v, bi->bi_gid); inode->v.i_mode = bi->bi_mode; @@ -552,12 +550,7 @@ static int inode_update_for_link_fn(struct bch_inode_info *inode, struct bch_fs *c = inode->v.i_sb->s_fs_info; bi->bi_ctime = bch2_current_time(c); - - if (bi->bi_flags & BCH_INODE_UNLINKED) - bi->bi_flags &= ~BCH_INODE_UNLINKED; - else - bi->bi_nlink++; - + bch2_inode_nlink_inc(bi); return 0; } @@ -640,11 +633,7 @@ static int inode_update_for_unlink_fn(struct bch_inode_info *inode, struct bch_fs *c = inode->v.i_sb->s_fs_info; bi->bi_ctime = bch2_current_time(c); - if (bi->bi_nlink) - bi->bi_nlink--; - else - bi->bi_flags |= BCH_INODE_UNLINKED; - + bch2_inode_nlink_dec(bi); return 0; } @@ -815,10 +804,7 @@ static int inode_update_for_rename_fn(struct bch_inode_info *inode, BUG_ON(bi->bi_nlink && S_ISDIR(info->dst_inode->v.i_mode)); - if (bi->bi_nlink) - bi->bi_nlink--; - else - bi->bi_flags |= BCH_INODE_UNLINKED; + bch2_inode_nlink_dec(bi); } if (inode == info->src_dir || diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 6edf5dd803f0..04ac5b4129a4 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -109,11 +109,6 @@ static inline u8 mode_to_type(umode_t mode) return (mode >> 12) & 15; } -static inline unsigned nlink_bias(umode_t mode) -{ - return S_ISDIR(mode) ? 
2 : 1; -} - static inline bool inode_attr_changing(struct bch_inode_info *dir, struct bch_inode_info *inode, enum inode_opt_id id) diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 50a7d8c1faba..162563b809fb 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -1116,9 +1116,7 @@ static int check_inode_nlink(struct bch_fs *c, struct nlink *link, bool *do_update) { - u32 i_nlink = u->bi_flags & BCH_INODE_UNLINKED - ? 0 - : u->bi_nlink + nlink_bias(u->bi_mode); + u32 i_nlink = bch2_inode_nlink_get(u); u32 real_i_nlink = link->count * nlink_bias(u->bi_mode) + link->dir_count; @@ -1197,14 +1195,7 @@ static int check_inode_nlink(struct bch_fs *c, u->bi_inum, i_nlink, real_i_nlink); set_i_nlink: if (i_nlink != real_i_nlink) { - if (real_i_nlink) { - u->bi_nlink = real_i_nlink - nlink_bias(u->bi_mode); - u->bi_flags &= ~BCH_INODE_UNLINKED; - } else { - u->bi_nlink = 0; - u->bi_flags |= BCH_INODE_UNLINKED; - } - + bch2_inode_nlink_set(u, real_i_nlink); *do_update = true; } fsck_err: diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index af0c355f2f04..e88ec78071bd 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -103,6 +103,49 @@ static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode, } } +/* i_nlink: */ + +/* Offset between the stored bi_nlink and the link count returned by bch2_inode_nlink_get(): 2 for directories, 1 for everything else. */ static inline unsigned nlink_bias(umode_t mode) +{ + return S_ISDIR(mode) ? 2 : 1; +} + +/* Add one link: an inode flagged BCH_INODE_UNLINKED just has the flag cleared, otherwise bi_nlink is incremented. */ static inline void bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) +{ + if (bi->bi_flags & BCH_INODE_UNLINKED) + bi->bi_flags &= ~BCH_INODE_UNLINKED; + else + bi->bi_nlink++; +} + +/* Drop one link: bi_nlink is decremented if non-zero, otherwise the inode is flagged BCH_INODE_UNLINKED. BUGs if the inode is already flagged unlinked. */ static inline void bch2_inode_nlink_dec(struct bch_inode_unpacked *bi) +{ + BUG_ON(bi->bi_flags & BCH_INODE_UNLINKED); + if (bi->bi_nlink) + bi->bi_nlink--; + else + bi->bi_flags |= BCH_INODE_UNLINKED; +} + +/* Link count for this inode: 0 if flagged BCH_INODE_UNLINKED, otherwise bi_nlink plus nlink_bias() for its mode. */ static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi) +{ + return bi->bi_flags & BCH_INODE_UNLINKED + ?
0 : bi->bi_nlink + nlink_bias(bi->bi_mode); +} + +/* Inverse of bch2_inode_nlink_get(): store @nlink as bi_nlink plus the unlinked flag. NOTE(review): a non-zero @nlink smaller than nlink_bias() would wrap the unsigned subtraction below - confirm callers never pass one. */ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, + unsigned nlink) +{ + if (nlink) { + bi->bi_nlink = nlink - nlink_bias(bi->bi_mode); + bi->bi_flags &= ~BCH_INODE_UNLINKED; + } else { + bi->bi_nlink = 0; + bi->bi_flags |= BCH_INODE_UNLINKED; + } +} + +#ifdef CONFIG_BCACHEFS_DEBUG void bch2_inode_pack_test(void); #else -- cgit v1.2.3 From a7199432c3cbcd42141cfd5c047bf8828c2390d8 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 22 Sep 2019 18:49:16 -0400 Subject: bcachefs: Kill deferred btree updates Will be replaced by cached btree iterators Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 2 +- fs/bcachefs/alloc_background.c | 4 +- fs/bcachefs/btree_types.h | 20 ----- fs/bcachefs/btree_update.h | 43 ++-------- fs/bcachefs/btree_update_leaf.c | 178 ++++++---------------------------------- fs/bcachefs/buckets.c | 8 +- fs/bcachefs/dirent.c | 9 +- fs/bcachefs/ec.c | 6 +- fs/bcachefs/fs-io.c | 78 +++++------------- fs/bcachefs/fs.c | 42 +++------- fs/bcachefs/fs.h | 1 - fs/bcachefs/fsck.c | 7 +- fs/bcachefs/inode.c | 6 +- fs/bcachefs/io.c | 5 +- fs/bcachefs/migrate.c | 5 +- fs/bcachefs/move.c | 3 +- fs/bcachefs/opts.h | 8 +- fs/bcachefs/quota.c | 2 +- fs/bcachefs/recovery.c | 2 +- fs/bcachefs/reflink.c | 4 +- fs/bcachefs/str_hash.h | 4 +- fs/bcachefs/tests.c | 10 +-- 22 files changed, 99 insertions(+), 348 deletions(-) (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 1c3343252129..5a4263806610 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -378,7 +378,7 @@ int bch2_acl_chmod(struct btree_trans *trans, } new->k.p = iter->pos; - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &new->k_i)); + bch2_trans_update(trans, iter, &new->k_i); *new_acl = acl; acl = NULL; err: diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 85795b580892..81418d534d70 100644 --- a/fs/bcachefs/alloc_background.c +++
b/fs/bcachefs/alloc_background.c @@ -311,7 +311,7 @@ retry: a->k.p = iter->pos; bch2_alloc_pack(a, new_u); - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &a->k_i)); + bch2_trans_update(trans, iter, &a->k_i); ret = bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| @@ -899,7 +899,7 @@ retry: a->k.p = iter->pos; bch2_alloc_pack(a, u); - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &a->k_i)); + bch2_trans_update(trans, iter, &a->k_i); /* * XXX: diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 299d1173df62..c128ff393f0c 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -246,29 +246,9 @@ static inline enum btree_iter_type btree_iter_type(struct btree_iter *iter) return iter->flags & BTREE_ITER_TYPE; } -struct deferred_update { - struct journal_preres res; - struct journal_entry_pin journal; - - spinlock_t lock; - unsigned dirty:1; - - u8 allocated_u64s; - enum btree_id btree_id; - - /* must be last: */ - struct bkey_i k; -}; - struct btree_insert_entry { struct bkey_i *k; - - union { struct btree_iter *iter; - struct deferred_update *d; - }; - - bool deferred; }; #define BTREE_ITER_MAX 64 diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 36e34b3d9213..0e985c1f0100 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -15,24 +15,6 @@ bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, void bch2_btree_journal_key(struct btree_trans *, struct btree_iter *, struct bkey_i *); -void bch2_deferred_update_free(struct bch_fs *, - struct deferred_update *); -struct deferred_update * -bch2_deferred_update_alloc(struct bch_fs *, enum btree_id, unsigned); - -#define BTREE_INSERT_ENTRY(_iter, _k) \ - ((struct btree_insert_entry) { \ - .iter = (_iter), \ - .k = (_k), \ - }) - -#define BTREE_INSERT_DEFERRED(_d, _k) \ - ((struct btree_insert_entry) { \ - .k = (_k), \ - .d = (_d), \ - .deferred = true, \ - }) - enum { __BTREE_INSERT_ATOMIC, 
__BTREE_INSERT_NOUNLOCK, @@ -120,11 +102,14 @@ int bch2_trans_commit(struct btree_trans *, u64 *, unsigned); static inline void bch2_trans_update(struct btree_trans *trans, - struct btree_insert_entry entry) + struct btree_iter *iter, + struct bkey_i *k) { EBUG_ON(trans->nr_updates >= trans->nr_iters + 4); - trans->updates[trans->nr_updates++] = entry; + trans->updates[trans->nr_updates++] = (struct btree_insert_entry) { + .iter = iter, .k = k + }; } #define bch2_trans_do(_c, _journal_seq, _flags, _do) \ @@ -145,23 +130,9 @@ static inline void bch2_trans_update(struct btree_trans *trans, _ret; \ }) -#define __trans_next_update(_trans, _i, _filter) \ -({ \ - while ((_i) < (_trans)->updates + (_trans->nr_updates) && !(_filter))\ - (_i)++; \ - \ - (_i) < (_trans)->updates + (_trans->nr_updates); \ -}) - -#define __trans_for_each_update(_trans, _i, _filter) \ +#define trans_for_each_update(_trans, _i) \ for ((_i) = (_trans)->updates; \ - __trans_next_update(_trans, _i, _filter); \ + (_i) < (_trans)->updates + (_trans)->nr_updates; \ (_i)++) -#define trans_for_each_update(trans, i) \ - __trans_for_each_update(trans, i, true) - -#define trans_for_each_update_iter(trans, i) \ - __trans_for_each_update(trans, i, !(i)->deferred) - #endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c index a0a59cd496a3..2e9271759447 100644 --- a/fs/bcachefs/btree_update_leaf.c +++ b/fs/bcachefs/btree_update_leaf.c @@ -28,8 +28,7 @@ static inline bool same_leaf_as_prev(struct btree_trans *trans, ? 
trans->updates + trans->updates_sorted[sorted_idx - 1] : NULL; - return !i->deferred && - prev && + return prev && i->iter->l[0].b == prev->iter->l[0].b; } @@ -73,13 +72,6 @@ static void btree_trans_lock_write(struct btree_trans *trans, bool lock) } } -static inline int btree_trans_cmp(struct btree_insert_entry l, - struct btree_insert_entry r) -{ - return cmp_int(l.deferred, r.deferred) ?: - btree_iter_cmp(l.iter, r.iter); -} - static inline void btree_trans_sort_updates(struct btree_trans *trans) { struct btree_insert_entry *l, *r; @@ -89,7 +81,7 @@ static inline void btree_trans_sort_updates(struct btree_trans *trans) for (pos = 0; pos < nr; pos++) { r = trans->updates + trans->updates_sorted[pos]; - if (btree_trans_cmp(*l, *r) <= 0) + if (btree_iter_cmp(l->iter, r->iter) <= 0) break; } @@ -312,143 +304,23 @@ static void btree_insert_key_leaf(struct btree_trans *trans, trace_btree_insert_key(c, b, insert->k); } -/* Deferred btree updates: */ - -static void deferred_update_flush(struct journal *j, - struct journal_entry_pin *pin, - u64 seq) -{ - struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct deferred_update *d = - container_of(pin, struct deferred_update, journal); - struct journal_preres res = { 0 }; - u64 tmp[32]; - struct bkey_i *k = (void *) tmp; - int ret; - - if (d->allocated_u64s > ARRAY_SIZE(tmp)) { - k = kmalloc(d->allocated_u64s * sizeof(u64), GFP_NOFS); - - BUG_ON(!k); /* XXX */ - } - - spin_lock(&d->lock); - if (d->dirty) { - BUG_ON(jset_u64s(d->k.k.u64s) > d->res.u64s); - - swap(res, d->res); - - BUG_ON(d->k.k.u64s > d->allocated_u64s); - - bkey_copy(k, &d->k); - d->dirty = false; - spin_unlock(&d->lock); - - ret = bch2_btree_insert(c, d->btree_id, k, NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_USE_RESERVE| - BTREE_INSERT_JOURNAL_RESERVED); - bch2_fs_fatal_err_on(ret && !bch2_journal_error(j), - c, "error flushing deferred btree update: %i", ret); - - spin_lock(&d->lock); - } - - if (!d->dirty) - bch2_journal_pin_drop(j, 
&d->journal); - spin_unlock(&d->lock); - - bch2_journal_preres_put(j, &res); - if (k != (void *) tmp) - kfree(k); -} - -static void btree_insert_key_deferred(struct btree_trans *trans, - struct btree_insert_entry *insert) -{ - struct bch_fs *c = trans->c; - struct journal *j = &c->journal; - struct deferred_update *d = insert->d; - int difference; - - BUG_ON(trans->flags & BTREE_INSERT_JOURNAL_REPLAY); - BUG_ON(insert->k->u64s > d->allocated_u64s); - - __btree_journal_key(trans, d->btree_id, insert->k); - - spin_lock(&d->lock); - BUG_ON(jset_u64s(insert->k->u64s) > - trans->journal_preres.u64s); - - difference = jset_u64s(insert->k->u64s) - d->res.u64s; - if (difference > 0) { - trans->journal_preres.u64s -= difference; - d->res.u64s += difference; - } - - bkey_copy(&d->k, insert->k); - d->dirty = true; - - bch2_journal_pin_update(j, trans->journal_res.seq, &d->journal, - deferred_update_flush); - spin_unlock(&d->lock); -} - -void bch2_deferred_update_free(struct bch_fs *c, - struct deferred_update *d) -{ - deferred_update_flush(&c->journal, &d->journal, 0); - - BUG_ON(journal_pin_active(&d->journal)); - - bch2_journal_pin_flush(&c->journal, &d->journal); - kfree(d); -} - -struct deferred_update * -bch2_deferred_update_alloc(struct bch_fs *c, - enum btree_id btree_id, - unsigned u64s) -{ - struct deferred_update *d; - - BUG_ON(u64s > U8_MAX); - - d = kmalloc(offsetof(struct deferred_update, k) + - u64s * sizeof(u64), GFP_NOFS); - BUG_ON(!d); - - memset(d, 0, offsetof(struct deferred_update, k)); - - spin_lock_init(&d->lock); - d->allocated_u64s = u64s; - d->btree_id = btree_id; - - return d; -} - /* Normal update interface: */ static inline void btree_insert_entry_checks(struct btree_trans *trans, struct btree_insert_entry *i) { struct bch_fs *c = trans->c; - enum btree_id btree_id = !i->deferred - ? 
i->iter->btree_id - : i->d->btree_id; - - if (!i->deferred) { - BUG_ON(i->iter->level); - BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); - EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && - bkey_cmp(i->k->k.p, i->iter->l[0].b->key.k.p) > 0); - EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && - !(trans->flags & BTREE_INSERT_ATOMIC)); - } + + BUG_ON(i->iter->level); + BUG_ON(bkey_cmp(bkey_start_pos(&i->k->k), i->iter->pos)); + EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && + bkey_cmp(i->k->k.p, i->iter->l[0].b->key.k.p) > 0); + EBUG_ON((i->iter->flags & BTREE_ITER_IS_EXTENTS) && + !(trans->flags & BTREE_INSERT_ATOMIC)); BUG_ON(debug_check_bkeys(c) && !bkey_deleted(&i->k->k) && - bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), btree_id)); + bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), i->iter->btree_id)); } static int bch2_trans_journal_preres_get(struct btree_trans *trans) @@ -459,7 +331,7 @@ static int bch2_trans_journal_preres_get(struct btree_trans *trans) int ret; trans_for_each_update(trans, i) - if (i->deferred) + if (0) u64s += jset_u64s(i->k->k.u64s); if (!u64s) @@ -551,10 +423,7 @@ static int btree_trans_check_can_insert(struct btree_trans *trans, static inline void do_btree_insert_one(struct btree_trans *trans, struct btree_insert_entry *insert) { - if (likely(!insert->deferred)) - btree_insert_key_leaf(trans, insert); - else - btree_insert_key_deferred(trans, insert); + btree_insert_key_leaf(trans, insert); } static inline bool update_triggers_transactional(struct btree_trans *trans, @@ -570,7 +439,6 @@ static inline bool update_has_triggers(struct btree_trans *trans, struct btree_insert_entry *i) { return likely(!(trans->flags & BTREE_INSERT_NOMARK)) && - !i->deferred && btree_node_type_needs_gc(i->iter->btree_id); } @@ -588,14 +456,14 @@ static inline int do_btree_insert_at(struct btree_trans *trans, : 0; int ret; - trans_for_each_update_iter(trans, i) + trans_for_each_update(trans, i) BUG_ON(i->iter->uptodate >= BTREE_ITER_NEED_RELOCK); /* 
* note: running triggers will append more updates to the list of * updates as we're walking it: */ - trans_for_each_update_iter(trans, i) + trans_for_each_update(trans, i) if (update_has_triggers(trans, i) && update_triggers_transactional(trans, i)) { ret = bch2_trans_mark_update(trans, i->iter, i->k); @@ -633,7 +501,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans, if (ret) goto out; - trans_for_each_update_iter(trans, i) { + trans_for_each_update(trans, i) { if (!btree_node_type_needs_gc(i->iter->btree_id)) continue; @@ -673,7 +541,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans, i->k->k.version = MAX_VERSION; } - trans_for_each_update_iter(trans, i) + trans_for_each_update(trans, i) if (update_has_triggers(trans, i) && !update_triggers_transactional(trans, i)) bch2_mark_update(trans, i, &fs_usage->u, mark_flags); @@ -687,7 +555,7 @@ static inline int do_btree_insert_at(struct btree_trans *trans, if (likely(!(trans->flags & BTREE_INSERT_NOMARK)) && unlikely(c->gc_pos.phase)) - trans_for_each_update_iter(trans, i) + trans_for_each_update(trans, i) if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) bch2_mark_update(trans, i, NULL, mark_flags| @@ -772,7 +640,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, case BTREE_INSERT_NEED_MARK_REPLICAS: bch2_trans_unlock(trans); - trans_for_each_update_iter(trans, i) { + trans_for_each_update(trans, i) { ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k)); if (ret) return ret; @@ -842,7 +710,7 @@ static int __bch2_trans_commit(struct btree_trans *trans, unsigned iter; int ret; - trans_for_each_update_iter(trans, i) { + trans_for_each_update(trans, i) { if (!bch2_btree_iter_upgrade(i->iter, 1)) { trace_trans_restart_upgrade(trans->ip); ret = -EINTR; @@ -868,7 +736,7 @@ static int __bch2_trans_commit(struct btree_trans *trans, trans->nounlock = false; - trans_for_each_update_iter(trans, i) + trans_for_each_update(trans, i) bch2_btree_iter_downgrade(i->iter); err: /* make 
sure we didn't drop or screw up locks: */ @@ -995,7 +863,7 @@ retry: iter = bch2_trans_get_iter(&trans, id, bkey_start_pos(&k->k), BTREE_ITER_INTENT); - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, k)); + bch2_trans_update(&trans, iter, k); ret = bch2_trans_commit(&trans, disk_res, journal_seq, flags); if (ret == -EINTR) @@ -1045,7 +913,7 @@ retry: break; } - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &delete)); + bch2_trans_update(trans, iter, &delete); ret = bch2_trans_commit(trans, NULL, journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL); @@ -1072,7 +940,7 @@ int bch2_btree_delete_at(struct btree_trans *trans, bkey_init(&k.k); k.k.p = iter->pos; - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &k)); + bch2_trans_update(trans, iter, &k); return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_USE_RESERVE|flags); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 637a9e909f82..9c97a1522d9d 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -1316,7 +1316,7 @@ void bch2_trans_fs_usage_apply(struct btree_trans *trans, bch_err(c, "disk usage increased more than %llu sectors reserved", disk_res_sectors); - trans_for_each_update_iter(trans, i) { + trans_for_each_update(trans, i) { struct btree_iter *iter = i->iter; struct btree *b = iter->l[0].b; struct btree_node_iter node_iter = iter->l[0].iter; @@ -1358,7 +1358,7 @@ static int trans_get_key(struct btree_trans *trans, struct btree_insert_entry *i; int ret; - trans_for_each_update_iter(trans, i) + trans_for_each_update(trans, i) if (i->iter->btree_id == btree_id && (btree_node_type_is_extents(btree_id) ? 
bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 && @@ -1397,13 +1397,13 @@ static void *trans_update_key(struct btree_trans *trans, bkey_init(&new_k->k); new_k->k.p = iter->pos; - trans_for_each_update_iter(trans, i) + trans_for_each_update(trans, i) if (i->iter == iter) { i->k = new_k; return new_k; } - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, new_k)); + bch2_trans_update(trans, iter, new_k); return new_k; } diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 1442dacef0de..38dd96808e90 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -255,9 +255,8 @@ int bch2_dirent_rename(struct btree_trans *trans, * new_dst at the src position: */ new_dst->k.p = src_iter->pos; - bch2_trans_update(trans, - BTREE_INSERT_ENTRY(src_iter, - &new_dst->k_i)); + bch2_trans_update(trans, src_iter, + &new_dst->k_i); return 0; } else { /* If we're overwriting, we can't insert new_dst @@ -280,8 +279,8 @@ int bch2_dirent_rename(struct btree_trans *trans, } } - bch2_trans_update(trans, BTREE_INSERT_ENTRY(src_iter, &new_src->k_i)); - bch2_trans_update(trans, BTREE_INSERT_ENTRY(dst_iter, &new_dst->k_i)); + bch2_trans_update(trans, src_iter, &new_src->k_i); + bch2_trans_update(trans, dst_iter, &new_dst->k_i); return 0; } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index 5b61e9cb1ac3..155e7c9bd89f 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -738,7 +738,7 @@ found_slot: stripe->k.p = iter->pos; - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &stripe->k_i)); + bch2_trans_update(&trans, iter, &stripe->k_i); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_ATOMIC| @@ -819,7 +819,7 @@ static int ec_stripe_update_ptrs(struct bch_fs *c, extent_stripe_ptr_add(e, s, ptr, idx); - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &tmp.k)); + bch2_trans_update(&trans, iter, &tmp.k); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_ATOMIC| @@ -1231,7 +1231,7 @@ static int __bch2_stripe_write_key(struct btree_trans *trans, 
spin_unlock(&c->ec_stripes_heap_lock); - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &new_key->k_i)); + bch2_trans_update(trans, iter, &new_key->k_i); return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL|flags); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 18356cbe0794..da4976344d49 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -324,69 +324,36 @@ int bch2_extent_update(struct btree_trans *trans, if (!may_allocate && allocating) return -ENOSPC; - bch2_trans_update(trans, BTREE_INSERT_ENTRY(extent_iter, k)); + bch2_trans_update(trans, extent_iter, k); new_i_size = min(k->k.p.offset << 9, new_i_size); /* XXX: inode->i_size locking */ if (i_sectors_delta || new_i_size > inode->ei_inode.bi_size) { - if (c->opts.new_inode_updates) { - bch2_trans_unlock(trans); - mutex_lock(&inode->ei_update_lock); - - if (!bch2_trans_relock(trans)) { - mutex_unlock(&inode->ei_update_lock); - return -EINTR; - } - - inode_locked = true; - - if (!inode->ei_inode_update) - inode->ei_inode_update = - bch2_deferred_update_alloc(c, - BTREE_ID_INODES, 64); - - inode_u = inode->ei_inode; - inode_u.bi_sectors += i_sectors_delta; - - /* XXX: this is slightly suspect */ - if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - new_i_size > inode_u.bi_size) { - inode_u.bi_size = new_i_size; - extended = true; - } - - bch2_inode_pack(&inode_p, &inode_u); - bch2_trans_update(trans, - BTREE_INSERT_DEFERRED(inode->ei_inode_update, - &inode_p.inode.k_i)); - } else { - inode_iter = bch2_trans_get_iter(trans, - BTREE_ID_INODES, - POS(k->k.p.inode, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - if (IS_ERR(inode_iter)) - return PTR_ERR(inode_iter); - - ret = bch2_btree_iter_traverse(inode_iter); - if (ret) - goto err; + inode_iter = bch2_trans_get_iter(trans, + BTREE_ID_INODES, + POS(k->k.p.inode, 0), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + if (IS_ERR(inode_iter)) + return PTR_ERR(inode_iter); - inode_u = inode->ei_inode; - inode_u.bi_sectors += i_sectors_delta; + 
ret = bch2_btree_iter_traverse(inode_iter); + if (ret) + goto err; - /* XXX: this is slightly suspect */ - if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && - new_i_size > inode_u.bi_size) { - inode_u.bi_size = new_i_size; - extended = true; - } + inode_u = inode->ei_inode; + inode_u.bi_sectors += i_sectors_delta; - bch2_inode_pack(&inode_p, &inode_u); - bch2_trans_update(trans, - BTREE_INSERT_ENTRY(inode_iter, &inode_p.inode.k_i)); + /* XXX: this is slightly suspect */ + if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + new_i_size > inode_u.bi_size) { + inode_u.bi_size = new_i_size; + extended = true; } + + bch2_inode_pack(&inode_p, &inode_u); + bch2_trans_update(trans, inode_iter, &inode_p.inode.k_i); } ret = bch2_trans_commit(trans, disk_res, @@ -2793,9 +2760,8 @@ reassemble: bkey_start_pos(&delete.k)); } - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(dst, &copy.k)); - bch2_trans_update(&trans, - BTREE_INSERT_ENTRY(del ?: src, &delete)); + bch2_trans_update(&trans, dst, &copy.k); + bch2_trans_update(&trans, del ?: src, &delete); if (copy.k.k.size == k.k->size) { /* diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index b9a20bb19b58..166d94e5e59d 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -154,30 +154,22 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans, inode_set_fn set, void *p) { - struct bch_fs *c = trans->c; struct btree_iter *iter = NULL; struct bkey_inode_buf *inode_p; int ret; lockdep_assert_held(&inode->ei_update_lock); - if (c->opts.new_inode_updates) { - /* XXX: Don't do this with btree locks held */ - if (!inode->ei_inode_update) - inode->ei_inode_update = - bch2_deferred_update_alloc(c, BTREE_ID_INODES, 64); - } else { - iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, - POS(inode->v.i_ino, 0), - BTREE_ITER_SLOTS|BTREE_ITER_INTENT); - if (IS_ERR(iter)) - return PTR_ERR(iter); - - /* The btree node lock is our lock on the inode: */ - ret = bch2_btree_iter_traverse(iter); - if (ret) - return ret; - } + iter =
bch2_trans_get_iter(trans, BTREE_ID_INODES, + POS(inode->v.i_ino, 0), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + if (IS_ERR(iter)) + return PTR_ERR(iter); + + /* The btree node lock is our lock on the inode: */ + ret = bch2_btree_iter_traverse(iter); + if (ret) + return ret; *inode_u = inode->ei_inode; @@ -192,14 +184,7 @@ int __must_check bch2_write_inode_trans(struct btree_trans *trans, return PTR_ERR(inode_p); bch2_inode_pack(inode_p, inode_u); - - if (!inode->ei_inode_update) - bch2_trans_update(trans, - BTREE_INSERT_ENTRY(iter, &inode_p->inode.k_i)); - else - bch2_trans_update(trans, - BTREE_INSERT_DEFERRED(inode->ei_inode_update, - &inode_p->inode.k_i)); + bch2_trans_update(trans, iter, &inode_p->inode.k_i); return 0; } @@ -1482,7 +1467,6 @@ static struct inode *bch2_alloc_inode(struct super_block *sb) mutex_init(&inode->ei_update_lock); pagecache_lock_init(&inode->ei_pagecache_lock); mutex_init(&inode->ei_quota_lock); - inode->ei_inode_update = NULL; inode->ei_journal_seq = 0; return &inode->v; @@ -1540,10 +1524,6 @@ static void bch2_evict_inode(struct inode *vinode) BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); - if (inode->ei_inode_update) - bch2_deferred_update_free(c, inode->ei_inode_update); - inode->ei_inode_update = NULL; - if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), KEY_TYPE_QUOTA_WARN); diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 04ac5b4129a4..c3ee9c17064f 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -34,7 +34,6 @@ struct bch_inode_info { struct inode v; struct mutex ei_update_lock; - struct deferred_update *ei_inode_update; u64 ei_journal_seq; u64 ei_quota_reserved; unsigned long ei_last_dirtied; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index b806284c0517..c5540536f47c 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -393,7 +393,7 @@ static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h, if 
(fsck_err(c, "dirent with junk at end, was %s (%zu) now %s (%u)", buf, strlen(buf), d->v.d_name, len)) { - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &d->k_i)); + bch2_trans_update(trans, iter, &d->k_i); ret = bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL| @@ -663,8 +663,7 @@ retry: bkey_reassemble(&n->k_i, d.s_c); n->v.d_type = mode_to_type(target.bi_mode); - bch2_trans_update(&trans, - BTREE_INSERT_ENTRY(iter, &n->k_i)); + bch2_trans_update(&trans, iter, &n->k_i); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL| @@ -1293,7 +1292,7 @@ static int check_inode(struct btree_trans *trans, struct bkey_inode_buf p; bch2_inode_pack(&p, &u); - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, &p.inode.k_i)); + bch2_trans_update(trans, iter, &p.inode.k_i); ret = bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL| diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 0fb08a396d62..f192536558c1 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -345,8 +345,7 @@ again: inode_u->bi_generation = bkey_generation(k); bch2_inode_pack(inode_p, inode_u); - bch2_trans_update(trans, - BTREE_INSERT_ENTRY(iter, &inode_p->inode.k_i)); + bch2_trans_update(trans, iter, &inode_p->inode.k_i); return 0; } } @@ -435,8 +434,7 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) delete.v.bi_generation = cpu_to_le32(bi_generation); } - bch2_trans_update(&trans, - BTREE_INSERT_ENTRY(iter, &delete.k_i)); + bch2_trans_update(&trans, iter, &delete.k_i); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_ATOMIC| diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 07fe6b5cd517..690f9b2dbb98 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -290,8 +290,7 @@ retry: if (ret) break; - bch2_trans_update(&trans, - BTREE_INSERT_ENTRY(iter, &split.k)); + bch2_trans_update(&trans, iter, &split.k); ret = bch2_trans_commit(&trans, &op->res, op_journal_seq(op), BTREE_INSERT_NOFAIL| @@ -1445,7 +1444,7 @@ retry: if 
(!bch2_bkey_narrow_crcs(&new.k, new_crc)) goto out; - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &new.k)); + bch2_trans_update(&trans, iter, &new.k); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL| diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index dc3b03d6e627..de8522f754e2 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -72,10 +72,9 @@ static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags */ bch2_extent_normalize(c, bkey_i_to_s(&tmp.key)); - /* XXX not sketchy at all */ - iter->pos = bkey_start_pos(&tmp.key.k); + bch2_btree_iter_set_pos(iter, bkey_start_pos(&tmp.key.k)); - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &tmp.key)); + bch2_trans_update(&trans, iter, &tmp.key); ret = bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_ATOMIC| diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 8855dd19f7f2..2f0bdfbfcd61 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -147,8 +147,7 @@ static int bch2_migrate_index_update(struct bch_write_op *op) goto next; } - bch2_trans_update(&trans, - BTREE_INSERT_ENTRY(iter, insert)); + bch2_trans_update(&trans, iter, insert); ret = bch2_trans_commit(&trans, &op->res, op_journal_seq(op), diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index d44bfe90c0d5..d9325d4bc024 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -295,13 +295,7 @@ enum opt_type { OPT_UINT(0, BCH_REPLICAS_MAX), \ NO_SB_OPT, 1, \ "n", "Data written to this device will be considered\n"\ - "to have already been replicated n times") \ - x(new_inode_updates, u8, \ - OPT_MOUNT, \ - OPT_BOOL(), \ - NO_SB_OPT, false, \ - NULL, "Enable new btree write-cache for inode updates") - + "to have already been replicated n times") struct bch_opts { #define x(_name, _bits, ...) 
unsigned _name##_defined:1; diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index f0da0fac09bf..0fa6f33c049b 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -752,7 +752,7 @@ static int bch2_set_quota(struct super_block *sb, struct kqid qid, if (qdq->d_fieldmask & QC_INO_HARD) new_quota.v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &new_quota.k_i)); + bch2_trans_update(&trans, iter, &new_quota.k_i); ret = bch2_trans_commit(&trans, NULL, NULL, 0); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 98d9a1432e50..2e880955a07c 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -301,7 +301,7 @@ retry: bch2_cut_front(split_iter->pos, split); bch2_cut_back(atomic_end, &split->k); - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(split_iter, split)); + bch2_trans_update(&trans, split_iter, split); bch2_btree_iter_set_pos(iter, split->k.p); } while (bkey_cmp(iter->pos, k->k.p) < 0); diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index dcca9c1d0f47..c08b57634abd 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -120,7 +120,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, r_v->v.refcount = 0; memcpy(r_v->v.start, e->v.start, bkey_val_bytes(&e->k)); - bch2_trans_update(trans, BTREE_INSERT_ENTRY(reflink_iter, &r_v->k_i)); + bch2_trans_update(trans, reflink_iter, &r_v->k_i); r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); if (IS_ERR(r_p)) @@ -131,7 +131,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); - bch2_trans_update(trans, BTREE_INSERT_ENTRY(extent_iter, &r_p->k_i)); + bch2_trans_update(trans, extent_iter, &r_p->k_i); err: if (!IS_ERR(reflink_iter)) { c->reflink_hint = reflink_iter->pos.offset; diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 31b278e71051..886f1bc8aa14 100644 --- 
a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -267,7 +267,7 @@ not_found: } insert->k.p = iter->pos; - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, insert)); + bch2_trans_update(trans, iter, insert); bch2_trans_iter_free_on_commit(trans, iter); } @@ -295,7 +295,7 @@ int bch2_hash_delete_at(struct btree_trans *trans, delete->k.p = iter->pos; delete->k.type = ret ? KEY_TYPE_whiteout : KEY_TYPE_deleted; - bch2_trans_update(trans, BTREE_INSERT_ENTRY(iter, delete)); + bch2_trans_update(trans, iter, delete); return 0; } diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 92843bd09b04..a2092bb99095 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -43,7 +43,7 @@ static void test_delete(struct bch_fs *c, u64 nr) ret = bch2_btree_iter_traverse(iter); BUG_ON(ret); - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &k.k_i)); + bch2_trans_update(&trans, iter, &k.k_i); ret = bch2_trans_commit(&trans, NULL, NULL, 0); BUG_ON(ret); @@ -75,7 +75,7 @@ static void test_delete_written(struct bch_fs *c, u64 nr) ret = bch2_btree_iter_traverse(iter); BUG_ON(ret); - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &k.k_i)); + bch2_trans_update(&trans, iter, &k.k_i); ret = bch2_trans_commit(&trans, NULL, NULL, 0); BUG_ON(ret); @@ -465,7 +465,7 @@ static void rand_mixed(struct bch_fs *c, u64 nr) bkey_cookie_init(&k.k_i); k.k.p = iter->pos; - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &k.k_i)); + bch2_trans_update(&trans, iter, &k.k_i); ret = bch2_trans_commit(&trans, NULL, NULL, 0); BUG_ON(ret); } @@ -509,7 +509,7 @@ static void seq_insert(struct bch_fs *c, u64 nr) BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { insert.k.p = iter->pos; - bch2_trans_update(&trans, BTREE_INSERT_ENTRY(iter, &insert.k_i)); + bch2_trans_update(&trans, iter, &insert.k_i); ret = bch2_trans_commit(&trans, NULL, NULL, 0); BUG_ON(ret); @@ -548,7 +548,7 @@ static void seq_overwrite(struct bch_fs *c, u64 nr) bkey_reassemble(&u.k_i, k); - bch2_trans_update(&trans, 
BTREE_INSERT_ENTRY(iter, &u.k_i)); + bch2_trans_update(&trans, iter, &u.k_i); ret = bch2_trans_commit(&trans, NULL, NULL, 0); BUG_ON(ret); } -- cgit v1.2.3 From 9638574229e3ae0175a46a63431149746c777b3a Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Oct 2019 18:35:36 -0400 Subject: bcachefs: Factor out fs-common.c This refactoring makes the code easier to understand by separating the bcachefs btree transactional code from the linux VFS code - but more importantly, it's also to share code with the fuse port. Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/dirent.c | 97 ++++------ fs/bcachefs/dirent.h | 29 +-- fs/bcachefs/fs-common.c | 280 +++++++++++++++++++++++++++ fs/bcachefs/fs-common.h | 37 ++++ fs/bcachefs/fs-ioctl.c | 10 + fs/bcachefs/fs.c | 495 +++++++++++++----------------------------------- fs/bcachefs/fs.h | 13 -- fs/bcachefs/fsck.c | 76 +++----- fs/bcachefs/inode.c | 39 ++-- fs/bcachefs/inode.h | 16 +- fs/bcachefs/recovery.c | 26 +-- 12 files changed, 586 insertions(+), 533 deletions(-) create mode 100644 fs/bcachefs/fs-common.c create mode 100644 fs/bcachefs/fs-common.h (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 4c2608409144..9d120936703a 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -27,6 +27,7 @@ bcachefs-y := \ error.o \ extents.o \ fs.o \ + fs-common.o \ fs-ioctl.o \ fs-io.o \ fsck.o \ diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 38dd96808e90..304128d7251f 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -138,10 +138,10 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, return dirent; } -int __bch2_dirent_create(struct btree_trans *trans, - u64 dir_inum, const struct bch_hash_info *hash_info, - u8 type, const struct qstr *name, u64 dst_inum, - int flags) +int bch2_dirent_create(struct btree_trans *trans, + u64 dir_inum, const struct bch_hash_info *hash_info, + u8 type, const struct qstr 
*name, u64 dst_inum, + int flags) { struct bkey_i_dirent *dirent; int ret; @@ -155,16 +155,6 @@ int __bch2_dirent_create(struct btree_trans *trans, dir_inum, &dirent->k_i, flags); } -int bch2_dirent_create(struct bch_fs *c, u64 dir_inum, - const struct bch_hash_info *hash_info, - u8 type, const struct qstr *name, u64 dst_inum, - u64 *journal_seq, int flags) -{ - return bch2_trans_do(c, journal_seq, flags, - __bch2_dirent_create(&trans, dir_inum, hash_info, - type, name, dst_inum, flags)); -} - static void dirent_copy_target(struct bkey_i_dirent *dst, struct bkey_s_c_dirent src) { @@ -172,23 +162,22 @@ static void dirent_copy_target(struct bkey_i_dirent *dst, dst->v.d_type = src.v->d_type; } -static struct bpos bch2_dirent_pos(struct bch_inode_info *inode, - const struct qstr *name) -{ - return POS(inode->v.i_ino, bch2_dirent_hash(&inode->ei_str_hash, name)); -} - int bch2_dirent_rename(struct btree_trans *trans, - struct bch_inode_info *src_dir, const struct qstr *src_name, - struct bch_inode_info *dst_dir, const struct qstr *dst_name, - enum bch_rename_mode mode) + u64 src_dir, struct bch_hash_info *src_hash, + u64 dst_dir, struct bch_hash_info *dst_hash, + const struct qstr *src_name, u64 *src_inum, + const struct qstr *dst_name, u64 *dst_inum, + enum bch_rename_mode mode) { struct btree_iter *src_iter, *dst_iter; struct bkey_s_c old_src, old_dst; struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; - struct bpos dst_pos = bch2_dirent_pos(dst_dir, dst_name); + struct bpos dst_pos = + POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name)); int ret; + *src_inum = *dst_inum = 0; + /* * Lookup dst: * @@ -198,24 +187,25 @@ int bch2_dirent_rename(struct btree_trans *trans, */ dst_iter = mode == BCH_RENAME ? 
bch2_hash_hole(trans, bch2_dirent_hash_desc, - &dst_dir->ei_str_hash, - dst_dir->v.i_ino, dst_name) + dst_hash, dst_dir, dst_name) : bch2_hash_lookup(trans, bch2_dirent_hash_desc, - &dst_dir->ei_str_hash, - dst_dir->v.i_ino, dst_name, + dst_hash, dst_dir, dst_name, BTREE_ITER_INTENT); if (IS_ERR(dst_iter)) return PTR_ERR(dst_iter); old_dst = bch2_btree_iter_peek_slot(dst_iter); + if (mode != BCH_RENAME) + *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum); + /* Lookup src: */ src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc, - &src_dir->ei_str_hash, - src_dir->v.i_ino, src_name, + src_hash, src_dir, src_name, BTREE_ITER_INTENT); if (IS_ERR(src_iter)) return PTR_ERR(src_iter); old_src = bch2_btree_iter_peek_slot(src_iter); + *src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum); /* Create new dst key: */ new_dst = dirent_create_key(trans, 0, dst_name, 0); @@ -269,8 +259,7 @@ int bch2_dirent_rename(struct btree_trans *trans, } else { /* Check if we need a whiteout to delete src: */ ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc, - &src_dir->ei_str_hash, - src_iter); + src_hash, src_iter); if (ret < 0) return ret; @@ -284,12 +273,12 @@ int bch2_dirent_rename(struct btree_trans *trans, return 0; } -int __bch2_dirent_delete(struct btree_trans *trans, u64 dir_inum, - const struct bch_hash_info *hash_info, - const struct qstr *name) +int bch2_dirent_delete_at(struct btree_trans *trans, + const struct bch_hash_info *hash_info, + struct btree_iter *iter) { - return bch2_hash_delete(trans, bch2_dirent_hash_desc, hash_info, - dir_inum, name); + return bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + hash_info, iter); } int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum, @@ -300,7 +289,17 @@ int bch2_dirent_delete(struct bch_fs *c, u64 dir_inum, return bch2_trans_do(c, journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOFAIL, - __bch2_dirent_delete(&trans, dir_inum, hash_info, name)); + bch2_hash_delete(&trans, 
bch2_dirent_hash_desc, hash_info, + dir_inum, name)); +} + +struct btree_iter * +__bch2_dirent_lookup_trans(struct btree_trans *trans, u64 dir_inum, + const struct bch_hash_info *hash_info, + const struct qstr *name) +{ + return bch2_hash_lookup(trans, bch2_dirent_hash_desc, + hash_info, dir_inum, name, 0); } u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, @@ -314,8 +313,7 @@ u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, bch2_trans_init(&trans, c, 0, 0); - iter = bch2_hash_lookup(&trans, bch2_dirent_hash_desc, - hash_info, dir_inum, name, 0); + iter = __bch2_dirent_lookup_trans(&trans, dir_inum, hash_info, name); if (IS_ERR(iter)) { BUG_ON(PTR_ERR(iter) == -EINTR); goto out; @@ -349,16 +347,8 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) return ret; } -int bch2_empty_dir(struct bch_fs *c, u64 dir_inum) +int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) { - return bch2_trans_do(c, NULL, 0, - bch2_empty_dir_trans(&trans, dir_inum)); -} - -int bch2_readdir(struct bch_fs *c, struct file *file, - struct dir_context *ctx) -{ - struct bch_inode_info *inode = file_bch_inode(file); struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c k; @@ -366,22 +356,19 @@ int bch2_readdir(struct bch_fs *c, struct file *file, unsigned len; int ret; - if (!dir_emit_dots(file, ctx)) - return 0; - bch2_trans_init(&trans, c, 0, 0); for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, - POS(inode->v.i_ino, ctx->pos), 0, k, ret) { + POS(inum, ctx->pos), 0, k, ret) { if (k.k->type != KEY_TYPE_dirent) continue; dirent = bkey_s_c_to_dirent(k); - if (bkey_cmp(k.k->p, POS(inode->v.i_ino, ctx->pos)) < 0) + if (bkey_cmp(k.k->p, POS(inum, ctx->pos)) < 0) continue; - if (k.k->p.inode > inode->v.i_ino) + if (k.k->p.inode > inum) break; len = bch2_dirent_name_bytes(dirent); diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index bc64718a7832..9a57ad005468 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -29,15 
+29,13 @@ static inline unsigned dirent_val_u64s(unsigned len) sizeof(u64)); } -int __bch2_dirent_create(struct btree_trans *, u64, - const struct bch_hash_info *, u8, - const struct qstr *, u64, int); -int bch2_dirent_create(struct bch_fs *c, u64, const struct bch_hash_info *, - u8, const struct qstr *, u64, u64 *, int); - -int __bch2_dirent_delete(struct btree_trans *, u64, - const struct bch_hash_info *, - const struct qstr *); +int bch2_dirent_create(struct btree_trans *, u64, + const struct bch_hash_info *, u8, + const struct qstr *, u64, int); + +int bch2_dirent_delete_at(struct btree_trans *, + const struct bch_hash_info *, + struct btree_iter *); int bch2_dirent_delete(struct bch_fs *, u64, const struct bch_hash_info *, const struct qstr *, u64 *); @@ -48,15 +46,20 @@ enum bch_rename_mode { }; int bch2_dirent_rename(struct btree_trans *, - struct bch_inode_info *, const struct qstr *, - struct bch_inode_info *, const struct qstr *, + u64, struct bch_hash_info *, + u64, struct bch_hash_info *, + const struct qstr *, u64 *, + const struct qstr *, u64 *, enum bch_rename_mode); +struct btree_iter * +__bch2_dirent_lookup_trans(struct btree_trans *, u64, + const struct bch_hash_info *, + const struct qstr *); u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, const struct qstr *); int bch2_empty_dir_trans(struct btree_trans *, u64); -int bch2_empty_dir(struct bch_fs *, u64); -int bch2_readdir(struct bch_fs *, struct file *, struct dir_context *); +int bch2_readdir(struct bch_fs *, u64, struct dir_context *); #endif /* _BCACHEFS_DIRENT_H */ diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c new file mode 100644 index 000000000000..fdd2b9b6716f --- /dev/null +++ b/fs/bcachefs/fs-common.c @@ -0,0 +1,280 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "acl.h" +#include "btree_update.h" +#include "dirent.h" +#include "fs-common.h" +#include "inode.h" +#include "xattr.h" + +#include <linux/posix_acl.h> + +int
bch2_create_trans(struct btree_trans *trans, u64 dir_inum, + struct bch_inode_unpacked *dir_u, + struct bch_inode_unpacked *new_inode, + const struct qstr *name, + uid_t uid, gid_t gid, umode_t mode, dev_t rdev, + struct posix_acl *default_acl, + struct posix_acl *acl) +{ + struct bch_fs *c = trans->c; + struct btree_iter *dir_iter; + struct bch_hash_info hash = bch2_hash_info_init(c, new_inode); + u64 now = bch2_current_time(trans->c); + int ret; + + dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, + name ? BTREE_ITER_INTENT : 0); + if (IS_ERR(dir_iter)) + return PTR_ERR(dir_iter); + + bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); + + if (!name) + new_inode->bi_flags |= BCH_INODE_UNLINKED; + + ret = bch2_inode_create(trans, new_inode, + BLOCKDEV_INODE_MAX, 0, + &c->unused_inode_hint); + if (ret) + return ret; + + if (default_acl) { + ret = bch2_set_acl_trans(trans, new_inode, &hash, + default_acl, ACL_TYPE_DEFAULT); + if (ret) + return ret; + } + + if (acl) { + ret = bch2_set_acl_trans(trans, new_inode, &hash, + acl, ACL_TYPE_ACCESS); + if (ret) + return ret; + } + + if (name) { + struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u); + dir_u->bi_mtime = dir_u->bi_ctime = now; + + if (S_ISDIR(new_inode->bi_mode)) + dir_u->bi_nlink++; + + ret = bch2_inode_write(trans, dir_iter, dir_u); + if (ret) + return ret; + + ret = bch2_dirent_create(trans, dir_inum, &dir_hash, + mode_to_type(new_inode->bi_mode), + name, new_inode->bi_inum, + BCH_HASH_SET_MUST_CREATE); + if (ret) + return ret; + } + + return 0; +} + +int bch2_link_trans(struct btree_trans *trans, + u64 dir_inum, + u64 inum, struct bch_inode_unpacked *inode_u, + const struct qstr *name) +{ + struct btree_iter *dir_iter, *inode_iter; + struct bch_inode_unpacked dir_u; + struct bch_hash_info dir_hash; + u64 now = bch2_current_time(trans->c); + + dir_iter = bch2_inode_peek(trans, &dir_u, dir_inum, 0); + if (IS_ERR(dir_iter)) + return PTR_ERR(dir_iter); + + inode_iter = 
bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); + if (IS_ERR(inode_iter)) + return PTR_ERR(inode_iter); + + dir_hash = bch2_hash_info_init(trans->c, &dir_u); + + inode_u->bi_ctime = now; + bch2_inode_nlink_inc(inode_u); + + return bch2_dirent_create(trans, dir_inum, &dir_hash, + mode_to_type(inode_u->bi_mode), + name, inum, BCH_HASH_SET_MUST_CREATE) ?: + bch2_inode_write(trans, inode_iter, inode_u); +} + +int bch2_unlink_trans(struct btree_trans *trans, + u64 dir_inum, struct bch_inode_unpacked *dir_u, + struct bch_inode_unpacked *inode_u, + const struct qstr *name) +{ + struct btree_iter *dir_iter, *dirent_iter, *inode_iter; + struct bch_hash_info dir_hash; + u64 inum, now = bch2_current_time(trans->c); + struct bkey_s_c k; + + dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); + if (IS_ERR(dir_iter)) + return PTR_ERR(dir_iter); + + dir_hash = bch2_hash_info_init(trans->c, dir_u); + + dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, + &dir_hash, name); + if (IS_ERR(dirent_iter)) + return PTR_ERR(dirent_iter); + + k = bch2_btree_iter_peek_slot(dirent_iter); + inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); + + inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); + if (IS_ERR(inode_iter)) + return PTR_ERR(inode_iter); + + dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; + dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode); + bch2_inode_nlink_dec(inode_u); + + return (S_ISDIR(inode_u->bi_mode) + ? 
bch2_empty_dir_trans(trans, inum) + : 0) ?: + bch2_dirent_delete_at(trans, &dir_hash, dirent_iter) ?: + bch2_inode_write(trans, dir_iter, dir_u) ?: + bch2_inode_write(trans, inode_iter, inode_u); +} + +bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, + struct bch_inode_unpacked *src_u) +{ + u64 src, dst; + unsigned id; + bool ret = false; + + for (id = 0; id < Inode_opt_nr; id++) { + if (dst_u->bi_fields_set & (1 << id)) + continue; + + src = bch2_inode_opt_get(src_u, id); + dst = bch2_inode_opt_get(dst_u, id); + + if (src == dst) + continue; + + bch2_inode_opt_set(dst_u, id, src); + ret = true; + } + + return ret; +} + +int bch2_rename_trans(struct btree_trans *trans, + u64 src_dir, struct bch_inode_unpacked *src_dir_u, + u64 dst_dir, struct bch_inode_unpacked *dst_dir_u, + struct bch_inode_unpacked *src_inode_u, + struct bch_inode_unpacked *dst_inode_u, + const struct qstr *src_name, + const struct qstr *dst_name, + enum bch_rename_mode mode) +{ + struct btree_iter *src_dir_iter, *dst_dir_iter = NULL; + struct btree_iter *src_inode_iter, *dst_inode_iter = NULL; + struct bch_hash_info src_hash, dst_hash; + u64 src_inode, dst_inode, now = bch2_current_time(trans->c); + int ret; + + src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir, + BTREE_ITER_INTENT); + if (IS_ERR(src_dir_iter)) + return PTR_ERR(src_dir_iter); + + src_hash = bch2_hash_info_init(trans->c, src_dir_u); + + if (dst_dir != src_dir) { + dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir, + BTREE_ITER_INTENT); + if (IS_ERR(dst_dir_iter)) + return PTR_ERR(dst_dir_iter); + + dst_hash = bch2_hash_info_init(trans->c, dst_dir_u); + } else { + dst_dir_u = src_dir_u; + dst_hash = src_hash; + } + + ret = bch2_dirent_rename(trans, + src_dir, &src_hash, + dst_dir, &dst_hash, + src_name, &src_inode, + dst_name, &dst_inode, + mode); + if (ret) + return ret; + + src_inode_iter = bch2_inode_peek(trans, src_inode_u, src_inode, + BTREE_ITER_INTENT); + if (IS_ERR(src_inode_iter)) + return 
PTR_ERR(src_inode_iter); + + if (dst_inode) { + dst_inode_iter = bch2_inode_peek(trans, dst_inode_u, dst_inode, + BTREE_ITER_INTENT); + if (IS_ERR(dst_inode_iter)) + return PTR_ERR(dst_inode_iter); + } + + if (mode == BCH_RENAME_OVERWRITE) { + if (S_ISDIR(src_inode_u->bi_mode) != + S_ISDIR(dst_inode_u->bi_mode)) + return -ENOTDIR; + + if (S_ISDIR(dst_inode_u->bi_mode) && + bch2_empty_dir_trans(trans, dst_inode)) + return -ENOTEMPTY; + } + + if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && + S_ISDIR(src_inode_u->bi_mode)) + return -EXDEV; + + if (mode == BCH_RENAME_EXCHANGE && + bch2_reinherit_attrs(dst_inode_u, src_dir_u) && + S_ISDIR(dst_inode_u->bi_mode)) + return -EXDEV; + + if (S_ISDIR(src_inode_u->bi_mode)) { + src_dir_u->bi_nlink--; + dst_dir_u->bi_nlink++; + } + + if (dst_inode && S_ISDIR(dst_inode_u->bi_mode)) { + dst_dir_u->bi_nlink--; + src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; + } + + if (mode == BCH_RENAME_OVERWRITE) + bch2_inode_nlink_dec(dst_inode_u); + + src_dir_u->bi_mtime = now; + src_dir_u->bi_ctime = now; + + if (src_dir != dst_dir) { + dst_dir_u->bi_mtime = now; + dst_dir_u->bi_ctime = now; + } + + src_inode_u->bi_ctime = now; + + if (dst_inode) + dst_inode_u->bi_ctime = now; + + return bch2_inode_write(trans, src_dir_iter, src_dir_u) ?: + (src_dir != dst_dir + ? bch2_inode_write(trans, dst_dir_iter, dst_dir_u) + : 0 ) ?: + bch2_inode_write(trans, src_inode_iter, src_inode_u) ?: + (dst_inode + ? 
bch2_inode_write(trans, dst_inode_iter, dst_inode_u) + : 0 ); +} diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h new file mode 100644 index 000000000000..7adcfcf92aec --- /dev/null +++ b/fs/bcachefs/fs-common.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_COMMON_H +#define _BCACHEFS_FS_COMMON_H + +struct posix_acl; + +int bch2_create_trans(struct btree_trans *, u64, + struct bch_inode_unpacked *, + struct bch_inode_unpacked *, + const struct qstr *, + uid_t, gid_t, umode_t, dev_t, + struct posix_acl *, + struct posix_acl *); + +int bch2_link_trans(struct btree_trans *, + u64, + u64, struct bch_inode_unpacked *, + const struct qstr *); + +int bch2_unlink_trans(struct btree_trans *, + u64, struct bch_inode_unpacked *, + struct bch_inode_unpacked *, + const struct qstr *); + +int bch2_rename_trans(struct btree_trans *, + u64, struct bch_inode_unpacked *, + u64, struct bch_inode_unpacked *, + struct bch_inode_unpacked *, + struct bch_inode_unpacked *, + const struct qstr *, + const struct qstr *, + enum bch_rename_mode); + +bool bch2_reinherit_attrs(struct bch_inode_unpacked *, + struct bch_inode_unpacked *); + +#endif /* _BCACHEFS_FS_COMMON_H */ diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 0cf2621ec4fc..acc0a230ff0c 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -5,6 +5,7 @@ #include "chardev.h" #include "dirent.h" #include "fs.h" +#include "fs-common.h" #include "fs-ioctl.h" #include "quota.h" @@ -164,6 +165,15 @@ err: return ret; } +static int bch2_reinherit_attrs_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_inode_info *dir = p; + + return !bch2_reinherit_attrs(bi, &dir->ei_inode); +} + static int bch2_ioc_reinherit_attrs(struct bch_fs *c, struct file *file, struct bch_inode_info *src, diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index cbe1b90e80c2..b19a2deed5c1 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -9,6 
+9,7 @@ #include "dirent.h" #include "extents.h" #include "fs.h" +#include "fs-common.h" #include "fs-io.h" #include "fs-ioctl.h" #include "fsck.h" @@ -148,34 +149,13 @@ void bch2_inode_update_after_write(struct bch_fs *c, bch2_inode_flags_to_vfs(inode); } -int __must_check bch2_write_inode_trans(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *inode_u, - inode_set_fn set, - void *p) -{ - struct btree_iter *iter = NULL; - int ret = 0; - - iter = bch2_inode_peek(trans, inode_u, inode->v.i_ino, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(iter); - if (ret) - return ret; - - ret = set ? set(inode, inode_u, p) : 0; - if (ret) - return ret; - - return bch2_inode_write(trans, iter, inode_u); -} - int __must_check bch2_write_inode(struct bch_fs *c, struct bch_inode_info *inode, inode_set_fn set, void *p, unsigned fields) { struct btree_trans trans; + struct btree_iter *iter; struct bch_inode_unpacked inode_u; int ret; @@ -183,7 +163,11 @@ int __must_check bch2_write_inode(struct bch_fs *c, retry: bch2_trans_begin(&trans); - ret = bch2_write_inode_trans(&trans, inode, &inode_u, set, p) ?: + iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, + BTREE_ITER_INTENT); + ret = PTR_ERR_OR_ZERO(iter) ?: + (set ? 
set(inode, &inode_u, p) : 0) ?: + bch2_inode_write(&trans, iter, &inode_u) ?: bch2_trans_commit(&trans, NULL, &inode->ei_journal_seq, BTREE_INSERT_ATOMIC| @@ -238,32 +222,6 @@ int bch2_fs_quota_transfer(struct bch_fs *c, return ret; } -int bch2_reinherit_attrs_fn(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct bch_inode_info *dir = p; - u64 src, dst; - unsigned id; - int ret = 1; - - for (id = 0; id < Inode_opt_nr; id++) { - if (bi->bi_fields_set & (1 << id)) - continue; - - src = bch2_inode_opt_get(&dir->ei_inode, id); - dst = bch2_inode_opt_get(bi, id); - - if (src == dst) - continue; - - bch2_inode_opt_set(bi, id, src); - ret = 0; - } - - return ret; -} - struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) { struct bch_inode_unpacked inode_u; @@ -291,39 +249,6 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) return &inode->v; } -static void bch2_inode_init_owner(struct bch_inode_unpacked *inode_u, - const struct inode *dir, umode_t mode) -{ - kuid_t uid = current_fsuid(); - kgid_t gid; - - if (dir && dir->i_mode & S_ISGID) { - gid = dir->i_gid; - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else - gid = current_fsgid(); - - inode_u->bi_uid = from_kuid(i_user_ns(dir), uid); - inode_u->bi_gid = from_kgid(i_user_ns(dir), gid); - inode_u->bi_mode = mode; -} - -static int inode_update_for_create_fn(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct bch_inode_unpacked *new_inode = p; - - bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); - - if (S_ISDIR(new_inode->bi_mode)) - bi->bi_nlink++; - - return 0; -} - static int inum_test(struct inode *inode, void *p) { unsigned long *ino = p; @@ -341,40 +266,27 @@ __bch2_create(struct mnt_idmap *idmap, struct bch_inode_unpacked dir_u; struct bch_inode_info *inode, *old; struct bch_inode_unpacked inode_u; - struct bch_hash_info hash_info; struct posix_acl *default_acl = NULL, *acl = 
NULL; u64 journal_seq = 0; int ret; - bch2_inode_init(c, &inode_u, 0, 0, 0, rdev, &dir->ei_inode); - bch2_inode_init_owner(&inode_u, &dir->v, mode); - - hash_info = bch2_hash_info_init(c, &inode_u); - - if (tmpfile) - inode_u.bi_flags |= BCH_INODE_UNLINKED; - - ret = bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, - KEY_TYPE_QUOTA_PREALLOC); - if (ret) - return ERR_PTR(ret); - + /* + * preallocate acls + vfs inode before btree transaction, so that + * nothing can fail after the transaction succeeds: + */ #ifdef CONFIG_BCACHEFS_POSIX_ACL - ret = posix_acl_create(&dir->v, &inode_u.bi_mode, &default_acl, &acl); + ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl); if (ret) - goto err; + return ERR_PTR(ret); #endif - - /* - * preallocate vfs inode before btree transaction, so that nothing can - * fail after the transaction succeeds: - */ inode = to_bch_ei(new_inode(c->vfs_sb)); if (unlikely(!inode)) { - ret = -ENOMEM; + inode = ERR_PTR(-ENOMEM); goto err; } + bch2_inode_init_early(c, &inode_u); + if (!tmpfile) mutex_lock(&dir->ei_update_lock); @@ -382,38 +294,28 @@ __bch2_create(struct mnt_idmap *idmap, retry: bch2_trans_begin(&trans); - ret = __bch2_inode_create(&trans, &inode_u, - BLOCKDEV_INODE_MAX, 0, - &c->unused_inode_hint) ?: - (default_acl - ? bch2_set_acl_trans(&trans, &inode_u, &hash_info, - default_acl, ACL_TYPE_DEFAULT) - : 0) ?: - (acl - ? bch2_set_acl_trans(&trans, &inode_u, &hash_info, - acl, ACL_TYPE_ACCESS) - : 0) ?: - (!tmpfile - ? __bch2_dirent_create(&trans, dir->v.i_ino, - &dir->ei_str_hash, - mode_to_type(mode), - &dentry->d_name, - inode_u.bi_inum, - BCH_HASH_SET_MUST_CREATE) - : 0) ?: - (!tmpfile - ? bch2_write_inode_trans(&trans, dir, &dir_u, - inode_update_for_create_fn, - &inode_u) - : 0) ?: - bch2_trans_commit(&trans, NULL, - &journal_seq, + ret = bch2_create_trans(&trans, dir->v.i_ino, &dir_u, &inode_u, + !tmpfile ? 
&dentry->d_name : NULL, + from_kuid(i_user_ns(&dir->v), current_fsuid()), + from_kgid(i_user_ns(&dir->v), current_fsgid()), + mode, rdev, + default_acl, acl) ?: + bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, + KEY_TYPE_QUOTA_PREALLOC); + if (unlikely(ret)) + goto err_before_quota; + + ret = bch2_trans_commit(&trans, NULL, &journal_seq, BTREE_INSERT_ATOMIC| BTREE_INSERT_NOUNLOCK); - if (ret == -EINTR) - goto retry; - if (unlikely(ret)) + if (unlikely(ret)) { + bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, + KEY_TYPE_QUOTA_WARN); +err_before_quota: + if (ret == -EINTR) + goto retry; goto err_trans; + } if (!tmpfile) { bch2_inode_update_after_write(c, dir, &dir_u, @@ -444,7 +346,7 @@ retry: * We raced, another process pulled the new inode into cache * before us: */ - old->ei_journal_seq = inode->ei_journal_seq; + journal_seq_copy(old, journal_seq); make_bad_inode(&inode->v); iput(&inode->v); @@ -458,7 +360,7 @@ retry: } bch2_trans_exit(&trans); -out: +err: posix_acl_release(default_acl); posix_acl_release(acl); return inode; @@ -469,10 +371,8 @@ err_trans: bch2_trans_exit(&trans); make_bad_inode(&inode->v); iput(&inode->v); -err: - bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, KEY_TYPE_QUOTA_WARN); inode = ERR_PTR(ret); - goto out; + goto err; } /* methods */ @@ -515,40 +415,23 @@ static int __bch2_link(struct bch_fs *c, struct dentry *dentry) { struct btree_trans trans; - struct btree_iter *inode_iter; struct bch_inode_unpacked inode_u; int ret; mutex_lock(&inode->ei_update_lock); bch2_trans_init(&trans, c, 4, 1024); -retry: - bch2_trans_begin(&trans); - ret = __bch2_dirent_create(&trans, dir->v.i_ino, - &dir->ei_str_hash, - mode_to_type(inode->v.i_mode), - &dentry->d_name, - inode->v.i_ino, - BCH_HASH_SET_MUST_CREATE); - if (ret) - goto err; - inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(inode_iter); - if (ret) - goto err; - - inode_u.bi_ctime = bch2_current_time(c); - 
bch2_inode_nlink_inc(&inode_u); - - ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, - &inode->ei_journal_seq, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOUNLOCK); -err: - if (ret == -EINTR) - goto retry; + do { + bch2_trans_begin(&trans); + ret = bch2_link_trans(&trans, + dir->v.i_ino, + inode->v.i_ino, &inode_u, + &dentry->d_name) ?: + bch2_trans_commit(&trans, NULL, + &inode->ei_journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK); + } while (ret == -EINTR); if (likely(!ret)) bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); @@ -582,60 +465,36 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); - struct btree_iter *dir_iter, *inode_iter; struct bch_inode_unpacked dir_u, inode_u; struct btree_trans trans; int ret; bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); bch2_trans_init(&trans, c, 4, 1024); -retry: - bch2_trans_begin(&trans); - - ret = __bch2_dirent_delete(&trans, dir->v.i_ino, - &dir->ei_str_hash, - &dentry->d_name); - if (ret) - goto btree_err; - - dir_iter = bch2_inode_peek(&trans, &dir_u, dir->v.i_ino, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(dir_iter); - if (ret) - goto btree_err; - - inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, - BTREE_ITER_INTENT); - ret = PTR_ERR_OR_ZERO(inode_iter); - if (ret) - goto btree_err; - - dir_u.bi_mtime = dir_u.bi_ctime = inode_u.bi_ctime = - bch2_current_time(c); - - dir_u.bi_nlink -= S_ISDIR(inode_u.bi_mode); - bch2_inode_nlink_dec(&inode_u); - ret = bch2_inode_write(&trans, dir_iter, &dir_u) ?: - bch2_inode_write(&trans, inode_iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, - &dir->ei_journal_seq, - BTREE_INSERT_ATOMIC| - BTREE_INSERT_NOUNLOCK| - BTREE_INSERT_NOFAIL); -btree_err: - if (ret == -EINTR) - goto retry; - if (ret) - goto err; - - journal_seq_copy(inode, 
dir->ei_journal_seq); + do { + bch2_trans_begin(&trans); + + ret = bch2_unlink_trans(&trans, + dir->v.i_ino, &dir_u, + &inode_u, &dentry->d_name) ?: + bch2_trans_commit(&trans, NULL, + &dir->ei_journal_seq, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOUNLOCK| + BTREE_INSERT_NOFAIL); + } while (ret == -EINTR); + + if (likely(!ret)) { + BUG_ON(inode_u.bi_inum != inode->v.i_ino); + + journal_seq_copy(inode, dir->ei_journal_seq); + bch2_inode_update_after_write(c, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); + bch2_inode_update_after_write(c, inode, &inode_u, + ATTR_MTIME); + } - bch2_inode_update_after_write(c, dir, &dir_u, - ATTR_MTIME|ATTR_CTIME); - bch2_inode_update_after_write(c, inode, &inode_u, - ATTR_MTIME); -err: bch2_trans_exit(&trans); bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); @@ -693,11 +552,6 @@ static int bch2_mkdir(struct mnt_idmap *idmap, static int bch2_rmdir(struct inode *vdir, struct dentry *dentry) { - struct bch_fs *c = vdir->i_sb->s_fs_info; - - if (bch2_empty_dir(c, dentry->d_inode->i_ino)) - return -ENOTEMPTY; - return bch2_unlink(vdir, dentry); } @@ -715,99 +569,31 @@ static int bch2_mknod(struct mnt_idmap *idmap, return 0; } -struct rename_info { - u64 now; - struct bch_inode_info *src_dir; - struct bch_inode_info *dst_dir; - struct bch_inode_info *src_inode; - struct bch_inode_info *dst_inode; - enum bch_rename_mode mode; -}; - -static int inode_update_for_rename_fn(struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct rename_info *info = p; - int ret; - - if (inode == info->src_dir) { - bi->bi_nlink -= S_ISDIR(info->src_inode->v.i_mode); - bi->bi_nlink += info->dst_inode && - S_ISDIR(info->dst_inode->v.i_mode) && - info->mode == BCH_RENAME_EXCHANGE; - } - - if (inode == info->dst_dir) { - bi->bi_nlink += S_ISDIR(info->src_inode->v.i_mode); - bi->bi_nlink -= info->dst_inode && - S_ISDIR(info->dst_inode->v.i_mode); - } - - if (inode == info->src_inode) { - ret = bch2_reinherit_attrs_fn(inode, bi, info->dst_dir); - 
- BUG_ON(!ret && S_ISDIR(info->src_inode->v.i_mode)); - } - - if (inode == info->dst_inode && - info->mode == BCH_RENAME_EXCHANGE) { - ret = bch2_reinherit_attrs_fn(inode, bi, info->src_dir); - - BUG_ON(!ret && S_ISDIR(info->dst_inode->v.i_mode)); - } - - if (inode == info->dst_inode && - info->mode == BCH_RENAME_OVERWRITE) { - BUG_ON(bi->bi_nlink && - S_ISDIR(info->dst_inode->v.i_mode)); - - bch2_inode_nlink_dec(bi); - } - - if (inode == info->src_dir || - inode == info->dst_dir) - bi->bi_mtime = info->now; - bi->bi_ctime = info->now; - - return 0; -} - static int bch2_rename2(struct mnt_idmap *idmap, struct inode *src_vdir, struct dentry *src_dentry, struct inode *dst_vdir, struct dentry *dst_dentry, unsigned flags) { struct bch_fs *c = src_vdir->i_sb->s_fs_info; - struct rename_info i = { - .src_dir = to_bch_ei(src_vdir), - .dst_dir = to_bch_ei(dst_vdir), - .src_inode = to_bch_ei(src_dentry->d_inode), - .dst_inode = to_bch_ei(dst_dentry->d_inode), - .mode = flags & RENAME_EXCHANGE - ? BCH_RENAME_EXCHANGE - : dst_dentry->d_inode - ? BCH_RENAME_OVERWRITE : BCH_RENAME, - }; - struct btree_trans trans; + struct bch_inode_info *src_dir = to_bch_ei(src_vdir); + struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir); + struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode); + struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); struct bch_inode_unpacked dst_dir_u, src_dir_u; struct bch_inode_unpacked src_inode_u, dst_inode_u; + struct btree_trans trans; + enum bch_rename_mode mode = flags & RENAME_EXCHANGE + ? BCH_RENAME_EXCHANGE + : dst_dentry->d_inode + ? 
BCH_RENAME_OVERWRITE : BCH_RENAME; u64 journal_seq = 0; int ret; if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) return -EINVAL; - if (i.mode == BCH_RENAME_OVERWRITE) { - if (S_ISDIR(i.src_inode->v.i_mode) != - S_ISDIR(i.dst_inode->v.i_mode)) - return -ENOTDIR; - - if (S_ISDIR(i.src_inode->v.i_mode) && - bch2_empty_dir(c, i.dst_inode->v.i_ino)) - return -ENOTEMPTY; - - ret = filemap_write_and_wait_range(i.src_inode->v.i_mapping, + if (mode == BCH_RENAME_OVERWRITE) { + ret = filemap_write_and_wait_range(src_inode->v.i_mapping, 0, LLONG_MAX); if (ret) return ret; @@ -816,37 +602,24 @@ static int bch2_rename2(struct mnt_idmap *idmap, bch2_trans_init(&trans, c, 8, 2048); bch2_lock_inodes(INODE_UPDATE_LOCK, - i.src_dir, - i.dst_dir, - i.src_inode, - i.dst_inode); - - if (S_ISDIR(i.src_inode->v.i_mode) && - inode_attrs_changing(i.dst_dir, i.src_inode)) { - ret = -EXDEV; - goto err; - } - - if (i.mode == BCH_RENAME_EXCHANGE && - S_ISDIR(i.dst_inode->v.i_mode) && - inode_attrs_changing(i.src_dir, i.dst_inode)) { - ret = -EXDEV; - goto err; - } - - if (inode_attr_changing(i.dst_dir, i.src_inode, Inode_opt_project)) { - ret = bch2_fs_quota_transfer(c, i.src_inode, - i.dst_dir->ei_qid, + src_dir, + dst_dir, + src_inode, + dst_inode); + + if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) { + ret = bch2_fs_quota_transfer(c, src_inode, + dst_dir->ei_qid, 1 << QTYP_PRJ, KEY_TYPE_QUOTA_PREALLOC); if (ret) goto err; } - if (i.mode == BCH_RENAME_EXCHANGE && - inode_attr_changing(i.src_dir, i.dst_inode, Inode_opt_project)) { - ret = bch2_fs_quota_transfer(c, i.dst_inode, - i.src_dir->ei_qid, + if (mode == BCH_RENAME_EXCHANGE && + inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) { + ret = bch2_fs_quota_transfer(c, dst_inode, + src_dir->ei_qid, 1 << QTYP_PRJ, KEY_TYPE_QUOTA_PREALLOC); if (ret) @@ -855,24 +628,14 @@ static int bch2_rename2(struct mnt_idmap *idmap, retry: bch2_trans_begin(&trans); - i.now = bch2_current_time(c); - - ret = 
bch2_dirent_rename(&trans, - i.src_dir, &src_dentry->d_name, - i.dst_dir, &dst_dentry->d_name, - i.mode) ?: - bch2_write_inode_trans(&trans, i.src_dir, &src_dir_u, - inode_update_for_rename_fn, &i) ?: - (i.src_dir != i.dst_dir - ? bch2_write_inode_trans(&trans, i.dst_dir, &dst_dir_u, - inode_update_for_rename_fn, &i) - : 0 ) ?: - bch2_write_inode_trans(&trans, i.src_inode, &src_inode_u, - inode_update_for_rename_fn, &i) ?: - (i.dst_inode - ? bch2_write_inode_trans(&trans, i.dst_inode, &dst_inode_u, - inode_update_for_rename_fn, &i) - : 0 ) ?: + ret = bch2_rename_trans(&trans, + src_dir->v.i_ino, &src_dir_u, + dst_dir->v.i_ino, &dst_dir_u, + &src_inode_u, + &dst_inode_u, + &src_dentry->d_name, + &dst_dentry->d_name, + mode) ?: bch2_trans_commit(&trans, NULL, &journal_seq, BTREE_INSERT_ATOMIC| @@ -882,43 +645,47 @@ retry: if (unlikely(ret)) goto err; - bch2_inode_update_after_write(c, i.src_dir, &src_dir_u, + BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum); + BUG_ON(dst_inode && + dst_inode->v.i_ino != dst_inode_u.bi_inum); + + bch2_inode_update_after_write(c, src_dir, &src_dir_u, ATTR_MTIME|ATTR_CTIME); - journal_seq_copy(i.src_dir, journal_seq); + journal_seq_copy(src_dir, journal_seq); - if (i.src_dir != i.dst_dir) { - bch2_inode_update_after_write(c, i.dst_dir, &dst_dir_u, + if (src_dir != dst_dir) { + bch2_inode_update_after_write(c, dst_dir, &dst_dir_u, ATTR_MTIME|ATTR_CTIME); - journal_seq_copy(i.dst_dir, journal_seq); + journal_seq_copy(dst_dir, journal_seq); } - journal_seq_copy(i.src_inode, journal_seq); - if (i.dst_inode) - journal_seq_copy(i.dst_inode, journal_seq); - - bch2_inode_update_after_write(c, i.src_inode, &src_inode_u, + bch2_inode_update_after_write(c, src_inode, &src_inode_u, ATTR_CTIME); - if (i.dst_inode) - bch2_inode_update_after_write(c, i.dst_inode, &dst_inode_u, + journal_seq_copy(src_inode, journal_seq); + + if (dst_inode) { + bch2_inode_update_after_write(c, dst_inode, &dst_inode_u, ATTR_CTIME); + journal_seq_copy(dst_inode, 
journal_seq); + } err: bch2_trans_exit(&trans); - bch2_fs_quota_transfer(c, i.src_inode, - bch_qid(&i.src_inode->ei_inode), + bch2_fs_quota_transfer(c, src_inode, + bch_qid(&src_inode->ei_inode), 1 << QTYP_PRJ, KEY_TYPE_QUOTA_NOCHECK); - if (i.dst_inode) - bch2_fs_quota_transfer(c, i.dst_inode, - bch_qid(&i.dst_inode->ei_inode), + if (dst_inode) + bch2_fs_quota_transfer(c, dst_inode, + bch_qid(&dst_inode->ei_inode), 1 << QTYP_PRJ, KEY_TYPE_QUOTA_NOCHECK); bch2_unlock_inodes(INODE_UPDATE_LOCK, - i.src_dir, - i.dst_dir, - i.src_inode, - i.dst_inode); + src_dir, + dst_dir, + src_inode, + dst_inode); return ret; } @@ -1251,9 +1018,13 @@ static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence) static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) { - struct bch_fs *c = file_inode(file)->i_sb->s_fs_info; + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + + if (!dir_emit_dots(file, ctx)) + return 0; - return bch2_readdir(c, file, ctx); + return bch2_readdir(c, inode->v.i_ino, ctx); } static const struct file_operations bch_file_operations = { diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index c3ee9c17064f..b3a2993dd9bc 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -103,11 +103,6 @@ static inline struct bch_inode_info *file_bch_inode(struct file *file) return to_bch_ei(file_inode(file)); } -static inline u8 mode_to_type(umode_t mode) -{ - return (mode >> 12) & 15; -} - static inline bool inode_attr_changing(struct bch_inode_info *dir, struct bch_inode_info *inode, enum inode_opt_id id) @@ -162,17 +157,9 @@ void bch2_inode_update_after_write(struct bch_fs *, struct bch_inode_info *, struct bch_inode_unpacked *, unsigned); -int __must_check bch2_write_inode_trans(struct btree_trans *, - struct bch_inode_info *, - struct bch_inode_unpacked *, - inode_set_fn, void *); int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, inode_set_fn, void *, unsigned); 
-int bch2_reinherit_attrs_fn(struct bch_inode_info *, - struct bch_inode_unpacked *, - void *); - void bch2_vfs_exit(void); int bch2_vfs_init(void); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index c5540536f47c..5acf1fb64543 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -4,7 +4,7 @@ #include "btree_update.h" #include "dirent.h" #include "error.h" -#include "fs.h" +#include "fs-common.h" #include "fsck.h" #include "inode.h" #include "keylist.h" @@ -80,9 +80,7 @@ static int reattach_inode(struct bch_fs *c, struct bch_inode_unpacked *lostfound_inode, u64 inum) { - struct bch_hash_info lostfound_hash_info = - bch2_hash_info_init(c, lostfound_inode); - struct bkey_inode_buf packed; + struct bch_inode_unpacked inode_u; char name_buf[20]; struct qstr name; int ret; @@ -90,30 +88,14 @@ static int reattach_inode(struct bch_fs *c, snprintf(name_buf, sizeof(name_buf), "%llu", inum); name = (struct qstr) QSTR(name_buf); - lostfound_inode->bi_nlink++; - - bch2_inode_pack(&packed, lostfound_inode); - - ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, - NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); - if (ret) { - bch_err(c, "error %i reattaching inode %llu while updating lost+found", - ret, inum); - return ret; - } + ret = bch2_trans_do(c, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_LAZY_RW, + bch2_link_trans(&trans, lostfound_inode->bi_inum, + inum, &inode_u, &name)); + if (ret) + bch_err(c, "error %i reattaching inode %llu", ret, inum); - ret = bch2_dirent_create(c, lostfound_inode->bi_inum, - &lostfound_hash_info, - DT_DIR, &name, inum, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); - if (ret) { - bch_err(c, "error %i reattaching inode %llu while creating new dirent", - ret, inum); - return ret; - } return ret; } @@ -758,7 +740,7 @@ static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode) fsck_err: return ret; create_root: - bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, + 
bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|0755, 0, NULL); root_inode->bi_inum = BCACHEFS_ROOT_INO; @@ -778,7 +760,6 @@ static int check_lostfound(struct bch_fs *c, struct qstr lostfound = QSTR("lost+found"); struct bch_hash_info root_hash_info = bch2_hash_info_init(c, root_inode); - struct bkey_inode_buf packed; u64 inum; int ret; @@ -806,33 +787,20 @@ static int check_lostfound(struct bch_fs *c, fsck_err: return ret; create_lostfound: - root_inode->bi_nlink++; - - bch2_inode_pack(&packed, root_inode); - - ret = bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, - NULL, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); - if (ret) - return ret; - - bch2_inode_init(c, lostfound_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, - 0, root_inode); - - ret = bch2_inode_create(c, lostfound_inode, BLOCKDEV_INODE_MAX, 0, - &c->unused_inode_hint); + bch2_inode_init_early(c, lostfound_inode); + + ret = bch2_trans_do(c, NULL, + BTREE_INSERT_ATOMIC| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + bch2_create_trans(&trans, + BCACHEFS_ROOT_INO, root_inode, + lostfound_inode, &lostfound, + 0, 0, S_IFDIR|0755, 0, NULL, NULL)); if (ret) - return ret; - - ret = bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR, - &lostfound, lostfound_inode->bi_inum, NULL, - BTREE_INSERT_NOFAIL| - BTREE_INSERT_LAZY_RW); - if (ret) - return ret; + bch_err(c, "error creating lost+found: %i", ret); - return 0; + return ret; } struct inode_bitmap { diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 3dc46faaebbc..aeae536b39f1 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -297,11 +297,9 @@ void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, pr_buf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); } -void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, - uid_t uid, gid_t gid, umode_t mode, dev_t rdev, - struct bch_inode_unpacked *parent) +void bch2_inode_init_early(struct bch_fs *c, + struct 
bch_inode_unpacked *inode_u) { - s64 now = bch2_current_time(c); enum bch_str_hash_type str_hash = bch2_str_hash_opt_to_type(c, c->opts.str_hash); @@ -311,7 +309,12 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET; get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed)); +} +void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, + uid_t uid, gid_t gid, umode_t mode, dev_t rdev, + struct bch_inode_unpacked *parent) +{ inode_u->bi_mode = mode; inode_u->bi_uid = uid; inode_u->bi_gid = gid; @@ -321,6 +324,12 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, inode_u->bi_ctime = now; inode_u->bi_otime = now; + if (parent && parent->bi_mode & S_ISGID) { + inode_u->bi_gid = parent->bi_gid; + if (S_ISDIR(mode)) + inode_u->bi_mode |= S_ISGID; + } + if (parent) { #define x(_name, ...) inode_u->bi_##_name = parent->bi_##_name; BCH_INODE_OPTS() @@ -328,6 +337,15 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, } } +void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, + uid_t uid, gid_t gid, umode_t mode, dev_t rdev, + struct bch_inode_unpacked *parent) +{ + bch2_inode_init_early(c, inode_u); + bch2_inode_init_late(inode_u, bch2_current_time(c), + uid, gid, mode, rdev, parent); +} + static inline u32 bkey_generation(struct bkey_s_c k) { switch (k.k->type) { @@ -340,9 +358,9 @@ static inline u32 bkey_generation(struct bkey_s_c k) } } -int __bch2_inode_create(struct btree_trans *trans, - struct bch_inode_unpacked *inode_u, - u64 min, u64 max, u64 *hint) +int bch2_inode_create(struct btree_trans *trans, + struct bch_inode_unpacked *inode_u, + u64 min, u64 max, u64 *hint) { struct bch_fs *c = trans->c; struct bkey_inode_buf *inode_p; @@ -408,13 +426,6 @@ out: return -ENOSPC; } -int bch2_inode_create(struct bch_fs *c, struct bch_inode_unpacked *inode_u, - u64 min, u64 max, u64 *hint) -{ - 
return bch2_trans_do(c, NULL, BTREE_INSERT_ATOMIC, - __bch2_inode_create(&trans, inode_u, min, max, hint)); -} - int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) { struct btree_trans trans; diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index c5626c668639..b32c0a47c25d 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -51,14 +51,17 @@ struct btree_iter *bch2_inode_peek(struct btree_trans *, int bch2_inode_write(struct btree_trans *, struct btree_iter *, struct bch_inode_unpacked *); +void bch2_inode_init_early(struct bch_fs *, + struct bch_inode_unpacked *); +void bch2_inode_init_late(struct bch_inode_unpacked *, u64, + uid_t, gid_t, umode_t, dev_t, + struct bch_inode_unpacked *); void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, uid_t, gid_t, umode_t, dev_t, struct bch_inode_unpacked *); -int __bch2_inode_create(struct btree_trans *, - struct bch_inode_unpacked *, - u64, u64, u64 *); -int bch2_inode_create(struct bch_fs *, struct bch_inode_unpacked *, +int bch2_inode_create(struct btree_trans *, + struct bch_inode_unpacked *, u64, u64, u64 *); int bch2_inode_rm(struct bch_fs *, u64); @@ -108,6 +111,11 @@ static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode, } } +static inline u8 mode_to_type(umode_t mode) +{ + return (mode >> 12) & 15; +} + /* i_nlink: */ static inline unsigned nlink_bias(umode_t mode) diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 2e880955a07c..e6015bc13e9b 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -10,6 +10,7 @@ #include "dirent.h" #include "ec.h" #include "error.h" +#include "fs-common.h" #include "fsck.h" #include "journal_io.h" #include "journal_reclaim.h" @@ -952,7 +953,6 @@ int bch2_fs_initialize(struct bch_fs *c) { struct bch_inode_unpacked root_inode, lostfound_inode; struct bkey_inode_buf packed_inode; - struct bch_hash_info root_hash_info; struct qstr lostfound = QSTR("lost+found"); const char *err = "cannot allocate memory"; struct 
bch_dev *ca; @@ -997,7 +997,6 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); root_inode.bi_inum = BCACHEFS_ROOT_INO; - root_inode.bi_nlink++; /* lost+found */ bch2_inode_pack(&packed_inode, &root_inode); err = "error creating root directory"; @@ -1007,24 +1006,15 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; - bch2_inode_init(c, &lostfound_inode, 0, 0, - S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, - &root_inode); - lostfound_inode.bi_inum = BCACHEFS_ROOT_INO + 1; - bch2_inode_pack(&packed_inode, &lostfound_inode); + bch2_inode_init_early(c, &lostfound_inode); err = "error creating lost+found"; - ret = bch2_btree_insert(c, BTREE_ID_INODES, - &packed_inode.inode.k_i, - NULL, NULL, 0); - if (ret) - goto err; - - root_hash_info = bch2_hash_info_init(c, &root_inode); - - ret = bch2_dirent_create(c, BCACHEFS_ROOT_INO, &root_hash_info, DT_DIR, - &lostfound, lostfound_inode.bi_inum, NULL, - BTREE_INSERT_NOFAIL); + ret = bch2_trans_do(c, NULL, BTREE_INSERT_ATOMIC, + bch2_create_trans(&trans, BCACHEFS_ROOT_INO, + &root_inode, &lostfound_inode, + &lostfound, + 0, 0, 0755, 0, + NULL, NULL)); if (ret) goto err; -- cgit v1.2.3 From eb8e6e9ccbb4ba37c04a7cff032975b4df7d63c7 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 11 Nov 2020 12:33:12 -0500 Subject: bcachefs: Deadlock prevention for ei_pagecache_lock In the dio write path, when get_user_pages() invokes the fault handler we have a recursive locking situation - we have to handle the lock ordering ourselves or we have a deadlock: this patch addresses that by checking for locking ordering violations and doing the unlock/relock dance if necessary. 
Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++-- fs/bcachefs/fs.c | 5 ++++ fs/bcachefs/fs.h | 1 + 3 files changed, 72 insertions(+), 2 deletions(-) (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 658d19c04b99..1afdd775ffb3 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -44,6 +44,22 @@ static inline bool bio_full(struct bio *bio, unsigned len) return false; } +static inline struct address_space *faults_disabled_mapping(void) +{ + return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); +} + +static inline void set_fdm_dropped_locks(void) +{ + current->faults_disabled_mapping = + (void *) (((unsigned long) current->faults_disabled_mapping)|1); +} + +static inline bool fdm_dropped_locks(void) +{ + return ((unsigned long) current->faults_disabled_mapping) & 1; +} + struct quota_res { u64 sectors; }; @@ -501,10 +517,35 @@ static void bch2_set_page_dirty(struct bch_fs *c, vm_fault_t bch2_page_fault(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; + struct address_space *mapping = file->f_mapping; + struct address_space *fdm = faults_disabled_mapping(); struct bch_inode_info *inode = file_bch_inode(file); int ret; + if (fdm == mapping) + return VM_FAULT_SIGBUS; + + /* Lock ordering: */ + if (fdm > mapping) { + struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); + + if (bch2_pagecache_add_tryget(&inode->ei_pagecache_lock)) + goto got_lock; + + bch2_pagecache_block_put(&fdm_host->ei_pagecache_lock); + + bch2_pagecache_add_get(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(&inode->ei_pagecache_lock); + + bch2_pagecache_block_get(&fdm_host->ei_pagecache_lock); + + /* Signal that lock has been dropped: */ + set_fdm_dropped_locks(); + return VM_FAULT_SIGBUS; + } + bch2_pagecache_add_get(&inode->ei_pagecache_lock); +got_lock: ret = filemap_fault(vmf); 
bch2_pagecache_add_put(&inode->ei_pagecache_lock); @@ -1765,14 +1806,16 @@ static long bch2_dio_write_loop(struct dio_write *dio) struct bio *bio = &dio->op.wbio.bio; struct bvec_iter_all iter; struct bio_vec *bv; - unsigned unaligned; - bool sync = dio->sync; + unsigned unaligned, iter_count; + bool sync = dio->sync, dropped_locks; long ret; if (dio->loop) goto loop; while (1) { + iter_count = dio->iter.count; + if (kthread) kthread_use_mm(dio->mm); BUG_ON(current->faults_disabled_mapping); @@ -1780,13 +1823,34 @@ static long bch2_dio_write_loop(struct dio_write *dio) ret = bio_iov_iter_get_pages(bio, &dio->iter); + dropped_locks = fdm_dropped_locks(); + current->faults_disabled_mapping = NULL; if (kthread) kthread_unuse_mm(dio->mm); + /* + * If the fault handler returned an error but also signalled + * that it dropped & retook ei_pagecache_lock, we just need to + * re-shoot down the page cache and retry: + */ + if (dropped_locks && ret) + ret = 0; + if (unlikely(ret < 0)) goto err; + if (unlikely(dropped_locks)) { + ret = write_invalidate_inode_pages_range(mapping, + req->ki_pos, + req->ki_pos + iter_count - 1); + if (unlikely(ret)) + goto err; + + if (!bio->bi_iter.bi_size) + continue; + } + unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); bio->bi_iter.bi_size -= unaligned; iov_iter_revert(&dio->iter, unaligned); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 3e3ab4e53f33..231a5433577f 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -93,6 +93,11 @@ void bch2_pagecache_add_put(struct pagecache_lock *lock) __pagecache_lock_put(lock, 1); } +bool bch2_pagecache_add_tryget(struct pagecache_lock *lock) +{ + return __pagecache_lock_tryget(lock, 1); +} + void bch2_pagecache_add_get(struct pagecache_lock *lock) { __pagecache_lock_get(lock, 1); diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index b3a2993dd9bc..7c095b856b05 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -26,6 +26,7 @@ static inline void pagecache_lock_init(struct 
pagecache_lock *lock) } void bch2_pagecache_add_put(struct pagecache_lock *); +bool bch2_pagecache_add_tryget(struct pagecache_lock *); void bch2_pagecache_add_get(struct pagecache_lock *); void bch2_pagecache_block_put(struct pagecache_lock *); void bch2_pagecache_block_get(struct pagecache_lock *); -- cgit v1.2.3 From 33c74e4119a91c3ae87fc207777e34fdbb613c66 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 3 Dec 2020 14:27:20 -0500 Subject: bcachefs: Flag inodes that had btree update errors On write error, the vfs inode's i_size may be inconsistent with the btree inode's i_size - flag this so we don't have spurious assertions. Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 13 +++++++++++-- fs/bcachefs/fs.c | 1 + fs/bcachefs/fs.h | 7 +++++++ 3 files changed, 19 insertions(+), 2 deletions(-) (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index c10192e2a688..2d31547446ac 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -994,6 +994,8 @@ static void bch2_writepage_io_done(struct closure *cl) unsigned i; if (io->op.error) { + set_bit(EI_INODE_ERROR, &io->inode->ei_flags); + bio_for_each_segment_all(bvec, bio, iter) { struct bch_page_state *s; @@ -1916,7 +1918,13 @@ loop: bio_for_each_segment_all(bv, bio, iter) put_page(bv->bv_page); - if (!dio->iter.count || dio->op.error) + + if (dio->op.error) { + set_bit(EI_INODE_ERROR, &inode->ei_flags); + break; + } + + if (!dio->iter.count) break; bio_reset(bio, NULL, REQ_OP_WRITE); @@ -2306,7 +2314,8 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) if (ret) goto err; - BUG_ON(inode->v.i_size < inode_u.bi_size); + WARN_ON(!test_bit(EI_INODE_ERROR, &inode->ei_flags) && + inode->v.i_size < inode_u.bi_size); if (iattr->ia_size > inode->v.i_size) { ret = bch2_extend(inode, &inode_u, iattr); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index a3810493826b..7cd3f243d1ed 100644 --- a/fs/bcachefs/fs.c +++ 
b/fs/bcachefs/fs.c @@ -1161,6 +1161,7 @@ static void bch2_vfs_inode_init(struct bch_fs *c, inode->v.i_generation = bi->bi_generation; inode->v.i_size = bi->bi_size; + inode->ei_flags = 0; inode->ei_journal_seq = 0; inode->ei_quota_reserved = 0; inode->ei_str_hash = bch2_hash_info_init(c, bi); diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 7c095b856b05..8c2796aa7abf 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -33,6 +33,7 @@ void bch2_pagecache_block_get(struct pagecache_lock *); struct bch_inode_info { struct inode v; + unsigned long ei_flags; struct mutex ei_update_lock; u64 ei_journal_seq; @@ -49,6 +50,12 @@ struct bch_inode_info { struct bch_inode_unpacked ei_inode; }; +/* + * Set if we've gotten a btree error for this inode, and thus the vfs inode and + * btree inode may be inconsistent: + */ +#define EI_INODE_ERROR 0 + #define to_bch_ei(_inode) \ container_of_or_null(_inode, struct bch_inode_info, v) -- cgit v1.2.3 From 07bca3bd1e5423b2d6fe8c7085af3e92b31c461f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 2 Mar 2021 18:35:30 -0500 Subject: bcachefs: Kill ei_str_hash Signed-off-by: Kent Overstreet Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 15 +++++++++------ fs/bcachefs/acl.h | 4 ++-- fs/bcachefs/fs-ioctl.c | 4 ++-- fs/bcachefs/fs.c | 7 +++---- fs/bcachefs/fs.h | 2 -- fs/bcachefs/xattr.c | 19 ++++++++++--------- 6 files changed, 26 insertions(+), 25 deletions(-) (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index acc1d03c79e4..3879815bcede 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -217,6 +217,7 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap, { struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c_xattr xattr; @@ -227,7 +228,7 @@ retry: bch2_trans_begin(&trans); iter = 
bch2_hash_lookup(&trans, bch2_xattr_hash_desc, - &inode->ei_str_hash, inode->v.i_ino, + &hash, inode->v.i_ino, &X_SEARCH(acl_to_xattr_type(type), "", 0), 0); if (IS_ERR(iter)) { @@ -290,6 +291,7 @@ int bch2_set_acl(struct mnt_idmap *idmap, struct btree_trans trans; struct btree_iter *inode_iter; struct bch_inode_unpacked inode_u; + struct bch_hash_info hash_info; struct posix_acl *acl; umode_t mode; int ret; @@ -314,9 +316,9 @@ retry: goto err; } - ret = bch2_set_acl_trans(&trans, &inode_u, - &inode->ei_str_hash, - acl, type); + hash_info = bch2_hash_info_init(c, &inode_u); + + ret = bch2_set_acl_trans(&trans, &inode_u, &hash_info, acl, type); if (ret) goto btree_err; @@ -345,10 +347,11 @@ err: } int bch2_acl_chmod(struct btree_trans *trans, - struct bch_inode_info *inode, + struct bch_inode_unpacked *inode, umode_t mode, struct posix_acl **new_acl) { + struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode); struct btree_iter *iter; struct bkey_s_c_xattr xattr; struct bkey_i_xattr *new; @@ -356,7 +359,7 @@ int bch2_acl_chmod(struct btree_trans *trans, int ret = 0; iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, - &inode->ei_str_hash, inode->v.i_ino, + &hash_info, inode->bi_inum, &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), BTREE_ITER_INTENT); if (IS_ERR(iter)) diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h index 73739e38e2d5..f11eb9d4592c 100644 --- a/fs/bcachefs/acl.h +++ b/fs/bcachefs/acl.h @@ -33,7 +33,7 @@ int bch2_set_acl_trans(struct btree_trans *, const struct bch_hash_info *, struct posix_acl *, int); int bch2_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); -int bch2_acl_chmod(struct btree_trans *, struct bch_inode_info *, +int bch2_acl_chmod(struct btree_trans *, struct bch_inode_unpacked *, umode_t, struct posix_acl **); #else @@ -47,7 +47,7 @@ static inline int bch2_set_acl_trans(struct btree_trans *trans, } static inline int bch2_acl_chmod(struct btree_trans *trans, - struct bch_inode_info 
*inode, + struct bch_inode_unpacked *inode, umode_t mode, struct posix_acl **new_acl) { diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index f6773783b958..09a9567b402c 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -183,6 +183,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, struct bch_inode_info *src, const char __user *name) { + struct bch_hash_info hash = bch2_hash_info_init(c, &src->ei_inode); struct bch_inode_info *dst; struct inode *vinode = NULL; char *kname = NULL; @@ -202,8 +203,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, qstr.name = kname; ret = -ENOENT; - inum = bch2_dirent_lookup(c, src->v.i_ino, - &src->ei_str_hash, + inum = bch2_dirent_lookup(c, src->v.i_ino, &hash, &qstr); if (!inum) goto err1; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index a168d09ffd37..ef8505da7391 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -370,11 +370,11 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir); + struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); struct inode *vinode = NULL; u64 inum; - inum = bch2_dirent_lookup(c, dir->v.i_ino, - &dir->ei_str_hash, + inum = bch2_dirent_lookup(c, dir->v.i_ino, &hash, &dentry->d_name); if (inum) @@ -723,7 +723,7 @@ retry: bch2_setattr_copy(idmap, inode, &inode_u, attr); if (attr->ia_valid & ATTR_MODE) { - ret = bch2_acl_chmod(&trans, inode, inode_u.bi_mode, &acl); + ret = bch2_acl_chmod(&trans, &inode_u, inode_u.bi_mode, &acl); if (ret) goto btree_err; } @@ -1150,7 +1150,6 @@ static void bch2_vfs_inode_init(struct bch_fs *c, inode->ei_flags = 0; inode->ei_journal_seq = 0; inode->ei_quota_reserved = 0; - inode->ei_str_hash = bch2_hash_info_init(c, bi); inode->ei_qid = bch_qid(bi); inode->v.i_mapping->a_ops = &bch_address_space_operations; diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 8c2796aa7abf..f3072780af51 100644 
--- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -44,8 +44,6 @@ struct bch_inode_info { struct mutex ei_quota_lock; struct bch_qid ei_qid; - struct bch_hash_info ei_str_hash; - /* copy of inode in btree: */ struct bch_inode_unpacked ei_inode; }; diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 5555d45df54e..5692b47eb3c9 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -121,6 +121,7 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, const char *name, void *buffer, size_t size, int type) { + struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); struct btree_trans trans; struct btree_iter *iter; struct bkey_s_c_xattr xattr; @@ -128,8 +129,8 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, bch2_trans_init(&trans, c, 0, 0); - iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, - &inode->ei_str_hash, inode->v.i_ino, + iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, &hash, + inode->v.i_ino, &X_SEARCH(type, name, strlen(name)), 0); if (IS_ERR(iter)) { @@ -239,7 +240,7 @@ static int bch2_xattr_emit(struct dentry *dentry, } static int bch2_xattr_list_bcachefs(struct bch_fs *c, - struct bch_inode_info *inode, + struct bch_inode_unpacked *inode, struct xattr_buf *buf, bool all) { @@ -249,12 +250,12 @@ static int bch2_xattr_list_bcachefs(struct bch_fs *c, u64 v; for (id = 0; id < Inode_opt_nr; id++) { - v = bch2_inode_opt_get(&inode->ei_inode, id); + v = bch2_inode_opt_get(inode, id); if (!v) continue; if (!all && - !(inode->ei_inode.bi_fields_set & (1 << id))) + !(inode->bi_fields_set & (1 << id))) continue; ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id], @@ -298,11 +299,11 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) if (ret) return ret; - ret = bch2_xattr_list_bcachefs(c, inode, &buf, false); + ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false); if (ret) return ret; - 
ret = bch2_xattr_list_bcachefs(c, inode, &buf, true); + ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true); if (ret) return ret; @@ -327,10 +328,10 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler, { struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0, - bch2_xattr_set(&trans, inode->v.i_ino, - &inode->ei_str_hash, + bch2_xattr_set(&trans, inode->v.i_ino, &hash, name, value, size, handler->flags, flags)); } -- cgit v1.2.3 From 68a507a2e8cdc9b90599bb5d220a696abdc54838 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 14 Jun 2021 22:29:54 -0400 Subject: bcachefs: fix truncate with ATTR_MODE After the v5.12 rebase, we started oopsing when truncate was passed ATTR_MODE, due to not passing mnt_userns to setattr_copy(). This refactors things so that truncate/extend finish by using bch2_setattr_nonsize(), which solves the problem. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 43 +++++++++++++++++++++++-------------------- fs/bcachefs/fs-io.h | 3 ++- fs/bcachefs/fs.c | 11 +++++++---- fs/bcachefs/fs.h | 4 ++++ 4 files changed, 36 insertions(+), 25 deletions(-) (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 0ffc3971d1b2..a25c3b70ef74 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -2252,11 +2252,11 @@ static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) from, round_up(from, PAGE_SIZE)); } -static int bch2_extend(struct bch_inode_info *inode, +static int bch2_extend(struct mnt_idmap *idmap, + struct bch_inode_info *inode, struct bch_inode_unpacked *inode_u, struct iattr *iattr) { - struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; int ret; @@ -2270,25 +2270,15 @@ static int bch2_extend(struct bch_inode_info *inode, return ret; truncate_setsize(&inode->v, iattr->ia_size); - /* ATTR_MODE will never be set here, ns argument isn't needed: */ - setattr_copy(NULL, &inode->v, iattr); - - mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode_size(c, inode, inode->v.i_size, - ATTR_MTIME|ATTR_CTIME); - mutex_unlock(&inode->ei_update_lock); - return ret; + return bch2_setattr_nonsize(idmap, inode, iattr); } static int bch2_truncate_finish_fn(struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { - struct bch_fs *c = inode->v.i_sb->s_fs_info; - bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; - bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); return 0; } @@ -2302,7 +2292,8 @@ static int bch2_truncate_start_fn(struct bch_inode_info *inode, return 0; } -int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) +int bch2_truncate(struct mnt_idmap *idmap, + struct bch_inode_info *inode, struct iattr *iattr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct address_space *mapping = inode->v.i_mapping; @@ -2313,6 +2304,18 @@ int 
bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) s64 i_sectors_delta = 0; int ret = 0; + /* + * Don't update timestamps if we're not doing anything: + */ + if (iattr->ia_size == inode->v.i_size) + return 0; + + if (!(iattr->ia_valid & ATTR_MTIME)) + ktime_get_coarse_real_ts64(&iattr->ia_mtime); + if (!(iattr->ia_valid & ATTR_CTIME)) + ktime_get_coarse_real_ts64(&iattr->ia_ctime); + iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME; + inode_dio_wait(&inode->v); bch2_pagecache_block_get(&inode->ei_pagecache_lock); @@ -2342,10 +2345,12 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) inode->v.i_size < inode_u.bi_size); if (iattr->ia_size > inode->v.i_size) { - ret = bch2_extend(inode, &inode_u, iattr); + ret = bch2_extend(idmap, inode, &inode_u, iattr); goto err; } + iattr->ia_valid &= ~ATTR_SIZE; + ret = bch2_truncate_page(inode, iattr->ia_size); if (unlikely(ret)) goto err; @@ -2389,13 +2394,11 @@ int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) if (unlikely(ret)) goto err; - /* ATTR_MODE will never be set here, ns argument isn't needed: */ - setattr_copy(NULL, &inode->v, iattr); - mutex_lock(&inode->ei_update_lock); - ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, - ATTR_MTIME|ATTR_CTIME); + ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, 0); mutex_unlock(&inode->ei_update_lock); + + ret = bch2_setattr_nonsize(idmap, inode, iattr); err: bch2_pagecache_block_put(&inode->ei_pagecache_lock); return ret; diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h index 2a2df58a46bb..64b16b44e25a 100644 --- a/fs/bcachefs/fs-io.h +++ b/fs/bcachefs/fs-io.h @@ -31,7 +31,8 @@ ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); int bch2_fsync(struct file *, loff_t, loff_t, int); -int bch2_truncate(struct bch_inode_info *, struct iattr *); +int bch2_truncate(struct mnt_idmap *, + struct bch_inode_info *, struct iattr *); long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); 
loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index efb467316756..71e738b98967 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -662,6 +662,9 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap, if (ia_valid & ATTR_GID) bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid); + if (ia_valid & ATTR_SIZE) + bi->bi_size = attr->ia_size; + if (ia_valid & ATTR_ATIME) bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); if (ia_valid & ATTR_MTIME) @@ -682,9 +685,9 @@ static void bch2_setattr_copy(struct mnt_idmap *idmap, } } -static int bch2_setattr_nonsize(struct mnt_idmap *idmap, - struct bch_inode_info *inode, - struct iattr *attr) +int bch2_setattr_nonsize(struct mnt_idmap *idmap, + struct bch_inode_info *inode, + struct iattr *attr) { struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_qid qid; @@ -808,7 +811,7 @@ static int bch2_setattr(struct mnt_idmap *idmap, return ret; return iattr->ia_valid & ATTR_SIZE - ? bch2_truncate(inode, iattr) + ? 
bch2_truncate(idmap, inode, iattr) : bch2_setattr_nonsize(idmap, inode, iattr); } diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index f3072780af51..c08a828d66cd 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -166,6 +166,10 @@ void bch2_inode_update_after_write(struct bch_fs *, int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, inode_set_fn, void *, unsigned); +int bch2_setattr_nonsize(struct mnt_idmap *, + struct bch_inode_info *, + struct iattr *); + void bch2_vfs_exit(void); int bch2_vfs_init(void); -- cgit v1.2.3 From 284ae18c1d7aa44232baedf860a004ceb32fea62 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 16 Mar 2021 01:33:39 -0400 Subject: bcachefs: Add subvolume to ei_inode_info Filesystem operations generally operate within a subvolume: at the start of every btree transaction we'll be looking up (and locking) the subvolume to get the current snapshot ID, which we then use for our other btree lookups in BTREE_ITER_FILTER_SNAPSHOTS mode. But inodes don't record what subvolume they're in - they can't, because if they did we'd have to update every single inode within a subvolume when taking a snapshot in order to keep that field up to date. So it needs to be tracked in memory, based on how we got to that inode. Hence this patch adds a subvolume field to ei_inode_info, and switches to iget5() so we can index by it in the inode hash table. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-ioctl.c | 6 ++-- fs/bcachefs/fs.c | 85 ++++++++++++++++++++++++++++++++++++-------------- fs/bcachefs/fs.h | 12 ++++++- 3 files changed, 76 insertions(+), 27 deletions(-) (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 6d6368555875..ff6b1739342d 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -192,7 +192,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, char *kname = NULL; struct qstr qstr; int ret = 0; - u64 inum; + subvol_inum inum = { .subvol = 1 }; kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); if (!kname) @@ -206,9 +206,9 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, qstr.name = kname; ret = -ENOENT; - inum = bch2_dirent_lookup(c, src->v.i_ino, &hash, + inum.inum = bch2_dirent_lookup(c, src->v.i_ino, &hash, &qstr); - if (!inum) + if (!inum.inum) goto err1; vinode = bch2_vfs_inode_get(c, inum); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 570ae826ebb5..7a994f3f9d20 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -37,7 +37,7 @@ static struct kmem_cache *bch2_inode_cache; -static void bch2_vfs_inode_init(struct bch_fs *, +static void bch2_vfs_inode_init(struct bch_fs *, subvol_inum, struct bch_inode_info *, struct bch_inode_unpacked *); @@ -209,40 +209,68 @@ int bch2_fs_quota_transfer(struct bch_fs *c, return ret; } -struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) +static int bch2_iget5_test(struct inode *vinode, void *p) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + subvol_inum *inum = p; + + return inode->ei_subvol == inum->subvol && + inode->ei_inode.bi_inum == inum->inum; +} + +static int bch2_iget5_set(struct inode *vinode, void *p) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + subvol_inum *inum = p; + + inode->v.i_ino = inum->inum; + inode->ei_subvol = inum->subvol; + inode->ei_inode.bi_inum = inum->inum; + return 0; +} + +static unsigned bch2_inode_hash(subvol_inum 
inum) +{ + return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL); +} + +struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) { struct bch_inode_unpacked inode_u; struct bch_inode_info *inode; int ret; - inode = to_bch_ei(iget_locked(c->vfs_sb, inum)); + /* + * debug assert, to be removed when we start creating + * subvolumes/snapshots: + */ + BUG_ON(inum.subvol != BCACHEFS_ROOT_SUBVOL); + + inode = to_bch_ei(iget5_locked(c->vfs_sb, + bch2_inode_hash(inum), + bch2_iget5_test, + bch2_iget5_set, + &inum)); if (unlikely(!inode)) return ERR_PTR(-ENOMEM); if (!(inode->v.i_state & I_NEW)) return &inode->v; - ret = bch2_inode_find_by_inum(c, inum, &inode_u); + ret = bch2_inode_find_by_inum(c, inum.inum, &inode_u); if (ret) { iget_failed(&inode->v); return ERR_PTR(ret); } - bch2_vfs_inode_init(c, inode, &inode_u); + bch2_vfs_inode_init(c, inum, inode, &inode_u); - inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum); + inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum.inum); unlock_new_inode(&inode->v); return &inode->v; } -static int inum_test(struct inode *inode, void *p) -{ - unsigned long *ino = p; - - return *ino == inode->i_ino; -} - static struct bch_inode_info * __bch2_create(struct mnt_idmap *idmap, struct bch_inode_info *dir, struct dentry *dentry, @@ -254,6 +282,7 @@ __bch2_create(struct mnt_idmap *idmap, struct bch_inode_info *inode, *old; struct bch_inode_unpacked inode_u; struct posix_acl *default_acl = NULL, *acl = NULL; + subvol_inum inum; u64 journal_seq = 0; int ret; @@ -310,7 +339,10 @@ err_before_quota: mutex_unlock(&dir->ei_update_lock); } - bch2_vfs_inode_init(c, inode, &inode_u); + inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; + inum.inum = inode_u.bi_inum; + + bch2_vfs_inode_init(c, inum, inode, &inode_u); journal_seq_copy(c, inode, journal_seq); set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); @@ -323,8 +355,12 @@ err_before_quota: */ inode->v.i_state |= I_CREATING; - old = 
to_bch_ei(inode_insert5(&inode->v, inode->v.i_ino, - inum_test, NULL, &inode->v.i_ino)); + + old = to_bch_ei(inode_insert5(&inode->v, + bch2_inode_hash(inum), + bch2_iget5_test, + bch2_iget5_set, + &inum)); BUG_ON(!old); if (unlikely(old != inode)) { @@ -370,12 +406,12 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, struct bch_inode_info *dir = to_bch_ei(vdir); struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); struct inode *vinode = NULL; - u64 inum; + subvol_inum inum = { .subvol = 1 }; - inum = bch2_dirent_lookup(c, dir->v.i_ino, &hash, + inum.inum = bch2_dirent_lookup(c, dir->v.i_ino, &hash, &dentry->d_name); - if (inum) + if (inum.inum) vinode = bch2_vfs_inode_get(c, inum); return d_splice_alias(vinode, dentry); @@ -1098,6 +1134,7 @@ static const struct address_space_operations bch_address_space_operations = { .error_remove_page = generic_error_remove_page, }; +#if 0 static struct inode *bch2_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { @@ -1131,14 +1168,15 @@ static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid, return generic_fh_to_parent(sb, fid, fh_len, fh_type, bch2_nfs_get_inode); } +#endif static const struct export_operations bch_export_ops = { - .fh_to_dentry = bch2_fh_to_dentry, - .fh_to_parent = bch2_fh_to_parent, + //.fh_to_dentry = bch2_fh_to_dentry, + //.fh_to_parent = bch2_fh_to_parent, //.get_parent = bch2_get_parent, }; -static void bch2_vfs_inode_init(struct bch_fs *c, +static void bch2_vfs_inode_init(struct bch_fs *c, subvol_inum inum, struct bch_inode_info *inode, struct bch_inode_unpacked *bi) { @@ -1154,6 +1192,7 @@ static void bch2_vfs_inode_init(struct bch_fs *c, inode->ei_journal_seq = 0; inode->ei_quota_reserved = 0; inode->ei_qid = bch_qid(bi); + inode->ei_subvol = inum.subvol; inode->v.i_mapping->a_ops = &bch_address_space_operations; @@ -1595,7 +1634,7 @@ got_sb: sb->s_flags |= SB_POSIXACL; #endif - vinode = bch2_vfs_inode_get(c, 
BCACHEFS_ROOT_INO); + vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); if (IS_ERR(vinode)) { bch_err(c, "error mounting: error getting root inode %i", (int) PTR_ERR(vinode)); diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index c08a828d66cd..6dae425bf616 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -44,10 +44,20 @@ struct bch_inode_info { struct mutex ei_quota_lock; struct bch_qid ei_qid; + u32 ei_subvol; + /* copy of inode in btree: */ struct bch_inode_unpacked ei_inode; }; +static inline subvol_inum inode_inum(struct bch_inode_info *inode) +{ + return (subvol_inum) { + .subvol = inode->ei_subvol, + .inum = inode->ei_inode.bi_inum, + }; +} + /* * Set if we've gotten a btree error for this inode, and thus the vfs inode and * btree inode may be inconsistent: @@ -153,7 +163,7 @@ static inline int bch2_set_projid(struct bch_fs *c, KEY_TYPE_QUOTA_PREALLOC); } -struct inode *bch2_vfs_inode_get(struct bch_fs *, u64); +struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum); /* returns 0 if we want to do the update, or error is passed up */ typedef int (*inode_set_fn)(struct bch_inode_info *, -- cgit v1.2.3 From 6fed42bb7750e217b0d1169ccfccc7639a3e1d3f Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 16 Mar 2021 00:28:17 -0400 Subject: bcachefs: Plumb through subvolume id To implement snapshots, we need every filesystem btree operation (every btree operation without a subvolume) to start by looking up the subvolume and getting the current snapshot ID, with bch2_subvolume_get_snapshot() - then, that snapshot ID is used for doing btree lookups in BTREE_ITER_FILTER_SNAPSHOTS mode. This patch adds those bch2_subvolume_get_snapshot() calls, and also switches to passing around a subvol_inum instead of just an inode number. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 25 +++++----- fs/bcachefs/acl.h | 11 ++--- fs/bcachefs/dirent.c | 107 +++++++++++++++++++++++++++------------- fs/bcachefs/dirent.h | 29 +++++------ fs/bcachefs/extents.c | 32 ------------ fs/bcachefs/extents.h | 1 - fs/bcachefs/fs-common.c | 127 ++++++++++++++++++++++++++++++------------------ fs/bcachefs/fs-common.h | 21 ++++---- fs/bcachefs/fs-io.c | 117 +++++++++++++++++++++++++++++++++++++++----- fs/bcachefs/fs-ioctl.c | 8 ++- fs/bcachefs/fs.c | 77 +++++++++++++++++------------ fs/bcachefs/fs.h | 4 ++ fs/bcachefs/fsck.c | 5 +- fs/bcachefs/inode.c | 109 +++++++++++++++++++++++++++++++++-------- fs/bcachefs/inode.h | 7 +-- fs/bcachefs/io.c | 5 +- fs/bcachefs/move.c | 3 +- fs/bcachefs/recovery.c | 5 +- fs/bcachefs/reflink.c | 18 ++++++- fs/bcachefs/reflink.h | 4 +- fs/bcachefs/str_hash.h | 41 +++++++++++----- fs/bcachefs/xattr.c | 23 +++++++-- fs/bcachefs/xattr.h | 3 +- 23 files changed, 526 insertions(+), 256 deletions(-) (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 93b78e4e6e0d..2afa15b26700 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -230,7 +230,7 @@ retry: bch2_trans_begin(&trans); ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc, - &hash, inode->v.i_ino, + &hash, inode_inum(inode), &X_SEARCH(acl_to_xattr_type(type), "", 0), 0); if (ret) { @@ -260,11 +260,11 @@ out: return acl; } -int bch2_set_acl_trans(struct btree_trans *trans, +int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *inode_u, - const struct bch_hash_info *hash_info, struct posix_acl *acl, int type) { + struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u); int ret; if (type == ACL_TYPE_DEFAULT && @@ -277,14 +277,14 @@ int bch2_set_acl_trans(struct btree_trans *trans, if (IS_ERR(xattr)) return PTR_ERR(xattr); - ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, - inode_u->bi_inum, &xattr->k_i, 0); + ret = 
bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info, + inum, &xattr->k_i, 0); } else { struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); - ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info, - inode_u->bi_inum, &search); + ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info, + inum, &search); } return ret == -ENOENT ? 0 : ret; @@ -299,7 +299,6 @@ int bch2_set_acl(struct mnt_idmap *idmap, struct btree_trans trans; struct btree_iter inode_iter = { NULL }; struct bch_inode_unpacked inode_u; - struct bch_hash_info hash_info; struct posix_acl *acl; umode_t mode; int ret; @@ -310,7 +309,7 @@ retry: bch2_trans_begin(&trans); acl = _acl; - ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode->v.i_ino, + ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode), BTREE_ITER_INTENT); if (ret) goto btree_err; @@ -323,9 +322,7 @@ retry: goto btree_err; } - hash_info = bch2_hash_info_init(c, &inode_u); - - ret = bch2_set_acl_trans(&trans, &inode_u, &hash_info, acl, type); + ret = bch2_set_acl_trans(&trans, inode_inum(inode), &inode_u, acl, type); if (ret) goto btree_err; @@ -354,7 +351,7 @@ err: return ret; } -int bch2_acl_chmod(struct btree_trans *trans, +int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *inode, umode_t mode, struct posix_acl **new_acl) @@ -368,7 +365,7 @@ int bch2_acl_chmod(struct btree_trans *trans, int ret; ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, - &hash_info, inode->bi_inum, + &hash_info, inum, &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), BTREE_ITER_INTENT); if (ret) diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h index f11eb9d4592c..bb21d8d696a2 100644 --- a/fs/bcachefs/acl.h +++ b/fs/bcachefs/acl.h @@ -28,25 +28,24 @@ typedef struct { struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int); -int bch2_set_acl_trans(struct btree_trans *, +int bch2_set_acl_trans(struct btree_trans *, subvol_inum, 
struct bch_inode_unpacked *, - const struct bch_hash_info *, struct posix_acl *, int); int bch2_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); -int bch2_acl_chmod(struct btree_trans *, struct bch_inode_unpacked *, +int bch2_acl_chmod(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *, umode_t, struct posix_acl **); #else -static inline int bch2_set_acl_trans(struct btree_trans *trans, +static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *inode_u, - const struct bch_hash_info *hash_info, struct posix_acl *acl, int type) { return 0; } -static inline int bch2_acl_chmod(struct btree_trans *trans, +static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, struct bch_inode_unpacked *inode, umode_t mode, struct posix_acl **new_acl) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index f3aef0686928..f290580594ce 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -8,6 +8,7 @@ #include "fs.h" #include "keylist.h" #include "str_hash.h" +#include "subvolume.h" #include @@ -150,8 +151,8 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, return dirent; } -int bch2_dirent_create(struct btree_trans *trans, - u64 dir_inum, const struct bch_hash_info *hash_info, +int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, + const struct bch_hash_info *hash_info, u8 type, const struct qstr *name, u64 dst_inum, u64 *dir_offset, int flags) { @@ -164,7 +165,7 @@ int bch2_dirent_create(struct btree_trans *trans, return ret; ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, - dir_inum, &dirent->k_i, flags); + dir, &dirent->k_i, flags); *dir_offset = dirent->k.p.offset; return ret; @@ -223,31 +224,40 @@ err: return ret; } -int bch2_dirent_read_target(struct btree_trans *trans, - struct bkey_s_c_dirent d, u64 *target) +static int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, + struct 
bkey_s_c_dirent d, subvol_inum *target) { - u32 subvol, snapshot; + u32 snapshot; + int ret = 0; - return __bch2_dirent_read_target(trans, d, &subvol, - &snapshot, target, false); + ret = __bch2_dirent_read_target(trans, d, &target->subvol, &snapshot, + &target->inum, false); + if (!target->subvol) + target->subvol = dir.subvol; + + return ret; } int bch2_dirent_rename(struct btree_trans *trans, - u64 src_dir, struct bch_hash_info *src_hash, - u64 dst_dir, struct bch_hash_info *dst_hash, - const struct qstr *src_name, u64 *src_inum, u64 *src_offset, - const struct qstr *dst_name, u64 *dst_inum, u64 *dst_offset, - enum bch_rename_mode mode) + subvol_inum src_dir, struct bch_hash_info *src_hash, + subvol_inum dst_dir, struct bch_hash_info *dst_hash, + const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset, + const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset, + enum bch_rename_mode mode) { struct btree_iter src_iter = { NULL }; struct btree_iter dst_iter = { NULL }; struct bkey_s_c old_src, old_dst; struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; struct bpos dst_pos = - POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name)); + POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name)); int ret = 0; - *src_inum = *dst_inum = 0; + if (src_dir.subvol != dst_dir.subvol) + return -EXDEV; + + memset(src_inum, 0, sizeof(*src_inum)); + memset(dst_inum, 0, sizeof(*dst_inum)); /* * Lookup dst: @@ -270,8 +280,12 @@ int bch2_dirent_rename(struct btree_trans *trans, if (ret) goto out; - if (mode != BCH_RENAME) - *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum); + if (mode != BCH_RENAME) { + ret = bch2_dirent_read_target(trans, dst_dir, + bkey_s_c_to_dirent(old_dst), dst_inum); + if (ret) + goto out; + } if (mode != BCH_RENAME_EXCHANGE) *src_offset = dst_iter.pos.offset; @@ -287,7 +301,10 @@ int bch2_dirent_rename(struct btree_trans *trans, if (ret) goto out; - *src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum); + ret = 
bch2_dirent_read_target(trans, src_dir, + bkey_s_c_to_dirent(old_src), src_inum); + if (ret) + goto out; /* Create new dst key: */ new_dst = dirent_create_key(trans, 0, dst_name, 0); @@ -376,17 +393,22 @@ int bch2_dirent_delete_at(struct btree_trans *trans, int __bch2_dirent_lookup_trans(struct btree_trans *trans, struct btree_iter *iter, - u64 dir_inum, + subvol_inum dir, const struct bch_hash_info *hash_info, - const struct qstr *name, u64 *inum, + const struct qstr *name, subvol_inum *inum, unsigned flags) { struct bkey_s_c k; struct bkey_s_c_dirent d; + u32 snapshot; int ret; + ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); + if (ret) + return ret; + ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, - hash_info, dir_inum, name, flags); + hash_info, dir, name, flags); if (ret) return ret; @@ -399,44 +421,49 @@ int __bch2_dirent_lookup_trans(struct btree_trans *trans, d = bkey_s_c_to_dirent(k); - ret = bch2_dirent_read_target(trans, d, inum); + ret = bch2_dirent_read_target(trans, dir, d, inum); if (ret) bch2_trans_iter_exit(trans, iter); return ret; } -u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, +u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, const struct bch_hash_info *hash_info, - const struct qstr *name) + const struct qstr *name, subvol_inum *inum) { struct btree_trans trans; struct btree_iter iter; - u64 inum = 0; - int ret = 0; + int ret; bch2_trans_init(&trans, c, 0, 0); retry: bch2_trans_begin(&trans); - ret = __bch2_dirent_lookup_trans(&trans, &iter, dir_inum, hash_info, - name, &inum, 0); + + ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info, + name, inum, 0); bch2_trans_iter_exit(&trans, &iter); if (ret == -EINTR) goto retry; bch2_trans_exit(&trans); - return inum; + return ret; } -int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) +int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) { struct btree_iter iter; struct bkey_s_c k; + u32 snapshot; int ret; + ret 
= bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); + if (ret) + return ret; + for_each_btree_key(trans, iter, BTREE_ID_dirents, - POS(dir_inum, 0), 0, k, ret) { - if (k.k->p.inode > dir_inum) + SPOS(dir.inum, 0, snapshot), 0, k, ret) { + if (k.k->p.inode > dir.inum) break; if (k.k->type == KEY_TYPE_dirent) { @@ -449,19 +476,26 @@ int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) return ret; } -int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) +int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) { struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_dirent dirent; + u32 snapshot; int ret; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; for_each_btree_key(&trans, iter, BTREE_ID_dirents, - POS(inum, ctx->pos), 0, k, ret) { - if (k.k->p.inode > inum) + SPOS(inum.inum, ctx->pos, snapshot), 0, k, ret) { + if (k.k->p.inode > inum.inum) break; if (k.k->type != KEY_TYPE_dirent) @@ -482,6 +516,9 @@ int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) ctx->pos = dirent.k->p.offset + 1; } bch2_trans_iter_exit(&trans, &iter); +err: + if (ret == -EINTR) + goto retry; ret = bch2_trans_exit(&trans) ?: ret; diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 3cd05a2454e1..88b784a99cb5 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -29,7 +29,7 @@ static inline unsigned dirent_val_u64s(unsigned len) sizeof(u64)); } -int bch2_dirent_create(struct btree_trans *, u64, +int bch2_dirent_create(struct btree_trans *, subvol_inum, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, int); @@ -40,9 +40,6 @@ int bch2_dirent_delete_at(struct btree_trans *, int __bch2_dirent_read_target(struct btree_trans *, struct bkey_s_c_dirent, u32 *, u32 *, u64 *, bool); -int bch2_dirent_read_target(struct btree_trans *, - struct 
bkey_s_c_dirent, u64 *); - static inline unsigned vfs_d_type(unsigned type) { return type == DT_SUBVOL ? DT_DIR : type; @@ -55,20 +52,20 @@ enum bch_rename_mode { }; int bch2_dirent_rename(struct btree_trans *, - u64, struct bch_hash_info *, - u64, struct bch_hash_info *, - const struct qstr *, u64 *, u64 *, - const struct qstr *, u64 *, u64 *, + subvol_inum, struct bch_hash_info *, + subvol_inum, struct bch_hash_info *, + const struct qstr *, subvol_inum *, u64 *, + const struct qstr *, subvol_inum *, u64 *, enum bch_rename_mode); -int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, u64, - const struct bch_hash_info *, - const struct qstr *, u64 *, - unsigned); -u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, - const struct qstr *); +int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, + subvol_inum, const struct bch_hash_info *, + const struct qstr *, subvol_inum *, unsigned); +u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, + const struct bch_hash_info *, + const struct qstr *, subvol_inum *); -int bch2_empty_dir_trans(struct btree_trans *, u64); -int bch2_readdir(struct bch_fs *, u64, struct dir_context *); +int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); +int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); #endif /* _BCACHEFS_DIRENT_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 0190605711e5..966d6ef41793 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -611,38 +611,6 @@ bool bch2_bkey_is_incompressible(struct bkey_s_c k) return false; } -bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, - unsigned nr_replicas, bool compressed) -{ - struct btree_trans trans; - struct btree_iter iter; - struct bpos end = pos; - struct bkey_s_c k; - bool ret = true; - int err; - - end.offset += size; - - bch2_trans_init(&trans, c, 0, 0); - - for_each_btree_key(&trans, iter, BTREE_ID_extents, pos, - 
BTREE_ITER_SLOTS, k, err) { - if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) - break; - - if (nr_replicas > bch2_bkey_replicas(c, k) || - (!compressed && bch2_bkey_sectors_compressed(k))) { - ret = false; - break; - } - } - bch2_trans_iter_exit(&trans, &iter); - - bch2_trans_exit(&trans); - - return ret; -} - unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 43cef0a3bdf3..afd3067bb64e 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -567,7 +567,6 @@ unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); bool bch2_bkey_is_incompressible(struct bkey_s_c); unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); -bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned, bool); unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c); unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 96b09b005d0b..02bf32cc7659 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -6,28 +6,38 @@ #include "dirent.h" #include "fs-common.h" #include "inode.h" +#include "subvolume.h" #include "xattr.h" #include -int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, +int bch2_create_trans(struct btree_trans *trans, + subvol_inum dir, struct bch_inode_unpacked *dir_u, struct bch_inode_unpacked *new_inode, const struct qstr *name, uid_t uid, gid_t gid, umode_t mode, dev_t rdev, struct posix_acl *default_acl, - struct posix_acl *acl) + struct posix_acl *acl, + unsigned flags) { struct bch_fs *c = trans->c; struct btree_iter dir_iter = { NULL }; struct btree_iter inode_iter = { NULL }; - struct bch_hash_info hash = bch2_hash_info_init(c, new_inode); + subvol_inum new_inum = dir; u64 now = bch2_current_time(c); u64 cpu = raw_smp_processor_id(); u64 dir_offset = 0; + 
u64 dir_target; + u32 snapshot; + unsigned dir_type; int ret; - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT); + ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); + if (ret) + goto err; + + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); if (ret) goto err; @@ -36,19 +46,23 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, if (!name) new_inode->bi_flags |= BCH_INODE_UNLINKED; - ret = bch2_inode_create(trans, &inode_iter, new_inode, U32_MAX, cpu); + ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu); if (ret) goto err; + new_inum.inum = new_inode->bi_inum; + dir_target = new_inode->bi_inum; + dir_type = mode_to_type(new_inode->bi_mode); + if (default_acl) { - ret = bch2_set_acl_trans(trans, new_inode, &hash, + ret = bch2_set_acl_trans(trans, new_inum, new_inode, default_acl, ACL_TYPE_DEFAULT); if (ret) goto err; } if (acl) { - ret = bch2_set_acl_trans(trans, new_inode, &hash, + ret = bch2_set_acl_trans(trans, new_inum, new_inode, acl, ACL_TYPE_ACCESS); if (ret) goto err; @@ -56,18 +70,19 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, if (name) { struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u); - dir_u->bi_mtime = dir_u->bi_ctime = now; if (S_ISDIR(new_inode->bi_mode)) dir_u->bi_nlink++; + dir_u->bi_mtime = dir_u->bi_ctime = now; ret = bch2_inode_write(trans, &dir_iter, dir_u); if (ret) goto err; - ret = bch2_dirent_create(trans, dir_inum, &dir_hash, - mode_to_type(new_inode->bi_mode), - name, new_inode->bi_inum, + ret = bch2_dirent_create(trans, dir, &dir_hash, + dir_type, + name, + dir_target, &dir_offset, BCH_HASH_SET_MUST_CREATE); if (ret) @@ -79,9 +94,8 @@ int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, new_inode->bi_dir_offset = dir_offset; } - /* XXX use bch2_btree_iter_set_snapshot() */ - inode_iter.snapshot = U32_MAX; - bch2_btree_iter_set_pos(&inode_iter, SPOS(0, new_inode->bi_inum, U32_MAX)); + 
inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS; + bch2_btree_iter_set_snapshot(&inode_iter, snapshot); ret = bch2_btree_iter_traverse(&inode_iter) ?: bch2_inode_write(trans, &inode_iter, new_inode); @@ -91,9 +105,10 @@ err: return ret; } -int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, - u64 inum, struct bch_inode_unpacked *dir_u, - struct bch_inode_unpacked *inode_u, const struct qstr *name) +int bch2_link_trans(struct btree_trans *trans, + subvol_inum dir, struct bch_inode_unpacked *dir_u, + subvol_inum inum, struct bch_inode_unpacked *inode_u, + const struct qstr *name) { struct bch_fs *c = trans->c; struct btree_iter dir_iter = { NULL }; @@ -103,6 +118,9 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, u64 dir_offset = 0; int ret; + if (dir.subvol != inum.subvol) + return -EXDEV; + ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); if (ret) goto err; @@ -110,7 +128,7 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, inode_u->bi_ctime = now; bch2_inode_nlink_inc(inode_u); - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); if (ret) goto err; @@ -118,15 +136,15 @@ int bch2_link_trans(struct btree_trans *trans, u64 dir_inum, dir_hash = bch2_hash_info_init(c, dir_u); - ret = bch2_dirent_create(trans, dir_inum, &dir_hash, + ret = bch2_dirent_create(trans, dir, &dir_hash, mode_to_type(inode_u->bi_mode), - name, inum, &dir_offset, + name, inum.inum, &dir_offset, BCH_HASH_SET_MUST_CREATE); if (ret) goto err; if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { - inode_u->bi_dir = dir_inum; + inode_u->bi_dir = dir.inum; inode_u->bi_dir_offset = dir_offset; } @@ -139,7 +157,8 @@ err: } int bch2_unlink_trans(struct btree_trans *trans, - u64 dir_inum, struct bch_inode_unpacked *dir_u, + subvol_inum dir, + struct bch_inode_unpacked *dir_u, struct bch_inode_unpacked *inode_u, const struct qstr 
*name) { @@ -148,39 +167,49 @@ int bch2_unlink_trans(struct btree_trans *trans, struct btree_iter dirent_iter = { NULL }; struct btree_iter inode_iter = { NULL }; struct bch_hash_info dir_hash; - u64 inum, now = bch2_current_time(c); - struct bkey_s_c k; + subvol_inum inum; + u64 now = bch2_current_time(c); int ret; - ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir_inum, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); if (ret) goto err; dir_hash = bch2_hash_info_init(c, dir_u); - ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir_inum, &dir_hash, + ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, name, &inum, BTREE_ITER_INTENT); if (ret) goto err; - ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); + ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, + BTREE_ITER_INTENT); if (ret) goto err; - if (inode_u->bi_dir == k.k->p.inode && - inode_u->bi_dir_offset == k.k->p.offset) { + if (inode_u->bi_dir == dirent_iter.pos.inode && + inode_u->bi_dir_offset == dirent_iter.pos.offset) { inode_u->bi_dir = 0; inode_u->bi_dir_offset = 0; } + if (S_ISDIR(inode_u->bi_mode)) { + ret = bch2_empty_dir_trans(trans, inum); + if (ret) + goto err; + } + + if (dir.subvol != inum.subvol) { + ret = bch2_subvolume_delete(trans, inum.subvol, false); + if (ret) + goto err; + } + dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode); bch2_inode_nlink_dec(inode_u); - ret = (S_ISDIR(inode_u->bi_mode) - ? 
bch2_empty_dir_trans(trans, inum) - : 0) ?: - bch2_dirent_delete_at(trans, &dir_hash, &dirent_iter) ?: + ret = bch2_dirent_delete_at(trans, &dir_hash, &dirent_iter) ?: bch2_inode_write(trans, &dir_iter, dir_u) ?: bch2_inode_write(trans, &inode_iter, inode_u); err: @@ -215,8 +244,8 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, } int bch2_rename_trans(struct btree_trans *trans, - u64 src_dir, struct bch_inode_unpacked *src_dir_u, - u64 dst_dir, struct bch_inode_unpacked *dst_dir_u, + subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u, + subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u, struct bch_inode_unpacked *src_inode_u, struct bch_inode_unpacked *dst_inode_u, const struct qstr *src_name, @@ -229,7 +258,8 @@ int bch2_rename_trans(struct btree_trans *trans, struct btree_iter src_inode_iter = { NULL }; struct btree_iter dst_inode_iter = { NULL }; struct bch_hash_info src_hash, dst_hash; - u64 src_inode, src_offset, dst_inode, dst_offset; + subvol_inum src_inum, dst_inum; + u64 src_offset, dst_offset; u64 now = bch2_current_time(c); int ret; @@ -240,7 +270,8 @@ int bch2_rename_trans(struct btree_trans *trans, src_hash = bch2_hash_info_init(c, src_dir_u); - if (dst_dir != src_dir) { + if (dst_dir.inum != src_dir.inum || + dst_dir.subvol != src_dir.subvol) { ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir, BTREE_ITER_INTENT); if (ret) @@ -255,19 +286,19 @@ int bch2_rename_trans(struct btree_trans *trans, ret = bch2_dirent_rename(trans, src_dir, &src_hash, dst_dir, &dst_hash, - src_name, &src_inode, &src_offset, - dst_name, &dst_inode, &dst_offset, + src_name, &src_inum, &src_offset, + dst_name, &dst_inum, &dst_offset, mode); if (ret) goto err; - ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inode, + ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum, BTREE_ITER_INTENT); if (ret) goto err; - if (dst_inode) { - ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inode, + if 
(dst_inum.inum) { + ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum, BTREE_ITER_INTENT); if (ret) goto err; @@ -298,7 +329,7 @@ int bch2_rename_trans(struct btree_trans *trans, } if (S_ISDIR(dst_inode_u->bi_mode) && - bch2_empty_dir_trans(trans, dst_inode)) { + bch2_empty_dir_trans(trans, dst_inum)) { ret = -ENOTEMPTY; goto err; } @@ -322,7 +353,7 @@ int bch2_rename_trans(struct btree_trans *trans, dst_dir_u->bi_nlink++; } - if (dst_inode && S_ISDIR(dst_inode_u->bi_mode)) { + if (dst_inum.inum && S_ISDIR(dst_inode_u->bi_mode)) { dst_dir_u->bi_nlink--; src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; } @@ -333,22 +364,22 @@ int bch2_rename_trans(struct btree_trans *trans, src_dir_u->bi_mtime = now; src_dir_u->bi_ctime = now; - if (src_dir != dst_dir) { + if (src_dir.inum != dst_dir.inum) { dst_dir_u->bi_mtime = now; dst_dir_u->bi_ctime = now; } src_inode_u->bi_ctime = now; - if (dst_inode) + if (dst_inum.inum) dst_inode_u->bi_ctime = now; ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?: - (src_dir != dst_dir + (src_dir.inum != dst_dir.inum ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u) : 0 ) ?: bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?: - (dst_inode + (dst_inum.inum ? 
bch2_inode_write(trans, &dst_inode_iter, dst_inode_u) : 0 ); err: diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h index 2273b7961c9b..1bb2ac4dc13a 100644 --- a/fs/bcachefs/fs-common.h +++ b/fs/bcachefs/fs-common.h @@ -4,27 +4,30 @@ struct posix_acl; -int bch2_create_trans(struct btree_trans *, u64, +#define BCH_CREATE_TMPFILE (1U << 0) + +int bch2_create_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, struct bch_inode_unpacked *, const struct qstr *, uid_t, gid_t, umode_t, dev_t, struct posix_acl *, - struct posix_acl *); + struct posix_acl *, + unsigned); -int bch2_link_trans(struct btree_trans *, u64, - u64, struct bch_inode_unpacked *, - struct bch_inode_unpacked *, +int bch2_link_trans(struct btree_trans *, + subvol_inum, struct bch_inode_unpacked *, + subvol_inum, struct bch_inode_unpacked *, const struct qstr *); -int bch2_unlink_trans(struct btree_trans *, - u64, struct bch_inode_unpacked *, +int bch2_unlink_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *, struct bch_inode_unpacked *, const struct qstr *); int bch2_rename_trans(struct btree_trans *, - u64, struct bch_inode_unpacked *, - u64, struct bch_inode_unpacked *, + subvol_inum, struct bch_inode_unpacked *, + subvol_inum, struct bch_inode_unpacked *, struct bch_inode_unpacked *, struct bch_inode_unpacked *, const struct qstr *, diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 909db2f104cd..7a0772195182 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1790,6 +1790,49 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) /* O_DIRECT writes */ +static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, + u64 offset, u64 size, + unsigned nr_replicas, bool compressed) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + u64 end = offset + size; + u32 snapshot; + bool ret = true; + int err; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + 
err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (err) + goto err; + + for_each_btree_key(&trans, iter, BTREE_ID_extents, + SPOS(inum.inum, offset, snapshot), + BTREE_ITER_SLOTS, k, err) { + if (bkey_cmp(bkey_start_pos(k.k), POS(inum.inum, end)) >= 0) + break; + + if (nr_replicas > bch2_bkey_replicas(c, k) || + (!compressed && bch2_bkey_sectors_compressed(k))) { + ret = false; + break; + } + } + + offset = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: + if (err == -EINTR) + goto retry; + bch2_trans_exit(&trans); + + return err ? false : ret; +} + /* * We're going to return -EIOCBQUEUED, but we haven't finished consuming the * iov_iter yet, so we need to stash a copy of the iovec: it might be on the @@ -1911,8 +1954,8 @@ static long bch2_dio_write_loop(struct dio_write *dio) ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), dio->op.opts.data_replicas, 0); if (unlikely(ret) && - !bch2_check_range_allocated(c, dio->op.pos, - bio_sectors(bio), + !bch2_check_range_allocated(c, inode_inum(inode), + dio->op.pos.offset, bio_sectors(bio), dio->op.opts.data_replicas, dio->op.opts.compression != 0)) goto err; @@ -2141,9 +2184,9 @@ out: /* truncate: */ -static inline int range_has_data(struct bch_fs *c, - struct bpos start, - struct bpos end) +static inline int range_has_data(struct bch_fs *c, u32 subvol, + struct bpos start, + struct bpos end) { struct btree_trans trans; struct btree_iter iter; @@ -2151,6 +2194,12 @@ static inline int range_has_data(struct bch_fs *c, int ret = 0; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, subvol, &start.snapshot); + if (ret) + goto err; for_each_btree_key(&trans, iter, BTREE_ID_extents, start, 0, k, ret) { if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) @@ -2161,7 +2210,11 @@ static inline int range_has_data(struct bch_fs *c, break; } } + start = iter.pos; bch2_trans_iter_exit(&trans, &iter); +err: + if (ret == -EINTR) 
+ goto retry; return bch2_trans_exit(&trans) ?: ret; } @@ -2193,7 +2246,7 @@ static int __bch2_truncate_page(struct bch_inode_info *inode, * XXX: we're doing two index lookups when we end up reading the * page */ - ret = range_has_data(c, + ret = range_has_data(c, inode->ei_subvol, POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT), POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT)); if (ret <= 0) @@ -2327,7 +2380,7 @@ int bch2_truncate(struct mnt_idmap *idmap, inode_dio_wait(&inode->v); bch2_pagecache_block_get(&inode->ei_pagecache_lock); - ret = bch2_inode_find_by_inum(c, inode->v.i_ino, &inode_u); + ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u); if (ret) goto err; @@ -2551,6 +2604,18 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, struct bpos move_pos = POS(inode->v.i_ino, offset >> 9); struct bpos atomic_end; unsigned trigger_flags = 0; + u32 snapshot; + + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, + inode->ei_subvol, &snapshot); + if (ret) + continue; + + bch2_btree_iter_set_snapshot(&src, snapshot); + bch2_btree_iter_set_snapshot(&dst, snapshot); + bch2_btree_iter_set_snapshot(&del, snapshot); bch2_trans_begin(&trans); @@ -2671,9 +2736,17 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, struct bkey_i_reservation reservation; struct bkey_s_c k; unsigned sectors; + u32 snapshot; bch2_trans_begin(&trans); + ret = bch2_subvolume_get_snapshot(&trans, + inode->ei_subvol, &snapshot); + if (ret) + goto bkey_err; + + bch2_btree_iter_set_snapshot(&iter, snapshot); + k = bch2_btree_iter_peek_slot(&iter); if ((ret = bkey_err(k))) goto bkey_err; @@ -2918,8 +2991,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, mark_range_unallocated(src, pos_src, pos_src + aligned_len); ret = bch2_remap_range(c, - POS(dst->v.i_ino, pos_dst >> 9), - POS(src->v.i_ino, pos_src >> 9), + inode_inum(dst), pos_dst >> 9, + inode_inum(src), pos_src >> 9, aligned_len >> 9, 
&dst->ei_journal_seq, pos_dst + len, &i_sectors_delta); @@ -3012,7 +3085,9 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; + subvol_inum inum = inode_inum(inode); u64 isize, next_data = MAX_LFS_FILESIZE; + u32 snapshot; int ret; isize = i_size_read(&inode->v); @@ -3020,9 +3095,15 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) return -ENXIO; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; for_each_btree_key(&trans, iter, BTREE_ID_extents, - POS(inode->v.i_ino, offset >> 9), 0, k, ret) { + SPOS(inode->v.i_ino, offset >> 9, snapshot), 0, k, ret) { if (k.k->p.inode != inode->v.i_ino) { break; } else if (bkey_extent_is_data(k.k)) { @@ -3032,6 +3113,9 @@ static loff_t bch2_seek_data(struct file *file, u64 offset) break; } bch2_trans_iter_exit(&trans, &iter); +err: + if (ret == -EINTR) + goto retry; ret = bch2_trans_exit(&trans) ?: ret; if (ret) @@ -3108,7 +3192,9 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; + subvol_inum inum = inode_inum(inode); u64 isize, next_hole = MAX_LFS_FILESIZE; + u32 snapshot; int ret; isize = i_size_read(&inode->v); @@ -3116,9 +3202,15 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) return -ENXIO; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; for_each_btree_key(&trans, iter, BTREE_ID_extents, - POS(inode->v.i_ino, offset >> 9), + SPOS(inode->v.i_ino, offset >> 9, snapshot), BTREE_ITER_SLOTS, k, ret) { if (k.k->p.inode != inode->v.i_ino) { next_hole = bch2_seek_pagecache_hole(&inode->v, @@ -3136,6 +3228,9 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) } } bch2_trans_iter_exit(&trans, &iter); +err: + if 
(ret == -EINTR) + goto retry; ret = bch2_trans_exit(&trans) ?: ret; if (ret) diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index ff6b1739342d..91f52ab9b4e2 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -192,7 +192,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, char *kname = NULL; struct qstr qstr; int ret = 0; - subvol_inum inum = { .subvol = 1 }; + subvol_inum inum; kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); if (!kname) @@ -205,10 +205,8 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, qstr.len = ret; qstr.name = kname; - ret = -ENOENT; - inum.inum = bch2_dirent_lookup(c, src->v.i_ino, &hash, - &qstr); - if (!inum.inum) + ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum); + if (ret) goto err1; vinode = bch2_vfs_inode_get(c, inum); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 7a994f3f9d20..0d47d9d5737b 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -150,7 +150,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, retry: bch2_trans_begin(&trans); - ret = bch2_inode_peek(&trans, &iter, &inode_u, inode->v.i_ino, + ret = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode), BTREE_ITER_INTENT) ?: (set ? 
set(inode, &inode_u, p) : 0) ?: bch2_inode_write(&trans, &iter, &inode_u) ?: @@ -256,7 +256,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) if (!(inode->v.i_state & I_NEW)) return &inode->v; - ret = bch2_inode_find_by_inum(c, inum.inum, &inode_u); + ret = bch2_inode_find_by_inum(c, inum, &inode_u); if (ret) { iget_failed(&inode->v); return ERR_PTR(ret); @@ -271,10 +271,10 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) return &inode->v; } -static struct bch_inode_info * +struct bch_inode_info * __bch2_create(struct mnt_idmap *idmap, struct bch_inode_info *dir, struct dentry *dentry, - umode_t mode, dev_t rdev, bool tmpfile) + umode_t mode, dev_t rdev, unsigned flags) { struct bch_fs *c = dir->v.i_sb->s_fs_info; struct btree_trans trans; @@ -303,20 +303,23 @@ __bch2_create(struct mnt_idmap *idmap, bch2_inode_init_early(c, &inode_u); - if (!tmpfile) + if (!(flags & BCH_CREATE_TMPFILE)) mutex_lock(&dir->ei_update_lock); bch2_trans_init(&trans, c, 8, - 2048 + (!tmpfile ? dentry->d_name.len : 0)); + 2048 + (!(flags & BCH_CREATE_TMPFILE) + ? dentry->d_name.len : 0)); retry: bch2_trans_begin(&trans); - ret = bch2_create_trans(&trans, dir->v.i_ino, &dir_u, &inode_u, - !tmpfile ? &dentry->d_name : NULL, + ret = bch2_create_trans(&trans, + inode_inum(dir), &dir_u, &inode_u, + !(flags & BCH_CREATE_TMPFILE) + ? 
&dentry->d_name : NULL, from_kuid(i_user_ns(&dir->v), current_fsuid()), from_kgid(i_user_ns(&dir->v), current_fsgid()), mode, rdev, - default_acl, acl) ?: + default_acl, acl, flags) ?: bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, KEY_TYPE_QUOTA_PREALLOC); if (unlikely(ret)) @@ -332,7 +335,7 @@ err_before_quota: goto err_trans; } - if (!tmpfile) { + if (!(flags & BCH_CREATE_TMPFILE)) { bch2_inode_update_after_write(c, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); journal_seq_copy(c, dir, journal_seq); @@ -387,7 +390,7 @@ err: posix_acl_release(acl); return inode; err_trans: - if (!tmpfile) + if (!(flags & BCH_CREATE_TMPFILE)) mutex_unlock(&dir->ei_update_lock); bch2_trans_exit(&trans); @@ -407,11 +410,12 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); struct inode *vinode = NULL; subvol_inum inum = { .subvol = 1 }; + int ret; - inum.inum = bch2_dirent_lookup(c, dir->v.i_ino, &hash, - &dentry->d_name); + ret = bch2_dirent_lookup(c, inode_inum(dir), &hash, + &dentry->d_name, &inum); - if (inum.inum) + if (!ret) vinode = bch2_vfs_inode_get(c, inum); return d_splice_alias(vinode, dentry); @@ -422,7 +426,7 @@ static int bch2_mknod(struct mnt_idmap *idmap, umode_t mode, dev_t rdev) { struct bch_inode_info *inode = - __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev, false); + __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev, 0); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -452,8 +456,8 @@ static int __bch2_link(struct bch_fs *c, ret = __bch2_trans_do(&trans, NULL, &inode->ei_journal_seq, 0, bch2_link_trans(&trans, - dir->v.i_ino, - inode->v.i_ino, &dir_u, &inode_u, + inode_inum(dir), &dir_u, + inode_inum(inode), &inode_u, &dentry->d_name)); if (likely(!ret)) { @@ -504,7 +508,7 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) ret = __bch2_trans_do(&trans, NULL, &dir->ei_journal_seq, BTREE_INSERT_NOFAIL, bch2_unlink_trans(&trans, - dir->v.i_ino, 
&dir_u, + inode_inum(dir), &dir_u, &inode_u, &dentry->d_name)); if (likely(!ret)) { @@ -531,7 +535,8 @@ static int bch2_symlink(struct mnt_idmap *idmap, struct bch_inode_info *dir = to_bch_ei(vdir), *inode; int ret; - inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0, true); + inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0, + BCH_CREATE_TMPFILE); if (unlikely(IS_ERR(inode))) return PTR_ERR(inode); @@ -624,8 +629,8 @@ static int bch2_rename2(struct mnt_idmap *idmap, ret = __bch2_trans_do(&trans, NULL, &journal_seq, 0, bch2_rename_trans(&trans, - src_dir->v.i_ino, &src_dir_u, - dst_dir->v.i_ino, &dst_dir_u, + inode_inum(src_dir), &src_dir_u, + inode_inum(dst_dir), &dst_dir_u, &src_inode_u, &dst_inode_u, &src_dentry->d_name, @@ -748,7 +753,7 @@ retry: kfree(acl); acl = NULL; - ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode->v.i_ino, + ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode), BTREE_ITER_INTENT); if (ret) goto btree_err; @@ -756,7 +761,8 @@ retry: bch2_setattr_copy(idmap, inode, &inode_u, attr); if (attr->ia_valid & ATTR_MODE) { - ret = bch2_acl_chmod(&trans, &inode_u, inode_u.bi_mode, &acl); + ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u, + inode_u.bi_mode, &acl); if (ret) goto btree_err; } @@ -848,7 +854,8 @@ static int bch2_tmpfile(struct mnt_idmap *idmap, { struct bch_inode_info *inode = __bch2_create(idmap, to_bch_ei(vdir), - file->f_path.dentry, mode, 0, true); + file->f_path.dentry, mode, 0, + BCH_CREATE_TMPFILE); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -923,6 +930,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); unsigned offset_into_extent, sectors; bool have_extent = false; + u32 snapshot; int ret = 0; ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); @@ -932,15 +940,21 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, if (start + len < 
start) return -EINVAL; + start >>= 9; + bch2_bkey_buf_init(&cur); bch2_bkey_buf_init(&prev); bch2_trans_init(&trans, c, 0, 0); - - bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, - POS(ei->v.i_ino, start >> 9), 0); retry: bch2_trans_begin(&trans); + ret = bch2_subvolume_get_snapshot(&trans, ei->ei_subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(ei->v.i_ino, start, snapshot), 0); + while ((k = bch2_btree_iter_peek(&iter)).k && !(ret = bkey_err(k)) && bkey_cmp(iter.pos, end) < 0) { @@ -989,7 +1003,9 @@ retry: bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, iter.pos.offset + sectors)); } - + start = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: if (ret == -EINTR) goto retry; @@ -997,7 +1013,6 @@ retry: ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), FIEMAP_EXTENT_LAST); - bch2_trans_iter_exit(&trans, &iter); ret = bch2_trans_exit(&trans) ?: ret; bch2_bkey_buf_exit(&cur, c); bch2_bkey_buf_exit(&prev, c); @@ -1034,7 +1049,7 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) if (!dir_emit_dots(file, ctx)) return 0; - return bch2_readdir(c, inode->v.i_ino, ctx); + return bch2_readdir(c, inode_inum(inode), ctx); } static const struct file_operations bch_file_operations = { @@ -1290,7 +1305,7 @@ static void bch2_evict_inode(struct inode *vinode) KEY_TYPE_QUOTA_WARN); bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, KEY_TYPE_QUOTA_WARN); - bch2_inode_rm(c, inode->v.i_ino, true); + bch2_inode_rm(c, inode_inum(inode), true); } } diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 6dae425bf616..aa755987b36c 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -144,6 +144,10 @@ struct bch_inode_unpacked; #ifndef NO_BCACHEFS_FS +struct bch_inode_info * +__bch2_create(struct mnt_idmap *, struct bch_inode_info *, + struct dentry *, umode_t, dev_t, unsigned); + int bch2_fs_quota_transfer(struct bch_fs *, struct bch_inode_info *, struct bch_qid, diff --git 
a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index e4ca05aae76c..40b107715cdd 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -858,7 +858,10 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, d = bkey_s_c_to_dirent(k); d_inum = le64_to_cpu(d.v->d_inum); - ret = bch2_dirent_read_target(trans, d, &d_inum); + ret = __bch2_dirent_read_target(&trans, d, + &target_subvol, + &target_snapshot, + &target_inum); if (ret && ret != -ENOENT) return ret; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 3b19dc6b9ddc..7fccf842a46b 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -6,6 +6,7 @@ #include "btree_update.h" #include "error.h" #include "extents.h" +#include "extent_update.h" #include "inode.h" #include "str_hash.h" #include "subvolume.h" @@ -296,15 +297,21 @@ int bch2_inode_unpack(struct bkey_s_c_inode inode, int bch2_inode_peek(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode, - u64 inum, unsigned flags) + subvol_inum inum, unsigned flags) { struct bkey_s_c k; + u32 snapshot; int ret; if (trans->c->opts.inodes_use_key_cache) flags |= BTREE_ITER_CACHED; - bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, inum), flags); + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + + bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, + SPOS(0, inum.inum, snapshot), flags); k = bch2_btree_iter_peek_slot(iter); ret = bkey_err(k); if (ret) @@ -486,6 +493,9 @@ static inline u32 bkey_generation(struct bkey_s_c k) } } +/* + * This just finds an empty slot: + */ int bch2_inode_create(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode_u, @@ -585,16 +595,74 @@ found_slot: return 0; } -int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) +static int bch2_inode_delete_keys(struct btree_trans *trans, + subvol_inum inum, enum btree_id id) +{ + u64 offset = 0; + int ret = 0; + + while (!ret || ret == -EINTR) 
{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i delete; + u32 snapshot; + + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + continue; + + bch2_trans_iter_init(trans, &iter, id, + SPOS(inum.inum, offset, snapshot), + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek(&iter); + + if (!k.k || iter.pos.inode != inum.inum) { + bch2_trans_iter_exit(trans, &iter); + break; + } + + ret = bkey_err(k); + if (ret) + goto err; + + bkey_init(&delete.k); + delete.k.p = iter.pos; + + if (btree_node_type_is_extents(iter.btree_id)) { + unsigned max_sectors = + min_t(u64, U64_MAX - iter.pos.offset, + KEY_SIZE_MAX & (~0 << trans->c->block_bits)); + + /* create the biggest key we can */ + bch2_key_resize(&delete.k, max_sectors); + + ret = bch2_extent_trim_atomic(trans, &iter, &delete); + if (ret) + goto err; + } + + ret = bch2_trans_update(trans, &iter, &delete, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +err: + offset = iter.pos.offset; + bch2_trans_iter_exit(trans, &iter); + } + + return ret; +} + +int bch2_inode_rm(struct bch_fs *c, subvol_inum inum, bool cached) { struct btree_trans trans; struct btree_iter iter = { NULL }; struct bkey_i_inode_generation delete; - struct bpos start = POS(inode_nr, 0); - struct bpos end = POS(inode_nr + 1, 0); struct bch_inode_unpacked inode_u; struct bkey_s_c k; unsigned iter_flags = BTREE_ITER_INTENT; + u32 snapshot; int ret; if (cached && c->opts.inodes_use_key_cache) @@ -610,19 +678,20 @@ int bch2_inode_rm(struct bch_fs *c, u64 inode_nr, bool cached) * XXX: the dirent could ideally would delete whiteouts when they're no * longer needed */ - ret = bch2_btree_delete_range_trans(&trans, BTREE_ID_extents, - start, end, NULL) ?: - bch2_btree_delete_range_trans(&trans, BTREE_ID_xattrs, - start, end, NULL) ?: - bch2_btree_delete_range_trans(&trans, BTREE_ID_dirents, - start, end, NULL); + ret = bch2_inode_delete_keys(&trans, inum, BTREE_ID_extents) ?: + 
bch2_inode_delete_keys(&trans, inum, BTREE_ID_xattrs) ?: + bch2_inode_delete_keys(&trans, inum, BTREE_ID_dirents); if (ret) goto err; retry: bch2_trans_begin(&trans); + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; + bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, - POS(0, inode_nr), iter_flags); + SPOS(0, inum.inum, snapshot), iter_flags); k = bch2_btree_iter_peek_slot(&iter); ret = bkey_err(k); @@ -632,7 +701,7 @@ retry: if (k.k->type != KEY_TYPE_inode) { bch2_fs_inconsistent(trans.c, "inode %llu not found when deleting", - inode_nr); + inum.inum); ret = -EIO; goto err; } @@ -662,20 +731,22 @@ err: return ret; } -static int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, +static int bch2_inode_find_by_inum_trans(struct btree_trans *trans, + subvol_inum inum, struct bch_inode_unpacked *inode) { - struct btree_iter iter = { NULL }; + struct btree_iter iter; int ret; - ret = bch2_inode_peek(trans, &iter, inode, inode_nr, 0); - bch2_trans_iter_exit(trans, &iter); + ret = bch2_inode_peek(trans, &iter, inode, inum, 0); + if (!ret) + bch2_trans_iter_exit(trans, &iter); return ret; } -int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, +int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, struct bch_inode_unpacked *inode) { return bch2_trans_do(c, NULL, NULL, 0, - bch2_inode_find_by_inum_trans(&trans, inode_nr, inode)); + bch2_inode_find_by_inum_trans(&trans, inum, inode)); } diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 25bef104ebcc..9e84cddcc6cb 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -58,7 +58,7 @@ int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); int bch2_inode_peek(struct btree_trans *, struct btree_iter *, - struct bch_inode_unpacked *, u64, unsigned); + struct bch_inode_unpacked *, subvol_inum, unsigned); int bch2_inode_write(struct 
btree_trans *, struct btree_iter *, struct bch_inode_unpacked *); @@ -74,9 +74,10 @@ void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, int bch2_inode_create(struct btree_trans *, struct btree_iter *, struct bch_inode_unpacked *, u32, u64); -int bch2_inode_rm(struct bch_fs *, u64, bool); +int bch2_inode_rm(struct bch_fs *, subvol_inum, bool); -int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *); +int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, + struct bch_inode_unpacked *); static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode) { diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index f95ceb820faa..0f5e0099b848 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -325,7 +325,10 @@ int bch2_extent_update(struct btree_trans *trans, struct bch_inode_unpacked inode_u; ret = bch2_inode_peek(trans, &inode_iter, &inode_u, - k->k.p.inode, BTREE_ITER_INTENT); + (subvol_inum) { + .subvol = BCACHEFS_ROOT_SUBVOL, + .inum = k->k.p.inode, + }, BTREE_ITER_INTENT); if (ret) return ret; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index eb2b91f7e682..9dc6684139de 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -581,7 +581,8 @@ static int __bch2_move_data(struct bch_fs *c, stats->pos = start; bch2_trans_iter_init(&trans, &iter, btree_id, start, - BTREE_ITER_PREFETCH); + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); if (rate) bch2_ratelimit_reset(rate); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 2aab57cf09e1..47c8fecc6839 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1480,11 +1480,12 @@ int bch2_fs_initialize(struct bch_fs *c) err = "error creating lost+found"; ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_create_trans(&trans, BCACHEFS_ROOT_INO, + bch2_create_trans(&trans, + BCACHEFS_ROOT_SUBVOL_INUM, &root_inode, &lostfound_inode, &lostfound, 0, 0, S_IFDIR|0700, 0, - NULL, NULL)); + NULL, NULL, 0)); if (ret) { bch_err(c, "error 
creating lost+found"); goto err; diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 576cfbccf5b5..be4b47bc7438 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -7,6 +7,7 @@ #include "inode.h" #include "io.h" #include "reflink.h" +#include "subvolume.h" #include @@ -197,7 +198,8 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) } s64 bch2_remap_range(struct bch_fs *c, - struct bpos dst_start, struct bpos src_start, + subvol_inum dst_inum, u64 dst_offset, + subvol_inum src_inum, u64 src_offset, u64 remap_sectors, u64 *journal_seq, u64 new_i_size, s64 *i_sectors_delta) { @@ -205,6 +207,8 @@ s64 bch2_remap_range(struct bch_fs *c, struct btree_iter dst_iter, src_iter; struct bkey_s_c src_k; struct bkey_buf new_dst, new_src; + struct bpos dst_start = POS(dst_inum.inum, dst_offset); + struct bpos src_start = POS(src_inum.inum, src_offset); struct bpos dst_end = dst_start, src_end = src_start; struct bpos src_want; u64 dst_done; @@ -238,6 +242,16 @@ s64 bch2_remap_range(struct bch_fs *c, break; } + ret = bch2_subvolume_get_snapshot(&trans, src_inum.subvol, + &src_iter.snapshot); + if (ret) + continue; + + ret = bch2_subvolume_get_snapshot(&trans, dst_inum.subvol, + &dst_iter.snapshot); + if (ret) + continue; + dst_done = dst_iter.pos.offset - dst_start.offset; src_want = POS(src_start.inode, src_start.offset + dst_done); bch2_btree_iter_set_pos(&src_iter, src_want); @@ -311,7 +325,7 @@ s64 bch2_remap_range(struct bch_fs *c, bch2_trans_begin(&trans); ret2 = bch2_inode_peek(&trans, &inode_iter, &inode_u, - dst_start.inode, BTREE_ITER_INTENT); + dst_inum, BTREE_ITER_INTENT); if (!ret2 && inode_u.bi_size < new_i_size) { diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 68c5cb5a2780..4c1b82860b0b 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -57,7 +57,7 @@ static inline __le64 *bkey_refcount(struct bkey_i *k) } } -s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos, - 
u64, u64 *, u64, s64 *); +s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64, + subvol_inum, u64, u64, u64 *, u64, s64 *); #endif /* _BCACHEFS_REFLINK_H */ diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index c6a132b3c5bb..6418089531ad 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -8,6 +8,7 @@ #include "error.h" #include "inode.h" #include "siphash.h" +#include "subvolume.h" #include "super.h" #include @@ -144,16 +145,21 @@ bch2_hash_lookup(struct btree_trans *trans, struct btree_iter *iter, const struct bch_hash_desc desc, const struct bch_hash_info *info, - u64 inode, const void *key, + subvol_inum inum, const void *key, unsigned flags) { struct bkey_s_c k; + u32 snapshot; int ret; + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + for_each_btree_key(trans, *iter, desc.btree_id, - POS(inode, desc.hash_key(info, key)), + SPOS(inum.inum, desc.hash_key(info, key), snapshot), BTREE_ITER_SLOTS|flags, k, ret) { - if (iter->pos.inode != inode) + if (iter->pos.inode != inum.inum) break; if (k.k->type == desc.key_type) { @@ -176,15 +182,20 @@ bch2_hash_hole(struct btree_trans *trans, struct btree_iter *iter, const struct bch_hash_desc desc, const struct bch_hash_info *info, - u64 inode, const void *key) + subvol_inum inum, const void *key) { struct bkey_s_c k; + u32 snapshot; int ret; + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + for_each_btree_key(trans, *iter, desc.btree_id, - POS(inode, desc.hash_key(info, key)), + SPOS(inum.inum, desc.hash_key(info, key), snapshot), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (iter->pos.inode != inode) + if (iter->pos.inode != inum.inum) break; if (k.k->type != desc.key_type) @@ -229,17 +240,25 @@ static __always_inline int bch2_hash_set(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, - u64 inode, struct bkey_i *insert, int flags) + subvol_inum inum, + 
struct bkey_i *insert, int flags) { struct btree_iter iter, slot = { NULL }; struct bkey_s_c k; bool found = false; + u32 snapshot; int ret; + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + for_each_btree_key(trans, iter, desc.btree_id, - POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))), + SPOS(inum.inum, + desc.hash_bkey(info, bkey_i_to_s_c(insert)), + snapshot), BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { - if (iter.pos.inode != inode) + if (iter.pos.inode != inum.inum) break; if (k.k->type == desc.key_type) { @@ -313,12 +332,12 @@ static __always_inline int bch2_hash_delete(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, - u64 inode, const void *key) + subvol_inum inum, const void *key) { struct btree_iter iter; int ret; - ret = bch2_hash_lookup(trans, &iter, desc, info, inode, key, + ret = bch2_hash_lookup(trans, &iter, desc, info, inum, key, BTREE_ITER_INTENT); if (ret) return ret; diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index babbfaadeb3f..ff81a25698ff 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -128,7 +128,7 @@ static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info int ret; ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, - inode->v.i_ino, + inode_inum(inode), &X_SEARCH(type, name, strlen(name)), 0); if (ret) @@ -160,7 +160,7 @@ int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, bch2_xattr_get_trans(&trans, inode, name, buffer, size, type)); } -int bch2_xattr_set(struct btree_trans *trans, u64 inum, +int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, const struct bch_hash_info *hash_info, const char *name, const void *value, size_t size, int type, int flags) @@ -282,13 +282,21 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) struct btree_iter iter; struct bkey_s_c k; struct xattr_buf buf = { .buf = buffer, .len = 
buffer_size }; - u64 inum = dentry->d_inode->i_ino; + u64 offset = 0, inum = inode->ei_inode.bi_inum; + u32 snapshot; int ret; bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + iter = (struct btree_iter) { NULL }; + + ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot); + if (ret) + goto err; for_each_btree_key(&trans, iter, BTREE_ID_xattrs, - POS(inum, 0), 0, k, ret) { + SPOS(inum, offset, snapshot), 0, k, ret) { BUG_ON(k.k->p.inode < inum); if (k.k->p.inode > inum) @@ -301,7 +309,12 @@ ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) if (ret) break; } + + offset = iter.pos.offset; bch2_trans_iter_exit(&trans, &iter); +err: + if (ret == -EINTR) + goto retry; ret = bch2_trans_exit(&trans) ?: ret; @@ -340,7 +353,7 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler, struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0, - bch2_xattr_set(&trans, inode->v.i_ino, &hash, + bch2_xattr_set(&trans, inode_inum(inode), &hash, name, value, size, handler->flags, flags)); } diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index 4151065ab853..f4f896545e1c 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -39,7 +39,8 @@ struct bch_inode_info; int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *, const char *, void *, size_t, int); -int bch2_xattr_set(struct btree_trans *, u64, const struct bch_hash_info *, +int bch2_xattr_set(struct btree_trans *, subvol_inum, + const struct bch_hash_info *, const char *, const void *, size_t, int, int); ssize_t bch2_xattr_list(struct dentry *, char *, size_t); -- cgit v1.2.3 From 42d237320e9817a94f3a0a2de28156523596b086 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 16 Mar 2021 23:28:43 -0400 Subject: bcachefs: Snapshot creation, deletion This is the final patch in the patch series implementing snapshots. 
This patch implements two new ioctls that work like creation and deletion of directories, but fancier. - BCH_IOCTL_SUBVOLUME_CREATE, for creating new subvolumes and snapshots - BCH_IOCTL_SUBVOLUME_DESTROY, for deleting subvolumes and snapshots Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 8 --- fs/bcachefs/dirent.h | 4 -- fs/bcachefs/fs-common.c | 182 +++++++++++++++++++++++++++++++++++++++--------- fs/bcachefs/fs-common.h | 7 +- fs/bcachefs/fs-ioctl.c | 168 ++++++++++++++++++++++++++++++++++++++++++++ fs/bcachefs/fs.c | 29 ++++---- fs/bcachefs/fs.h | 3 +- fs/bcachefs/fsck.c | 7 +- fs/bcachefs/recovery.c | 2 +- fs/bcachefs/str_hash.h | 7 +- 10 files changed, 348 insertions(+), 69 deletions(-) (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index f290580594ce..8653a106809d 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -383,14 +383,6 @@ out: return ret; } -int bch2_dirent_delete_at(struct btree_trans *trans, - const struct bch_hash_info *hash_info, - struct btree_iter *iter) -{ - return bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - hash_info, iter); -} - int __bch2_dirent_lookup_trans(struct btree_trans *trans, struct btree_iter *iter, subvol_inum dir, diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 88b784a99cb5..e7f65fbd8e65 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -33,10 +33,6 @@ int bch2_dirent_create(struct btree_trans *, subvol_inum, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, int); -int bch2_dirent_delete_at(struct btree_trans *, - const struct bch_hash_info *, - struct btree_iter *); - int __bch2_dirent_read_target(struct btree_trans *, struct bkey_s_c_dirent, u32 *, u32 *, u64 *, bool); diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index 02bf32cc7659..3e8e3c5bf870 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ -11,6 +11,11 @@ #include +static inline int is_subdir_for_nlink(struct 
bch_inode_unpacked *inode) +{ + return S_ISDIR(inode->bi_mode) && !inode->bi_subvol; +} + int bch2_create_trans(struct btree_trans *trans, subvol_inum dir, struct bch_inode_unpacked *dir_u, @@ -19,6 +24,7 @@ int bch2_create_trans(struct btree_trans *trans, uid_t uid, gid_t gid, umode_t mode, dev_t rdev, struct posix_acl *default_acl, struct posix_acl *acl, + subvol_inum snapshot_src, unsigned flags) { struct bch_fs *c = trans->c; @@ -27,10 +33,9 @@ int bch2_create_trans(struct btree_trans *trans, subvol_inum new_inum = dir; u64 now = bch2_current_time(c); u64 cpu = raw_smp_processor_id(); - u64 dir_offset = 0; u64 dir_target; u32 snapshot; - unsigned dir_type; + unsigned dir_type = mode_to_type(mode); int ret; ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); @@ -41,37 +46,122 @@ int bch2_create_trans(struct btree_trans *trans, if (ret) goto err; - bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); + if (!(flags & BCH_CREATE_SNAPSHOT)) { + /* Normal create path - allocate a new inode: */ + bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); - if (!name) - new_inode->bi_flags |= BCH_INODE_UNLINKED; + if (flags & BCH_CREATE_TMPFILE) + new_inode->bi_flags |= BCH_INODE_UNLINKED; - ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu); - if (ret) - goto err; + ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu); + if (ret) + goto err; + + snapshot_src = (subvol_inum) { 0 }; + } else { + /* + * Creating a snapshot - we're not allocating a new inode, but + * we do have to lookup the root inode of the subvolume we're + * snapshotting and update it (in the new snapshot): + */ + + if (!snapshot_src.inum) { + /* Inode wasn't specified, just snapshot: */ + struct btree_iter subvol_iter; + struct bkey_s_c k; + + bch2_trans_iter_init(trans, &subvol_iter, BTREE_ID_subvolumes, + POS(0, snapshot_src.subvol), 0); + k = bch2_btree_iter_peek_slot(&subvol_iter); + + ret = bkey_err(k); + if (!ret && 
k.k->type != KEY_TYPE_subvolume) { + bch_err(c, "subvolume %u not found", + snapshot_src.subvol); + ret = -ENOENT; + } + + if (!ret) + snapshot_src.inum = le64_to_cpu(bkey_s_c_to_subvolume(k).v->inode); + bch2_trans_iter_exit(trans, &subvol_iter); + + if (ret) + goto err; + } + + ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src, + BTREE_ITER_INTENT); + if (ret) + goto err; + + if (new_inode->bi_subvol != snapshot_src.subvol) { + /* Not a subvolume root: */ + ret = -EINVAL; + goto err; + } + + /* + * If we're not root, we have to own the subvolume being + * snapshotted: + */ + if (uid && new_inode->bi_uid != uid) { + ret = -EPERM; + goto err; + } + + flags |= BCH_CREATE_SUBVOL; + } new_inum.inum = new_inode->bi_inum; dir_target = new_inode->bi_inum; - dir_type = mode_to_type(new_inode->bi_mode); - if (default_acl) { - ret = bch2_set_acl_trans(trans, new_inum, new_inode, - default_acl, ACL_TYPE_DEFAULT); + if (flags & BCH_CREATE_SUBVOL) { + u32 new_subvol, dir_snapshot; + + ret = bch2_subvolume_create(trans, new_inode->bi_inum, + snapshot_src.subvol, + &new_subvol, &snapshot, + (flags & BCH_CREATE_SNAPSHOT_RO) != 0); if (ret) goto err; - } - if (acl) { - ret = bch2_set_acl_trans(trans, new_inum, new_inode, - acl, ACL_TYPE_ACCESS); + new_inode->bi_parent_subvol = dir.subvol; + new_inode->bi_subvol = new_subvol; + new_inum.subvol = new_subvol; + dir_target = new_subvol; + dir_type = DT_SUBVOL; + + ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot); + if (ret) + goto err; + + bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot); + ret = bch2_btree_iter_traverse(&dir_iter); if (ret) goto err; } - if (name) { + if (!(flags & BCH_CREATE_SNAPSHOT)) { + if (default_acl) { + ret = bch2_set_acl_trans(trans, new_inum, new_inode, + default_acl, ACL_TYPE_DEFAULT); + if (ret) + goto err; + } + + if (acl) { + ret = bch2_set_acl_trans(trans, new_inum, new_inode, + acl, ACL_TYPE_ACCESS); + if (ret) + goto err; + } + } + + if (!(flags & 
BCH_CREATE_TMPFILE)) { struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u); + u64 dir_offset; - if (S_ISDIR(new_inode->bi_mode)) + if (is_subdir_for_nlink(new_inode)) dir_u->bi_nlink++; dir_u->bi_mtime = dir_u->bi_ctime = now; @@ -87,11 +177,11 @@ int bch2_create_trans(struct btree_trans *trans, BCH_HASH_SET_MUST_CREATE); if (ret) goto err; - } - if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { - new_inode->bi_dir = dir_u->bi_inum; - new_inode->bi_dir_offset = dir_offset; + if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { + new_inode->bi_dir = dir_u->bi_inum; + new_inode->bi_dir_offset = dir_offset; + } } inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS; @@ -160,7 +250,8 @@ int bch2_unlink_trans(struct btree_trans *trans, subvol_inum dir, struct bch_inode_unpacked *dir_u, struct bch_inode_unpacked *inode_u, - const struct qstr *name) + const struct qstr *name, + int deleting_snapshot) { struct bch_fs *c = trans->c; struct btree_iter dir_iter = { NULL }; @@ -169,6 +260,7 @@ int bch2_unlink_trans(struct btree_trans *trans, struct bch_hash_info dir_hash; subvol_inum inum; u64 now = bch2_current_time(c); + struct bkey_s_c k; int ret; ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); @@ -187,29 +279,51 @@ int bch2_unlink_trans(struct btree_trans *trans, if (ret) goto err; - if (inode_u->bi_dir == dirent_iter.pos.inode && - inode_u->bi_dir_offset == dirent_iter.pos.offset) { - inode_u->bi_dir = 0; - inode_u->bi_dir_offset = 0; + if (deleting_snapshot == 1 && !inode_u->bi_subvol) { + ret = -ENOENT; + goto err; } - if (S_ISDIR(inode_u->bi_mode)) { + if (deleting_snapshot <= 0 && S_ISDIR(inode_u->bi_mode)) { ret = bch2_empty_dir_trans(trans, inum); if (ret) goto err; } - if (dir.subvol != inum.subvol) { - ret = bch2_subvolume_delete(trans, inum.subvol, false); + if (inode_u->bi_subvol) { + ret = bch2_subvolume_delete(trans, inode_u->bi_subvol, + deleting_snapshot); + if (ret) + goto err; + + k = 
bch2_btree_iter_peek_slot(&dirent_iter); + ret = bkey_err(k); + if (ret) + goto err; + + /* + * If we're deleting a subvolume, we need to really delete the + * dirent, not just emit a whiteout in the current snapshot: + */ + bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot); + ret = bch2_btree_iter_traverse(&dirent_iter); if (ret) goto err; } + if (inode_u->bi_dir == dirent_iter.pos.inode && + inode_u->bi_dir_offset == dirent_iter.pos.offset) { + inode_u->bi_dir = 0; + inode_u->bi_dir_offset = 0; + } + dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; - dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode); + dir_u->bi_nlink -= is_subdir_for_nlink(inode_u); bch2_inode_nlink_dec(inode_u); - ret = bch2_dirent_delete_at(trans, &dir_hash, &dirent_iter) ?: + ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + &dir_hash, &dirent_iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: bch2_inode_write(trans, &dir_iter, dir_u) ?: bch2_inode_write(trans, &inode_iter, inode_u); err: @@ -348,12 +462,12 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; } - if (S_ISDIR(src_inode_u->bi_mode)) { + if (is_subdir_for_nlink(src_inode_u)) { src_dir_u->bi_nlink--; dst_dir_u->bi_nlink++; } - if (dst_inum.inum && S_ISDIR(dst_inode_u->bi_mode)) { + if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) { dst_dir_u->bi_nlink--; src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; } diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h index 1bb2ac4dc13a..9bb0a9676147 100644 --- a/fs/bcachefs/fs-common.h +++ b/fs/bcachefs/fs-common.h @@ -5,6 +5,9 @@ struct posix_acl; #define BCH_CREATE_TMPFILE (1U << 0) +#define BCH_CREATE_SUBVOL (1U << 1) +#define BCH_CREATE_SNAPSHOT (1U << 2) +#define BCH_CREATE_SNAPSHOT_RO (1U << 3) int bch2_create_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, @@ -13,7 +16,7 @@ int bch2_create_trans(struct btree_trans *, subvol_inum, uid_t, gid_t, umode_t, dev_t, struct posix_acl *, struct posix_acl *, - unsigned); + 
subvol_inum, unsigned); int bch2_link_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, @@ -23,7 +26,7 @@ int bch2_link_trans(struct btree_trans *, int bch2_unlink_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, struct bch_inode_unpacked *, - const struct qstr *); + const struct qstr *, int); int bch2_rename_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 91f52ab9b4e2..ae402d350d4c 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -10,7 +10,11 @@ #include "quota.h" #include +#include #include +#include +#include +#include #define FS_IOC_GOINGDOWN _IOR('X', 125, __u32) #define FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ @@ -292,6 +296,154 @@ err: return ret; } +static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, + struct bch_ioctl_subvolume arg) +{ + struct inode *dir; + struct bch_inode_info *inode; + struct user_namespace *s_user_ns; + struct dentry *dst_dentry; + struct path src_path, dst_path; + int how = LOOKUP_FOLLOW; + int error; + subvol_inum snapshot_src = { 0 }; + unsigned lookup_flags = 0; + unsigned create_flags = BCH_CREATE_SUBVOL; + + if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE| + BCH_SUBVOL_SNAPSHOT_RO)) + return -EINVAL; + + if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && + (arg.src_ptr || + (arg.flags & BCH_SUBVOL_SNAPSHOT_RO))) + return -EINVAL; + + if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) + create_flags |= BCH_CREATE_SNAPSHOT; + + if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO) + create_flags |= BCH_CREATE_SNAPSHOT_RO; + + /* why do we need this lock? 
*/ + down_read(&c->vfs_sb->s_umount); + + if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) + sync_inodes_sb(c->vfs_sb); +retry: + if (arg.src_ptr) { + error = user_path_at(arg.dirfd, + (const char __user *)(unsigned long)arg.src_ptr, + how, &src_path); + if (error) + goto err1; + + if (src_path.dentry->d_sb->s_fs_info != c) { + path_put(&src_path); + error = -EXDEV; + goto err1; + } + + snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode)); + } + + dst_dentry = user_path_create(arg.dirfd, + (const char __user *)(unsigned long)arg.dst_ptr, + &dst_path, lookup_flags); + error = PTR_ERR_OR_ZERO(dst_dentry); + if (error) + goto err2; + + if (dst_dentry->d_sb->s_fs_info != c) { + error = -EXDEV; + goto err3; + } + + if (dst_dentry->d_inode) { + error = -EEXIST; + goto err3; + } + + dir = dst_path.dentry->d_inode; + if (IS_DEADDIR(dir)) { + error = -ENOENT; + goto err3; + } + + s_user_ns = dir->i_sb->s_user_ns; + if (!kuid_has_mapping(s_user_ns, current_fsuid()) || + !kgid_has_mapping(s_user_ns, current_fsgid())) { + error = -EOVERFLOW; + goto err3; + } + + error = inode_permission(file_mnt_idmap(filp), + dir, MAY_WRITE | MAY_EXEC); + if (error) + goto err3; + + if (!IS_POSIXACL(dir)) + arg.mode &= ~current_umask(); + + error = security_path_mkdir(&dst_path, dst_dentry, arg.mode); + if (error) + goto err3; + + if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && + !arg.src_ptr) + snapshot_src.subvol = to_bch_ei(dir)->ei_inode.bi_subvol; + + inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir), + dst_dentry, arg.mode|S_IFDIR, + 0, snapshot_src, create_flags); + error = PTR_ERR_OR_ZERO(inode); + if (error) + goto err3; + + d_instantiate(dst_dentry, &inode->v); + fsnotify_mkdir(dir, dst_dentry); +err3: + done_path_create(&dst_path, dst_dentry); +err2: + if (arg.src_ptr) + path_put(&src_path); + + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } +err1: + up_read(&c->vfs_sb->s_umount); + + return error; +} + +static long 
bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, + struct bch_ioctl_subvolume arg) +{ + struct path path; + int ret = 0; + + if (arg.flags) + return -EINVAL; + + ret = user_path_at(arg.dirfd, + (const char __user *)(unsigned long)arg.dst_ptr, + LOOKUP_FOLLOW, &path); + if (ret) + return ret; + + if (path.dentry->d_sb->s_fs_info != c) { + path_put(&path); + return -EXDEV; + } + + ret = __bch2_unlink(path.dentry->d_parent->d_inode, path.dentry, 1); + path_put(&path); + + return ret; +} + long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) { struct bch_inode_info *inode = file_bch_inode(file); @@ -322,6 +474,22 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) case FS_IOC_GOINGDOWN: return bch2_ioc_goingdown(c, (u32 __user *) arg); + case BCH_IOCTL_SUBVOLUME_CREATE: { + struct bch_ioctl_subvolume i; + + if (copy_from_user(&i, (void __user *) arg, sizeof(i))) + return -EFAULT; + return bch2_ioctl_subvolume_create(c, file, i); + } + + case BCH_IOCTL_SUBVOLUME_DESTROY: { + struct bch_ioctl_subvolume i; + + if (copy_from_user(&i, (void __user *) arg, sizeof(i))) + return -EFAULT; + return bch2_ioctl_subvolume_destroy(c, file, i); + } + default: return bch2_fs_ioctl(c, cmd, (void __user *) arg); } diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 0d47d9d5737b..7475830bb33f 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -240,12 +240,6 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) struct bch_inode_info *inode; int ret; - /* - * debug assert, to be removed when we start creating - * subvolumes/snapshots: - */ - BUG_ON(inum.subvol != BCACHEFS_ROOT_SUBVOL); - inode = to_bch_ei(iget5_locked(c->vfs_sb, bch2_inode_hash(inum), bch2_iget5_test, @@ -274,7 +268,8 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) struct bch_inode_info * __bch2_create(struct mnt_idmap *idmap, struct bch_inode_info *dir, struct dentry *dentry, - umode_t mode, dev_t rdev, 
unsigned flags) + umode_t mode, dev_t rdev, subvol_inum snapshot_src, + unsigned flags) { struct bch_fs *c = dir->v.i_sb->s_fs_info; struct btree_trans trans; @@ -319,7 +314,7 @@ retry: from_kuid(i_user_ns(&dir->v), current_fsuid()), from_kgid(i_user_ns(&dir->v), current_fsgid()), mode, rdev, - default_acl, acl, flags) ?: + default_acl, acl, snapshot_src, flags) ?: bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, KEY_TYPE_QUOTA_PREALLOC); if (unlikely(ret)) @@ -426,7 +421,8 @@ static int bch2_mknod(struct mnt_idmap *idmap, umode_t mode, dev_t rdev) { struct bch_inode_info *inode = - __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev, 0); + __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev, + (subvol_inum) { 0 }, 0); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -493,7 +489,8 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir, return 0; } -static int bch2_unlink(struct inode *vdir, struct dentry *dentry) +int __bch2_unlink(struct inode *vdir, struct dentry *dentry, + int deleting_snapshot) { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir); @@ -509,7 +506,8 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) BTREE_INSERT_NOFAIL, bch2_unlink_trans(&trans, inode_inum(dir), &dir_u, - &inode_u, &dentry->d_name)); + &inode_u, &dentry->d_name, + deleting_snapshot)); if (likely(!ret)) { BUG_ON(inode_u.bi_inum != inode->v.i_ino); @@ -527,6 +525,11 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry) return ret; } +static int bch2_unlink(struct inode *vdir, struct dentry *dentry) +{ + return __bch2_unlink(vdir, dentry, -1); +} + static int bch2_symlink(struct mnt_idmap *idmap, struct inode *vdir, struct dentry *dentry, const char *symname) @@ -536,7 +539,7 @@ static int bch2_symlink(struct mnt_idmap *idmap, int ret; inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0, - BCH_CREATE_TMPFILE); + (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); if (unlikely(IS_ERR(inode))) 
return PTR_ERR(inode); @@ -855,7 +858,7 @@ static int bch2_tmpfile(struct mnt_idmap *idmap, struct bch_inode_info *inode = __bch2_create(idmap, to_bch_ei(vdir), file->f_path.dentry, mode, 0, - BCH_CREATE_TMPFILE); + (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); if (IS_ERR(inode)) return PTR_ERR(inode); diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index aa755987b36c..40898c4d197b 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -146,7 +146,7 @@ struct bch_inode_unpacked; struct bch_inode_info * __bch2_create(struct mnt_idmap *, struct bch_inode_info *, - struct dentry *, umode_t, dev_t, unsigned); + struct dentry *, umode_t, dev_t, subvol_inum, unsigned); int bch2_fs_quota_transfer(struct bch_fs *, struct bch_inode_info *, @@ -183,6 +183,7 @@ int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, int bch2_setattr_nonsize(struct mnt_idmap *, struct bch_inode_info *, struct iattr *); +int __bch2_unlink(struct inode *, struct dentry *, int); void bch2_vfs_exit(void); int bch2_vfs_init(void); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index f9a6a0b3ce7a..16a1eae9b374 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -307,7 +307,7 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos) bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - &dir_hash_info, &iter); + &dir_hash_info, &iter, 0); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -386,7 +386,8 @@ create_lostfound: BTREE_INSERT_LAZY_RW, bch2_create_trans(trans, root_inum, &root, lostfound, &lostfound_str, - 0, 0, S_IFDIR|0700, 0, NULL, NULL, 0)); + 0, 0, S_IFDIR|0700, 0, NULL, NULL, + (subvol_inum) { }, 0)); if (ret) bch_err(c, "error creating lost+found: %i", ret); } @@ -759,7 +760,7 @@ static int fsck_hash_delete_at(struct btree_trans *trans, { int ret; retry: - ret = bch2_hash_delete_at(trans, desc, info, iter) ?: + ret = bch2_hash_delete_at(trans, desc, info, 
iter, 0) ?: bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL| BTREE_INSERT_LAZY_RW); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 47c8fecc6839..64e0b542e779 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1485,7 +1485,7 @@ int bch2_fs_initialize(struct bch_fs *c) &root_inode, &lostfound_inode, &lostfound, 0, 0, S_IFDIR|0700, 0, - NULL, NULL, 0)); + NULL, NULL, (subvol_inum) { 0 }, 0)); if (ret) { bch_err(c, "error creating lost+found"); goto err; diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 6418089531ad..6486e709b700 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -307,7 +307,8 @@ static __always_inline int bch2_hash_delete_at(struct btree_trans *trans, const struct bch_hash_desc desc, const struct bch_hash_info *info, - struct btree_iter *iter) + struct btree_iter *iter, + unsigned update_flags) { struct bkey_i *delete; int ret; @@ -325,7 +326,7 @@ int bch2_hash_delete_at(struct btree_trans *trans, delete->k.p = iter->pos; delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted; - return bch2_trans_update(trans, iter, delete, 0); + return bch2_trans_update(trans, iter, delete, update_flags); } static __always_inline @@ -342,7 +343,7 @@ int bch2_hash_delete(struct btree_trans *trans, if (ret) return ret; - ret = bch2_hash_delete_at(trans, desc, info, &iter); + ret = bch2_hash_delete_at(trans, desc, info, &iter, 0); bch2_trans_iter_exit(trans, &iter); return ret; } -- cgit v1.2.3 From 2027875bd8318171159495c948461eae2f84936d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 11 Oct 2021 12:03:19 -0400 Subject: bcachefs: Add BCH_SUBVOLUME_UNLINKED Snapshot deletion needs to become a multi step process, where we unlink, then tear down the page cache, then delete the subvolume - the deleting flag is equivalent to an inode with i_nlink = 0. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 4 + fs/bcachefs/bcachefs_format.h | 1 + fs/bcachefs/fs-common.c | 30 ++----- fs/bcachefs/fs-common.h | 2 +- fs/bcachefs/fs-ioctl.c | 2 +- fs/bcachefs/fs.c | 11 ++- fs/bcachefs/fs.h | 2 +- fs/bcachefs/fsck.c | 18 ++++- fs/bcachefs/inode.c | 6 +- fs/bcachefs/subvolume.c | 182 ++++++++++++++++++++++++++++++++++++++---- fs/bcachefs/subvolume.h | 5 +- fs/bcachefs/subvolume_types.h | 11 +++ 12 files changed, 223 insertions(+), 51 deletions(-) create mode 100644 fs/bcachefs/subvolume_types.h (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 1608faae0d0b..567270015008 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -353,6 +353,7 @@ enum bch_time_stats { #include "quota_types.h" #include "rebalance_types.h" #include "replicas_types.h" +#include "subvolume_types.h" #include "super_types.h" /* Number of nodes btree coalesce will try to coalesce at once */ @@ -657,6 +658,9 @@ struct bch_fs { struct bch_snapshot_table __rcu *snapshot_table; struct mutex snapshot_table_lock; struct work_struct snapshot_delete_work; + struct work_struct snapshot_wait_for_pagecache_and_delete_work; + struct snapshot_id_list snapshots_unlinked; + struct mutex snapshots_unlinked_lock; /* BTREE CACHE */ struct bio_set btree_bio; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 481bf643bd6f..8e1423b138a6 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -974,6 +974,7 @@ LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) * can delete it (or whether it should just be rm -rf'd) */ LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2) +LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3) /* Snapshots */ diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c index c49de741e1e3..5f3429e99115 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/fs-common.c @@ 
-239,7 +239,7 @@ int bch2_unlink_trans(struct btree_trans *trans, struct bch_inode_unpacked *dir_u, struct bch_inode_unpacked *inode_u, const struct qstr *name, - int deleting_snapshot) + bool deleting_snapshot) { struct bch_fs *c = trans->c; struct btree_iter dir_iter = { NULL }; @@ -267,35 +267,19 @@ int bch2_unlink_trans(struct btree_trans *trans, if (ret) goto err; - if (deleting_snapshot <= 0 && S_ISDIR(inode_u->bi_mode)) { + if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) { ret = bch2_empty_dir_trans(trans, inum); if (ret) goto err; } - if (deleting_snapshot < 0 && - inode_u->bi_subvol) { - struct bch_subvolume s; - - ret = bch2_subvolume_get(trans, inode_u->bi_subvol, true, - BTREE_ITER_CACHED| - BTREE_ITER_WITH_UPDATES, - &s); - if (ret) - goto err; - - if (BCH_SUBVOLUME_SNAP(&s)) - deleting_snapshot = 1; + if (deleting_snapshot && !inode_u->bi_subvol) { + ret = -ENOENT; + goto err; } - if (deleting_snapshot == 1) { - if (!inode_u->bi_subvol) { - ret = -ENOENT; - goto err; - } - - ret = bch2_subvolume_delete(trans, inode_u->bi_subvol, - deleting_snapshot); + if (deleting_snapshot || inode_u->bi_subvol) { + ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol); if (ret) goto err; diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h index 9bb0a9676147..dde237859514 100644 --- a/fs/bcachefs/fs-common.h +++ b/fs/bcachefs/fs-common.h @@ -26,7 +26,7 @@ int bch2_link_trans(struct btree_trans *, int bch2_unlink_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, struct bch_inode_unpacked *, - const struct qstr *, int); + const struct qstr *, bool); int bch2_rename_trans(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index a12b591ec9ca..de94895ace9f 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -441,7 +441,7 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, dir = path.dentry->d_parent->d_inode; - ret = 
__bch2_unlink(dir, path.dentry, 1); + ret = __bch2_unlink(dir, path.dentry, true); if (!ret) { fsnotify_rmdir(dir, path.dentry); d_delete(path.dentry); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 334cd335ff11..c325e5c4325c 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -490,7 +490,7 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir, } int __bch2_unlink(struct inode *vdir, struct dentry *dentry, - int deleting_snapshot) + bool deleting_snapshot) { struct bch_fs *c = vdir->i_sb->s_fs_info; struct bch_inode_info *dir = to_bch_ei(vdir); @@ -527,7 +527,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, static int bch2_unlink(struct inode *vdir, struct dentry *dentry) { - return __bch2_unlink(vdir, dentry, -1); + return __bch2_unlink(vdir, dentry, false); } static int bch2_symlink(struct mnt_idmap *idmap, @@ -1292,6 +1292,12 @@ static int bch2_vfs_write_inode(struct inode *vinode, return ret; } +static int bch2_drop_inode(struct inode *vinode) +{ + + return generic_drop_inode(vinode); +} + static void bch2_evict_inode(struct inode *vinode) { struct bch_fs *c = vinode->i_sb->s_fs_info; @@ -1496,6 +1502,7 @@ static const struct super_operations bch_super_operations = { .alloc_inode = bch2_alloc_inode, .destroy_inode = bch2_destroy_inode, .write_inode = bch2_vfs_write_inode, + .drop_inode = bch2_drop_inode, .evict_inode = bch2_evict_inode, .sync_fs = bch2_sync_fs, .statfs = bch2_statfs, diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 40898c4d197b..2616b15eb51c 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -183,7 +183,7 @@ int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, int bch2_setattr_nonsize(struct mnt_idmap *, struct bch_inode_info *, struct iattr *); -int __bch2_unlink(struct inode *, struct dentry *, int); +int __bch2_unlink(struct inode *, struct dentry *, bool); void bch2_vfs_exit(void); int bch2_vfs_init(void); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c 
index a61d380a47b6..6b3eecdef81a 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -256,7 +256,7 @@ retry: /* Subvolume root? */ if (inode_u.bi_subvol) { - ret = bch2_subvolume_delete(trans, inode_u.bi_subvol, -1); + ret = bch2_subvolume_delete(trans, inode_u.bi_subvol); if (ret) goto err; } @@ -992,12 +992,28 @@ static int check_subvols(struct bch_fs *c) struct btree_trans trans; struct btree_iter iter; struct bkey_s_c k; + struct bkey_s_c_subvolume subvol; int ret; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); for_each_btree_key(&trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_subvolume) + continue; + + subvol = bkey_s_c_to_subvolume(k); + + if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { + ret = __bch2_trans_do(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW, + bch2_subvolume_delete(&trans, iter.pos.offset)); + if (ret) { + bch_err(c, "error deleting subvolume %llu: %i", + iter.pos.offset, ret); + break; + } + } } bch2_trans_iter_exit(&trans, &iter); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 7fccf842a46b..3ae321a99cee 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -709,11 +709,7 @@ retry: bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u); /* Subvolume root? 
*/ - if (inode_u.bi_subvol) { - ret = bch2_subvolume_delete(&trans, inode_u.bi_subvol, -1); - if (ret) - goto err; - } + BUG_ON(inode_u.bi_subvol); bkey_inode_generation_init(&delete.k_i); delete.k.p = iter.pos; diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 9bd8d61c96fe..58cda98989b1 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -4,6 +4,7 @@ #include "btree_key_cache.h" #include "btree_update.h" #include "error.h" +#include "fs.h" #include "subvolume.h" /* Snapshot tree: */ @@ -541,13 +542,6 @@ err: return ret; } -/* List of snapshot IDs that are being deleted: */ -struct snapshot_id_list { - u32 nr; - u32 size; - u32 *d; -}; - static bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id) { unsigned i; @@ -819,9 +813,11 @@ int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol, return ret; } -/* XXX: mark snapshot id for deletion, walk btree and delete: */ -int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid, - int deleting_snapshot) +/* + * Delete subvolume, mark snapshot ID as deleted, queue up snapshot + * deletion/cleanup: + */ +int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) { struct btree_iter iter; struct bkey_s_c k; @@ -849,12 +845,6 @@ int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid, subvol = bkey_s_c_to_subvolume(k); snapid = le32_to_cpu(subvol.v->snapshot); - if (deleting_snapshot >= 0 && - deleting_snapshot != BCH_SUBVOLUME_SNAP(subvol.v)) { - ret = -ENOENT; - goto err; - } - delete = bch2_trans_kmalloc(trans, sizeof(*delete)); ret = PTR_ERR_OR_ZERO(delete); if (ret) @@ -880,6 +870,163 @@ err: return ret; } +static void bch2_evict_subvolume_inodes(struct bch_fs *c, + struct snapshot_id_list *s) +{ + struct super_block *sb = c->vfs_sb; + struct inode *inode; + + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || + (inode->i_state & 
I_FREEING)) + continue; + + d_mark_dontcache(inode); + d_prune_aliases(inode); + } + spin_unlock(&sb->s_inode_list_lock); +again: + cond_resched(); + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || + (inode->i_state & I_FREEING)) + continue; + + if (!(inode->i_state & I_DONTCACHE)) { + d_mark_dontcache(inode); + d_prune_aliases(inode); + } + + spin_lock(&inode->i_lock); + if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) && + !(inode->i_state & I_FREEING)) { + wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_NEW); + DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); + spin_unlock(&sb->s_inode_list_lock); + schedule(); + finish_wait(wq, &wait.wq_entry); + goto again; + } + + spin_unlock(&inode->i_lock); + } + spin_unlock(&sb->s_inode_list_lock); +} + +void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, + snapshot_wait_for_pagecache_and_delete_work); + struct snapshot_id_list s; + u32 *id; + int ret = 0; + + while (!ret) { + mutex_lock(&c->snapshots_unlinked_lock); + s = c->snapshots_unlinked; + memset(&c->snapshots_unlinked, 0, sizeof(c->snapshots_unlinked)); + mutex_unlock(&c->snapshots_unlinked_lock); + + if (!s.nr) + break; + + bch2_evict_subvolume_inodes(c, &s); + + for (id = s.d; id < s.d + s.nr; id++) { + ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_subvolume_delete(&trans, *id)); + if (ret) { + bch_err(c, "error %i deleting subvolume %u", ret, *id); + break; + } + } + + kfree(s.d); + } + + percpu_ref_put(&c->writes); +} + +struct subvolume_unlink_hook { + struct btree_trans_commit_hook h; + u32 subvol; +}; + +int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans, + struct btree_trans_commit_hook *_h) +{ + struct 
subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h); + struct bch_fs *c = trans->c; + int ret = 0; + + mutex_lock(&c->snapshots_unlinked_lock); + if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol)) + ret = snapshot_id_add(&c->snapshots_unlinked, h->subvol); + mutex_unlock(&c->snapshots_unlinked_lock); + + if (ret) + return ret; + + if (unlikely(!percpu_ref_tryget(&c->writes))) + return -EROFS; + + if (!queue_work(system_long_wq, &c->snapshot_wait_for_pagecache_and_delete_work)) + percpu_ref_put(&c->writes); + return 0; +} + +int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_subvolume *n; + struct subvolume_unlink_hook *h; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolumes, + POS(0, subvolid), + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_subvolume) { + bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvolid); + ret = -EIO; + goto err; + } + + n = bch2_trans_kmalloc(trans, sizeof(*n)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + bkey_reassemble(&n->k_i, k); + SET_BCH_SUBVOLUME_UNLINKED(&n->v, true); + + ret = bch2_trans_update(trans, &iter, &n->k_i, 0); + if (ret) + goto err; + + h = bch2_trans_kmalloc(trans, sizeof(*h)); + ret = PTR_ERR_OR_ZERO(h); + if (ret) + goto err; + + h->h.fn = bch2_subvolume_wait_for_pagecache_and_delete_hook; + h->subvol = subvolid; + bch2_trans_commit_hook(trans, &h->h); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + int bch2_subvolume_create(struct btree_trans *trans, u64 inode, u32 src_subvolid, u32 *new_subvolid, @@ -977,5 +1124,8 @@ err: int bch2_fs_subvolumes_init(struct bch_fs *c) { INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work); + INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work, + 
bch2_subvolume_wait_for_pagecache_and_delete); + mutex_init(&c->snapshots_unlinked_lock); return 0; } diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index f98c8c0dbea2..45234c9de0f6 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -2,6 +2,8 @@ #ifndef _BCACHEFS_SUBVOLUME_H #define _BCACHEFS_SUBVOLUME_H +#include "subvolume_types.h" + void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); const char *bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c); @@ -108,7 +110,8 @@ int bch2_subvolume_get(struct btree_trans *, unsigned, bool, int, struct bch_subvolume *); int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); -int bch2_subvolume_delete(struct btree_trans *, u32, int); +int bch2_subvolume_delete(struct btree_trans *, u32); +int bch2_subvolume_unlink(struct btree_trans *, u32); int bch2_subvolume_create(struct btree_trans *, u64, u32, u32 *, u32 *, bool); diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h new file mode 100644 index 000000000000..9410b9587591 --- /dev/null +++ b/fs/bcachefs/subvolume_types.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUBVOLUME_TYPES_H +#define _BCACHEFS_SUBVOLUME_TYPES_H + +struct snapshot_id_list { + u32 nr; + u32 size; + u32 *d; +}; + +#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ -- cgit v1.2.3 From 41f9b7d39fb11c9f306809681bb6991ac96f9b2e Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 28 Oct 2021 16:24:39 -0400 Subject: bcachefs: Move bch2_evict_subvolume_inodes() to fs.c This fixes building in userspace - code that's coupled to the kernel VFS interface should live in fs.c Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 54 ++++++++++++++++++++++++++++++++++++++++------ fs/bcachefs/fs.h | 4 ++++ fs/bcachefs/subvolume.c | 57 ------------------------------------------------- fs/bcachefs/subvolume.h | 10 +++++++++ 4 files changed, 61 insertions(+), 64 deletions(-) (limited to 
'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index c325e5c4325c..7647e117013d 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1292,12 +1292,6 @@ static int bch2_vfs_write_inode(struct inode *vinode, return ret; } -static int bch2_drop_inode(struct inode *vinode) -{ - - return generic_drop_inode(vinode); -} - static void bch2_evict_inode(struct inode *vinode) { struct bch_fs *c = vinode->i_sb->s_fs_info; @@ -1318,6 +1312,53 @@ static void bch2_evict_inode(struct inode *vinode) } } +void bch2_evict_subvolume_inodes(struct bch_fs *c, + struct snapshot_id_list *s) +{ + struct super_block *sb = c->vfs_sb; + struct inode *inode; + + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || + (inode->i_state & I_FREEING)) + continue; + + d_mark_dontcache(inode); + d_prune_aliases(inode); + } + spin_unlock(&sb->s_inode_list_lock); +again: + cond_resched(); + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || + (inode->i_state & I_FREEING)) + continue; + + if (!(inode->i_state & I_DONTCACHE)) { + d_mark_dontcache(inode); + d_prune_aliases(inode); + } + + spin_lock(&inode->i_lock); + if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) && + !(inode->i_state & I_FREEING)) { + wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_NEW); + DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + spin_unlock(&inode->i_lock); + spin_unlock(&sb->s_inode_list_lock); + schedule(); + finish_wait(wq, &wait.wq_entry); + goto again; + } + + spin_unlock(&inode->i_lock); + } + spin_unlock(&sb->s_inode_list_lock); +} + static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; @@ -1502,7 +1543,6 @@ static const struct super_operations 
bch_super_operations = { .alloc_inode = bch2_alloc_inode, .destroy_inode = bch2_destroy_inode, .write_inode = bch2_vfs_write_inode, - .drop_inode = bch2_drop_inode, .evict_inode = bch2_evict_inode, .sync_fs = bch2_sync_fs, .statfs = bch2_statfs, diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 2616b15eb51c..38c04282da64 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -185,11 +185,15 @@ int bch2_setattr_nonsize(struct mnt_idmap *, struct iattr *); int __bch2_unlink(struct inode *, struct dentry *, bool); +void bch2_evict_subvolume_inodes(struct bch_fs *, struct snapshot_id_list *); + void bch2_vfs_exit(void); int bch2_vfs_init(void); #else +static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, + struct snapshot_id_list *s) {} static inline void bch2_vfs_exit(void) {} static inline int bch2_vfs_init(void) { return 0; } diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 58cda98989b1..4d385c9e9268 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -542,16 +542,6 @@ err: return ret; } -static bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id) -{ - unsigned i; - - for (i = 0; i < s->nr; i++) - if (id == s->d[i]) - return true; - return false; -} - static int snapshot_id_add(struct snapshot_id_list *s, u32 id) { BUG_ON(snapshot_list_has_id(s, id)); @@ -870,53 +860,6 @@ err: return ret; } -static void bch2_evict_subvolume_inodes(struct bch_fs *c, - struct snapshot_id_list *s) -{ - struct super_block *sb = c->vfs_sb; - struct inode *inode; - - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || - (inode->i_state & I_FREEING)) - continue; - - d_mark_dontcache(inode); - d_prune_aliases(inode); - } - spin_unlock(&sb->s_inode_list_lock); -again: - cond_resched(); - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (!snapshot_list_has_id(s, 
to_bch_ei(inode)->ei_subvol) || - (inode->i_state & I_FREEING)) - continue; - - if (!(inode->i_state & I_DONTCACHE)) { - d_mark_dontcache(inode); - d_prune_aliases(inode); - } - - spin_lock(&inode->i_lock); - if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) && - !(inode->i_state & I_FREEING)) { - wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_NEW); - DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_list_lock); - schedule(); - finish_wait(wq, &wait.wq_entry); - goto again; - } - - spin_unlock(&inode->i_lock); - } - spin_unlock(&sb->s_inode_list_lock); -} - void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index 45234c9de0f6..b5067dc68fc7 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -94,6 +94,16 @@ static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, return 0; } +static inline bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id) +{ + unsigned i; + + for (i = 0; i < s->nr; i++) + if (id == s->d[i]) + return true; + return false; +} + int bch2_fs_snapshots_check(struct bch_fs *); void bch2_fs_snapshots_exit(struct bch_fs *); int bch2_fs_snapshots_start(struct bch_fs *); -- cgit v1.2.3 From 68a2054d88f7cd2866806148d9a2e4389eb46992 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 5 Nov 2021 15:17:13 -0400 Subject: bcachefs: Switch fsync to use bi_journal_seq Now that we're recording in each inode the journal sequence number of the most recent update, fsync becomes a lot simpler and we can delete all the plumbing for ei_journal_seq. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 3 +-- fs/bcachefs/fs-io.c | 58 +++++++++++++++++++++++++-------------------------- fs/bcachefs/fs.c | 52 +++++++-------------------------------------- fs/bcachefs/fs.h | 1 - fs/bcachefs/io.c | 9 ++++---- fs/bcachefs/io.h | 10 ++------- fs/bcachefs/reflink.c | 8 +++---- fs/bcachefs/reflink.h | 2 +- fs/bcachefs/xattr.c | 18 +++++++++++++++- 9 files changed, 65 insertions(+), 96 deletions(-) (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 2afa15b26700..51a0b48a5313 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -330,8 +330,7 @@ retry: inode_u.bi_mode = mode; ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, - &inode->ei_journal_seq, 0); + bch2_trans_commit(&trans, NULL, NULL, 0); btree_err: bch2_trans_iter_exit(&trans, &inode_iter); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index f4c97fc0e3d1..7de6b7a7aa60 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -1096,7 +1096,6 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, op = &w->io->op; bch2_write_op_init(op, c, w->opts); op->target = w->opts.foreground_target; - op_journal_seq_set(op, &inode->ei_journal_seq); op->nr_replicas = nr_replicas; op->res.nr_replicas = nr_replicas; op->write_point = writepoint_hashed(inode->ei_last_dirtied); @@ -1947,7 +1946,6 @@ static long bch2_dio_write_loop(struct dio_write *dio) bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode)); dio->op.end_io = bch2_dio_write_loop_async; dio->op.target = dio->op.opts.foreground_target; - op_journal_seq_set(&dio->op, &inode->ei_journal_seq); dio->op.write_point = writepoint_hashed((unsigned long) current); dio->op.nr_replicas = dio->op.opts.data_replicas; dio->op.subvol = inode->ei_subvol; @@ -2164,29 +2162,36 @@ unlock: /* fsync: */ -int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) +/* + * inode->ei_inode.bi_journal_seq won't be up to date 
since it's set in an + * insert trigger: look up the btree inode instead + */ +static int bch2_flush_inode(struct bch_fs *c, subvol_inum inum) { - struct bch_inode_info *inode = file_bch_inode(file); - struct bch_fs *c = inode->v.i_sb->s_fs_info; - int ret, ret2; + struct bch_inode_unpacked inode; + int ret; - ret = file_write_and_wait_range(file, start, end); + if (c->opts.journal_flush_disabled) + return 0; + + ret = bch2_inode_find_by_inum(c, inum, &inode); if (ret) return ret; - if (datasync && !(inode->v.i_state & I_DIRTY_DATASYNC)) - goto out; + return bch2_journal_flush_seq(&c->journal, inode.bi_journal_seq); +} - ret = sync_inode_metadata(&inode->v, 1); - if (ret) - return ret; -out: - if (!c->opts.journal_flush_disabled) - ret = bch2_journal_flush_seq(&c->journal, - inode->ei_journal_seq); - ret2 = file_check_and_advance_wb_err(file); +int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + int ret, ret2, ret3; + + ret = file_write_and_wait_range(file, start, end); + ret2 = sync_inode_metadata(&inode->v, 1); + ret3 = bch2_flush_inode(c, inode_inum(inode)); - return ret ?: ret2; + return ret ?: ret2 ?: ret3; } /* truncate: */ @@ -2448,7 +2453,7 @@ int bch2_truncate(struct mnt_idmap *idmap, ret = bch2_fpunch(c, inode_inum(inode), round_up(iattr->ia_size, block_bytes(c)) >> 9, - U64_MAX, &inode->ei_journal_seq, &i_sectors_delta); + U64_MAX, &i_sectors_delta); i_sectors_acct(c, inode, NULL, i_sectors_delta); if (unlikely(ret)) @@ -2508,7 +2513,6 @@ static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len ret = bch2_fpunch(c, inode_inum(inode), discard_start, discard_end, - &inode->ei_journal_seq, &i_sectors_delta); i_sectors_acct(c, inode, NULL, i_sectors_delta); } @@ -2587,7 +2591,6 @@ static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, ret = bch2_fpunch(c, inode_inum(inode), offset >> 9, (offset + 
len) >> 9, - &inode->ei_journal_seq, &i_sectors_delta); i_sectors_acct(c, inode, NULL, i_sectors_delta); @@ -2691,8 +2694,7 @@ reassemble: ret = bch2_btree_iter_traverse(&del) ?: bch2_trans_update(&trans, &del, &delete, trigger_flags) ?: bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?: - bch2_trans_commit(&trans, &disk_res, - &inode->ei_journal_seq, + bch2_trans_commit(&trans, &disk_res, NULL, BTREE_INSERT_NOFAIL); bch2_disk_reservation_put(c, &disk_res); @@ -2803,7 +2805,7 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, ret = bch2_extent_update(&trans, inode_inum(inode), &iter, &reservation.k_i, - &disk_res, &inode->ei_journal_seq, + &disk_res, NULL, 0, &i_sectors_delta, true); i_sectors_acct(c, inode, &quota_res, i_sectors_delta); bkey_err: @@ -3003,7 +3005,6 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, inode_inum(dst), pos_dst >> 9, inode_inum(src), pos_src >> 9, aligned_len >> 9, - &dst->ei_journal_seq, pos_dst + len, &i_sectors_delta); if (ret < 0) goto err; @@ -3021,10 +3022,9 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, i_size_write(&dst->v, pos_dst + ret); spin_unlock(&dst->v.i_lock); - if (((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || - IS_SYNC(file_inode(file_dst))) && - !c->opts.journal_flush_disabled) - ret = bch2_journal_flush_seq(&c->journal, dst->ei_journal_seq); + if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || + IS_SYNC(file_inode(file_dst))) + ret = bch2_flush_inode(c, inode_inum(dst)); err: bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 12178bd15c34..92919b16f2f5 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -41,25 +41,6 @@ static void bch2_vfs_inode_init(struct bch_fs *, subvol_inum, struct bch_inode_info *, struct bch_inode_unpacked *); -static void journal_seq_copy(struct bch_fs *c, - struct bch_inode_info *dst, - u64 journal_seq) -{ - /* - * atomic64_cmpxchg has a fallback for
archs that don't support it, - * cmpxchg does not: - */ - atomic64_t *dst_seq = (void *) &dst->ei_journal_seq; - u64 old, v = READ_ONCE(dst->ei_journal_seq); - - do { - old = v; - - if (old >= journal_seq) - break; - } while ((v = atomic64_cmpxchg(dst_seq, old, journal_seq)) != old); -} - static void __pagecache_lock_put(struct pagecache_lock *lock, long i) { BUG_ON(atomic_long_read(&lock->v) == 0); @@ -152,9 +133,7 @@ retry: BTREE_ITER_INTENT) ?: (set ? set(inode, &inode_u, p) : 0) ?: bch2_inode_write(&trans, &iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, - &inode->ei_journal_seq, - BTREE_INSERT_NOFAIL); + bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); /* * the btree node lock protects inode->ei_inode, not ei_update_lock; @@ -329,7 +308,6 @@ err_before_quota: if (!(flags & BCH_CREATE_TMPFILE)) { bch2_inode_update_after_write(c, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); - journal_seq_copy(c, dir, journal_seq); mutex_unlock(&dir->ei_update_lock); } @@ -337,7 +315,6 @@ err_before_quota: inum.inum = inode_u.bi_inum; bch2_vfs_inode_init(c, inum, inode, &inode_u); - journal_seq_copy(c, inode, journal_seq); set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); @@ -362,7 +339,6 @@ err_before_quota: * We raced, another process pulled the new inode into cache * before us: */ - journal_seq_copy(c, old, journal_seq); make_bad_inode(&inode->v); iput(&inode->v); @@ -446,7 +422,7 @@ static int __bch2_link(struct bch_fs *c, mutex_lock(&inode->ei_update_lock); bch2_trans_init(&trans, c, 4, 1024); - ret = __bch2_trans_do(&trans, NULL, &inode->ei_journal_seq, 0, + ret = __bch2_trans_do(&trans, NULL, NULL, 0, bch2_link_trans(&trans, inode_inum(dir), &dir_u, inode_inum(inode), &inode_u, @@ -455,7 +431,6 @@ static int __bch2_link(struct bch_fs *c, if (likely(!ret)) { BUG_ON(inode_u.bi_inum != inode->v.i_ino); - journal_seq_copy(c, inode, dir->ei_journal_seq); bch2_inode_update_after_write(c, dir, &dir_u, 
ATTR_MTIME|ATTR_CTIME); bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); @@ -498,7 +473,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); bch2_trans_init(&trans, c, 4, 1024); - ret = __bch2_trans_do(&trans, NULL, &dir->ei_journal_seq, + ret = __bch2_trans_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL, bch2_unlink_trans(&trans, inode_inum(dir), &dir_u, @@ -508,7 +483,6 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, if (likely(!ret)) { BUG_ON(inode_u.bi_inum != inode->v.i_ino); - journal_seq_copy(c, inode, dir->ei_journal_seq); bch2_inode_update_after_write(c, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); bch2_inode_update_after_write(c, inode, &inode_u, @@ -550,8 +524,6 @@ static int bch2_symlink(struct mnt_idmap *idmap, if (unlikely(ret)) goto err; - journal_seq_copy(c, dir, inode->ei_journal_seq); - ret = __bch2_link(c, inode, dir, dentry); if (unlikely(ret)) goto err; @@ -586,7 +558,6 @@ static int bch2_rename2(struct mnt_idmap *idmap, ? BCH_RENAME_EXCHANGE : dst_dentry->d_inode ? 
BCH_RENAME_OVERWRITE : BCH_RENAME; - u64 journal_seq = 0; int ret; if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) @@ -626,7 +597,7 @@ static int bch2_rename2(struct mnt_idmap *idmap, goto err; } - ret = __bch2_trans_do(&trans, NULL, &journal_seq, 0, + ret = __bch2_trans_do(&trans, NULL, NULL, 0, bch2_rename_trans(&trans, inode_inum(src_dir), &src_dir_u, inode_inum(dst_dir), &dst_dir_u, @@ -644,23 +615,17 @@ static int bch2_rename2(struct mnt_idmap *idmap, bch2_inode_update_after_write(c, src_dir, &src_dir_u, ATTR_MTIME|ATTR_CTIME); - journal_seq_copy(c, src_dir, journal_seq); - if (src_dir != dst_dir) { + if (src_dir != dst_dir) bch2_inode_update_after_write(c, dst_dir, &dst_dir_u, ATTR_MTIME|ATTR_CTIME); - journal_seq_copy(c, dst_dir, journal_seq); - } bch2_inode_update_after_write(c, src_inode, &src_inode_u, ATTR_CTIME); - journal_seq_copy(c, src_inode, journal_seq); - if (dst_inode) { + if (dst_inode) bch2_inode_update_after_write(c, dst_inode, &dst_inode_u, ATTR_CTIME); - journal_seq_copy(c, dst_inode, journal_seq); - } err: bch2_trans_exit(&trans); @@ -767,8 +732,7 @@ retry: } ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: - bch2_trans_commit(&trans, NULL, - &inode->ei_journal_seq, + bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); btree_err: bch2_trans_iter_exit(&trans, &inode_iter); @@ -1203,7 +1167,6 @@ static void bch2_vfs_inode_init(struct bch_fs *c, subvol_inum inum, inode->v.i_size = bi->bi_size; inode->ei_flags = 0; - inode->ei_journal_seq = bi->bi_journal_seq; inode->ei_quota_reserved = 0; inode->ei_qid = bch_qid(bi); inode->ei_subvol = inum.subvol; @@ -1242,7 +1205,6 @@ static struct inode *bch2_alloc_inode(struct super_block *sb) mutex_init(&inode->ei_update_lock); pagecache_lock_init(&inode->ei_pagecache_lock); mutex_init(&inode->ei_quota_lock); - inode->ei_journal_seq = 0; return &inode->v; } diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 38c04282da64..1c8936df9fbb 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h 
@@ -36,7 +36,6 @@ struct bch_inode_info { unsigned long ei_flags; struct mutex ei_update_lock; - u64 ei_journal_seq; u64 ei_quota_reserved; unsigned long ei_last_dirtied; struct pagecache_lock ei_pagecache_lock; diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 0a9cb4d489f4..dc41286c229e 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -393,7 +393,7 @@ err: */ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, subvol_inum inum, u64 end, - u64 *journal_seq, s64 *i_sectors_delta) + s64 *i_sectors_delta) { struct bch_fs *c = trans->c; unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); @@ -431,7 +431,7 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, bch2_cut_back(end_pos, &delete); ret = bch2_extent_update(trans, inum, iter, &delete, - &disk_res, journal_seq, + &disk_res, NULL, 0, i_sectors_delta, false); bch2_disk_reservation_put(c, &disk_res); btree_err: @@ -450,7 +450,7 @@ btree_err: } int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, - u64 *journal_seq, s64 *i_sectors_delta) + s64 *i_sectors_delta) { struct btree_trans trans; struct btree_iter iter; @@ -461,8 +461,7 @@ int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, POS(inum.inum, start), BTREE_ITER_INTENT); - ret = bch2_fpunch_at(&trans, &iter, inum, end, - journal_seq, i_sectors_delta); + ret = bch2_fpunch_at(&trans, &iter, inum, end, i_sectors_delta); bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index ebb0944b4ca3..8be77561badb 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -68,12 +68,6 @@ static inline u64 *op_journal_seq(struct bch_write_op *op) ? 
op->journal_seq_p : &op->journal_seq; } -static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq) -{ - op->journal_seq_p = journal_seq; - op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR; -} - static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) { return op->alloc_reserve == RESERVE_MOVINGGC @@ -88,8 +82,8 @@ int bch2_extent_update(struct btree_trans *, subvol_inum, struct disk_reservation *, u64 *, u64, s64 *, bool); int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, - subvol_inum, u64, u64 *, s64 *); -int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, u64 *, s64 *); + subvol_inum, u64, s64 *); +int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, struct bch_io_opts opts) diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 8e66e6390e62..d003f4088dfc 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -210,7 +210,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) s64 bch2_remap_range(struct bch_fs *c, subvol_inum dst_inum, u64 dst_offset, subvol_inum src_inum, u64 src_offset, - u64 remap_sectors, u64 *journal_seq, + u64 remap_sectors, u64 new_i_size, s64 *i_sectors_delta) { struct btree_trans trans; @@ -281,7 +281,7 @@ s64 bch2_remap_range(struct bch_fs *c, min(dst_end.offset, dst_iter.pos.offset + src_iter.pos.offset - src_want.offset), - journal_seq, i_sectors_delta); + i_sectors_delta); continue; } @@ -320,7 +320,7 @@ s64 bch2_remap_range(struct bch_fs *c, dst_end.offset - dst_iter.pos.offset)); ret = bch2_extent_update(&trans, dst_inum, &dst_iter, - new_dst.k, &disk_res, journal_seq, + new_dst.k, &disk_res, NULL, new_i_size, i_sectors_delta, true); bch2_disk_reservation_put(c, &disk_res); @@ -347,7 +347,7 @@ s64 bch2_remap_range(struct bch_fs *c, inode_u.bi_size < new_i_size) { inode_u.bi_size = new_i_size; ret2 = bch2_inode_write(&trans, &inode_iter, 
&inode_u) ?: - bch2_trans_commit(&trans, NULL, journal_seq, 0); + bch2_trans_commit(&trans, NULL, NULL, 0); } bch2_trans_iter_exit(&trans, &inode_iter); diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h index 4c1b82860b0b..3745873fd88d 100644 --- a/fs/bcachefs/reflink.h +++ b/fs/bcachefs/reflink.h @@ -58,6 +58,6 @@ static inline __le64 *bkey_refcount(struct bkey_i *k) } s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64, - subvol_inum, u64, u64, u64 *, u64, s64 *); + subvol_inum, u64, u64, u64, s64 *); #endif /* _BCACHEFS_REFLINK_H */ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 181af89b0553..21823ce69237 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -165,8 +165,24 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, const char *name, const void *value, size_t size, int type, int flags) { + struct btree_iter inode_iter = { NULL }; + struct bch_inode_unpacked inode_u; int ret; + /* + * We need to do an inode update so that bi_journal_seq gets updated + * and fsync works: + * + * Perhaps we should be updating bi_mtime too? 
+ */ + + ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, BTREE_ITER_INTENT) ?: + bch2_inode_write(trans, &inode_iter, &inode_u); + bch2_trans_iter_exit(trans, &inode_iter); + + if (ret) + return ret; + if (value) { struct bkey_i_xattr *xattr; unsigned namelen = strlen(name); @@ -352,7 +368,7 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); - return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0, + return bch2_trans_do(c, NULL, NULL, 0, bch2_xattr_set(&trans, inode_inum(inode), &hash, name, value, size, handler->flags, flags)); -- cgit v1.2.3 From 32b26e8c7f6418b2d8bd404c7482c44141ba52e5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 6 Nov 2021 00:03:40 -0400 Subject: bcachefs: bch2_assert_pos_locked() This adds a new assertion to be used by bch2_inode_update_after_write(), which updates the VFS inode based on the update to the btree inode we just did - we require that the btree inode still be locked when we do that update. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/acl.c | 2 +- fs/bcachefs/btree_iter.c | 45 ++++++++++++++++++++++++++++++++++--- fs/bcachefs/btree_iter.h | 4 ++++ fs/bcachefs/fs.c | 58 +++++++++++++++++++++++++++++------------------- fs/bcachefs/fs.h | 2 +- fs/bcachefs/inode.c | 6 ++--- fs/bcachefs/inode.h | 2 ++ 7 files changed, 88 insertions(+), 31 deletions(-) (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 51a0b48a5313..00cd40a8d7fa 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -339,7 +339,7 @@ btree_err: if (unlikely(ret)) goto err; - bch2_inode_update_after_write(c, inode, &inode_u, + bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME|ATTR_MODE); set_cached_acl(&inode->v, type, acl); diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 94ba43626cde..1ad81cad36f1 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -46,7 +46,7 @@ static inline int __btree_path_cmp(const struct btree_path *l, unsigned r_level) { return cmp_int(l->btree_id, r_btree_id) ?: - cmp_int(l->cached, r_cached) ?: + cmp_int((int) l->cached, (int) r_cached) ?: bpos_cmp(l->pos, r_pos) ?: -cmp_int(l->level, r_level); } @@ -762,6 +762,43 @@ out: return ret; } +void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, + struct bpos pos, bool key_cache) +{ + struct btree_path *path; + unsigned idx; + char buf[100]; + + trans_for_each_path_inorder(trans, path, idx) { + int cmp = cmp_int(path->btree_id, id) ?: + cmp_int(path->cached, key_cache); + + if (cmp > 0) + break; + if (cmp < 0) + continue; + + if (!(path->nodes_locked & 1) || + !path->should_be_locked) + continue; + + if (!key_cache) { + if (bkey_cmp(pos, path->l[0].b->data->min_key) >= 0 && + bkey_cmp(pos, path->l[0].b->key.k.p) <= 0) + return; + } else { + if (!bkey_cmp(pos, path->pos)) + return; + } + } + + bch2_dump_trans_paths_updates(trans); + panic("not locked: %s %s%s\n", + bch2_btree_ids[id], + 
(bch2_bpos_to_text(&PBUF(buf), pos), buf), + key_cache ? " cached" : ""); +} + #else static inline void bch2_btree_path_verify_level(struct btree_trans *trans, @@ -1720,11 +1757,13 @@ void bch2_dump_trans_paths_updates(struct btree_trans *trans) btree_trans_sort_paths(trans); trans_for_each_path_inorder(trans, path, idx) - printk(KERN_ERR "path: idx %u ref %u:%u%s btree %s pos %s %pS\n", + printk(KERN_ERR "path: idx %u ref %u:%u%s%s btree %s pos %s locks %u %pS\n", path->idx, path->ref, path->intent_ref, - path->preserve ? " preserve" : "", + path->should_be_locked ? " S" : "", + path->preserve ? " P" : "", bch2_btree_ids[path->btree_id], (bch2_bpos_to_text(&PBUF(buf1), path->pos), buf1), + path->nodes_locked, #ifdef CONFIG_BCACHEFS_DEBUG (void *) path->ip_allocated #else diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index c71e42a782d6..72b9605cf3e7 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -166,9 +166,13 @@ inline struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bke #ifdef CONFIG_BCACHEFS_DEBUG void bch2_trans_verify_paths(struct btree_trans *); void bch2_trans_verify_locks(struct btree_trans *); +void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, + struct bpos, bool); #else static inline void bch2_trans_verify_paths(struct btree_trans *trans) {} static inline void bch2_trans_verify_locks(struct btree_trans *trans) {} +static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, + struct bpos pos, bool key_cache) {} #endif void bch2_btree_path_fix_key_modified(struct btree_trans *trans, diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 92919b16f2f5..5596081b93c1 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -37,7 +37,7 @@ static struct kmem_cache *bch2_inode_cache; -static void bch2_vfs_inode_init(struct bch_fs *, subvol_inum, +static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, struct bch_inode_info *, struct 
bch_inode_unpacked *); @@ -93,11 +93,19 @@ void bch2_pagecache_block_get(struct pagecache_lock *lock) __pagecache_lock_get(lock, -1); } -void bch2_inode_update_after_write(struct bch_fs *c, +void bch2_inode_update_after_write(struct btree_trans *trans, struct bch_inode_info *inode, struct bch_inode_unpacked *bi, unsigned fields) { + struct bch_fs *c = trans->c; + + BUG_ON(bi->bi_inum != inode->v.i_ino); + + bch2_assert_pos_locked(trans, BTREE_ID_inodes, + POS(0, bi->bi_inum), + 0 && c->opts.inodes_use_key_cache); + set_nlink(&inode->v, bch2_inode_nlink_get(bi)); i_uid_write(&inode->v, bi->bi_uid); i_gid_write(&inode->v, bi->bi_gid); @@ -126,6 +134,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, int ret; bch2_trans_init(&trans, c, 0, 512); + trans.ip = _RET_IP_; retry: bch2_trans_begin(&trans); @@ -140,7 +149,7 @@ retry: * this is important for inode updates via bchfs_write_index_update */ if (!ret) - bch2_inode_update_after_write(c, inode, &inode_u, fields); + bch2_inode_update_after_write(&trans, inode, &inode_u, fields); bch2_trans_iter_exit(&trans, &iter); @@ -215,6 +224,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) { struct bch_inode_unpacked inode_u; struct bch_inode_info *inode; + struct btree_trans trans; int ret; inode = to_bch_ei(iget5_locked(c->vfs_sb, @@ -227,14 +237,19 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) if (!(inode->v.i_state & I_NEW)) return &inode->v; - ret = bch2_inode_find_by_inum(c, inum, &inode_u); + bch2_trans_init(&trans, c, 8, 0); + ret = lockrestart_do(&trans, + bch2_inode_find_by_inum_trans(&trans, inum, &inode_u)); + + if (!ret) + bch2_vfs_inode_init(&trans, inum, inode, &inode_u); + bch2_trans_exit(&trans); + if (ret) { iget_failed(&inode->v); return ERR_PTR(ret); } - bch2_vfs_inode_init(c, inum, inode, &inode_u); - unlock_new_inode(&inode->v); return &inode->v; @@ -306,7 +321,7 @@ err_before_quota: } if (!(flags & BCH_CREATE_TMPFILE)) { - 
bch2_inode_update_after_write(c, dir, &dir_u, + bch2_inode_update_after_write(&trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); mutex_unlock(&dir->ei_update_lock); } @@ -314,7 +329,8 @@ err_before_quota: inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; inum.inum = inode_u.bi_inum; - bch2_vfs_inode_init(c, inum, inode, &inode_u); + bch2_iget5_set(&inode->v, &inum); + bch2_vfs_inode_init(&trans, inum, inode, &inode_u); set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); @@ -429,11 +445,9 @@ static int __bch2_link(struct bch_fs *c, &dentry->d_name)); if (likely(!ret)) { - BUG_ON(inode_u.bi_inum != inode->v.i_ino); - - bch2_inode_update_after_write(c, dir, &dir_u, + bch2_inode_update_after_write(&trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); - bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); + bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME); } bch2_trans_exit(&trans); @@ -481,11 +495,9 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, deleting_snapshot)); if (likely(!ret)) { - BUG_ON(inode_u.bi_inum != inode->v.i_ino); - - bch2_inode_update_after_write(c, dir, &dir_u, + bch2_inode_update_after_write(&trans, dir, &dir_u, ATTR_MTIME|ATTR_CTIME); - bch2_inode_update_after_write(c, inode, &inode_u, + bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_MTIME); } @@ -613,18 +625,18 @@ static int bch2_rename2(struct mnt_idmap *idmap, BUG_ON(dst_inode && dst_inode->v.i_ino != dst_inode_u.bi_inum); - bch2_inode_update_after_write(c, src_dir, &src_dir_u, + bch2_inode_update_after_write(&trans, src_dir, &src_dir_u, ATTR_MTIME|ATTR_CTIME); if (src_dir != dst_dir) - bch2_inode_update_after_write(c, dst_dir, &dst_dir_u, + bch2_inode_update_after_write(&trans, dst_dir, &dst_dir_u, ATTR_MTIME|ATTR_CTIME); - bch2_inode_update_after_write(c, src_inode, &src_inode_u, + bch2_inode_update_after_write(&trans, src_inode, &src_inode_u, ATTR_CTIME); if (dst_inode) - 
bch2_inode_update_after_write(c, dst_inode, &dst_inode_u, + bch2_inode_update_after_write(&trans, dst_inode, &dst_inode_u, ATTR_CTIME); err: bch2_trans_exit(&trans); @@ -742,7 +754,7 @@ btree_err: if (unlikely(ret)) goto err_trans; - bch2_inode_update_after_write(c, inode, &inode_u, attr->ia_valid); + bch2_inode_update_after_write(&trans, inode, &inode_u, attr->ia_valid); if (acl) set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); @@ -1154,11 +1166,11 @@ static const struct export_operations bch_export_ops = { //.get_parent = bch2_get_parent, }; -static void bch2_vfs_inode_init(struct bch_fs *c, subvol_inum inum, +static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, struct bch_inode_info *inode, struct bch_inode_unpacked *bi) { - bch2_inode_update_after_write(c, inode, bi, ~0); + bch2_inode_update_after_write(trans, inode, bi, ~0); inode->v.i_blocks = bi->bi_sectors; inode->v.i_ino = bi->bi_inum; diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 1c8936df9fbb..530238780a88 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -172,7 +172,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum); typedef int (*inode_set_fn)(struct bch_inode_info *, struct bch_inode_unpacked *, void *); -void bch2_inode_update_after_write(struct bch_fs *, +void bch2_inode_update_after_write(struct btree_trans *, struct bch_inode_info *, struct bch_inode_unpacked *, unsigned); diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 728545141a39..a24bbc5228c1 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -722,9 +722,9 @@ err: return ret; } -static int bch2_inode_find_by_inum_trans(struct btree_trans *trans, - subvol_inum inum, - struct bch_inode_unpacked *inode) +int bch2_inode_find_by_inum_trans(struct btree_trans *trans, + subvol_inum inum, + struct bch_inode_unpacked *inode) { struct btree_iter iter; int ret; diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index d433d48de4e0..723186d8afb6 100644 --- 
a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -89,6 +89,8 @@ int bch2_inode_create(struct btree_trans *, struct btree_iter *, int bch2_inode_rm(struct bch_fs *, subvol_inum, bool); +int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *); int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, struct bch_inode_unpacked *); -- cgit v1.2.3 From 9ca4853b98af5fa15a2ddc47a45f8e103027f95d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 27 Oct 2021 13:05:56 -0400 Subject: bcachefs: Fix quota support for snapshots Quota support was disabled when snapshots were released, because of some tricky interactions with snapshots. We're sidestepping that for now - we're simply disabling quota accounting on snapshot subvolumes. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 28 ++++++++++++++------ fs/bcachefs/fs.h | 6 +++++ fs/bcachefs/opts.h | 12 ++++----- fs/bcachefs/quota.c | 69 +++++++++++++++++++++++++++++++++++++------------ fs/bcachefs/subvolume.c | 9 +++++++ fs/bcachefs/subvolume.h | 2 ++ 6 files changed, 96 insertions(+), 30 deletions(-) (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 61027d349cd8..31adc0e0d452 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -39,7 +39,8 @@ static struct kmem_cache *bch2_inode_cache; static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, struct bch_inode_info *, - struct bch_inode_unpacked *); + struct bch_inode_unpacked *, + struct bch_subvolume *); static void __pagecache_lock_put(struct pagecache_lock *lock, long i) { @@ -225,6 +226,7 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) struct bch_inode_unpacked inode_u; struct bch_inode_info *inode; struct btree_trans trans; + struct bch_subvolume subvol; int ret; inode = to_bch_ei(iget5_locked(c->vfs_sb, @@ -239,10 +241,11 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) bch2_trans_init(&trans, c, 8, 0); ret = 
lockrestart_do(&trans, + bch2_subvolume_get(&trans, inum.subvol, true, 0, &subvol) ?: bch2_inode_find_by_inum_trans(&trans, inum, &inode_u)); if (!ret) - bch2_vfs_inode_init(&trans, inum, inode, &inode_u); + bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol); bch2_trans_exit(&trans); if (ret) { @@ -268,6 +271,7 @@ __bch2_create(struct mnt_idmap *idmap, struct bch_inode_unpacked inode_u; struct posix_acl *default_acl = NULL, *acl = NULL; subvol_inum inum; + struct bch_subvolume subvol; u64 journal_seq = 0; int ret; @@ -310,7 +314,12 @@ retry: if (unlikely(ret)) goto err_before_quota; - ret = bch2_trans_commit(&trans, NULL, &journal_seq, 0); + inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; + inum.inum = inode_u.bi_inum; + + ret = bch2_subvolume_get(&trans, inum.subvol, true, + BTREE_ITER_WITH_UPDATES, &subvol) ?: + bch2_trans_commit(&trans, NULL, &journal_seq, 0); if (unlikely(ret)) { bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, KEY_TYPE_QUOTA_WARN); @@ -326,11 +335,8 @@ err_before_quota: mutex_unlock(&dir->ei_update_lock); } - inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; - inum.inum = inode_u.bi_inum; - bch2_iget5_set(&inode->v, &inum); - bch2_vfs_inode_init(&trans, inum, inode, &inode_u); + bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol); set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); @@ -1352,10 +1358,16 @@ static const struct export_operations bch_export_ops = { static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, struct bch_inode_info *inode, - struct bch_inode_unpacked *bi) + struct bch_inode_unpacked *bi, + struct bch_subvolume *subvol) { bch2_inode_update_after_write(trans, inode, bi, ~0); + if (BCH_SUBVOLUME_SNAP(subvol)) + set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); + else + clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); + inode->v.i_blocks = bi->bi_sectors; inode->v.i_ino = bi->bi_inum; inode->v.i_rdev = bi->bi_dev; diff --git 
a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 530238780a88..a67ab1ad2a31 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -63,6 +63,12 @@ static inline subvol_inum inode_inum(struct bch_inode_info *inode) */ #define EI_INODE_ERROR 0 +/* + * Set if the inode is in a snapshot subvolume - we don't do quota accounting in + * those: + */ +#define EI_INODE_SNAPSHOT 1 + #define to_bch_ei(_inode) \ container_of_or_null(_inode, struct bch_inode_info, v) diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 10c022ec6ee0..896b8c9c1180 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -223,19 +223,19 @@ enum opt_type { BCH_SB_POSIX_ACL, true, \ NULL, "Enable POSIX acls") \ x(usrquota, u8, \ - 0, \ + OPT_FORMAT|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH_SB_USRQUOTA, false, \ NULL, "Enable user quotas") \ x(grpquota, u8, \ - 0, \ + OPT_FORMAT|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH_SB_GRPQUOTA, false, \ NULL, "Enable group quotas") \ x(prjquota, u8, \ - 0, \ + OPT_FORMAT|OPT_MOUNT, \ OPT_BOOL(), \ - NO_SB_OPT, false, \ + BCH_SB_PRJQUOTA, false, \ NULL, "Enable project quotas") \ x(degraded, u8, \ OPT_MOUNT, \ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 5f1216da76d0..8f8f4b0accd6 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -3,6 +3,7 @@ #include "btree_update.h" #include "inode.h" #include "quota.h" +#include "subvolume.h" #include "super-io.h" static const char *bch2_sb_validate_quota(struct bch_sb *sb, @@ -415,14 +416,55 @@ static void bch2_sb_quota_read(struct bch_fs *c) } } +static int bch2_fs_quota_read_inode(struct btree_trans *trans, + struct btree_iter *iter) +{ + struct bch_fs *c = trans->c; + struct bch_inode_unpacked u; + struct bch_subvolume subvolume; + struct bkey_s_c k; + int ret; + + k = bch2_btree_iter_peek(iter); + ret = bkey_err(k); + if (ret) + return ret; + + if (!k.k) + return 1; + + ret = bch2_snapshot_get_subvol(trans, k.k->p.snapshot, &subvolume); + if (ret) + return ret; + + 
/* + * We don't do quota accounting in snapshots: + */ + if (BCH_SUBVOLUME_SNAP(&subvolume)) + goto advance; + + if (!bkey_is_inode(k.k)) + goto advance; + + ret = bch2_inode_unpack(k, &u); + if (ret) + return ret; + + bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, + KEY_TYPE_QUOTA_NOCHECK); + bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, + KEY_TYPE_QUOTA_NOCHECK); +advance: + bch2_btree_iter_set_pos(iter, POS(iter->pos.inode, iter->pos.offset + 1)); + return 0; +} + int bch2_fs_quota_read(struct bch_fs *c) { unsigned i, qtypes = enabled_qtypes(c); struct bch_memquota_type *q; struct btree_trans trans; struct btree_iter iter; - struct bch_inode_unpacked u; - struct bkey_s_c k; int ret; mutex_lock(&c->sb_lock); @@ -437,23 +479,18 @@ int bch2_fs_quota_read(struct bch_fs *c) bch2_trans_init(&trans, c, 0, 0); - for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, - BTREE_ITER_PREFETCH, k, ret) { - if (bkey_is_inode(k.k)) { - ret = bch2_inode_unpack(k, &u); - if (ret) - return ret; - - bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, - KEY_TYPE_QUOTA_NOCHECK); - bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, - KEY_TYPE_QUOTA_NOCHECK); - } - } + bch2_trans_iter_init(&trans, &iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); + do { + ret = lockrestart_do(&trans, + bch2_fs_quota_read_inode(&trans, &iter)); + } while (!ret); bch2_trans_iter_exit(&trans, &iter); bch2_trans_exit(&trans); - return ret; + return ret < 0 ? 
ret : 0; } /* Enable/disable/delete quotas for an entire filesystem: */ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 0ef625d21672..7e909a118189 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -789,6 +789,15 @@ int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol, return ret; } +int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, + struct bch_subvolume *subvol) +{ + struct bch_snapshot snap; + + return snapshot_lookup(trans, snapshot, &snap) ?: + bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol); +} + int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol, u32 *snapid) { diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index dde755b45392..e4c3fdcdf22f 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -118,6 +118,8 @@ void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c) int bch2_subvolume_get(struct btree_trans *, unsigned, bool, int, struct bch_subvolume *); +int bch2_snapshot_get_subvol(struct btree_trans *, u32, + struct bch_subvolume *); int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); int bch2_subvolume_delete(struct btree_trans *, u32); -- cgit v1.2.3 From 91d961badfd123b6759488bc4aa7a4d014b739f1 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 29 Mar 2022 15:48:45 -0400 Subject: bcachefs: darrays Inspired by CCAN darray - simple, stupid resizable (dynamic) arrays. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 2 +- fs/bcachefs/darray.h | 77 +++++++++++++++++++++ fs/bcachefs/fs.c | 2 +- fs/bcachefs/fs.h | 4 +- fs/bcachefs/fsck.c | 153 ++++++++++++++++-------------------------- fs/bcachefs/move.c | 8 +-- fs/bcachefs/subvolume.c | 41 ++++------- fs/bcachefs/subvolume.h | 38 ++++------- fs/bcachefs/subvolume_types.h | 8 +-- 9 files changed, 170 insertions(+), 163 deletions(-) create mode 100644 fs/bcachefs/darray.h (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 6cda77ad4342..01e9ed5dfc61 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -665,7 +665,7 @@ struct bch_fs { struct mutex snapshot_table_lock; struct work_struct snapshot_delete_work; struct work_struct snapshot_wait_for_pagecache_and_delete_work; - struct snapshot_id_list snapshots_unlinked; + snapshot_id_list snapshots_unlinked; struct mutex snapshots_unlinked_lock; /* BTREE CACHE */ diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h new file mode 100644 index 000000000000..519ab9b96e67 --- /dev/null +++ b/fs/bcachefs/darray.h @@ -0,0 +1,77 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DARRAY_H +#define _BCACHEFS_DARRAY_H + +/* + * Dynamic arrays: + * + * Inspired by CCAN's darray + */ + +#include "util.h" +#include <linux/slab.h> + +#define DARRAY(type) \ +struct { \ + size_t nr, size; \ + type *data; \ +} + +typedef DARRAY(void) darray_void; + +static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more) +{ + if (d->nr + more > d->size) { + size_t new_size = roundup_pow_of_two(d->nr + more); + void *data = krealloc_array(d->data, new_size, t_size, GFP_KERNEL); + + if (!data) + return -ENOMEM; + + d->data = data; + d->size = new_size; + } + + return 0; +} + +#define darray_make_room(_d, _more) \ + __darray_make_room((darray_void *) (_d), sizeof((_d)->data[0]), (_more)) + +#define darray_top(_d) ((_d).data[(_d).nr]) + +#define darray_push(_d, _item) \ +({ \ + 
int _ret = darray_make_room((_d), 1); \ + \ + if (!_ret) \ + (_d)->data[(_d)->nr++] = (_item); \ + _ret; \ +}) + +#define darray_insert_item(_d, _pos, _item) \ +({ \ + size_t pos = (_pos); \ + int _ret = darray_make_room((_d), 1); \ + \ + if (!_ret) \ + array_insert_item((_d)->data, (_d)->nr, pos, (_item)); \ + _ret; \ +}) + +#define darray_for_each(_d, _i) \ + for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++) + +#define darray_init(_d) \ +do { \ + (_d)->data = NULL; \ + (_d)->nr = (_d)->size = 0; \ +} while (0) + +#define darray_exit(_d) \ +do { \ + kfree((_d)->data); \ + darray_init(_d); \ +} while (0) + +#endif /* _BCACHEFS_DARRAY_H */ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index afaee020e7e3..d8cd32b5d765 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1478,7 +1478,7 @@ static void bch2_evict_inode(struct inode *vinode) } void bch2_evict_subvolume_inodes(struct bch_fs *c, - struct snapshot_id_list *s) + snapshot_id_list *s) { struct super_block *sb = c->vfs_sb; struct inode *inode; diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index a67ab1ad2a31..73b96d0b5d83 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -190,7 +190,7 @@ int bch2_setattr_nonsize(struct mnt_idmap *, struct iattr *); int __bch2_unlink(struct inode *, struct dentry *, bool); -void bch2_evict_subvolume_inodes(struct bch_fs *, struct snapshot_id_list *); +void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *); void bch2_vfs_exit(void); int bch2_vfs_init(void); @@ -198,7 +198,7 @@ int bch2_vfs_init(void); #else static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, - struct snapshot_id_list *s) {} + snapshot_id_list *s) {} static inline void bch2_vfs_exit(void) {} static inline int bch2_vfs_init(void) { return 0; } diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 8783b950055e..10754b13ec15 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "bkey_buf.h" #include "btree_update.h" 
+#include "darray.h" #include "dirent.h" #include "error.h" #include "fs-common.h" @@ -471,11 +472,11 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, str pos.snapshot = snapshot_t(c, pos.snapshot)->equiv; if (bkey_cmp(s->pos, pos)) - s->nr = 0; + s->ids.nr = 0; s->pos = pos; /* Might get called multiple times due to lock restarts */ - if (s->nr && s->d[s->nr - 1] == pos.snapshot) + if (s->ids.nr && s->ids.data[s->ids.nr - 1] == pos.snapshot) return 0; return snapshots_seen_add(c, s, pos.snapshot); @@ -498,7 +499,7 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see ancestor = snapshot_t(c, ancestor)->equiv; /* @ancestor should be the snapshot most recently added to @seen */ - BUG_ON(!seen->nr || seen->d[seen->nr - 1] != ancestor); + BUG_ON(!seen->ids.nr || seen->ids.data[seen->ids.nr - 1] != ancestor); BUG_ON(seen->pos.snapshot != ancestor); if (id == ancestor) @@ -507,11 +508,11 @@ static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *see if (!bch2_snapshot_is_ancestor(c, id, ancestor)) return false; - for (i = seen->nr - 2; - i >= 0 && seen->d[i] >= id; + for (i = seen->ids.nr - 2; + i >= 0 && seen->ids.data[i] >= id; --i) - if (bch2_snapshot_is_ancestor(c, id, seen->d[i]) && - bch2_snapshot_is_ancestor(c, seen->d[i], ancestor)) + if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i]) && + bch2_snapshot_is_ancestor(c, seen->ids.data[i], ancestor)) return false; return true; @@ -537,26 +538,25 @@ static int ref_visible(struct bch_fs *c, struct snapshots_seen *s, } #define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ - for (_i = (_w)->d; _i < (_w)->d + (_w)->nr && (_i)->snapshot <= (_snapshot); _i++)\ + for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && (_i)->snapshot <= (_snapshot); _i++)\ if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) +struct inode_walker_entry { + struct bch_inode_unpacked inode; + u32 snapshot; + u64 count; +}; + 
struct inode_walker { bool first_this_inode; u64 cur_inum; - size_t nr; - size_t size; - struct inode_walker_entry { - struct bch_inode_unpacked inode; - u32 snapshot; - u64 count; - } *d; + DARRAY(struct inode_walker_entry) inodes; }; static void inode_walker_exit(struct inode_walker *w) { - kfree(w->d); - w->d = NULL; + darray_exit(&w->inodes); } static struct inode_walker inode_walker_init(void) @@ -564,43 +564,17 @@ static struct inode_walker inode_walker_init(void) return (struct inode_walker) { 0, }; } -static int inode_walker_realloc(struct bch_fs *c, struct inode_walker *w) -{ - if (w->nr == w->size) { - size_t new_size = max_t(size_t, 8UL, w->size * 2); - void *d = krealloc(w->d, new_size * sizeof(w->d[0]), - GFP_KERNEL); - if (!d) { - bch_err(c, "fsck: error allocating memory for inode_walker, size %zu", - new_size); - return -ENOMEM; - } - - w->d = d; - w->size = new_size; - } - - return 0; -} - static int add_inode(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c inode) { struct bch_inode_unpacked u; - int ret; - - ret = inode_walker_realloc(c, w); - if (ret) - return ret; BUG_ON(bch2_inode_unpack(inode, &u)); - w->d[w->nr++] = (struct inode_walker_entry) { + return darray_push(&w->inodes, ((struct inode_walker_entry) { .inode = u, .snapshot = snapshot_t(c, inode.k->p.snapshot)->equiv, - }; - - return 0; + })); } static int __walk_inode(struct btree_trans *trans, @@ -619,7 +593,7 @@ static int __walk_inode(struct btree_trans *trans, goto lookup_snapshot; } - w->nr = 0; + w->inodes.nr = 0; for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, pos.inode), BTREE_ITER_ALL_SNAPSHOTS, k, ret) { @@ -637,26 +611,25 @@ static int __walk_inode(struct btree_trans *trans, w->cur_inum = pos.inode; w->first_this_inode = true; lookup_snapshot: - for (i = 0; i < w->nr; i++) - if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->d[i].snapshot)) + for (i = 0; i < w->inodes.nr; i++) + if (bch2_snapshot_is_ancestor(c, pos.snapshot, w->inodes.data[i].snapshot)) goto 
found; return INT_MAX; found: - BUG_ON(pos.snapshot > w->d[i].snapshot); + BUG_ON(pos.snapshot > w->inodes.data[i].snapshot); - if (pos.snapshot != w->d[i].snapshot) { + if (pos.snapshot != w->inodes.data[i].snapshot) { ancestor_pos = i; - while (i && w->d[i - 1].snapshot > pos.snapshot) + while (i && w->inodes.data[i - 1].snapshot > pos.snapshot) --i; - ret = inode_walker_realloc(c, w); + ret = darray_insert_item(&w->inodes, i, w->inodes.data[ancestor_pos]); if (ret) return ret; - array_insert_item(w->d, w->nr, i, w->d[ancestor_pos]); - w->d[i].snapshot = pos.snapshot; - w->d[i].count = 0; + w->inodes.data[i].snapshot = pos.snapshot; + w->inodes.data[i].count = 0; } return i; @@ -672,7 +645,7 @@ static int __get_visible_inodes(struct btree_trans *trans, struct bkey_s_c k; int ret; - w->nr = 0; + w->inodes.nr = 0; for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), BTREE_ITER_ALL_SNAPSHOTS, k, ret) { @@ -1133,7 +1106,7 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) int ret = 0, ret2 = 0; s64 count2; - for (i = w->d; i < w->d + w->nr; i++) { + darray_for_each(w->inodes, i) { if (i->inode.bi_sectors == i->count) continue; @@ -1232,7 +1205,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, goto out; } - i = inode->d + ret; + i = inode->inodes.data + ret; ret = 0; if (fsck_err_on(!S_ISREG(i->inode.bi_mode) && @@ -1333,7 +1306,7 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) int ret = 0, ret2 = 0; s64 count2; - for (i = w->d; i < w->d + w->nr; i++) { + darray_for_each(w->inodes, i) { if (i->inode.bi_nlink == i->count) continue; @@ -1537,7 +1510,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto out; } - i = dir->d + ret; + i = dir->inodes.data + ret; ret = 0; if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c, @@ -1550,7 +1523,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, } if 
(dir->first_this_inode) - *hash_info = bch2_hash_info_init(c, &dir->d[0].inode); + *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k); @@ -1618,7 +1591,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; - if (fsck_err_on(!target->nr, c, + if (fsck_err_on(!target->inodes.nr, c, "dirent points to missing inode:\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), @@ -1628,7 +1601,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, goto err; } - for (i = target->d; i < target->d + target->nr; i++) { + darray_for_each(target->inodes, i) { ret = check_dirent_target(trans, iter, d, &i->inode, i->snapshot); if (ret) @@ -1726,7 +1699,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, ret = 0; if (inode->first_this_inode) - *hash_info = bch2_hash_info_init(c, &inode->d[0].inode); + *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode); ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); fsck_err: @@ -1836,21 +1809,18 @@ static int check_root(struct bch_fs *c) check_root_trans(&trans)); } -struct pathbuf { - size_t nr; - size_t size; - - struct pathbuf_entry { - u64 inum; - u32 snapshot; - } *entries; +struct pathbuf_entry { + u64 inum; + u32 snapshot; }; -static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot) +typedef DARRAY(struct pathbuf_entry) pathbuf; + +static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) { struct pathbuf_entry *i; - for (i = p->entries; i < p->entries + p->nr; i++) + darray_for_each(*p, i) if (i->inum == inum && i->snapshot == snapshot) return true; @@ -1858,29 +1828,18 @@ static bool path_is_dup(struct pathbuf *p, u64 inum, u32 snapshot) return false; } -static int path_down(struct bch_fs *c, struct pathbuf *p, +static int path_down(struct bch_fs *c, pathbuf *p, u64 inum, u32 snapshot) { - if (p->nr == 
p->size) { - size_t new_size = max_t(size_t, 256UL, p->size * 2); - void *n = krealloc(p->entries, - new_size * sizeof(p->entries[0]), - GFP_KERNEL); - if (!n) { - bch_err(c, "fsck: error allocating memory for pathbuf, size %zu", - new_size); - return -ENOMEM; - } - - p->entries = n; - p->size = new_size; - }; - - p->entries[p->nr++] = (struct pathbuf_entry) { + int ret = darray_push(p, ((struct pathbuf_entry) { .inum = inum, .snapshot = snapshot, - }; - return 0; + })); + + if (ret) + bch_err(c, "fsck: error allocating memory for pathbuf, size %zu", + p->size); + return ret; } /* @@ -1889,7 +1848,7 @@ static int path_down(struct bch_fs *c, struct pathbuf *p, * XXX: we should also be verifying that inodes are in the right subvolumes */ static int check_path(struct btree_trans *trans, - struct pathbuf *p, + pathbuf *p, struct bch_inode_unpacked *inode, u32 snapshot) { @@ -1963,7 +1922,7 @@ static int check_path(struct btree_trans *trans, /* XXX print path */ bch_err(c, "directory structure loop"); - for (i = p->entries; i < p->entries + p->nr; i++) + darray_for_each(*p, i) pr_err("%llu:%u", i->inum, i->snapshot); pr_err("%llu:%u", inode->bi_inum, snapshot); @@ -2000,7 +1959,7 @@ static int check_directory_structure(struct bch_fs *c) struct btree_iter iter; struct bkey_s_c k; struct bch_inode_unpacked u; - struct pathbuf path = { 0, 0, NULL }; + pathbuf path = { 0, }; int ret; bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); @@ -2030,7 +1989,7 @@ static int check_directory_structure(struct bch_fs *c) BUG_ON(ret == -EINTR); - kfree(path.entries); + darray_exit(&path); bch2_trans_exit(&trans); return ret; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 2eb192da8e1d..b916ee35ee37 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -91,10 +91,10 @@ next: if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, old_pos.snapshot)) { struct bkey_i *update; - size_t i; + u32 *i; - for (i = 0; i < s.nr; i++) - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, 
s.d[i])) + darray_for_each(s.ids, i) + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, *i)) goto next; update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); @@ -124,7 +124,7 @@ next: } } bch2_trans_iter_exit(trans, &iter); - kfree(s.d); + darray_exit(&s.ids); return ret; } diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index 69603327d93d..2c5f7e7793a7 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -544,36 +544,21 @@ err: return ret; } -static int snapshot_id_add(struct snapshot_id_list *s, u32 id) +static int snapshot_id_add(snapshot_id_list *s, u32 id) { BUG_ON(snapshot_list_has_id(s, id)); - if (s->nr == s->size) { - size_t new_size = max(8U, s->size * 2); - void *n = krealloc(s->d, - new_size * sizeof(s->d[0]), - GFP_KERNEL); - if (!n) { - pr_err("error allocating snapshot ID list"); - return -ENOMEM; - } - - s->d = n; - s->size = new_size; - }; - - s->d[s->nr++] = id; - return 0; + return darray_push(s, id); } static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans, - struct snapshot_id_list *deleted, + snapshot_id_list *deleted, enum btree_id btree_id) { struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; - struct snapshot_id_list equiv_seen = { 0 }; + snapshot_id_list equiv_seen = { 0 }; struct bpos last_pos = POS_MIN; int ret = 0; @@ -620,7 +605,7 @@ static int bch2_snapshot_delete_keys_btree(struct btree_trans *trans, } bch2_trans_iter_exit(trans, &iter); - kfree(equiv_seen.d); + darray_exit(&equiv_seen); return ret; } @@ -632,7 +617,7 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work) struct btree_iter iter; struct bkey_s_c k; struct bkey_s_c_snapshot snap; - struct snapshot_id_list deleted = { 0 }; + snapshot_id_list deleted = { 0 }; u32 i, id, children[2]; int ret = 0; @@ -712,15 +697,15 @@ static void bch2_delete_dead_snapshots_work(struct work_struct *work) for (i = 0; i < deleted.nr; i++) { ret = __bch2_trans_do(&trans, NULL, NULL, 0, - 
bch2_snapshot_node_delete(&trans, deleted.d[i])); + bch2_snapshot_node_delete(&trans, deleted.data[i])); if (ret) { bch_err(c, "error deleting snapshot %u: %i", - deleted.d[i], ret); + deleted.data[i], ret); goto err; } } err: - kfree(deleted.d); + darray_exit(&deleted); bch2_trans_exit(&trans); percpu_ref_put(&c->writes); } @@ -875,14 +860,14 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, snapshot_wait_for_pagecache_and_delete_work); - struct snapshot_id_list s; + snapshot_id_list s; u32 *id; int ret = 0; while (!ret) { mutex_lock(&c->snapshots_unlinked_lock); s = c->snapshots_unlinked; - memset(&c->snapshots_unlinked, 0, sizeof(c->snapshots_unlinked)); + darray_init(&c->snapshots_unlinked); mutex_unlock(&c->snapshots_unlinked_lock); if (!s.nr) @@ -890,7 +875,7 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) bch2_evict_subvolume_inodes(c, &s); - for (id = s.d; id < s.d + s.nr; id++) { + for (id = s.data; id < s.data + s.nr; id++) { ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL, bch2_subvolume_delete(&trans, *id)); if (ret) { @@ -899,7 +884,7 @@ void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) } } - kfree(s.d); + darray_exit(&s); } percpu_ref_put(&c->writes); diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index 4abe53df2788..b3d5ae49101d 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_SUBVOLUME_H #define _BCACHEFS_SUBVOLUME_H +#include "darray.h" #include "subvolume_types.h" void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); @@ -58,15 +59,13 @@ static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ances struct snapshots_seen { struct bpos pos; - size_t nr; - size_t size; - u32 *d; + DARRAY(u32) ids; }; static inline void snapshots_seen_exit(struct snapshots_seen *s) { - kfree(s->d); - s->d = 
NULL; + kfree(s->ids.data); + s->ids.data = NULL; } static inline void snapshots_seen_init(struct snapshots_seen *s) @@ -76,30 +75,19 @@ static inline void snapshots_seen_init(struct snapshots_seen *s) static inline int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id) { - if (s->nr == s->size) { - size_t new_size = max(s->size, (size_t) 128) * 2; - u32 *d = krealloc(s->d, new_size * sizeof(s->d[0]), GFP_KERNEL); - - if (!d) { - bch_err(c, "error reallocating snapshots_seen table (new size %zu)", - new_size); - return -ENOMEM; - } - - s->size = new_size; - s->d = d; - } - - s->d[s->nr++] = id; - return 0; + int ret = darray_push(&s->ids, id); + if (ret) + bch_err(c, "error reallocating snapshots_seen table (size %zu)", + s->ids.size); + return ret; } -static inline bool snapshot_list_has_id(struct snapshot_id_list *s, u32 id) +static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) { - unsigned i; + u32 *i; - for (i = 0; i < s->nr; i++) - if (id == s->d[i]) + darray_for_each(*s, i) + if (*i == id) return true; return false; } diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index 9410b9587591..f7562b5d51df 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -2,10 +2,8 @@ #ifndef _BCACHEFS_SUBVOLUME_TYPES_H #define _BCACHEFS_SUBVOLUME_TYPES_H -struct snapshot_id_list { - u32 nr; - u32 size; - u32 *d; -}; +#include "darray.h" + +typedef DARRAY(u32) snapshot_id_list; #endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ -- cgit v1.2.3 From a7ecd30c8300624448c4e66cd7a7e7209b96ea61 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 4 Nov 2022 13:25:57 -0400 Subject: bcachefs: Factor out two_state_shared_lock We have a unique lock used for controlling adding to the pagecache: the lock has two states, where both states are shared - the lock may be held multiple times for either state - but not both states at the same time. 
This is exactly what we need for nocow mode locking, so this patch pulls it out of fs.c into its own file. Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/fs-io.c | 50 +++++++++++++++++----------------- fs/bcachefs/fs.c | 54 +------------------------------------ fs/bcachefs/fs.h | 35 ++++++++---------------- fs/bcachefs/two_state_shared_lock.c | 33 +++++++++++++++++++++++ fs/bcachefs/two_state_shared_lock.h | 28 +++++++++++++++++++ 6 files changed, 99 insertions(+), 102 deletions(-) create mode 100644 fs/bcachefs/two_state_shared_lock.c create mode 100644 fs/bcachefs/two_state_shared_lock.h (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 444e79c62b50..966c9b9a74fc 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -65,6 +65,7 @@ bcachefs-y := \ sysfs.o \ tests.o \ trace.o \ + two_state_shared_lock.o \ util.o \ varint.o \ xattr.o diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 3c3fa95215ac..ab5b4e086e0a 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -751,25 +751,25 @@ vm_fault_t bch2_page_fault(struct vm_fault *vmf) if (fdm > mapping) { struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); - if (bch2_pagecache_add_tryget(&inode->ei_pagecache_lock)) + if (bch2_pagecache_add_tryget(inode)) goto got_lock; - bch2_pagecache_block_put(&fdm_host->ei_pagecache_lock); + bch2_pagecache_block_put(fdm_host); - bch2_pagecache_add_get(&inode->ei_pagecache_lock); - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_get(inode); + bch2_pagecache_add_put(inode); - bch2_pagecache_block_get(&fdm_host->ei_pagecache_lock); + bch2_pagecache_block_get(fdm_host); /* Signal that lock has been dropped: */ set_fdm_dropped_locks(); return VM_FAULT_SIGBUS; } - bch2_pagecache_add_get(&inode->ei_pagecache_lock); + bch2_pagecache_add_get(inode); got_lock: ret = filemap_fault(vmf); - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + 
bch2_pagecache_add_put(inode); return ret; } @@ -797,7 +797,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) * a write_invalidate_inode_pages_range() that works without dropping * page lock before invalidating page */ - bch2_pagecache_add_get(&inode->ei_pagecache_lock); + bch2_pagecache_add_get(inode); lock_page(page); isize = i_size_read(&inode->v); @@ -830,7 +830,7 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) wait_for_stable_page(page); ret = VM_FAULT_LOCKED; out: - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(inode); sb_end_pagefault(inode->v.i_sb); return ret; @@ -1098,7 +1098,7 @@ void bch2_readahead(struct readahead_control *ractl) bch2_trans_init(&trans, c, 0, 0); - bch2_pagecache_add_get(&inode->ei_pagecache_lock); + bch2_pagecache_add_get(inode); while ((page = readpage_iter_next(&readpages_iter))) { pgoff_t index = readpages_iter.offset + readpages_iter.idx; @@ -1121,7 +1121,7 @@ void bch2_readahead(struct readahead_control *ractl) &readpages_iter); } - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(inode); bch2_trans_exit(&trans); kfree(readpages_iter.pages); @@ -1483,7 +1483,7 @@ int bch2_write_begin(struct file *file, struct address_space *mapping, bch2_page_reservation_init(c, inode, res); *fsdata = res; - bch2_pagecache_add_get(&inode->ei_pagecache_lock); + bch2_pagecache_add_get(inode); page = grab_cache_page_write_begin(mapping, index); if (!page) @@ -1540,7 +1540,7 @@ err: put_page(page); *pagep = NULL; err_unlock: - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(inode); kfree(res); *fsdata = NULL; return bch2_err_class(ret); @@ -1584,7 +1584,7 @@ int bch2_write_end(struct file *file, struct address_space *mapping, unlock_page(page); put_page(page); - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(inode); bch2_page_reservation_put(c, inode, res); kfree(res); @@ -1753,7 +1753,7 @@ static ssize_t 
bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) ssize_t written = 0; int ret = 0; - bch2_pagecache_add_get(&inode->ei_pagecache_lock); + bch2_pagecache_add_get(inode); do { unsigned offset = pos & (PAGE_SIZE - 1); @@ -1811,7 +1811,7 @@ again: balance_dirty_pages_ratelimited(mapping); } while (iov_iter_count(iter)); - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(inode); return written ? written : ret; } @@ -1991,9 +1991,9 @@ ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (ret >= 0) iocb->ki_pos += ret; } else { - bch2_pagecache_add_get(&inode->ei_pagecache_lock); + bch2_pagecache_add_get(inode); ret = generic_file_read_iter(iocb, iter); - bch2_pagecache_add_put(&inode->ei_pagecache_lock); + bch2_pagecache_add_put(inode); } out: return bch2_err_class(ret); @@ -2149,7 +2149,7 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio) return -EIOCBQUEUED; } - bch2_pagecache_block_put(&inode->ei_pagecache_lock); + bch2_pagecache_block_put(inode); bch2_quota_reservation_put(c, inode, &dio->quota_res); if (dio->free_iov) @@ -2357,7 +2357,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) goto err; inode_dio_begin(&inode->v); - bch2_pagecache_block_get(&inode->ei_pagecache_lock); + bch2_pagecache_block_get(inode); extending = req->ki_pos + iter->count > inode->v.i_size; if (!extending) { @@ -2403,7 +2403,7 @@ err: inode_unlock(&inode->v); return ret; err_put_bio: - bch2_pagecache_block_put(&inode->ei_pagecache_lock); + bch2_pagecache_block_put(inode); bch2_quota_reservation_put(c, inode, &dio->quota_res); bio_put(bio); inode_dio_end(&inode->v); @@ -2704,7 +2704,7 @@ int bch2_truncate(struct mnt_idmap *idmap, } inode_dio_wait(&inode->v); - bch2_pagecache_block_get(&inode->ei_pagecache_lock); + bch2_pagecache_block_get(inode); ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u); if (ret) @@ -2783,7 +2783,7 @@ int bch2_truncate(struct mnt_idmap *idmap, ret = 
bch2_setattr_nonsize(idmap, inode, iattr); err: - bch2_pagecache_block_put(&inode->ei_pagecache_lock); + bch2_pagecache_block_put(inode); return bch2_err_class(ret); } @@ -3195,7 +3195,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode, inode_lock(&inode->v); inode_dio_wait(&inode->v); - bch2_pagecache_block_get(&inode->ei_pagecache_lock); + bch2_pagecache_block_get(inode); ret = file_modified(file); if (ret) @@ -3212,7 +3212,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode, else ret = -EOPNOTSUPP; err: - bch2_pagecache_block_put(&inode->ei_pagecache_lock); + bch2_pagecache_block_put(inode); inode_unlock(&inode->v); percpu_ref_put(&c->writes); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 485cb9cbcd51..90297cfc7934 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -43,58 +43,6 @@ static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, struct bch_subvolume *); -static void __pagecache_lock_put(struct pagecache_lock *lock, long i) -{ - BUG_ON(atomic_long_read(&lock->v) == 0); - - if (atomic_long_sub_return_release(i, &lock->v) == 0) - wake_up_all(&lock->wait); -} - -static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i) -{ - long v = atomic_long_read(&lock->v), old; - - do { - old = v; - - if (i > 0 ? 
v < 0 : v > 0) - return false; - } while ((v = atomic_long_cmpxchg_acquire(&lock->v, - old, old + i)) != old); - return true; -} - -static void __pagecache_lock_get(struct pagecache_lock *lock, long i) -{ - wait_event(lock->wait, __pagecache_lock_tryget(lock, i)); -} - -void bch2_pagecache_add_put(struct pagecache_lock *lock) -{ - __pagecache_lock_put(lock, 1); -} - -bool bch2_pagecache_add_tryget(struct pagecache_lock *lock) -{ - return __pagecache_lock_tryget(lock, 1); -} - -void bch2_pagecache_add_get(struct pagecache_lock *lock) -{ - __pagecache_lock_get(lock, 1); -} - -void bch2_pagecache_block_put(struct pagecache_lock *lock) -{ - __pagecache_lock_put(lock, -1); -} - -void bch2_pagecache_block_get(struct pagecache_lock *lock) -{ - __pagecache_lock_get(lock, -1); -} - void bch2_inode_update_after_write(struct btree_trans *trans, struct bch_inode_info *inode, struct bch_inode_unpacked *bi, @@ -1410,7 +1358,7 @@ static struct inode *bch2_alloc_inode(struct super_block *sb) inode_init_once(&inode->v); mutex_init(&inode->ei_update_lock); - pagecache_lock_init(&inode->ei_pagecache_lock); + two_state_lock_init(&inode->ei_pagecache_lock); mutex_init(&inode->ei_quota_lock); return &inode->v; diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 73b96d0b5d83..4164d0669d70 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -6,31 +6,11 @@ #include "opts.h" #include "str_hash.h" #include "quota_types.h" +#include "two_state_shared_lock.h" #include #include -/* - * Two-state lock - can be taken for add or block - both states are shared, - * like read side of rwsem, but conflict with other state: - */ -struct pagecache_lock { - atomic_long_t v; - wait_queue_head_t wait; -}; - -static inline void pagecache_lock_init(struct pagecache_lock *lock) -{ - atomic_long_set(&lock->v, 0); - init_waitqueue_head(&lock->wait); -} - -void bch2_pagecache_add_put(struct pagecache_lock *); -bool bch2_pagecache_add_tryget(struct pagecache_lock *); -void bch2_pagecache_add_get(struct 
pagecache_lock *); -void bch2_pagecache_block_put(struct pagecache_lock *); -void bch2_pagecache_block_get(struct pagecache_lock *); - struct bch_inode_info { struct inode v; unsigned long ei_flags; @@ -38,7 +18,7 @@ struct bch_inode_info { struct mutex ei_update_lock; u64 ei_quota_reserved; unsigned long ei_last_dirtied; - struct pagecache_lock ei_pagecache_lock; + two_state_lock_t ei_pagecache_lock; struct mutex ei_quota_lock; struct bch_qid ei_qid; @@ -49,6 +29,13 @@ struct bch_inode_info { struct bch_inode_unpacked ei_inode; }; +#define bch2_pagecache_add_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 0) +#define bch2_pagecache_add_tryget(i) bch2_two_state_trylock(&i->ei_pagecache_lock, 0) +#define bch2_pagecache_add_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 0) + +#define bch2_pagecache_block_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 1) +#define bch2_pagecache_block_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 1) + static inline subvol_inum inode_inum(struct bch_inode_info *inode) { return (subvol_inum) { @@ -95,7 +82,7 @@ do { \ if ((_locks) & INODE_LOCK) \ down_write_nested(&a[i]->v.i_rwsem, i); \ if ((_locks) & INODE_PAGECACHE_BLOCK) \ - bch2_pagecache_block_get(&a[i]->ei_pagecache_lock);\ + bch2_pagecache_block_get(a[i]);\ if ((_locks) & INODE_UPDATE_LOCK) \ mutex_lock_nested(&a[i]->ei_update_lock, i);\ } \ @@ -113,7 +100,7 @@ do { \ if ((_locks) & INODE_LOCK) \ up_write(&a[i]->v.i_rwsem); \ if ((_locks) & INODE_PAGECACHE_BLOCK) \ - bch2_pagecache_block_put(&a[i]->ei_pagecache_lock);\ + bch2_pagecache_block_put(a[i]);\ if ((_locks) & INODE_UPDATE_LOCK) \ mutex_unlock(&a[i]->ei_update_lock); \ } \ diff --git a/fs/bcachefs/two_state_shared_lock.c b/fs/bcachefs/two_state_shared_lock.c new file mode 100644 index 000000000000..dc508d545de0 --- /dev/null +++ b/fs/bcachefs/two_state_shared_lock.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "two_state_shared_lock.h" + +void bch2_two_state_unlock(two_state_lock_t 
*lock, int s) +{ + long i = s ? 1 : -1; + + BUG_ON(atomic_long_read(&lock->v) == 0); + + if (atomic_long_sub_return_release(i, &lock->v) == 0) + wake_up_all(&lock->wait); +} + +bool bch2_two_state_trylock(two_state_lock_t *lock, int s) +{ + long i = s ? 1 : -1; + long v = atomic_long_read(&lock->v), old; + + do { + old = v; + + if (i > 0 ? v < 0 : v > 0) + return false; + } while ((v = atomic_long_cmpxchg_acquire(&lock->v, + old, old + i)) != old); + return true; +} + +void bch2_two_state_lock(two_state_lock_t *lock, int s) +{ + wait_event(lock->wait, bch2_two_state_trylock(lock, s)); +} diff --git a/fs/bcachefs/two_state_shared_lock.h b/fs/bcachefs/two_state_shared_lock.h new file mode 100644 index 000000000000..1b4f108908a1 --- /dev/null +++ b/fs/bcachefs/two_state_shared_lock.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_TWO_STATE_LOCK_H +#define _BCACHEFS_TWO_STATE_LOCK_H + +#include +#include +#include + +/* + * Two-state lock - can be taken for add or block - both states are shared, + * like read side of rwsem, but conflict with other state: + */ +typedef struct { + atomic_long_t v; + wait_queue_head_t wait; +} two_state_lock_t; + +static inline void two_state_lock_init(two_state_lock_t *lock) +{ + atomic_long_set(&lock->v, 0); + init_waitqueue_head(&lock->wait); +} + +void bch2_two_state_unlock(two_state_lock_t *, int); +bool bch2_two_state_trylock(two_state_lock_t *, int); +void bch2_two_state_lock(two_state_lock_t *, int); + +#endif /* _BCACHEFS_TWO_STATE_LOCK_H */ -- cgit v1.2.3 From a8b3a677e786fa869d220a6a78b5532a36dc2f4d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 2 Nov 2022 17:12:00 -0400 Subject: bcachefs: Nocow support This adds support for nocow mode, where we do writes in-place when possible. 
Patch components: - New boolean filesystem and inode option, nocow: note that when nocow is enabled, data checksumming and compression are implicitly disabled - To prevent in-place writes from racing with data moves (data_update.c) or bucket reuse (i.e. a bucket being reused and re-allocated while a nocow write is in flight), we have a new locking mechanism. Buckets can be locked for either data update or data move, using a fixed size hash table of two_state_shared locks. We don't have any chaining, meaning updates and moves to different buckets that hash to the same lock will wait unnecessarily - we'll want to watch for this becoming an issue. - The allocator path also needs to check for in-place writes in flight to a given bucket before giving it out: thus we add another counter to bucket_alloc_state so we can track this. - Fsync now may need to issue cache flushes to block devices instead of flushing the journal. We add a device bitmask to bch_inode_info, ei_devs_need_flush, which tracks devices that need to have flushes issued - note that this will lead to unnecessary flushes when other codepaths have already issued flushes; we may want to replace this with a sequence number. - New nocow write path: look up extents, and if they're writable write to them - otherwise fall back to the normal COW write path.
XXX: switch to sequence numbers instead of bitmask for devs needing journal flush XXX: ei_quota_lock being a mutex means bch2_nocow_write_done() needs to run in process context - see if we can improve this Signed-off-by: Kent Overstreet --- fs/bcachefs/Makefile | 1 + fs/bcachefs/alloc_foreground.c | 5 + fs/bcachefs/alloc_types.h | 1 + fs/bcachefs/bcachefs.h | 10 +- fs/bcachefs/bcachefs_format.h | 10 +- fs/bcachefs/btree_io.c | 3 +- fs/bcachefs/checksum.h | 7 +- fs/bcachefs/data_update.c | 10 + fs/bcachefs/extents.c | 39 ++-- fs/bcachefs/extents.h | 1 + fs/bcachefs/fs-io.c | 98 ++++++++- fs/bcachefs/fs.h | 11 + fs/bcachefs/inode.c | 3 + fs/bcachefs/io.c | 452 +++++++++++++++++++++++++++++++++++++++-- fs/bcachefs/io.h | 7 +- fs/bcachefs/io_types.h | 7 + fs/bcachefs/move.c | 7 + fs/bcachefs/nocow_locking.c | 15 ++ fs/bcachefs/nocow_locking.h | 55 +++++ fs/bcachefs/opts.h | 7 + fs/bcachefs/super.h | 7 +- fs/bcachefs/trace.h | 5 +- 22 files changed, 709 insertions(+), 52 deletions(-) create mode 100644 fs/bcachefs/nocow_locking.c create mode 100644 fs/bcachefs/nocow_locking.h (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 456d540441ce..55b6d85d55c3 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -52,6 +52,7 @@ bcachefs-y := \ migrate.o \ move.o \ movinggc.o \ + nocow_locking.o \ opts.o \ printbuf.o \ quota.o \ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index a179bbe23c93..f78eaa52c11f 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -227,6 +227,11 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * return NULL; } + if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) { + s->skipped_nocow++; + return NULL; + } + spin_lock(&c->freelist_lock); if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 
2c96794d1993..2e6f48069258 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -12,6 +12,7 @@ struct bucket_alloc_state { u64 buckets_seen; u64 skipped_open; u64 skipped_need_journal_commit; + u64 skipped_nocow; u64 skipped_nouse; }; diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 6d048e5d8843..74632105fb45 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -206,6 +206,7 @@ #include "bcachefs_format.h" #include "errcode.h" #include "fifo.h" +#include "nocow_locking.h" #include "opts.h" #include "util.h" @@ -383,7 +384,8 @@ BCH_DEBUG_PARAMS_DEBUG() x(journal_flush_seq) \ x(blocked_journal) \ x(blocked_allocate) \ - x(blocked_allocate_open_bucket) + x(blocked_allocate_open_bucket) \ + x(nocow_lock_contended) enum bch_time_stats { #define x(name) BCH_TIME_##name, @@ -483,6 +485,7 @@ struct bch_dev { struct bch_sb *sb_read_scratch; int sb_write_error; dev_t dev; + atomic_t flush_seq; struct bch_devs_mask self; @@ -897,7 +900,9 @@ struct bch_fs { struct bio_set bio_read_split; struct bio_set bio_write; struct mutex bio_bounce_pages_lock; -mempool_t bio_bounce_pages; + mempool_t bio_bounce_pages; + struct bucket_nocow_lock_table + nocow_locks; struct rhashtable promote_table; mempool_t compression_bounce[2]; @@ -959,6 +964,7 @@ mempool_t bio_bounce_pages; struct bio_set writepage_bioset; struct bio_set dio_write_bioset; struct bio_set dio_read_bioset; + struct bio_set nocow_flush_bioset; /* ERRORS */ struct list_head fsck_errors; diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 57327c4dc9b4..024a714955f2 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -798,7 +798,8 @@ struct bch_inode_generation { x(bi_dir, 64) \ x(bi_dir_offset, 64) \ x(bi_subvol, 32) \ - x(bi_parent_subvol, 32) + x(bi_parent_subvol, 32) \ + x(bi_nocow, 8) /* subset of BCH_INODE_FIELDS */ #define BCH_INODE_OPTS() \ @@ -810,7 +811,8 @@ struct bch_inode_generation { x(promote_target, 
16) \ x(foreground_target, 16) \ x(background_target, 16) \ - x(erasure_code, 16) + x(erasure_code, 16) \ + x(nocow, 8) enum inode_opt_id { #define x(name, ...) \ @@ -1548,7 +1550,8 @@ struct bch_sb_field_journal_seq_blacklist { x(alloc_v4, 20) \ x(new_data_types, 21) \ x(backpointers, 22) \ - x(inode_v3, 23) + x(inode_v3, 23) \ + x(unwritten_extents, 24) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -1696,6 +1699,7 @@ LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); +LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34); LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE, struct bch_sb, flags[4], 34, 54); /* diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index dfa45cf4021f..87d80a59dd7e 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1832,7 +1832,8 @@ static void btree_write_submit(struct work_struct *work) bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr) ptr->offset += wbio->sector_offset; - bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, &tmp.k); + bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, + &tmp.k, false); } void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index f7ccef7a5520..409ad534d9f4 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -99,14 +99,17 @@ static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, } static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, - unsigned opt) + struct bch_io_opts opts) { + if (opts.nocow) + return 0; + if (c->sb.encryption_type) return c->opts.wide_macs ? 
BCH_CSUM_chacha20_poly1305_128 : BCH_CSUM_chacha20_poly1305_80; - return bch2_csum_opt_to_type(opt, true); + return bch2_csum_opt_to_type(opts.data_checksum, true); } static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 82d7e13e61a5..c3f12b3adb14 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -303,6 +303,13 @@ void bch2_data_update_read_done(struct data_update *m, void bch2_data_update_exit(struct data_update *update) { struct bch_fs *c = update->op.c; + struct bkey_ptrs_c ptrs = + bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k)); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) + bch2_bucket_nocow_unlock(&c->nocow_locks, + PTR_BUCKET_POS(c, ptr), 0); bch2_bkey_buf_exit(&update->k, c); bch2_disk_reservation_put(c, &update->op.res); @@ -451,6 +458,9 @@ int bch2_data_update_init(struct bch_fs *c, struct data_update *m, m->op.incompressible = true; i++; + + bch2_bucket_nocow_lock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0); } if (reserve_sectors) { diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 627edba24900..55a8879dc4fe 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -664,22 +664,21 @@ unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) return replicas; } -static unsigned bch2_extent_ptr_durability(struct bch_fs *c, - struct extent_ptr_decoded p) +unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p) { unsigned durability = 0; struct bch_dev *ca; - if (p.ptr.cached) + if (p->ptr.cached) return 0; - ca = bch_dev_bkey_exists(c, p.ptr.dev); + ca = bch_dev_bkey_exists(c, p->ptr.dev); if (ca->mi.state != BCH_MEMBER_STATE_failed) durability = max_t(unsigned, durability, ca->mi.durability); - if (p.has_ec) - durability += p.ec.redundancy; + if (p->has_ec) + durability += p->ec.redundancy; return durability; } @@ -692,7 +691,7 @@ unsigned 
bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) unsigned durability = 0; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - durability += bch2_extent_ptr_durability(c, p); + durability += bch2_extent_ptr_durability(c,& p); return durability; } @@ -907,23 +906,31 @@ bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, */ bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) { - struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1); - struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); - const union bch_extent_entry *entry1, *entry2; - struct extent_ptr_decoded p1, p2; - - if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2)) + if (k1.k->type != k2.k->type) return false; - bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1) - bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) + if (bkey_extent_is_direct_data(k1.k)) { + struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1); + struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); + const union bch_extent_entry *entry1, *entry2; + struct extent_ptr_decoded p1, p2; + + if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2)) + return false; + + bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1) + bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) if (p1.ptr.dev == p2.ptr.dev && p1.ptr.gen == p2.ptr.gen && (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) return true; - return false; + return false; + } else { + /* KEY_TYPE_deleted, etc. 
*/ + return true; + } } bool bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 659ab76ea62c..e27d39b728b3 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -596,6 +596,7 @@ bool bch2_bkey_is_incompressible(struct bkey_s_c); unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c); +unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *); unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); void bch2_bkey_drop_device(struct bkey_s, unsigned); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index b5cf0a3218ea..ec575b27eedb 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -35,6 +35,72 @@ #include +struct nocow_flush { + struct closure *cl; + struct bch_dev *ca; + struct bio bio; +}; + +static void nocow_flush_endio(struct bio *_bio) +{ + + struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); + + closure_put(bio->cl); + percpu_ref_put(&bio->ca->io_ref); + bio_put(&bio->bio); +} + +static void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, + struct bch_inode_info *inode, + struct closure *cl) +{ + struct nocow_flush *bio; + struct bch_dev *ca; + struct bch_devs_mask devs; + unsigned dev; + + dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX); + if (dev == BCH_SB_MEMBERS_MAX) + return; + + devs = inode->ei_devs_need_flush; + memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); + + for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { + rcu_read_lock(); + ca = rcu_dereference(c->devs[dev]); + if (ca && !percpu_ref_tryget(&ca->io_ref)) + ca = NULL; + rcu_read_unlock(); + + if (!ca) + continue; + + bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0, + REQ_OP_FLUSH, + GFP_KERNEL, + &c->nocow_flush_bioset), + struct nocow_flush, bio); + bio->cl = cl; + bio->ca = ca; + bio->bio.bi_end_io = 
nocow_flush_endio; + closure_bio_submit(&bio->bio, cl); + } +} + +static int bch2_inode_flush_nocow_writes(struct bch_fs *c, + struct bch_inode_info *inode) +{ + struct closure cl; + + closure_init_stack(&cl); + bch2_inode_flush_nocow_writes_async(c, inode, &cl); + closure_sync(&cl); + + return 0; +} + static inline bool bio_full(struct bio *bio, unsigned len) { if (bio->bi_vcnt >= bio->bi_max_vecs) @@ -1327,6 +1393,7 @@ static void bch2_writepage_io_alloc(struct bch_fs *c, op->subvol = inode->ei_subvol; op->pos = POS(inode->v.i_ino, sector); op->end_io = bch2_writepage_io_done; + op->devs_need_flush = &inode->ei_devs_need_flush; op->wbio.bio.bi_iter.bi_sector = sector; op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); } @@ -2148,10 +2215,12 @@ static noinline void bch2_dio_write_flush(struct dio_write *dio) if (!dio->op.error) { ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode); - if (ret) + if (ret) { dio->op.error = ret; - else + } else { bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, &dio->op.cl); + bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl); + } } if (dio->sync) { @@ -2296,6 +2365,7 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio) dio->op.nr_replicas = dio->op.opts.data_replicas; dio->op.subvol = inode->ei_subvol; dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); + dio->op.devs_need_flush = &inode->ei_devs_need_flush; if (sync) dio->op.flags |= BCH_WRITE_SYNC; @@ -2495,19 +2565,21 @@ out: * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an * insert trigger: look up the btree inode instead */ -static int bch2_flush_inode(struct bch_fs *c, subvol_inum inum) +static int bch2_flush_inode(struct bch_fs *c, + struct bch_inode_info *inode) { - struct bch_inode_unpacked inode; + struct bch_inode_unpacked u; int ret; if (c->opts.journal_flush_disabled) return 0; - ret = bch2_inode_find_by_inum(c, inum, &inode); + ret = bch2_inode_find_by_inum(c, inode_inum(inode), 
&u); if (ret) return ret; - return bch2_journal_flush_seq(&c->journal, inode.bi_journal_seq); + return bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?: + bch2_inode_flush_nocow_writes(c, inode); } int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) @@ -2518,7 +2590,7 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = file_write_and_wait_range(file, start, end); ret2 = sync_inode_metadata(&inode->v, 1); - ret3 = bch2_flush_inode(c, inode_inum(inode)); + ret3 = bch2_flush_inode(c, inode); return bch2_err_class(ret ?: ret2 ?: ret3); } @@ -3105,6 +3177,11 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, continue; } + /* + * XXX: for nocow mode, we should promote shared extents to + * unshared here + */ + sectors = bpos_min(k.k->p, end_pos).offset - iter.pos.offset; if (!bkey_extent_is_allocation(k.k)) { @@ -3368,7 +3445,7 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || IS_SYNC(file_inode(file_dst))) - ret = bch2_flush_inode(c, inode_inum(dst)); + ret = bch2_flush_inode(c, dst); err: bch2_quota_reservation_put(c, dst, &quota_res); bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); @@ -3622,6 +3699,7 @@ loff_t bch2_llseek(struct file *file, loff_t offset, int whence) void bch2_fs_fsio_exit(struct bch_fs *c) { + bioset_exit(&c->nocow_flush_bioset); bioset_exit(&c->dio_write_bioset); bioset_exit(&c->dio_read_bioset); bioset_exit(&c->writepage_bioset); @@ -3641,7 +3719,9 @@ int bch2_fs_fsio_init(struct bch_fs *c) BIOSET_NEED_BVECS) || bioset_init(&c->dio_write_bioset, 4, offsetof(struct dio_write, op.wbio.bio), - BIOSET_NEED_BVECS)) + BIOSET_NEED_BVECS) || + bioset_init(&c->nocow_flush_bioset, + 1, offsetof(struct nocow_flush, bio), 0)) ret = -ENOMEM; pr_verbose_init(c->opts, "ret %i", ret); diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 4164d0669d70..e1c73a38c607 100644 --- a/fs/bcachefs/fs.h +++
b/fs/bcachefs/fs.h @@ -25,6 +25,17 @@ struct bch_inode_info { u32 ei_subvol; + /* + * When we've been doing nocow writes we'll need to issue flushes to the + * underlying block devices + * + * XXX: a device may have had a flush issued by some other codepath. It + * would be better to keep for each device a sequence number that's + * incremented when we issue a cache flush, and track here the sequence + * number that needs flushing. + */ + struct bch_devs_mask ei_devs_need_flush; + /* copy of inode in btree: */ struct bch_inode_unpacked ei_inode; }; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index f338cf6fd8b7..a98e40065122 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -892,4 +892,7 @@ void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c, #define x(_name, _bits) opts->_name = inode_opt_get(c, inode, _name); BCH_INODE_OPTS() #undef x + + if (opts->nocow) + opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0; } diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 1d0ec638f645..d511bd664953 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -34,6 +34,7 @@ #include "trace.h" #include +#include #include #include @@ -375,24 +376,118 @@ int bch2_extent_fallocate(struct btree_trans *trans, s64 *i_sectors_delta, struct write_point_specifier write_point) { - int ret; struct bch_fs *c = trans->c; struct disk_reservation disk_res = { 0 }; - struct bkey_i_reservation *reservation = - bch2_trans_kmalloc(trans, sizeof(*reservation)); + struct closure cl; + struct open_buckets open_buckets; + struct bkey_s_c k; + struct bkey_buf old, new; + bool have_reservation = false; + bool unwritten = opts.nocow && + c->sb.version >= bcachefs_metadata_version_unwritten_extents; + int ret; - ret = PTR_ERR_OR_ZERO(reservation); + bch2_bkey_buf_init(&old); + bch2_bkey_buf_init(&new); + closure_init_stack(&cl); + open_buckets.nr = 0; +retry: + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); if
(ret) return ret; - bkey_reservation_init(&reservation->k_i); - reservation->k.p = iter->pos; - bch2_key_resize(&reservation->k, sectors); - reservation->v.nr_replicas = opts.data_replicas; + sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset); + + if (!have_reservation) { + unsigned new_replicas = + max(0, (int) opts.data_replicas - + (int) bch2_bkey_nr_ptrs_fully_allocated(k)); + /* + * Get a disk reservation before (in the nocow case) calling + * into the allocator: + */ + ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0); + if (unlikely(ret)) + goto out; + + bch2_bkey_buf_reassemble(&old, c, k); + } + + if (have_reservation) { + if (!bch2_extents_match(k, bkey_i_to_s_c(old.k))) + goto out; + + bch2_key_resize(&new.k->k, sectors); + } else if (!unwritten) { + struct bkey_i_reservation *reservation; + + bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64)); + reservation = bkey_reservation_init(new.k); + reservation->k.p = iter->pos; + bch2_key_resize(&reservation->k, sectors); + reservation->v.nr_replicas = opts.data_replicas; + } else { + struct bkey_i_extent *e; + struct bch_devs_list devs_have; + struct write_point *wp; + struct bch_extent_ptr *ptr; + + devs_have.nr = 0; + + bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX); + + e = bkey_extent_init(new.k); + e->k.p = iter->pos; + + ret = bch2_alloc_sectors_start_trans(trans, + opts.foreground_target, + false, + write_point, + &devs_have, + opts.data_replicas, + opts.data_replicas, + RESERVE_none, 0, &cl, &wp); + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) { + bch2_trans_unlock(trans); + closure_sync(&cl); + goto retry; + } + if (ret) + return ret; + + sectors = min(sectors, wp->sectors_free); + + bch2_key_resize(&e->k, sectors); + + bch2_open_bucket_get(c, wp, &open_buckets); + bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); + bch2_alloc_sectors_done(c, wp); + + extent_for_each_ptr(extent_i_to_s(e), ptr) + ptr->unwritten = true; + } 
+ + have_reservation = true; - ret = bch2_extent_update(trans, inum, iter, &reservation->k_i, &disk_res, + ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res, 0, i_sectors_delta, true); +out: + if ((atomic_read(&cl.remaining) & CLOSURE_REMAINING_MASK) != 1) { + bch2_trans_unlock(trans); + closure_sync(&cl); + } + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + bch2_trans_begin(trans); + goto retry; + } + + bch2_open_buckets_put(c, &open_buckets); bch2_disk_reservation_put(c, &disk_res); + bch2_bkey_buf_exit(&new, c); + bch2_bkey_buf_exit(&old, c); + return ret; } @@ -539,7 +634,8 @@ static int bch2_write_index_default(struct bch_write_op *op) void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, enum bch_data_type type, - const struct bkey_i *k) + const struct bkey_i *k, + bool nocow) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); const struct bch_extent_ptr *ptr; @@ -573,8 +669,9 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, n->c = c; n->dev = ptr->dev; - n->have_ioref = bch2_dev_get_ioref(ca, + n->have_ioref = nocow || bch2_dev_get_ioref(ca, type == BCH_DATA_btree ? 
READ : WRITE); + n->nocow = nocow; n->submit_time = local_clock(); n->inode_offset = bkey_start_offset(&k->k); n->bio.bi_iter.bi_sector = ptr->offset; @@ -801,6 +898,9 @@ static void bch2_write_endio(struct bio *bio) op->flags |= BCH_WRITE_IO_ERROR; } + if (wbio->nocow) + set_bit(wbio->dev, op->devs_need_flush->d); + if (wbio->have_ioref) { bch2_latency_acct(ca, wbio->submit_time, WRITE); percpu_ref_put(&ca->io_ref); @@ -1221,6 +1321,321 @@ err: return ret; } +static bool bch2_extent_is_writeable(struct bch_write_op *op, + struct bkey_s_c k) +{ + struct bch_fs *c = op->c; + struct bkey_s_c_extent e; + struct extent_ptr_decoded p; + const union bch_extent_entry *entry; + unsigned replicas = 0; + + if (k.k->type != KEY_TYPE_extent) + return false; + + e = bkey_s_c_to_extent(k); + extent_for_each_ptr_decode(e, p, entry) { + if (p.crc.csum_type || + crc_is_compressed(p.crc) || + p.has_ec) + return false; + + replicas += bch2_extent_ptr_durability(c, &p); + } + + return replicas >= op->opts.data_replicas; +} + +static inline void bch2_nocow_write_unlock(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + const struct bch_extent_ptr *ptr; + struct bkey_i *k; + + for_each_keylist_key(&op->insert_keys, k) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); + + bkey_for_each_ptr(ptrs, ptr) + bch2_bucket_nocow_unlock(&c->nocow_locks, + PTR_BUCKET_POS(c, ptr), + BUCKET_NOCOW_LOCK_UPDATE); + } +} + +static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *orig, + struct bkey_s_c k, + u64 new_i_size) +{ + struct bkey_i *new; + struct bkey_ptrs ptrs; + struct bch_extent_ptr *ptr; + int ret; + + if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) { + /* trace this */ + return 0; + } + + new = bch2_bkey_make_mut(trans, k); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; + + bch2_cut_front(bkey_start_pos(&orig->k), new); + bch2_cut_back(orig->k.p, new); + + ptrs = 
bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_for_each_ptr(ptrs, ptr) + ptr->unwritten = 0; + + /* + * Note that we're not calling bch2_subvol_get_snapshot() in this path - + * that was done when we kicked off the write, and here it's important + * that we update the extent that we wrote to - even if a snapshot has + * since been created. The write is still outstanding, so we're ok + * w.r.t. snapshot atomicity: + */ + return bch2_extent_update_i_size_sectors(trans, iter, + min(new->k.p.offset << 9, new_i_size), 0) ?: + bch2_trans_update(trans, iter, new, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +} + +static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_i *orig; + struct bkey_s_c k; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_keylist_key(&op->insert_keys, orig) { + ret = for_each_btree_key_upto_commit(&trans, iter, BTREE_ID_extents, + bkey_start_pos(&orig->k), orig->k.p, + BTREE_ITER_INTENT, k, + NULL, NULL, BTREE_INSERT_NOFAIL, ({ + bch2_nocow_write_convert_one_unwritten(&trans, &iter, orig, k, op->new_i_size); + })); + + if (ret && !bch2_err_matches(ret, EROFS)) { + struct bkey_i *k = bch2_keylist_front(&op->insert_keys); + + bch_err_inum_offset_ratelimited(c, + k->k.p.inode, k->k.p.offset << 9, + "write error while doing btree update: %s", + bch2_err_str(ret)); + } + + if (ret) { + op->error = ret; + break; + } + } + + bch2_trans_exit(&trans); +} + +static void __bch2_nocow_write_done(struct bch_write_op *op) +{ + bch2_nocow_write_unlock(op); + + if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { + op->error = -EIO; + } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN)) + bch2_nocow_write_convert_unwritten(op); +} + +static void bch2_nocow_write_done(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + + __bch2_nocow_write_done(op); + bch2_write_done(cl); +} + +static void 
bch2_nocow_write(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_ptrs_c ptrs; + const struct bch_extent_ptr *ptr, *ptr2; + struct { + struct bpos b; + unsigned gen; + two_state_lock_t *l; + } buckets[BCH_REPLICAS_MAX]; + unsigned nr_buckets = 0; + u32 snapshot; + int ret, i; + + if (op->flags & BCH_WRITE_MOVE) + return; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, op->subvol, &snapshot); + if (unlikely(ret)) + goto err; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(op->pos.inode, op->pos.offset, snapshot), + BTREE_ITER_SLOTS); + while (1) { + struct bio *bio = &op->wbio.bio; + + nr_buckets = 0; + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + break; + + /* fall back to normal cow write path? */ + if (unlikely(k.k->p.snapshot != snapshot || + !bch2_extent_is_writeable(op, k))) + break; + + if (bch2_keylist_realloc(&op->insert_keys, + op->inline_keys, + ARRAY_SIZE(op->inline_keys), + k.k->u64s)) + break; + + /* Get iorefs before dropping btree locks: */ + ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr(ptrs, ptr) { + buckets[nr_buckets].b = PTR_BUCKET_POS(c, ptr); + buckets[nr_buckets].gen = ptr->gen; + buckets[nr_buckets].l = + bucket_nocow_lock(&c->nocow_locks, buckets[nr_buckets].b); + + prefetch(buckets[nr_buckets].l); + nr_buckets++; + + if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE))) + goto err_get_ioref; + + if (ptr->unwritten) + op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; + } + + /* Unlock before taking nocow locks, doing IO: */ + bkey_reassemble(op->insert_keys.top, k); + bch2_trans_unlock(&trans); + + bch2_cut_front(op->pos, op->insert_keys.top); + if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN) + bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); + + for (i = 0; i < nr_buckets; 
i++) { + struct bch_dev *ca = bch_dev_bkey_exists(c, buckets[i].b.inode); + two_state_lock_t *l = buckets[i].l; + bool stale; + + if (!bch2_two_state_trylock(l, BUCKET_NOCOW_LOCK_UPDATE)) + __bch2_bucket_nocow_lock(&c->nocow_locks, l, BUCKET_NOCOW_LOCK_UPDATE); + + rcu_read_lock(); + stale = gen_after(*bucket_gen(ca, buckets[i].b.offset), buckets[i].gen); + rcu_read_unlock(); + + if (unlikely(stale)) + goto err_bucket_stale; + } + + bio = &op->wbio.bio; + if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) { + bio = bio_split(bio, k.k->p.offset - op->pos.offset, + GFP_KERNEL, &c->bio_write); + wbio_init(bio)->put_bio = true; + bio->bi_opf = op->wbio.bio.bi_opf; + } else { + op->flags |= BCH_WRITE_DONE; + } + + op->pos.offset += bio_sectors(bio); + op->written += bio_sectors(bio); + + bio->bi_end_io = bch2_write_endio; + bio->bi_private = &op->cl; + bio->bi_opf |= REQ_OP_WRITE; + closure_get(&op->cl); + bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, + op->insert_keys.top, true); + + bch2_keylist_push(&op->insert_keys); + if (op->flags & BCH_WRITE_DONE) + break; + bch2_btree_iter_advance(&iter); + } +out: + bch2_trans_iter_exit(&trans, &iter); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + if (ret) { + bch_err_inum_offset_ratelimited(c, + op->pos.inode, + op->pos.offset << 9, + "%s: btree lookup error %s", + __func__, bch2_err_str(ret)); + op->error = ret; + op->flags |= BCH_WRITE_DONE; + } + + bch2_trans_exit(&trans); + + /* fallback to cow write path? 
*/ + if (!(op->flags & BCH_WRITE_DONE)) { + closure_sync(&op->cl); + __bch2_nocow_write_done(op); + op->insert_keys.top = op->insert_keys.keys; + } else if (op->flags & BCH_WRITE_SYNC) { + closure_sync(&op->cl); + bch2_nocow_write_done(&op->cl); + } else { + /* + * XXX + * needs to run out of process context because ei_quota_lock is + * a mutex + */ + continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op)); + } + return; +err_get_ioref: + bkey_for_each_ptr(ptrs, ptr2) { + if (ptr2 == ptr) + break; + + percpu_ref_put(&bch_dev_bkey_exists(c, ptr2->dev)->io_ref); + } + + /* Fall back to COW path: */ + goto out; +err_bucket_stale: + while (--i >= 0) + bch2_bucket_nocow_unlock(&c->nocow_locks, + buckets[i].b, + BUCKET_NOCOW_LOCK_UPDATE); + + bkey_for_each_ptr(ptrs, ptr2) + percpu_ref_put(&bch_dev_bkey_exists(c, ptr2->dev)->io_ref); + + /* We can retry this: */ + ret = BCH_ERR_transaction_restart; + goto out; +} + static void __bch2_write(struct bch_write_op *op) { struct bch_fs *c = op->c; @@ -1230,6 +1645,12 @@ static void __bch2_write(struct bch_write_op *op) int ret; nofs_flags = memalloc_nofs_save(); + + if (unlikely(op->opts.nocow)) { + bch2_nocow_write(op); + if (op->flags & BCH_WRITE_DONE) + goto out_nofs_restore; + } again: memset(&op->failed, 0, sizeof(op->failed)); op->btree_update_ready = false; @@ -1310,7 +1731,7 @@ err: key_to_write_offset); bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, - key_to_write); + key_to_write, false); } while (ret); /* @@ -1332,7 +1753,7 @@ err: } else { continue_at(&op->cl, bch2_write_index, NULL); } - +out_nofs_restore: memalloc_nofs_restore(nofs_flags); } @@ -2563,6 +2984,11 @@ void bch2_fs_io_exit(struct bch_fs *c) int bch2_fs_io_init(struct bch_fs *c) { + unsigned i; + + for (i = 0; i < ARRAY_SIZE(c->nocow_locks.l); i++) + two_state_lock_init(&c->nocow_locks.l[i]); + if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), BIOSET_NEED_BVECS) || bioset_init(&c->bio_read_split, 1, offsetof(struct 
bch_read_bio, bio), diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index aafe1bf993bb..77a4a1cef71c 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -22,7 +22,7 @@ static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw #endif void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, - enum bch_data_type, const struct bkey_i *); + enum bch_data_type, const struct bkey_i *, bool); #define BLK_STS_REMOVED ((__force blk_status_t)128) @@ -43,6 +43,7 @@ enum bch_write_flags { __BCH_WRITE_IN_WORKER, __BCH_WRITE_DONE, __BCH_WRITE_IO_ERROR, + __BCH_WRITE_CONVERT_UNWRITTEN, }; #define BCH_WRITE_ALLOC_NOWAIT (1U << __BCH_WRITE_ALLOC_NOWAIT) @@ -61,6 +62,7 @@ enum bch_write_flags { #define BCH_WRITE_IN_WORKER (1U << __BCH_WRITE_IN_WORKER) #define BCH_WRITE_DONE (1U << __BCH_WRITE_DONE) #define BCH_WRITE_IO_ERROR (1U << __BCH_WRITE_IO_ERROR) +#define BCH_WRITE_CONVERT_UNWRITTEN (1U << __BCH_WRITE_CONVERT_UNWRITTEN) static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) { @@ -90,7 +92,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->flags = 0; op->written = 0; op->error = 0; - op->csum_type = bch2_data_checksum_type(c, opts.data_checksum); + op->csum_type = bch2_data_checksum_type(c, opts); op->compression_type = bch2_compression_opt_to_type[opts.compression]; op->nr_replicas = 0; op->nr_replicas_required = c->opts.data_replicas_required; @@ -107,6 +109,7 @@ static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, op->res = (struct disk_reservation) { 0 }; op->new_i_size = U64_MAX; op->i_sectors_delta = 0; + op->devs_need_flush = NULL; } void bch2_write(struct closure *); diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index 8e83ce5bc805..200af9e3e6b0 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -97,6 +97,7 @@ struct bch_write_bio { bounce:1, put_bio:1, have_ioref:1, + nocow:1, used_mempool:1, 
first_btree_write:1; ); @@ -151,6 +152,12 @@ struct bch_write_op { struct keylist insert_keys; u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2]; + /* + * Bitmask of devices that have had nocow writes issued to them since + * last flush: + */ + struct bch_devs_mask *devs_need_flush; + /* Must be last: */ struct bch_write_bio wbio; }; diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 52f126a0bb73..9e453b8495e8 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -260,6 +260,12 @@ static int bch2_move_extent(struct btree_trans *trans, if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_move)) return -BCH_ERR_erofs_no_writes; + /* + * Before memory allocations & taking nocow locks in + * bch2_data_update_init(): + */ + bch2_trans_unlock(trans); + /* write path might have to decompress data: */ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); @@ -506,6 +512,7 @@ static int __bch2_move_data(struct moving_context *ctxt, */ bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); + bch2_trans_unlock(&trans); ret2 = bch2_move_extent(&trans, &iter, ctxt, io_opts, btree_id, k, data_opts); diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c new file mode 100644 index 000000000000..8f06e08370a2 --- /dev/null +++ b/fs/bcachefs/nocow_locking.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "nocow_locking.h" +#include "util.h" + +void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, + two_state_lock_t *l, int flags) +{ + struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks); + u64 start_time = local_clock(); + + bch2_two_state_lock(l, flags & BUCKET_NOCOW_LOCK_UPDATE); + bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time); +} diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h new file mode 100644 index 000000000000..2a7a9f44e88e --- /dev/null +++ b/fs/bcachefs/nocow_locking.h @@ -0,0 +1,55 @@ 
+/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_NOCOW_LOCKING_H +#define _BCACHEFS_NOCOW_LOCKING_H + +#include "bcachefs_format.h" +#include "two_state_shared_lock.h" + +#include <linux/hash.h> + +#define BUCKET_NOCOW_LOCKS_BITS 10 +#define BUCKET_NOCOW_LOCKS (1U << BUCKET_NOCOW_LOCKS_BITS) + +struct bucket_nocow_lock_table { + two_state_lock_t l[BUCKET_NOCOW_LOCKS]; +}; + +#define BUCKET_NOCOW_LOCK_UPDATE (1 << 0) + +static inline two_state_lock_t *bucket_nocow_lock(struct bucket_nocow_lock_table *t, + struct bpos bucket) +{ + u64 dev_bucket = bucket.inode << 56 | bucket.offset; + unsigned h = hash_64(dev_bucket, BUCKET_NOCOW_LOCKS_BITS); + + return t->l + (h & (BUCKET_NOCOW_LOCKS - 1)); +} + +static inline bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, + struct bpos bucket) +{ + two_state_lock_t *l = bucket_nocow_lock(t, bucket); + + return atomic_long_read(&l->v) != 0; +} + +static inline void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, + struct bpos bucket, int flags) +{ + two_state_lock_t *l = bucket_nocow_lock(t, bucket); + + bch2_two_state_unlock(l, flags & BUCKET_NOCOW_LOCK_UPDATE); +} + +void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *, two_state_lock_t *, int); + +static inline void bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, + struct bpos bucket, int flags) +{ + two_state_lock_t *l = bucket_nocow_lock(t, bucket); + + if (!bch2_two_state_trylock(l, flags & BUCKET_NOCOW_LOCK_UPDATE)) + __bch2_bucket_nocow_lock(t, l, flags); +} + +#endif /* _BCACHEFS_NOCOW_LOCKING_H */ diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 85927b306014..ef1b8a03f149 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -392,6 +392,13 @@ enum opt_type { OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, NULL) \ + x(nocow, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_BOOL(), \ + BCH_SB_NOCOW, false, \ + NULL, "Nocow mode: Writes will be done in place when possible.\n"\ + "Snapshots and
reflink will still caused writes to be COW\n"\ + "Implicitly disables data checksumming, compression and encryption")\ x(no_data_io, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 5e6fbbfd2d43..36bcb9ec2b3a 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -88,9 +88,10 @@ static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, unsigned dev) { - BUG_ON(bch2_dev_list_has_dev(*devs, dev)); - BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs)); - devs->devs[devs->nr++] = dev; + if (!bch2_dev_list_has_dev(*devs, dev)) { + BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs)); + devs->devs[devs->nr++] = dev; + } } static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index fabee8302afa..24dd2defe7c7 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -543,6 +543,7 @@ DECLARE_EVENT_CLASS(bucket_alloc, __field(u64, need_journal_commit ) __field(u64, nouse ) __field(bool, nonblocking ) + __field(u64, nocow ) __array(char, err, 32 ) ), @@ -560,10 +561,11 @@ DECLARE_EVENT_CLASS(bucket_alloc, __entry->need_journal_commit = s->skipped_need_journal_commit; __entry->nouse = s->skipped_nouse; __entry->nonblocking = nonblocking; + __entry->nocow = s->skipped_nocow; strscpy(__entry->err, err, sizeof(__entry->err)); ), - TP_printk("%d,%d reserve %s user %u bucket %llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nonblocking %u err %s", + TP_printk("%d,%d reserve %s user %u bucket %llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nocow %llu nonblocking %u err %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->reserve, __entry->user, @@ -576,6 +578,7 @@ DECLARE_EVENT_CLASS(bucket_alloc, __entry->open, __entry->need_journal_commit, __entry->nouse, + __entry->nocow, 
__entry->nonblocking, __entry->err) ); -- cgit v1.2.3 From 9edbcc72f6987bbb58f113d04e7704b7a84106a6 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 15 Mar 2023 11:53:51 -0400 Subject: bcachefs: Fix bch2_evict_subvolume_inodes() This fixes a bug in bch2_evict_subvolume_inodes(): d_mark_dontcache() doesn't handle the case where i_count is already 0, we need to grab and put the inode in order for it to be dropped. Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs.h | 4 +++ fs/bcachefs/darray.h | 15 +++++--- fs/bcachefs/fs.c | 93 ++++++++++++++++++++++++++++++++++---------------- fs/bcachefs/fs.h | 1 + fs/bcachefs/inode.c | 3 -- fs/bcachefs/super.c | 3 ++ 6 files changed, 81 insertions(+), 38 deletions(-) (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 05fc0f7434dd..c1f27b4910a0 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -971,6 +971,10 @@ struct bch_fs { reflink_gc_table reflink_gc_table; size_t reflink_gc_nr; + /* fs.c */ + struct list_head vfs_inodes_list; + struct mutex vfs_inodes_lock; + /* VFS IO PATH - fs-io.c */ struct bio_set writepage_bioset; struct bio_set dio_write_bioset; diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h index 519ab9b96e67..978ab7961f1b 100644 --- a/fs/bcachefs/darray.h +++ b/fs/bcachefs/darray.h @@ -19,11 +19,11 @@ struct { \ typedef DARRAY(void) darray_void; -static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more) +static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, gfp_t gfp) { if (d->nr + more > d->size) { size_t new_size = roundup_pow_of_two(d->nr + more); - void *data = krealloc_array(d->data, new_size, t_size, GFP_KERNEL); + void *data = krealloc_array(d->data, new_size, t_size, gfp); if (!data) return -ENOMEM; @@ -35,20 +35,25 @@ static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more) return 0; } +#define darray_make_room_gfp(_d, _more, _gfp) \ + 
__darray_make_room((darray_void *) (_d), sizeof((_d)->data[0]), (_more), _gfp) + #define darray_make_room(_d, _more) \ - __darray_make_room((darray_void *) (_d), sizeof((_d)->data[0]), (_more)) + darray_make_room_gfp(_d, _more, GFP_KERNEL) #define darray_top(_d) ((_d).data[(_d).nr]) -#define darray_push(_d, _item) \ +#define darray_push_gfp(_d, _item, _gfp) \ ({ \ - int _ret = darray_make_room((_d), 1); \ + int _ret = darray_make_room_gfp((_d), 1, _gfp); \ \ if (!_ret) \ (_d)->data[(_d)->nr++] = (_item); \ _ret; \ }) +#define darray_push(_d, _item) darray_push_gfp(_d, _item, GFP_KERNEL) + #define darray_insert_item(_d, _pos, _item) \ ({ \ size_t pos = (_pos); \ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 828887abc261..129924dfaf69 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -201,6 +201,10 @@ struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) return ERR_PTR(ret); } + mutex_lock(&c->vfs_inodes_lock); + list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); + mutex_unlock(&c->vfs_inodes_lock); + unlock_new_inode(&inode->v); return &inode->v; @@ -314,6 +318,9 @@ err_before_quota: inode = old; } else { + mutex_lock(&c->vfs_inodes_lock); + list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); + mutex_unlock(&c->vfs_inodes_lock); /* * we really don't want insert_inode_locked2() to be setting * I_NEW... 
@@ -1370,6 +1377,7 @@ static struct inode *bch2_alloc_inode(struct super_block *sb) inode_init_once(&inode->v); mutex_init(&inode->ei_update_lock); two_state_lock_init(&inode->ei_pagecache_lock); + INIT_LIST_HEAD(&inode->ei_vfs_inode_list); mutex_init(&inode->ei_quota_lock); return &inode->v; @@ -1434,53 +1442,78 @@ static void bch2_evict_inode(struct inode *vinode) KEY_TYPE_QUOTA_WARN); bch2_inode_rm(c, inode_inum(inode)); } + + mutex_lock(&c->vfs_inodes_lock); + list_del_init(&inode->ei_vfs_inode_list); + mutex_unlock(&c->vfs_inodes_lock); } -void bch2_evict_subvolume_inodes(struct bch_fs *c, - snapshot_id_list *s) +void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) { - struct super_block *sb = c->vfs_sb; - struct inode *inode; + struct bch_inode_info *inode, **i; + DARRAY(struct bch_inode_info *) grabbed; + bool clean_pass = false, this_pass_clean; - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || - (inode->i_state & I_FREEING)) - continue; + /* + * Initially, we scan for inodes without I_DONTCACHE, then mark them to + * be pruned with d_mark_dontcache(). 
+ * + * Once we've had a clean pass where we didn't find any inodes without + * I_DONTCACHE, we wait for them to be freed: + */ - d_mark_dontcache(inode); - d_prune_aliases(inode); - } - spin_unlock(&sb->s_inode_list_lock); + darray_init(&grabbed); + darray_make_room(&grabbed, 1024); again: cond_resched(); - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - if (!snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) || - (inode->i_state & I_FREEING)) + this_pass_clean = true; + + mutex_lock(&c->vfs_inodes_lock); + list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) { + if (!snapshot_list_has_id(s, inode->ei_subvol)) continue; - if (!(inode->i_state & I_DONTCACHE)) { - d_mark_dontcache(inode); - d_prune_aliases(inode); - } + if (!(inode->v.i_state & I_DONTCACHE) && + !(inode->v.i_state & I_FREEING)) { + this_pass_clean = false; + + d_mark_dontcache(&inode->v); + d_prune_aliases(&inode->v); + + /* + * If i_count was zero, we have to take and release a + * ref in order for I_DONTCACHE to be noticed and the + * inode to be dropped; + */ + + if (!atomic_read(&inode->v.i_count) && + igrab(&inode->v) && + darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) + break; + } else if (clean_pass && this_pass_clean) { + wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW); + DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW); - spin_lock(&inode->i_lock); - if (snapshot_list_has_id(s, to_bch_ei(inode)->ei_subvol) && - !(inode->i_state & I_FREEING)) { - wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_NEW); - DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_list_lock); + mutex_unlock(&c->vfs_inodes_lock); + schedule(); finish_wait(wq, &wait.wq_entry); goto again; } + } + mutex_unlock(&c->vfs_inodes_lock); - spin_unlock(&inode->i_lock); + darray_for_each(grabbed, i) + iput(&(*i)->v); 
+ grabbed.nr = 0; + + if (!clean_pass || !this_pass_clean) { + clean_pass = this_pass_clean; + goto again; } - spin_unlock(&sb->s_inode_list_lock); + + darray_exit(&grabbed); } static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index e1c73a38c607..2e63cb6603bd 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -13,6 +13,7 @@ struct bch_inode_info { struct inode v; + struct list_head ei_vfs_inode_list; unsigned long ei_flags; struct mutex ei_update_lock; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 560545a7ea03..7ccbc00b7156 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -803,9 +803,6 @@ retry: bch2_inode_unpack(k, &inode_u); - /* Subvolume root? */ - BUG_ON(inode_u.bi_subvol); - bkey_inode_generation_init(&delete.k_i); delete.k.p = iter.pos; delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 278f8f19a230..d6f2f453c027 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -709,6 +709,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) sema_init(&c->io_in_flight, 128); + INIT_LIST_HEAD(&c->vfs_inodes_list); + mutex_init(&c->vfs_inodes_lock); + c->copy_gc_enabled = 1; c->rebalance.enabled = 1; c->promote_whole_extents = true; -- cgit v1.2.3 From 07f293c8630d5bdae1615e6add90c76fed333d20 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 9 Jul 2023 14:18:28 -0400 Subject: bcachefs: bch2_xattr_set() now updates ctime Fixes fstests generic/728 Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.h | 2 ++ fs/bcachefs/xattr.c | 31 +++++++++++++++++++------------ fs/bcachefs/xattr.h | 2 +- 3 files changed, 22 insertions(+), 13 deletions(-) (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 2e63cb6603bd..6170d214d648 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -196,6 +196,8 @@ int bch2_vfs_init(void); #else 
+#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) do {} while (0) + static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) {} static inline void bch2_vfs_exit(void) {} diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 43904c0ec9ba..70f78006daf2 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -167,23 +167,22 @@ err1: } int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_unpacked *inode_u, const struct bch_hash_info *hash_info, const char *name, const void *value, size_t size, int type, int flags) { + struct bch_fs *c = trans->c; struct btree_iter inode_iter = { NULL }; - struct bch_inode_unpacked inode_u; int ret; - /* - * We need to do an inode update so that bi_journal_sync gets updated - * and fsync works: - * - * Perhaps we should be updating bi_mtime too? - */ + ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); + if (ret) + return ret; - ret = bch2_inode_peek(trans, &inode_iter, &inode_u, inum, BTREE_ITER_INTENT) ?: - bch2_inode_write(trans, &inode_iter, &inode_u); + inode_u->bi_ctime = bch2_current_time(c); + + ret = bch2_inode_write(trans, &inode_iter, inode_u); bch2_trans_iter_exit(trans, &inode_iter); if (ret) @@ -373,12 +372,20 @@ static int bch2_xattr_set_handler(const struct xattr_handler *handler, struct bch_inode_info *inode = to_bch_ei(vinode); struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); + struct bch_inode_unpacked inode_u; + struct btree_trans trans; int ret; - ret = bch2_trans_do(c, NULL, NULL, 0, - bch2_xattr_set(&trans, inode_inum(inode), &hash, - name, value, size, + bch2_trans_init(&trans, c, 0, 0); + + ret = commit_do(&trans, NULL, NULL, 0, + bch2_xattr_set(&trans, inode_inum(inode), &inode_u, + &hash, name, value, size, handler->flags, flags)); + if (!ret) + bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME); + 
bch2_trans_exit(&trans); + return bch2_err_class(ret); } diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index ad568c06e1f8..f5a52e3a6016 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -40,7 +40,7 @@ struct bch_inode_info; /* Exported for cmd_migrate.c in tools: */ int bch2_xattr_set(struct btree_trans *, subvol_inum, - const struct bch_hash_info *, + struct bch_inode_unpacked *, const struct bch_hash_info *, const char *, const void *, size_t, int, int); ssize_t bch2_xattr_list(struct dentry *, char *, size_t); -- cgit v1.2.3 From 791236b85c2dfd3bc6b857431658efb49de83343 Mon Sep 17 00:00:00 2001 From: Joshua Ashton Date: Sat, 12 Aug 2023 15:47:45 +0100 Subject: bcachefs: Add btree_trans* to inode_set_fn This will be used when we need to re-hash a directory tree when setting flags. It is not possible to have concurrent btree_trans on a thread. Signed-off-by: Joshua Ashton Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-io.c | 12 ++++++++---- fs/bcachefs/fs-ioctl.c | 11 +++++++---- fs/bcachefs/fs.c | 5 +++-- fs/bcachefs/fs.h | 3 ++- fs/bcachefs/xattr.c | 3 ++- 5 files changed, 22 insertions(+), 12 deletions(-) (limited to 'fs/bcachefs/fs.h') diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 11a4919f30cd..ceab12fb8a8f 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -109,7 +109,8 @@ struct inode_new_size { unsigned fields; }; -static int inode_set_size(struct bch_inode_info *inode, +static int inode_set_size(struct btree_trans *trans, + struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { @@ -389,7 +390,8 @@ static int bch2_extend(struct mnt_idmap *idmap, return bch2_setattr_nonsize(idmap, inode, iattr); } -static int bch2_truncate_finish_fn(struct bch_inode_info *inode, +static int bch2_truncate_finish_fn(struct btree_trans *trans, + struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { @@ -397,7 +399,8 @@ static int bch2_truncate_finish_fn(struct bch_inode_info *inode, return 0; } 
-static int bch2_truncate_start_fn(struct bch_inode_info *inode, +static int bch2_truncate_start_fn(struct btree_trans *trans, + struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { u64 *new_i_size = p; @@ -518,7 +521,8 @@ err: /* fallocate: */ -static int inode_update_times_fn(struct bch_inode_info *inode, +static int inode_update_times_fn(struct btree_trans *trans, + struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { struct bch_fs *c = inode->v.i_sb->s_fs_info; diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index dfa1bf73c854..141bcced031e 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -31,7 +31,8 @@ struct flags_set { bool projinherit; }; -static int bch2_inode_flags_set(struct bch_inode_info *inode, +static int bch2_inode_flags_set(struct btree_trans *trans, + struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { @@ -124,7 +125,8 @@ static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, return copy_to_user(arg, &fa, sizeof(fa)); } -static int fssetxattr_inode_update_fn(struct bch_inode_info *inode, +static int fssetxattr_inode_update_fn(struct btree_trans *trans, + struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { @@ -135,7 +137,7 @@ static int fssetxattr_inode_update_fn(struct bch_inode_info *inode, bi->bi_project = s->projid; } - return bch2_inode_flags_set(inode, bi, p); + return bch2_inode_flags_set(trans, inode, bi, p); } static int bch2_ioc_fssetxattr(struct bch_fs *c, @@ -192,7 +194,8 @@ err: return ret; } -static int bch2_reinherit_attrs_fn(struct bch_inode_info *inode, +static int bch2_reinherit_attrs_fn(struct btree_trans *trans, + struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index aa7ec5dc9ff1..113518ebd095 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -92,7 +92,7 @@ retry: ret = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode), 
BTREE_ITER_INTENT) ?: - (set ? set(inode, &inode_u, p) : 0) ?: + (set ? set(&trans, inode, &inode_u, p) : 0) ?: bch2_inode_write(&trans, &iter, &inode_u) ?: bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); @@ -1414,7 +1414,8 @@ static void bch2_destroy_inode(struct inode *vinode) call_rcu(&vinode->i_rcu, bch2_i_callback); } -static int inode_update_times_fn(struct bch_inode_info *inode, +static int inode_update_times_fn(struct btree_trans *trans, + struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 6170d214d648..10e11119ded2 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -174,7 +174,8 @@ static inline int bch2_set_projid(struct bch_fs *c, struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum); /* returns 0 if we want to do the update, or error is passed up */ -typedef int (*inode_set_fn)(struct bch_inode_info *, +typedef int (*inode_set_fn)(struct btree_trans *, + struct bch_inode_info *, struct bch_inode_unpacked *, void *); void bch2_inode_update_after_write(struct btree_trans *, diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index 70f78006daf2..6f6b3caf0607 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -494,7 +494,8 @@ struct inode_opt_set { bool defined; }; -static int inode_opt_set_fn(struct bch_inode_info *inode, +static int inode_opt_set_fn(struct btree_trans *trans, + struct bch_inode_info *inode, struct bch_inode_unpacked *bi, void *p) { -- cgit v1.2.3 From 793a06d984511593c6375d219b38cc84f5a71aff Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Sep 2023 19:07:16 -0400 Subject: bcachefs: Fixes for building in userspace Signed-off-by: Kent Overstreet --- fs/bcachefs/checksum.c | 20 ++++++++++++++++++++ fs/bcachefs/checksum.h | 3 +++ fs/bcachefs/fs.h | 2 +- fs/bcachefs/io_read.c | 2 ++ fs/bcachefs/super-io.c | 2 +- 5 files changed, 27 insertions(+), 2 deletions(-) (limited to 'fs/bcachefs/fs.h') diff --git 
a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index c70262b7fd6e..1948119edbf4 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -559,6 +559,26 @@ int bch2_request_key(struct bch_sb *sb, struct bch_key *key) return ret; } +#ifndef __KERNEL__ +int bch2_revoke_key(struct bch_sb *sb) +{ + key_serial_t key_id; + struct printbuf key_description = PRINTBUF; + + prt_printf(&key_description, "bcachefs:"); + pr_uuid(&key_description, sb->user_uuid.b); + + key_id = request_key("user", key_description.buf, NULL, KEY_SPEC_USER_KEYRING); + printbuf_exit(&key_description); + if (key_id < 0) + return errno; + + keyctl_revoke(key_id); + + return 0; +} +#endif + int bch2_decrypt_sb_key(struct bch_fs *c, struct bch_sb_field_crypt *crypt, struct bch_key *key) diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 779f175029a8..13998388c545 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -47,6 +47,9 @@ struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); int bch2_request_key(struct bch_sb *, struct bch_key *); +#ifndef __KERNEL__ +int bch2_revoke_key(struct bch_sb *); +#endif int bch2_encrypt(struct bch_fs *, unsigned, struct nonce, void *data, size_t); diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h index 10e11119ded2..5edf1d4b9e6b 100644 --- a/fs/bcachefs/fs.h +++ b/fs/bcachefs/fs.h @@ -197,7 +197,7 @@ int bch2_vfs_init(void); #else -#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) do {} while (0) +#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) ({ do {} while (0); }) static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) {} diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 9a57da00573d..443c3ea65527 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -24,6 +24,8 @@ #include "subvolume.h" #include "trace.h" +#include <linux/sched/mm.h> + #ifndef
CONFIG_BCACHEFS_NO_LATENCY_ACCT static bool bch2_target_congested(struct bch_fs *c, u16 target) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 55bc03d2e8ed..c9bf342d14aa 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -675,7 +675,7 @@ retry: #ifndef __KERNEL__ if (opt_get(*opts, direct_io) == false) - sb->mode |= FMODE_BUFFERED; + sb->mode |= BLK_OPEN_BUFFERED; #endif if (!opt_get(*opts, noexcl)) -- cgit v1.2.3