author    | Mike Marshall <hubcap@omnibond.com> | 2017-02-25 17:12:48 +0100
committer | Mike Marshall <hubcap@omnibond.com> | 2017-02-25 17:12:48 +0100
commit    | e98bdb3059cbf2b1cd4261e126b08429f64466c3 (patch)
tree      | e378fc95b495cc6e0e558f247e99bcaa21a6d567 /fs
parent    | orangefs: fix buffer size mis-match between kernel space and user space. (diff)
parent    | Linux 4.10 (diff)
Merge tag 'v4.10' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux into for-next
Linux 4.10
Diffstat (limited to 'fs')
538 files changed, 16359 insertions, 21240 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 6181ad79e1a5..adaf6f6dd858 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -34,6 +34,7 @@ #include <linux/idr.h> #include <linux/sched.h> #include <linux/uio.h> +#include <linux/bvec.h> #include <net/9p/9p.h> #include <net/9p/client.h> @@ -309,18 +310,10 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping, p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); - if (unlikely(copied < len)) { - /* - * zero out the rest of the area - */ - unsigned from = pos & (PAGE_SIZE - 1); - - zero_user(page, from + copied, len - copied); - flush_dcache_page(page); + if (unlikely(copied < len && !PageUptodate(page))) { + copied = 0; + goto out; } - - if (!PageUptodate(page)) - SetPageUptodate(page); /* * No need to use i_size_read() here, the i_size * cannot change under us because we hold the i_mutex. @@ -330,6 +323,7 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping, i_size_write(inode, last_pos); } set_page_dirty(page); +out: unlock_page(page); put_page(page); diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index d7b78d531e63..6a0f3fa85ef7 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -34,7 +34,7 @@ #include <linux/list.h> #include <linux/pagemap.h> #include <linux/utsname.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/idr.h> #include <linux/uio.h> #include <linux/slab.h> diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 30ca770c5e0b..f4f4450119e4 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1464,7 +1464,6 @@ static const struct inode_operations v9fs_file_inode_operations = { }; static const struct inode_operations v9fs_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = v9fs_vfs_get_link, .getattr = v9fs_vfs_getattr, .setattr = v9fs_vfs_setattr, diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index afaa4b6de801..5999bd050678 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -979,7 +979,6 @@ const struct inode_operations v9fs_file_inode_operations_dotl = { }; const struct inode_operations v9fs_symlink_inode_operations_dotl = { - .readlink = generic_readlink, .get_link = v9fs_vfs_get_link_dotl, .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, diff --git a/fs/Kconfig b/fs/Kconfig index 4bd03a2b0518..83eab52fb3f6 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -38,6 +38,7 @@ config FS_DAX bool "Direct Access (DAX) support" depends on MMU depends on !(ARM || MIPS || SPARC) + select FS_IOMAP help Direct Access (DAX) can be used on memory-backed block devices. If the block device supports DAX and the filesystem supports DAX, @@ -55,7 +56,6 @@ config FS_DAX_PMD depends on FS_DAX depends on ZONE_DEVICE depends on TRANSPARENT_HUGEPAGE - depends on BROKEN endif # BLOCK @@ -235,7 +235,6 @@ source "fs/efs/Kconfig" source "fs/jffs2/Kconfig" # UBIFS File system configuration source "fs/ubifs/Kconfig" -source "fs/logfs/Kconfig" source "fs/cramfs/Kconfig" source "fs/squashfs/Kconfig" source "fs/freevxfs/Kconfig" diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 4c09d93d9569..b2f82cf6bf86 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -170,8 +170,8 @@ config BINFMT_MISC You can do other nice things, too. Read the file <file:Documentation/binfmt_misc.txt> to learn how to use this - feature, <file:Documentation/java.txt> for information about how - to include Java support. 
and <file:Documentation/mono.txt> for + feature, <file:Documentation/admin-guide/java.rst> for information about how + to include Java support. and <file:Documentation/admin-guide/mono.rst> for information about how to include Mono-based .NET support. To use binfmt_misc, you will need to mount it: diff --git a/fs/Makefile b/fs/Makefile index ed2b63257ba9..7bbaca9c67b1 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -97,7 +97,6 @@ obj-$(CONFIG_NTFS_FS) += ntfs/ obj-$(CONFIG_UFS_FS) += ufs/ obj-$(CONFIG_EFS_FS) += efs/ obj-$(CONFIG_JFFS2_FS) += jffs2/ -obj-$(CONFIG_LOGFS) += logfs/ obj-$(CONFIG_UBIFS_FS) += ubifs/ obj-$(CONFIG_AFFS_FS) += affs/ obj-$(CONFIG_ROMFS_FS) += romfs/ diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c index 69b03dbb792f..ae622cdce142 100644 --- a/fs/affs/symlink.c +++ b/fs/affs/symlink.c @@ -70,7 +70,6 @@ const struct address_space_operations affs_symlink_aops = { }; const struct inode_operations affs_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = page_get_link, .setattr = affs_notify_change, }; diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 2853b4095344..35efb9a31dd7 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -14,7 +14,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/sched.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" static struct proc_dir_entry *proc_afs; @@ -42,7 +42,7 @@ #include <linux/mount.h> #include <asm/kmap_types.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" @@ -277,10 +277,10 @@ static void put_aio_ring_file(struct kioctx *ctx) struct address_space *i_mapping; if (aio_ring_file) { - truncate_setsize(aio_ring_file->f_inode, 0); + truncate_setsize(file_inode(aio_ring_file), 0); /* Prevent further access to the kioctx from migratepages */ - i_mapping = aio_ring_file->f_inode->i_mapping; + i_mapping = aio_ring_file->f_mapping; spin_lock(&i_mapping->private_lock); i_mapping->private_data = NULL; ctx->aio_ring_file = NULL; @@ -483,7 +483,7 @@ static int aio_setup_ring(struct kioctx *ctx) for (i = 0; i < nr_pages; i++) { struct page *page; - page = find_or_create_page(file->f_inode->i_mapping, + page = find_or_create_page(file->f_mapping, i, GFP_HIGHUSER | __GFP_ZERO); if (!page) break; @@ -1085,7 +1085,8 @@ static void aio_complete(struct kiocb *kiocb, long res, long res2) * Tell lockdep we inherited freeze protection from submission * thread. */ - __sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE); + if (S_ISREG(file_inode(file)->i_mode)) + __sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE); file_end_write(file); } @@ -1285,7 +1286,7 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, struct io_event __user *event, struct timespec __user *timeout) { - ktime_t until = { .tv64 = KTIME_MAX }; + ktime_t until = KTIME_MAX; long ret = 0; if (timeout) { @@ -1311,7 +1312,7 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, * the ringbuffer empty. So in practice we should be ok, but it's * something to be aware of when touching this code. 
*/ - if (until.tv64 == 0) + if (until == 0) aio_read_events(ctx, min_nr, nr, event, &ret); else wait_event_interruptible_hrtimeout(ctx->wait, @@ -1367,6 +1368,39 @@ out: return ret; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_events, u32 __user *, ctx32p) +{ + struct kioctx *ioctx = NULL; + unsigned long ctx; + long ret; + + ret = get_user(ctx, ctx32p); + if (unlikely(ret)) + goto out; + + ret = -EINVAL; + if (unlikely(ctx || nr_events == 0)) { + pr_debug("EINVAL: ctx %lu nr_events %u\n", + ctx, nr_events); + goto out; + } + + ioctx = ioctx_alloc(nr_events); + ret = PTR_ERR(ioctx); + if (!IS_ERR(ioctx)) { + /* truncating is ok because it's a user address */ + ret = put_user((u32)ioctx->user_id, ctx32p); + if (ret) + kill_ioctx(current->mm, ioctx, NULL); + percpu_ref_put(&ioctx->users); + } + +out: + return ret; +} +#endif + /* sys_io_destroy: * Destroy the aio_context specified. May cancel any outstanding * AIOs and block on completion. Will fail with -ENOSYS if not @@ -1492,7 +1526,8 @@ static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored, * by telling it the lock got released so that it doesn't * complain about held lock when we return to userspace. */ - __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE); + if (S_ISREG(file_inode(file)->i_mode)) + __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE); } kfree(iovec); return ret; @@ -1591,8 +1626,8 @@ out_put_req: return ret; } -long do_io_submit(aio_context_t ctx_id, long nr, - struct iocb __user *__user *iocbpp, bool compat) +static long do_io_submit(aio_context_t ctx_id, long nr, + struct iocb __user *__user *iocbpp, bool compat) { struct kioctx *ctx; long ret = 0; @@ -1662,6 +1697,44 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, return do_io_submit(ctx_id, nr, iocbpp, 0); } +#ifdef CONFIG_COMPAT +static inline long +copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) +{ + compat_uptr_t uptr; + int i; + + for (i = 0; i < nr; ++i) { + if (get_user(uptr, ptr32 + i)) + return -EFAULT; + if (put_user(compat_ptr(uptr), ptr64 + i)) + return -EFAULT; + } + return 0; +} + +#define MAX_AIO_SUBMITS (PAGE_SIZE/sizeof(struct iocb *)) + +COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, + int, nr, u32 __user *, iocb) +{ + struct iocb __user * __user *iocb64; + long ret; + + if (unlikely(nr < 0)) + return -EINVAL; + + if (nr > MAX_AIO_SUBMITS) + nr = MAX_AIO_SUBMITS; + + iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64)); + ret = copy_iocb(nr, iocb, iocb64); + if (!ret) + ret = do_io_submit(ctx_id, nr, iocb64, 1); + return ret; +} +#endif + /* lookup_kiocb * Finds a given iocb for cancellation. 
*/ @@ -1761,3 +1834,25 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, } return ret; } + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id, + compat_long_t, min_nr, + compat_long_t, nr, + struct io_event __user *, events, + struct compat_timespec __user *, timeout) +{ + struct timespec t; + struct timespec __user *ut = NULL; + + if (timeout) { + if (compat_get_timespec(&t, timeout)) + return -EFAULT; + + ut = compat_alloc_user_space(sizeof(*ut)); + if (copy_to_user(ut, &t, sizeof(t))) + return -EFAULT; + } + return sys_io_getevents(ctx_id, min_nr, nr, events, ut); +} +#endif diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 80ef38c73e5a..3168ee4e77f4 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -20,7 +20,7 @@ #include <linux/magic.h> #include <linux/anon_inodes.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> static struct vfsmount *anon_inode_mnt __read_mostly; static struct inode *anon_inode_inode; diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index a1fba4285277..c885daae68c8 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -145,7 +145,7 @@ void autofs4_free_ino(struct autofs_info *); /* Expiration */ int is_autofs4_dentry(struct dentry *); -int autofs4_expire_wait(struct dentry *dentry, int rcu_walk); +int autofs4_expire_wait(const struct path *path, int rcu_walk); int autofs4_expire_run(struct super_block *, struct vfsmount *, struct autofs_sb_info *, struct autofs_packet_expire __user *); @@ -217,7 +217,8 @@ static inline int autofs_prepare_pipe(struct file *pipe) /* Queue management functions */ -int autofs4_wait(struct autofs_sb_info *, struct dentry *, enum autofs_notify); +int autofs4_wait(struct autofs_sb_info *, + const struct path *, enum autofs_notify); int autofs4_wait_release(struct autofs_sb_info *, autofs_wqt_t, int); void autofs4_catatonic_mode(struct autofs_sb_info *); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index fc09eb77ddf3..6f48d670c941 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -204,7 +204,7 @@ static int autofs_dev_ioctl_protosubver(struct file *fp, /* Find the topmost mount satisfying test() */ static int find_autofs_mount(const char *pathname, struct path *res, - int test(struct path *path, void *data), + int test(const struct path *path, void *data), void *data) { struct path path; @@ -230,12 +230,12 @@ static int find_autofs_mount(const char *pathname, return err; } -static int test_by_dev(struct path *path, void *p) +static int test_by_dev(const struct path *path, void *p) { return path->dentry->d_sb->s_dev == *(dev_t *)p; } -static int test_by_type(struct path *path, void *p) +static int test_by_type(const struct path *path, void *p) { struct autofs_info *ino = autofs4_dentry_ino(path->dentry); @@ -468,7 +468,7 @@ static int autofs_dev_ioctl_requester(struct file *fp, ino = autofs4_dentry_ino(path.dentry); if (ino) { err = 0; - autofs4_expire_wait(path.dentry, 0); + autofs4_expire_wait(&path, 0); spin_lock(&sbi->fs_lock); param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid); @@ -575,7 +575,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, devid = new_encode_dev(dev); - err = have_submounts(path.dentry); + err = path_has_submounts(&path); if (follow_down_one(&path)) magic = path.dentry->d_sb->s_magic; diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index d8e6d421c27f..57725d4a8c59 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -310,26 +310,29 @@ struct dentry 
*autofs4_expire_direct(struct super_block *sb, now = jiffies; timeout = sbi->exp_timeout; - spin_lock(&sbi->fs_lock); - ino = autofs4_dentry_ino(root); - /* No point expiring a pending mount */ - if (ino->flags & AUTOFS_INF_PENDING) - goto out; if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { + spin_lock(&sbi->fs_lock); + ino = autofs4_dentry_ino(root); + /* No point expiring a pending mount */ + if (ino->flags & AUTOFS_INF_PENDING) { + spin_unlock(&sbi->fs_lock); + goto out; + } ino->flags |= AUTOFS_INF_WANT_EXPIRE; spin_unlock(&sbi->fs_lock); synchronize_rcu(); - spin_lock(&sbi->fs_lock); if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { + spin_lock(&sbi->fs_lock); ino->flags |= AUTOFS_INF_EXPIRING; init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); return root; } + spin_lock(&sbi->fs_lock); ino->flags &= ~AUTOFS_INF_WANT_EXPIRE; + spin_unlock(&sbi->fs_lock); } out: - spin_unlock(&sbi->fs_lock); dput(root); return NULL; @@ -495,8 +498,9 @@ found: return expired; } -int autofs4_expire_wait(struct dentry *dentry, int rcu_walk) +int autofs4_expire_wait(const struct path *path, int rcu_walk) { + struct dentry *dentry = path->dentry; struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); int status; @@ -525,7 +529,7 @@ retry: pr_debug("waiting for expire %p name=%pd\n", dentry, dentry); - status = autofs4_wait(sbi, dentry, NFY_NONE); + status = autofs4_wait(sbi, path, NFY_NONE); wait_for_completion(&ino->expire_complete); pr_debug("expire done status=%d\n", status); @@ -592,11 +596,12 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, if (dentry) { struct autofs_info *ino = autofs4_dentry_ino(dentry); + const struct path path = { .mnt = mnt, .dentry = dentry }; /* This is synchronous because it makes the daemon a * little easier */ - ret = autofs4_wait(sbi, dentry, NFY_EXPIRE); + ret = autofs4_wait(sbi, &path, NFY_EXPIRE); spin_lock(&sbi->fs_lock); /* avoid rapid-fire expire attempts if expiry fails */ diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 438b5bf675b6..09e7d68dff02 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -94,7 +94,7 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",indirect"); #ifdef CONFIG_CHECKPOINT_RESTORE if (sbi->pipe) - seq_printf(m, ",pipe_ino=%ld", sbi->pipe->f_inode->i_ino); + seq_printf(m, ",pipe_ino=%ld", file_inode(sbi->pipe)->i_ino); else seq_printf(m, ",pipe_ino=-1"); #endif diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index a11f73174877..82e8f6edfb48 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -32,7 +32,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file); static struct dentry *autofs4_lookup(struct inode *, struct dentry *, unsigned int); static struct vfsmount *autofs4_d_automount(struct path *); -static int autofs4_d_manage(struct dentry *, bool); +static int autofs4_d_manage(const struct path *, bool); static void autofs4_dentry_release(struct dentry *); const struct file_operations autofs4_root_operations = { @@ -123,7 +123,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file) * it. 
*/ spin_lock(&sbi->lookup_lock); - if (!d_mountpoint(dentry) && simple_empty(dentry)) { + if (!path_is_mountpoint(&file->f_path) && simple_empty(dentry)) { spin_unlock(&sbi->lookup_lock); return -ENOENT; } @@ -269,39 +269,41 @@ next: return NULL; } -static int autofs4_mount_wait(struct dentry *dentry, bool rcu_walk) +static int autofs4_mount_wait(const struct path *path, bool rcu_walk) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs4_sbi(path->dentry->d_sb); + struct autofs_info *ino = autofs4_dentry_ino(path->dentry); int status = 0; if (ino->flags & AUTOFS_INF_PENDING) { if (rcu_walk) return -ECHILD; - pr_debug("waiting for mount name=%pd\n", dentry); - status = autofs4_wait(sbi, dentry, NFY_MOUNT); + pr_debug("waiting for mount name=%pd\n", path->dentry); + status = autofs4_wait(sbi, path, NFY_MOUNT); pr_debug("mount wait done status=%d\n", status); } ino->last_used = jiffies; return status; } -static int do_expire_wait(struct dentry *dentry, bool rcu_walk) +static int do_expire_wait(const struct path *path, bool rcu_walk) { + struct dentry *dentry = path->dentry; struct dentry *expiring; expiring = autofs4_lookup_expiring(dentry, rcu_walk); if (IS_ERR(expiring)) return PTR_ERR(expiring); if (!expiring) - return autofs4_expire_wait(dentry, rcu_walk); + return autofs4_expire_wait(path, rcu_walk); else { + const struct path this = { .mnt = path->mnt, .dentry = expiring }; /* * If we are racing with expire the request might not * be quite complete, but the directory has been removed * so it must have been successful, just wait for it. */ - autofs4_expire_wait(expiring, 0); + autofs4_expire_wait(&this, 0); autofs4_del_expiring(expiring); dput(expiring); } @@ -354,7 +356,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) * and the directory was removed, so just go ahead and try * the mount. */ - status = do_expire_wait(dentry, 0); + status = do_expire_wait(path, 0); if (status && status != -EAGAIN) return NULL; @@ -362,7 +364,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) spin_lock(&sbi->fs_lock); if (ino->flags & AUTOFS_INF_PENDING) { spin_unlock(&sbi->fs_lock); - status = autofs4_mount_wait(dentry, 0); + status = autofs4_mount_wait(path, 0); if (status) return ERR_PTR(status); goto done; @@ -370,28 +372,28 @@ static struct vfsmount *autofs4_d_automount(struct path *path) /* * If the dentry is a symlink it's equivalent to a directory - * having d_mountpoint() true, so there's no need to call back - * to the daemon. + * having path_is_mountpoint() true, so there's no need to call + * back to the daemon. */ if (d_really_is_positive(dentry) && d_is_symlink(dentry)) { spin_unlock(&sbi->fs_lock); goto done; } - if (!d_mountpoint(dentry)) { + if (!path_is_mountpoint(path)) { /* * It's possible that user space hasn't removed directories * after umounting a rootless multi-mount, although it - * should. For v5 have_submounts() is sufficient to handle - * this because the leaves of the directory tree under the - * mount never trigger mounts themselves (they have an autofs - * trigger mount mounted on them). But v4 pseudo direct mounts - * do need the leaves to trigger mounts. In this case we - * have no choice but to use the list_empty() check and + * should. 
For v5 path_has_submounts() is sufficient to + * handle this because the leaves of the directory tree under + * the mount never trigger mounts themselves (they have an + * autofs trigger mount mounted on them). But v4 pseudo direct + * mounts do need the leaves to trigger mounts. In this case + * we have no choice but to use the list_empty() check and * require user space behave. */ if (sbi->version > 4) { - if (have_submounts(dentry)) { + if (path_has_submounts(path)) { spin_unlock(&sbi->fs_lock); goto done; } @@ -403,7 +405,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) } ino->flags |= AUTOFS_INF_PENDING; spin_unlock(&sbi->fs_lock); - status = autofs4_mount_wait(dentry, 0); + status = autofs4_mount_wait(path, 0); spin_lock(&sbi->fs_lock); ino->flags &= ~AUTOFS_INF_PENDING; if (status) { @@ -421,8 +423,9 @@ done: return NULL; } -static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) +static int autofs4_d_manage(const struct path *path, bool rcu_walk) { + struct dentry *dentry = path->dentry; struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); int status; @@ -431,20 +434,20 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) /* The daemon never waits. */ if (autofs4_oz_mode(sbi)) { - if (!d_mountpoint(dentry)) + if (!path_is_mountpoint(path)) return -EISDIR; return 0; } /* Wait for pending expires */ - if (do_expire_wait(dentry, rcu_walk) == -ECHILD) + if (do_expire_wait(path, rcu_walk) == -ECHILD) return -ECHILD; /* * This dentry may be under construction so wait on mount * completion. */ - status = autofs4_mount_wait(dentry, rcu_walk); + status = autofs4_mount_wait(path, rcu_walk); if (status) return status; @@ -460,7 +463,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) if (ino->flags & AUTOFS_INF_WANT_EXPIRE) return 0; - if (d_mountpoint(dentry)) + if (path_is_mountpoint(path)) return 0; inode = d_inode_rcu(dentry); if (inode && S_ISLNK(inode->i_mode)) @@ -487,7 +490,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) * we can avoid needless calls ->d_automount() and avoid * an incorrect ELOOP error return. 
*/ - if ((!d_mountpoint(dentry) && !simple_empty(dentry)) || + if ((!path_is_mountpoint(path) && !simple_empty(dentry)) || (d_really_is_positive(dentry) && d_is_symlink(dentry))) status = -EISDIR; } diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c index 99aab00dc217..ab0b4285a202 100644 --- a/fs/autofs4/symlink.c +++ b/fs/autofs4/symlink.c @@ -25,6 +25,5 @@ static const char *autofs4_get_link(struct dentry *dentry, } const struct inode_operations autofs4_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = autofs4_get_link }; diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index e44271dfceb6..1278335ce366 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -250,8 +250,9 @@ autofs4_find_wait(struct autofs_sb_info *sbi, const struct qstr *qstr) static int validate_request(struct autofs_wait_queue **wait, struct autofs_sb_info *sbi, const struct qstr *qstr, - struct dentry *dentry, enum autofs_notify notify) + const struct path *path, enum autofs_notify notify) { + struct dentry *dentry = path->dentry; struct autofs_wait_queue *wq; struct autofs_info *ino; @@ -314,6 +315,7 @@ static int validate_request(struct autofs_wait_queue **wait, */ if (notify == NFY_MOUNT) { struct dentry *new = NULL; + struct path this; int valid = 1; /* @@ -333,7 +335,9 @@ static int validate_request(struct autofs_wait_queue **wait, dentry = new; } } - if (have_submounts(dentry)) + this.mnt = path->mnt; + this.dentry = dentry; + if (path_has_submounts(&this)) valid = 0; if (new) @@ -345,8 +349,9 @@ static int validate_request(struct autofs_wait_queue **wait, } int autofs4_wait(struct autofs_sb_info *sbi, - struct dentry *dentry, enum autofs_notify notify) + const struct path *path, enum autofs_notify notify) { + struct dentry *dentry = path->dentry; struct autofs_wait_queue *wq; struct qstr qstr; char *name; @@ -405,7 +410,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, return -EINTR; } - ret = validate_request(&wq, sbi, &qstr, dentry, notify); + ret = validate_request(&wq, sbi, &qstr, path, notify); if (ret <= 0) { if (ret != -EINTR) mutex_unlock(&sbi->wq_mutex); diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 8712062275b8..5f685c819298 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -106,6 +106,50 @@ static ssize_t bad_inode_listxattr(struct dentry *dentry, char *buffer, return -EIO; } +static const char *bad_inode_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + return ERR_PTR(-EIO); +} + +static struct posix_acl *bad_inode_get_acl(struct inode *inode, int type) +{ + return ERR_PTR(-EIO); +} + +static int bad_inode_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, u64 start, + u64 len) +{ + return -EIO; +} + +static int bad_inode_update_time(struct inode *inode, struct timespec *time, + int flags) +{ + return -EIO; +} + +static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry, + struct file *file, unsigned int open_flag, + umode_t create_mode, int *opened) +{ + return -EIO; +} + +static int bad_inode_tmpfile(struct inode *inode, struct dentry *dentry, + umode_t mode) +{ + return -EIO; +} + +static int bad_inode_set_acl(struct inode *inode, struct posix_acl *acl, + int type) +{ + return -EIO; +} + static const struct inode_operations bad_inode_ops = { .create = bad_inode_create, @@ -118,14 +162,17 @@ static const struct inode_operations bad_inode_ops = .mknod = bad_inode_mknod, .rename = bad_inode_rename2, .readlink = bad_inode_readlink, - /* follow_link must be no-op, otherwise unmounting 
this inode - won't work */ - /* put_link returns void */ - /* truncate returns void */ .permission = bad_inode_permission, .getattr = bad_inode_getattr, .setattr = bad_inode_setattr, .listxattr = bad_inode_listxattr, + .get_link = bad_inode_get_link, + .get_acl = bad_inode_get_acl, + .fiemap = bad_inode_fiemap, + .update_time = bad_inode_update_time, + .atomic_open = bad_inode_atomic_open, + .tmpfile = bad_inode_tmpfile, + .set_acl = bad_inode_set_acl, }; diff --git a/fs/befs/befs.h b/fs/befs/befs.h index c6bad51d8ec7..b914cfb03820 100644 --- a/fs/befs/befs.h +++ b/fs/befs/befs.h @@ -129,6 +129,7 @@ static inline befs_inode_addr blockno2iaddr(struct super_block *sb, befs_blocknr_t blockno) { befs_inode_addr iaddr; + iaddr.allocation_group = blockno >> BEFS_SB(sb)->ag_shift; iaddr.start = blockno - (iaddr.allocation_group << BEFS_SB(sb)->ag_shift); @@ -140,7 +141,7 @@ blockno2iaddr(struct super_block *sb, befs_blocknr_t blockno) static inline unsigned int befs_iaddrs_per_block(struct super_block *sb) { - return BEFS_SB(sb)->block_size / sizeof (befs_disk_inode_addr); + return BEFS_SB(sb)->block_size / sizeof(befs_disk_inode_addr); } #include "endian.h" diff --git a/fs/befs/befs_fs_types.h b/fs/befs/befs_fs_types.h index eb557d9dc8be..69c9d8cde955 100644 --- a/fs/befs/befs_fs_types.h +++ b/fs/befs/befs_fs_types.h @@ -55,12 +55,12 @@ enum super_flags { }; #define BEFS_BYTEORDER_NATIVE 0x42494745 -#define BEFS_BYTEORDER_NATIVE_LE (__force fs32)cpu_to_le32(BEFS_BYTEORDER_NATIVE) -#define BEFS_BYTEORDER_NATIVE_BE (__force fs32)cpu_to_be32(BEFS_BYTEORDER_NATIVE) +#define BEFS_BYTEORDER_NATIVE_LE ((__force fs32)cpu_to_le32(BEFS_BYTEORDER_NATIVE)) +#define BEFS_BYTEORDER_NATIVE_BE ((__force fs32)cpu_to_be32(BEFS_BYTEORDER_NATIVE)) #define BEFS_SUPER_MAGIC BEFS_SUPER_MAGIC1 -#define BEFS_SUPER_MAGIC1_LE (__force fs32)cpu_to_le32(BEFS_SUPER_MAGIC1) -#define BEFS_SUPER_MAGIC1_BE (__force fs32)cpu_to_be32(BEFS_SUPER_MAGIC1) +#define BEFS_SUPER_MAGIC1_LE ((__force fs32)cpu_to_le32(BEFS_SUPER_MAGIC1)) +#define BEFS_SUPER_MAGIC1_BE ((__force fs32)cpu_to_be32(BEFS_SUPER_MAGIC1)) /* * Flags of inode @@ -79,7 +79,7 @@ enum inode_flags { BEFS_INODE_WAS_WRITTEN = 0x00020000, BEFS_NO_TRANSACTION = 0x00040000, }; -/* +/* * On-Disk datastructures of BeFS */ @@ -139,7 +139,7 @@ typedef struct { } PACKED befs_super_block; -/* +/* * Note: the indirect and dbl_indir block_runs may * be longer than one block! */ diff --git a/fs/befs/btree.c b/fs/befs/btree.c index 7e135ea73fdd..d509887c580c 100644 --- a/fs/befs/btree.c +++ b/fs/befs/btree.c @@ -12,8 +12,8 @@ * * Dominic Giampaolo, author of "Practical File System * Design with the Be File System", for such a helpful book. - * - * Marcus J. Ranum, author of the b+tree package in + * + * Marcus J. Ranum, author of the b+tree package in * comp.sources.misc volume 10. This code is not copied from that * work, but it is partially based on it. * @@ -38,38 +38,38 @@ */ /* Befs B+tree structure: - * + * * The first thing in the tree is the tree superblock. It tells you * all kinds of useful things about the tree, like where the rootnode * is located, and the size of the nodes (always 1024 with current version * of BeOS). * * The rest of the tree consists of a series of nodes. Nodes contain a header - * (struct befs_btree_nodehead), the packed key data, an array of shorts + * (struct befs_btree_nodehead), the packed key data, an array of shorts * containing the ending offsets for each of the keys, and an array of - * befs_off_t values. 
In interior nodes, the keys are the ending keys for - * the childnode they point to, and the values are offsets into the - * datastream containing the tree. + * befs_off_t values. In interior nodes, the keys are the ending keys for + * the childnode they point to, and the values are offsets into the + * datastream containing the tree. */ /* Note: - * - * The book states 2 confusing things about befs b+trees. First, + * + * The book states 2 confusing things about befs b+trees. First, * it states that the overflow field of node headers is used by internal nodes * to point to another node that "effectively continues this one". Here is what * I believe that means. Each key in internal nodes points to another node that - * contains key values less than itself. Inspection reveals that the last key - * in the internal node is not the last key in the index. Keys that are - * greater than the last key in the internal node go into the overflow node. + * contains key values less than itself. Inspection reveals that the last key + * in the internal node is not the last key in the index. Keys that are + * greater than the last key in the internal node go into the overflow node. * I imagine there is a performance reason for this. * - * Second, it states that the header of a btree node is sufficient to - * distinguish internal nodes from leaf nodes. Without saying exactly how. + * Second, it states that the header of a btree node is sufficient to + * distinguish internal nodes from leaf nodes. Without saying exactly how. * After figuring out the first, it becomes obvious that internal nodes have * overflow nodes and leafnodes do not. */ -/* +/* * Currently, this code is only good for directory B+trees. * In order to be used for other BFS indexes, it needs to be extended to handle * duplicate keys and non-string keytypes (int32, int64, float, double). @@ -237,8 +237,8 @@ befs_bt_read_node(struct super_block *sb, const befs_data_stream *ds, * with @key (usually the disk block number of an inode). * * On failure, returns BEFS_ERR or BEFS_BT_NOT_FOUND. - * - * Algorithm: + * + * Algorithm: * Read the superblock and rootnode of the b+tree. * Drill down through the interior nodes using befs_find_key(). * Once at the correct leaf node, use befs_find_key() again to get the @@ -402,12 +402,12 @@ befs_find_key(struct super_block *sb, struct befs_btree_node *node, * * Here's how it works: Key_no is the index of the key/value pair to * return in keybuf/value. - * Bufsize is the size of keybuf (BEFS_NAME_LEN+1 is a good size). Keysize is + * Bufsize is the size of keybuf (BEFS_NAME_LEN+1 is a good size). Keysize is * the number of characters in the key (just a convenience). * * Algorithm: * Get the first leafnode of the tree. See if the requested key is in that - * node. If not, follow the node->right link to the next leafnode. Repeat + * node. If not, follow the node->right link to the next leafnode. Repeat * until the (key_no)th key is found or the tree is out of keys. */ int @@ -536,7 +536,7 @@ befs_btree_read(struct super_block *sb, const befs_data_stream *ds, * @node_off: Pointer to offset of current node within datastream. Modified * by the function. * - * Helper function for btree traverse. Moves the current position to the + * Helper function for btree traverse. Moves the current position to the * start of the first leaf node. * * Also checks for an empty tree. If there are no keys, returns BEFS_BT_EMPTY. 
@@ -592,10 +592,10 @@ befs_btree_seekleaf(struct super_block *sb, const befs_data_stream *ds, } /** - * befs_leafnode - Determine if the btree node is a leaf node or an + * befs_leafnode - Determine if the btree node is a leaf node or an * interior node * @node: Pointer to node structure to test - * + * * Return 1 if leaf, 0 if interior */ static int @@ -656,7 +656,7 @@ befs_bt_valarray(struct befs_btree_node *node) * @node: Pointer to the node structure to find the keydata array within * * Returns a pointer to the start of the keydata array - * of the node pointed to by the node header + * of the node pointed to by the node header */ static char * befs_bt_keydata(struct befs_btree_node *node) @@ -702,7 +702,7 @@ befs_bt_get_key(struct super_block *sb, struct befs_btree_node *node, /** * befs_compare_strings - compare two strings - * @key1: pointer to the first key to be compared + * @key1: pointer to the first key to be compared * @keylen1: length in bytes of key1 * @key2: pointer to the second key to be compared * @keylen2: length in bytes of key2 diff --git a/fs/befs/btree.h b/fs/befs/btree.h index f2a8f637e9e0..60c6c728e64e 100644 --- a/fs/befs/btree.h +++ b/fs/befs/btree.h @@ -1,13 +1,11 @@ /* * btree.h - * + * */ - int befs_btree_find(struct super_block *sb, const befs_data_stream *ds, - const char *key, befs_off_t * value); + const char *key, befs_off_t *value); int befs_btree_read(struct super_block *sb, const befs_data_stream *ds, loff_t key_no, size_t bufsize, char *keybuf, - size_t * keysize, befs_off_t * value); - + size_t *keysize, befs_off_t *value); diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c index b4c7ba013c0d..720b3bc5c16a 100644 --- a/fs/befs/datastream.c +++ b/fs/befs/datastream.c @@ -84,13 +84,11 @@ befs_read_datastream(struct super_block *sb, const befs_data_stream *ds, * * Takes a file position and gives back a brun who's starting block * is block number fblock of the file. - * + * * Returns BEFS_OK or BEFS_ERR. - * + * * Calls specialized functions for each of the three possible * datastream regions. - * - * 2001-11-15 Will Dyson */ int befs_fblock2brun(struct super_block *sb, const befs_data_stream *data, @@ -120,7 +118,7 @@ befs_fblock2brun(struct super_block *sb, const befs_data_stream *data, /** * befs_read_lsmylink - read long symlink from datastream. 
- * @sb: Filesystem superblock + * @sb: Filesystem superblock * @ds: Datastream to read from * @buff: Buffer in which to place long symlink data * @len: Length of the long symlink in bytes diff --git a/fs/befs/datastream.h b/fs/befs/datastream.h index 91ba8203d83f..7ff9ff09ec6e 100644 --- a/fs/befs/datastream.h +++ b/fs/befs/datastream.h @@ -5,10 +5,10 @@ struct buffer_head *befs_read_datastream(struct super_block *sb, const befs_data_stream *ds, - befs_off_t pos, uint * off); + befs_off_t pos, uint *off); int befs_fblock2brun(struct super_block *sb, const befs_data_stream *data, - befs_blocknr_t fblock, befs_block_run * run); + befs_blocknr_t fblock, befs_block_run *run); size_t befs_read_lsymlink(struct super_block *sb, const befs_data_stream *data, void *buff, befs_off_t len); @@ -17,4 +17,3 @@ befs_blocknr_t befs_count_blocks(struct super_block *sb, const befs_data_stream *ds); extern const befs_inode_addr BAD_IADDR; - diff --git a/fs/befs/debug.c b/fs/befs/debug.c index 85c13392e9e8..36656c86f50e 100644 --- a/fs/befs/debug.c +++ b/fs/befs/debug.c @@ -1,6 +1,6 @@ /* * linux/fs/befs/debug.c - * + * * Copyright (C) 2001 Will Dyson (will_dyson at pobox.com) * * With help from the ntfs-tng driver by Anton Altparmakov @@ -57,6 +57,7 @@ befs_debug(const struct super_block *sb, const char *fmt, ...) struct va_format vaf; va_list args; + va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; @@ -67,7 +68,7 @@ befs_debug(const struct super_block *sb, const char *fmt, ...) } void -befs_dump_inode(const struct super_block *sb, befs_inode * inode) +befs_dump_inode(const struct super_block *sb, befs_inode *inode) { #ifdef CONFIG_BEFS_DEBUG @@ -151,7 +152,7 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode) */ void -befs_dump_super_block(const struct super_block *sb, befs_super_block * sup) +befs_dump_super_block(const struct super_block *sb, befs_super_block *sup) { #ifdef CONFIG_BEFS_DEBUG @@ -202,7 +203,7 @@ befs_dump_super_block(const struct super_block *sb, befs_super_block * sup) #if 0 /* unused */ void -befs_dump_small_data(const struct super_block *sb, befs_small_data * sd) +befs_dump_small_data(const struct super_block *sb, befs_small_data *sd) { } @@ -221,7 +222,8 @@ befs_dump_run(const struct super_block *sb, befs_disk_block_run run) #endif /* 0 */ void -befs_dump_index_entry(const struct super_block *sb, befs_disk_btree_super * super) +befs_dump_index_entry(const struct super_block *sb, + befs_disk_btree_super *super) { #ifdef CONFIG_BEFS_DEBUG @@ -242,7 +244,7 @@ befs_dump_index_entry(const struct super_block *sb, befs_disk_btree_super * supe } void -befs_dump_index_node(const struct super_block *sb, befs_btree_nodehead * node) +befs_dump_index_node(const struct super_block *sb, befs_btree_nodehead *node) { #ifdef CONFIG_BEFS_DEBUG diff --git a/fs/befs/inode.c b/fs/befs/inode.c index fa4b718de597..5367a6470a69 100644 --- a/fs/befs/inode.c +++ b/fs/befs/inode.c @@ -1,6 +1,6 @@ /* * inode.c - * + * * Copyright (C) 2001 Will Dyson <will_dyson@pobox.com> */ @@ -10,12 +10,12 @@ #include "inode.h" /* - Validates the correctness of the befs inode - Returns BEFS_OK if the inode should be used, otherwise - returns BEFS_BAD_INODE -*/ + * Validates the correctness of the befs inode + * Returns BEFS_OK if the inode should be used, otherwise + * returns BEFS_BAD_INODE + */ int -befs_check_inode(struct super_block *sb, befs_inode * raw_inode, +befs_check_inode(struct super_block *sb, befs_inode *raw_inode, befs_blocknr_t inode) { u32 magic1 = fs32_to_cpu(sb, raw_inode->magic1); diff 
--git a/fs/befs/inode.h b/fs/befs/inode.h index 9dc7fd9b7570..2219e412f49b 100644 --- a/fs/befs/inode.h +++ b/fs/befs/inode.h @@ -1,8 +1,7 @@ /* * inode.h - * + * */ -int befs_check_inode(struct super_block *sb, befs_inode * raw_inode, +int befs_check_inode(struct super_block *sb, befs_inode *raw_inode, befs_blocknr_t inode); - diff --git a/fs/befs/io.c b/fs/befs/io.c index b4a558126ee1..227cb86e07fe 100644 --- a/fs/befs/io.c +++ b/fs/befs/io.c @@ -3,7 +3,7 @@ * * Copyright (C) 2001 Will Dyson <will_dyson@pobox.com * - * Based on portions of file.c and inode.c + * Based on portions of file.c and inode.c * by Makoto Kato (m_kato@ga2.so-net.ne.jp) * * Many thanks to Dominic Giampaolo, author of Practical File System @@ -19,8 +19,7 @@ /* * Converts befs notion of disk addr to a disk offset and uses * linux kernel function sb_bread() to get the buffer containing - * the offset. -Will Dyson - * + * the offset. */ struct buffer_head * @@ -55,7 +54,7 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr) befs_debug(sb, "<--- %s", __func__); return bh; - error: +error: befs_debug(sb, "<--- %s ERROR", __func__); return NULL; } diff --git a/fs/befs/io.h b/fs/befs/io.h index 78d7bc6e60de..9b3e1967cb31 100644 --- a/fs/befs/io.h +++ b/fs/befs/io.h @@ -4,4 +4,3 @@ struct buffer_head *befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr); - diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 647a276eba56..19407165f4aa 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -18,6 +18,7 @@ #include <linux/parser.h> #include <linux/namei.h> #include <linux/sched.h> +#include <linux/exportfs.h> #include "befs.h" #include "btree.h" @@ -37,7 +38,8 @@ static int befs_readdir(struct file *, struct dir_context *); static int befs_get_block(struct inode *, sector_t, struct buffer_head *, int); static int befs_readpage(struct file *file, struct page *page); static sector_t befs_bmap(struct address_space *mapping, sector_t block); -static struct dentry *befs_lookup(struct inode *, struct dentry *, unsigned int); +static struct dentry *befs_lookup(struct inode *, struct dentry *, + unsigned int); static struct inode *befs_iget(struct super_block *, unsigned long); static struct inode *befs_alloc_inode(struct super_block *sb); static void befs_destroy_inode(struct inode *inode); @@ -51,6 +53,10 @@ static void befs_put_super(struct super_block *); static int befs_remount(struct super_block *, int *, char *); static int befs_statfs(struct dentry *, struct kstatfs *); static int parse_options(char *, struct befs_mount_options *); +static struct dentry *befs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type); +static struct dentry *befs_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type); static const struct super_operations befs_sops = { .alloc_inode = befs_alloc_inode, /* allocate a new inode */ @@ -83,9 +89,14 @@ static const struct address_space_operations befs_symlink_aops = { .readpage = befs_symlink_readpage, }; -/* +static const struct export_operations befs_export_operations = { + .fh_to_dentry = befs_fh_to_dentry, + .fh_to_parent = befs_fh_to_parent, +}; + +/* * Called by generic_file_read() to read a page of data - * + * * In turn, simply calls a generic block read function and * passes it the address of befs_get_block, for mapping file * positions to disk blocks. 
@@ -102,15 +113,13 @@ befs_bmap(struct address_space *mapping, sector_t block) return generic_block_bmap(mapping, block, befs_get_block); } -/* - * Generic function to map a file position (block) to a +/* + * Generic function to map a file position (block) to a * disk offset (passed back in bh_result). * * Used by many higher level functions. * * Calls befs_fblock2brun() in datastream.c to do the real work. - * - * -WD 10-26-01 */ static int @@ -269,15 +278,15 @@ befs_alloc_inode(struct super_block *sb) struct befs_inode_info *bi; bi = kmem_cache_alloc(befs_inode_cachep, GFP_KERNEL); - if (!bi) - return NULL; - return &bi->vfs_inode; + if (!bi) + return NULL; + return &bi->vfs_inode; } static void befs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - kmem_cache_free(befs_inode_cachep, BEFS_I(inode)); + kmem_cache_free(befs_inode_cachep, BEFS_I(inode)); } static void befs_destroy_inode(struct inode *inode) @@ -287,7 +296,7 @@ static void befs_destroy_inode(struct inode *inode) static void init_once(void *foo) { - struct befs_inode_info *bi = (struct befs_inode_info *) foo; + struct befs_inode_info *bi = (struct befs_inode_info *) foo; inode_init_once(&bi->vfs_inode); } @@ -338,7 +347,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) /* * set uid and gid. But since current BeOS is single user OS, so * you can change by "uid" or "gid" options. - */ + */ inode->i_uid = befs_sb->mount_opts.use_uid ? befs_sb->mount_opts.uid : @@ -353,14 +362,14 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) * BEFS's time is 64 bits, but current VFS is 32 bits... * BEFS don't have access time. Nor inode change time. VFS * doesn't have creation time. - * Also, the lower 16 bits of the last_modified_time and + * Also, the lower 16 bits of the last_modified_time and * create_time are just a counter to help ensure uniqueness * for indexing purposes. (PFD, page 54) */ inode->i_mtime.tv_sec = fs64_to_cpu(sb, raw_inode->last_modified_time) >> 16; - inode->i_mtime.tv_nsec = 0; /* lower 16 bits are not a time */ + inode->i_mtime.tv_nsec = 0; /* lower 16 bits are not a time */ inode->i_ctime = inode->i_mtime; inode->i_atime = inode->i_mtime; @@ -414,10 +423,10 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) unlock_new_inode(inode); return inode; - unacquire_bh: +unacquire_bh: brelse(bh); - unacquire_none: +unacquire_none: iget_failed(inode); befs_debug(sb, "<--- %s - Bad inode", __func__); return ERR_PTR(-EIO); @@ -442,7 +451,7 @@ befs_init_inodecache(void) } /* Called at fs teardown. - * + * * Taken from NFS implementation by Al Viro. */ static void @@ -491,13 +500,10 @@ fail: } /* - * UTF-8 to NLS charset convert routine - * + * UTF-8 to NLS charset convert routine * - * Changed 8/10/01 by Will Dyson. 
Now use uni2char() / char2uni() rather than - * the nls tables directly + * Uses uni2char() / char2uni() rather than the nls tables directly */ - static int befs_utf2nls(struct super_block *sb, const char *in, int in_len, char **out, int *out_len) @@ -521,9 +527,8 @@ befs_utf2nls(struct super_block *sb, const char *in, } *out = result = kmalloc(maxlen, GFP_NOFS); - if (!*out) { + if (!*out) return -ENOMEM; - } for (i = o = 0; i < in_len; i += utflen, o += unilen) { @@ -546,7 +551,7 @@ befs_utf2nls(struct super_block *sb, const char *in, return o; - conv_err: +conv_err: befs_error(sb, "Name using character set %s contains a character that " "cannot be converted to unicode.", nls->charset); befs_debug(sb, "<--- %s", __func__); @@ -561,18 +566,18 @@ befs_utf2nls(struct super_block *sb, const char *in, * @in_len: Length of input string in bytes * @out: The output string in UTF-8 format * @out_len: Length of the output buffer - * + * * Converts input string @in, which is in the format of the loaded NLS map, * into a utf8 string. - * + * * The destination string @out is allocated by this function and the caller is * responsible for freeing it with kfree() - * + * * On return, *@out_len is the length of @out in bytes. * * On success, the return value is the number of utf8 characters written to * the output buffer @out. - * + * * On Failure, a negative number coresponding to the error code is returned. */ @@ -585,9 +590,11 @@ befs_nls2utf(struct super_block *sb, const char *in, wchar_t uni; int unilen, utflen; char *result; - /* There're nls characters that will translate to 3-chars-wide UTF-8 - * characters, a additional byte is needed to save the final \0 - * in special cases */ + /* + * There are nls characters that will translate to 3-chars-wide UTF-8 + * characters, an additional byte is needed to save the final \0 + * in special cases + */ int maxlen = (3 * in_len) + 1; befs_debug(sb, "---> %s\n", __func__); @@ -624,14 +631,41 @@ befs_nls2utf(struct super_block *sb, const char *in, return i; - conv_err: - befs_error(sb, "Name using charecter set %s contains a charecter that " +conv_err: + befs_error(sb, "Name using character set %s contains a character that " "cannot be converted to unicode.", nls->charset); befs_debug(sb, "<--- %s", __func__); kfree(result); return -EILSEQ; } +static struct inode *befs_nfs_get_inode(struct super_block *sb, uint64_t ino, + uint32_t generation) +{ + /* No need to handle i_generation */ + return befs_iget(sb, ino); +} + +/* + * Map a NFS file handle to a corresponding dentry + */ +static struct dentry *befs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + befs_nfs_get_inode); +} + +/* + * Find the parent for a file specified by NFS handle + */ +static struct dentry *befs_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + befs_nfs_get_inode); +} + enum { Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err, }; @@ -666,6 +700,7 @@ parse_options(char *options, struct befs_mount_options *opts) while ((p = strsep(&options, ",")) != NULL) { int token; + if (!*p) continue; @@ -721,7 +756,7 @@ parse_options(char *options, struct befs_mount_options *opts) } /* This function has the responsibiltiy of getting the - * filesystem ready for unmounting. + * filesystem ready for unmounting. 
* Basically, we free everything that we allocated in * befs_read_inode */ @@ -782,8 +817,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) * Linux 2.4.10 and later refuse to read blocks smaller than * the logical block size for the device. But we also need to read at * least 1k to get the second 512 bytes of the volume. - * -WD 10-26-01 - */ + */ blocksize = sb_min_blocksize(sb, 1024); if (!blocksize) { if (!silent) @@ -791,7 +825,8 @@ befs_fill_super(struct super_block *sb, void *data, int silent) goto unacquire_priv_sbp; } - if (!(bh = sb_bread(sb, sb_block))) { + bh = sb_bread(sb, sb_block); + if (!bh) { if (!silent) befs_error(sb, "unable to read superblock"); goto unacquire_priv_sbp; @@ -816,7 +851,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) brelse(bh); - if( befs_sb->num_blocks > ~((sector_t)0) ) { + if (befs_sb->num_blocks > ~((sector_t)0)) { if (!silent) befs_error(sb, "blocks count: %llu is larger than the host can use", befs_sb->num_blocks); @@ -831,6 +866,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) /* Set real blocksize of fs */ sb_set_blocksize(sb, (ulong) befs_sb->block_size); sb->s_op = &befs_sops; + sb->s_export_op = &befs_export_operations; root = befs_iget(sb, iaddr2blockno(sb, &(befs_sb->root_dir))); if (IS_ERR(root)) { ret = PTR_ERR(root); @@ -861,16 +897,16 @@ befs_fill_super(struct super_block *sb, void *data, int silent) } return 0; -/*****************/ - unacquire_bh: + +unacquire_bh: brelse(bh); - unacquire_priv_sbp: +unacquire_priv_sbp: kfree(befs_sb->mount_opts.iocharset); kfree(sb->s_fs_info); sb->s_fs_info = NULL; - unacquire_none: +unacquire_none: return ret; } @@ -919,7 +955,7 @@ static struct file_system_type befs_fs_type = { .name = "befs", .mount = befs_mount, .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, + .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("befs"); @@ -956,9 +992,9 @@ exit_befs_fs(void) } /* -Macros that typecheck the init and exit functions, -ensures that they are called at init and cleanup, -and eliminates warnings about unused functions. -*/ + * Macros that typecheck the init and exit functions, + * ensures that they are called at init and cleanup, + * and eliminates warnings about unused functions. 
+ */ module_init(init_befs_fs) module_exit(exit_befs_fs) diff --git a/fs/befs/super.h b/fs/befs/super.h index dc4556376a22..ec1df30a7e9a 100644 --- a/fs/befs/super.h +++ b/fs/befs/super.h @@ -2,7 +2,5 @@ * super.h */ -int befs_load_sb(struct super_block *sb, befs_super_block * disk_sb); - +int befs_load_sb(struct super_block *sb, befs_super_block *disk_sb); int befs_check_sb(struct super_block *sb); - diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 1e5c896f6b79..f2deec0a62f0 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -16,7 +16,7 @@ #include <linux/vfs.h> #include <linux/writeback.h> #include <linux/uio.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "bfs.h" MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>"); diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index ae1b5404fced..2a59139f520b 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -26,7 +26,7 @@ #include <linux/coredump.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/cacheflush.h> #include <asm/a.out-core.h> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 2472af2798c7..422370293cfd 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -36,7 +36,7 @@ #include <linux/coredump.h> #include <linux/sched.h> #include <linux/dax.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/param.h> #include <asm/page.h> @@ -2204,7 +2204,9 @@ static int elf_core_dump(struct coredump_params *cprm) dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - vma_filesz = kmalloc_array(segs - 1, sizeof(*vma_filesz), GFP_KERNEL); + if (segs - 1 > ULONG_MAX / sizeof(*vma_filesz)) + goto end_coredump; + vma_filesz = vmalloc((segs - 1) * sizeof(*vma_filesz)); if (!vma_filesz) goto end_coredump; @@ -2296,6 +2298,7 @@ static int elf_core_dump(struct coredump_params *cprm) goto end_coredump; } } + dump_truncate(cprm); if (!elf_core_write_extra_data(cprm)) goto end_coredump; @@ -2311,7 +2314,7 @@ end_coredump: cleanup: free_note_info(&info); kfree(shdr4extnum); - kfree(vma_filesz); + vfree(vma_filesz); kfree(phdr4note); kfree(elf); out: diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 464a972e88c1..d2e36f82c35d 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -37,7 +37,7 @@ #include <linux/coredump.h> #include <linux/dax.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/param.h> #include <asm/pgalloc.h> diff --git a/fs/block_dev.c b/fs/block_dev.c index 05b553368bb4..3c47614a4b32 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -30,8 +30,9 @@ #include <linux/cleancache.h> #include <linux/dax.h> #include <linux/badblocks.h> +#include <linux/task_io_accounting_ops.h> #include <linux/falloc.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" struct bdev_inode { @@ -175,17 +176,273 @@ static struct inode *bdev_file_inode(struct file *file) return file->f_mapping->host; } +static unsigned int dio_bio_write_op(struct kiocb *iocb) +{ + unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; + + /* avoid the need for a I/O completion work item */ + if (iocb->ki_flags & IOCB_DSYNC) + op |= REQ_FUA; + return op; +} + +#define DIO_INLINE_BIO_VECS 4 + +static void blkdev_bio_end_io_simple(struct bio *bio) +{ + struct task_struct *waiter = bio->bi_private; + + WRITE_ONCE(bio->bi_private, NULL); + wake_up_process(waiter); +} + static ssize_t -blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +__blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, + int 
nr_pages) +{ + struct file *file = iocb->ki_filp; + struct block_device *bdev = I_BDEV(bdev_file_inode(file)); + struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs, *bvec; + loff_t pos = iocb->ki_pos; + bool should_dirty = false; + struct bio bio; + ssize_t ret; + blk_qc_t qc; + int i; + + if ((pos | iov_iter_alignment(iter)) & + (bdev_logical_block_size(bdev) - 1)) + return -EINVAL; + + if (nr_pages <= DIO_INLINE_BIO_VECS) + vecs = inline_vecs; + else { + vecs = kmalloc(nr_pages * sizeof(struct bio_vec), GFP_KERNEL); + if (!vecs) + return -ENOMEM; + } + + bio_init(&bio, vecs, nr_pages); + bio.bi_bdev = bdev; + bio.bi_iter.bi_sector = pos >> 9; + bio.bi_private = current; + bio.bi_end_io = blkdev_bio_end_io_simple; + + ret = bio_iov_iter_get_pages(&bio, iter); + if (unlikely(ret)) + return ret; + ret = bio.bi_iter.bi_size; + + if (iov_iter_rw(iter) == READ) { + bio.bi_opf = REQ_OP_READ; + if (iter_is_iovec(iter)) + should_dirty = true; + } else { + bio.bi_opf = dio_bio_write_op(iocb); + task_io_account_write(ret); + } + + qc = submit_bio(&bio); + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(bio.bi_private)) + break; + if (!(iocb->ki_flags & IOCB_HIPRI) || + !blk_mq_poll(bdev_get_queue(bdev), qc)) + io_schedule(); + } + __set_current_state(TASK_RUNNING); + + bio_for_each_segment_all(bvec, &bio, i) { + if (should_dirty && !PageCompound(bvec->bv_page)) + set_page_dirty_lock(bvec->bv_page); + put_page(bvec->bv_page); + } + + if (vecs != inline_vecs) + kfree(vecs); + + if (unlikely(bio.bi_error)) + return bio.bi_error; + return ret; +} + +struct blkdev_dio { + union { + struct kiocb *iocb; + struct task_struct *waiter; + }; + size_t size; + atomic_t ref; + bool multi_bio : 1; + bool should_dirty : 1; + bool is_sync : 1; + struct bio bio; +}; + +static struct bio_set *blkdev_dio_pool __read_mostly; + +static void blkdev_bio_end_io(struct bio *bio) +{ + struct blkdev_dio *dio = bio->bi_private; + bool should_dirty = dio->should_dirty; + + if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) { + if (bio->bi_error && !dio->bio.bi_error) + dio->bio.bi_error = bio->bi_error; + } else { + if (!dio->is_sync) { + struct kiocb *iocb = dio->iocb; + ssize_t ret = dio->bio.bi_error; + + if (likely(!ret)) { + ret = dio->size; + iocb->ki_pos += ret; + } + + dio->iocb->ki_complete(iocb, ret, 0); + bio_put(&dio->bio); + } else { + struct task_struct *waiter = dio->waiter; + + WRITE_ONCE(dio->waiter, NULL); + wake_up_process(waiter); + } + } + + if (should_dirty) { + bio_check_pages_dirty(bio); + } else { + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) + put_page(bvec->bv_page); + bio_put(bio); + } +} + +static ssize_t +__blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) { struct file *file = iocb->ki_filp; struct inode *inode = bdev_file_inode(file); + struct block_device *bdev = I_BDEV(inode); + struct blk_plug plug; + struct blkdev_dio *dio; + struct bio *bio; + bool is_read = (iov_iter_rw(iter) == READ), is_sync; + loff_t pos = iocb->ki_pos; + blk_qc_t qc = BLK_QC_T_NONE; + int ret; + + if ((pos | iov_iter_alignment(iter)) & + (bdev_logical_block_size(bdev) - 1)) + return -EINVAL; + + bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, blkdev_dio_pool); + bio_get(bio); /* extra ref for the completion handler */ + + dio = container_of(bio, struct blkdev_dio, bio); + dio->is_sync = is_sync = is_sync_kiocb(iocb); + if (dio->is_sync) + dio->waiter = current; + else + dio->iocb = iocb; + + dio->size = 0; + dio->multi_bio = false; + 
dio->should_dirty = is_read && (iter->type == ITER_IOVEC); + + blk_start_plug(&plug); + for (;;) { + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = pos >> 9; + bio->bi_private = dio; + bio->bi_end_io = blkdev_bio_end_io; + + ret = bio_iov_iter_get_pages(bio, iter); + if (unlikely(ret)) { + bio->bi_error = ret; + bio_endio(bio); + break; + } + + if (is_read) { + bio->bi_opf = REQ_OP_READ; + if (dio->should_dirty) + bio_set_pages_dirty(bio); + } else { + bio->bi_opf = dio_bio_write_op(iocb); + task_io_account_write(bio->bi_iter.bi_size); + } + + dio->size += bio->bi_iter.bi_size; + pos += bio->bi_iter.bi_size; + + nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES); + if (!nr_pages) { + qc = submit_bio(bio); + break; + } + + if (!dio->multi_bio) { + dio->multi_bio = true; + atomic_set(&dio->ref, 2); + } else { + atomic_inc(&dio->ref); + } + + submit_bio(bio); + bio = bio_alloc(GFP_KERNEL, nr_pages); + } + blk_finish_plug(&plug); + + if (!is_sync) + return -EIOCBQUEUED; + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(dio->waiter)) + break; + + if (!(iocb->ki_flags & IOCB_HIPRI) || + !blk_mq_poll(bdev_get_queue(bdev), qc)) + io_schedule(); + } + __set_current_state(TASK_RUNNING); + + ret = dio->bio.bi_error; + if (likely(!ret)) + ret = dio->size; + + bio_put(&dio->bio); + return ret; +} - return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, - blkdev_get_block, NULL, NULL, - DIO_SKIP_DIO_COUNT); +static ssize_t +blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +{ + int nr_pages; + + nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1); + if (!nr_pages) + return 0; + if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES) + return __blkdev_direct_IO_simple(iocb, iter, nr_pages); + + return __blkdev_direct_IO(iocb, iter, min(nr_pages, BIO_MAX_PAGES)); } +static __init int blkdev_init(void) +{ + blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio)); + if (!blkdev_dio_pool) + return -ENOMEM; + return 0; +} +module_init(blkdev_init); + int __sync_blockdev(struct block_device *bdev, int wait) { if (!bdev) @@ -832,7 +1089,7 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, return true; /* already a holder */ else if (bdev->bd_holder != NULL) return false; /* held by someone else */ - else if (bdev->bd_contains == bdev) + else if (whole == bdev) return true; /* is a whole device which isn't held */ else if (whole->bd_holder == bd_may_claim) @@ -1950,6 +2207,7 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) spin_lock(&blockdev_superblock->s_inode_list_lock); list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { struct address_space *mapping = inode->i_mapping; + struct block_device *bdev; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) || @@ -1970,8 +2228,12 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) */ iput(old_inode); old_inode = inode; + bdev = I_BDEV(inode); - func(I_BDEV(inode), arg); + mutex_lock(&bdev->bd_mutex); + if (bdev->bd_openers) + func(bdev, arg); + mutex_unlock(&bdev->bd_mutex); spin_lock(&blockdev_superblock->s_inode_list_lock); } diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index e0f071f6b5a7..ff0b0be92d61 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -86,6 +86,20 @@ btrfs_work_owner(struct btrfs_work *work) return work->wq->fs_info; } +bool btrfs_workqueue_normal_congested(struct btrfs_workqueue *wq) +{ + /* + * We could compare 
wq->normal->pending with num_online_cpus() + * to support "thresh == NO_THRESHOLD" case, but it requires + * moving up atomic_inc/dec in thresh_queue/exec_hook. Let's + * postpone it until someone needs the support of that case. + */ + if (wq->normal->thresh == NO_THRESHOLD) + return false; + + return atomic_read(&wq->normal->pending) > wq->normal->thresh * 2; +} + BTRFS_WORK_HELPER(worker_helper); BTRFS_WORK_HELPER(delalloc_helper); BTRFS_WORK_HELPER(flush_delalloc_helper); @@ -259,6 +273,8 @@ static void run_ordered_work(struct __btrfs_workqueue *wq) unsigned long flags; while (1) { + void *wtag; + spin_lock_irqsave(lock, flags); if (list_empty(list)) break; @@ -285,11 +301,13 @@ static void run_ordered_work(struct __btrfs_workqueue *wq) spin_unlock_irqrestore(lock, flags); /* - * we don't want to call the ordered free functions - * with the lock held though + * We don't want to call the ordered free functions with the + * lock held though. Save the work as tag for the trace event, + * because the callback could free the structure. */ + wtag = work; work->ordered_free(work); - trace_btrfs_all_work_done(work); + trace_btrfs_all_work_done(wq->fs_info, wtag); } spin_unlock_irqrestore(lock, flags); } @@ -297,6 +315,7 @@ static void run_ordered_work(struct __btrfs_workqueue *wq) static void normal_work_helper(struct btrfs_work *work) { struct __btrfs_workqueue *wq; + void *wtag; int need_order = 0; /* @@ -310,6 +329,8 @@ static void normal_work_helper(struct btrfs_work *work) if (work->ordered_func) need_order = 1; wq = work->wq; + /* Safe for tracepoints in case work gets freed by the callback */ + wtag = work; trace_btrfs_work_sched(work); thresh_exec_hook(wq); @@ -319,7 +340,7 @@ static void normal_work_helper(struct btrfs_work *work) run_ordered_work(wq); } if (!need_order) - trace_btrfs_all_work_done(work); + trace_btrfs_all_work_done(wq->fs_info, wtag); } void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func, diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 8e52484cd461..1f9597355c9d 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -84,4 +84,5 @@ void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max); void btrfs_set_work_high_priority(struct btrfs_work *work); struct btrfs_fs_info *btrfs_work_owner(struct btrfs_work *work); struct btrfs_fs_info *btrfs_workqueue_owner(struct __btrfs_workqueue *wq); +bool btrfs_workqueue_normal_congested(struct btrfs_workqueue *wq); #endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 85dc7ab8f89e..8299601a3549 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -788,8 +788,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, if (ref->key_for_search.type) continue; BUG_ON(!ref->wanted_disk_byte); - eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte, - 0); + eb = read_tree_block(fs_info, ref->wanted_disk_byte, 0); if (IS_ERR(eb)) { return PTR_ERR(eb); } else if (!extent_buffer_uptodate(eb)) { @@ -1405,8 +1404,7 @@ again: ref->level == 0) { struct extent_buffer *eb; - eb = read_tree_block(fs_info->extent_root, - ref->parent, 0); + eb = read_tree_block(fs_info, ref->parent, 0); if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; @@ -1829,7 +1827,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, } btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); if (found_key->type == BTRFS_METADATA_ITEM_KEY) - size = fs_info->extent_root->nodesize; + size = fs_info->nodesize; else if (found_key->type == 
BTRFS_EXTENT_ITEM_KEY) size = found_key->offset; @@ -2058,7 +2056,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, out: if (!search_commit_root) { btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); - btrfs_end_transaction(trans, fs_info->extent_root); + btrfs_end_transaction(trans); } else { up_read(&fs_info->commit_root_sem); } diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 8e99251650b3..ab14c2e635ca 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -254,7 +254,7 @@ struct btrfsic_state { struct list_head all_blocks_list; struct btrfsic_block_hashtable block_hashtable; struct btrfsic_block_link_hashtable block_link_hashtable; - struct btrfs_root *root; + struct btrfs_fs_info *fs_info; u64 max_superblock_generation; struct btrfsic_block *latest_superblock; u32 metablock_size; @@ -646,11 +646,12 @@ static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( static int btrfsic_process_superblock(struct btrfsic_state *state, struct btrfs_fs_devices *fs_devices) { - int ret = 0; + struct btrfs_fs_info *fs_info = state->fs_info; struct btrfs_super_block *selected_super; struct list_head *dev_head = &fs_devices->devices; struct btrfs_device *device; struct btrfsic_dev_state *selected_dev_state = NULL; + int ret = 0; int pass; BUG_ON(NULL == state); @@ -716,9 +717,8 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, break; } - num_copies = - btrfs_num_copies(state->root->fs_info, - next_bytenr, state->metablock_size); + num_copies = btrfs_num_copies(fs_info, next_bytenr, + state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) pr_info("num_copies(log_bytenr=%llu) = %d\n", next_bytenr, num_copies); @@ -783,6 +783,7 @@ static int btrfsic_process_superblock_dev_mirror( struct btrfsic_dev_state **selected_dev_state, struct btrfs_super_block *selected_super) { + struct btrfs_fs_info *fs_info = state->fs_info; struct btrfs_super_block *super_tmp; u64 dev_bytenr; struct buffer_head *bh; @@ -832,7 +833,7 @@ static int btrfsic_process_superblock_dev_mirror( superblock_tmp->never_written = 0; superblock_tmp->mirror_num = 1 + superblock_mirror_num; if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - btrfs_info_in_rcu(device->dev_root->fs_info, + btrfs_info_in_rcu(fs_info, "new initial S-block (bdev %p, %s) @%llu (%s/%llu/%d)", superblock_bdev, rcu_str_deref(device->name), dev_bytenr, @@ -887,9 +888,8 @@ static int btrfsic_process_superblock_dev_mirror( break; } - num_copies = - btrfs_num_copies(state->root->fs_info, - next_bytenr, state->metablock_size); + num_copies = btrfs_num_copies(fs_info, next_bytenr, + state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) pr_info("num_copies(log_bytenr=%llu) = %d\n", next_bytenr, num_copies); @@ -1254,6 +1254,7 @@ static int btrfsic_create_link_to_next_block( struct btrfs_disk_key *disk_key, u64 parent_generation) { + struct btrfs_fs_info *fs_info = state->fs_info; struct btrfsic_block *next_block = NULL; int ret; struct btrfsic_block_link *l; @@ -1262,9 +1263,8 @@ static int btrfsic_create_link_to_next_block( *next_blockp = NULL; if (0 == *num_copiesp) { - *num_copiesp = - btrfs_num_copies(state->root->fs_info, - next_bytenr, state->metablock_size); + *num_copiesp = btrfs_num_copies(fs_info, next_bytenr, + state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) pr_info("num_copies(log_bytenr=%llu) = %d\n", next_bytenr, *num_copiesp); @@ -1390,13 +1390,14 @@ static int 
btrfsic_handle_extent_data( struct btrfsic_block_data_ctx *block_ctx, u32 item_offset, int force_iodone_flag) { - int ret; + struct btrfs_fs_info *fs_info = state->fs_info; struct btrfs_file_extent_item file_extent_item; u64 file_extent_item_offset; u64 next_bytenr; u64 num_bytes; u64 generation; struct btrfsic_block_link *l; + int ret; file_extent_item_offset = offsetof(struct btrfs_leaf, items) + item_offset; @@ -1456,9 +1457,8 @@ static int btrfsic_handle_extent_data( else chunk_len = num_bytes; - num_copies = - btrfs_num_copies(state->root->fs_info, - next_bytenr, state->datablock_size); + num_copies = btrfs_num_copies(fs_info, next_bytenr, + state->datablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) pr_info("num_copies(log_bytenr=%llu) = %d\n", next_bytenr, num_copies); @@ -1533,13 +1533,14 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, struct btrfsic_block_data_ctx *block_ctx_out, int mirror_num) { + struct btrfs_fs_info *fs_info = state->fs_info; int ret; u64 length; struct btrfs_bio *multi = NULL; struct btrfs_device *device; length = len; - ret = btrfs_map_block(state->root->fs_info, READ, + ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, bytenr, &length, &multi, mirror_num); if (ret) { @@ -1731,6 +1732,7 @@ static void btrfsic_dump_database(struct btrfsic_state *state) static int btrfsic_test_for_metadata(struct btrfsic_state *state, char **datav, unsigned int num_pages) { + struct btrfs_fs_info *fs_info = state->fs_info; struct btrfs_header *h; u8 csum[BTRFS_CSUM_SIZE]; u32 crc = ~(u32)0; @@ -1741,7 +1743,7 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state, num_pages = state->metablock_size >> PAGE_SHIFT; h = (struct btrfs_header *)datav[0]; - if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE)) + if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) return 1; for (i = 0; i < num_pages; i++) { @@ -2202,6 +2204,7 @@ static int btrfsic_process_written_superblock( struct btrfsic_block *const superblock, struct btrfs_super_block *const super_hdr) { + struct btrfs_fs_info *fs_info = state->fs_info; int pass; superblock->generation = btrfs_super_generation(super_hdr); @@ -2275,9 +2278,8 @@ static int btrfsic_process_written_superblock( break; } - num_copies = - btrfs_num_copies(state->root->fs_info, - next_bytenr, BTRFS_SUPER_INFO_SIZE); + num_copies = btrfs_num_copies(fs_info, next_bytenr, + BTRFS_SUPER_INFO_SIZE); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) pr_info("num_copies(log_bytenr=%llu) = %d\n", next_bytenr, num_copies); @@ -2699,14 +2701,14 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, struct btrfsic_dev_state *dev_state, u64 dev_bytenr) { + struct btrfs_fs_info *fs_info = state->fs_info; + struct btrfsic_block_data_ctx block_ctx; int num_copies; int mirror_num; - int ret; - struct btrfsic_block_data_ctx block_ctx; int match = 0; + int ret; - num_copies = btrfs_num_copies(state->root->fs_info, - bytenr, state->metablock_size); + num_copies = btrfs_num_copies(fs_info, bytenr, state->metablock_size); for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { ret = btrfsic_map_block(state, bytenr, state->metablock_size, @@ -2819,10 +2821,11 @@ static void __btrfsic_submit_bio(struct bio *bio) * btrfsic_mount(), this might return NULL */ dev_state = btrfsic_dev_state_lookup(bio->bi_bdev); if (NULL != dev_state && - (bio_op(bio) == REQ_OP_WRITE) && NULL != bio->bi_io_vec) { + (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) { unsigned int i; u64 
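btrfsic_cmp_log_and_dev_bytenr() above, like several other spots in this file, follows one probe loop: btrfs_num_copies() reports how many mirrors store a block, and the mirrors are tried by number from 1 through num_copies until one yields the expected placement. A skeleton of that loop shape with stand-in helpers (the real btrfsic_map_block() signature differs):

#include <stdint.h>
typedef uint64_t u64;

/* hypothetical stand-ins for btrfs_num_copies()/btrfsic_map_block() */
static int num_copies_stub(u64 bytenr) { (void)bytenr; return 2; }
static int map_block_stub(u64 bytenr, int mirror, u64 *dev_bytenr)
{
        *dev_bytenr = bytenr + mirror;  /* fake per-mirror placement */
        return 0;
}

static int find_matching_mirror(u64 bytenr, u64 want_dev_bytenr)
{
        int num_copies = num_copies_stub(bytenr);
        int mirror;

        for (mirror = 1; mirror <= num_copies; mirror++) {
                u64 dev_bytenr;

                if (map_block_stub(bytenr, mirror, &dev_bytenr))
                        continue;               /* this mirror unreadable */
                if (dev_bytenr == want_dev_bytenr)
                        return mirror;          /* match found */
        }
        return 0;                               /* no mirror matched */
}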
dev_bytenr; u64 cur_bytenr; + struct bio_vec *bvec; int bio_is_patched; char **mapped_datav; @@ -2840,32 +2843,23 @@ static void __btrfsic_submit_bio(struct bio *bio) if (!mapped_datav) goto leave; cur_bytenr = dev_bytenr; - for (i = 0; i < bio->bi_vcnt; i++) { - BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_SIZE); - mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page); - if (!mapped_datav[i]) { - while (i > 0) { - i--; - kunmap(bio->bi_io_vec[i].bv_page); - } - kfree(mapped_datav); - goto leave; - } + + bio_for_each_segment_all(bvec, bio, i) { + BUG_ON(bvec->bv_len != PAGE_SIZE); + mapped_datav[i] = kmap(bvec->bv_page); + if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE) pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n", - i, cur_bytenr, bio->bi_io_vec[i].bv_len, - bio->bi_io_vec[i].bv_offset); - cur_bytenr += bio->bi_io_vec[i].bv_len; + i, cur_bytenr, bvec->bv_len, bvec->bv_offset); + cur_bytenr += bvec->bv_len; } btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, bio->bi_vcnt, bio, &bio_is_patched, NULL, bio->bi_opf); - while (i > 0) { - i--; - kunmap(bio->bi_io_vec[i].bv_page); - } + bio_for_each_segment_all(bvec, bio, i) + kunmap(bvec->bv_page); kfree(mapped_datav); } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) { if (dev_state->state->print_mask & @@ -2910,7 +2904,7 @@ int btrfsic_submit_bio_wait(struct bio *bio) return submit_bio_wait(bio); } -int btrfsic_mount(struct btrfs_root *root, +int btrfsic_mount(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices, int including_extent_data, u32 print_mask) { @@ -2919,14 +2913,14 @@ int btrfsic_mount(struct btrfs_root *root, struct list_head *dev_head = &fs_devices->devices; struct btrfs_device *device; - if (root->nodesize & ((u64)PAGE_SIZE - 1)) { + if (fs_info->nodesize & ((u64)PAGE_SIZE - 1)) { pr_info("btrfsic: cannot handle nodesize %d not being a multiple of PAGE_SIZE %ld!\n", - root->nodesize, PAGE_SIZE); + fs_info->nodesize, PAGE_SIZE); return -1; } - if (root->sectorsize & ((u64)PAGE_SIZE - 1)) { + if (fs_info->sectorsize & ((u64)PAGE_SIZE - 1)) { pr_info("btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_SIZE %ld!\n", - root->sectorsize, PAGE_SIZE); + fs_info->sectorsize, PAGE_SIZE); return -1; } state = kzalloc(sizeof(*state), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); @@ -2944,12 +2938,12 @@ int btrfsic_mount(struct btrfs_root *root, btrfsic_is_initialized = 1; } mutex_lock(&btrfsic_mutex); - state->root = root; + state->fs_info = fs_info; state->print_mask = print_mask; state->include_extent_data = including_extent_data; state->csum_size = 0; - state->metablock_size = root->nodesize; - state->datablock_size = root->sectorsize; + state->metablock_size = fs_info->nodesize; + state->datablock_size = fs_info->sectorsize; INIT_LIST_HEAD(&state->all_blocks_list); btrfsic_block_hashtable_init(&state->block_hashtable); btrfsic_block_link_hashtable_init(&state->block_link_hashtable); @@ -2982,7 +2976,7 @@ int btrfsic_mount(struct btrfs_root *root, ret = btrfsic_process_superblock(state, fs_devices); if (0 != ret) { mutex_unlock(&btrfsic_mutex); - btrfsic_unmount(root, fs_devices); + btrfsic_unmount(fs_devices); return ret; } @@ -2995,8 +2989,7 @@ int btrfsic_mount(struct btrfs_root *root, return 0; } -void btrfsic_unmount(struct btrfs_root *root, - struct btrfs_fs_devices *fs_devices) +void btrfsic_unmount(struct btrfs_fs_devices *fs_devices) { struct btrfsic_block *b_all, *tmp_all; struct btrfsic_state *state; diff --git a/fs/btrfs/check-integrity.h 
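btrfsic_mount() above rejects a nodesize or sectorsize that is not a multiple of PAGE_SIZE using the usual power-of-two test, the same x & (n - 1) trick the new blkdev direct I/O paths use for their logical-block alignment check. It works only because PAGE_SIZE is a power of two; a self-contained illustration:

#include <assert.h>

static int is_multiple_pow2(unsigned long x, unsigned long n)
{
        return (x & (n - 1)) == 0;      /* valid only for power-of-two n */
}

int main(void)
{
        assert(is_multiple_pow2(16384, 4096));  /* 4 pages: ok */
        assert(!is_multiple_pow2(6144, 4096));  /* 1.5 pages: rejected */
        return 0;
}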
b/fs/btrfs/check-integrity.h index f78dff1c7e86..2de58a99ee92 100644 --- a/fs/btrfs/check-integrity.h +++ b/fs/btrfs/check-integrity.h @@ -29,10 +29,9 @@ int btrfsic_submit_bio_wait(struct bio *bio); #define btrfsic_submit_bio_wait submit_bio_wait #endif -int btrfsic_mount(struct btrfs_root *root, +int btrfsic_mount(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices, int including_extent_data, u32 print_mask); -void btrfsic_unmount(struct btrfs_root *root, - struct btrfs_fs_devices *fs_devices); +void btrfsic_unmount(struct btrfs_fs_devices *fs_devices); #endif diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index d4d8b7e36b2f..c4444d6f439f 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -81,17 +81,17 @@ struct compressed_bio { u32 sums; }; -static int btrfs_decompress_biovec(int type, struct page **pages_in, - u64 disk_start, struct bio_vec *bvec, - int vcnt, size_t srclen); +static int btrfs_decompress_bio(int type, struct page **pages_in, + u64 disk_start, struct bio *orig_bio, + size_t srclen); -static inline int compressed_bio_size(struct btrfs_root *root, +static inline int compressed_bio_size(struct btrfs_fs_info *fs_info, unsigned long disk_size) { - u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); return sizeof(struct compressed_bio) + - (DIV_ROUND_UP(disk_size, root->sectorsize)) * csum_size; + (DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * csum_size; } static struct bio *compressed_bio_alloc(struct block_device *bdev, @@ -120,7 +120,7 @@ static int check_compressed_csum(struct inode *inode, kaddr = kmap_atomic(page); csum = btrfs_csum_data(kaddr, csum, PAGE_SIZE); - btrfs_csum_final(csum, (char *)&csum); + btrfs_csum_final(csum, (u8 *)&csum); kunmap_atomic(kaddr); if (csum != *cb_sum) { @@ -175,11 +175,10 @@ static void end_compressed_bio_read(struct bio *bio) /* ok, we're the last bio for this extent, lets start * the decompression. 
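compressed_bio_size() above sizes the allocation as the struct plus one checksum per sector of the compressed extent: DIV_ROUND_UP(disk_size, sectorsize) sectors at csum_size bytes each. A sketch of the arithmetic with assumed example values (4 KiB sectors, 4-byte crc32c):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned long disk_size = 130048;       /* 127 KiB compressed */
        unsigned long sectorsize = 4096;
        unsigned long csum_size = 4;

        printf("%lu sectors -> %lu csum bytes\n",
               DIV_ROUND_UP(disk_size, sectorsize),             /* 32  */
               DIV_ROUND_UP(disk_size, sectorsize) * csum_size); /* 128 */
        return 0;
}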
*/ - ret = btrfs_decompress_biovec(cb->compress_type, + ret = btrfs_decompress_bio(cb->compress_type, cb->compressed_pages, cb->start, - cb->orig_bio->bi_io_vec, - cb->orig_bio->bi_vcnt, + cb->orig_bio, cb->compressed_len); csum_failed: if (ret) @@ -329,8 +328,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, struct page **compressed_pages, unsigned long nr_pages) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct bio *bio = NULL; - struct btrfs_root *root = BTRFS_I(inode)->root; struct compressed_bio *cb; unsigned long bytes_left; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -342,7 +341,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; WARN_ON(start & ((u64)PAGE_SIZE - 1)); - cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); if (!cb) return -ENOMEM; atomic_set(&cb->pending_bios, 0); @@ -356,7 +355,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, cb->orig_bio = NULL; cb->nr_pages = nr_pages; - bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + bdev = fs_info->fs_devices->latest_bdev; bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); if (!bio) { @@ -392,17 +391,16 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, * freed before we're done setting it up */ atomic_inc(&cb->pending_bios); - ret = btrfs_bio_wq_end_io(root->fs_info, bio, - BTRFS_WQ_ENDIO_DATA); + ret = btrfs_bio_wq_end_io(fs_info, bio, + BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ if (!skip_sum) { - ret = btrfs_csum_one_bio(root, inode, bio, - start, 1); + ret = btrfs_csum_one_bio(inode, bio, start, 1); BUG_ON(ret); /* -ENOMEM */ } - ret = btrfs_map_bio(root, bio, 0, 1); + ret = btrfs_map_bio(fs_info, bio, 0, 1); if (ret) { bio->bi_error = ret; bio_endio(bio); @@ -418,7 +416,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_add_page(bio, page, PAGE_SIZE, 0); } if (bytes_left < PAGE_SIZE) { - btrfs_info(BTRFS_I(inode)->root->fs_info, + btrfs_info(fs_info, "bytes left %lu compress len %lu nr %lu", bytes_left, cb->compressed_len, cb->nr_pages); } @@ -428,15 +426,15 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, } bio_get(bio); - ret = btrfs_bio_wq_end_io(root->fs_info, bio, BTRFS_WQ_ENDIO_DATA); + ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ if (!skip_sum) { - ret = btrfs_csum_one_bio(root, inode, bio, start, 1); + ret = btrfs_csum_one_bio(inode, bio, start, 1); BUG_ON(ret); /* -ENOMEM */ } - ret = btrfs_map_bio(root, bio, 0, 1); + ret = btrfs_map_bio(fs_info, bio, 0, 1); if (ret) { bio->bi_error = ret; bio_endio(bio); @@ -446,6 +444,13 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, return 0; } +static u64 bio_end_offset(struct bio *bio) +{ + struct bio_vec *last = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + return page_offset(last->bv_page) + last->bv_len + last->bv_offset; +} + static noinline int add_ra_bio_pages(struct inode *inode, u64 compressed_end, struct compressed_bio *cb) @@ -464,8 +469,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, u64 end; int misses = 0; - page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page; - last_offset = (page_offset(page) + PAGE_SIZE); + last_offset = bio_end_offset(cb->orig_bio); em_tree = &BTRFS_I(inode)->extent_tree; tree = &BTRFS_I(inode)->io_tree; @@ -563,7 +567,6 @@ 
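The atomic_inc(&cb->pending_bios) before each submission in btrfs_submit_compressed_write() above is the standard keep-alive: take a reference before handing the object to an asynchronous completion path, so whichever side finishes last tears it down. A userspace sketch with C11 atomics, hypothetical names:

#include <stdatomic.h>
#include <stdlib.h>

struct cb_like {
        atomic_int pending;             /* outstanding async users */
};

static void put_cb(struct cb_like *cb) /* each completion calls this */
{
        if (atomic_fetch_sub(&cb->pending, 1) == 1)
                free(cb);               /* last user frees the object */
}

static void submit_one(struct cb_like *cb)
{
        atomic_fetch_add(&cb->pending, 1);      /* ref before the handoff */
        /* ... queue async work that eventually calls put_cb(cb) ... */
}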
next: * * bio->bi_iter.bi_sector points to the compressed extent on disk * bio->bi_io_vec points to all of the inode pages - * bio->bi_vcnt is a count of pages * * After the compressed pages are read, we copy the bytes into the * bio we were passed and then call the bio end_io calls @@ -571,11 +574,10 @@ next: int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *tree; struct extent_map_tree *em_tree; struct compressed_bio *cb; - struct btrfs_root *root = BTRFS_I(inode)->root; - unsigned long uncompressed_len = bio->bi_vcnt * PAGE_SIZE; unsigned long compressed_len; unsigned long nr_pages; unsigned long pg_index; @@ -603,7 +605,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, return -EIO; compressed_len = em->block_len; - cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); if (!cb) goto out; @@ -620,7 +622,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, free_extent_map(em); em = NULL; - cb->len = uncompressed_len; + cb->len = bio->bi_iter.bi_size; cb->compressed_len = compressed_len; cb->compress_type = extent_compress_type(bio_flags); cb->orig_bio = bio; @@ -631,7 +633,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, if (!cb->compressed_pages) goto fail1; - bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + bdev = fs_info->fs_devices->latest_bdev; for (pg_index = 0; pg_index < nr_pages; pg_index++) { cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS | @@ -648,8 +650,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, add_ra_bio_pages(inode, em_start + em_len, cb); /* include any pages we added in add_ra-bio_pages */ - uncompressed_len = bio->bi_vcnt * PAGE_SIZE; - cb->len = uncompressed_len; + cb->len = bio->bi_iter.bi_size; comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); if (!comp_bio) @@ -676,8 +677,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, PAGE_SIZE) { bio_get(comp_bio); - ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, - BTRFS_WQ_ENDIO_DATA); + ret = btrfs_bio_wq_end_io(fs_info, comp_bio, + BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ /* @@ -689,14 +690,14 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, atomic_inc(&cb->pending_bios); if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - ret = btrfs_lookup_bio_sums(root, inode, - comp_bio, sums); + ret = btrfs_lookup_bio_sums(inode, comp_bio, + sums); BUG_ON(ret); /* -ENOMEM */ } sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size, - root->sectorsize); + fs_info->sectorsize); - ret = btrfs_map_bio(root, comp_bio, mirror_num, 0); + ret = btrfs_map_bio(fs_info, comp_bio, mirror_num, 0); if (ret) { comp_bio->bi_error = ret; bio_endio(comp_bio); @@ -717,16 +718,15 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, } bio_get(comp_bio); - ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, - BTRFS_WQ_ENDIO_DATA); + ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums); + ret = btrfs_lookup_bio_sums(inode, comp_bio, sums); BUG_ON(ret); /* -ENOMEM */ } - ret = btrfs_map_bio(root, comp_bio, mirror_num, 0); + ret = btrfs_map_bio(fs_info, comp_bio, 
mirror_num, 0); if (ret) { comp_bio->bi_error = ret; bio_endio(comp_bio); @@ -959,9 +959,7 @@ int btrfs_compress_pages(int type, struct address_space *mapping, * * disk_start is the starting logical offset of this array in the file * - * bvec is a bio_vec of pages from the file that we want to decompress into - * - * vcnt is the count of pages in the biovec + * orig_bio contains the pages from the file that we want to decompress into * * srclen is the number of bytes in pages_in * @@ -970,18 +968,18 @@ int btrfs_compress_pages(int type, struct address_space *mapping, * be contiguous. They all correspond to the range of bytes covered by * the compressed extent. */ -static int btrfs_decompress_biovec(int type, struct page **pages_in, - u64 disk_start, struct bio_vec *bvec, - int vcnt, size_t srclen) +static int btrfs_decompress_bio(int type, struct page **pages_in, + u64 disk_start, struct bio *orig_bio, + size_t srclen) { struct list_head *workspace; int ret; workspace = find_workspace(type); - ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in, - disk_start, - bvec, vcnt, srclen); + ret = btrfs_compress_op[type-1]->decompress_bio(workspace, pages_in, + disk_start, orig_bio, + srclen); free_workspace(type, workspace); return ret; } @@ -1021,23 +1019,22 @@ void btrfs_exit_compress(void) */ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, unsigned long total_out, u64 disk_start, - struct bio_vec *bvec, int vcnt, - unsigned long *pg_index, - unsigned long *pg_offset) + struct bio *bio) { unsigned long buf_offset; unsigned long current_buf_start; unsigned long start_byte; + unsigned long prev_start_byte; unsigned long working_bytes = total_out - buf_start; unsigned long bytes; char *kaddr; - struct page *page_out = bvec[*pg_index].bv_page; + struct bio_vec bvec = bio_iter_iovec(bio, bio->bi_iter); /* * start byte is the first byte of the page we're currently * copying into relative to the start of the compressed data. */ - start_byte = page_offset(page_out) - disk_start; + start_byte = page_offset(bvec.bv_page) - disk_start; /* we haven't yet hit data corresponding to this page */ if (total_out <= start_byte) @@ -1057,29 +1054,34 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, /* copy bytes from the working buffer into the pages */ while (working_bytes > 0) { - bytes = min(PAGE_SIZE - *pg_offset, - PAGE_SIZE - buf_offset); + bytes = min_t(unsigned long, bvec.bv_len, + PAGE_SIZE - buf_offset); bytes = min(bytes, working_bytes); - kaddr = kmap_atomic(page_out); - memcpy(kaddr + *pg_offset, buf + buf_offset, bytes); + + kaddr = kmap_atomic(bvec.bv_page); + memcpy(kaddr + bvec.bv_offset, buf + buf_offset, bytes); kunmap_atomic(kaddr); - flush_dcache_page(page_out); + flush_dcache_page(bvec.bv_page); - *pg_offset += bytes; buf_offset += bytes; working_bytes -= bytes; current_buf_start += bytes; /* check if we need to pick another page */ - if (*pg_offset == PAGE_SIZE) { - (*pg_index)++; - if (*pg_index >= vcnt) - return 0; - - page_out = bvec[*pg_index].bv_page; - *pg_offset = 0; - start_byte = page_offset(page_out) - disk_start; + bio_advance(bio, bytes); + if (!bio->bi_iter.bi_size) + return 0; + bvec = bio_iter_iovec(bio, bio->bi_iter); + prev_start_byte = start_byte; + start_byte = page_offset(bvec.bv_page) - disk_start; + /* + * We need to make sure we're only adjusting + * our offset into compression working buffer when + * we're switching pages. Otherwise we can incorrectly + * keep copying when we were actually done. 
+ */ + if (start_byte != prev_start_byte) { /* * make sure our new page is covered by this * working buffer @@ -1103,34 +1105,3 @@ int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, return 1; } - -/* - * When uncompressing data, we need to make sure and zero any parts of - * the biovec that were not filled in by the decompression code. pg_index - * and pg_offset indicate the last page and the last offset of that page - * that have been filled in. This will zero everything remaining in the - * biovec. - */ -void btrfs_clear_biovec_end(struct bio_vec *bvec, int vcnt, - unsigned long pg_index, - unsigned long pg_offset) -{ - while (pg_index < vcnt) { - struct page *page = bvec[pg_index].bv_page; - unsigned long off = bvec[pg_index].bv_offset; - unsigned long len = bvec[pg_index].bv_len; - - if (pg_offset < off) - pg_offset = off; - if (pg_offset < off + len) { - unsigned long bytes = off + len - pg_offset; - char *kaddr; - - kaddr = kmap_atomic(page); - memset(kaddr + pg_offset, 0, bytes); - kunmap_atomic(kaddr); - } - pg_index++; - pg_offset = 0; - } -} diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index f49d8b8c0f00..09879579fbc8 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -34,9 +34,7 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); int btrfs_decompress_buf2page(char *buf, unsigned long buf_start, unsigned long total_out, u64 disk_start, - struct bio_vec *bvec, int vcnt, - unsigned long *pg_index, - unsigned long *pg_offset); + struct bio *bio); int btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, @@ -45,9 +43,6 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long nr_pages); int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); -void btrfs_clear_biovec_end(struct bio_vec *bvec, int vcnt, - unsigned long pg_index, - unsigned long pg_offset); enum btrfs_compression_type { BTRFS_COMPRESS_NONE = 0, @@ -72,11 +67,10 @@ struct btrfs_compress_op { unsigned long *total_out, unsigned long max_out); - int (*decompress_biovec)(struct list_head *workspace, + int (*decompress_bio)(struct list_head *workspace, struct page **pages_in, u64 disk_start, - struct bio_vec *bvec, - int vcnt, + struct bio *orig_bio, size_t srclen); int (*decompress)(struct list_head *workspace, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f6ba165d3f81..a426dc822d4d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -32,10 +32,11 @@ static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *ins_key, struct btrfs_path *path, int data_size, int extend); static int push_node_left(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *dst, + struct btrfs_fs_info *fs_info, + struct extent_buffer *dst, struct extent_buffer *src, int empty); static int balance_node_right(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, struct extent_buffer *dst_buf, struct extent_buffer *src_buf); static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, @@ -212,21 +213,23 @@ static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) */ static void add_root_to_dirty_list(struct btrfs_root *root) { + struct btrfs_fs_info *fs_info = root->fs_info; + if (test_bit(BTRFS_ROOT_DIRTY, &root->state) || 
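The rewritten btrfs_decompress_buf2page() above walks its output with bio_advance() and bio_iter_iovec() instead of the hand-maintained pg_index/pg_offset pair, which is also why btrfs_clear_biovec_end() can go away. The same advancing-iterator shape over a plain segment vector, self-contained C with hypothetical types:

#include <stdio.h>
#include <string.h>

struct seg  { char *base; size_t len; };
struct iter { struct seg *segs; int nsegs; int idx; size_t off; };

static struct seg cur(const struct iter *it)    /* ~ bio_iter_iovec() */
{
        struct seg s = { it->segs[it->idx].base + it->off,
                         it->segs[it->idx].len - it->off };
        return s;
}

static void advance(struct iter *it, size_t n)  /* ~ bio_advance() */
{
        it->off += n;
        while (it->idx < it->nsegs && it->off >= it->segs[it->idx].len) {
                it->off -= it->segs[it->idx].len;
                it->idx++;
        }
}

int main(void)
{
        char a[3], b[8];
        struct seg segs[] = { { a, sizeof(a) }, { b, sizeof(b) } };
        struct iter it = { segs, 2, 0, 0 };
        const char *src = "hello jam";
        size_t left = 9;

        while (left && it.idx < it.nsegs) {     /* copy across segments */
                struct seg s = cur(&it);
                size_t n = s.len < left ? s.len : left;

                memcpy(s.base, src, n);
                src += n;
                left -= n;
                advance(&it, n);
        }
        printf("%.3s|%.6s\n", a, b);            /* prints: hel|lo jam */
        return 0;
}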
!test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state)) return; - spin_lock(&root->fs_info->trans_lock); + spin_lock(&fs_info->trans_lock); if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) { /* Want the extent tree to be the last on the list */ if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID) list_move_tail(&root->dirty_list, - &root->fs_info->dirty_cowonly_roots); + &fs_info->dirty_cowonly_roots); else list_move(&root->dirty_list, - &root->fs_info->dirty_cowonly_roots); + &fs_info->dirty_cowonly_roots); } - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); } /* @@ -239,13 +242,14 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct extent_buffer *buf, struct extent_buffer **cow_ret, u64 new_root_objectid) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *cow; int ret = 0; int level; struct btrfs_disk_key disk_key; WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && - trans->transid != root->fs_info->running_transaction->transid); + trans->transid != fs_info->running_transaction->transid); WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && trans->transid != root->last_trans); @@ -260,7 +264,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, if (IS_ERR(cow)) return PTR_ERR(cow); - copy_extent_buffer(cow, buf, 0, 0, cow->len); + copy_extent_buffer_full(cow, buf); btrfs_set_header_bytenr(cow, cow->start); btrfs_set_header_generation(cow, trans->transid); btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); @@ -271,8 +275,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, else btrfs_set_header_owner(cow, new_root_objectid); - write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(), - BTRFS_FSID_SIZE); + write_extent_buffer_fsid(cow, fs_info->fsid); WARN_ON(btrfs_header_generation(buf) > trans->transid); if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) @@ -978,6 +981,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, struct extent_buffer *cow, int *last_ref) { + struct btrfs_fs_info *fs_info = root->fs_info; u64 refs; u64 owner; u64 flags; @@ -1002,14 +1006,14 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, */ if (btrfs_block_can_be_shared(root, buf)) { - ret = btrfs_lookup_extent_info(trans, root, buf->start, + ret = btrfs_lookup_extent_info(trans, fs_info, buf->start, btrfs_header_level(buf), 1, &refs, &flags); if (ret) return ret; if (refs == 0) { ret = -EROFS; - btrfs_handle_fs_error(root->fs_info, ret, NULL); + btrfs_handle_fs_error(fs_info, ret, NULL); return ret; } } else { @@ -1052,7 +1056,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, if (new_flags != 0) { int level = btrfs_header_level(buf); - ret = btrfs_set_disk_extent_flags(trans, root, + ret = btrfs_set_disk_extent_flags(trans, fs_info, buf->start, buf->len, new_flags, level, 0); @@ -1070,7 +1074,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, buf, 1); BUG_ON(ret); /* -ENOMEM */ } - clean_tree_block(trans, root->fs_info, buf); + clean_tree_block(trans, fs_info, buf); *last_ref = 1; } return 0; @@ -1095,6 +1099,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct extent_buffer **cow_ret, u64 search_start, u64 empty_size) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_disk_key disk_key; struct extent_buffer *cow; int level, ret; @@ -1108,7 +1113,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, 
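Most ctree.c hunks in this range apply one template: hoist root->fs_info into a local at the top of the function and use that for every later access, which shortens the lines and drops the repeated pointer chase. The shape of the transformation, reduced to hypothetical types:

struct fs_info_like { long generation; };
struct root_like    { struct fs_info_like *fs_info; };

static long example_after(struct root_like *root)
{
        struct fs_info_like *fs_info = root->fs_info;   /* fetched once */

        /* ... every former root->fs_info-> use becomes fs_info-> ... */
        return fs_info->generation;
}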
btrfs_assert_tree_locked(buf); WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && - trans->transid != root->fs_info->running_transaction->transid); + trans->transid != fs_info->running_transaction->transid); WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) && trans->transid != root->last_trans); @@ -1130,7 +1135,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, /* cow is set to blocking by btrfs_init_new_buffer */ - copy_extent_buffer(cow, buf, 0, 0, cow->len); + copy_extent_buffer_full(cow, buf); btrfs_set_header_bytenr(cow, cow->start); btrfs_set_header_generation(cow, trans->transid); btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV); @@ -1141,8 +1146,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, else btrfs_set_header_owner(cow, root->root_key.objectid); - write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(), - BTRFS_FSID_SIZE); + write_extent_buffer_fsid(cow, fs_info->fsid); ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); if (ret) { @@ -1174,7 +1178,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, add_root_to_dirty_list(root); } else { WARN_ON(trans->transid != btrfs_header_generation(parent)); - tree_mod_log_insert_key(root->fs_info, parent, parent_slot, + tree_mod_log_insert_key(fs_info, parent, parent_slot, MOD_LOG_KEY_REPLACE, GFP_NOFS); btrfs_set_node_blockptr(parent, parent_slot, cow->start); @@ -1182,7 +1186,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, trans->transid); btrfs_mark_buffer_dirty(parent); if (last_ref) { - ret = tree_mod_log_free_eb(root->fs_info, buf); + ret = tree_mod_log_free_eb(fs_info, buf); if (ret) { btrfs_abort_transaction(trans, ret); return ret; @@ -1359,8 +1363,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { BUG_ON(tm->slot != 0); - eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start, - eb->len); + eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start); if (!eb_rewin) { btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); @@ -1388,7 +1391,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, btrfs_tree_read_lock(eb_rewin); __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm); WARN_ON(btrfs_header_nritems(eb_rewin) > - BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root)); + BTRFS_NODEPTRS_PER_BLOCK(fs_info)); return eb_rewin; } @@ -1403,6 +1406,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, static inline struct extent_buffer * get_old_root(struct btrfs_root *root, u64 time_seq) { + struct btrfs_fs_info *fs_info = root->fs_info; struct tree_mod_elem *tm; struct extent_buffer *eb = NULL; struct extent_buffer *eb_root; @@ -1412,7 +1416,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) u64 logical; eb_root = btrfs_read_lock_root_node(root); - tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq); + tm = __tree_mod_log_oldest_root(fs_info, eb_root, time_seq); if (!tm) return eb_root; @@ -1424,16 +1428,17 @@ get_old_root(struct btrfs_root *root, u64 time_seq) logical = eb_root->start; } - tm = tree_mod_log_search(root->fs_info, logical, time_seq); + tm = tree_mod_log_search(fs_info, logical, time_seq); if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) { btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); - old = read_tree_block(root, logical, 0); + old = read_tree_block(fs_info, logical, 0); if 
(WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { if (!IS_ERR(old)) free_extent_buffer(old); - btrfs_warn(root->fs_info, - "failed to read tree block %llu from get_old_root", logical); + btrfs_warn(fs_info, + "failed to read tree block %llu from get_old_root", + logical); } else { eb = btrfs_clone_extent_buffer(old); free_extent_buffer(old); @@ -1441,8 +1446,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) } else if (old_root) { btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); - eb = alloc_dummy_extent_buffer(root->fs_info, logical, - root->nodesize); + eb = alloc_dummy_extent_buffer(fs_info, logical); } else { btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK); eb = btrfs_clone_extent_buffer(eb_root); @@ -1462,10 +1466,10 @@ get_old_root(struct btrfs_root *root, u64 time_seq) btrfs_set_header_generation(eb, old_generation); } if (tm) - __tree_mod_log_rewind(root->fs_info, eb, time_seq, tm); + __tree_mod_log_rewind(fs_info, eb, time_seq, tm); else WARN_ON(btrfs_header_level(eb) != 0); - WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root)); + WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info)); return eb; } @@ -1527,17 +1531,18 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret) { + struct btrfs_fs_info *fs_info = root->fs_info; u64 search_start; int ret; - if (trans->transaction != root->fs_info->running_transaction) + if (trans->transaction != fs_info->running_transaction) WARN(1, KERN_CRIT "trans %llu running %llu\n", trans->transid, - root->fs_info->running_transaction->transid); + fs_info->running_transaction->transid); - if (trans->transid != root->fs_info->generation) + if (trans->transid != fs_info->generation) WARN(1, KERN_CRIT "trans %llu running %llu\n", - trans->transid, root->fs_info->generation); + trans->transid, fs_info->generation); if (!should_cow_block(trans, root, buf)) { trans->dirty = true; @@ -1614,6 +1619,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, int start_slot, u64 *last_ret, struct btrfs_key *progress) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *cur; u64 blocknr; u64 gen; @@ -1632,11 +1638,11 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, parent_level = btrfs_header_level(parent); - WARN_ON(trans->transaction != root->fs_info->running_transaction); - WARN_ON(trans->transid != root->fs_info->generation); + WARN_ON(trans->transaction != fs_info->running_transaction); + WARN_ON(trans->transid != fs_info->generation); parent_nritems = btrfs_header_nritems(parent); - blocksize = root->nodesize; + blocksize = fs_info->nodesize; end_slot = parent_nritems - 1; if (parent_nritems <= 1) @@ -1670,14 +1676,14 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, continue; } - cur = btrfs_find_tree_block(root->fs_info, blocknr); + cur = find_extent_buffer(fs_info, blocknr); if (cur) uptodate = btrfs_buffer_uptodate(cur, gen, 0); else uptodate = 0; if (!cur || !uptodate) { if (!cur) { - cur = read_tree_block(root, blocknr, gen); + cur = read_tree_block(fs_info, blocknr, gen); if (IS_ERR(cur)) { return PTR_ERR(cur); } else if (!extent_buffer_uptodate(cur)) { @@ -1715,7 +1721,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, return err; } - /* * search for key in the extent_buffer. The items start at offset p, * and they are item_size apart. There are 'max' items in p. 
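The comment above describes a binary search over 'max' fixed-size items laid out every item_size bytes starting at offset p. A self-contained version over a plain buffer, with a memcmp-based comparator standing in for the real disk-key comparison:

#include <string.h>

/*
 * Returns 0 and stores the matching slot when the key is found;
 * returns 1 and stores the insertion slot otherwise.
 */
static int bin_search(const void *items, size_t item_size, int max,
                      const void *key, size_t key_size, int *slot)
{
        int low = 0, high = max;

        while (low < high) {
                int mid = low + (high - low) / 2;
                const char *item = (const char *)items
                                   + (size_t)mid * item_size;
                int ret = memcmp(item, key, key_size);

                if (ret < 0)
                        low = mid + 1;          /* key is to the right */
                else if (ret > 0)
                        high = mid;             /* key is to the left */
                else {
                        *slot = mid;
                        return 0;
                }
        }
        *slot = low;                            /* where it would insert */
        return 1;
}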
@@ -1839,8 +1844,9 @@ static void root_sub_used(struct btrfs_root *root, u32 size) /* given a node and slot number, this reads the blocks it points to. The * extent buffer is returned with a reference taken (but unlocked). */ -static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, - struct extent_buffer *parent, int slot) +static noinline struct extent_buffer * +read_node_slot(struct btrfs_fs_info *fs_info, struct extent_buffer *parent, + int slot) { int level = btrfs_header_level(parent); struct extent_buffer *eb; @@ -1850,7 +1856,7 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, BUG_ON(level == 0); - eb = read_tree_block(root, btrfs_node_blockptr(parent, slot), + eb = read_tree_block(fs_info, btrfs_node_blockptr(parent, slot), btrfs_node_ptr_generation(parent, slot)); if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) { free_extent_buffer(eb); @@ -1869,6 +1875,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *right = NULL; struct extent_buffer *mid; struct extent_buffer *left = NULL; @@ -1906,10 +1913,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, return 0; /* promote the child to a root */ - child = read_node_slot(root, mid, 0); + child = read_node_slot(fs_info, mid, 0); if (IS_ERR(child)) { ret = PTR_ERR(child); - btrfs_handle_fs_error(root->fs_info, ret, NULL); + btrfs_handle_fs_error(fs_info, ret, NULL); goto enospc; } @@ -1930,7 +1937,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, path->locks[level] = 0; path->nodes[level] = NULL; - clean_tree_block(trans, root->fs_info, mid); + clean_tree_block(trans, fs_info, mid); btrfs_tree_unlock(mid); /* once for the path */ free_extent_buffer(mid); @@ -1942,10 +1949,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, return 0; } if (btrfs_header_nritems(mid) > - BTRFS_NODEPTRS_PER_BLOCK(root) / 4) + BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4) return 0; - left = read_node_slot(root, parent, pslot - 1); + left = read_node_slot(fs_info, parent, pslot - 1); if (IS_ERR(left)) left = NULL; @@ -1960,7 +1967,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } } - right = read_node_slot(root, parent, pslot + 1); + right = read_node_slot(fs_info, parent, pslot + 1); if (IS_ERR(right)) right = NULL; @@ -1978,7 +1985,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* first, try to make some room in the middle buffer */ if (left) { orig_slot += btrfs_header_nritems(left); - wret = push_node_left(trans, root, left, mid, 1); + wret = push_node_left(trans, fs_info, left, mid, 1); if (wret < 0) ret = wret; } @@ -1987,11 +1994,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, * then try to empty the right most buffer into the middle */ if (right) { - wret = push_node_left(trans, root, mid, right, 1); + wret = push_node_left(trans, fs_info, mid, right, 1); if (wret < 0 && wret != -ENOSPC) ret = wret; if (btrfs_header_nritems(right) == 0) { - clean_tree_block(trans, root->fs_info, right); + clean_tree_block(trans, fs_info, right); btrfs_tree_unlock(right); del_ptr(root, path, level + 1, pslot + 1); root_sub_used(root, right->len); @@ -2001,7 +2008,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } else { struct btrfs_disk_key right_key; btrfs_node_key(right, &right_key, 0); - 
tree_mod_log_set_node_key(root->fs_info, parent, + tree_mod_log_set_node_key(fs_info, parent, pslot + 1, 0); btrfs_set_node_key(parent, &right_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -2019,23 +2026,23 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, */ if (!left) { ret = -EROFS; - btrfs_handle_fs_error(root->fs_info, ret, NULL); + btrfs_handle_fs_error(fs_info, ret, NULL); goto enospc; } - wret = balance_node_right(trans, root, mid, left); + wret = balance_node_right(trans, fs_info, mid, left); if (wret < 0) { ret = wret; goto enospc; } if (wret == 1) { - wret = push_node_left(trans, root, left, mid, 1); + wret = push_node_left(trans, fs_info, left, mid, 1); if (wret < 0) ret = wret; } BUG_ON(wret == 1); } if (btrfs_header_nritems(mid) == 0) { - clean_tree_block(trans, root->fs_info, mid); + clean_tree_block(trans, fs_info, mid); btrfs_tree_unlock(mid); del_ptr(root, path, level + 1, pslot); root_sub_used(root, mid->len); @@ -2046,8 +2053,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* update the parent key to reflect our changes */ struct btrfs_disk_key mid_key; btrfs_node_key(mid, &mid_key, 0); - tree_mod_log_set_node_key(root->fs_info, parent, - pslot, 0); + tree_mod_log_set_node_key(fs_info, parent, pslot, 0); btrfs_set_node_key(parent, &mid_key, pslot); btrfs_mark_buffer_dirty(parent); } @@ -2094,6 +2100,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *right = NULL; struct extent_buffer *mid; struct extent_buffer *left = NULL; @@ -2117,7 +2124,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (!parent) return 1; - left = read_node_slot(root, parent, pslot - 1); + left = read_node_slot(fs_info, parent, pslot - 1); if (IS_ERR(left)) left = NULL; @@ -2129,7 +2136,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_set_lock_blocking(left); left_nr = btrfs_header_nritems(left); - if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { + if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { wret = 1; } else { ret = btrfs_cow_block(trans, root, left, parent, @@ -2137,7 +2144,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (ret) wret = 1; else { - wret = push_node_left(trans, root, + wret = push_node_left(trans, fs_info, left, mid, 0); } } @@ -2147,8 +2154,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; orig_slot += left_nr; btrfs_node_key(mid, &disk_key, 0); - tree_mod_log_set_node_key(root->fs_info, parent, - pslot, 0); + tree_mod_log_set_node_key(fs_info, parent, pslot, 0); btrfs_set_node_key(parent, &disk_key, pslot); btrfs_mark_buffer_dirty(parent); if (btrfs_header_nritems(left) > orig_slot) { @@ -2169,7 +2175,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_tree_unlock(left); free_extent_buffer(left); } - right = read_node_slot(root, parent, pslot + 1); + right = read_node_slot(fs_info, parent, pslot + 1); if (IS_ERR(right)) right = NULL; @@ -2183,7 +2189,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_set_lock_blocking(right); right_nr = btrfs_header_nritems(right); - if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { + if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) { wret = 1; } else { ret = 
btrfs_cow_block(trans, root, right, @@ -2192,7 +2198,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (ret) wret = 1; else { - wret = balance_node_right(trans, root, + wret = balance_node_right(trans, fs_info, right, mid); } } @@ -2202,7 +2208,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; btrfs_node_key(right, &disk_key, 0); - tree_mod_log_set_node_key(root->fs_info, parent, + tree_mod_log_set_node_key(fs_info, parent, pslot + 1, 0); btrfs_set_node_key(parent, &disk_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -2230,7 +2236,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, * readahead one full node of leaves, finding things that are close * to the block in 'slot', and triggering ra on them. */ -static void reada_for_search(struct btrfs_root *root, +static void reada_for_search(struct btrfs_fs_info *fs_info, struct btrfs_path *path, int level, int slot, u64 objectid) { @@ -2254,8 +2260,8 @@ static void reada_for_search(struct btrfs_root *root, node = path->nodes[level]; search = btrfs_node_blockptr(node, slot); - blocksize = root->nodesize; - eb = btrfs_find_tree_block(root->fs_info, search); + blocksize = fs_info->nodesize; + eb = find_extent_buffer(fs_info, search); if (eb) { free_extent_buffer(eb); return; @@ -2284,7 +2290,7 @@ static void reada_for_search(struct btrfs_root *root, search = btrfs_node_blockptr(node, nr); if ((search <= target && target - search <= 65536) || (search > target && search - target <= 65536)) { - readahead_tree_block(root, search); + readahead_tree_block(fs_info, search); nread += blocksize; } nscan++; @@ -2293,7 +2299,7 @@ static void reada_for_search(struct btrfs_root *root, } } -static noinline void reada_for_balance(struct btrfs_root *root, +static noinline void reada_for_balance(struct btrfs_fs_info *fs_info, struct btrfs_path *path, int level) { int slot; @@ -2314,7 +2320,7 @@ static noinline void reada_for_balance(struct btrfs_root *root, if (slot > 0) { block1 = btrfs_node_blockptr(parent, slot - 1); gen = btrfs_node_ptr_generation(parent, slot - 1); - eb = btrfs_find_tree_block(root->fs_info, block1); + eb = find_extent_buffer(fs_info, block1); /* * if we get -eagain from btrfs_buffer_uptodate, we * don't want to return eagain here. 
That will loop @@ -2327,16 +2333,16 @@ static noinline void reada_for_balance(struct btrfs_root *root, if (slot + 1 < nritems) { block2 = btrfs_node_blockptr(parent, slot + 1); gen = btrfs_node_ptr_generation(parent, slot + 1); - eb = btrfs_find_tree_block(root->fs_info, block2); + eb = find_extent_buffer(fs_info, block2); if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) block2 = 0; free_extent_buffer(eb); } if (block1) - readahead_tree_block(root, block1); + readahead_tree_block(fs_info, block1); if (block2) - readahead_tree_block(root, block2); + readahead_tree_block(fs_info, block2); } @@ -2436,6 +2442,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, struct extent_buffer **eb_ret, int level, int slot, struct btrfs_key *key, u64 time_seq) { + struct btrfs_fs_info *fs_info = root->fs_info; u64 blocknr; u64 gen; struct extent_buffer *b = *eb_ret; @@ -2445,7 +2452,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, blocknr = btrfs_node_blockptr(b, slot); gen = btrfs_node_ptr_generation(b, slot); - tmp = btrfs_find_tree_block(root->fs_info, blocknr); + tmp = find_extent_buffer(fs_info, blocknr); if (tmp) { /* first we do an atomic uptodate check */ if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { @@ -2484,12 +2491,12 @@ read_block_for_search(struct btrfs_trans_handle *trans, free_extent_buffer(tmp); if (p->reada != READA_NONE) - reada_for_search(root, p, level, slot, key->objectid); + reada_for_search(fs_info, p, level, slot, key->objectid); btrfs_release_path(p); ret = -EAGAIN; - tmp = read_tree_block(root, blocknr, 0); + tmp = read_tree_block(fs_info, blocknr, 0); if (!IS_ERR(tmp)) { /* * If the read above didn't mark this buffer up to date, @@ -2521,9 +2528,11 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, struct extent_buffer *b, int level, int ins_len, int *write_lock_level) { + struct btrfs_fs_info *fs_info = root->fs_info; int ret; + if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >= - BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { + BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) { int sret; if (*write_lock_level < level + 1) { @@ -2533,7 +2542,7 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, } btrfs_set_path_blocking(p); - reada_for_balance(root, p, level); + reada_for_balance(fs_info, p, level); sret = split_node(trans, root, p, level); btrfs_clear_path_blocking(p, NULL, 0); @@ -2544,7 +2553,7 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, } b = p->nodes[level]; } else if (ins_len < 0 && btrfs_header_nritems(b) < - BTRFS_NODEPTRS_PER_BLOCK(root) / 2) { + BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) { int sret; if (*write_lock_level < level + 1) { @@ -2554,7 +2563,7 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans, } btrfs_set_path_blocking(p); - reada_for_balance(root, p, level); + reada_for_balance(fs_info, p, level); sret = balance_level(trans, root, p, level); btrfs_clear_path_blocking(p, NULL, 0); @@ -2663,6 +2672,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *b; int slot; int ret; @@ -2718,12 +2728,12 @@ again: * so we always do read locks */ if (p->need_commit_sem) - down_read(&root->fs_info->commit_root_sem); + down_read(&fs_info->commit_root_sem); b = root->commit_root; extent_buffer_get(b); level = btrfs_header_level(b); if (p->need_commit_sem) - up_read(&root->fs_info->commit_root_sem); + up_read(&fs_info->commit_root_sem); if 
(!p->skip_locking) btrfs_tree_read_lock(b); } else { @@ -2895,7 +2905,7 @@ cow_done: } else { p->slots[level] = slot; if (ins_len > 0 && - btrfs_leaf_free_space(root, b) < ins_len) { + btrfs_leaf_free_space(fs_info, b) < ins_len) { if (write_lock_level < 1) { write_lock_level = 1; btrfs_release_path(p); @@ -2946,6 +2956,7 @@ done: int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *p, u64 time_seq) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *b; int slot; int ret; @@ -3020,7 +3031,7 @@ again: btrfs_clear_path_blocking(p, b, BTRFS_READ_LOCK); } - b = tree_mod_log_rewind(root->fs_info, p, b, time_seq); + b = tree_mod_log_rewind(fs_info, p, b, time_seq); if (!b) { ret = -ENOMEM; goto done; @@ -3187,7 +3198,8 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, * error, and > 0 if there was no room in the left hand block. */ static int push_node_left(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *dst, + struct btrfs_fs_info *fs_info, + struct extent_buffer *dst, struct extent_buffer *src, int empty) { int push_items = 0; @@ -3197,7 +3209,7 @@ static int push_node_left(struct btrfs_trans_handle *trans, src_nritems = btrfs_header_nritems(src); dst_nritems = btrfs_header_nritems(dst); - push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; + push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; WARN_ON(btrfs_header_generation(src) != trans->transid); WARN_ON(btrfs_header_generation(dst) != trans->transid); @@ -3222,7 +3234,7 @@ static int push_node_left(struct btrfs_trans_handle *trans, } else push_items = min(src_nritems - 8, push_items); - ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0, + ret = tree_mod_log_eb_copy(fs_info, dst, src, dst_nritems, 0, push_items); if (ret) { btrfs_abort_transaction(trans, ret); @@ -3261,7 +3273,7 @@ static int push_node_left(struct btrfs_trans_handle *trans, * this will only push up to 1/2 the contents of the left node over */ static int balance_node_right(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, struct extent_buffer *dst, struct extent_buffer *src) { @@ -3276,7 +3288,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans, src_nritems = btrfs_header_nritems(src); dst_nritems = btrfs_header_nritems(dst); - push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; + push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems; if (push_items <= 0) return 1; @@ -3291,13 +3303,13 @@ static int balance_node_right(struct btrfs_trans_handle *trans, if (max_push < push_items) push_items = max_push; - tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems); + tree_mod_log_eb_move(fs_info, dst, push_items, 0, dst_nritems); memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), btrfs_node_key_ptr_offset(0), (dst_nritems) * sizeof(struct btrfs_key_ptr)); - ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0, + ret = tree_mod_log_eb_copy(fs_info, dst, src, 0, src_nritems - push_items, push_items); if (ret) { btrfs_abort_transaction(trans, ret); @@ -3328,6 +3340,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level) { + struct btrfs_fs_info *fs_info = root->fs_info; u64 lower_gen; struct extent_buffer *lower; struct extent_buffer *c; @@ -3348,9 +3361,9 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, if (IS_ERR(c)) return PTR_ERR(c); - 
root_add_used(root, root->nodesize); + root_add_used(root, fs_info->nodesize); - memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); + memzero_extent_buffer(c, 0, sizeof(struct btrfs_header)); btrfs_set_header_nritems(c, 1); btrfs_set_header_level(c, level); btrfs_set_header_bytenr(c, c->start); @@ -3358,11 +3371,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(c, root->root_key.objectid); - write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(), - BTRFS_FSID_SIZE); - - write_extent_buffer(c, root->fs_info->chunk_tree_uuid, - btrfs_header_chunk_tree_uuid(c), BTRFS_UUID_SIZE); + write_extent_buffer_fsid(c, fs_info->fsid); + write_extent_buffer_chunk_tree_uuid(c, fs_info->chunk_tree_uuid); btrfs_set_node_key(c, &lower_key, 0); btrfs_set_node_blockptr(c, 0, lower->start); @@ -3396,7 +3406,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, * blocknr is the block the key points to. */ static void insert_ptr(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct btrfs_disk_key *key, u64 bytenr, int slot, int level) { @@ -3409,10 +3419,10 @@ static void insert_ptr(struct btrfs_trans_handle *trans, lower = path->nodes[level]; nritems = btrfs_header_nritems(lower); BUG_ON(slot > nritems); - BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root)); + BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(fs_info)); if (slot != nritems) { if (level) - tree_mod_log_eb_move(root->fs_info, lower, slot + 1, + tree_mod_log_eb_move(fs_info, lower, slot + 1, slot, nritems - slot); memmove_extent_buffer(lower, btrfs_node_key_ptr_offset(slot + 1), @@ -3420,7 +3430,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans, (nritems - slot) * sizeof(struct btrfs_key_ptr)); } if (level) { - ret = tree_mod_log_insert_key(root->fs_info, lower, slot, + ret = tree_mod_log_insert_key(fs_info, lower, slot, MOD_LOG_KEY_ADD, GFP_NOFS); BUG_ON(ret < 0); } @@ -3445,6 +3455,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *c; struct extent_buffer *split; struct btrfs_disk_key disk_key; @@ -3472,7 +3483,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, ret = push_nodes_for_insert(trans, root, path, level); c = path->nodes[level]; if (!ret && btrfs_header_nritems(c) < - BTRFS_NODEPTRS_PER_BLOCK(root) - 3) + BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) return 0; if (ret < 0) return ret; @@ -3487,22 +3498,18 @@ static noinline int split_node(struct btrfs_trans_handle *trans, if (IS_ERR(split)) return PTR_ERR(split); - root_add_used(root, root->nodesize); + root_add_used(root, fs_info->nodesize); - memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); + memzero_extent_buffer(split, 0, sizeof(struct btrfs_header)); btrfs_set_header_level(split, btrfs_header_level(c)); btrfs_set_header_bytenr(split, split->start); btrfs_set_header_generation(split, trans->transid); btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(split, root->root_key.objectid); - write_extent_buffer(split, root->fs_info->fsid, - btrfs_header_fsid(), BTRFS_FSID_SIZE); - write_extent_buffer(split, root->fs_info->chunk_tree_uuid, - btrfs_header_chunk_tree_uuid(split), - BTRFS_UUID_SIZE); - - ret = 
tree_mod_log_eb_copy(root->fs_info, split, c, 0, - mid, c_nritems - mid); + write_extent_buffer_fsid(split, fs_info->fsid); + write_extent_buffer_chunk_tree_uuid(split, fs_info->chunk_tree_uuid); + + ret = tree_mod_log_eb_copy(fs_info, split, c, 0, mid, c_nritems - mid); if (ret) { btrfs_abort_transaction(trans, ret); return ret; @@ -3518,7 +3525,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); btrfs_mark_buffer_dirty(split); - insert_ptr(trans, root, path, &disk_key, split->start, + insert_ptr(trans, fs_info, path, &disk_key, split->start, path->slots[level + 1] + 1, level + 1); if (path->slots[level] >= mid) { @@ -3566,17 +3573,19 @@ static int leaf_space_used(struct extent_buffer *l, int start, int nr) * the start of the leaf data. IOW, how much room * the leaf has left for both items and data */ -noinline int btrfs_leaf_free_space(struct btrfs_root *root, +noinline int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf) { int nritems = btrfs_header_nritems(leaf); int ret; - ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems); + + ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems); if (ret < 0) { - btrfs_crit(root->fs_info, - "leaf free space ret %d, leaf data size %lu, used %d nritems %d", - ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root), - leaf_space_used(leaf, 0, nritems), nritems); + btrfs_crit(fs_info, + "leaf free space ret %d, leaf data size %lu, used %d nritems %d", + ret, + (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info), + leaf_space_used(leaf, 0, nritems), nritems); } return ret; } @@ -3586,7 +3595,7 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root, * right. We'll push up to and including min_slot, but no lower */ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, struct btrfs_path *path, int data_size, int empty, struct extent_buffer *right, @@ -3626,7 +3635,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (path->slots[0] > i) break; if (path->slots[0] == i) { - int space = btrfs_leaf_free_space(root, left); + int space = btrfs_leaf_free_space(fs_info, left); if (space + push_space * 2 > free_space) break; } @@ -3655,19 +3664,19 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, right_nritems = btrfs_header_nritems(right); push_space = btrfs_item_end_nr(left, left_nritems - push_items); - push_space -= leaf_data_end(root, left); + push_space -= leaf_data_end(fs_info, left); /* make room in the right data area */ - data_end = leaf_data_end(root, right); + data_end = leaf_data_end(fs_info, right); memmove_extent_buffer(right, btrfs_leaf_data(right) + data_end - push_space, btrfs_leaf_data(right) + data_end, - BTRFS_LEAF_DATA_SIZE(root) - data_end); + BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); /* copy from the left data area */ copy_extent_buffer(right, left, btrfs_leaf_data(right) + - BTRFS_LEAF_DATA_SIZE(root) - push_space, - btrfs_leaf_data(left) + leaf_data_end(root, left), + BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, + btrfs_leaf_data(left) + leaf_data_end(fs_info, left), push_space); memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), @@ -3682,7 +3691,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, /* update the item pointers */ right_nritems += push_items; btrfs_set_header_nritems(right, right_nritems); - push_space = BTRFS_LEAF_DATA_SIZE(root); + push_space = 
BTRFS_LEAF_DATA_SIZE(fs_info); for (i = 0; i < right_nritems; i++) { item = btrfs_item_nr(i); push_space -= btrfs_token_item_size(right, item, &token); @@ -3695,7 +3704,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (left_nritems) btrfs_mark_buffer_dirty(left); else - clean_tree_block(trans, root->fs_info, left); + clean_tree_block(trans, fs_info, left); btrfs_mark_buffer_dirty(right); @@ -3707,7 +3716,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (path->slots[0] >= left_nritems) { path->slots[0] -= left_nritems; if (btrfs_header_nritems(path->nodes[0]) == 0) - clean_tree_block(trans, root->fs_info, path->nodes[0]); + clean_tree_block(trans, fs_info, path->nodes[0]); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3739,6 +3748,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root int min_data_size, int data_size, int empty, u32 min_slot) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *left = path->nodes[0]; struct extent_buffer *right; struct extent_buffer *upper; @@ -3757,7 +3767,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); - right = read_node_slot(root, upper, slot + 1); + right = read_node_slot(fs_info, upper, slot + 1); /* * slot + 1 is not valid or we fail to read the right node, * no big deal, just return. @@ -3768,7 +3778,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_tree_lock(right); btrfs_set_lock_blocking(right); - free_space = btrfs_leaf_free_space(root, right); + free_space = btrfs_leaf_free_space(fs_info, right); if (free_space < data_size) goto out_unlock; @@ -3778,7 +3788,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (ret) goto out_unlock; - free_space = btrfs_leaf_free_space(root, right); + free_space = btrfs_leaf_free_space(fs_info, right); if (free_space < data_size) goto out_unlock; @@ -3799,7 +3809,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root return 0; } - return __push_leaf_right(trans, root, path, min_data_size, empty, + return __push_leaf_right(trans, fs_info, path, min_data_size, empty, right, free_space, left_nritems, min_slot); out_unlock: btrfs_tree_unlock(right); @@ -3816,7 +3826,7 @@ out_unlock: * items */ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, struct btrfs_path *path, int data_size, int empty, struct extent_buffer *left, int free_space, u32 right_nritems, @@ -3849,7 +3859,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, if (path->slots[0] < i) break; if (path->slots[0] == i) { - int space = btrfs_leaf_free_space(root, right); + int space = btrfs_leaf_free_space(fs_info, right); if (space + push_space * 2 > free_space) break; } @@ -3878,11 +3888,11 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, btrfs_item_nr_offset(0), push_items * sizeof(struct btrfs_item)); - push_space = BTRFS_LEAF_DATA_SIZE(root) - + push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_offset_nr(right, push_items - 1); copy_extent_buffer(left, right, btrfs_leaf_data(left) + - leaf_data_end(root, left) - push_space, + leaf_data_end(fs_info, left) - push_space, btrfs_leaf_data(right) + btrfs_item_offset_nr(right, push_items - 1), push_space); @@ -3897,7 +3907,7 @@ static 
noinline int __push_leaf_left(struct btrfs_trans_handle *trans, ioff = btrfs_token_item_offset(left, item, &token); btrfs_set_token_item_offset(left, item, - ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size), + ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size), &token); } btrfs_set_header_nritems(left, old_left_nritems + push_items); @@ -3909,11 +3919,11 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, if (push_items < right_nritems) { push_space = btrfs_item_offset_nr(right, push_items - 1) - - leaf_data_end(root, right); + leaf_data_end(fs_info, right); memmove_extent_buffer(right, btrfs_leaf_data(right) + - BTRFS_LEAF_DATA_SIZE(root) - push_space, + BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, btrfs_leaf_data(right) + - leaf_data_end(root, right), push_space); + leaf_data_end(fs_info, right), push_space); memmove_extent_buffer(right, btrfs_item_nr_offset(0), btrfs_item_nr_offset(push_items), @@ -3922,7 +3932,7 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, } right_nritems -= push_items; btrfs_set_header_nritems(right, right_nritems); - push_space = BTRFS_LEAF_DATA_SIZE(root); + push_space = BTRFS_LEAF_DATA_SIZE(fs_info); for (i = 0; i < right_nritems; i++) { item = btrfs_item_nr(i); @@ -3935,10 +3945,10 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, if (right_nritems) btrfs_mark_buffer_dirty(right); else - clean_tree_block(trans, root->fs_info, right); + clean_tree_block(trans, fs_info, right); btrfs_item_key(right, &disk_key, 0); - fixup_low_keys(root->fs_info, path, &disk_key, 1); + fixup_low_keys(fs_info, path, &disk_key, 1); /* then fixup the leaf pointer in the path */ if (path->slots[0] < push_items) { @@ -3972,6 +3982,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int min_data_size, int data_size, int empty, u32 max_slot) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *right = path->nodes[0]; struct extent_buffer *left; int slot; @@ -3991,7 +4002,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_assert_tree_locked(path->nodes[1]); - left = read_node_slot(root, path->nodes[1], slot - 1); + left = read_node_slot(fs_info, path->nodes[1], slot - 1); /* * slot - 1 is not valid or we fail to read the left node, * no big deal, just return. @@ -4002,7 +4013,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_tree_lock(left); btrfs_set_lock_blocking(left); - free_space = btrfs_leaf_free_space(root, left); + free_space = btrfs_leaf_free_space(fs_info, left); if (free_space < data_size) { ret = 1; goto out; @@ -4018,13 +4029,13 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } - free_space = btrfs_leaf_free_space(root, left); + free_space = btrfs_leaf_free_space(fs_info, left); if (free_space < data_size) { ret = 1; goto out; } - return __push_leaf_left(trans, root, path, min_data_size, + return __push_leaf_left(trans, fs_info, path, min_data_size, empty, left, free_space, right_nritems, max_slot); out: @@ -4038,7 +4049,7 @@ out: * available for the resulting leaf level of the path. 
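
The offset rebase in __push_leaf_left is the subtle part of these hunks: leaf data grows downward from BTRFS_LEAF_DATA_SIZE(fs_info), so items copied from the head of the right leaf must have their offsets shifted by the gap between that boundary and the left leaf's old data end. A minimal sketch of the loop, with the token accessors of the real code swapped for the plain ones:

	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
		struct btrfs_item *item = btrfs_item_nr(i);
		u32 ioff = btrfs_item_offset_nr(left, i);

		/* data now ends where the left leaf's data used to begin */
		btrfs_set_item_offset(left, item,
			ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) -
				old_left_item_size));
	}
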
*/ static noinline void copy_for_split(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct extent_buffer *l, struct extent_buffer *right, @@ -4054,19 +4065,18 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, nritems = nritems - mid; btrfs_set_header_nritems(right, nritems); - data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l); + data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(fs_info, l); copy_extent_buffer(right, l, btrfs_item_nr_offset(0), btrfs_item_nr_offset(mid), nritems * sizeof(struct btrfs_item)); copy_extent_buffer(right, l, - btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) - + btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size, btrfs_leaf_data(l) + - leaf_data_end(root, l), data_copy_size); + leaf_data_end(fs_info, l), data_copy_size); - rt_data_off = BTRFS_LEAF_DATA_SIZE(root) - - btrfs_item_end_nr(l, mid); + rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid); for (i = 0; i < nritems; i++) { struct btrfs_item *item = btrfs_item_nr(i); @@ -4079,7 +4089,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(l, mid); btrfs_item_key(right, &disk_key, 0); - insert_ptr(trans, root, path, &disk_key, right->start, + insert_ptr(trans, fs_info, path, &disk_key, right->start, path->slots[1] + 1, 1); btrfs_mark_buffer_dirty(right); @@ -4115,6 +4125,7 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans, struct btrfs_path *path, int data_size) { + struct btrfs_fs_info *fs_info = root->fs_info; int ret; int progress = 0; int slot; @@ -4123,7 +4134,7 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans, slot = path->slots[0]; if (slot < btrfs_header_nritems(path->nodes[0])) - space_needed -= btrfs_leaf_free_space(root, path->nodes[0]); + space_needed -= btrfs_leaf_free_space(fs_info, path->nodes[0]); /* * try to push all the items after our slot into the @@ -4144,7 +4155,7 @@ static noinline int push_for_double_split(struct btrfs_trans_handle *trans, if (path->slots[0] == 0 || path->slots[0] == nritems) return 0; - if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) + if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size) return 0; /* try to push all the items before our slot into the next leaf */ @@ -4189,7 +4200,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, l = path->nodes[0]; slot = path->slots[0]; if (extend && data_size + btrfs_item_size_nr(l, slot) + - sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info)) return -EOVERFLOW; /* first try to make some room by pushing left and right */ @@ -4197,7 +4208,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, int space_needed = data_size; if (slot < btrfs_header_nritems(l)) - space_needed -= btrfs_leaf_free_space(root, l); + space_needed -= btrfs_leaf_free_space(fs_info, l); wret = push_leaf_right(trans, root, path, space_needed, space_needed, 0, 0); @@ -4212,7 +4223,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, l = path->nodes[0]; /* did the pushes work? 
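
The two sizes copy_for_split derives are complementary, and worked numbers make the arithmetic concrete. Assuming hypothetical values for a 16KiB node (data area 16283 bytes), with item mid's data topping out at offset 9000 and the leaf's current data end at 4000:

	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(fs_info, l);
		/* = 9000 - 4000 = 5000 data bytes for items [mid, nritems) */
	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
		/* = 16283 - 9000 = 7283: each copied item's offset grows by
		 * this much, so the moved block ends flush against the right
		 * leaf's data boundary */
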
*/ - if (btrfs_leaf_free_space(root, l) >= data_size) + if (btrfs_leaf_free_space(fs_info, l) >= data_size) return 0; } @@ -4231,14 +4242,14 @@ again: if (mid <= slot) { if (nritems == 1 || leaf_space_used(l, mid, nritems - mid) + data_size > - BTRFS_LEAF_DATA_SIZE(root)) { + BTRFS_LEAF_DATA_SIZE(fs_info)) { if (slot >= nritems) { split = 0; } else { mid = slot; if (mid != nritems && leaf_space_used(l, mid, nritems - mid) + - data_size > BTRFS_LEAF_DATA_SIZE(root)) { + data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) { if (data_size && !tried_avoid_double) goto push_for_double; split = 2; @@ -4247,7 +4258,7 @@ again: } } else { if (leaf_space_used(l, 0, mid) + data_size > - BTRFS_LEAF_DATA_SIZE(root)) { + BTRFS_LEAF_DATA_SIZE(fs_info)) { if (!extend && data_size && slot == 0) { split = 0; } else if ((extend || !data_size) && slot == 0) { @@ -4256,7 +4267,7 @@ again: mid = slot; if (mid != nritems && leaf_space_used(l, mid, nritems - mid) + - data_size > BTRFS_LEAF_DATA_SIZE(root)) { + data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) { if (data_size && !tried_avoid_double) goto push_for_double; split = 2; @@ -4275,26 +4286,22 @@ again: if (IS_ERR(right)) return PTR_ERR(right); - root_add_used(root, root->nodesize); + root_add_used(root, fs_info->nodesize); - memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); + memzero_extent_buffer(right, 0, sizeof(struct btrfs_header)); btrfs_set_header_bytenr(right, right->start); btrfs_set_header_generation(right, trans->transid); btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(right, root->root_key.objectid); btrfs_set_header_level(right, 0); - write_extent_buffer(right, fs_info->fsid, - btrfs_header_fsid(), BTRFS_FSID_SIZE); - - write_extent_buffer(right, fs_info->chunk_tree_uuid, - btrfs_header_chunk_tree_uuid(right), - BTRFS_UUID_SIZE); + write_extent_buffer_fsid(right, fs_info->fsid); + write_extent_buffer_chunk_tree_uuid(right, fs_info->chunk_tree_uuid); if (split == 0) { if (mid <= slot) { btrfs_set_header_nritems(right, 0); - insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1] + 1, 1); + insert_ptr(trans, fs_info, path, &disk_key, + right->start, path->slots[1] + 1, 1); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -4302,8 +4309,8 @@ again: path->slots[1] += 1; } else { btrfs_set_header_nritems(right, 0); - insert_ptr(trans, root, path, &disk_key, right->start, - path->slots[1], 1); + insert_ptr(trans, fs_info, path, &disk_key, + right->start, path->slots[1], 1); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -4319,7 +4326,7 @@ again: return ret; } - copy_for_split(trans, root, path, l, right, slot, mid, nritems); + copy_for_split(trans, fs_info, path, l, right, slot, mid, nritems); if (split == 2) { BUG_ON(num_doubles != 0); @@ -4332,7 +4339,7 @@ again: push_for_double: push_for_double_split(trans, root, path, data_size); tried_avoid_double = 1; - if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size) + if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= data_size) return 0; goto again; } @@ -4341,6 +4348,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int ins_len) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; @@ -4354,7 +4362,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, 
BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && key.type != BTRFS_EXTENT_CSUM_KEY); - if (btrfs_leaf_free_space(root, leaf) >= ins_len) + if (btrfs_leaf_free_space(fs_info, leaf) >= ins_len) return 0; item_size = btrfs_item_size_nr(leaf, path->slots[0]); @@ -4381,7 +4389,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, goto err; /* the leaf has changed, it now has room. return now */ - if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len) + if (btrfs_leaf_free_space(fs_info, path->nodes[0]) >= ins_len) goto err; if (key.type == BTRFS_EXTENT_DATA_KEY) { @@ -4405,7 +4413,7 @@ err: } static noinline int split_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct btrfs_key *new_key, unsigned long split_offset) @@ -4421,7 +4429,7 @@ static noinline int split_item(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; leaf = path->nodes[0]; - BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); + BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < sizeof(struct btrfs_item)); btrfs_set_path_blocking(path); @@ -4470,7 +4478,7 @@ static noinline int split_item(struct btrfs_trans_handle *trans, item_size - split_offset); btrfs_mark_buffer_dirty(leaf); - BUG_ON(btrfs_leaf_free_space(root, leaf) < 0); + BUG_ON(btrfs_leaf_free_space(fs_info, leaf) < 0); kfree(buf); return 0; } @@ -4502,7 +4510,7 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = split_item(trans, root, path, new_key, split_offset); + ret = split_item(trans, root->fs_info, path, new_key, split_offset); return ret; } @@ -4548,8 +4556,8 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, * off the end of the item or if we shift the item to chop bytes off * the front. */ -void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, - u32 new_size, int from_end) +void btrfs_truncate_item(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u32 new_size, int from_end) { int slot; struct extent_buffer *leaf; @@ -4572,7 +4580,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, return; nritems = btrfs_header_nritems(leaf); - data_end = leaf_data_end(root, leaf); + data_end = leaf_data_end(fs_info, leaf); old_data_start = btrfs_item_offset_nr(leaf, slot); @@ -4631,15 +4639,15 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, btrfs_set_disk_key_offset(&disk_key, offset + size_diff); btrfs_set_item_key(leaf, &disk_key, slot); if (slot == 0) - fixup_low_keys(root->fs_info, path, &disk_key, 1); + fixup_low_keys(fs_info, path, &disk_key, 1); } item = btrfs_item_nr(slot); btrfs_set_item_size(leaf, item, new_size); btrfs_mark_buffer_dirty(leaf); - if (btrfs_leaf_free_space(root, leaf) < 0) { - btrfs_print_leaf(root, leaf); + if (btrfs_leaf_free_space(fs_info, leaf) < 0) { + btrfs_print_leaf(fs_info, leaf); BUG(); } } @@ -4647,7 +4655,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, /* * make the item pointed to by the path bigger, data_size is the added size. 
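
split_item, converted in this hunk, is the one place an item's payload is cut in two in place. A condensed sketch of its core, with allocation-failure handling, key setup, and the item-table reshuffle elided:

	buf = kmalloc(item_size, GFP_NOFS);
	/* stash the whole payload before the item table is reshuffled */
	read_extent_buffer(leaf, buf,
			   btrfs_item_ptr_offset(leaf, path->slots[0]),
			   item_size);
	/* ...second item header inserted here, then write both halves: */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, path->slots[0] + 1),
			    item_size - split_offset);
	kfree(buf);
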
*/ -void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, +void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u32 data_size) { int slot; @@ -4665,10 +4673,10 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); - data_end = leaf_data_end(root, leaf); + data_end = leaf_data_end(fs_info, leaf); - if (btrfs_leaf_free_space(root, leaf) < data_size) { - btrfs_print_leaf(root, leaf); + if (btrfs_leaf_free_space(fs_info, leaf) < data_size) { + btrfs_print_leaf(fs_info, leaf); BUG(); } slot = path->slots[0]; @@ -4676,9 +4684,9 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, BUG_ON(slot < 0); if (slot >= nritems) { - btrfs_print_leaf(root, leaf); - btrfs_crit(root->fs_info, "slot %d too large, nritems %d", - slot, nritems); + btrfs_print_leaf(fs_info, leaf); + btrfs_crit(fs_info, "slot %d too large, nritems %d", + slot, nritems); BUG_ON(1); } @@ -4706,8 +4714,8 @@ void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, btrfs_set_item_size(leaf, item, old_size + data_size); btrfs_mark_buffer_dirty(leaf); - if (btrfs_leaf_free_space(root, leaf) < 0) { - btrfs_print_leaf(root, leaf); + if (btrfs_leaf_free_space(fs_info, leaf) < 0) { + btrfs_print_leaf(fs_info, leaf); BUG(); } } @@ -4721,6 +4729,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *cpu_key, u32 *data_size, u32 total_data, u32 total_size, int nr) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_item *item; int i; u32 nritems; @@ -4732,7 +4741,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, if (path->slots[0] == 0) { btrfs_cpu_key_to_disk(&disk_key, cpu_key); - fixup_low_keys(root->fs_info, path, &disk_key, 1); + fixup_low_keys(fs_info, path, &disk_key, 1); } btrfs_unlock_up_safe(path, 1); @@ -4742,13 +4751,12 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, slot = path->slots[0]; nritems = btrfs_header_nritems(leaf); - data_end = leaf_data_end(root, leaf); + data_end = leaf_data_end(fs_info, leaf); - if (btrfs_leaf_free_space(root, leaf) < total_size) { - btrfs_print_leaf(root, leaf); - btrfs_crit(root->fs_info, - "not enough freespace need %u have %d", - total_size, btrfs_leaf_free_space(root, leaf)); + if (btrfs_leaf_free_space(fs_info, leaf) < total_size) { + btrfs_print_leaf(fs_info, leaf); + btrfs_crit(fs_info, "not enough freespace need %u have %d", + total_size, btrfs_leaf_free_space(fs_info, leaf)); BUG(); } @@ -4756,9 +4764,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, unsigned int old_data = btrfs_item_end_nr(leaf, slot); if (old_data < data_end) { - btrfs_print_leaf(root, leaf); - btrfs_crit(root->fs_info, - "slot %d old_data %d data_end %d", + btrfs_print_leaf(fs_info, leaf); + btrfs_crit(fs_info, "slot %d old_data %d data_end %d", slot, old_data, data_end); BUG_ON(1); } @@ -4800,8 +4807,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, btrfs_set_header_nritems(leaf, nritems + nr); btrfs_mark_buffer_dirty(leaf); - if (btrfs_leaf_free_space(root, leaf) < 0) { - btrfs_print_leaf(root, leaf); + if (btrfs_leaf_free_space(fs_info, leaf) < 0) { + btrfs_print_leaf(fs_info, leaf); BUG(); } } @@ -4876,6 +4883,7 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, int level, 
int slot) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *parent = path->nodes[level]; u32 nritems; int ret; @@ -4883,7 +4891,7 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, nritems = btrfs_header_nritems(parent); if (slot != nritems - 1) { if (level) - tree_mod_log_eb_move(root->fs_info, parent, slot, + tree_mod_log_eb_move(fs_info, parent, slot, slot + 1, nritems - slot - 1); memmove_extent_buffer(parent, btrfs_node_key_ptr_offset(slot), @@ -4891,7 +4899,7 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, sizeof(struct btrfs_key_ptr) * (nritems - slot - 1)); } else if (level) { - ret = tree_mod_log_insert_key(root->fs_info, parent, slot, + ret = tree_mod_log_insert_key(fs_info, parent, slot, MOD_LOG_KEY_REMOVE, GFP_NOFS); BUG_ON(ret < 0); } @@ -4906,7 +4914,7 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_disk_key disk_key; btrfs_node_key(parent, &disk_key, 0); - fixup_low_keys(root->fs_info, path, &disk_key, level + 1); + fixup_low_keys(fs_info, path, &disk_key, level + 1); } btrfs_mark_buffer_dirty(parent); } @@ -4948,6 +4956,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans, int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int slot, int nr) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; struct btrfs_item *item; u32 last_off; @@ -4969,7 +4978,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, nritems = btrfs_header_nritems(leaf); if (slot + nr != nritems) { - int data_end = leaf_data_end(root, leaf); + int data_end = leaf_data_end(fs_info, leaf); memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + data_end + dsize, @@ -4999,7 +5008,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_header_level(leaf, 0); } else { btrfs_set_path_blocking(path); - clean_tree_block(trans, root->fs_info, leaf); + clean_tree_block(trans, fs_info, leaf); btrfs_del_leaf(trans, root, path, leaf); } } else { @@ -5008,11 +5017,11 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_disk_key disk_key; btrfs_item_key(leaf, &disk_key, 0); - fixup_low_keys(root->fs_info, path, &disk_key, 1); + fixup_low_keys(fs_info, path, &disk_key, 1); } /* delete the leaf if it is mostly empty */ - if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) { + if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) { /* push_leaf_left fixes the path. 
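
btrfs_del_items, where these fs_info conversions land, also carries the tree's shrink heuristic: a leaf that drops below about a third full gets merged away rather than left sparse. A compressed sketch of that tail, with locking, error handling, and the path fixups left out:

	u32 used = leaf_space_used(leaf, 0, btrfs_header_nritems(leaf));

	if (btrfs_header_nritems(leaf) == 0) {
		/* empty: unlink from the parent and free the block */
		btrfs_del_leaf(trans, root, path, leaf);
	} else if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
		/* push contents left, then right; a leaf that drains
		 * completely is then deleted as above */
		push_leaf_left(trans, root, path, 1, 1, 1, (u32)-1);
		push_leaf_right(trans, root, path, 1, 1, 1, 0);
	}
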
* make sure the path still points to our leaf * for possible call to del_ptr below @@ -5132,6 +5141,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, struct btrfs_path *path, u64 min_trans) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *cur; struct btrfs_key found_key; int slot; @@ -5208,7 +5218,7 @@ find_next_key: goto out; } btrfs_set_path_blocking(path); - cur = read_node_slot(root, cur, slot); + cur = read_node_slot(fs_info, cur, slot); if (IS_ERR(cur)) { ret = PTR_ERR(cur); goto out; @@ -5231,14 +5241,14 @@ out: return ret; } -static int tree_move_down(struct btrfs_root *root, +static int tree_move_down(struct btrfs_fs_info *fs_info, struct btrfs_path *path, int *level, int root_level) { struct extent_buffer *eb; BUG_ON(*level == 0); - eb = read_node_slot(root, path->nodes[*level], path->slots[*level]); + eb = read_node_slot(fs_info, path->nodes[*level], path->slots[*level]); if (IS_ERR(eb)) return PTR_ERR(eb); @@ -5248,7 +5258,7 @@ static int tree_move_down(struct btrfs_root *root, return 0; } -static int tree_move_next_or_upnext(struct btrfs_root *root, +static int tree_move_next_or_upnext(struct btrfs_fs_info *fs_info, struct btrfs_path *path, int *level, int root_level) { @@ -5279,7 +5289,7 @@ static int tree_move_next_or_upnext(struct btrfs_root *root, * Returns 1 if it had to move up and next. 0 is returned if it moved only next * or down. */ -static int tree_advance(struct btrfs_root *root, +static int tree_advance(struct btrfs_fs_info *fs_info, struct btrfs_path *path, int *level, int root_level, int allow_down, @@ -5288,9 +5298,10 @@ static int tree_advance(struct btrfs_root *root, int ret; if (*level == 0 || !allow_down) { - ret = tree_move_next_or_upnext(root, path, level, root_level); + ret = tree_move_next_or_upnext(fs_info, path, level, + root_level); } else { - ret = tree_move_down(root, path, level, root_level); + ret = tree_move_down(fs_info, path, level, root_level); } if (ret >= 0) { if (*level == 0) @@ -5303,8 +5314,7 @@ static int tree_advance(struct btrfs_root *root, return ret; } -static int tree_compare_item(struct btrfs_root *left_root, - struct btrfs_path *left_path, +static int tree_compare_item(struct btrfs_path *left_path, struct btrfs_path *right_path, char *tmp_buf) { @@ -5349,6 +5359,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, struct btrfs_root *right_root, btrfs_changed_cb_t changed_cb, void *ctx) { + struct btrfs_fs_info *fs_info = left_root->fs_info; int ret; int cmp; struct btrfs_path *left_path = NULL; @@ -5380,9 +5391,9 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; } - tmp_buf = kmalloc(left_root->nodesize, GFP_KERNEL | __GFP_NOWARN); + tmp_buf = kmalloc(fs_info->nodesize, GFP_KERNEL | __GFP_NOWARN); if (!tmp_buf) { - tmp_buf = vmalloc(left_root->nodesize); + tmp_buf = vmalloc(fs_info->nodesize); if (!tmp_buf) { ret = -ENOMEM; goto out; @@ -5430,7 +5441,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, * the right if possible or go up and right. 
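
One detail of btrfs_compare_trees deserves a note: tree_compare_item needs a nodesize-sized scratch buffer, and the allocation uses the classic try-kmalloc-then-vmalloc fallback (later kernels fold this idiom into a single kvmalloc() call):

	tmp_buf = kmalloc(fs_info->nodesize, GFP_KERNEL | __GFP_NOWARN);
	if (!tmp_buf) {
		/* no contiguous pages to be had: take the vmap route */
		tmp_buf = vmalloc(fs_info->nodesize);
		if (!tmp_buf) {
			ret = -ENOMEM;
			goto out;
		}
	}
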
*/ - down_read(&left_root->fs_info->commit_root_sem); + down_read(&fs_info->commit_root_sem); left_level = btrfs_header_level(left_root->commit_root); left_root_level = left_level; left_path->nodes[left_level] = left_root->commit_root; @@ -5440,7 +5451,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, right_root_level = right_level; right_path->nodes[right_level] = right_root->commit_root; extent_buffer_get(right_path->nodes[right_level]); - up_read(&left_root->fs_info->commit_root_sem); + up_read(&fs_info->commit_root_sem); if (left_level == 0) btrfs_item_key_to_cpu(left_path->nodes[left_level], @@ -5460,7 +5471,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, while (1) { if (advance_left && !left_end_reached) { - ret = tree_advance(left_root, left_path, &left_level, + ret = tree_advance(fs_info, left_path, &left_level, left_root_level, advance_left != ADVANCE_ONLY_NEXT, &left_key); @@ -5471,7 +5482,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, advance_left = 0; } if (advance_right && !right_end_reached) { - ret = tree_advance(right_root, right_path, &right_level, + ret = tree_advance(fs_info, right_path, &right_level, right_root_level, advance_right != ADVANCE_ONLY_NEXT, &right_key); @@ -5535,8 +5546,8 @@ int btrfs_compare_trees(struct btrfs_root *left_root, enum btrfs_compare_tree_result result; WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); - ret = tree_compare_item(left_root, left_path, - right_path, tmp_buf); + ret = tree_compare_item(left_path, right_path, + tmp_buf); if (ret) result = BTRFS_COMPARE_TREE_CHANGED; else diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0b8ce2b9f7d0..6a823719b6c5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -90,9 +90,6 @@ static const int btrfs_csum_sizes[] = { 4 }; /* four bytes for CRC32 */ #define BTRFS_EMPTY_DIR_SIZE 0 -/* specific to btrfs_map_block(), therefore not in include/linux/blk_types.h */ -#define REQ_GET_READ_MIRRORS (1 << 30) - /* ioprio of readahead is set to idle */ #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) @@ -340,7 +337,7 @@ struct btrfs_path { unsigned int need_commit_sem:1; unsigned int skip_release_on_error:1; }; -#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \ +#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ sizeof(struct btrfs_item)) struct btrfs_dev_replace { u64 replace_state; /* see #define above */ @@ -429,6 +426,10 @@ struct btrfs_space_info { struct list_head ro_bgs; struct list_head priority_tickets; struct list_head tickets; + /* + * tickets_id just indicates the next ticket will be handled, so note + * it's not stored per ticket. 
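
The new tickets_id comment documents a subtlety of the ticketed ENOSPC code: tickets themselves carry no id; the counter only advances as the queue head is handled. A rough sketch of how the async-reclaim worker is expected to consume it (the locals last_tickets_id and flush_state are assumptions, not shown in this hunk):

	spin_lock(&space_info->lock);
	if (last_tickets_id == space_info->tickets_id) {
		/* nothing was handled since the last pass: escalate */
		flush_state++;
	} else {
		/* progress was made: note it and restart from a light flush */
		last_tickets_id = space_info->tickets_id;
	}
	spin_unlock(&space_info->lock);
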
+ */ u64 tickets_id; struct rw_semaphore groups_sem; @@ -518,7 +519,7 @@ struct btrfs_io_ctl { void *cur, *orig; struct page *page; struct page **pages; - struct btrfs_root *root; + struct btrfs_fs_info *fs_info; struct inode *inode; unsigned long size; int index; @@ -798,7 +799,6 @@ struct btrfs_fs_info { spinlock_t super_lock; struct btrfs_super_block *super_copy; struct btrfs_super_block *super_for_commit; - struct block_device *__bdev; struct super_block *sb; struct inode *btree_inode; struct backing_dev_info bdi; @@ -1084,8 +1084,18 @@ struct btrfs_fs_info { /* Used to record internally whether fs has been frozen */ int fs_frozen; + + /* Cached block sizes */ + u32 nodesize; + u32 sectorsize; + u32 stripesize; }; +static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + struct btrfs_subvolume_writers { struct percpu_counter counter; wait_queue_head_t wait; @@ -1159,14 +1169,6 @@ struct btrfs_root { u64 objectid; u64 last_trans; - /* data allocations are done in sectorsize units */ - u32 sectorsize; - - /* node allocations are done in nodesize units */ - u32 nodesize; - - u32 stripesize; - u32 type; u64 highest_objectid; @@ -1250,38 +1252,42 @@ struct btrfs_root { /* For qgroup metadata space reserve */ atomic_t qgroup_meta_rsv; }; +static inline u32 btrfs_inode_sectorsize(const struct inode *inode) +{ + return btrfs_sb(inode->i_sb)->sectorsize; +} static inline u32 __BTRFS_LEAF_DATA_SIZE(u32 blocksize) { return blocksize - sizeof(struct btrfs_header); } -static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_root *root) +static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) { - return __BTRFS_LEAF_DATA_SIZE(root->nodesize); + return __BTRFS_LEAF_DATA_SIZE(info->nodesize); } -static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_root *root) +static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_fs_info *info) { - return BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); + return BTRFS_LEAF_DATA_SIZE(info) - sizeof(struct btrfs_item); } -static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_root *root) +static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_fs_info *info) { - return BTRFS_LEAF_DATA_SIZE(root) / sizeof(struct btrfs_key_ptr); + return BTRFS_LEAF_DATA_SIZE(info) / sizeof(struct btrfs_key_ptr); } #define BTRFS_FILE_EXTENT_INLINE_DATA_START \ (offsetof(struct btrfs_file_extent_item, disk_bytenr)) -static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_root *root) +static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_fs_info *info) { - return BTRFS_MAX_ITEM_SIZE(root) - + return BTRFS_MAX_ITEM_SIZE(info) - BTRFS_FILE_EXTENT_INLINE_DATA_START; } -static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_root *root) +static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) { - return BTRFS_MAX_ITEM_SIZE(root) - sizeof(struct btrfs_dir_item); + return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item); } /* @@ -1343,12 +1349,13 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_root *root) #ifdef CONFIG_BTRFS_DEBUG static inline int -btrfs_should_fragment_free_space(struct btrfs_root *root, - struct btrfs_block_group_cache *block_group) +btrfs_should_fragment_free_space(struct btrfs_block_group_cache *block_group) { - return (btrfs_test_opt(root->fs_info, FRAGMENT_METADATA) && + struct btrfs_fs_info *fs_info = block_group->fs_info; + + return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) && block_group->flags & BTRFS_BLOCK_GROUP_METADATA) 
|| - (btrfs_test_opt(root->fs_info, FRAGMENT_DATA) && + (btrfs_test_opt(fs_info, FRAGMENT_DATA) && block_group->flags & BTRFS_BLOCK_GROUP_DATA); } #endif @@ -2210,6 +2217,8 @@ btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, cpu->target = le64_to_cpu(disk->target); cpu->flags = le64_to_cpu(disk->flags); cpu->limit = le64_to_cpu(disk->limit); + cpu->stripes_min = le32_to_cpu(disk->stripes_min); + cpu->stripes_max = le32_to_cpu(disk->stripes_max); } static inline void @@ -2228,6 +2237,8 @@ btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, disk->target = cpu_to_le64(cpu->target); disk->flags = cpu_to_le64(cpu->flags); disk->limit = cpu_to_le64(cpu->limit); + disk->stripes_min = cpu_to_le32(cpu->stripes_min); + disk->stripes_max = cpu_to_le32(cpu->stripes_max); } /* struct btrfs_super_block */ @@ -2299,13 +2310,13 @@ static inline unsigned long btrfs_leaf_data(struct extent_buffer *l) * this returns the address of the start of the last item, * which is the stop of the leaf data stack */ -static inline unsigned int leaf_data_end(struct btrfs_root *root, +static inline unsigned int leaf_data_end(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf) { u32 nr = btrfs_header_nritems(leaf); if (nr == 0) - return BTRFS_LEAF_DATA_SIZE(root); + return BTRFS_LEAF_DATA_SIZE(fs_info); return btrfs_item_offset_nr(leaf, nr - 1); } @@ -2501,11 +2512,6 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, struct btrfs_dev_replace_item, cursor_right, 64); -static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) -{ - return sb->s_fs_info; -} - /* helper function to cast into the data area of the leaf. */ #define btrfs_item_ptr(leaf, slot, type) \ ((type *)(btrfs_leaf_data(leaf) + \ @@ -2528,28 +2534,28 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) /* extent-tree.c */ -u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes); +u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes); -static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, +static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_fs_info *fs_info, unsigned num_items) { - return root->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; + return fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; } /* * Doing a truncate won't result in new nodes or leaves, just what we need for * COW. 
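
With nodesize cached in fs_info, the derived limits above reduce to fixed arithmetic per filesystem. Worked numbers, assuming the mkfs default 16KiB nodesize (struct btrfs_header is 101 bytes, struct btrfs_item 25, struct btrfs_key_ptr 33):

	/* BTRFS_LEAF_DATA_SIZE(fs_info)     = 16384 - 101 = 16283 bytes  */
	/* BTRFS_NODEPTRS_PER_BLOCK(fs_info) = 16283 / 33  = 493 pointers */
	/* BTRFS_MAX_ITEM_SIZE(fs_info)      = 16283 - 25  = 16258 bytes  */
	/* btrfs_calc_trans_metadata_size(fs_info, 1)                     */
	/*     = 16384 * BTRFS_MAX_LEVEL * 2 = 16384 * 8 * 2 = 256KiB     */
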
*/ -static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root, +static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info, unsigned num_items) { - return root->nodesize * BTRFS_MAX_LEVEL * num_items; + return fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; } int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root); + struct btrfs_fs_info *fs_info); int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root); + struct btrfs_fs_info *fs_info); void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start); void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg); @@ -2558,18 +2564,18 @@ void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, unsigned long count); -int btrfs_async_run_delayed_refs(struct btrfs_root *root, + struct btrfs_fs_info *fs_info, unsigned long count); +int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info, unsigned long count, u64 transid, int wait); -int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, + struct btrfs_fs_info *fs_info, u64 bytenr, u64 offset, int metadata, u64 *refs, u64 *flags); -int btrfs_pin_extent(struct btrfs_root *root, +int btrfs_pin_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num, int reserved); -int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, +int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes); -int btrfs_exclude_logged_extents(struct btrfs_root *root, +int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info, struct extent_buffer *eb); int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -2590,12 +2596,11 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf, u64 parent, int last_ref); int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 root_objectid, u64 owner, u64 offset, u64 ram_bytes, struct btrfs_key *ins); int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, u64 root_objectid, u64 owner, u64 offset, struct btrfs_key *ins); int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, @@ -2606,52 +2611,52 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 flags, int level, int is_data); int btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset); -int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len, - int delalloc); -int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, +int 
btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, + u64 start, u64 len, int delalloc); +int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root); + struct btrfs_fs_info *fs_info); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root); + struct btrfs_fs_info *fs_info); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset); int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, - struct btrfs_root *root); + struct btrfs_fs_info *fs_info); int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, - struct btrfs_root *root); + struct btrfs_fs_info *fs_info); int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); + struct btrfs_fs_info *fs_info); +int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr); int btrfs_free_block_groups(struct btrfs_fs_info *info); -int btrfs_read_block_groups(struct btrfs_root *root); -int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr); +int btrfs_read_block_groups(struct btrfs_fs_info *info); +int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr); int btrfs_make_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytes_used, + struct btrfs_fs_info *fs_info, u64 bytes_used, u64 type, u64 chunk_objectid, u64 chunk_offset, u64 size); struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( struct btrfs_fs_info *fs_info, const u64 chunk_offset); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 group_start, + struct btrfs_fs_info *fs_info, u64 group_start, struct extent_map *em); void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache); void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *cache); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, - struct btrfs_root *root); + struct btrfs_fs_info *fs_info); u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); @@ -2681,7 +2686,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len); void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start, u64 len); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, - struct btrfs_root *root); + struct btrfs_fs_info *fs_info); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct inode *inode); @@ -2690,7 +2695,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, u64 *qgroup_reserved, bool use_global_rsv); -void btrfs_subvolume_release_metadata(struct btrfs_root *root, +void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv, u64 qgroup_reserved); int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes); @@ -2698,16 +2703,15 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len); void 
btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len); void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type); -struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, +struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info, unsigned short type); -void btrfs_free_block_rsv(struct btrfs_root *root, +void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv); int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes, enum btrfs_reserve_flush_enum flush); -int btrfs_block_rsv_check(struct btrfs_root *root, - struct btrfs_block_rsv *block_rsv, int min_factor); +int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor); int btrfs_block_rsv_refill(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 min_reserved, enum btrfs_reserve_flush_enum flush); @@ -2717,22 +2721,21 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *dest, u64 num_bytes, int min_factor); -void btrfs_block_rsv_release(struct btrfs_root *root, +void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, u64 num_bytes); int btrfs_inc_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache); -void btrfs_dec_block_group_ro(struct btrfs_root *root, - struct btrfs_block_group_cache *cache); +void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); -int btrfs_error_unpin_extent_range(struct btrfs_root *root, +int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); -int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, +int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes); int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 type); -int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range); + struct btrfs_fs_info *fs_info, u64 type); +int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); int btrfs_init_space_info(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, @@ -2742,8 +2745,7 @@ int btrfs_start_write_no_snapshoting(struct btrfs_root *root); void btrfs_end_write_no_snapshoting(struct btrfs_root *root); void btrfs_wait_for_snapshot_creation(struct btrfs_root *root); void check_system_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const u64 type); + struct btrfs_fs_info *fs_info, const u64 type); u64 add_new_free_space(struct btrfs_block_group_cache *block_group, struct btrfs_fs_info *info, u64 start, u64 end); @@ -2793,10 +2795,10 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, struct extent_buffer **cow_ret, u64 new_root_objectid); int btrfs_block_can_be_shared(struct btrfs_root *root, struct extent_buffer *buf); -void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path, +void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u32 data_size); -void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, - u32 new_size, int from_end); +void btrfs_truncate_item(struct btrfs_fs_info *fs_info, + struct 
btrfs_path *path, u32 new_size, int from_end); int btrfs_split_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -2872,7 +2874,8 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) { return btrfs_next_old_item(root, p, 0); } -int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); +int btrfs_leaf_free_space(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf); int __must_check btrfs_drop_snapshot(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, int update_ref, int for_reloc); @@ -2898,10 +2901,9 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) * anything except sleeping. This function is used to check the status of * the fs. */ -static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root) +static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) { - return (root->fs_info->sb->s_flags & MS_RDONLY || - btrfs_fs_closing(root->fs_info)); + return fs_info->sb->s_flags & MS_RDONLY || btrfs_fs_closing(fs_info); } static inline void free_fs_info(struct btrfs_fs_info *fs_info) @@ -2931,11 +2933,11 @@ int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); /* root-item.c */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *tree_root, + struct btrfs_fs_info *fs_info, u64 root_id, u64 ref_id, u64 dirid, u64 sequence, const char *name, int name_len); int btrfs_del_root_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *tree_root, + struct btrfs_fs_info *fs_info, u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, const char *name, int name_len); int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -2950,7 +2952,7 @@ int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key, struct btrfs_path *path, struct btrfs_root_item *root_item, struct btrfs_key *root_key); -int btrfs_find_orphan_roots(struct btrfs_root *tree_root); +int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info); void btrfs_set_root_node(struct btrfs_root_item *item, struct extent_buffer *node); void btrfs_check_and_init_root_item(struct btrfs_root_item *item); @@ -2959,10 +2961,10 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, /* uuid-tree.c */ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, - struct btrfs_root *uuid_root, u8 *uuid, u8 type, + struct btrfs_fs_info *fs_info, u8 *uuid, u8 type, u64 subid); int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans, - struct btrfs_root *uuid_root, u8 *uuid, u8 type, + struct btrfs_fs_info *fs_info, u8 *uuid, u8 type, u64 subid); int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, int (*check_func)(struct btrfs_fs_info *, u8 *, u8, @@ -3004,10 +3006,10 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 dir, const char *name, u16 name_len, int mod); -int verify_dir_item(struct btrfs_root *root, +int verify_dir_item(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, struct btrfs_dir_item *dir_item); -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, struct btrfs_path *path, const char *name, int name_len); @@ -3051,11 +3053,10 @@ int btrfs_find_name_in_ext_backref(struct btrfs_path *path, /* file-item.c */ struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle 
*trans, - struct btrfs_root *root, u64 bytenr, u64 len); -int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, - struct bio *bio, u32 *dst); -int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, - struct bio *bio, u64 logical_offset); + struct btrfs_fs_info *fs_info, u64 bytenr, u64 len); +int btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst); +int btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, + u64 logical_offset); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, @@ -3069,8 +3070,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); -int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, - struct bio *bio, u64 file_start, int contig); +int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, + u64 file_start, int contig); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); void btrfs_extent_item_to_extent_map(struct inode *inode, @@ -3173,7 +3174,7 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); void btrfs_invalidate_inodes(struct btrfs_root *root); void btrfs_add_delayed_iput(struct inode *inode); -void btrfs_run_delayed_iputs(struct btrfs_root *root); +void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info); int btrfs_prealloc_file_range(struct inode *inode, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); @@ -3227,14 +3228,10 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct inode *inode, u64 start, u64 end); int btrfs_release_file(struct inode *inode, struct file *file); -int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, - struct page **pages, size_t num_pages, - loff_t pos, size_t write_bytes, +int btrfs_dirty_pages(struct inode *inode, struct page **pages, + size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); -ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - size_t len, unsigned int flags); int btrfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len); @@ -3252,7 +3249,7 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info); ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); /* super.c */ -int btrfs_parse_options(struct btrfs_root *root, char *options, +int btrfs_parse_options(struct btrfs_fs_info *info, char *options, unsigned long new_flags); int btrfs_sync_fs(struct super_block *sb, int wait); @@ -3445,9 +3442,14 @@ do { \ /* Report first abort since mount */ \ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ &((trans)->fs_info->fs_state))) { \ - WARN(1, KERN_DEBUG \ - "BTRFS: Transaction aborted (error %d)\n", \ - (errno)); \ + if ((errno) != -EIO) { \ + WARN(1, KERN_DEBUG \ + "BTRFS: Transaction aborted (error %d)\n", \ + (errno)); \ + } else { \ + pr_debug("BTRFS: Transaction aborted (error %d)\n", \ + (errno)); \ + } \ } \ __btrfs_abort_transaction((trans), __func__, \ __LINE__, (errno)); \ @@ -3609,7 +3611,7 @@ static inline int 
btrfs_init_acl(struct btrfs_trans_handle *trans, #endif /* relocation.c */ -int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); +int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start); int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, @@ -3628,12 +3630,12 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, int readonly, int is_dev_replace); -void btrfs_scrub_pause(struct btrfs_root *root); -void btrfs_scrub_continue(struct btrfs_root *root); +void btrfs_scrub_pause(struct btrfs_fs_info *fs_info); +void btrfs_scrub_continue(struct btrfs_fs_info *fs_info); int btrfs_scrub_cancel(struct btrfs_fs_info *info); int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info, struct btrfs_device *dev); -int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, +int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, struct btrfs_scrub_progress *progress); /* dev-replace.c */ @@ -3648,7 +3650,7 @@ static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) /* reada.c */ struct reada_control { - struct btrfs_root *root; /* tree to prefetch */ + struct btrfs_fs_info *fs_info; /* tree to prefetch */ struct btrfs_key key_start; struct btrfs_key key_end; /* exclusive */ atomic_t elems; @@ -3660,7 +3662,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, int btrfs_reada_wait(void *handle); void btrfs_reada_detach(void *handle); int btree_readahead_hook(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, u64 start, int err); + struct extent_buffer *eb, int err); static inline int is_fstree(u64 rootid) { diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0fcf5f25d524..80982a83c9fd 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -72,12 +72,6 @@ static inline int btrfs_is_continuous_delayed_item( return 0; } -static inline struct btrfs_delayed_root *btrfs_get_delayed_root( - struct btrfs_root *root) -{ - return root->fs_info->delayed_root; -} - static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode) { struct btrfs_inode *btrfs_inode = BTRFS_I(inode); @@ -535,7 +529,7 @@ static struct btrfs_delayed_item *__btrfs_next_delayed_item( } static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, struct btrfs_delayed_item *item) { struct btrfs_block_rsv *src_rsv; @@ -547,12 +541,12 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, return 0; src_rsv = trans->block_rsv; - dst_rsv = &root->fs_info->delayed_block_rsv; + dst_rsv = &fs_info->delayed_block_rsv; - num_bytes = btrfs_calc_trans_metadata_size(root, 1); + num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1); if (!ret) { - trace_btrfs_space_reservation(root->fs_info, "delayed_item", + trace_btrfs_space_reservation(fs_info, "delayed_item", item->key.objectid, num_bytes, 1); item->bytes_reserved = num_bytes; @@ -561,7 +555,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, return ret; } -static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, +static void btrfs_delayed_item_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_delayed_item *item) { 
struct btrfs_block_rsv *rsv; @@ -569,11 +563,11 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, if (!item->bytes_reserved) return; - rsv = &root->fs_info->delayed_block_rsv; - trace_btrfs_space_reservation(root->fs_info, "delayed_item", + rsv = &fs_info->delayed_block_rsv; + trace_btrfs_space_reservation(fs_info, "delayed_item", item->key.objectid, item->bytes_reserved, 0); - btrfs_block_rsv_release(root, rsv, + btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved); } @@ -583,6 +577,7 @@ static int btrfs_delayed_inode_reserve_metadata( struct inode *inode, struct btrfs_delayed_node *node) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *src_rsv; struct btrfs_block_rsv *dst_rsv; u64 num_bytes; @@ -590,9 +585,9 @@ static int btrfs_delayed_inode_reserve_metadata( bool release = false; src_rsv = trans->block_rsv; - dst_rsv = &root->fs_info->delayed_block_rsv; + dst_rsv = &fs_info->delayed_block_rsv; - num_bytes = btrfs_calc_trans_metadata_size(root, 1); + num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1); /* * If our block_rsv is the delalloc block reserve then check and see if @@ -640,7 +635,7 @@ static int btrfs_delayed_inode_reserve_metadata( ret = -ENOSPC; if (!ret) { node->bytes_reserved = num_bytes; - trace_btrfs_space_reservation(root->fs_info, + trace_btrfs_space_reservation(fs_info, "delayed_inode", btrfs_ino(inode), num_bytes, 1); @@ -664,21 +659,21 @@ static int btrfs_delayed_inode_reserve_metadata( * how block rsvs. work. */ if (!ret) { - trace_btrfs_space_reservation(root->fs_info, "delayed_inode", + trace_btrfs_space_reservation(fs_info, "delayed_inode", btrfs_ino(inode), num_bytes, 1); node->bytes_reserved = num_bytes; } if (release) { - trace_btrfs_space_reservation(root->fs_info, "delalloc", + trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), num_bytes, 0); - btrfs_block_rsv_release(root, src_rsv, num_bytes); + btrfs_block_rsv_release(fs_info, src_rsv, num_bytes); } return ret; } -static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root, +static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info, struct btrfs_delayed_node *node) { struct btrfs_block_rsv *rsv; @@ -686,10 +681,10 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root, if (!node->bytes_reserved) return; - rsv = &root->fs_info->delayed_block_rsv; - trace_btrfs_space_reservation(root->fs_info, "delayed_inode", + rsv = &fs_info->delayed_block_rsv; + trace_btrfs_space_reservation(fs_info, "delayed_inode", node->inode_id, node->bytes_reserved, 0); - btrfs_block_rsv_release(root, rsv, + btrfs_block_rsv_release(fs_info, rsv, node->bytes_reserved); node->bytes_reserved = 0; } @@ -702,6 +697,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_delayed_item *item) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_delayed_item *curr, *next; int free_space; int total_data_size = 0, total_size = 0; @@ -718,7 +714,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, BUG_ON(!path->nodes[0]); leaf = path->nodes[0]; - free_space = btrfs_leaf_free_space(root, leaf); + free_space = btrfs_leaf_free_space(fs_info, leaf); INIT_LIST_HEAD(&head); next = item; @@ -791,7 +787,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, curr->data_len); slot++; - btrfs_delayed_item_release_metadata(root, curr); + btrfs_delayed_item_release_metadata(fs_info, curr); list_del(&curr->tree_list); 
btrfs_release_delayed_item(curr); @@ -813,6 +809,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_delayed_item *delayed_item) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; char *ptr; int ret; @@ -830,7 +827,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, delayed_item->data_len); btrfs_mark_buffer_dirty(leaf); - btrfs_delayed_item_release_metadata(root, delayed_item); + btrfs_delayed_item_release_metadata(fs_info, delayed_item); return 0; } @@ -882,6 +879,7 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_delayed_item *item) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_delayed_item *curr, *next; struct extent_buffer *leaf; struct btrfs_key key; @@ -931,7 +929,7 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, goto out; list_for_each_entry_safe(curr, next, &head, tree_list) { - btrfs_delayed_item_release_metadata(root, curr); + btrfs_delayed_item_release_metadata(fs_info, curr); list_del(&curr->tree_list); btrfs_release_delayed_item(curr); } @@ -1017,6 +1015,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_delayed_node *node) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; struct btrfs_inode_item *inode_item; struct extent_buffer *leaf; @@ -1073,7 +1072,7 @@ out: no_iref: btrfs_release_path(path); err_out: - btrfs_delayed_inode_release_metadata(root, node); + btrfs_delayed_inode_release_metadata(fs_info, node); btrfs_release_delayed_inode(node); return ret; @@ -1138,7 +1137,7 @@ __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, * outstanding delayed items cleaned up. 
*/ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int nr) + struct btrfs_fs_info *fs_info, int nr) { struct btrfs_delayed_root *delayed_root; struct btrfs_delayed_node *curr_node, *prev_node; @@ -1156,9 +1155,9 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, path->leave_spinning = 1; block_rsv = trans->block_rsv; - trans->block_rsv = &root->fs_info->delayed_block_rsv; + trans->block_rsv = &fs_info->delayed_block_rsv; - delayed_root = btrfs_get_delayed_root(root); + delayed_root = fs_info->delayed_root; curr_node = btrfs_first_delayed_node(delayed_root); while (curr_node && (!count || (count && nr--))) { @@ -1185,15 +1184,15 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, } int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root) + struct btrfs_fs_info *fs_info) { - return __btrfs_run_delayed_items(trans, root, -1); + return __btrfs_run_delayed_items(trans, fs_info, -1); } int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int nr) + struct btrfs_fs_info *fs_info, int nr) { - return __btrfs_run_delayed_items(trans, root, nr); + return __btrfs_run_delayed_items(trans, fs_info, nr); } int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, @@ -1236,6 +1235,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, int btrfs_commit_inode_delayed_inode(struct inode *inode) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_trans_handle *trans; struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); struct btrfs_path *path; @@ -1267,7 +1267,7 @@ int btrfs_commit_inode_delayed_inode(struct inode *inode) path->leave_spinning = 1; block_rsv = trans->block_rsv; - trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv; + trans->block_rsv = &fs_info->delayed_block_rsv; mutex_lock(&delayed_node->mutex); if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) @@ -1280,8 +1280,8 @@ int btrfs_commit_inode_delayed_inode(struct inode *inode) btrfs_free_path(path); trans->block_rsv = block_rsv; trans_out: - btrfs_end_transaction(trans, delayed_node->root); - btrfs_btree_balance_dirty(delayed_node->root); + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(fs_info); out: btrfs_release_delayed_node(delayed_node); @@ -1345,15 +1345,16 @@ again: __btrfs_commit_inode_delayed_items(trans, path, delayed_node); trans->block_rsv = block_rsv; - btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty_nodelay(root); + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty_nodelay(root->fs_info); release_path: btrfs_release_path(path); total_done++; btrfs_release_prepared_delayed_node(delayed_node); - if (async_work->nr == 0 || total_done < async_work->nr) + if ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK) || + total_done < async_work->nr) goto again; free_path: @@ -1369,7 +1370,8 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, { struct btrfs_async_delayed_work *async_work; - if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) + if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND || + btrfs_workqueue_normal_congested(fs_info->delayed_workers)) return 0; async_work = kmalloc(sizeof(*async_work), GFP_NOFS); @@ -1385,11 +1387,9 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, return 0; } -void btrfs_assert_delayed_root_empty(struct 
btrfs_root *root) +void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info) { - struct btrfs_delayed_root *delayed_root; - delayed_root = btrfs_get_delayed_root(root); - WARN_ON(btrfs_first_delayed_node(delayed_root)); + WARN_ON(btrfs_first_delayed_node(fs_info->delayed_root)); } static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) @@ -1405,12 +1405,9 @@ static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) return 0; } -void btrfs_balance_delayed_items(struct btrfs_root *root) +void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info) { - struct btrfs_delayed_root *delayed_root; - struct btrfs_fs_info *fs_info = root->fs_info; - - delayed_root = btrfs_get_delayed_root(root); + struct btrfs_delayed_root *delayed_root = fs_info->delayed_root; if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) return; @@ -1435,8 +1432,9 @@ void btrfs_balance_delayed_items(struct btrfs_root *root) /* Will return 0 or -ENOMEM */ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const char *name, - int name_len, struct inode *dir, + struct btrfs_fs_info *fs_info, + const char *name, int name_len, + struct inode *dir, struct btrfs_disk_key *disk_key, u8 type, u64 index) { @@ -1467,7 +1465,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, btrfs_set_stack_dir_type(dir_item, type); memcpy((char *)(dir_item + 1), name, name_len); - ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item); + ret = btrfs_delayed_item_reserve_metadata(trans, fs_info, delayed_item); /* * we have reserved enough space when we start a new transaction, * so reserving metadata failure is impossible @@ -1478,7 +1476,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, mutex_lock(&delayed_node->mutex); ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); if (unlikely(ret)) { - btrfs_err(root->fs_info, + btrfs_err(fs_info, "err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", name_len, name, delayed_node->root->objectid, delayed_node->inode_id, ret); @@ -1491,7 +1489,7 @@ release_node: return ret; } -static int btrfs_delete_delayed_insertion_item(struct btrfs_root *root, +static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info, struct btrfs_delayed_node *node, struct btrfs_key *key) { @@ -1504,15 +1502,15 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_root *root, return 1; } - btrfs_delayed_item_release_metadata(root, item); + btrfs_delayed_item_release_metadata(fs_info, item); btrfs_release_delayed_item(item); mutex_unlock(&node->mutex); return 0; } int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *dir, - u64 index) + struct btrfs_fs_info *fs_info, + struct inode *dir, u64 index) { struct btrfs_delayed_node *node; struct btrfs_delayed_item *item; @@ -1527,7 +1525,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, item_key.type = BTRFS_DIR_INDEX_KEY; item_key.offset = index; - ret = btrfs_delete_delayed_insertion_item(root, node, &item_key); + ret = btrfs_delete_delayed_insertion_item(fs_info, node, &item_key); if (!ret) goto end; @@ -1539,7 +1537,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, item->key = item_key; - ret = btrfs_delayed_item_reserve_metadata(trans, root, item); + ret = btrfs_delayed_item_reserve_metadata(trans, 
fs_info, item); /* * we have reserved enough space when we start a new transaction, * so reserving metadata failure is impossible. @@ -1549,7 +1547,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, mutex_lock(&node->mutex); ret = __btrfs_add_delayed_deletion_item(node, item); if (unlikely(ret)) { - btrfs_err(root->fs_info, + btrfs_err(fs_info, "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", index, node->root->objectid, node->inode_id, ret); BUG(); @@ -1686,7 +1684,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list, * */ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, - struct list_head *ins_list, bool *emitted) + struct list_head *ins_list) { struct btrfs_dir_item *di; struct btrfs_delayed_item *curr, *next; @@ -1730,7 +1728,6 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, if (over) return 1; - *emitted = true; } return 0; } @@ -1861,6 +1858,7 @@ release_node: int btrfs_delayed_delete_inode_ref(struct inode *inode) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_delayed_node *delayed_node; /* @@ -1868,8 +1866,7 @@ int btrfs_delayed_delete_inode_ref(struct inode *inode) * leads to enospc problems. This means we also can't do * delayed inode refs */ - if (test_bit(BTRFS_FS_LOG_RECOVERING, - &BTRFS_I(inode)->root->fs_info->flags)) + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) return -EAGAIN; delayed_node = btrfs_get_or_create_delayed_node(inode); @@ -1896,7 +1893,7 @@ int btrfs_delayed_delete_inode_ref(struct inode *inode) set_bit(BTRFS_DELAYED_NODE_DEL_IREF, &delayed_node->flags); delayed_node->count++; - atomic_inc(&BTRFS_I(inode)->root->fs_info->delayed_root->items); + atomic_inc(&fs_info->delayed_root->items); release_node: mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_node(delayed_node); @@ -1906,12 +1903,13 @@ release_node: static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) { struct btrfs_root *root = delayed_node->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_delayed_item *curr_item, *prev_item; mutex_lock(&delayed_node->mutex); curr_item = __btrfs_first_delayed_insertion_item(delayed_node); while (curr_item) { - btrfs_delayed_item_release_metadata(root, curr_item); + btrfs_delayed_item_release_metadata(fs_info, curr_item); prev_item = curr_item; curr_item = __btrfs_next_delayed_item(prev_item); btrfs_release_delayed_item(prev_item); @@ -1919,7 +1917,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) curr_item = __btrfs_first_delayed_deletion_item(delayed_node); while (curr_item) { - btrfs_delayed_item_release_metadata(root, curr_item); + btrfs_delayed_item_release_metadata(fs_info, curr_item); prev_item = curr_item; curr_item = __btrfs_next_delayed_item(prev_item); btrfs_release_delayed_item(prev_item); @@ -1929,7 +1927,7 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) btrfs_release_delayed_iref(delayed_node); if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { - btrfs_delayed_inode_release_metadata(root, delayed_node); + btrfs_delayed_inode_release_metadata(fs_info, delayed_node); btrfs_release_delayed_inode(delayed_node); } mutex_unlock(&delayed_node->mutex); @@ -1976,14 +1974,11 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) } } -void btrfs_destroy_delayed_inodes(struct btrfs_root *root) +void btrfs_destroy_delayed_inodes(struct 
btrfs_fs_info *fs_info) { - struct btrfs_delayed_root *delayed_root; struct btrfs_delayed_node *curr_node, *prev_node; - delayed_root = btrfs_get_delayed_root(root); - - curr_node = btrfs_first_delayed_node(delayed_root); + curr_node = btrfs_first_delayed_node(fs_info->delayed_root); while (curr_node) { __btrfs_kill_delayed_node(curr_node); diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 2495b3d4075f..8a2bf5e3e4cf 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -99,23 +99,24 @@ static inline void btrfs_init_delayed_root( } int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const char *name, - int name_len, struct inode *dir, + struct btrfs_fs_info *fs_info, + const char *name, int name_len, + struct inode *dir, struct btrfs_disk_key *disk_key, u8 type, u64 index); int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *dir, - u64 index); + struct btrfs_fs_info *fs_info, + struct inode *dir, u64 index); int btrfs_inode_delayed_dir_index_count(struct inode *inode); int btrfs_run_delayed_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root); + struct btrfs_fs_info *fs_info); int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int nr); + struct btrfs_fs_info *fs_info, int nr); -void btrfs_balance_delayed_items(struct btrfs_root *root); +void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info); int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, struct inode *inode); @@ -134,7 +135,7 @@ int btrfs_delayed_delete_inode_ref(struct inode *inode); void btrfs_kill_all_delayed_nodes(struct btrfs_root *root); /* Used for clean the transaction */ -void btrfs_destroy_delayed_inodes(struct btrfs_root *root); +void btrfs_destroy_delayed_inodes(struct btrfs_fs_info *fs_info); /* Used for readdir() */ bool btrfs_readdir_get_delayed_items(struct inode *inode, @@ -146,13 +147,13 @@ void btrfs_readdir_put_delayed_items(struct inode *inode, int btrfs_should_delete_dir_index(struct list_head *del_list, u64 index); int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, - struct list_head *ins_list, bool *emitted); + struct list_head *ins_list); /* for init */ int __init btrfs_delayed_inode_init(void); void btrfs_delayed_inode_exit(void); /* for debugging */ -void btrfs_assert_delayed_root_empty(struct btrfs_root *root); +void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 8d93854a4b4f..ef724a5fc30e 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -189,6 +189,8 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, } else { assert_spin_locked(&head->lock); list_del(&ref->list); + if (!list_empty(&ref->add_list)) + list_del(&ref->add_list); } ref->in_tree = 0; btrfs_put_delayed_ref(ref); @@ -431,6 +433,15 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, exist->action = ref->action; mod = -exist->ref_mod; exist->ref_mod = ref->ref_mod; + if (ref->action == BTRFS_ADD_DELAYED_REF) + list_add_tail(&exist->add_list, + &href->ref_add_list); + else if (ref->action == BTRFS_DROP_DELAYED_REF) { + ASSERT(!list_empty(&exist->add_list)); + list_del(&exist->add_list); + } else { + ASSERT(0); + } } else mod = -ref->ref_mod; } @@ -444,6 +455,8 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, add_tail: list_add_tail(&ref->list, &href->ref_list); + if 
(ref->action == BTRFS_ADD_DELAYED_REF) + list_add_tail(&ref->add_list, &href->ref_add_list); atomic_inc(&root->num_entries); trans->delayed_ref_updates++; spin_unlock(&href->lock); @@ -590,6 +603,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, head_ref->must_insert_reserved = must_insert_reserved; head_ref->is_data = is_data; INIT_LIST_HEAD(&head_ref->ref_list); + INIT_LIST_HEAD(&head_ref->ref_add_list); head_ref->processing = 0; head_ref->total_ref_mod = count_mod; head_ref->qgroup_reserved = 0; @@ -606,7 +620,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, qrecord->num_bytes = num_bytes; qrecord->old_roots = NULL; - if(btrfs_qgroup_insert_dirty_extent_nolock(fs_info, + if(btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, qrecord)) kfree(qrecord); } @@ -671,6 +685,8 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; ref->seq = seq; + INIT_LIST_HEAD(&ref->list); + INIT_LIST_HEAD(&ref->add_list); full_ref = btrfs_delayed_node_to_tree_ref(ref); full_ref->parent = parent; @@ -726,6 +742,8 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, ref->is_head = 0; ref->in_tree = 1; ref->seq = seq; + INIT_LIST_HEAD(&ref->list); + INIT_LIST_HEAD(&ref->add_list); full_ref = btrfs_delayed_node_to_data_ref(ref); full_ref->parent = parent; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 43f3629760e9..50947b5a9152 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -34,14 +34,14 @@ * ref_head. Must clean this mess up later. */ struct btrfs_delayed_ref_node { - /* - * ref_head use rb tree, stored in ref_root->href. - * indexed by bytenr - */ - struct rb_node rb_node; - /*data/tree ref use list, stored in ref_head->ref_list. */ struct list_head list; + /* + * If action is BTRFS_ADD_DELAYED_REF, also link this node to + * ref_head->ref_add_list, then we do not need to iterate the + * whole ref_head->ref_list to find BTRFS_ADD_DELAYED_REF nodes. + */ + struct list_head add_list; /* the starting bytenr of the extent */ u64 bytenr; @@ -99,6 +99,8 @@ struct btrfs_delayed_ref_head { spinlock_t lock; struct list_head ref_list; + /* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. 
*/ + struct list_head ref_add_list; struct rb_node href_node; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 05169ef30596..5de280b9ad73 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -142,7 +142,7 @@ no_valid_dev_replace_entry_found: * missing */ if (!dev_replace->srcdev && - !btrfs_test_opt(dev_root->fs_info, DEGRADED)) { + !btrfs_test_opt(fs_info, DEGRADED)) { ret = -EIO; btrfs_warn(fs_info, "cannot mount because device replace operation is ongoing and"); @@ -151,7 +151,7 @@ no_valid_dev_replace_entry_found: src_devid); } if (!dev_replace->tgtdev && - !btrfs_test_opt(dev_root->fs_info, DEGRADED)) { + !btrfs_test_opt(fs_info, DEGRADED)) { ret = -EIO; btrfs_warn(fs_info, "cannot mount because device replace operation is ongoing and"); @@ -304,11 +304,11 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info) dev_replace->cursor_left_last_write_of_item; } -int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name, +int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, char *tgtdev_name, u64 srcdevid, char *srcdev_name, int read_src) { + struct btrfs_root *root = fs_info->dev_root; struct btrfs_trans_handle *trans; - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; int ret; struct btrfs_device *tgt_device = NULL; @@ -316,14 +316,14 @@ int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name, /* the disk copy procedure reuses the scrub code */ mutex_lock(&fs_info->volume_mutex); - ret = btrfs_find_device_by_devspec(root, srcdevid, + ret = btrfs_find_device_by_devspec(fs_info, srcdevid, srcdev_name, &src_device); if (ret) { mutex_unlock(&fs_info->volume_mutex); return ret; } - ret = btrfs_init_dev_replace_tgtdev(root, tgtdev_name, + ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name, src_device, &tgt_device); mutex_unlock(&fs_info->volume_mutex); if (ret) @@ -335,7 +335,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name, */ trans = btrfs_attach_transaction(root); if (!IS_ERR(trans)) { - ret = btrfs_commit_transaction(trans, root); + ret = btrfs_commit_transaction(trans); if (ret) return ret; } else if (PTR_ERR(trans) != -ENOENT) { @@ -387,7 +387,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name, if (ret) btrfs_err(fs_info, "kobj add dev failed %d", ret); - btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); /* force writing the updated state information to disk */ trans = btrfs_start_transaction(root, 0); @@ -397,7 +397,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name, goto leave; } - ret = btrfs_commit_transaction(trans, root); + ret = btrfs_commit_transaction(trans); WARN_ON(ret); /* the disk copy procedure reuses the scrub code */ @@ -422,7 +422,7 @@ leave: return ret; } -int btrfs_dev_replace_by_ioctl(struct btrfs_root *root, +int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args) { int ret; @@ -439,7 +439,7 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_root *root, args->start.tgtdev_name[0] == '\0') return -EINVAL; - ret = btrfs_dev_replace_start(root, args->start.tgtdev_name, + ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name, args->start.srcdevid, args->start.srcdev_name, args->start.cont_reading_from_srcdev_mode); @@ -501,25 +501,25 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * flush all outstanding I/O and 
inode extent mappings before the * copy operation is declared as being finished */ - ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1); + ret = btrfs_start_delalloc_roots(fs_info, 0, -1); if (ret) { mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; } - btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1); + btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1); trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return PTR_ERR(trans); } - ret = btrfs_commit_transaction(trans, root); + ret = btrfs_commit_transaction(trans); WARN_ON(ret); mutex_lock(&uuid_mutex); /* keep away write_all_supers() during the finishing procedure */ - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); - mutex_lock(&root->fs_info->chunk_mutex); + mutex_lock(&fs_info->fs_devices->device_list_mutex); + mutex_lock(&fs_info->chunk_mutex); btrfs_dev_replace_lock(dev_replace, 1); dev_replace->replace_state = scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED @@ -535,15 +535,15 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, src_device, tgt_device); } else { - btrfs_err_in_rcu(root->fs_info, - "btrfs_scrub_dev(%s, %llu, %s) failed %d", - src_device->missing ? "<missing disk>" : - rcu_str_deref(src_device->name), - src_device->devid, - rcu_str_deref(tgt_device->name), scrub_ret); + btrfs_err_in_rcu(fs_info, + "btrfs_scrub_dev(%s, %llu, %s) failed %d", + src_device->missing ? "<missing disk>" : + rcu_str_deref(src_device->name), + src_device->devid, + rcu_str_deref(tgt_device->name), scrub_ret); btrfs_dev_replace_unlock(dev_replace, 1); - mutex_unlock(&root->fs_info->chunk_mutex); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_info->chunk_mutex); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); mutex_unlock(&uuid_mutex); if (tgt_device) btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); @@ -552,12 +552,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, return scrub_ret; } - btrfs_info_in_rcu(root->fs_info, - "dev_replace from %s (devid %llu) to %s finished", - src_device->missing ? "<missing disk>" : - rcu_str_deref(src_device->name), - src_device->devid, - rcu_str_deref(tgt_device->name)); + btrfs_info_in_rcu(fs_info, + "dev_replace from %s (devid %llu) to %s finished", + src_device->missing ? "<missing disk>" : + rcu_str_deref(src_device->name), + src_device->devid, + rcu_str_deref(tgt_device->name)); tgt_device->is_tgtdev_for_dev_replace = 0; tgt_device->devid = src_device->devid; src_device->devid = BTRFS_DEV_REPLACE_DEVID; @@ -592,8 +592,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * superblock is scratched out so that it is no longer marked to * belong to this filesystem. 
*/ - mutex_unlock(&root->fs_info->chunk_mutex); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_info->chunk_mutex); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); mutex_unlock(&uuid_mutex); /* replace the sysfs entry */ @@ -603,7 +603,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, /* write back the superblocks */ trans = btrfs_start_transaction(root, 0); if (!IS_ERR(trans)) - btrfs_commit_transaction(trans, root); + btrfs_commit_transaction(trans); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); @@ -718,7 +718,7 @@ static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info) mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return PTR_ERR(trans); } - ret = btrfs_commit_transaction(trans, root); + ret = btrfs_commit_transaction(trans); WARN_ON(ret); if (tgt_device) btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index e922b42d91df..54ea12bda15b 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -25,9 +25,9 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info); int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info); -int btrfs_dev_replace_by_ioctl(struct btrfs_root *root, +int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); -int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name, +int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, char *tgtdev_name, u64 srcdevid, char *srcdev_name, int read_src); void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 0dc1a033275e..b039fe0c751a 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -38,6 +38,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle const char *name, int name_len) { + struct btrfs_fs_info *fs_info = root->fs_info; int ret; char *ptr; struct btrfs_item *item; @@ -46,10 +47,10 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); if (ret == -EEXIST) { struct btrfs_dir_item *di; - di = btrfs_match_dir_item_name(root, path, name, name_len); + di = btrfs_match_dir_item_name(fs_info, path, name, name_len); if (di) return ERR_PTR(-EEXIST); - btrfs_extend_item(root, path, data_size); + btrfs_extend_item(fs_info, path, data_size); } else if (ret < 0) return ERR_PTR(ret); WARN_ON(ret > 0); @@ -79,7 +80,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; u32 data_size; - BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)); + BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root->fs_info)); key.objectid = objectid; key.type = BTRFS_XATTR_ITEM_KEY; @@ -172,8 +173,9 @@ second_insert: } btrfs_release_path(path); - ret2 = btrfs_insert_delayed_dir_index(trans, root, name, name_len, dir, - &disk_key, type, index); + ret2 = btrfs_insert_delayed_dir_index(trans, root->fs_info, name, + name_len, dir, &disk_key, type, + index); out_free: btrfs_free_path(path); if (ret) @@ -210,7 +212,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, if (ret > 0) return NULL; - return btrfs_match_dir_item_name(root, path, name, name_len); + return 
btrfs_match_dir_item_name(root->fs_info, path, name, name_len); } int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, @@ -246,7 +248,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, } /* we found an item, look for our name in the item */ - di = btrfs_match_dir_item_name(root, path, name, name_len); + di = btrfs_match_dir_item_name(root->fs_info, path, name, name_len); if (di) { /* our exact name was found */ ret = -EEXIST; @@ -261,7 +263,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, leaf = path->nodes[0]; slot = path->slots[0]; if (data_size + btrfs_item_size_nr(leaf, slot) + - sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) { + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) { ret = -EOVERFLOW; } else { /* plenty of insertion room */ @@ -301,7 +303,7 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, return ERR_PTR(ret); if (ret > 0) return ERR_PTR(-ENOENT); - return btrfs_match_dir_item_name(root, path, name, name_len); + return btrfs_match_dir_item_name(root->fs_info, path, name, name_len); } struct btrfs_dir_item * @@ -342,7 +344,8 @@ btrfs_search_dir_index_item(struct btrfs_root *root, if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) break; - di = btrfs_match_dir_item_name(root, path, name, name_len); + di = btrfs_match_dir_item_name(root->fs_info, path, + name, name_len); if (di) return di; @@ -371,7 +374,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, if (ret > 0) return NULL; - return btrfs_match_dir_item_name(root, path, name, name_len); + return btrfs_match_dir_item_name(root->fs_info, path, name, name_len); } /* @@ -379,7 +382,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, * this walks through all the entries in a dir item and finds one * for a specific name. 
*/ -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, struct btrfs_path *path, const char *name, int name_len) { @@ -392,7 +395,7 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, leaf = path->nodes[0]; dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); - if (verify_dir_item(root, leaf, dir_item)) + if (verify_dir_item(fs_info, leaf, dir_item)) return NULL; total_len = btrfs_item_size_nr(leaf, path->slots[0]); @@ -442,12 +445,13 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_len - (ptr + sub_item_len - start)); - btrfs_truncate_item(root, path, item_len - sub_item_len, 1); + btrfs_truncate_item(root->fs_info, path, + item_len - sub_item_len, 1); } return ret; } -int verify_dir_item(struct btrfs_root *root, +int verify_dir_item(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, struct btrfs_dir_item *dir_item) { @@ -455,8 +459,7 @@ int verify_dir_item(struct btrfs_root *root, u8 type = btrfs_dir_type(leaf, dir_item); if (type >= BTRFS_FT_MAX) { - btrfs_crit(root->fs_info, "invalid dir item type: %d", - (int)type); + btrfs_crit(fs_info, "invalid dir item type: %d", (int)type); return 1; } @@ -464,16 +467,16 @@ int verify_dir_item(struct btrfs_root *root, namelen = XATTR_NAME_MAX; if (btrfs_dir_name_len(leaf, dir_item) > namelen) { - btrfs_crit(root->fs_info, "invalid dir item name len: %u", + btrfs_crit(fs_info, "invalid dir item name len: %u", (unsigned)btrfs_dir_data_len(leaf, dir_item)); return 1; } /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */ if ((btrfs_dir_data_len(leaf, dir_item) + - btrfs_dir_name_len(leaf, dir_item)) > BTRFS_MAX_XATTR_SIZE(root)) { - btrfs_crit(root->fs_info, - "invalid dir item name + data len: %u + %u", + btrfs_dir_name_len(leaf, dir_item)) > + BTRFS_MAX_XATTR_SIZE(fs_info)) { + btrfs_crit(fs_info, "invalid dir item name + data len: %u + %u", (unsigned)btrfs_dir_name_len(leaf, dir_item), (unsigned)btrfs_dir_data_len(leaf, dir_item)); return 1; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3a57f99d96aa..18004169552c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -68,15 +68,15 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only); static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, - struct btrfs_root *root); + struct btrfs_fs_info *fs_info); static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root); -static int btrfs_destroy_marked_extents(struct btrfs_root *root, +static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages, int mark); -static int btrfs_destroy_pinned_extent(struct btrfs_root *root, +static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, struct extent_io_tree *pinned_extents); -static int btrfs_cleanup_transaction(struct btrfs_root *root); -static void btrfs_error_commit_super(struct btrfs_root *root); +static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info); +static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info); /* * btrfs_end_io_wq structs are used to do processing in task context when an IO @@ -224,6 +224,7 @@ static struct extent_map *btree_get_extent(struct inode *inode, struct page *page, size_t pg_offset, u64 
start, u64 len, int create) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_map *em; int ret; @@ -231,8 +232,7 @@ static struct extent_map *btree_get_extent(struct inode *inode, read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); if (em) { - em->bdev = - BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + em->bdev = fs_info->fs_devices->latest_bdev; read_unlock(&em_tree->lock); goto out; } @@ -247,7 +247,7 @@ static struct extent_map *btree_get_extent(struct inode *inode, em->len = (u64)-1; em->block_len = (u64)-1; em->block_start = 0; - em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + em->bdev = fs_info->fs_devices->latest_bdev; write_lock(&em_tree->lock); ret = add_extent_mapping(em_tree, em, 0); @@ -271,7 +271,7 @@ u32 btrfs_csum_data(char *data, u32 seed, size_t len) return btrfs_crc32c(seed, data, len); } -void btrfs_csum_final(u32 crc, char *result) +void btrfs_csum_final(u32 crc, u8 *result) { put_unaligned_le32(~crc, result); } @@ -440,7 +440,7 @@ static int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, * helper to read a given tree block, doing retries as required when * the checksums don't match and we have alternate mirrors to try. */ -static int btree_read_extent_buffer_pages(struct btrfs_root *root, +static int btree_read_extent_buffer_pages(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, u64 parent_transid) { @@ -452,7 +452,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, int failed_mirror = 0; clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); - io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; + io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; while (1) { ret = read_extent_buffer_pages(io_tree, eb, WAIT_COMPLETE, btree_get_extent, mirror_num); @@ -472,7 +472,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) break; - num_copies = btrfs_num_copies(root->fs_info, + num_copies = btrfs_num_copies(fs_info, eb->start, eb->len); if (num_copies == 1) break; @@ -491,7 +491,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, } if (failed && !ret && failed_mirror) - repair_eb_io_failure(root, eb, failed_mirror); + repair_eb_io_failure(fs_info, eb, failed_mirror); return ret; } @@ -545,47 +545,63 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, return ret; } -#define CORRUPT(reason, eb, root, slot) \ - btrfs_crit(root->fs_info, "corrupt %s, %s: block=%llu," \ - " root=%llu, slot=%d", \ - btrfs_header_level(eb) == 0 ? "leaf" : "node",\ +#define CORRUPT(reason, eb, root, slot) \ + btrfs_crit(root->fs_info, \ + "corrupt %s, %s: block=%llu, root=%llu, slot=%d", \ + btrfs_header_level(eb) == 0 ? "leaf" : "node", \ reason, btrfs_header_bytenr(eb), root->objectid, slot) static noinline int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; struct btrfs_key leaf_key; u32 nritems = btrfs_header_nritems(leaf); int slot; - if (nritems == 0) { + /* + * Extent buffers from a relocation tree have a owner field that + * corresponds to the subvolume tree they are based on. So just from an + * extent buffer alone we can not find out what is the id of the + * corresponding subvolume tree, so we can not figure out if the extent + * buffer corresponds to the root of the relocation tree or not. 
So skip + * this check for relocation trees. + */ + if (nritems == 0 && !btrfs_header_flag(leaf, BTRFS_HEADER_FLAG_RELOC)) { struct btrfs_root *check_root; key.objectid = btrfs_header_owner(leaf); key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; - check_root = btrfs_get_fs_root(root->fs_info, &key, false); + check_root = btrfs_get_fs_root(fs_info, &key, false); /* * The only reason we also check NULL here is that during * open_ctree() some roots has not yet been set up. */ if (!IS_ERR_OR_NULL(check_root)) { + struct extent_buffer *eb; + + eb = btrfs_root_node(check_root); /* if leaf is the root, then it's fine */ - if (leaf->start != - btrfs_root_bytenr(&check_root->root_item)) { + if (leaf != eb) { CORRUPT("non-root leaf's nritems is 0", - leaf, root, 0); + leaf, check_root, 0); + free_extent_buffer(eb); return -EIO; } + free_extent_buffer(eb); } return 0; } + if (nritems == 0) + return 0; + /* Check the 0 item */ if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) != - BTRFS_LEAF_DATA_SIZE(root)) { + BTRFS_LEAF_DATA_SIZE(fs_info)) { CORRUPT("invalid item offset size pair", leaf, root, 0); return -EIO; } @@ -624,7 +640,7 @@ static noinline int check_leaf(struct btrfs_root *root, * all point outside of the leaf. */ if (btrfs_item_end_nr(leaf, slot) > - BTRFS_LEAF_DATA_SIZE(root)) { + BTRFS_LEAF_DATA_SIZE(fs_info)) { CORRUPT("slot end outside of leaf", leaf, root, slot); return -EIO; } @@ -641,7 +657,7 @@ static int check_node(struct btrfs_root *root, struct extent_buffer *node) u64 bytenr; int ret = 0; - if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root)) { + if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(root->fs_info)) { btrfs_crit(root->fs_info, "corrupt node: block %llu root %llu nritems %lu", node->start, root->objectid, nr); @@ -747,7 +763,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, err: if (reads_done && test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(fs_info, eb, eb->start, ret); + btree_readahead_hook(fs_info, eb, ret); if (ret) { /* @@ -772,7 +788,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) eb->read_mirror = failed_mirror; atomic_dec(&eb->io_pages); if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(eb->fs_info, eb, eb->start, -EIO); + btree_readahead_hook(eb->fs_info, eb, -EIO); return -EIO; /* we fixed nothing */ } @@ -930,7 +946,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, atomic_inc(&fs_info->nr_async_submits); - if (bio->bi_opf & REQ_SYNC) + if (op_is_sync(bio->bi_opf)) btrfs_set_work_high_priority(&async->work); btrfs_queue_work(fs_info->workers, &async->work); @@ -981,7 +997,7 @@ static int __btree_submit_bio_done(struct inode *inode, struct bio *bio, * when we're called for a write, we're already in the async * submission context. 
Just jump into btrfs_map_bio */ - ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 1); + ret = btrfs_map_bio(btrfs_sb(inode->i_sb), bio, mirror_num, 1); if (ret) { bio->bi_error = ret; bio_endio(bio); @@ -1004,6 +1020,7 @@ static int btree_submit_bio_hook(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int async = check_async_write(inode, bio_flags); int ret; @@ -1012,23 +1029,22 @@ static int btree_submit_bio_hook(struct inode *inode, struct bio *bio, * called for a read, do the setup so that checksum validation * can happen in the async kernel threads */ - ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, - bio, BTRFS_WQ_ENDIO_METADATA); + ret = btrfs_bio_wq_end_io(fs_info, bio, + BTRFS_WQ_ENDIO_METADATA); if (ret) goto out_w_error; - ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0); + ret = btrfs_map_bio(fs_info, bio, mirror_num, 0); } else if (!async) { ret = btree_csum_one_bio(bio); if (ret) goto out_w_error; - ret = btrfs_map_bio(BTRFS_I(inode)->root, bio, mirror_num, 0); + ret = btrfs_map_bio(fs_info, bio, mirror_num, 0); } else { /* * kthread helpers are used to submit writes so that * checksumming can happen in parallel across all CPUs */ - ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, - inode, bio, mirror_num, 0, + ret = btrfs_wq_submit_bio(fs_info, inode, bio, mirror_num, 0, bio_offset, __btree_submit_bio_start, __btree_submit_bio_done); @@ -1146,12 +1162,12 @@ static const struct address_space_operations btree_aops = { .set_page_dirty = btree_set_page_dirty, }; -void readahead_tree_block(struct btrfs_root *root, u64 bytenr) +void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr) { struct extent_buffer *buf = NULL; - struct inode *btree_inode = root->fs_info->btree_inode; + struct inode *btree_inode = fs_info->btree_inode; - buf = btrfs_find_create_tree_block(root, bytenr); + buf = btrfs_find_create_tree_block(fs_info, bytenr); if (IS_ERR(buf)) return; read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, @@ -1159,15 +1175,15 @@ void readahead_tree_block(struct btrfs_root *root, u64 bytenr) free_extent_buffer(buf); } -int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, +int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr, int mirror_num, struct extent_buffer **eb) { struct extent_buffer *buf = NULL; - struct inode *btree_inode = root->fs_info->btree_inode; + struct inode *btree_inode = fs_info->btree_inode; struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree; int ret; - buf = btrfs_find_create_tree_block(root, bytenr); + buf = btrfs_find_create_tree_block(fs_info, bytenr); if (IS_ERR(buf)) return 0; @@ -1191,19 +1207,13 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, return 0; } -struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, - u64 bytenr) -{ - return find_extent_buffer(fs_info, bytenr); -} - -struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, - u64 bytenr) +struct extent_buffer *btrfs_find_create_tree_block( + struct btrfs_fs_info *fs_info, + u64 bytenr) { - if (btrfs_is_testing(root->fs_info)) - return alloc_test_extent_buffer(root->fs_info, bytenr, - root->nodesize); - return alloc_extent_buffer(root->fs_info, bytenr); + if (btrfs_is_testing(fs_info)) + return alloc_test_extent_buffer(fs_info, bytenr); + return alloc_extent_buffer(fs_info, bytenr); } @@ -1219,17 +1229,17 @@ int 
btrfs_wait_tree_block_writeback(struct extent_buffer *buf) buf->start, buf->start + buf->len - 1); } -struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, +struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 parent_transid) { struct extent_buffer *buf = NULL; int ret; - buf = btrfs_find_create_tree_block(root, bytenr); + buf = btrfs_find_create_tree_block(fs_info, bytenr); if (IS_ERR(buf)) return buf; - ret = btree_read_extent_buffer_pages(root, buf, parent_transid); + ret = btree_read_extent_buffer_pages(fs_info, buf, parent_transid); if (ret) { free_extent_buffer(buf); return ERR_PTR(ret); @@ -1283,16 +1293,12 @@ btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers) kfree(writers); } -static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize, - struct btrfs_root *root, struct btrfs_fs_info *fs_info, +static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); root->node = NULL; root->commit_root = NULL; - root->sectorsize = sectorsize; - root->nodesize = nodesize; - root->stripesize = stripesize; root->state = 0; root->orphan_cleanup_state = 0; @@ -1370,8 +1376,7 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* Should only be used by the testing infrastructure */ -struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info, - u32 sectorsize, u32 nodesize) +struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; @@ -1381,9 +1386,9 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info, root = btrfs_alloc_root(fs_info, GFP_KERNEL); if (!root) return ERR_PTR(-ENOMEM); + /* We don't use the stripesize in selftest, set it as sectorsize */ - __setup_root(nodesize, sectorsize, sectorsize, root, fs_info, - BTRFS_ROOT_TREE_OBJECTID); + __setup_root(root, fs_info, BTRFS_ROOT_TREE_OBJECTID); root->alloc_bytenr = 0; return root; @@ -1405,8 +1410,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, if (!root) return ERR_PTR(-ENOMEM); - __setup_root(tree_root->nodesize, tree_root->sectorsize, - tree_root->stripesize, root, fs_info, objectid); + __setup_root(root, fs_info, objectid); root->root_key.objectid = objectid; root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = 0; @@ -1418,18 +1422,15 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, goto fail; } - memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); + memzero_extent_buffer(leaf, 0, sizeof(struct btrfs_header)); btrfs_set_header_bytenr(leaf, leaf->start); btrfs_set_header_generation(leaf, trans->transid); btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(leaf, objectid); root->node = leaf; - write_extent_buffer(leaf, fs_info->fsid, btrfs_header_fsid(), - BTRFS_FSID_SIZE); - write_extent_buffer(leaf, fs_info->chunk_tree_uuid, - btrfs_header_chunk_tree_uuid(leaf), - BTRFS_UUID_SIZE); + write_extent_buffer_fsid(leaf, fs_info->fsid); + write_extent_buffer_chunk_tree_uuid(leaf, fs_info->chunk_tree_uuid); btrfs_mark_buffer_dirty(leaf); root->commit_root = btrfs_root_node(root); @@ -1474,16 +1475,13 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { struct btrfs_root *root; - struct btrfs_root *tree_root = fs_info->tree_root; struct extent_buffer *leaf; 
	root = btrfs_alloc_root(fs_info, GFP_NOFS);
	if (!root)
		return ERR_PTR(-ENOMEM);

-	__setup_root(tree_root->nodesize, tree_root->sectorsize,
-		     tree_root->stripesize, root, fs_info,
-		     BTRFS_TREE_LOG_OBJECTID);
+	__setup_root(root, fs_info, BTRFS_TREE_LOG_OBJECTID);

	root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
@@ -1505,15 +1503,14 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
		return ERR_CAST(leaf);
	}

-	memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
+	memzero_extent_buffer(leaf, 0, sizeof(struct btrfs_header));
	btrfs_set_header_bytenr(leaf, leaf->start);
	btrfs_set_header_generation(leaf, trans->transid);
	btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
	root->node = leaf;

-	write_extent_buffer(root->node, root->fs_info->fsid,
-			    btrfs_header_fsid(), BTRFS_FSID_SIZE);
+	write_extent_buffer_fsid(root->node, fs_info->fsid);
	btrfs_mark_buffer_dirty(root->node);
	btrfs_tree_unlock(root->node);
	return root;
@@ -1535,10 +1532,11 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root)
{
+	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_root *log_root;
	struct btrfs_inode_item *inode_item;

-	log_root = alloc_log_tree(trans, root->fs_info);
+	log_root = alloc_log_tree(trans, fs_info);
	if (IS_ERR(log_root))
		return PTR_ERR(log_root);
@@ -1549,7 +1547,8 @@
	btrfs_set_stack_inode_generation(inode_item, 1);
	btrfs_set_stack_inode_size(inode_item, 3);
	btrfs_set_stack_inode_nlink(inode_item, 1);
-	btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
+	btrfs_set_stack_inode_nbytes(inode_item,
+				     fs_info->nodesize);
	btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);

	btrfs_set_root_node(&log_root->root_item, log_root->node);
@@ -1581,8 +1580,7 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
		goto alloc_fail;
	}

-	__setup_root(tree_root->nodesize, tree_root->sectorsize,
-		     tree_root->stripesize, root, fs_info, key->objectid);
+	__setup_root(root, fs_info, key->objectid);

	ret = btrfs_find_root(tree_root, key, path,
			      &root->root_item, &root->root_key);
@@ -1593,7 +1591,8 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
	}

	generation = btrfs_root_generation(&root->root_item);
-	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
+	root->node = read_tree_block(fs_info,
+				     btrfs_root_bytenr(&root->root_item),
				     generation);
	if (IS_ERR(root->node)) {
		ret = PTR_ERR(root->node);
@@ -1848,6 +1847,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
static int cleaner_kthread(void *arg)
{
	struct btrfs_root *root = arg;
+	struct btrfs_fs_info *fs_info = root->fs_info;
	int again;
	struct btrfs_trans_handle *trans;

@@ -1855,40 +1855,40 @@ static int cleaner_kthread(void *arg)
		again = 0;

		/* Make the cleaner go to sleep early. */
-		if (btrfs_need_cleaner_sleep(root))
+		if (btrfs_need_cleaner_sleep(fs_info))
			goto sleep;

		/*
		 * Do not do anything if we might cause open_ctree() to block
		 * before we have finished mounting the filesystem.
		 */
-		if (!test_bit(BTRFS_FS_OPEN, &root->fs_info->flags))
+		if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
			goto sleep;

-		if (!mutex_trylock(&root->fs_info->cleaner_mutex))
+		if (!mutex_trylock(&fs_info->cleaner_mutex))
			goto sleep;

		/*
		 * Avoid the problem that we change the status of the fs
		 * during the above check and trylock.
		 */
-		if (btrfs_need_cleaner_sleep(root)) {
-			mutex_unlock(&root->fs_info->cleaner_mutex);
+		if (btrfs_need_cleaner_sleep(fs_info)) {
+			mutex_unlock(&fs_info->cleaner_mutex);
			goto sleep;
		}

-		mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex);
-		btrfs_run_delayed_iputs(root);
-		mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex);
+		mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
+		btrfs_run_delayed_iputs(fs_info);
+		mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);

		again = btrfs_clean_one_deleted_snapshot(root);
-		mutex_unlock(&root->fs_info->cleaner_mutex);
+		mutex_unlock(&fs_info->cleaner_mutex);

		/*
		 * The defragger has dealt with the R/O remount and umount,
		 * needn't do anything special here.
		 */
-		btrfs_run_defrag_inodes(root->fs_info);
+		btrfs_run_defrag_inodes(fs_info);

		/*
		 * Acquires fs_info->delete_unused_bgs_mutex to avoid racing
@@ -1898,7 +1898,7 @@ static int cleaner_kthread(void *arg)
		 * can't hold, nor need to, fs_info->cleaner_mutex when deleting
		 * unused block groups.
		 */
-		btrfs_delete_unused_bgs(root->fs_info);
+		btrfs_delete_unused_bgs(fs_info);
sleep:
		if (!again) {
			set_current_state(TASK_INTERRUPTIBLE);
@@ -1922,15 +1922,15 @@ sleep:
		trans = btrfs_attach_transaction(root);
		if (IS_ERR(trans)) {
			if (PTR_ERR(trans) != -ENOENT)
-				btrfs_err(root->fs_info,
+				btrfs_err(fs_info,
					  "cleaner transaction attach returned %ld",
					  PTR_ERR(trans));
		} else {
			int ret;

-			ret = btrfs_commit_transaction(trans, root);
+			ret = btrfs_commit_transaction(trans);
			if (ret)
-				btrfs_err(root->fs_info,
+				btrfs_err(fs_info,
					  "cleaner open transaction commit returned %d",
					  ret);
		}
@@ -1941,6 +1941,7 @@ sleep:
static int transaction_kthread(void *arg)
{
	struct btrfs_root *root = arg;
+	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	struct btrfs_transaction *cur;
	u64 transid;
@@ -1950,26 +1951,26 @@ static int transaction_kthread(void *arg)
	do {
		cannot_commit = false;
-		delay = HZ * root->fs_info->commit_interval;
-		mutex_lock(&root->fs_info->transaction_kthread_mutex);
+		delay = HZ * fs_info->commit_interval;
+		mutex_lock(&fs_info->transaction_kthread_mutex);

-		spin_lock(&root->fs_info->trans_lock);
-		cur = root->fs_info->running_transaction;
+		spin_lock(&fs_info->trans_lock);
+		cur = fs_info->running_transaction;
		if (!cur) {
-			spin_unlock(&root->fs_info->trans_lock);
+			spin_unlock(&fs_info->trans_lock);
			goto sleep;
		}

		now = get_seconds();
		if (cur->state < TRANS_STATE_BLOCKED &&
		    (now < cur->start_time ||
-		     now - cur->start_time < root->fs_info->commit_interval)) {
-			spin_unlock(&root->fs_info->trans_lock);
+		     now - cur->start_time < fs_info->commit_interval)) {
+			spin_unlock(&fs_info->trans_lock);
			delay = HZ * 5;
			goto sleep;
		}
		transid = cur->transid;
-		spin_unlock(&root->fs_info->trans_lock);
+		spin_unlock(&fs_info->trans_lock);

		/* If the file system is aborted, this will always fail.
		 */
		trans = btrfs_attach_transaction(root);
@@ -1979,20 +1980,20 @@
			goto sleep;
		}
		if (transid == trans->transid) {
-			btrfs_commit_transaction(trans, root);
+			btrfs_commit_transaction(trans);
		} else {
-			btrfs_end_transaction(trans, root);
+			btrfs_end_transaction(trans);
		}
sleep:
-		wake_up_process(root->fs_info->cleaner_kthread);
-		mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+		wake_up_process(fs_info->cleaner_kthread);
+		mutex_unlock(&fs_info->transaction_kthread_mutex);

		if (unlikely(test_bit(BTRFS_FS_STATE_ERROR,
-				      &root->fs_info->fs_state)))
-			btrfs_cleanup_transaction(root);
+				      &fs_info->fs_state)))
+			btrfs_cleanup_transaction(fs_info);
		set_current_state(TASK_INTERRUPTIBLE);
		if (!kthread_should_stop() &&
-				(!btrfs_transaction_blocked(root->fs_info) ||
+				(!btrfs_transaction_blocked(fs_info) ||
				 cannot_commit))
			schedule_timeout(delay);
		__set_current_state(TASK_RUNNING);
@@ -2279,8 +2280,7 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)

	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
		btrfs_free_log_root_tree(NULL, fs_info);
-		btrfs_destroy_pinned_extent(fs_info->tree_root,
-					    fs_info->pinned_extents);
+		btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents);
	}
}

@@ -2306,33 +2306,31 @@ static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
	init_waitqueue_head(&fs_info->balance_wait_q);
}

-static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info,
-				   struct btrfs_root *tree_root)
+static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
{
-	fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
-	set_nlink(fs_info->btree_inode, 1);
+	struct inode *inode = fs_info->btree_inode;
+
+	inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
+	set_nlink(inode, 1);
	/*
	 * we set the i_size on the btree inode to the max possible int.
	 * the real end of the address space is determined by all of
	 * the devices in the system
	 */
-	fs_info->btree_inode->i_size = OFFSET_MAX;
-	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
+	inode->i_size = OFFSET_MAX;
+	inode->i_mapping->a_ops = &btree_aops;

-	RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
-	extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
-			    fs_info->btree_inode->i_mapping);
-	BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
-	extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
+	RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
+	extent_io_tree_init(&BTRFS_I(inode)->io_tree, inode->i_mapping);
+	BTRFS_I(inode)->io_tree.track_uptodate = 0;
+	extent_map_tree_init(&BTRFS_I(inode)->extent_tree);

-	BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
+	BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops;

-	BTRFS_I(fs_info->btree_inode)->root = tree_root;
-	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
-	       sizeof(struct btrfs_key));
-	set_bit(BTRFS_INODE_DUMMY,
-		&BTRFS_I(fs_info->btree_inode)->runtime_flags);
-	btrfs_insert_inode_hash(fs_info->btree_inode);
+	BTRFS_I(inode)->root = fs_info->tree_root;
+	memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key));
+	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
+	btrfs_insert_inode_hash(inode);
}

static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
@@ -2453,7 +2451,6 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
			    struct btrfs_fs_devices *fs_devices)
{
	int ret;
-	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *log_tree_root;
	struct btrfs_super_block *disk_super = fs_info->super_copy;
	u64 bytenr = btrfs_super_log_root(disk_super);
@@ -2467,12 +2464,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
	if (!log_tree_root)
		return -ENOMEM;

-	__setup_root(tree_root->nodesize, tree_root->sectorsize,
-		     tree_root->stripesize, log_tree_root, fs_info,
-		     BTRFS_TREE_LOG_OBJECTID);
+	__setup_root(log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);

-	log_tree_root->node = read_tree_block(tree_root, bytenr,
-					      fs_info->generation + 1);
+	log_tree_root->node = read_tree_block(fs_info, bytenr,
+					      fs_info->generation + 1);
	if (IS_ERR(log_tree_root->node)) {
		btrfs_warn(fs_info, "failed to read log tree");
		ret = PTR_ERR(log_tree_root->node);
@@ -2487,15 +2482,15 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
	/* returns with log_tree_root freed on success */
	ret = btrfs_recover_log_trees(log_tree_root);
	if (ret) {
-		btrfs_handle_fs_error(tree_root->fs_info, ret,
-				      "Failed to recover log tree");
+		btrfs_handle_fs_error(fs_info, ret,
+				      "Failed to recover log tree");
		free_extent_buffer(log_tree_root->node);
		kfree(log_tree_root);
		return ret;
	}

	if (fs_info->sb->s_flags & MS_RDONLY) {
-		ret = btrfs_commit_super(tree_root);
+		ret = btrfs_commit_super(fs_info);
		if (ret)
			return ret;
	}
@@ -2503,13 +2498,15 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
	return 0;
}

-static int btrfs_read_roots(struct btrfs_fs_info *fs_info,
-			    struct btrfs_root *tree_root)
+static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
{
+	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root;
	struct btrfs_key location;
	int ret;

+	BUG_ON(!fs_info->tree_root);
+
	location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
	location.type = BTRFS_ROOT_ITEM_KEY;
	location.offset = 0;
@@ -2720,7 +2717,7 @@ int open_ctree(struct super_block *sb,
	sb->s_blocksize_bits = blksize_bits(4096);
	sb->s_bdi = &fs_info->bdi;
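	/*
	 * fs_info->btree_inode is the dummy inode whose mapping backs all
	 * metadata extent buffers; the hunk below moves its setup into the
	 * helper keyed by fs_info alone.
	 */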
-	btrfs_init_btree_inode(fs_info, tree_root);
+	btrfs_init_btree_inode(fs_info);

	spin_lock_init(&fs_info->block_group_cache_lock);
	fs_info->block_group_cache_tree = RB_ROOT;
@@ -2758,14 +2755,18 @@ int open_ctree(struct super_block *sb,

	INIT_LIST_HEAD(&fs_info->pinned_chunks);

+	/* Usable values until the real ones are cached from the superblock */
+	fs_info->nodesize = 4096;
+	fs_info->sectorsize = 4096;
+	fs_info->stripesize = 4096;
+
	ret = btrfs_alloc_stripe_hash_table(fs_info);
	if (ret) {
		err = ret;
		goto fail_alloc;
	}

-	__setup_root(4096, 4096, 4096, tree_root,
-		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
+	__setup_root(tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID);

	invalidate_bdev(fs_devices->latest_bdev);
@@ -2829,7 +2830,7 @@
	 */
	fs_info->compress_type = BTRFS_COMPRESS_ZLIB;

-	ret = btrfs_parse_options(tree_root, options, sb->s_flags);
+	ret = btrfs_parse_options(fs_info, options, sb->s_flags);
	if (ret) {
		err = ret;
		goto fail_alloc;
@@ -2847,7 +2848,7 @@
	features = btrfs_super_incompat_flags(disk_super);
	features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
-	if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
+	if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;

	if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
@@ -2870,6 +2871,11 @@
	fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
	fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));

+	/* Cache block sizes */
+	fs_info->nodesize = nodesize;
+	fs_info->sectorsize = sectorsize;
+	fs_info->stripesize = stripesize;
+
	/*
	 * mixed block groups end up with duplicate but slightly offset
	 * extent buffers for the same range.
	 * It leads to corruptions
@@ -2910,15 +2916,11 @@
	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
				    SZ_4M / PAGE_SIZE);

-	tree_root->nodesize = nodesize;
-	tree_root->sectorsize = sectorsize;
-	tree_root->stripesize = stripesize;
-
	sb->s_blocksize = sectorsize;
	sb->s_blocksize_bits = blksize_bits(sectorsize);

	mutex_lock(&fs_info->chunk_mutex);
-	ret = btrfs_read_sys_array(tree_root);
+	ret = btrfs_read_sys_array(fs_info);
	mutex_unlock(&fs_info->chunk_mutex);
	if (ret) {
		btrfs_err(fs_info, "failed to read the system array: %d", ret);
@@ -2927,10 +2929,9 @@

	generation = btrfs_super_chunk_root_generation(disk_super);

-	__setup_root(nodesize, sectorsize, stripesize, chunk_root,
-		     fs_info, BTRFS_CHUNK_TREE_OBJECTID);
+	__setup_root(chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);

-	chunk_root->node = read_tree_block(chunk_root,
+	chunk_root->node = read_tree_block(fs_info,
					   btrfs_super_chunk_root(disk_super),
					   generation);
	if (IS_ERR(chunk_root->node) ||
@@ -2947,7 +2948,7 @@
	read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
			   btrfs_header_chunk_tree_uuid(chunk_root->node),
			   BTRFS_UUID_SIZE);

-	ret = btrfs_read_chunk_tree(chunk_root);
+	ret = btrfs_read_chunk_tree(fs_info);
	if (ret) {
		btrfs_err(fs_info, "failed to read chunk tree: %d", ret);
		goto fail_tree_roots;
@@ -2967,7 +2968,7 @@ retry_root_backup:
	generation = btrfs_super_generation(disk_super);

-	tree_root->node = read_tree_block(tree_root,
+	tree_root->node = read_tree_block(fs_info,
					  btrfs_super_root(disk_super),
					  generation);
	if (IS_ERR(tree_root->node) ||
@@ -2995,7 +2996,7 @@ retry_root_backup:

	mutex_unlock(&tree_root->objectid_mutex);

-	ret = btrfs_read_roots(fs_info, tree_root);
+	ret = btrfs_read_roots(fs_info);
	if (ret)
		goto recovery_tree_root;
@@ -3048,7 +3049,7 @@ retry_root_backup:
		goto fail_sysfs;
	}

-	ret = btrfs_read_block_groups(fs_info->extent_root);
+	ret = btrfs_read_block_groups(fs_info);
	if (ret) {
		btrfs_err(fs_info, "failed to read block groups: %d", ret);
		goto fail_sysfs;
@@ -3076,8 +3077,8 @@ retry_root_backup:
	if (IS_ERR(fs_info->transaction_kthread))
		goto fail_cleaner;

-	if (!btrfs_test_opt(tree_root->fs_info, SSD) &&
-	    !btrfs_test_opt(tree_root->fs_info, NOSSD) &&
+	if (!btrfs_test_opt(fs_info, SSD) &&
+	    !btrfs_test_opt(fs_info, NOSSD) &&
	    !fs_info->fs_devices->rotating) {
		btrfs_info(fs_info, "detected SSD devices, enabling SSD mode");
		btrfs_set_opt(fs_info->mount_opt, SSD);
@@ -3090,9 +3091,9 @@ retry_root_backup:
	btrfs_apply_pending_changes(fs_info);

#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-	if (btrfs_test_opt(tree_root->fs_info, CHECK_INTEGRITY)) {
-		ret = btrfsic_mount(tree_root, fs_devices,
-				    btrfs_test_opt(tree_root->fs_info,
+	if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
+		ret = btrfsic_mount(fs_info, fs_devices,
+				    btrfs_test_opt(fs_info,
					CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
				    1 : 0,
				    fs_info->check_integrity_print_mask);
@@ -3108,7 +3109,7 @@

	/* do not make disk changes in broken FS or nologreplay is given */
	if (btrfs_super_log_root(disk_super) != 0 &&
-	    !btrfs_test_opt(tree_root->fs_info, NOLOGREPLAY)) {
+	    !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
		ret = btrfs_replay_log(fs_info, fs_devices);
		if (ret) {
			err = ret;
@@ -3116,7 +3117,7 @@
		}
	}

-	ret = btrfs_find_orphan_roots(tree_root);
+	ret = btrfs_find_orphan_roots(fs_info);
	if (ret)
		goto fail_qgroup;
@@ -3164,19 +3165,19 @@ retry_root_backup:
		if (ret) {
			btrfs_warn(fs_info,
				   "failed to clear free space tree: %d", ret);
-			close_ctree(tree_root);
+			close_ctree(fs_info);
			return ret;
		}
	}

-	if (btrfs_test_opt(tree_root->fs_info, FREE_SPACE_TREE) &&
+	if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
	    !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
		btrfs_info(fs_info, "creating free space tree");
		ret = btrfs_create_free_space_tree(fs_info);
		if (ret) {
			btrfs_warn(fs_info,
				   "failed to create free space tree: %d", ret);
-			close_ctree(tree_root);
+			close_ctree(fs_info);
			return ret;
		}
	}
@@ -3185,7 +3186,7 @@ retry_root_backup:
	if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
	    (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
		up_read(&fs_info->cleanup_work_sem);
-		close_ctree(tree_root);
+		close_ctree(fs_info);
		return ret;
	}
	up_read(&fs_info->cleanup_work_sem);
@@ -3193,14 +3194,14 @@ retry_root_backup:
	ret = btrfs_resume_balance_async(fs_info);
	if (ret) {
		btrfs_warn(fs_info, "failed to resume balance: %d", ret);
-		close_ctree(tree_root);
+		close_ctree(fs_info);
		return ret;
	}

	ret = btrfs_resume_dev_replace_async(fs_info);
	if (ret) {
		btrfs_warn(fs_info, "failed to resume device replace: %d", ret);
-		close_ctree(tree_root);
+		close_ctree(fs_info);
		return ret;
	}
@@ -3212,10 +3213,10 @@ retry_root_backup:
		if (ret) {
			btrfs_warn(fs_info,
				   "failed to create the UUID tree: %d", ret);
-			close_ctree(tree_root);
+			close_ctree(fs_info);
			return ret;
		}
-	} else if (btrfs_test_opt(tree_root->fs_info, RESCAN_UUID_TREE) ||
+	} else if (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
		   fs_info->generation !=
				btrfs_super_uuid_tree_generation(disk_super)) {
		btrfs_info(fs_info, "checking UUID tree");
@@ -3223,7 +3224,7 @@ retry_root_backup:
		if (ret) {
			btrfs_warn(fs_info,
				   "failed to check the UUID tree: %d", ret);
-			close_ctree(tree_root);
+			close_ctree(fs_info);
			return ret;
		}
	} else {
@@ -3243,7 +3244,7 @@ fail_qgroup:
	btrfs_free_qgroup_config(fs_info);
fail_trans_kthread:
	kthread_stop(fs_info->transaction_kthread);
-	btrfs_cleanup_transaction(fs_info->tree_root);
+	btrfs_cleanup_transaction(fs_info);
	btrfs_free_fs_roots(fs_info);
fail_cleaner:
	kthread_stop(fs_info->cleaner_kthread);
@@ -3291,7 +3292,7 @@ fail:
	return err;

recovery_tree_root:
-	if (!btrfs_test_opt(tree_root->fs_info, USEBACKUPROOT))
+	if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
		goto fail_tree_roots;

	free_root_pointers(fs_info, 0);
@@ -3317,7 +3318,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
		struct btrfs_device *device = (struct btrfs_device *)
			bh->b_private;

-		btrfs_warn_rl_in_rcu(device->dev_root->fs_info,
+		btrfs_warn_rl_in_rcu(device->fs_info,
				"lost page write due to IO error on %s",
					  rcu_str_deref(device->name));
		/* note, we don't set_buffer_write_io_error because we have
@@ -3462,7 +3463,7 @@ static int write_dev_supers(struct btrfs_device *device,
		bh = __getblk(device->bdev, bytenr / 4096,
			      BTRFS_SUPER_INFO_SIZE);
		if (!bh) {
-			btrfs_err(device->dev_root->fs_info,
+			btrfs_err(device->fs_info,
			    "couldn't get super buffer head for bytenr %llu",
			    bytenr);
			errors++;
@@ -3485,9 +3486,9 @@ static int write_dev_supers(struct btrfs_device *device,
		 * to go down lazy.
		 */
		if (i == 0)
-			ret = btrfsic_submit_bh(REQ_OP_WRITE, WRITE_FUA, bh);
+			ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_FUA, bh);
		else
-			ret = btrfsic_submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh);
+			ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
		if (ret)
			errors++;
	}
@@ -3551,7 +3552,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)

	bio->bi_end_io = btrfs_end_empty_barrier;
	bio->bi_bdev = device->bdev;
-	bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH);
+	bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
	init_completion(&device->flush_wait);
	bio->bi_private = &device->flush_wait;
	device->flush_bio = bio;
@@ -3695,7 +3696,7 @@ int btrfs_calc_num_tolerated_disk_barrier_failures(
	return num_tolerated_disk_barrier_failures;
}

-static int write_all_supers(struct btrfs_root *root, int max_mirrors)
+static int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
{
	struct list_head *head;
	struct btrfs_device *dev;
@@ -3707,23 +3708,23 @@
	int total_errors = 0;
	u64 flags;

-	do_barriers = !btrfs_test_opt(root->fs_info, NOBARRIER);
-	backup_super_roots(root->fs_info);
+	do_barriers = !btrfs_test_opt(fs_info, NOBARRIER);
+	backup_super_roots(fs_info);

-	sb = root->fs_info->super_for_commit;
+	sb = fs_info->super_for_commit;
	dev_item = &sb->dev_item;

-	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
-	head = &root->fs_info->fs_devices->devices;
-	max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
+	mutex_lock(&fs_info->fs_devices->device_list_mutex);
+	head = &fs_info->fs_devices->devices;
+	max_errors = btrfs_super_num_devices(fs_info->super_copy) - 1;

	if (do_barriers) {
-		ret = barrier_all_devices(root->fs_info);
+		ret = barrier_all_devices(fs_info);
		if (ret) {
			mutex_unlock(
-				&root->fs_info->fs_devices->device_list_mutex);
-			btrfs_handle_fs_error(root->fs_info, ret,
-					      "errors while submitting device barriers.");
+				&fs_info->fs_devices->device_list_mutex);
+			btrfs_handle_fs_error(fs_info, ret,
+					      "errors while submitting device barriers.");
			return ret;
		}
	}
@@ -3757,13 +3758,14 @@
			total_errors++;
	}
	if (total_errors > max_errors) {
-		btrfs_err(root->fs_info, "%d errors while writing supers",
-			  total_errors);
-		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+		btrfs_err(fs_info, "%d errors while writing supers",
+			  total_errors);
+		mutex_unlock(&fs_info->fs_devices->device_list_mutex);

		/* FUA is masked off if unsupported and can't be the reason */
-		btrfs_handle_fs_error(root->fs_info, -EIO,
-				      "%d errors while writing supers", total_errors);
+		btrfs_handle_fs_error(fs_info, -EIO,
+				      "%d errors while writing supers",
+				      total_errors);
		return -EIO;
	}
@@ -3778,19 +3780,20 @@
		if (ret)
			total_errors++;
	}
-	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
	if (total_errors > max_errors) {
-		btrfs_handle_fs_error(root->fs_info, -EIO,
-				      "%d errors while writing supers", total_errors);
+		btrfs_handle_fs_error(fs_info, -EIO,
+				      "%d errors while writing supers",
+				      total_errors);
		return -EIO;
	}
	return 0;
}
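Alongside the fs_info plumbing, the write_dev_supers() and write_dev_flush() hunks above pick up the 4.10 block-layer flag rework: the old WRITE_FUA/WRITE_SYNC/WRITE_FLUSH composites give way to REQ_FUA, REQ_SYNC and REQ_PREFLUSH combined with the request op. A sketch of the new idiom for an empty flush bio, using only the fields visible in the patch:

	bio->bi_end_io = btrfs_end_empty_barrier;
	bio->bi_bdev = device->bdev;
	/* was: bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH) */
	bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;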
int write_ctree_super(struct btrfs_trans_handle *trans,
-		      struct btrfs_root *root, int max_mirrors)
+		      struct btrfs_fs_info *fs_info, int max_mirrors)
{
-	return write_all_supers(root, max_mirrors);
+	return write_all_supers(fs_info, max_mirrors);
}

/* Drop a fs root from the radix tree and free it. */
@@ -3826,7 +3829,7 @@ static void free_fs_root(struct btrfs_root *root)
{
	iput(root->ino_cache_inode);
	WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
-	btrfs_free_block_rsv(root, root->orphan_block_rsv);
+	btrfs_free_block_rsv(root->fs_info, root->orphan_block_rsv);
	root->orphan_block_rsv = NULL;
	if (root->anon_dev)
		free_anon_bdev(root->anon_dev);
@@ -3896,28 +3899,29 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
	return err;
}

-int btrfs_commit_super(struct btrfs_root *root)
+int btrfs_commit_super(struct btrfs_fs_info *fs_info)
{
+	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;

-	mutex_lock(&root->fs_info->cleaner_mutex);
-	btrfs_run_delayed_iputs(root);
-	mutex_unlock(&root->fs_info->cleaner_mutex);
-	wake_up_process(root->fs_info->cleaner_kthread);
+	mutex_lock(&fs_info->cleaner_mutex);
+	btrfs_run_delayed_iputs(fs_info);
+	mutex_unlock(&fs_info->cleaner_mutex);
+	wake_up_process(fs_info->cleaner_kthread);

	/* wait until ongoing cleanup work done */
-	down_write(&root->fs_info->cleanup_work_sem);
-	up_write(&root->fs_info->cleanup_work_sem);
+	down_write(&fs_info->cleanup_work_sem);
+	up_write(&fs_info->cleanup_work_sem);

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);
-	return btrfs_commit_transaction(trans, root);
+	return btrfs_commit_transaction(trans);
}

-void close_ctree(struct btrfs_root *root)
+void close_ctree(struct btrfs_fs_info *fs_info)
{
-	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_root *root = fs_info->tree_root;
	int ret;

	set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);
@@ -3952,15 +3956,15 @@ void close_ctree(struct btrfs_fs_info *fs_info)
		 * block groups queued for removal, the deletion will be
		 * skipped when we quit the cleaner thread.
		 */
-		btrfs_delete_unused_bgs(root->fs_info);
+		btrfs_delete_unused_bgs(fs_info);

-		ret = btrfs_commit_super(root);
+		ret = btrfs_commit_super(fs_info);
		if (ret)
			btrfs_err(fs_info, "commit super ret %d", ret);
	}

	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
-		btrfs_error_commit_super(root);
+		btrfs_error_commit_super(fs_info);

	kthread_stop(fs_info->transaction_kthread);
	kthread_stop(fs_info->cleaner_kthread);
@@ -3996,8 +4000,8 @@ void close_ctree(struct btrfs_fs_info *fs_info)
	iput(fs_info->btree_inode);

#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-	if (btrfs_test_opt(root->fs_info, CHECK_INTEGRITY))
-		btrfsic_unmount(root, fs_info->fs_devices);
+	if (btrfs_test_opt(fs_info, CHECK_INTEGRITY))
+		btrfsic_unmount(fs_info->fs_devices);
#endif

	btrfs_close_devices(fs_info->fs_devices);
@@ -4014,7 +4018,7 @@
	__btrfs_free_block_rsv(root->orphan_block_rsv);
	root->orphan_block_rsv = NULL;

-	lock_chunks(root);
+	mutex_lock(&fs_info->chunk_mutex);
	while (!list_empty(&fs_info->pinned_chunks)) {
		struct extent_map *em;
@@ -4023,7 +4027,7 @@
		list_del_init(&em->list);
		free_extent_map(em);
	}
-	unlock_chunks(root);
+	mutex_unlock(&fs_info->chunk_mutex);
}

int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
@@ -4045,6 +4049,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,

void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
{
+	struct btrfs_fs_info *fs_info;
	struct btrfs_root *root;
	u64 transid = btrfs_header_generation(buf);
	int was_dirty;
@@ -4059,24 +4064,25 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
		return;
#endif
	root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+	fs_info = root->fs_info;
	btrfs_assert_tree_locked(buf);
-	if (transid != root->fs_info->generation)
+	if (transid != fs_info->generation)
		WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n",
-			buf->start, transid, root->fs_info->generation);
+			buf->start, transid, fs_info->generation);
	was_dirty = set_extent_buffer_dirty(buf);
	if (!was_dirty)
-		__percpu_counter_add(&root->fs_info->dirty_metadata_bytes,
+		__percpu_counter_add(&fs_info->dirty_metadata_bytes,
				     buf->len,
-				     root->fs_info->dirty_metadata_batch);
+				     fs_info->dirty_metadata_batch);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
	if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) {
-		btrfs_print_leaf(root, buf);
+		btrfs_print_leaf(fs_info, buf);
		ASSERT(0);
	}
#endif
}

-static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
+static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info,
					int flush_delayed)
{
	/*
@@ -4089,30 +4095,31 @@
		return;

	if (flush_delayed)
-		btrfs_balance_delayed_items(root);
+		btrfs_balance_delayed_items(fs_info);

-	ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
+	ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes,
				     BTRFS_DIRTY_METADATA_THRESH);
	if (ret > 0) {
-		balance_dirty_pages_ratelimited(
-				   root->fs_info->btree_inode->i_mapping);
+		balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping);
	}
}

-void btrfs_btree_balance_dirty(struct btrfs_root *root)
+void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info)
{
-	__btrfs_btree_balance_dirty(root, 1);
+	__btrfs_btree_balance_dirty(fs_info, 1);
}

-void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
+void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
{
-	__btrfs_btree_balance_dirty(root, 0);
+	__btrfs_btree_balance_dirty(fs_info, 0);
}

int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
{
	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
-	return btree_read_extent_buffer_pages(root, buf, parent_transid);
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
+	return btree_read_extent_buffer_pages(fs_info, buf, parent_transid);
}

static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
@@ -4263,17 +4270,17 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
	return ret;
}

-static void btrfs_error_commit_super(struct btrfs_root *root)
+static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
{
-	mutex_lock(&root->fs_info->cleaner_mutex);
-	btrfs_run_delayed_iputs(root);
-	mutex_unlock(&root->fs_info->cleaner_mutex);
+	mutex_lock(&fs_info->cleaner_mutex);
+	btrfs_run_delayed_iputs(fs_info);
+	mutex_unlock(&fs_info->cleaner_mutex);

-	down_write(&root->fs_info->cleanup_work_sem);
-	up_write(&root->fs_info->cleanup_work_sem);
+	down_write(&fs_info->cleanup_work_sem);
+	up_write(&fs_info->cleanup_work_sem);

	/* cleanup FS via transaction */
-	btrfs_cleanup_transaction(root);
+	btrfs_cleanup_transaction(fs_info);
}

static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
@@ -4316,7 +4323,7 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
}

static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
-				      struct btrfs_root *root)
+				      struct btrfs_fs_info *fs_info)
{
	struct rb_node *node;
	struct btrfs_delayed_ref_root *delayed_refs;
@@ -4328,7 +4335,7 @@
	spin_lock(&delayed_refs->lock);
	if (atomic_read(&delayed_refs->num_entries) == 0) {
		spin_unlock(&delayed_refs->lock);
-		btrfs_info(root->fs_info, "delayed_refs has NO entry");
+		btrfs_info(fs_info, "delayed_refs has NO entry");
		return ret;
	}
@@ -4354,6 +4361,8 @@
				 list) {
			ref->in_tree = 0;
			list_del(&ref->list);
+			if (!list_empty(&ref->add_list))
+				list_del(&ref->add_list);
			atomic_dec(&delayed_refs->num_entries);
			btrfs_put_delayed_ref(ref);
		}
@@ -4371,7 +4380,7 @@
		mutex_unlock(&head->mutex);

		if (pin_bytes)
-			btrfs_pin_extent(root, head->node.bytenr,
+			btrfs_pin_extent(fs_info, head->node.bytenr,
					 head->node.num_bytes, 1);
		btrfs_put_delayed_ref(&head->node);
		cond_resched();
@@ -4435,7 +4444,7 @@ static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
	spin_unlock(&fs_info->delalloc_root_lock);
}

-static int btrfs_destroy_marked_extents(struct btrfs_root *root,
+static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
					struct extent_io_tree *dirty_pages,
					int mark)
{
@@ -4452,8 +4461,8 @@
		clear_extent_bits(dirty_pages, start, end, mark);
		while (start <= end) {
-			eb = btrfs_find_tree_block(root->fs_info, start);
-			start += root->nodesize;
+			eb = find_extent_buffer(fs_info, start);
+			start += fs_info->nodesize;
			if (!eb)
				continue;
			wait_on_extent_buffer_writeback(eb);
@@ -4468,7 +4477,7 @@
	return ret;
}

-static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
+static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
				       struct extent_io_tree *pinned_extents)
{
	struct extent_io_tree *unpin;
@@ -4486,15 +4495,15 @@ again:
			break;
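		/*
		 * Each EXTENT_DIRTY range found here is an extent the
		 * aborted transaction had pinned; clear the bit and hand
		 * the range back to the free-space accounting below.
		 */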
		clear_extent_dirty(unpin, start, end);
-		btrfs_error_unpin_extent_range(root, start, end);
+		btrfs_error_unpin_extent_range(fs_info, start, end);
		cond_resched();
	}

	if (loop) {
-		if (unpin == &root->fs_info->freed_extents[0])
-			unpin = &root->fs_info->freed_extents[1];
+		if (unpin == &fs_info->freed_extents[0])
+			unpin = &fs_info->freed_extents[1];
		else
-			unpin = &root->fs_info->freed_extents[0];
+			unpin = &fs_info->freed_extents[0];
		loop = false;
		goto again;
	}
@@ -4517,7 +4526,7 @@ static void btrfs_cleanup_bg_io(struct btrfs_block_group_cache *cache)
}

void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
-			     struct btrfs_root *root)
+			     struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group_cache *cache;
@@ -4527,8 +4536,7 @@
					 struct btrfs_block_group_cache,
					 dirty_list);
		if (!cache) {
-			btrfs_err(root->fs_info,
-				  "orphan block group dirty_bgs list");
+			btrfs_err(fs_info, "orphan block group dirty_bgs list");
			spin_unlock(&cur_trans->dirty_bgs_lock);
			return;
		}
@@ -4556,8 +4564,7 @@
					 struct btrfs_block_group_cache,
					 io_list);
		if (!cache) {
-			btrfs_err(root->fs_info,
-				  "orphan block group on io_bgs list");
+			btrfs_err(fs_info, "orphan block group on io_bgs list");
			return;
		}
@@ -4570,27 +4577,27 @@
}

void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
-				   struct btrfs_root *root)
+				   struct btrfs_fs_info *fs_info)
{
-	btrfs_cleanup_dirty_bgs(cur_trans, root);
+	btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
	ASSERT(list_empty(&cur_trans->dirty_bgs));
	ASSERT(list_empty(&cur_trans->io_bgs));

-	btrfs_destroy_delayed_refs(cur_trans, root);
+	btrfs_destroy_delayed_refs(cur_trans, fs_info);

	cur_trans->state = TRANS_STATE_COMMIT_START;
-	wake_up(&root->fs_info->transaction_blocked_wait);
+	wake_up(&fs_info->transaction_blocked_wait);

	cur_trans->state = TRANS_STATE_UNBLOCKED;
-	wake_up(&root->fs_info->transaction_wait);
+	wake_up(&fs_info->transaction_wait);

-	btrfs_destroy_delayed_inodes(root);
-	btrfs_assert_delayed_root_empty(root);
+	btrfs_destroy_delayed_inodes(fs_info);
+	btrfs_assert_delayed_root_empty(fs_info);

-	btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages,
+	btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
				     EXTENT_DIRTY);
-	btrfs_destroy_pinned_extent(root,
-				    root->fs_info->pinned_extents);
+	btrfs_destroy_pinned_extent(fs_info,
+				    fs_info->pinned_extents);

	cur_trans->state = TRANS_STATE_COMPLETED;
	wake_up(&cur_trans->commit_wait);
@@ -4601,27 +4608,27 @@
	 */
}

-static int btrfs_cleanup_transaction(struct btrfs_root *root)
+static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info)
{
	struct btrfs_transaction *t;

-	mutex_lock(&root->fs_info->transaction_kthread_mutex);
+	mutex_lock(&fs_info->transaction_kthread_mutex);

-	spin_lock(&root->fs_info->trans_lock);
-	while (!list_empty(&root->fs_info->trans_list)) {
-		t = list_first_entry(&root->fs_info->trans_list,
+	spin_lock(&fs_info->trans_lock);
+	while (!list_empty(&fs_info->trans_list)) {
+		t = list_first_entry(&fs_info->trans_list,
				     struct btrfs_transaction, list);
		if (t->state >= TRANS_STATE_COMMIT_START) {
			atomic_inc(&t->use_count);
-			spin_unlock(&root->fs_info->trans_lock);
-			btrfs_wait_for_commit(root, t->transid);
+			spin_unlock(&fs_info->trans_lock);
+			btrfs_wait_for_commit(fs_info, t->transid);
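			/*
			 * A commit was already underway for this transaction,
			 * so let it finish on its own; only a still-running
			 * transaction is torn down further below.
			 */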
			btrfs_put_transaction(t);
-			spin_lock(&root->fs_info->trans_lock);
+			spin_lock(&fs_info->trans_lock);
			continue;
		}
-		if (t == root->fs_info->running_transaction) {
+		if (t == fs_info->running_transaction) {
			t->state = TRANS_STATE_COMMIT_DOING;
-			spin_unlock(&root->fs_info->trans_lock);
+			spin_unlock(&fs_info->trans_lock);
			/*
			 * We wait for 0 num_writers since we don't hold a trans
			 * handle open currently for this transaction.
@@ -4629,27 +4636,27 @@
			 */
			wait_event(t->writer_wait,
				   atomic_read(&t->num_writers) == 0);
		} else {
-			spin_unlock(&root->fs_info->trans_lock);
+			spin_unlock(&fs_info->trans_lock);
		}
-		btrfs_cleanup_one_transaction(t, root);
+		btrfs_cleanup_one_transaction(t, fs_info);

-		spin_lock(&root->fs_info->trans_lock);
-		if (t == root->fs_info->running_transaction)
-			root->fs_info->running_transaction = NULL;
+		spin_lock(&fs_info->trans_lock);
+		if (t == fs_info->running_transaction)
+			fs_info->running_transaction = NULL;
		list_del_init(&t->list);
-		spin_unlock(&root->fs_info->trans_lock);
+		spin_unlock(&fs_info->trans_lock);

		btrfs_put_transaction(t);
-		trace_btrfs_transaction_commit(root);
-		spin_lock(&root->fs_info->trans_lock);
-	}
-	spin_unlock(&root->fs_info->trans_lock);
-	btrfs_destroy_all_ordered_extents(root->fs_info);
-	btrfs_destroy_delayed_inodes(root);
-	btrfs_assert_delayed_root_empty(root);
-	btrfs_destroy_pinned_extent(root, root->fs_info->pinned_extents);
-	btrfs_destroy_all_delalloc_inodes(root->fs_info);
-	mutex_unlock(&root->fs_info->transaction_kthread_mutex);
+		trace_btrfs_transaction_commit(fs_info->tree_root);
+		spin_lock(&fs_info->trans_lock);
+	}
+	spin_unlock(&fs_info->trans_lock);
+	btrfs_destroy_all_ordered_extents(fs_info);
+	btrfs_destroy_delayed_inodes(fs_info);
+	btrfs_assert_delayed_root_empty(fs_info);
+	btrfs_destroy_pinned_extent(fs_info, fs_info->pinned_extents);
+	btrfs_destroy_all_delalloc_inodes(fs_info);
+	mutex_unlock(&fs_info->transaction_kthread_mutex);

	return 0;
}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 1a3237e5700f..44dcd9af6b7c 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -44,27 +44,26 @@ static inline u64 btrfs_sb_offset(int mirror)
struct btrfs_device;
struct btrfs_fs_devices;

-struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
-				      u64 parent_transid);
-void readahead_tree_block(struct btrfs_root *root, u64 bytenr);
-int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
+struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info,
+				      u64 bytenr, u64 parent_transid);
+void readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr);
+int reada_tree_block_flagged(struct btrfs_fs_info *fs_info, u64 bytenr,
			     int mirror_num, struct extent_buffer **eb);
-struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
-						   u64 bytenr);
+struct extent_buffer *btrfs_find_create_tree_block(
+						struct btrfs_fs_info *fs_info,
+						u64 bytenr);
void clean_tree_block(struct btrfs_trans_handle *trans,
		      struct btrfs_fs_info *fs_info, struct extent_buffer *buf);
int open_ctree(struct super_block *sb,
	       struct btrfs_fs_devices *fs_devices,
	       char *options);
-void close_ctree(struct btrfs_root *root);
+void close_ctree(struct btrfs_fs_info *fs_info);
int write_ctree_super(struct btrfs_trans_handle *trans,
-		      struct btrfs_root *root, int max_mirrors);
+		      struct btrfs_fs_info *fs_info, int max_mirrors);
struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
			     struct buffer_head **bh_ret);
-int btrfs_commit_super(struct btrfs_root *root);
-struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
-					    u64 bytenr);
+int btrfs_commit_super(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
				      struct btrfs_key *location);
int btrfs_init_fs_root(struct btrfs_root *root);
@@ -85,15 +84,14 @@ btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
}

int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
-void btrfs_btree_balance_dirty(struct btrfs_root *root);
-void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
+void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info);
+void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info);
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
				 struct btrfs_root *root);
void btrfs_free_fs_root(struct btrfs_root *root);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info,
-					  u32 sectorsize, u32 nodesize);
+struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
#endif

/*
@@ -121,7 +119,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
			  int atomic);
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
u32 btrfs_csum_data(char *data, u32 seed, size_t len);
-void btrfs_csum_final(u32 crc, char *result);
+void btrfs_csum_final(u32 crc, u8 *result);
int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
			enum btrfs_wq_endio_type metadata);
int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
@@ -137,9 +135,9 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root);
void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *trans,
-			     struct btrfs_root *root);
+			     struct btrfs_fs_info *fs_info);
void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
-				   struct btrfs_root *root);
+				   struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
				     struct btrfs_fs_info *fs_info,
				     u64 objectid);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 2513a7f53334..340d90751263 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -153,6 +153,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
static struct dentry *btrfs_get_parent(struct dentry *child)
{
	struct inode *dir = d_inode(child);
+	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
@@ -169,7 +170,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
		key.objectid = root->root_key.objectid;
		key.type = BTRFS_ROOT_BACKREF_KEY;
		key.offset = (u64)-1;
-		root = root->fs_info->tree_root;
+		root = fs_info->tree_root;
	} else {
		key.objectid = btrfs_ino(dir);
		key.type = BTRFS_INODE_REF_KEY;
@@ -205,13 +206,13 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
	btrfs_free_path(path);

	if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
-		return btrfs_get_dentry(root->fs_info->sb, key.objectid,
+		return btrfs_get_dentry(fs_info->sb, key.objectid,
					found_key.offset, 0, 0);
	}

	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
-	return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
+	return d_obtain_alias(btrfs_iget(fs_info->sb, &key, root, NULL));
fail:
	btrfs_free_path(path);
	return ERR_PTR(ret);
@@ -222,6 +223,7 @@ static int btrfs_get_name(struct dentry *parent, char *name,
{
	struct inode *inode = d_inode(child);
	struct inode *dir = d_inode(parent);
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_path *path;
	struct btrfs_root *root = BTRFS_I(dir)->root;
	struct btrfs_inode_ref *iref;
@@ -250,7 +252,7 @@ static int btrfs_get_name(struct dentry *parent, char *name,
		key.objectid = BTRFS_I(inode)->root->root_key.objectid;
		key.type = BTRFS_ROOT_BACKREF_KEY;
		key.offset = (u64)-1;
-		root = root->fs_info->tree_root;
+		root = fs_info->tree_root;
	} else {
		key.objectid = ino;
		key.offset = btrfs_ino(dir);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4607af38c72e..dcd2e798767e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -61,10 +61,10 @@ enum {
};

static int update_block_group(struct btrfs_trans_handle *trans,
-			      struct btrfs_root *root, u64 bytenr,
+			      struct btrfs_fs_info *fs_info, u64 bytenr,
			      u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root,
+			       struct btrfs_fs_info *fs_info,
			       struct btrfs_delayed_ref_node *node, u64 parent,
			       u64 root_objectid, u64 owner_objectid,
			       u64 owner_offset, int refs_to_drop,
@@ -73,17 +73,17 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
				    struct extent_buffer *leaf,
				    struct btrfs_extent_item *ei);
static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
-				      struct btrfs_root *root,
+				      struct btrfs_fs_info *fs_info,
				      u64 parent, u64 root_objectid,
				      u64 flags, u64 owner, u64 offset,
				      struct btrfs_key *ins, int ref_mod);
static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
-				     struct btrfs_root *root,
+				     struct btrfs_fs_info *fs_info,
				     u64 parent, u64 root_objectid,
				     u64 flags, struct btrfs_disk_key *key,
				     int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *extent_root, u64 flags,
+			  struct btrfs_fs_info *fs_info, u64 flags,
			  int force);
static int find_next_key(struct btrfs_path *path, int level,
			 struct btrfs_key *key);
@@ -96,8 +96,6 @@ static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
				     u64 num_bytes, int delalloc);
static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
			       u64 num_bytes);
-int btrfs_pin_extent(struct btrfs_root *root,
-		     u64 bytenr, u64 num_bytes, int reserved);
static int __reserve_metadata_bytes(struct btrfs_root *root,
				    struct btrfs_space_info *space_info,
				    u64 orig_bytes,
@@ -223,18 +221,18 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
	return ret;
}

-static int add_excluded_extent(struct btrfs_root *root,
+static int add_excluded_extent(struct btrfs_fs_info *fs_info,
			       u64 start, u64 num_bytes)
{
	u64 end = start + num_bytes - 1;
-	set_extent_bits(&root->fs_info->freed_extents[0],
+	set_extent_bits(&fs_info->freed_extents[0],
			start, end, EXTENT_UPTODATE);
-	set_extent_bits(&root->fs_info->freed_extents[1],
+	set_extent_bits(&fs_info->freed_extents[1],
			start, end, EXTENT_UPTODATE);
	return 0;
}

-static void free_excluded_extents(struct btrfs_root *root,
+static void free_excluded_extents(struct btrfs_fs_info *fs_info,
				  struct btrfs_block_group_cache *cache)
{
	u64 start, end;
@@ -242,13 +240,13 @@
	start = cache->key.objectid;
	end = start + cache->key.offset - 1;

-	clear_extent_bits(&root->fs_info->freed_extents[0],
+	clear_extent_bits(&fs_info->freed_extents[0],
			  start, end, EXTENT_UPTODATE);
-	clear_extent_bits(&root->fs_info->freed_extents[1],
+	clear_extent_bits(&fs_info->freed_extents[1],
			  start, end, EXTENT_UPTODATE);
}

-static int exclude_super_stripes(struct btrfs_root *root,
+static int exclude_super_stripes(struct btrfs_fs_info *fs_info,
				 struct btrfs_block_group_cache *cache)
{
	u64 bytenr;
@@ -259,7 +257,7 @@
	if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
		stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
		cache->bytes_super += stripe_len;
-		ret = add_excluded_extent(root, cache->key.objectid,
+		ret = add_excluded_extent(fs_info, cache->key.objectid,
					  stripe_len);
		if (ret)
			return ret;
@@ -267,7 +265,7 @@
	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		bytenr = btrfs_sb_offset(i);
-		ret = btrfs_rmap_block(root->fs_info, cache->key.objectid,
+		ret = btrfs_rmap_block(fs_info, cache->key.objectid,
				       bytenr, 0, &logical, &nr, &stripe_len);
		if (ret)
			return ret;
@@ -293,7 +291,7 @@
			}

			cache->bytes_super += len;
-			ret = add_excluded_extent(root, start, len);
+			ret = add_excluded_extent(fs_info, start, len);
			if (ret) {
				kfree(logical);
				return ret;
@@ -329,13 +327,13 @@ static void put_caching_control(struct btrfs_caching_control *ctl)
}

#ifdef CONFIG_BTRFS_DEBUG
-static void fragment_free_space(struct btrfs_root *root,
-				struct btrfs_block_group_cache *block_group)
+static void fragment_free_space(struct btrfs_block_group_cache *block_group)
{
+	struct btrfs_fs_info *fs_info = block_group->fs_info;
	u64 start = block_group->key.objectid;
	u64 len = block_group->key.offset;
	u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
-		root->nodesize : root->sectorsize;
+		fs_info->nodesize : fs_info->sectorsize;
	u64 step = chunk << 1;

	while (len > chunk) {
@@ -394,9 +392,9 @@ u64 add_new_free_space(struct btrfs_block_group_cache *block_group,

static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
-	struct btrfs_block_group_cache *block_group;
-	struct btrfs_fs_info *fs_info;
-	struct btrfs_root *extent_root;
+	struct btrfs_block_group_cache *block_group = caching_ctl->block_group;
+	struct btrfs_fs_info *fs_info = block_group->fs_info;
+	struct btrfs_root *extent_root = fs_info->extent_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
@@ -406,10 +404,6 @@
	int ret;
	bool wakeup = true;

-	block_group = caching_ctl->block_group;
-	fs_info = block_group->fs_info;
-	extent_root = fs_info->extent_root;
-
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
@@ -422,7 +416,7 @@
	 * allocate from this block group until we've had a chance to fragment
	 * the free space.
	 */
-	if (btrfs_should_fragment_free_space(extent_root, block_group))
+	if (btrfs_should_fragment_free_space(block_group))
		wakeup = false;
#endif
	/*
@@ -510,7 +504,7 @@ next:
					    key.objectid);
		if (key.type == BTRFS_METADATA_ITEM_KEY)
			last = key.objectid +
-				fs_info->tree_root->nodesize;
+				fs_info->nodesize;
		else
			last = key.objectid + key.offset;
@@ -561,7 +555,7 @@ static noinline void caching_thread(struct btrfs_work *work)
	spin_unlock(&block_group->lock);

#ifdef CONFIG_BTRFS_DEBUG
-	if (btrfs_should_fragment_free_space(extent_root, block_group)) {
+	if (btrfs_should_fragment_free_space(block_group)) {
		u64 bytes_used;

		spin_lock(&block_group->space_info->lock);
@@ -571,14 +565,14 @@
		block_group->space_info->bytes_used += bytes_used >> 1;
		spin_unlock(&block_group->lock);
		spin_unlock(&block_group->space_info->lock);
-		fragment_free_space(extent_root, block_group);
+		fragment_free_space(block_group);
	}
#endif

	caching_ctl->progress = (u64)-1;

	up_read(&fs_info->commit_root_sem);
-	free_excluded_extents(fs_info->extent_root, block_group);
+	free_excluded_extents(fs_info, block_group);
	mutex_unlock(&caching_ctl->mutex);

	wake_up(&caching_ctl->wait);
@@ -668,8 +662,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
		spin_unlock(&cache->lock);
#ifdef CONFIG_BTRFS_DEBUG
		if (ret == 1 &&
-		    btrfs_should_fragment_free_space(fs_info->extent_root,
-						     cache)) {
+		    btrfs_should_fragment_free_space(cache)) {
			u64 bytes_used;

			spin_lock(&cache->space_info->lock);
@@ -679,7 +672,7 @@
			cache->space_info->bytes_used += bytes_used >> 1;
			spin_unlock(&cache->lock);
			spin_unlock(&cache->space_info->lock);
-			fragment_free_space(fs_info->extent_root, cache);
+			fragment_free_space(cache);
		}
#endif
		mutex_unlock(&caching_ctl->mutex);
@@ -687,7 +680,7 @@
		wake_up(&caching_ctl->wait);
		if (ret == 1) {
			put_caching_control(caching_ctl);
-			free_excluded_extents(fs_info->extent_root, cache);
+			free_excluded_extents(fs_info, cache);
			return 0;
		}
	} else {
@@ -778,7 +771,7 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
}

/* simple helper to search for an existing data extent at a given offset */
-int btrfs_lookup_data_extent(struct btrfs_root *root, u64 start, u64 len)
+int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
{
	int ret;
	struct btrfs_key key;
@@ -791,8 +784,7 @@
	key.objectid = start;
	key.offset = len;
	key.type = BTRFS_EXTENT_ITEM_KEY;
-	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
-				0, 0);
+	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
	btrfs_free_path(path);
	return ret;
}
@@ -807,7 +799,7 @@
 * the delayed refs are not processed.
 */
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root, u64 bytenr,
+			     struct btrfs_fs_info *fs_info, u64 bytenr,
			     u64 offset, int metadata, u64 *refs, u64 *flags)
{
	struct btrfs_delayed_ref_head *head;
@@ -825,8 +817,8 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
	 * If we don't have skinny metadata, don't bother doing anything
	 * different
	 */
-	if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
-		offset = root->nodesize;
+	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
+		offset = fs_info->nodesize;
		metadata = 0;
	}
@@ -847,8 +839,7 @@ search_again:
	else
		key.type = BTRFS_EXTENT_ITEM_KEY;

-	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
-				&key, path, 0, 0);
+	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out_free;
@@ -859,7 +850,7 @@
					      path->slots[0]);
			if (key.objectid == bytenr &&
			    key.type == BTRFS_EXTENT_ITEM_KEY &&
-			    key.offset == root->nodesize)
+			    key.offset == fs_info->nodesize)
				ret = 0;
		}
	}
@@ -1101,7 +1092,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
		return ret;
	BUG_ON(ret); /* Corruption */

-	btrfs_extend_item(root, path, new_size);
+	btrfs_extend_item(root->fs_info, path, new_size);

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
@@ -1114,7 +1105,7 @@
				      BTRFS_BLOCK_FLAG_FULL_BACKREF);
		bi = (struct btrfs_tree_block_info *)(item + 1);
		/* FIXME: get first key of the block */
-		memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
+		memzero_extent_buffer(leaf, (unsigned long)bi, sizeof(*bi));
		btrfs_set_tree_block_level(leaf, bi, (int)owner);
	} else {
		btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
@@ -1540,6 +1531,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
				 u64 parent, u64 root_objectid,
				 u64 owner, u64 offset, int insert)
{
+	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_extent_item *ei;
@@ -1553,8 +1545,7 @@
	int want;
	int ret;
	int err = 0;
-	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
-						 SKINNY_METADATA);
+	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);

	key.objectid = bytenr;
	key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -1748,7 +1739,7 @@ void setup_inline_extent_backref(struct btrfs_root *root,
	type = extent_ref_type(parent, owner);
	size = btrfs_extent_inline_ref_size(type);

-	btrfs_extend_item(root, path, size);
+	btrfs_extend_item(root->fs_info, path, size);

	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
	refs = btrfs_extent_refs(leaf, ei);
@@ -1875,7 +1866,7 @@ void update_inline_extent_backref(struct btrfs_root *root,
			memmove_extent_buffer(leaf, ptr, ptr + size,
					      end - ptr - size);
		item_size -= size;
-		btrfs_truncate_item(root, path, item_size, 1);
+		btrfs_truncate_item(root->fs_info, path, item_size, 1);
	}
	btrfs_mark_buffer_dirty(leaf);
}
@@ -2022,7 +2013,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
	return ret;
}

-int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
+int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
			 u64 num_bytes, u64 *actual_bytes)
{
	int ret;
@@ -2034,10 +2025,10 @@
devices * associated to its stripes that don't go away while we are discarding. */ - btrfs_bio_counter_inc_blocked(root->fs_info); + btrfs_bio_counter_inc_blocked(fs_info); /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(root->fs_info, REQ_OP_DISCARD, - bytenr, &num_bytes, &bbio, 0); + ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes, + &bbio, 0); /* Error condition is -ENOMEM */ if (!ret) { struct btrfs_bio_stripe *stripe = bbio->stripes; @@ -2067,7 +2058,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, } btrfs_put_bbio(bbio); } - btrfs_bio_counter_dec(root->fs_info); + btrfs_bio_counter_dec(fs_info); if (actual_bytes) *actual_bytes = discarded_bytes; @@ -2080,12 +2071,11 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, /* Can return -ENOMEM */ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset) { int ret; - struct btrfs_fs_info *fs_info = root->fs_info; BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID && root_objectid == BTRFS_TREE_LOG_OBJECTID); @@ -2105,13 +2095,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, } static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner, u64 offset, int refs_to_add, struct btrfs_delayed_extent_op *extent_op) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_extent_item *item; @@ -2154,7 +2143,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, path->reada = READA_FORWARD; path->leave_spinning = 1; /* now insert the actual backref */ - ret = insert_extent_backref(trans, root->fs_info->extent_root, + ret = insert_extent_backref(trans, fs_info->extent_root, path, bytenr, parent, root_objectid, owner, offset, refs_to_add); if (ret) @@ -2165,7 +2154,7 @@ out: } static int run_delayed_data_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, int insert_reserved) @@ -2182,7 +2171,7 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, ins.type = BTRFS_EXTENT_ITEM_KEY; ref = btrfs_delayed_node_to_data_ref(node); - trace_run_delayed_data_ref(root->fs_info, node, ref, node->action); + trace_run_delayed_data_ref(fs_info, node, ref, node->action); if (node->type == BTRFS_SHARED_DATA_REF_KEY) parent = ref->parent; @@ -2191,17 +2180,17 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { if (extent_op) flags |= extent_op->flags_to_set; - ret = alloc_reserved_file_extent(trans, root, + ret = alloc_reserved_file_extent(trans, fs_info, parent, ref_root, flags, ref->objectid, ref->offset, &ins, node->ref_mod); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, root, node, parent, + ret = __btrfs_inc_extent_ref(trans, fs_info, node, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, root, node, parent, + ret = __btrfs_free_extent(trans, fs_info, node, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, extent_op); @@ 
@@ -2230,7 +2219,7 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
 }
 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *root,
+				 struct btrfs_fs_info *fs_info,
 				 struct btrfs_delayed_ref_node *node,
 				 struct btrfs_delayed_extent_op *extent_op)
 {
@@ -2246,7 +2235,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
 	if (trans->aborted)
 		return 0;
-	if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
+	if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
 		metadata = 0;
 	path = btrfs_alloc_path();
@@ -2266,8 +2255,7 @@ again:
 	path->reada = READA_FORWARD;
 	path->leave_spinning = 1;
-	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
-				path, 0, 1);
+	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
 	if (ret < 0) {
 		err = ret;
 		goto out;
@@ -2302,7 +2290,7 @@ again:
 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 	if (item_size < sizeof(*ei)) {
-		ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
+		ret = convert_extent_item_v0(trans, fs_info->extent_root,
 					     path, (u64)-1, 0);
 		if (ret < 0) {
 			err = ret;
@@ -2323,7 +2311,7 @@ out:
 }
 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
+				struct btrfs_fs_info *fs_info,
 				struct btrfs_delayed_ref_node *node,
 				struct btrfs_delayed_extent_op *extent_op,
 				int insert_reserved)
@@ -2333,11 +2321,10 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
 	struct btrfs_key ins;
 	u64 parent = 0;
 	u64 ref_root = 0;
-	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
-						 SKINNY_METADATA);
+	bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
 	ref = btrfs_delayed_node_to_tree_ref(node);
-	trace_run_delayed_tree_ref(root->fs_info, node, ref, node->action);
+	trace_run_delayed_tree_ref(fs_info, node, ref, node->action);
 	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
 		parent = ref->parent;
@@ -2353,7 +2340,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
 	}
 	if (node->ref_mod != 1) {
-		btrfs_err(root->fs_info,
+		btrfs_err(fs_info,
 	"btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
 			  node->bytenr, node->ref_mod, node->action, ref_root,
 			  parent);
@@ -2361,18 +2348,18 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
 	}
 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
 		BUG_ON(!extent_op || !extent_op->update_flags);
-		ret = alloc_reserved_tree_block(trans, root,
+		ret = alloc_reserved_tree_block(trans, fs_info,
 						parent, ref_root,
 						extent_op->flags_to_set,
 						&extent_op->key,
 						ref->level, &ins);
 	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
-		ret = __btrfs_inc_extent_ref(trans, root, node,
+		ret = __btrfs_inc_extent_ref(trans, fs_info, node,
 					     parent, ref_root,
 					     ref->level, 0, 1,
 					     extent_op);
 	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
-		ret = __btrfs_free_extent(trans, root, node,
+		ret = __btrfs_free_extent(trans, fs_info, node,
 					  parent, ref_root,
 					  ref->level, 0, 1, extent_op);
 	} else {
@@ -2383,7 +2370,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
 /* helper function to actually process a single delayed ref entry */
 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root,
+			       struct btrfs_fs_info *fs_info,
 			       struct btrfs_delayed_ref_node *node,
 			       struct btrfs_delayed_extent_op *extent_op,
 			       int insert_reserved)
@@ -2392,7 +2379,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
 	if (trans->aborted) {
 		if (insert_reserved)
-			btrfs_pin_extent(root, node->bytenr,
+			btrfs_pin_extent(fs_info, node->bytenr,
 					 node->num_bytes, 1);
 		return 0;
 	}
@@ -2407,33 +2394,31 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
 		 */
 		BUG_ON(extent_op);
 		head = btrfs_delayed_node_to_head(node);
-		trace_run_delayed_ref_head(root->fs_info, node, head,
-					   node->action);
+		trace_run_delayed_ref_head(fs_info, node, head, node->action);
 		if (insert_reserved) {
-			btrfs_pin_extent(root, node->bytenr,
+			btrfs_pin_extent(fs_info, node->bytenr,
 					 node->num_bytes, 1);
 			if (head->is_data) {
-				ret = btrfs_del_csums(trans, root,
+				ret = btrfs_del_csums(trans, fs_info,
 						      node->bytenr,
 						      node->num_bytes);
 			}
 		}
 		/* Also free its reserved qgroup space */
-		btrfs_qgroup_free_delayed_ref(root->fs_info,
-					      head->qgroup_ref_root,
+		btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
 					      head->qgroup_reserved);
 		return ret;
 	}
 	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
 	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
-		ret = run_delayed_tree_ref(trans, root, node, extent_op,
+		ret = run_delayed_tree_ref(trans, fs_info, node, extent_op,
 					   insert_reserved);
 	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
 		 node->type == BTRFS_SHARED_DATA_REF_KEY)
-		ret = run_delayed_data_ref(trans, root, node, extent_op,
+		ret = run_delayed_data_ref(trans, fs_info, node, extent_op,
 					   insert_reserved);
 	else
 		BUG();
@@ -2454,13 +2439,14 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
 	 * the extent item from the extent tree, when there still are references
 	 * to add, which would fail because they would not find the extent item.
 	 */
-	list_for_each_entry(ref, &head->ref_list, list) {
-		if (ref->action == BTRFS_ADD_DELAYED_REF)
-			return ref;
-	}
+	if (!list_empty(&head->ref_add_list))
+		return list_first_entry(&head->ref_add_list,
+				struct btrfs_delayed_ref_node, add_list);
-	return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node,
-			  list);
+	ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node,
+			       list);
+	ASSERT(list_empty(&ref->add_list));
+	return ref;
 }
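The select_delayed_ref() rework above replaces a linear scan of head->ref_list looking for an add with a dedicated ref_add_list, so the first pending addition is found in O(1). A self-contained sketch of the two-list idea with a hand-rolled intrusive list (the list helpers below are simplified stand-ins for the kernel's list_head API):

    #include <stdio.h>
    #include <stddef.h>

    struct list_head { struct list_head *next, *prev; };

    static void list_init(struct list_head *h) { h->next = h->prev = h; }
    static int  list_empty(const struct list_head *h) { return h->next == h; }
    static void list_add_tail(struct list_head *n, struct list_head *h)
    {
        n->prev = h->prev; n->next = h;
        h->prev->next = n; h->prev = n;
    }
    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct ref {
        int action;                /* 0 = drop, 1 = add */
        struct list_head list;     /* on the head's ref_list */
        struct list_head add_list; /* also on ref_add_list when action == add */
    };

    int main(void)
    {
        struct list_head ref_list, ref_add_list;
        struct ref drop = { .action = 0 }, add = { .action = 1 };

        list_init(&ref_list); list_init(&ref_add_list);
        list_init(&drop.add_list); list_init(&add.add_list);
        list_add_tail(&drop.list, &ref_list);
        list_add_tail(&add.list, &ref_list);
        list_add_tail(&add.add_list, &ref_add_list); /* adds tracked separately */

        /* Select adds first, in O(1), instead of scanning ref_list. */
        struct ref *r = !list_empty(&ref_add_list)
            ? container_of(ref_add_list.next, struct ref, add_list)
            : container_of(ref_list.next, struct ref, list);
        printf("selected action=%d\n", r->action); /* prints 1 */
        return 0;
    }

The price is the extra bookkeeping visible later in the hunk: whenever a ref leaves ref_list, it must also be unlinked from add_list.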
 /*
@@ -2468,14 +2454,13 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
  */
 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
-					     struct btrfs_root *root,
+					     struct btrfs_fs_info *fs_info,
 					     unsigned long nr)
 {
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_delayed_ref_node *ref;
 	struct btrfs_delayed_ref_head *locked_ref = NULL;
 	struct btrfs_delayed_extent_op *extent_op;
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	ktime_t start = ktime_get();
 	int ret;
 	unsigned long count = 0;
@@ -2537,11 +2522,11 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 		if (ref && ref->seq &&
 		    btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
 			spin_unlock(&locked_ref->lock);
-			btrfs_delayed_ref_unlock(locked_ref);
 			spin_lock(&delayed_refs->lock);
 			locked_ref->processing = 0;
 			delayed_refs->num_heads_ready++;
 			spin_unlock(&delayed_refs->lock);
+			btrfs_delayed_ref_unlock(locked_ref);
 			locked_ref = NULL;
 			cond_resched();
 			count++;
@@ -2574,7 +2559,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			if (extent_op) {
 				spin_unlock(&locked_ref->lock);
-				ret = run_delayed_extent_op(trans, root,
+				ret = run_delayed_extent_op(trans, fs_info,
 							    ref, extent_op);
 				btrfs_free_delayed_extent_op(extent_op);
@@ -2587,7 +2572,10 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 					 */
 					if (must_insert_reserved)
 						locked_ref->must_insert_reserved = 1;
+					spin_lock(&delayed_refs->lock);
 					locked_ref->processing = 0;
+					delayed_refs->num_heads_ready++;
+					spin_unlock(&delayed_refs->lock);
 					btrfs_debug(fs_info,
 						    "run_delayed_extent_op returned %d",
 						    ret);
@@ -2620,6 +2608,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			actual_count++;
 			ref->in_tree = 0;
 			list_del(&ref->list);
+			if (!list_empty(&ref->add_list))
+				list_del(&ref->add_list);
 		}
 		atomic_dec(&delayed_refs->num_entries);
@@ -2642,7 +2632,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 		}
 		spin_unlock(&locked_ref->lock);
-		ret = run_one_delayed_ref(trans, root, ref, extent_op,
+		ret = run_one_delayed_ref(trans, fs_info, ref, extent_op,
 					  must_insert_reserved);
 		btrfs_free_delayed_extent_op(extent_op);
@@ -2743,43 +2733,43 @@ static u64 find_middle(struct rb_root *root)
 }
 #endif
-static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
+static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
 {
 	u64 num_bytes;
 	num_bytes = heads * (sizeof(struct btrfs_extent_item) +
 			     sizeof(struct btrfs_extent_inline_ref));
-	if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
+	if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
 		num_bytes += heads * sizeof(struct btrfs_tree_block_info);
 	/*
 	 * We don't ever fill up leaves all the way so multiply by 2 just to be
 	 * closer to what we're really going to want to use.
 	 */
-	return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
+	return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
 }
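heads_to_leaves() above estimates the leaf space the pending ref heads may dirty: one extent item plus one inline ref per head, plus a tree_block_info when the filesystem lacks the SKINNY_METADATA incompat bit, divided by the usable leaf size; the caller then doubles the byte total because leaves are never packed full. A worked example with plausible sizes (all constants below are illustrative assumptions, not quotes of the on-disk format headers):

    #include <stdio.h>
    #include <inttypes.h>

    int main(void)
    {
        /* Illustrative item sizes; the real ones come from ctree.h. */
        const uint64_t extent_item = 24, inline_ref = 9, tree_block_info = 24;
        const uint64_t leaf_data_size = 16283; /* ~16 KiB leaf minus header */
        const uint64_t heads = 1000;
        int skinny_metadata = 0;

        uint64_t num_bytes = heads * (extent_item + inline_ref);
        if (!skinny_metadata)
            num_bytes += heads * tree_block_info;
        /* the caller later shifts the total left by one to pad the estimate */
        printf("%" PRIu64 " leaves\n", num_bytes / leaf_data_size);
        return 0;
    }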
 /*
  * Takes the number of bytes to be csumm'ed and figures out how many leaves it
  * would require to store the csums for that many bytes.
  */
-u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes)
+u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
 {
 	u64 csum_size;
 	u64 num_csums_per_leaf;
 	u64 num_csums;
-	csum_size = BTRFS_MAX_ITEM_SIZE(root);
+	csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
 	num_csums_per_leaf = div64_u64(csum_size,
-			(u64)btrfs_super_csum_size(root->fs_info->super_copy));
-	num_csums = div64_u64(csum_bytes, root->sectorsize);
+			(u64)btrfs_super_csum_size(fs_info->super_copy));
+	num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
 	num_csums += num_csums_per_leaf - 1;
 	num_csums = div64_u64(num_csums, num_csums_per_leaf);
 	return num_csums;
 }
 int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
-				       struct btrfs_root *root)
+				       struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_block_rsv *global_rsv;
 	u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
@@ -2788,15 +2778,16 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
 	u64 num_bytes, num_dirty_bgs_bytes;
 	int ret = 0;
-	num_bytes = btrfs_calc_trans_metadata_size(root, 1);
-	num_heads = heads_to_leaves(root, num_heads);
+	num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
+	num_heads = heads_to_leaves(fs_info, num_heads);
 	if (num_heads > 1)
-		num_bytes += (num_heads - 1) * root->nodesize;
+		num_bytes += (num_heads - 1) * fs_info->nodesize;
 	num_bytes <<= 1;
-	num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize;
-	num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root,
+	num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) *
+		     fs_info->nodesize;
+	num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info,
 							     num_dirty_bgs);
-	global_rsv = &root->fs_info->global_block_rsv;
+	global_rsv = &fs_info->global_block_rsv;
 	/*
 	 * If we can't allocate any more chunks lets make sure we have _lots_ of
@@ -2815,9 +2806,8 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
 }
 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
-				       struct btrfs_root *root)
+				       struct btrfs_fs_info *fs_info)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	u64 num_entries =
 		atomic_read(&trans->transaction->delayed_refs.num_entries);
 	u64 avg_runtime;
@@ -2826,12 +2816,12 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
 	smp_mb();
 	avg_runtime = fs_info->avg_delayed_ref_runtime;
 	val = num_entries * avg_runtime;
-	if (num_entries * avg_runtime >= NSEC_PER_SEC)
+	if (val >= NSEC_PER_SEC)
 		return 1;
 	if (val >= NSEC_PER_SEC / 2)
 		return 2;
-	return btrfs_check_space_for_delayed_refs(trans, root);
+	return btrfs_check_space_for_delayed_refs(trans, fs_info);
 }
 struct async_delayed_refs {
@@ -2844,16 +2834,21 @@ struct async_delayed_refs {
 	struct btrfs_work work;
 };
+static inline struct async_delayed_refs *
+to_async_delayed_refs(struct btrfs_work *work)
+{
+	return container_of(work, struct async_delayed_refs, work);
+}
+
 static void delayed_ref_async_start(struct btrfs_work *work)
 {
-	struct async_delayed_refs *async;
+	struct async_delayed_refs *async = to_async_delayed_refs(work);
 	struct btrfs_trans_handle *trans;
+	struct btrfs_fs_info *fs_info = async->root->fs_info;
 	int ret;
-	async = container_of(work, struct async_delayed_refs, work);
-
 	/* if the commit is already started, we don't need to wait here */
-	if (btrfs_transaction_blocked(async->root->fs_info))
+	if (btrfs_transaction_blocked(fs_info))
 		goto done;
 	trans = btrfs_join_transaction(async->root);
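btrfs_csum_bytes_to_leaves() above is a round-up division in two steps: bytes are converted to per-sector checksums, and those are then packed into leaves at num_csums_per_leaf apiece. A worked userspace version (the 4 KiB sector, 4-byte csum and ~16 KiB max item size are assumptions for the example, not quotes of the real values):

    #include <stdio.h>
    #include <inttypes.h>

    int main(void)
    {
        const uint64_t csum_size = 16283;     /* max item size, illustrative */
        const uint64_t csum_item = 4;         /* bytes per csum (crc32c-like) */
        const uint64_t sectorsize = 4096;
        const uint64_t csum_bytes = 1ULL << 30; /* 1 GiB of data to checksum */

        uint64_t per_leaf = csum_size / csum_item;        /* csums per leaf */
        uint64_t num_csums = csum_bytes / sectorsize;     /* one per sector */
        uint64_t leaves = (num_csums + per_leaf - 1) / per_leaf; /* round up */
        printf("%" PRIu64 " csums -> %" PRIu64 " leaves\n", num_csums, leaves);
        return 0;
    }

The throttle helper just after it applies a similar estimate in the time domain: with num_entries pending refs at avg_runtime nanoseconds each, it returns 1 once the projected work reaches a full second and 2 at half a second, before falling back to the space check.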
@@ -2872,11 +2867,11 @@ static void delayed_ref_async_start(struct btrfs_work *work)
 	if (trans->transid > async->transid)
 		goto end;
-	ret = btrfs_run_delayed_refs(trans, async->root, async->count);
+	ret = btrfs_run_delayed_refs(trans, fs_info, async->count);
 	if (ret)
 		async->error = ret;
 end:
-	ret = btrfs_end_transaction(trans, async->root);
+	ret = btrfs_end_transaction(trans);
 	if (ret && !async->error)
 		async->error = ret;
 done:
@@ -2886,7 +2881,7 @@ done:
 	kfree(async);
 }
-int btrfs_async_run_delayed_refs(struct btrfs_root *root,
+int btrfs_async_run_delayed_refs(struct btrfs_fs_info *fs_info,
 				 unsigned long count, u64 transid, int wait)
 {
 	struct async_delayed_refs *async;
@@ -2896,7 +2891,7 @@ int btrfs_async_run_delayed_refs(struct btrfs_root *root,
 	if (!async)
 		return -ENOMEM;
-	async->root = root->fs_info->tree_root;
+	async->root = fs_info->tree_root;
 	async->count = count;
 	async->error = 0;
 	async->transid = transid;
@@ -2909,7 +2904,7 @@ int btrfs_async_run_delayed_refs(struct btrfs_root *root,
 	btrfs_init_work(&async->work, btrfs_extent_refs_helper,
 			delayed_ref_async_start, NULL, NULL);
-	btrfs_queue_work(root->fs_info->extent_workers, &async->work);
+	btrfs_queue_work(fs_info->extent_workers, &async->work);
 	if (wait) {
 		wait_for_completion(&async->wait);
@@ -2931,7 +2926,7 @@ int btrfs_async_run_delayed_refs(struct btrfs_root *root,
  * Returns <0 on error and aborts the transaction
  */
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, unsigned long count)
+			   struct btrfs_fs_info *fs_info, unsigned long count)
 {
 	struct rb_node *node;
 	struct btrfs_delayed_ref_root *delayed_refs;
@@ -2944,12 +2939,9 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 	if (trans->aborted)
 		return 0;
-	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &root->fs_info->flags))
+	if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
 		return 0;
-	if (root == root->fs_info->extent_root)
-		root = root->fs_info->tree_root;
-
 	delayed_refs = &trans->transaction->delayed_refs;
 	if (count == 0)
 		count = atomic_read(&delayed_refs->num_entries) * 2;
@@ -2959,7 +2951,7 @@ again:
 	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
 #endif
 	trans->can_flush_pending_bgs = false;
-	ret = __btrfs_run_delayed_refs(trans, root, count);
+	ret = __btrfs_run_delayed_refs(trans, fs_info, count);
 	if (ret < 0) {
 		btrfs_abort_transaction(trans, ret);
 		return ret;
@@ -2967,7 +2959,7 @@ again:
 	if (run_all) {
 		if (!list_empty(&trans->new_bgs))
-			btrfs_create_pending_block_groups(trans, root);
+			btrfs_create_pending_block_groups(trans, fs_info);
 		spin_lock(&delayed_refs->lock);
 		node = rb_first(&delayed_refs->href_root);
@@ -3012,7 +3004,7 @@ out:
 }
 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
+				struct btrfs_fs_info *fs_info,
 				u64 bytenr, u64 num_bytes, u64 flags,
 				int level, int is_data)
 {
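The to_async_delayed_refs() helper introduced earlier in this hunk is the usual container_of idiom: recover the outer object from a pointer to a member embedded in it. A standalone sketch of the same pattern (the struct names here are invented for the example):

    #include <stdio.h>
    #include <stddef.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct work { int pending; };               /* stand-in for btrfs_work */
    struct async_refs { long count; struct work work; };

    /* Same shape as to_async_delayed_refs(): the work queue hands us only
     * the embedded work item, and we step back to the enclosing object. */
    static struct async_refs *to_async_refs(struct work *w)
    {
        return container_of(w, struct async_refs, work);
    }

    int main(void)
    {
        struct async_refs a = { .count = 42 };
        printf("%ld\n", to_async_refs(&a.work)->count);
        return 0;
    }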
@@ -3029,7 +3021,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
 	extent_op->is_data = is_data ? true : false;
 	extent_op->level = level;
-	ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
+	ret = btrfs_add_delayed_extent_op(fs_info, trans, bytenr,
 					  num_bytes, extent_op);
 	if (ret)
 		btrfs_free_delayed_extent_op(extent_op);
@@ -3103,7 +3095,8 @@ static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
 					struct btrfs_path *path,
 					u64 objectid, u64 offset, u64 bytenr)
 {
-	struct btrfs_root *extent_root = root->fs_info->extent_root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_root *extent_root = fs_info->extent_root;
 	struct extent_buffer *leaf;
 	struct btrfs_extent_data_ref *ref;
 	struct btrfs_extent_inline_ref *iref;
@@ -3210,6 +3203,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 			   struct extent_buffer *buf,
 			   int full_backref, int inc)
 {
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	u64 bytenr;
 	u64 num_bytes;
 	u64 parent;
@@ -3220,11 +3214,12 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 	int i;
 	int level;
 	int ret = 0;
-	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
+	int (*process_func)(struct btrfs_trans_handle *,
+			    struct btrfs_fs_info *,
 			    u64, u64, u64, u64, u64, u64);
-	if (btrfs_is_testing(root->fs_info))
+	if (btrfs_is_testing(fs_info))
 		return 0;
 	ref_root = btrfs_header_owner(buf);
@@ -3260,15 +3255,15 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
 			key.offset -= btrfs_file_extent_offset(buf, fi);
-			ret = process_func(trans, root, bytenr, num_bytes,
+			ret = process_func(trans, fs_info, bytenr, num_bytes,
 					   parent, ref_root, key.objectid,
 					   key.offset);
 			if (ret)
 				goto fail;
 		} else {
 			bytenr = btrfs_node_blockptr(buf, i);
-			num_bytes = root->nodesize;
-			ret = process_func(trans, root, bytenr, num_bytes,
+			num_bytes = fs_info->nodesize;
+			ret = process_func(trans, fs_info, bytenr, num_bytes,
 					   parent, ref_root, level - 1, 0);
 			if (ret)
 				goto fail;
@@ -3292,12 +3287,12 @@ int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 }
 static int write_one_cache_group(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *root,
+				 struct btrfs_fs_info *fs_info,
 				 struct btrfs_path *path,
 				 struct btrfs_block_group_cache *cache)
 {
 	int ret;
-	struct btrfs_root *extent_root = root->fs_info->extent_root;
+	struct btrfs_root *extent_root = fs_info->extent_root;
 	unsigned long bi;
 	struct extent_buffer *leaf;
@@ -3319,22 +3314,20 @@ fail:
 }
 static struct btrfs_block_group_cache *
-next_block_group(struct btrfs_root *root,
+next_block_group(struct btrfs_fs_info *fs_info,
 		 struct btrfs_block_group_cache *cache)
 {
 	struct rb_node *node;
-	spin_lock(&root->fs_info->block_group_cache_lock);
+	spin_lock(&fs_info->block_group_cache_lock);
 	/* If our block group was removed, we need a full search. */
 	if (RB_EMPTY_NODE(&cache->cache_node)) {
 		const u64 next_bytenr = cache->key.objectid + cache->key.offset;
-		spin_unlock(&root->fs_info->block_group_cache_lock);
+		spin_unlock(&fs_info->block_group_cache_lock);
 		btrfs_put_block_group(cache);
-		cache = btrfs_lookup_first_block_group(root->fs_info,
-						       next_bytenr);
-		return cache;
+		cache = btrfs_lookup_first_block_group(fs_info, next_bytenr);
+		return cache;
 	}
 	node = rb_next(&cache->cache_node);
 	btrfs_put_block_group(cache);
@@ -3344,7 +3337,7 @@ next_block_group(struct btrfs_root *root,
 		btrfs_get_block_group(cache);
 	} else
 		cache = NULL;
-	spin_unlock(&root->fs_info->block_group_cache_lock);
+	spin_unlock(&fs_info->block_group_cache_lock);
 	return cache;
 }
@@ -3352,7 +3345,8 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
 			    struct btrfs_trans_handle *trans,
 			    struct btrfs_path *path)
 {
-	struct btrfs_root *root = block_group->fs_info->tree_root;
+	struct btrfs_fs_info *fs_info = block_group->fs_info;
+	struct btrfs_root *root = fs_info->tree_root;
 	struct inode *inode = NULL;
 	u64 alloc_hint = 0;
 	int dcs = BTRFS_DC_ERROR;
@@ -3425,8 +3419,8 @@ again:
 	WARN_ON(ret);
 	if (i_size_read(inode) > 0) {
-		ret = btrfs_check_trunc_cache_free_space(root,
-					&root->fs_info->global_block_rsv);
+		ret = btrfs_check_trunc_cache_free_space(fs_info,
+					&fs_info->global_block_rsv);
 		if (ret)
 			goto out_put;
@@ -3437,7 +3431,7 @@ again:
 	spin_lock(&block_group->lock);
 	if (block_group->cached != BTRFS_CACHE_FINISHED ||
-	    !btrfs_test_opt(root->fs_info, SPACE_CACHE)) {
+	    !btrfs_test_opt(fs_info, SPACE_CACHE)) {
 		/*
 		 * don't bother trying to write stuff out _if_
 		 * a) we're not cached,
@@ -3506,14 +3500,14 @@ out:
 }
 int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root)
+			    struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_block_group_cache *cache, *tmp;
 	struct btrfs_transaction *cur_trans = trans->transaction;
 	struct btrfs_path *path;
 	if (list_empty(&cur_trans->dirty_bgs) ||
-	    !btrfs_test_opt(root->fs_info, SPACE_CACHE))
+	    !btrfs_test_opt(fs_info, SPACE_CACHE))
 		return 0;
 	path = btrfs_alloc_path();
@@ -3544,7 +3538,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
  * we're still allowing others to join the commit.
 */
 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
-				   struct btrfs_root *root)
+				   struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_transaction *cur_trans = trans->transaction;
@@ -3569,7 +3563,7 @@ again:
 	 * make sure all the block groups on our dirty list actually
 	 * exist
 	 */
-	btrfs_create_pending_block_groups(trans, root);
+	btrfs_create_pending_block_groups(trans, fs_info);
 	if (!path) {
 		path = btrfs_alloc_path();
@@ -3594,9 +3588,7 @@ again:
 		 */
 		if (!list_empty(&cache->io_list)) {
 			list_del_init(&cache->io_list);
-			btrfs_wait_cache_io(root, trans, cache,
-					    &cache->io_ctl, path,
-					    cache->key.objectid);
+			btrfs_wait_cache_io(trans, cache, path);
 			btrfs_put_block_group(cache);
 		}
@@ -3619,7 +3611,8 @@ again:
 		if (cache->disk_cache_state == BTRFS_DC_SETUP) {
 			cache->io_ctl.inode = NULL;
-			ret = btrfs_write_out_cache(root, trans, cache, path);
+			ret = btrfs_write_out_cache(fs_info, trans,
+						    cache, path);
 			if (ret == 0 && cache->io_ctl.inode) {
 				num_started++;
 				should_put = 0;
@@ -3638,7 +3631,8 @@ again:
 			}
 		}
 		if (!ret) {
-			ret = write_one_cache_group(trans, root, path, cache);
+			ret = write_one_cache_group(trans, fs_info,
+						    path, cache);
 			/*
 			 * Our block group might still be attached to the list
 			 * of new block groups in the transaction handle of some
@@ -3683,7 +3677,7 @@ again:
 	 * go through delayed refs for all the stuff we've just kicked off
	 * and then loop back (just once)
	 */
-	ret = btrfs_run_delayed_refs(trans, root, 0);
+	ret = btrfs_run_delayed_refs(trans, fs_info, 0);
 	if (!ret && loops == 0) {
 		loops++;
 		spin_lock(&cur_trans->dirty_bgs_lock);
@@ -3698,7 +3692,7 @@ again:
 		}
 		spin_unlock(&cur_trans->dirty_bgs_lock);
 	} else if (ret < 0) {
-		btrfs_cleanup_dirty_bgs(cur_trans, root);
+		btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
 	}
 	btrfs_free_path(path);
@@ -3706,7 +3700,7 @@ again:
 }
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
-				   struct btrfs_root *root)
+				   struct btrfs_fs_info *fs_info)
 {
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_transaction *cur_trans = trans->transaction;
@@ -3749,9 +3743,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		if (!list_empty(&cache->io_list)) {
 			spin_unlock(&cur_trans->dirty_bgs_lock);
 			list_del_init(&cache->io_list);
-			btrfs_wait_cache_io(root, trans, cache,
-					    &cache->io_ctl, path,
-					    cache->key.objectid);
+			btrfs_wait_cache_io(trans, cache, path);
 			btrfs_put_block_group(cache);
 			spin_lock(&cur_trans->dirty_bgs_lock);
 		}
@@ -3767,11 +3759,13 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		cache_save_setup(cache, trans, path);
 		if (!ret)
-			ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1);
+			ret = btrfs_run_delayed_refs(trans, fs_info,
+						     (unsigned long) -1);
 		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
 			cache->io_ctl.inode = NULL;
-			ret = btrfs_write_out_cache(root, trans, cache, path);
+			ret = btrfs_write_out_cache(fs_info, trans,
						    cache, path);
 			if (ret == 0 && cache->io_ctl.inode) {
 				num_started++;
 				should_put = 0;
@@ -3785,7 +3779,8 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 			}
 		}
 		if (!ret) {
-			ret = write_one_cache_group(trans, root, path, cache);
+			ret = write_one_cache_group(trans, fs_info,
+						    path, cache);
 			/*
 			 * One of the free space endio workers might have
 			 * created a new block group while updating a free space
@@ -3802,8 +3797,8 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 			if (ret == -ENOENT) {
 				wait_event(cur_trans->writer_wait,
 				   atomic_read(&cur_trans->num_writers) == 1);
-				ret = write_one_cache_group(trans, root, path,
-							    cache);
+				ret = write_one_cache_group(trans, fs_info,
+							    path, cache);
 			}
 			if (ret)
 				btrfs_abort_transaction(trans, ret);
@@ -3820,8 +3815,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		cache = list_first_entry(io, struct btrfs_block_group_cache,
 					 io_list);
 		list_del_init(&cache->io_list);
-		btrfs_wait_cache_io(root, trans, cache,
-				    &cache->io_ctl, path, cache->key.objectid);
+		btrfs_wait_cache_io(trans, cache, path);
 		btrfs_put_block_group(cache);
 	}
@@ -3829,12 +3823,12 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 	return ret;
 }
-int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
+int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
 {
 	struct btrfs_block_group_cache *block_group;
 	int readonly = 0;
-	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
+	block_group = btrfs_lookup_block_group(fs_info, bytenr);
 	if (!block_group || block_group->ro)
 		readonly = 1;
 	if (block_group)
@@ -4043,9 +4037,9 @@ static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
  * progress (either running or paused) picks the target profile (if it's
 * already available), otherwise falls back to plain reducing.
 */
-static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
+static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
 {
-	u64 num_devices = root->fs_info->fs_devices->rw_devices;
+	u64 num_devices = fs_info->fs_devices->rw_devices;
 	u64 target;
 	u64 raid_type;
 	u64 allowed = 0;
@@ -4054,16 +4048,16 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 	 * see if restripe for this chunk_type is in progress, if so
 	 * try to reduce to the target profile
 	 */
-	spin_lock(&root->fs_info->balance_lock);
-	target = get_restripe_target(root->fs_info, flags);
+	spin_lock(&fs_info->balance_lock);
+	target = get_restripe_target(fs_info, flags);
 	if (target) {
 		/* pick target profile only if it's already available */
 		if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
-			spin_unlock(&root->fs_info->balance_lock);
+			spin_unlock(&fs_info->balance_lock);
 			return extended_to_chunk(target);
 		}
 	}
-	spin_unlock(&root->fs_info->balance_lock);
+	spin_unlock(&fs_info->balance_lock);
 	/* First, mask out the RAID levels which aren't possible */
 	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
@@ -4088,39 +4082,40 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 	return extended_to_chunk(flags | allowed);
 }
-static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
+static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
 {
 	unsigned seq;
 	u64 flags;
 	do {
 		flags = orig_flags;
-		seq = read_seqbegin(&root->fs_info->profiles_lock);
+		seq = read_seqbegin(&fs_info->profiles_lock);
 		if (flags & BTRFS_BLOCK_GROUP_DATA)
-			flags |= root->fs_info->avail_data_alloc_bits;
+			flags |= fs_info->avail_data_alloc_bits;
 		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-			flags |= root->fs_info->avail_system_alloc_bits;
+			flags |= fs_info->avail_system_alloc_bits;
 		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
-			flags |= root->fs_info->avail_metadata_alloc_bits;
-	} while (read_seqretry(&root->fs_info->profiles_lock, seq));
+			flags |= fs_info->avail_metadata_alloc_bits;
+	} while (read_seqretry(&fs_info->profiles_lock, seq));
-	return btrfs_reduce_alloc_profile(root, flags);
+	return btrfs_reduce_alloc_profile(fs_info, flags);
 }
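get_alloc_profile() above samples the avail_*_alloc_bits inside a seqlock read section and restarts if a writer raced in; note how flags is reset from orig_flags at the top of each retry. The same control flow, sketched in userspace with a plain sequence counter (single-threaded here, so the retry never fires; the helpers mimic, not reproduce, the kernel API):

    #include <stdio.h>

    static unsigned seq;                       /* even = stable, odd = writer */
    static unsigned long long avail_bits = 0x7;

    static unsigned read_seqbegin(void) { return seq; }
    static int read_seqretry(unsigned start)
    {
        return seq != start || (start & 1);    /* changed, or writer active */
    }

    int main(void)
    {
        unsigned long long flags, orig_flags = 0x100;
        unsigned start;

        do {
            flags = orig_flags;                /* restart from pristine input */
            start = read_seqbegin();
            flags |= avail_bits;               /* the racy read happens here */
        } while (read_seqretry(start));

        printf("flags=%#llx\n", flags);
        return 0;
    }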
 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
 {
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	u64 flags;
 	u64 ret;
 	if (data)
 		flags = BTRFS_BLOCK_GROUP_DATA;
-	else if (root == root->fs_info->chunk_root)
+	else if (root == fs_info->chunk_root)
 		flags = BTRFS_BLOCK_GROUP_SYSTEM;
 	else
 		flags = BTRFS_BLOCK_GROUP_METADATA;
-	ret = get_alloc_profile(root, flags);
+	ret = get_alloc_profile(fs_info, flags);
 	return ret;
 }
@@ -4135,7 +4130,7 @@ int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
 	int have_pinned_space;
 	/* make sure bytes are sectorsize aligned */
-	bytes = ALIGN(bytes, root->sectorsize);
+	bytes = ALIGN(bytes, fs_info->sectorsize);
 	if (btrfs_is_free_space_inode(inode)) {
 		need_commit = 0;
@@ -4181,10 +4176,9 @@ alloc:
 			if (IS_ERR(trans))
 				return PTR_ERR(trans);
-			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-					     alloc_target,
+			ret = do_chunk_alloc(trans, fs_info, alloc_target,
 					     CHUNK_ALLOC_NO_FORCE);
-			btrfs_end_transaction(trans, root);
+			btrfs_end_transaction(trans);
 			if (ret < 0) {
 				if (ret != -ENOSPC)
 					return ret;
@@ -4213,12 +4207,13 @@ alloc:
 	/* commit the current transaction and try again */
 commit_trans:
 	if (need_commit &&
-	    !atomic_read(&root->fs_info->open_ioctl_trans)) {
+	    !atomic_read(&fs_info->open_ioctl_trans)) {
 		need_commit--;
 		if (need_commit > 0) {
 			btrfs_start_delalloc_roots(fs_info, 0, -1);
-			btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
+			btrfs_wait_ordered_roots(fs_info, -1, 0,
+						 (u64)-1);
 		}
 		trans = btrfs_join_transaction(root);
@@ -4228,7 +4223,7 @@ commit_trans:
 		    test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
 			     &trans->transaction->flags) ||
 		    need_commit > 0) {
-			ret = btrfs_commit_transaction(trans, root);
+			ret = btrfs_commit_transaction(trans);
 			if (ret)
 				return ret;
 			/*
@@ -4236,21 +4231,21 @@ commit_trans:
 			 * operations. Wait for it to finish so that
 			 * more space is released.
 			 */
-			mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex);
-			mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex);
+			mutex_lock(&fs_info->cleaner_delayed_iput_mutex);
+			mutex_unlock(&fs_info->cleaner_delayed_iput_mutex);
 			goto again;
 		} else {
-			btrfs_end_transaction(trans, root);
+			btrfs_end_transaction(trans);
 		}
 	}
-	trace_btrfs_space_reservation(root->fs_info,
+	trace_btrfs_space_reservation(fs_info,
 				      "space_info:enospc",
 				      data_sinfo->flags, bytes, 1);
 	return -ENOSPC;
 }
 	data_sinfo->bytes_may_use += bytes;
-	trace_btrfs_space_reservation(root->fs_info, "space_info",
+	trace_btrfs_space_reservation(fs_info, "space_info",
 				      data_sinfo->flags, bytes, 1);
 	spin_unlock(&data_sinfo->lock);
@@ -4264,13 +4259,13 @@ commit_trans:
  */
 int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	int ret;
 	/* align the range */
-	len = round_up(start + len, root->sectorsize) -
-	      round_down(start, root->sectorsize);
-	start = round_down(start, root->sectorsize);
+	len = round_up(start + len, fs_info->sectorsize) -
+	      round_down(start, fs_info->sectorsize);
+	start = round_down(start, fs_info->sectorsize);
 	ret = btrfs_alloc_data_chunk_ondemand(inode, len);
 	if (ret < 0)
@@ -4294,21 +4289,21 @@ int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
 void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
 					    u64 len)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_space_info *data_sinfo;
 	/* Make sure the range is aligned to sectorsize */
-	len = round_up(start + len, root->sectorsize) -
-	      round_down(start, root->sectorsize);
-	start = round_down(start, root->sectorsize);
+	len = round_up(start + len, fs_info->sectorsize) -
+	      round_down(start, fs_info->sectorsize);
+	start = round_down(start, fs_info->sectorsize);
-	data_sinfo = root->fs_info->data_sinfo;
+	data_sinfo = fs_info->data_sinfo;
 	spin_lock(&data_sinfo->lock);
 	if (WARN_ON(data_sinfo->bytes_may_use < len))
 		data_sinfo->bytes_may_use = 0;
 	else
 		data_sinfo->bytes_may_use -= len;
-	trace_btrfs_space_reservation(root->fs_info, "space_info",
+	trace_btrfs_space_reservation(fs_info, "space_info",
 				      data_sinfo->flags, len, 0);
 	spin_unlock(&data_sinfo->lock);
 }
@@ -4322,6 +4317,13 @@ void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
  */
 void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
 {
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+
+	/* Make sure the range is aligned to sectorsize */
+	len = round_up(start + len, root->fs_info->sectorsize) -
+	      round_down(start, root->fs_info->sectorsize);
+	start = round_down(start, root->fs_info->sectorsize);
+
 	btrfs_free_reserved_data_space_noquota(inode, start, len);
 	btrfs_qgroup_free_data(inode, start, len);
 }
@@ -4344,10 +4346,10 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
 	return (global->size << 1);
 }
-static int should_alloc_chunk(struct btrfs_root *root,
+static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
 			      struct btrfs_space_info *sinfo, int force)
 {
-	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
+	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
 	u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
 	u64 thresh;
@@ -4368,7 +4370,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
 	 * about 1% of the FS size.
 	 */
 	if (force == CHUNK_ALLOC_LIMITED) {
-		thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
+		thresh = btrfs_super_total_bytes(fs_info->super_copy);
 		thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
 		if (num_bytes - num_allocated < thresh)
@@ -4380,7 +4382,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
 	return 1;
 }
-static u64 get_profile_num_devs(struct btrfs_root *root, u64 type)
+static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
 {
 	u64 num_dev;
@@ -4388,7 +4390,7 @@ static u64 get_profile_num_devs(struct btrfs_root *root, u64 type)
 		    BTRFS_BLOCK_GROUP_RAID0 |
 		    BTRFS_BLOCK_GROUP_RAID5 |
 		    BTRFS_BLOCK_GROUP_RAID6))
-		num_dev = root->fs_info->fs_devices->rw_devices;
+		num_dev = fs_info->fs_devices->rw_devices;
 	else if (type & BTRFS_BLOCK_GROUP_RAID1)
 		num_dev = 2;
 	else
@@ -4403,8 +4405,7 @@ static u64 get_profile_num_devs(struct btrfs_root *root, u64 type)
  * removing a chunk.
 */
 void check_system_chunk(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root,
-			u64 type)
+			struct btrfs_fs_info *fs_info, u64 type)
 {
 	struct btrfs_space_info *info;
 	u64 left;
@@ -4416,43 +4417,43 @@ void check_system_chunk(struct btrfs_trans_handle *trans,
 	 * Needed because we can end up allocating a system chunk and for an
 	 * atomic and race free space reservation in the chunk block reserve.
 	 */
-	ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex));
+	ASSERT(mutex_is_locked(&fs_info->chunk_mutex));
-	info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+	info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
 	spin_lock(&info->lock);
 	left = info->total_bytes - info->bytes_used - info->bytes_pinned -
 		info->bytes_reserved - info->bytes_readonly -
 		info->bytes_may_use;
 	spin_unlock(&info->lock);
-	num_devs = get_profile_num_devs(root, type);
+	num_devs = get_profile_num_devs(fs_info, type);
 	/* num_devs device items to update and 1 chunk item to add or remove */
-	thresh = btrfs_calc_trunc_metadata_size(root, num_devs) +
-		btrfs_calc_trans_metadata_size(root, 1);
+	thresh = btrfs_calc_trunc_metadata_size(fs_info, num_devs) +
+		btrfs_calc_trans_metadata_size(fs_info, 1);
-	if (left < thresh && btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
-		btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
-			left, thresh, type);
-		dump_space_info(root->fs_info, info, 0, 0);
+	if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
+		btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
+			   left, thresh, type);
+		dump_space_info(fs_info, info, 0, 0);
 	}
 	if (left < thresh) {
 		u64 flags;
-		flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
+		flags = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
 		/*
 		 * Ignore failure to create system chunk. We might end up not
 		 * needing it, as we might not need to COW all nodes/leafs from
 		 * the paths we visit in the chunk tree (they were already COWed
 		 * or created in the current transaction for example).
 		 */
-		ret = btrfs_alloc_chunk(trans, root, flags);
+		ret = btrfs_alloc_chunk(trans, fs_info, flags);
 	}
 	if (!ret) {
-		ret = btrfs_block_rsv_add(root->fs_info->chunk_root,
-					  &root->fs_info->chunk_block_rsv,
+		ret = btrfs_block_rsv_add(fs_info->chunk_root,
					  &fs_info->chunk_block_rsv,
 					  thresh, BTRFS_RESERVE_NO_FLUSH);
 		if (!ret)
 			trans->chunk_bytes_reserved += thresh;
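check_system_chunk() above sizes its reservation as the metadata cost of updating num_devs device items plus one chunk item, and compares it with what is left in the SYSTEM space_info. A back-of-the-envelope version (the per-item cost model and every constant below are illustrative assumptions; the kernel computes the real costs via btrfs_calc_trans_metadata_size() and btrfs_calc_trunc_metadata_size()):

    #include <stdio.h>
    #include <inttypes.h>

    int main(void)
    {
        /* Illustrative: 16 KiB nodes, 8-level worst-case trees, RAID1. */
        const uint64_t nodesize = 16384, max_level = 8, num_devs = 2;

        /* Worst-case CoW cost of touching one item: one node per level,
         * doubled to account for splits along the way. */
        uint64_t per_item = 2 * max_level * nodesize;
        uint64_t thresh = num_devs * per_item   /* device item updates */
                        + per_item;             /* one chunk item add/remove */

        uint64_t left = 4 * 1024 * 1024;        /* bytes free in SYSTEM chunk */
        printf("thresh=%" PRIu64 ", %s\n", thresh,
               left < thresh ? "allocate a new system chunk" : "enough space");
        return 0;
    }

When the estimate does not fit, a new system chunk is allocated speculatively and any failure is deliberately ignored, as the comment in the hunk explains.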
@@ -4469,10 +4470,9 @@ void check_system_chunk(struct btrfs_trans_handle *trans,
  * - return errors including -ENOSPC otherwise.
  */
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *extent_root, u64 flags, int force)
+			  struct btrfs_fs_info *fs_info, u64 flags, int force)
 {
 	struct btrfs_space_info *space_info;
-	struct btrfs_fs_info *fs_info = extent_root->fs_info;
 	int wait_for_alloc = 0;
 	int ret = 0;
@@ -4480,10 +4480,9 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 	if (trans->allocating_chunk)
 		return -ENOSPC;
-	space_info = __find_space_info(extent_root->fs_info, flags);
+	space_info = __find_space_info(fs_info, flags);
 	if (!space_info) {
-		ret = update_space_info(extent_root->fs_info, flags,
-					0, 0, 0, &space_info);
+		ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
 		BUG_ON(ret); /* -ENOMEM */
 	}
 	BUG_ON(!space_info); /* Logic error */
@@ -4493,7 +4492,7 @@ again:
 	if (force < space_info->force_alloc)
 		force = space_info->force_alloc;
 	if (space_info->full) {
-		if (should_alloc_chunk(extent_root, space_info, force))
+		if (should_alloc_chunk(fs_info, space_info, force))
 			ret = -ENOSPC;
 		else
 			ret = 0;
@@ -4501,7 +4500,7 @@ again:
 		return ret;
 	}
-	if (!should_alloc_chunk(extent_root, space_info, force)) {
+	if (!should_alloc_chunk(fs_info, space_info, force)) {
 		spin_unlock(&space_info->lock);
 		return 0;
 	} else if (space_info->chunk_alloc) {
@@ -4551,9 +4550,9 @@ again:
 	 * Check if we have enough space in SYSTEM chunk because we may need
 	 * to update devices.
 	 */
-	check_system_chunk(trans, extent_root, flags);
+	check_system_chunk(trans, fs_info, flags);
-	ret = btrfs_alloc_chunk(trans, extent_root, flags);
+	ret = btrfs_alloc_chunk(trans, fs_info, flags);
 	trans->allocating_chunk = false;
 	spin_lock(&space_info->lock);
@@ -4585,7 +4584,7 @@ out:
 	 */
 	if (trans->can_flush_pending_bgs &&
 	    trans->chunk_bytes_reserved >= (u64)SZ_2M) {
-		btrfs_create_pending_block_groups(trans, extent_root);
+		btrfs_create_pending_block_groups(trans, fs_info);
 		btrfs_trans_release_chunk_metadata(trans);
 	}
 	return ret;
@@ -4595,7 +4594,8 @@ static int can_overcommit(struct btrfs_root *root,
 			  struct btrfs_space_info *space_info, u64 bytes,
 			  enum btrfs_reserve_flush_enum flush)
 {
-	struct btrfs_block_rsv *global_rsv;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 	u64 profile;
 	u64 space_size;
 	u64 avail;
@@ -4605,8 +4605,6 @@ static int can_overcommit(struct btrfs_root *root,
 	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
 		return 0;
-	BUG_ON(root->fs_info == NULL);
-	global_rsv = &root->fs_info->global_block_rsv;
 	profile = btrfs_get_alloc_profile(root, 0);
 	used = space_info->bytes_used + space_info->bytes_reserved +
 		space_info->bytes_pinned + space_info->bytes_readonly;
@@ -4625,9 +4623,9 @@ static int can_overcommit(struct btrfs_root *root,
 	used += space_info->bytes_may_use;
-	spin_lock(&root->fs_info->free_chunk_lock);
-	avail = root->fs_info->free_chunk_space;
-	spin_unlock(&root->fs_info->free_chunk_lock);
+	spin_lock(&fs_info->free_chunk_lock);
+	avail = fs_info->free_chunk_space;
+	spin_unlock(&fs_info->free_chunk_lock);
 	/*
 	 * If we have dup, raid1 or raid10 then only half of the free
@@ -4655,10 +4653,10 @@ static int can_overcommit(struct btrfs_root *root,
 	return 0;
 }
-static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
+static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
 					 unsigned long nr_pages, int nr_items)
 {
-	struct super_block *sb = root->fs_info->sb;
+	struct super_block *sb = fs_info->sb;
 	if (down_read_trylock(&sb->s_umount)) {
 		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
@@ -4671,19 +4669,19 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
 		 * the filesystem is readonly(all dirty pages are written to
		 * the disk).
		 */
-		btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
+		btrfs_start_delalloc_roots(fs_info, 0, nr_items);
 		if (!current->journal_info)
-			btrfs_wait_ordered_roots(root->fs_info, nr_items,
-						 0, (u64)-1);
+			btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
 	}
 }
-static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
+static inline int calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
+					u64 to_reclaim)
 {
 	u64 bytes;
 	int nr;
-	bytes = btrfs_calc_trans_metadata_size(root, 1);
+	bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
 	nr = (int)div64_u64(to_reclaim, bytes);
 	if (!nr)
 		nr = 1;
@@ -4698,6 +4696,7 @@ static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 			    bool wait_ordered)
 {
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_block_rsv *block_rsv;
 	struct btrfs_space_info *space_info;
 	struct btrfs_trans_handle *trans;
@@ -4710,21 +4709,20 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 	enum btrfs_reserve_flush_enum flush;
 	/* Calc the number of the pages we need flush for space reservation */
-	items = calc_reclaim_items_nr(root, to_reclaim);
+	items = calc_reclaim_items_nr(fs_info, to_reclaim);
 	to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM;
 	trans = (struct btrfs_trans_handle *)current->journal_info;
-	block_rsv = &root->fs_info->delalloc_block_rsv;
+	block_rsv = &fs_info->delalloc_block_rsv;
 	space_info = block_rsv->space_info;
 	delalloc_bytes = percpu_counter_sum_positive(
-						&root->fs_info->delalloc_bytes);
+						&fs_info->delalloc_bytes);
 	if (delalloc_bytes == 0) {
 		if (trans)
 			return;
 		if (wait_ordered)
-			btrfs_wait_ordered_roots(root->fs_info, items,
-						 0, (u64)-1);
+			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
 		return;
 	}
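calc_reclaim_items_nr() above turns a byte target into an item count by dividing by the worst-case per-item reservation, never returning less than one; shrink_delalloc() then multiplies the item count by EXTENT_SIZE_PER_ITEM to decide how much delalloc to flush. A sketch with made-up numbers (EXTENT_SIZE_PER_ITEM's value is not shown in this hunk, so 256 KiB below is only a placeholder):

    #include <stdio.h>
    #include <inttypes.h>

    #define EXTENT_SIZE_PER_ITEM (256 * 1024)  /* placeholder value */

    int main(void)
    {
        const uint64_t per_item = 2 * 8 * 16384; /* worst-case bytes per item */
        uint64_t to_reclaim = 3 * 1024 * 1024;   /* want 3 MiB of metadata back */

        int nr = (int)(to_reclaim / per_item);
        if (!nr)
            nr = 1;                              /* always flush at least one */
        printf("items=%d, delalloc to flush=%" PRIu64 " bytes\n",
               nr, (uint64_t)nr * EXTENT_SIZE_PER_ITEM);
        return 0;
    }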
@@ -4732,12 +4730,12 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 	while (delalloc_bytes && loops < 3) {
 		max_reclaim = min(delalloc_bytes, to_reclaim);
 		nr_pages = max_reclaim >> PAGE_SHIFT;
-		btrfs_writeback_inodes_sb_nr(root, nr_pages, items);
+		btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
 		/*
 		 * We need to wait for the async pages to actually start before
 		 * we do anything.
 		 */
-		max_reclaim = atomic_read(&root->fs_info->async_delalloc_pages);
+		max_reclaim = atomic_read(&fs_info->async_delalloc_pages);
 		if (!max_reclaim)
 			goto skip_async;
@@ -4746,8 +4744,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
 		else
 			max_reclaim -= nr_pages;
-		wait_event(root->fs_info->async_submit_wait,
-			   atomic_read(&root->fs_info->async_delalloc_pages) <=
+		wait_event(fs_info->async_submit_wait,
+			   atomic_read(&fs_info->async_delalloc_pages) <=
 			   (int)max_reclaim);
skip_async:
 		if (!trans)
@@ -4768,15 +4766,14 @@ skip_async:
 		loops++;
 		if (wait_ordered && !trans) {
-			btrfs_wait_ordered_roots(root->fs_info, items,
-						 0, (u64)-1);
+			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
 		} else {
 			time_left = schedule_timeout_killable(1);
 			if (time_left)
 				break;
 		}
 		delalloc_bytes = percpu_counter_sum_positive(
-						&root->fs_info->delalloc_bytes);
+						&fs_info->delalloc_bytes);
 	}
 }
@@ -4794,7 +4791,8 @@ static int may_commit_transaction(struct btrfs_root *root,
 				  struct btrfs_space_info *space_info,
 				  u64 bytes, int force)
 {
-	struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
 	struct btrfs_trans_handle *trans;
 	trans = (struct btrfs_trans_handle *)current->journal_info;
@@ -4829,7 +4827,7 @@ commit:
 	if (IS_ERR(trans))
 		return -ENOSPC;
-	return btrfs_commit_transaction(trans, root);
+	return btrfs_commit_transaction(trans);
 }
 struct reserve_ticket {
@@ -4843,6 +4841,7 @@ static int flush_space(struct btrfs_root *root,
 		       struct btrfs_space_info *space_info, u64 num_bytes,
 		       u64 orig_bytes, int state)
 {
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_trans_handle *trans;
 	int nr;
 	int ret = 0;
@@ -4851,7 +4850,7 @@ static int flush_space(struct btrfs_root *root,
 	case FLUSH_DELAYED_ITEMS_NR:
 	case FLUSH_DELAYED_ITEMS:
 		if (state == FLUSH_DELAYED_ITEMS_NR)
-			nr = calc_reclaim_items_nr(root, num_bytes) * 2;
+			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
 		else
 			nr = -1;
@@ -4860,8 +4859,8 @@ static int flush_space(struct btrfs_root *root,
 			ret = PTR_ERR(trans);
 			break;
 		}
-		ret = btrfs_run_delayed_items_nr(trans, root, nr);
-		btrfs_end_transaction(trans, root);
+		ret = btrfs_run_delayed_items_nr(trans, fs_info, nr);
+		btrfs_end_transaction(trans);
 		break;
 	case FLUSH_DELALLOC:
 	case FLUSH_DELALLOC_WAIT:
@@ -4874,10 +4873,10 @@ static int flush_space(struct btrfs_root *root,
 			ret = PTR_ERR(trans);
 			break;
 		}
-		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+		ret = do_chunk_alloc(trans, fs_info,
 				     btrfs_get_alloc_profile(root, 0),
 				     CHUNK_ALLOC_NO_FORCE);
-		btrfs_end_transaction(trans, root);
+		btrfs_end_transaction(trans);
 		if (ret > 0 || ret == -ENOSPC)
 			ret = 0;
 		break;
@@ -4889,7 +4888,7 @@ static int flush_space(struct btrfs_root *root,
 		break;
 	}
-	trace_btrfs_flush_space(root->fs_info, space_info->flags, num_bytes,
+	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes,
 				orig_bytes, state, ret);
 	return ret;
 }
@@ -4935,6 +4934,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
 static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
 					struct btrfs_root *root, u64 used)
 {
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	u64 thresh = div_factor_fine(space_info->total_bytes, 98);
 	/* If we're just plain full then async reclaim just slows us down. */
@@ -4944,9 +4944,8 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
 	if (!btrfs_calc_reclaim_metadata_size(root, space_info))
 		return 0;
-	return (used >= thresh && !btrfs_fs_closing(root->fs_info) &&
-		!test_bit(BTRFS_FS_STATE_REMOUNTING,
-			  &root->fs_info->fs_state));
+	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
+		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
 }
 static void wake_all_tickets(struct list_head *head)
@@ -5126,6 +5125,7 @@ static int __reserve_metadata_bytes(struct btrfs_root *root,
 				    u64 orig_bytes,
 				    enum btrfs_reserve_flush_enum flush)
 {
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct reserve_ticket ticket;
 	u64 used;
 	int ret = 0;
@@ -5146,15 +5146,13 @@ static int __reserve_metadata_bytes(struct btrfs_root *root,
 	 */
 	if (used + orig_bytes <= space_info->total_bytes) {
 		space_info->bytes_may_use += orig_bytes;
-		trace_btrfs_space_reservation(root->fs_info, "space_info",
-					      space_info->flags, orig_bytes,
-					      1);
+		trace_btrfs_space_reservation(fs_info, "space_info",
					      space_info->flags, orig_bytes, 1);
 		ret = 0;
 	} else if (can_overcommit(root, space_info, orig_bytes, flush)) {
 		space_info->bytes_may_use += orig_bytes;
-		trace_btrfs_space_reservation(root->fs_info, "space_info",
-					      space_info->flags, orig_bytes,
-					      1);
+		trace_btrfs_space_reservation(fs_info, "space_info",
					      space_info->flags, orig_bytes, 1);
 		ret = 0;
 	}
@@ -5173,7 +5171,7 @@ static int __reserve_metadata_bytes(struct btrfs_root *root,
 		list_add_tail(&ticket.list, &space_info->tickets);
 		if (!space_info->flush) {
 			space_info->flush = 1;
-			trace_btrfs_trigger_flush(root->fs_info,
+			trace_btrfs_trigger_flush(fs_info,
 						  space_info->flags,
 						  orig_bytes, flush,
 						  "enospc");
@@ -5191,15 +5189,13 @@ static int __reserve_metadata_bytes(struct btrfs_root *root,
 		 * which means we won't have fs_info->fs_root set, so don't do
 		 * the async reclaim as we will panic.
 		 */
-		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags) &&
+		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
 		    need_do_async_reclaim(space_info, root, used) &&
-		    !work_busy(&root->fs_info->async_reclaim_work)) {
-			trace_btrfs_trigger_flush(root->fs_info,
-						  space_info->flags,
-						  orig_bytes, flush,
-						  "preempt");
+		    !work_busy(&fs_info->async_reclaim_work)) {
+			trace_btrfs_trigger_flush(fs_info, space_info->flags,
+						  orig_bytes, flush, "preempt");
 			queue_work(system_unbound_wq,
-				   &root->fs_info->async_reclaim_work);
+				   &fs_info->async_reclaim_work);
 		}
 	}
 	spin_unlock(&space_info->lock);
@@ -5207,19 +5203,19 @@ static int __reserve_metadata_bytes(struct btrfs_root *root,
 		return ret;
 	if (flush == BTRFS_RESERVE_FLUSH_ALL)
-		return wait_reserve_ticket(root->fs_info, space_info, &ticket,
+		return wait_reserve_ticket(fs_info, space_info, &ticket,
 					   orig_bytes);
 	ret = 0;
-	priority_reclaim_metadata_space(root->fs_info, space_info, &ticket);
+	priority_reclaim_metadata_space(fs_info, space_info, &ticket);
 	spin_lock(&space_info->lock);
 	if (ticket.bytes) {
 		if (ticket.bytes < orig_bytes) {
 			u64 num_bytes = orig_bytes - ticket.bytes;
 			space_info->bytes_may_use -= num_bytes;
-			trace_btrfs_space_reservation(root->fs_info,
-					"space_info", space_info->flags,
-					num_bytes, 0);
+			trace_btrfs_space_reservation(fs_info, "space_info",
						      space_info->flags,
						      num_bytes, 0);
 		}
 		list_del_init(&ticket.list);
@@ -5249,22 +5245,20 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
 				  u64 orig_bytes,
 				  enum btrfs_reserve_flush_enum flush)
 {
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 	int ret;
 	ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes,
 				       flush);
 	if (ret == -ENOSPC &&
 	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
-		struct btrfs_block_rsv *global_rsv =
-			&root->fs_info->global_block_rsv;
-
 		if (block_rsv != global_rsv &&
 		    !block_rsv_use_bytes(global_rsv, orig_bytes))
 			ret = 0;
 	}
 	if (ret == -ENOSPC)
-		trace_btrfs_space_reservation(root->fs_info,
-					      "space_info:enospc",
+		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
 					      block_rsv->space_info->flags,
 					      orig_bytes, 1);
 	return ret;
@@ -5274,18 +5268,19 @@ static struct btrfs_block_rsv *get_block_rsv(
 					const struct btrfs_trans_handle *trans,
 					const struct btrfs_root *root)
 {
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_block_rsv *block_rsv = NULL;
 	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
-	    (root == root->fs_info->csum_root && trans->adding_csums) ||
-	    (root == root->fs_info->uuid_root))
+	    (root == fs_info->csum_root && trans->adding_csums) ||
+	    (root == fs_info->uuid_root))
 		block_rsv = trans->block_rsv;
 	if (!block_rsv)
 		block_rsv = root->block_rsv;
 	if (!block_rsv)
-		block_rsv = &root->fs_info->empty_block_rsv;
+		block_rsv = &fs_info->empty_block_rsv;
 	return block_rsv;
 }
@@ -5507,11 +5502,10 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
 	rsv->type = type;
 }
-struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
 					      unsigned short type)
 {
 	struct btrfs_block_rsv *block_rsv;
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
 	if (!block_rsv)
@@ -5523,12 +5517,12 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
 	return block_rsv;
 }
-void btrfs_free_block_rsv(struct btrfs_root *root,
+void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
 			  struct btrfs_block_rsv *rsv)
 {
 	if (!rsv)
 		return;
-	btrfs_block_rsv_release(root, rsv, (u64)-1);
+	btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
 	kfree(rsv);
 }
@@ -5555,8 +5549,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
 	return ret;
 }
-int btrfs_block_rsv_check(struct btrfs_root *root,
-			  struct btrfs_block_rsv *block_rsv, int min_factor)
+int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
 {
 	u64 num_bytes = 0;
 	int ret = -ENOSPC;
@@ -5603,16 +5596,16 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
 	return ret;
 }
-void btrfs_block_rsv_release(struct btrfs_root *root,
+void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
 			     struct btrfs_block_rsv *block_rsv,
 			     u64 num_bytes)
 {
-	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
+	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+
 	if (global_rsv == block_rsv ||
 	    block_rsv->space_info != global_rsv->space_info)
 		global_rsv = NULL;
-	block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
-				num_bytes);
+	block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes);
 }
 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
@@ -5707,7 +5700,7 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
 }
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
-				  struct btrfs_root *root)
+				  struct btrfs_fs_info *fs_info)
 {
 	if (!trans->block_rsv)
 		return;
@@ -5715,9 +5708,10 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 	if (!trans->bytes_reserved)
 		return;
-	trace_btrfs_space_reservation(root->fs_info, "transaction",
+	trace_btrfs_space_reservation(fs_info, "transaction",
 				      trans->transid, trans->bytes_reserved, 0);
-	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
+	btrfs_block_rsv_release(fs_info, trans->block_rsv,
+				trans->bytes_reserved);
 	trans->bytes_reserved = 0;
 }
@@ -5743,6 +5737,7 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
 				  struct inode *inode)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	/*
	 * We always use trans->block_rsv here as we will have reserved space
@@ -5758,19 +5753,22 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
	 * added it, so this takes the reservation so we can release it later
	 * when we are truly done with the orphan item.
 	 */
-	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
-	trace_btrfs_space_reservation(root->fs_info, "orphan",
+	u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
+
+	trace_btrfs_space_reservation(fs_info, "orphan",
 				      btrfs_ino(inode), num_bytes, 1);
 	return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
 }
 void btrfs_orphan_release_metadata(struct inode *inode)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
-	trace_btrfs_space_reservation(root->fs_info, "orphan",
+	u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
+
+	trace_btrfs_space_reservation(fs_info, "orphan",
 				      btrfs_ino(inode), num_bytes, 0);
-	btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
+	btrfs_block_rsv_release(fs_info, root->orphan_block_rsv, num_bytes);
 }
 /*
@@ -5795,11 +5793,12 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 {
 	u64 num_bytes;
 	int ret;
-	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
-	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags)) {
+	if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
 		/* One for parent inode, two for dir entries */
-		num_bytes = 3 * root->nodesize;
+		num_bytes = 3 * fs_info->nodesize;
 		ret = btrfs_qgroup_reserve_meta(root, num_bytes);
 		if (ret)
 			return ret;
@@ -5809,8 +5808,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 	*qgroup_reserved = num_bytes;
-	num_bytes = btrfs_calc_trans_metadata_size(root, items);
-	rsv->space_info = __find_space_info(root->fs_info,
+	num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
+	rsv->space_info = __find_space_info(fs_info,
 					    BTRFS_BLOCK_GROUP_METADATA);
 	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
 				  BTRFS_RESERVE_FLUSH_ALL);
@@ -5824,11 +5823,11 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 	return ret;
 }
-void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
 				      struct btrfs_block_rsv *rsv,
 				      u64 qgroup_reserved)
 {
-	btrfs_block_rsv_release(root, rsv, (u64)-1);
+	btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
 }
 /**
@@ -5894,35 +5893,38 @@ static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
 				   int reserve)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	u64 old_csums, num_csums;
 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
 	    BTRFS_I(inode)->csum_bytes == 0)
 		return 0;
-	old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
+	old_csums = btrfs_csum_bytes_to_leaves(fs_info,
+					       BTRFS_I(inode)->csum_bytes);
 	if (reserve)
 		BTRFS_I(inode)->csum_bytes += num_bytes;
 	else
 		BTRFS_I(inode)->csum_bytes -= num_bytes;
-	num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes);
+	num_csums = btrfs_csum_bytes_to_leaves(fs_info,
					       BTRFS_I(inode)->csum_bytes);
 	/* No change, no need to reserve more */
 	if (old_csums == num_csums)
 		return 0;
 	if (reserve)
-		return btrfs_calc_trans_metadata_size(root,
+		return btrfs_calc_trans_metadata_size(fs_info,
 						      num_csums - old_csums);
-	return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
+	return btrfs_calc_trans_metadata_size(fs_info, old_csums - num_csums);
 }
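calc_csum_metadata_size() above charges only the delta: it converts the inode's csum_bytes total to a leaf count before and after the change and reserves (or frees) the difference in metadata cost. A worked example reusing the illustrative sizes from the earlier sketches (all constants are assumptions, not the real on-disk values):

    #include <stdio.h>
    #include <inttypes.h>

    static uint64_t csum_leaves(uint64_t csum_bytes)
    {
        const uint64_t per_leaf = 16283 / 4;    /* csums per leaf, illustrative */
        uint64_t num_csums = csum_bytes / 4096; /* one csum per 4 KiB sector */
        return (num_csums + per_leaf - 1) / per_leaf;
    }

    int main(void)
    {
        const uint64_t per_item = 2 * 8 * 16384; /* cost of one leaf, illustrative */
        uint64_t csum_bytes = 100 * 1024 * 1024; /* already accounted for */
        uint64_t add = 32 * 1024 * 1024;         /* new delalloc being reserved */

        uint64_t old_leaves = csum_leaves(csum_bytes);
        uint64_t new_leaves = csum_leaves(csum_bytes + add);
        if (new_leaves == old_leaves)
            puts("no change, nothing extra to reserve");
        else
            printf("reserve %" PRIu64 " more bytes\n",
                   (new_leaves - old_leaves) * per_item);
        return 0;
    }

Because the leaf count is a step function, small reservations often fall into the "no change" branch and cost nothing beyond the per-extent accounting.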
btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; + struct btrfs_block_rsv *block_rsv = &fs_info->delalloc_block_rsv; u64 to_reserve = 0; u64 csum_bytes; unsigned nr_extents = 0; @@ -5949,13 +5951,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) } if (flush != BTRFS_RESERVE_NO_FLUSH && - btrfs_transaction_in_commit(root->fs_info)) + btrfs_transaction_in_commit(fs_info)) schedule_timeout(1); if (delalloc_lock) mutex_lock(&BTRFS_I(inode)->delalloc_mutex); - num_bytes = ALIGN(num_bytes, root->sectorsize); + num_bytes = ALIGN(num_bytes, fs_info->sectorsize); spin_lock(&BTRFS_I(inode)->lock); nr_extents = (unsigned)div64_u64(num_bytes + @@ -5970,28 +5972,29 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) BTRFS_I(inode)->reserved_extents; /* We always want to reserve a slot for updating the inode. */ - to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents + 1); + to_reserve = btrfs_calc_trans_metadata_size(fs_info, nr_extents + 1); to_reserve += calc_csum_metadata_size(inode, num_bytes, 1); csum_bytes = BTRFS_I(inode)->csum_bytes; spin_unlock(&BTRFS_I(inode)->lock); - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags)) { + if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { ret = btrfs_qgroup_reserve_meta(root, - nr_extents * root->nodesize); + nr_extents * fs_info->nodesize); if (ret) goto out_fail; } ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush); if (unlikely(ret)) { - btrfs_qgroup_free_meta(root, nr_extents * root->nodesize); + btrfs_qgroup_free_meta(root, + nr_extents * fs_info->nodesize); goto out_fail; } spin_lock(&BTRFS_I(inode)->lock); if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED, &BTRFS_I(inode)->runtime_flags)) { - to_reserve -= btrfs_calc_trans_metadata_size(root, 1); + to_reserve -= btrfs_calc_trans_metadata_size(fs_info, 1); release_extra = true; } BTRFS_I(inode)->reserved_extents += nr_extents; @@ -6001,12 +6004,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); if (to_reserve) - trace_btrfs_space_reservation(root->fs_info, "delalloc", + trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), to_reserve, 1); if (release_extra) - btrfs_block_rsv_release(root, block_rsv, - btrfs_calc_trans_metadata_size(root, - 1)); + btrfs_block_rsv_release(fs_info, block_rsv, + btrfs_calc_trans_metadata_size(fs_info, 1)); return 0; out_fail: @@ -6061,11 +6063,11 @@ out_fail: } spin_unlock(&BTRFS_I(inode)->lock); if (dropped) - to_free += btrfs_calc_trans_metadata_size(root, dropped); + to_free += btrfs_calc_trans_metadata_size(fs_info, dropped); if (to_free) { - btrfs_block_rsv_release(root, block_rsv, to_free); - trace_btrfs_space_reservation(root->fs_info, "delalloc", + btrfs_block_rsv_release(fs_info, block_rsv, to_free); + trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), to_free, 0); } if (delalloc_lock) @@ -6084,11 +6086,11 @@ out_fail: */ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 to_free = 0; unsigned dropped; - num_bytes = ALIGN(num_bytes, root->sectorsize); + num_bytes = ALIGN(num_bytes, fs_info->sectorsize); spin_lock(&BTRFS_I(inode)->lock); 
dropped = drop_outstanding_extent(inode, num_bytes); @@ -6096,16 +6098,15 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) to_free = calc_csum_metadata_size(inode, num_bytes, 0); spin_unlock(&BTRFS_I(inode)->lock); if (dropped > 0) - to_free += btrfs_calc_trans_metadata_size(root, dropped); + to_free += btrfs_calc_trans_metadata_size(fs_info, dropped); - if (btrfs_is_testing(root->fs_info)) + if (btrfs_is_testing(fs_info)) return; - trace_btrfs_space_reservation(root->fs_info, "delalloc", + trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode), to_free, 0); - btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, - to_free); + btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free); } /** @@ -6166,11 +6167,10 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len) } static int update_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, + struct btrfs_fs_info *info, u64 bytenr, u64 num_bytes, int alloc) { struct btrfs_block_group_cache *cache = NULL; - struct btrfs_fs_info *info = root->fs_info; u64 total = num_bytes; u64 old_val; u64 byte_in_group; @@ -6211,7 +6211,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); - if (btrfs_test_opt(root->fs_info, SPACE_CACHE) && + if (btrfs_test_opt(info, SPACE_CACHE) && cache->disk_cache_state < BTRFS_DC_CLEAR) cache->disk_cache_state = BTRFS_DC_CLEAR; @@ -6236,7 +6236,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - trace_btrfs_space_reservation(root->fs_info, "pinned", + trace_btrfs_space_reservation(info, "pinned", cache->space_info->flags, num_bytes, 1); set_extent_dirty(info->pinned_extents, @@ -6276,19 +6276,19 @@ static int update_block_group(struct btrfs_trans_handle *trans, return 0; } -static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) +static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) { struct btrfs_block_group_cache *cache; u64 bytenr; - spin_lock(&root->fs_info->block_group_cache_lock); - bytenr = root->fs_info->first_logical_byte; - spin_unlock(&root->fs_info->block_group_cache_lock); + spin_lock(&fs_info->block_group_cache_lock); + bytenr = fs_info->first_logical_byte; + spin_unlock(&fs_info->block_group_cache_lock); if (bytenr < (u64)-1) return bytenr; - cache = btrfs_lookup_first_block_group(root->fs_info, search_start); + cache = btrfs_lookup_first_block_group(fs_info, search_start); if (!cache) return 0; @@ -6298,7 +6298,7 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) return bytenr; } -static int pin_down_extent(struct btrfs_root *root, +static int pin_down_extent(struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *cache, u64 bytenr, u64 num_bytes, int reserved) { @@ -6313,9 +6313,9 @@ static int pin_down_extent(struct btrfs_root *root, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - trace_btrfs_space_reservation(root->fs_info, "pinned", + trace_btrfs_space_reservation(fs_info, "pinned", cache->space_info->flags, num_bytes, 1); - set_extent_dirty(root->fs_info->pinned_extents, bytenr, + set_extent_dirty(fs_info->pinned_extents, bytenr, bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL); return 0; } @@ -6323,15 +6323,15 @@ static int pin_down_extent(struct btrfs_root *root, /* * this function must be called within transaction */ -int 
btrfs_pin_extent(struct btrfs_root *root, +int btrfs_pin_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, int reserved) { struct btrfs_block_group_cache *cache; - cache = btrfs_lookup_block_group(root->fs_info, bytenr); + cache = btrfs_lookup_block_group(fs_info, bytenr); BUG_ON(!cache); /* Logic error */ - pin_down_extent(root, cache, bytenr, num_bytes, reserved); + pin_down_extent(fs_info, cache, bytenr, num_bytes, reserved); btrfs_put_block_group(cache); return 0; @@ -6340,13 +6340,13 @@ int btrfs_pin_extent(struct btrfs_root *root, /* * this function must be called within transaction */ -int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, +int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes) { struct btrfs_block_group_cache *cache; int ret; - cache = btrfs_lookup_block_group(root->fs_info, bytenr); + cache = btrfs_lookup_block_group(fs_info, bytenr); if (!cache) return -EINVAL; @@ -6358,7 +6358,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, */ cache_block_group(cache, 1); - pin_down_extent(root, cache, bytenr, num_bytes, 0); + pin_down_extent(fs_info, cache, bytenr, num_bytes, 0); /* remove us from the free space cache (if we're there at all) */ ret = btrfs_remove_free_space(cache, bytenr, num_bytes); @@ -6366,13 +6366,14 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, return ret; } -static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes) +static int __exclude_logged_extent(struct btrfs_fs_info *fs_info, + u64 start, u64 num_bytes) { int ret; struct btrfs_block_group_cache *block_group; struct btrfs_caching_control *caching_ctl; - block_group = btrfs_lookup_block_group(root->fs_info, start); + block_group = btrfs_lookup_block_group(fs_info, start); if (!block_group) return -EINVAL; @@ -6387,7 +6388,7 @@ static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_b mutex_lock(&caching_ctl->mutex); if (start >= caching_ctl->progress) { - ret = add_excluded_extent(root, start, num_bytes); + ret = add_excluded_extent(fs_info, start, num_bytes); } else if (start + num_bytes <= caching_ctl->progress) { ret = btrfs_remove_free_space(block_group, start, num_bytes); @@ -6401,7 +6402,7 @@ static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_b num_bytes = (start + num_bytes) - caching_ctl->progress; start = caching_ctl->progress; - ret = add_excluded_extent(root, start, num_bytes); + ret = add_excluded_extent(fs_info, start, num_bytes); } out_lock: mutex_unlock(&caching_ctl->mutex); @@ -6411,7 +6412,7 @@ out_lock: return ret; } -int btrfs_exclude_logged_extents(struct btrfs_root *log, +int btrfs_exclude_logged_extents(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { struct btrfs_file_extent_item *item; @@ -6419,7 +6420,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log, int found_type; int i; - if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) + if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) return 0; for (i = 0; i < btrfs_header_nritems(eb); i++) { @@ -6434,7 +6435,7 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log, continue; key.objectid = btrfs_file_extent_disk_bytenr(eb, item); key.offset = btrfs_file_extent_disk_num_bytes(eb, item); - __exclude_logged_extent(log, key.objectid, key.offset); + __exclude_logged_extent(fs_info, key.objectid, key.offset); } return 0; @@ -6499,16 +6500,9 @@ void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg) * @num_bytes: The number of 
bytes in question * @delalloc: The blocks are allocated for the delalloc write * - * This is called by the allocator when it reserves space. Metadata - * reservations should be called with RESERVE_ALLOC so we do the proper - * ENOSPC accounting. For data we handle the reservation through clearing the - * delalloc bits in the io_tree. We have to do this since we could end up - * allocating less disk space for the amount of data we have reserved in the - * case of compression. - * - * If this is a reservation and the block group has become read only we cannot - * make the reservation and return -EAGAIN, otherwise this function always - * succeeds. + * This is called by the allocator when it reserves space. If this is a + * reservation and the block group has become read only we cannot make the + * reservation and return -EAGAIN, otherwise this function always succeeds. */ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache, u64 ram_bytes, u64 num_bytes, int delalloc) @@ -6568,9 +6562,8 @@ static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache, return ret; } void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root) + struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_caching_control *next; struct btrfs_caching_control *caching_ctl; struct btrfs_block_group_cache *cache; @@ -6604,11 +6597,11 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, * what it should be based on the mount options. */ static struct btrfs_free_cluster * -fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info, - u64 *empty_cluster) +fetch_cluster_info(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 *empty_cluster) { struct btrfs_free_cluster *ret = NULL; - bool ssd = btrfs_test_opt(root->fs_info, SSD); + bool ssd = btrfs_test_opt(fs_info, SSD); *empty_cluster = 0; if (btrfs_mixed_space_info(space_info)) @@ -6617,20 +6610,20 @@ fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info, if (ssd) *empty_cluster = SZ_2M; if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { - ret = &root->fs_info->meta_alloc_cluster; + ret = &fs_info->meta_alloc_cluster; if (!ssd) *empty_cluster = SZ_64K; } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) { - ret = &root->fs_info->data_alloc_cluster; + ret = &fs_info->data_alloc_cluster; } return ret; } -static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, +static int unpin_extent_range(struct btrfs_fs_info *fs_info, + u64 start, u64 end, const bool return_free_space) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_group_cache *cache = NULL; struct btrfs_space_info *space_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; @@ -6650,7 +6643,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, cache = btrfs_lookup_block_group(fs_info, start); BUG_ON(!cache); /* Logic error */ - cluster = fetch_cluster_info(root, + cluster = fetch_cluster_info(fs_info, cache->space_info, &empty_cluster); empty_cluster <<= 1; @@ -6729,9 +6722,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, } int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, - struct btrfs_root *root) + struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_group_cache *block_group, *tmp; struct list_head *deleted_bgs; struct extent_io_tree 
*unpin; @@ -6753,12 +6745,12 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, break; } - if (btrfs_test_opt(root->fs_info, DISCARD)) - ret = btrfs_discard_extent(root, start, + if (btrfs_test_opt(fs_info, DISCARD)) + ret = btrfs_discard_extent(fs_info, start, end + 1 - start, NULL); clear_extent_dirty(unpin, start, end); - unpin_extent_range(root, start, end, true); + unpin_extent_range(fs_info, start, end, true); mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); } @@ -6774,7 +6766,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, ret = -EROFS; if (!trans->aborted) - ret = btrfs_discard_extent(root, + ret = btrfs_discard_extent(fs_info, block_group->key.objectid, block_group->key.offset, &trimmed); @@ -6816,7 +6808,7 @@ static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, static int __btrfs_free_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *info, struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, u64 owner_offset, int refs_to_drop, @@ -6824,7 +6816,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, { struct btrfs_key key; struct btrfs_path *path; - struct btrfs_fs_info *info = root->fs_info; struct btrfs_root *extent_root = info->extent_root; struct extent_buffer *leaf; struct btrfs_extent_item *ei; @@ -6839,8 +6830,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, u64 bytenr = node->bytenr; u64 num_bytes = node->num_bytes; int last_ref = 0; - bool skinny_metadata = btrfs_fs_incompat(root->fs_info, - SKINNY_METADATA); + bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); path = btrfs_alloc_path(); if (!path) @@ -6937,8 +6927,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, "umm, got %d back from search, was looking for %llu", ret, bytenr); if (ret > 0) - btrfs_print_leaf(extent_root, - path->nodes[0]); + btrfs_print_leaf(info, path->nodes[0]); } if (ret < 0) { btrfs_abort_transaction(trans, ret); @@ -6947,7 +6936,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, extent_slot = path->slots[0]; } } else if (WARN_ON(ret == -ENOENT)) { - btrfs_print_leaf(extent_root, path->nodes[0]); + btrfs_print_leaf(info, path->nodes[0]); btrfs_err(info, "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", bytenr, parent, root_objectid, owner_objectid, @@ -6984,7 +6973,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_err(info, "umm, got %d back from search, was looking for %llu", ret, bytenr); - btrfs_print_leaf(extent_root, path->nodes[0]); + btrfs_print_leaf(info, path->nodes[0]); } if (ret < 0) { btrfs_abort_transaction(trans, ret); @@ -7040,7 +7029,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto out; } } - add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid, + add_pinned_bytes(info, -num_bytes, owner_objectid, root_objectid); } else { if (found_extent) { @@ -7065,21 +7054,20 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_release_path(path); if (is_data) { - ret = btrfs_del_csums(trans, root, bytenr, num_bytes); + ret = btrfs_del_csums(trans, info, bytenr, num_bytes); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } } - ret = add_to_free_space_tree(trans, root->fs_info, bytenr, - num_bytes); + ret = add_to_free_space_tree(trans, info, bytenr, num_bytes); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } - ret = 
update_block_group(trans, root, bytenr, num_bytes, 0); + ret = update_block_group(trans, info, bytenr, num_bytes, 0); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -7099,7 +7087,7 @@ out: * removes it from the tree. */ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr) + u64 bytenr) { struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; @@ -7169,15 +7157,17 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf, u64 parent, int last_ref) { + struct btrfs_fs_info *fs_info = root->fs_info; int pin = 1; int ret; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, - buf->start, buf->len, - parent, root->root_key.objectid, - btrfs_header_level(buf), - BTRFS_DROP_DELAYED_REF, NULL); + ret = btrfs_add_delayed_tree_ref(fs_info, trans, + buf->start, buf->len, + parent, + root->root_key.objectid, + btrfs_header_level(buf), + BTRFS_DROP_DELAYED_REF, NULL); BUG_ON(ret); /* -ENOMEM */ } @@ -7188,15 +7178,16 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *cache; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { - ret = check_ref_cleanup(trans, root, buf->start); + ret = check_ref_cleanup(trans, buf->start); if (!ret) goto out; } - cache = btrfs_lookup_block_group(root->fs_info, buf->start); + cache = btrfs_lookup_block_group(fs_info, buf->start); if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - pin_down_extent(root, cache, buf->start, buf->len, 1); + pin_down_extent(fs_info, cache, buf->start, + buf->len, 1); btrfs_put_block_group(cache); goto out; } @@ -7206,13 +7197,12 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, btrfs_add_free_space(cache, buf->start, buf->len); btrfs_free_reserved_bytes(cache, buf->len, 0); btrfs_put_block_group(cache); - trace_btrfs_reserved_extent_free(root, buf->start, buf->len); + trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); pin = 0; } out: if (pin) - add_pinned_bytes(root->fs_info, buf->len, - btrfs_header_level(buf), + add_pinned_bytes(fs_info, buf->len, btrfs_header_level(buf), root->root_key.objectid); /* @@ -7223,17 +7213,17 @@ out: } /* Can return -ENOMEM */ -int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, +int btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset) { int ret; - struct btrfs_fs_info *fs_info = root->fs_info; if (btrfs_is_testing(fs_info)) return 0; - add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); + add_pinned_bytes(fs_info, num_bytes, owner, root_objectid); /* * tree log blocks never actually go into the extent allocation @@ -7242,7 +7232,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (root_objectid == BTRFS_TREE_LOG_OBJECTID) { WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID); /* unlocks the pinned mutex */ - btrfs_pin_extent(root, bytenr, num_bytes, 1); + btrfs_pin_extent(fs_info, bytenr, num_bytes, 1); ret = 0; } else if (owner < BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, @@ -7397,7 +7387,8 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group, spin_unlock(&cluster->refill_lock); - down_read(&used_bg->data_rwsem); + /* We should only have one-level nested. 
*/ + down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING); spin_lock(&cluster->refill_lock); if (used_bg == cluster->block_group) @@ -7433,8 +7424,9 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, u64 hint_byte, struct btrfs_key *ins, u64 flags, int delalloc) { + struct btrfs_fs_info *fs_info = orig_root->fs_info; int ret = 0; - struct btrfs_root *root = orig_root->fs_info->extent_root; + struct btrfs_root *root = fs_info->extent_root; struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; u64 search_start = 0; @@ -7450,16 +7442,16 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, bool orig_have_caching_bg = false; bool full_search = false; - WARN_ON(num_bytes < root->sectorsize); + WARN_ON(num_bytes < fs_info->sectorsize); ins->type = BTRFS_EXTENT_ITEM_KEY; ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(orig_root, num_bytes, empty_size, flags); + trace_find_free_extent(fs_info, num_bytes, empty_size, flags); - space_info = __find_space_info(root->fs_info, flags); + space_info = __find_space_info(fs_info, flags); if (!space_info) { - btrfs_err(root->fs_info, "No space info for %llu", flags); + btrfs_err(fs_info, "No space info for %llu", flags); return -ENOSPC; } @@ -7486,7 +7478,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, spin_unlock(&space_info->lock); } - last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster); + last_ptr = fetch_cluster_info(fs_info, space_info, &empty_cluster); if (last_ptr) { spin_lock(&last_ptr->lock); if (last_ptr->block_group) @@ -7503,11 +7495,10 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, spin_unlock(&last_ptr->lock); } - search_start = max(search_start, first_logical_byte(root, 0)); + search_start = max(search_start, first_logical_byte(fs_info, 0)); search_start = max(search_start, hint_byte); if (search_start == hint_byte) { - block_group = btrfs_lookup_block_group(root->fs_info, - search_start); + block_group = btrfs_lookup_block_group(fs_info, search_start); /* * we don't want to use the block group if it doesn't match our * allocation bits, or if its not cached. 
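The find_free_extent() hunks around this point preserve two small invariants that are easy to lose in the churn of the root-to-fs_info conversion: the search start is clamped to max(search_start, hint_byte), and a successful offset is rounded up with ALIGN(offset, fs_info->stripesize) before it is returned. What follows is a rough userspace model of just those two steps; the flat free-range list, the sizes, and the helper names are invented for illustration and are not btrfs code, which reaches the same decision through the block-group and cluster machinery.

/*
 * Toy model of the hint + alignment handling in find_free_extent().
 * Only the max(search_start, hint) clamp and the ALIGN() step mirror
 * the kernel code above; everything else is a stand-in.
 */
#include <stdint.h>
#include <stdio.h>

/* Round x up to a power-of-two boundary, like the kernel's ALIGN(). */
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

struct free_range {
	uint64_t start;
	uint64_t len;
};

/* First fit at or after search_start, aligned to 'align'. */
static int find_free(const struct free_range *ranges, int nr,
		     uint64_t search_start, uint64_t num_bytes,
		     uint64_t align, uint64_t *found)
{
	for (int i = 0; i < nr; i++) {
		uint64_t start = ranges[i].start > search_start ?
				 ranges[i].start : search_start;
		uint64_t end = ranges[i].start + ranges[i].len;

		start = ALIGN_UP(start, align);
		if (start + num_bytes <= end) {
			*found = start;
			return 0;
		}
	}
	return -1; /* would map to -ENOSPC in the kernel */
}

int main(void)
{
	struct free_range ranges[] = {
		{ 4096, 8192 },      /* 4K..12K free */
		{ 1048576, 131072 }, /* 1M..1M+128K free */
	};
	uint64_t hint = 16384, search_start = 0, found;

	/* search_start = max(search_start, hint_byte), as in the diff */
	if (hint > search_start)
		search_start = hint;

	if (!find_free(ranges, 2, search_start, 65536, 65536, &found))
		printf("allocated 64K at %llu\n", (unsigned long long)found);
	return 0;
}

Compiled with any C99 compiler, this skips the first range (the hint sits past it) and prints an aligned offset from the second, which is the shape of the decision the allocator makes per block group.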
@@ -7615,7 +7606,7 @@ have_block_group: if (offset) { /* we have a block, we're done */ spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(root, + trace_btrfs_reserve_extent_cluster(fs_info, used_block_group, search_start, num_bytes); if (used_block_group != block_group) { @@ -7671,7 +7662,7 @@ refill_cluster: block_group->full_stripe_len); /* allocate a cluster in this block group */ - ret = btrfs_find_space_cluster(root, block_group, + ret = btrfs_find_space_cluster(fs_info, block_group, last_ptr, search_start, num_bytes, aligned_cluster); @@ -7688,7 +7679,7 @@ refill_cluster: if (offset) { /* we found one, proceed */ spin_unlock(&last_ptr->refill_lock); - trace_btrfs_reserve_extent_cluster(root, + trace_btrfs_reserve_extent_cluster(fs_info, block_group, search_start, num_bytes); goto checks; @@ -7760,7 +7751,7 @@ unclustered_alloc: goto loop; } checks: - search_start = ALIGN(offset, root->stripesize); + search_start = ALIGN(offset, fs_info->stripesize); /* move on to the next group */ if (search_start + num_bytes > @@ -7786,7 +7777,7 @@ checks: ins->objectid = search_start; ins->offset = num_bytes; - trace_btrfs_reserve_extent(orig_root, block_group, + trace_btrfs_reserve_extent(fs_info, block_group, search_start, num_bytes); btrfs_release_block_group(block_group, delalloc); break; @@ -7847,7 +7838,7 @@ loop: goto out; } - ret = do_chunk_alloc(trans, root, flags, + ret = do_chunk_alloc(trans, fs_info, flags, CHUNK_ALLOC_FORCE); /* @@ -7867,7 +7858,7 @@ loop: else ret = 0; if (!exist) - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); if (ret) goto out; } @@ -7959,7 +7950,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, flags = btrfs_get_alloc_profile(root, is_data); again: - WARN_ON(num_bytes < root->sectorsize); + WARN_ON(num_bytes < fs_info->sectorsize); ret = find_free_extent(root, ram_bytes, num_bytes, empty_size, hint_byte, ins, flags, delalloc); if (!ret && !is_data) { @@ -7967,7 +7958,8 @@ again: } else if (ret == -ENOSPC) { if (!final_tried && ins->offset) { num_bytes = min(num_bytes >> 1, ins->offset); - num_bytes = round_down(num_bytes, root->sectorsize); + num_bytes = round_down(num_bytes, + fs_info->sectorsize); num_bytes = max(num_bytes, min_alloc_size); ram_bytes = num_bytes; if (num_bytes == min_alloc_size) @@ -7977,7 +7969,7 @@ again: struct btrfs_space_info *sinfo; sinfo = __find_space_info(fs_info, flags); - btrfs_err(root->fs_info, + btrfs_err(fs_info, "allocation failed flags %llu, wanted %llu", flags, num_bytes); if (sinfo) @@ -7988,54 +7980,53 @@ again: return ret; } -static int __btrfs_free_reserved_extent(struct btrfs_root *root, +static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, int pin, int delalloc) { struct btrfs_block_group_cache *cache; int ret = 0; - cache = btrfs_lookup_block_group(root->fs_info, start); + cache = btrfs_lookup_block_group(fs_info, start); if (!cache) { - btrfs_err(root->fs_info, "Unable to find block group for %llu", - start); + btrfs_err(fs_info, "Unable to find block group for %llu", + start); return -ENOSPC; } if (pin) - pin_down_extent(root, cache, start, len, 1); + pin_down_extent(fs_info, cache, start, len, 1); else { - if (btrfs_test_opt(root->fs_info, DISCARD)) - ret = btrfs_discard_extent(root, start, len, NULL); + if (btrfs_test_opt(fs_info, DISCARD)) + ret = btrfs_discard_extent(fs_info, start, len, NULL); btrfs_add_free_space(cache, start, len); btrfs_free_reserved_bytes(cache, len, delalloc); - 
trace_btrfs_reserved_extent_free(root, start, len); + trace_btrfs_reserved_extent_free(fs_info, start, len); } btrfs_put_block_group(cache); return ret; } -int btrfs_free_reserved_extent(struct btrfs_root *root, +int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, int delalloc) { - return __btrfs_free_reserved_extent(root, start, len, 0, delalloc); + return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc); } -int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root, +int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) { - return __btrfs_free_reserved_extent(root, start, len, 1, 0); + return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0); } static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, u64 parent, u64 root_objectid, u64 flags, u64 owner, u64 offset, struct btrfs_key *ins, int ref_mod) { int ret; - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_extent_item *extent_item; struct btrfs_extent_inline_ref *iref; struct btrfs_path *path; @@ -8094,24 +8085,23 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); + ret = update_block_group(trans, fs_info, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", ins->objectid, ins->offset); BUG(); } - trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset); + trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset); return ret; } static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, u64 parent, u64 root_objectid, u64 flags, struct btrfs_disk_key *key, int level, struct btrfs_key *ins) { int ret; - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_extent_item *extent_item; struct btrfs_tree_block_info *block_info; struct btrfs_extent_inline_ref *iref; @@ -8119,16 +8109,15 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; u32 size = sizeof(*extent_item) + sizeof(*iref); u64 num_bytes = ins->offset; - bool skinny_metadata = btrfs_fs_incompat(root->fs_info, - SKINNY_METADATA); + bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); if (!skinny_metadata) size += sizeof(*block_info); path = btrfs_alloc_path(); if (!path) { - btrfs_free_and_pin_reserved_extent(root, ins->objectid, - root->nodesize); + btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid, + fs_info->nodesize); return -ENOMEM; } @@ -8137,8 +8126,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, ins, size); if (ret) { btrfs_free_path(path); - btrfs_free_and_pin_reserved_extent(root, ins->objectid, - root->nodesize); + btrfs_free_and_pin_reserved_extent(fs_info, ins->objectid, + fs_info->nodesize); return ret; } @@ -8152,7 +8141,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, if (skinny_metadata) { iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); - num_bytes = root->nodesize; + num_bytes = fs_info->nodesize; } else { block_info = (struct btrfs_tree_block_info *)(extent_item + 1); btrfs_set_tree_block_key(leaf, block_info, key); @@ -8179,29 +8168,30 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = 
update_block_group(trans, root, ins->objectid, root->nodesize, - 1); + ret = update_block_group(trans, fs_info, ins->objectid, + fs_info->nodesize, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", ins->objectid, ins->offset); BUG(); } - trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize); + trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, + fs_info->nodesize); return ret; } int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 root_objectid, u64 owner, u64 offset, u64 ram_bytes, struct btrfs_key *ins) { + struct btrfs_fs_info *fs_info = trans->fs_info; int ret; BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, + ret = btrfs_add_delayed_data_ref(fs_info, trans, ins->objectid, ins->offset, 0, root_objectid, owner, offset, ram_bytes, BTRFS_ADD_DELAYED_EXTENT, @@ -8215,7 +8205,7 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, * space cache bits as well */ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, u64 root_objectid, u64 owner, u64 offset, struct btrfs_key *ins) { @@ -8227,13 +8217,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, * Mixed block groups will exclude before processing the log so we only * need to do the exclude dance if this fs isn't mixed. */ - if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) { - ret = __exclude_logged_extent(root, ins->objectid, ins->offset); + if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { + ret = __exclude_logged_extent(fs_info, ins->objectid, + ins->offset); if (ret) return ret; } - block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); + block_group = btrfs_lookup_block_group(fs_info, ins->objectid); if (!block_group) return -EINVAL; @@ -8245,7 +8236,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); - ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, + ret = alloc_reserved_file_extent(trans, fs_info, 0, root_objectid, 0, owner, offset, ins, 1); btrfs_put_block_group(block_group); return ret; @@ -8255,16 +8246,17 @@ static struct extent_buffer * btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, int level) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *buf; - buf = btrfs_find_create_tree_block(root, bytenr); + buf = btrfs_find_create_tree_block(fs_info, bytenr); if (IS_ERR(buf)) return buf; btrfs_set_header_generation(buf, trans->transid); btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); - clean_tree_block(trans, root->fs_info, buf); + clean_tree_block(trans, fs_info, buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); btrfs_set_lock_blocking(buf); @@ -8296,8 +8288,9 @@ static struct btrfs_block_rsv * use_block_rsv(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 blocksize) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *block_rsv; - struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; + struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; int ret; bool global_updated = false; @@ -8315,11 +8308,11 @@ again: if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) { global_updated = true; - 
update_global_block_rsv(root->fs_info); + update_global_block_rsv(fs_info); goto again; } - if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL * 10, /*DEFAULT_RATELIMIT_BURST*/ 1); @@ -8363,18 +8356,18 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key ins; struct btrfs_block_rsv *block_rsv; struct extent_buffer *buf; struct btrfs_delayed_extent_op *extent_op; u64 flags = 0; int ret; - u32 blocksize = root->nodesize; - bool skinny_metadata = btrfs_fs_incompat(root->fs_info, - SKINNY_METADATA); + u32 blocksize = fs_info->nodesize; + bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (btrfs_is_testing(root->fs_info)) { + if (btrfs_is_testing(fs_info)) { buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, level); if (!IS_ERR(buf)) @@ -8421,7 +8414,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, extent_op->is_data = false; extent_op->level = level; - ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, + ret = btrfs_add_delayed_tree_ref(fs_info, trans, ins.objectid, ins.offset, parent, root_objectid, level, BTRFS_ADD_DELAYED_EXTENT, @@ -8436,9 +8429,9 @@ out_free_delayed: out_free_buf: free_extent_buffer(buf); out_free_reserved: - btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); out_unuse: - unuse_block_rsv(root->fs_info, block_rsv, blocksize); + unuse_block_rsv(fs_info, block_rsv, blocksize); return ERR_PTR(ret); } @@ -8464,6 +8457,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, struct walk_control *wc, struct btrfs_path *path) { + struct btrfs_fs_info *fs_info = root->fs_info; u64 bytenr; u64 generation; u64 refs; @@ -8481,7 +8475,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, } else { wc->reada_count = wc->reada_count * 3 / 2; wc->reada_count = min_t(int, wc->reada_count, - BTRFS_NODEPTRS_PER_BLOCK(root)); + BTRFS_NODEPTRS_PER_BLOCK(fs_info)); } eb = path->nodes[wc->level]; @@ -8503,7 +8497,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, continue; /* We don't lock the tree block, it's OK to be racy here */ - ret = btrfs_lookup_extent_info(trans, root, bytenr, + ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, wc->level - 1, 1, &refs, &flags); /* We don't care about errors in readahead. 
*/ @@ -8532,226 +8526,12 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, continue; } reada: - readahead_tree_block(root, bytenr); + readahead_tree_block(fs_info, bytenr); nread++; } wc->reada_slot = slot; } -static int account_leaf_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *eb) -{ - int nr = btrfs_header_nritems(eb); - int i, extent_type, ret; - struct btrfs_key key; - struct btrfs_file_extent_item *fi; - u64 bytenr, num_bytes; - - /* We can be called directly from walk_up_proc() */ - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags)) - return 0; - - for (i = 0; i < nr; i++) { - btrfs_item_key_to_cpu(eb, &key, i); - - if (key.type != BTRFS_EXTENT_DATA_KEY) - continue; - - fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item); - /* filter out non qgroup-accountable extents */ - extent_type = btrfs_file_extent_type(eb, fi); - - if (extent_type == BTRFS_FILE_EXTENT_INLINE) - continue; - - bytenr = btrfs_file_extent_disk_bytenr(eb, fi); - if (!bytenr) - continue; - - num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); - - ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info, - bytenr, num_bytes, GFP_NOFS); - if (ret) - return ret; - } - return 0; -} - -/* - * Walk up the tree from the bottom, freeing leaves and any interior - * nodes which have had all slots visited. If a node (leaf or - * interior) is freed, the node above it will have it's slot - * incremented. The root node will never be freed. - * - * At the end of this function, we should have a path which has all - * slots incremented to the next position for a search. If we need to - * read a new node it will be NULL and the node above it will have the - * correct slot selected for a later read. - * - * If we increment the root nodes slot counter past the number of - * elements, 1 is returned to signal completion of the search. - */ -static int adjust_slots_upwards(struct btrfs_root *root, - struct btrfs_path *path, int root_level) -{ - int level = 0; - int nr, slot; - struct extent_buffer *eb; - - if (root_level == 0) - return 1; - - while (level <= root_level) { - eb = path->nodes[level]; - nr = btrfs_header_nritems(eb); - path->slots[level]++; - slot = path->slots[level]; - if (slot >= nr || level == 0) { - /* - * Don't free the root - we will detect this - * condition after our loop and return a - * positive value for caller to stop walking the tree. - */ - if (level != root_level) { - btrfs_tree_unlock_rw(eb, path->locks[level]); - path->locks[level] = 0; - - free_extent_buffer(eb); - path->nodes[level] = NULL; - path->slots[level] = 0; - } - } else { - /* - * We have a valid slot to walk back down - * from. Stop here so caller can process these - * new nodes. - */ - break; - } - - level++; - } - - eb = path->nodes[root_level]; - if (path->slots[root_level] >= btrfs_header_nritems(eb)) - return 1; - - return 0; -} - -/* - * root_eb is the subtree root and is locked before this function is called. 
- */ -static int account_shared_subtree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *root_eb, - u64 root_gen, - int root_level) -{ - int ret = 0; - int level; - struct extent_buffer *eb = root_eb; - struct btrfs_path *path = NULL; - - BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL); - BUG_ON(root_eb == NULL); - - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags)) - return 0; - - if (!extent_buffer_uptodate(root_eb)) { - ret = btrfs_read_buffer(root_eb, root_gen); - if (ret) - goto out; - } - - if (root_level == 0) { - ret = account_leaf_items(trans, root, root_eb); - goto out; - } - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - /* - * Walk down the tree. Missing extent blocks are filled in as - * we go. Metadata is accounted every time we read a new - * extent block. - * - * When we reach a leaf, we account for file extent items in it, - * walk back up the tree (adjusting slot pointers as we go) - * and restart the search process. - */ - extent_buffer_get(root_eb); /* For path */ - path->nodes[root_level] = root_eb; - path->slots[root_level] = 0; - path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ -walk_down: - level = root_level; - while (level >= 0) { - if (path->nodes[level] == NULL) { - int parent_slot; - u64 child_gen; - u64 child_bytenr; - - /* We need to get child blockptr/gen from - * parent before we can read it. */ - eb = path->nodes[level + 1]; - parent_slot = path->slots[level + 1]; - child_bytenr = btrfs_node_blockptr(eb, parent_slot); - child_gen = btrfs_node_ptr_generation(eb, parent_slot); - - eb = read_tree_block(root, child_bytenr, child_gen); - if (IS_ERR(eb)) { - ret = PTR_ERR(eb); - goto out; - } else if (!extent_buffer_uptodate(eb)) { - free_extent_buffer(eb); - ret = -EIO; - goto out; - } - - path->nodes[level] = eb; - path->slots[level] = 0; - - btrfs_tree_read_lock(eb); - btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); - path->locks[level] = BTRFS_READ_LOCK_BLOCKING; - - ret = btrfs_qgroup_insert_dirty_extent(trans, - root->fs_info, child_bytenr, - root->nodesize, GFP_NOFS); - if (ret) - goto out; - } - - if (level == 0) { - ret = account_leaf_items(trans, root, path->nodes[level]); - if (ret) - goto out; - - /* Nonzero return here means we completed our search */ - ret = adjust_slots_upwards(root, path, root_level); - if (ret) - break; - - /* Restart search with new slots */ - goto walk_down; - } - - level--; - } - - ret = 0; -out: - btrfs_free_path(path); - - return ret; -} - /* * helper to process tree block while walking down the tree. 
* @@ -8765,6 +8545,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct walk_control *wc, int lookup_info) { + struct btrfs_fs_info *fs_info = root->fs_info; int level = wc->level; struct extent_buffer *eb = path->nodes[level]; u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF; @@ -8782,7 +8563,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) || (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) { BUG_ON(!path->locks[level]); - ret = btrfs_lookup_extent_info(trans, root, + ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1, &wc->refs[level], &wc->flags[level]); @@ -8810,7 +8591,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, BUG_ON(ret); /* -ENOMEM */ ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ - ret = btrfs_set_disk_extent_flags(trans, root, eb->start, + ret = btrfs_set_disk_extent_flags(trans, fs_info, eb->start, eb->len, flag, btrfs_header_level(eb), 0); BUG_ON(ret); /* -ENOMEM */ @@ -8846,6 +8627,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct walk_control *wc, int *lookup_info) { + struct btrfs_fs_info *fs_info = root->fs_info; u64 bytenr; u64 generation; u64 parent; @@ -8871,11 +8653,11 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, } bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); - blocksize = root->nodesize; + blocksize = fs_info->nodesize; - next = btrfs_find_tree_block(root->fs_info, bytenr); + next = find_extent_buffer(fs_info, bytenr); if (!next) { - next = btrfs_find_create_tree_block(root, bytenr); + next = btrfs_find_create_tree_block(fs_info, bytenr); if (IS_ERR(next)) return PTR_ERR(next); @@ -8886,14 +8668,14 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1, + ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, &wc->refs[level - 1], &wc->flags[level - 1]); if (ret < 0) goto out_unlock; if (unlikely(wc->refs[level - 1] == 0)) { - btrfs_err(root->fs_info, "Missing references."); + btrfs_err(fs_info, "Missing references."); ret = -EIO; goto out_unlock; } @@ -8935,7 +8717,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (!next) { if (reada && level == 1) reada_walk_down(trans, root, wc, path); - next = read_tree_block(root, bytenr, generation); + next = read_tree_block(fs_info, bytenr, generation); if (IS_ERR(next)) { return PTR_ERR(next); } else if (!extent_buffer_uptodate(next)) { @@ -8980,16 +8762,17 @@ skip: } if (need_account) { - ret = account_shared_subtree(trans, root, next, - generation, level - 1); + ret = btrfs_qgroup_trace_subtree(trans, root, next, + generation, level - 1); if (ret) { - btrfs_err_rl(root->fs_info, + btrfs_err_rl(fs_info, "Error %d accounting shared subtree. 
Quota is out of sync, rescan required.", ret); } } - ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, - root->root_key.objectid, level - 1, 0); + ret = btrfs_free_extent(trans, fs_info, bytenr, blocksize, + parent, root->root_key.objectid, + level - 1, 0); if (ret) goto out_unlock; } @@ -9021,6 +8804,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct walk_control *wc) { + struct btrfs_fs_info *fs_info = root->fs_info; int ret; int level = wc->level; struct extent_buffer *eb = path->nodes[level]; @@ -9050,7 +8834,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, btrfs_set_lock_blocking(eb); path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; - ret = btrfs_lookup_extent_info(trans, root, + ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1, &wc->refs[level], &wc->flags[level]); @@ -9078,9 +8862,9 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, else ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ - ret = account_leaf_items(trans, root, eb); + ret = btrfs_qgroup_trace_leaf_items(trans, fs_info, eb); if (ret) { - btrfs_err_rl(root->fs_info, + btrfs_err_rl(fs_info, "error %d accounting leaf items. Quota is out of sync, rescan required.", ret); } @@ -9092,7 +8876,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, btrfs_set_lock_blocking(eb); path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; } - clean_tree_block(trans, root->fs_info, eb); + clean_tree_block(trans, fs_info, eb); } if (eb == root->node) { @@ -9270,7 +9054,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, btrfs_set_lock_blocking(path->nodes[level]); path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; - ret = btrfs_lookup_extent_info(trans, root, + ret = btrfs_lookup_extent_info(trans, fs_info, path->nodes[level]->start, level, 1, &wc->refs[level], &wc->flags[level]); @@ -9296,7 +9080,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, wc->update_ref = update_ref; wc->keep_locks = 0; wc->for_reloc = for_reloc; - wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); + wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); while (1) { @@ -9326,8 +9110,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } BUG_ON(wc->level == 0); - if (btrfs_should_end_transaction(trans, tree_root) || - (!for_reloc && btrfs_need_cleaner_sleep(root))) { + if (btrfs_should_end_transaction(trans) || + (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) { ret = btrfs_update_root(trans, tree_root, &root->root_key, root_item); @@ -9337,8 +9121,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, goto out_end_trans; } - btrfs_end_transaction_throttle(trans, tree_root); - if (!for_reloc && btrfs_need_cleaner_sleep(root)) { + btrfs_end_transaction_throttle(trans); + if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) { btrfs_debug(fs_info, "drop snapshot early exit"); err = -EAGAIN; @@ -9391,7 +9175,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } root_dropped = true; out_end_trans: - btrfs_end_transaction_throttle(trans, tree_root); + btrfs_end_transaction_throttle(trans); out_free: kfree(wc); btrfs_free_path(path); @@ -9421,6 +9205,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *node, struct extent_buffer *parent) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct walk_control *wc; int level; @@ -9460,7 +9245,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, wc->update_ref = 0; wc->keep_locks = 1; wc->for_reloc = 1; - 
wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root); + wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info); while (1) { wret = walk_down_tree(trans, root, path, wc); @@ -9481,7 +9266,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, return ret; } -static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) +static u64 update_block_group_flags(struct btrfs_fs_info *fs_info, u64 flags) { u64 num_devices; u64 stripped; @@ -9490,11 +9275,11 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) * if restripe for this chunk_type is on pick target profile and * return, otherwise do the usual balance */ - stripped = get_restripe_target(root->fs_info, flags); + stripped = get_restripe_target(fs_info, flags); if (stripped) return extended_to_chunk(stripped); - num_devices = root->fs_info->fs_devices->rw_devices; + num_devices = fs_info->fs_devices->rw_devices; stripped = BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | @@ -9579,6 +9364,7 @@ int btrfs_inc_block_group_ro(struct btrfs_root *root, struct btrfs_block_group_cache *cache) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; u64 alloc_flags; int ret; @@ -9593,14 +9379,14 @@ again: * block groups cache has started writing. If it already started, * back off and let this transaction commit */ - mutex_lock(&root->fs_info->ro_block_group_mutex); + mutex_lock(&fs_info->ro_block_group_mutex); if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { u64 transid = trans->transid; - mutex_unlock(&root->fs_info->ro_block_group_mutex); - btrfs_end_transaction(trans, root); + mutex_unlock(&fs_info->ro_block_group_mutex); + btrfs_end_transaction(trans); - ret = btrfs_wait_for_commit(root, transid); + ret = btrfs_wait_for_commit(fs_info, transid); if (ret) return ret; goto again; @@ -9610,9 +9396,9 @@ again: * if we are changing raid levels, try to allocate a corresponding * block group with the new raid level. 
*/ - alloc_flags = update_block_group_flags(root, cache->flags); + alloc_flags = update_block_group_flags(fs_info, cache->flags); if (alloc_flags != cache->flags) { - ret = do_chunk_alloc(trans, root, alloc_flags, + ret = do_chunk_alloc(trans, fs_info, alloc_flags, CHUNK_ALLOC_FORCE); /* * ENOSPC is allowed here, we may have enough space @@ -9628,31 +9414,31 @@ again: ret = inc_block_group_ro(cache, 0); if (!ret) goto out; - alloc_flags = get_alloc_profile(root, cache->space_info->flags); - ret = do_chunk_alloc(trans, root, alloc_flags, + alloc_flags = get_alloc_profile(fs_info, cache->space_info->flags); + ret = do_chunk_alloc(trans, fs_info, alloc_flags, CHUNK_ALLOC_FORCE); if (ret < 0) goto out; ret = inc_block_group_ro(cache, 0); out: if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { - alloc_flags = update_block_group_flags(root, cache->flags); - lock_chunks(root->fs_info->chunk_root); - check_system_chunk(trans, root, alloc_flags); - unlock_chunks(root->fs_info->chunk_root); + alloc_flags = update_block_group_flags(fs_info, cache->flags); + mutex_lock(&fs_info->chunk_mutex); + check_system_chunk(trans, fs_info, alloc_flags); + mutex_unlock(&fs_info->chunk_mutex); } - mutex_unlock(&root->fs_info->ro_block_group_mutex); + mutex_unlock(&fs_info->ro_block_group_mutex); - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); return ret; } int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 type) + struct btrfs_fs_info *fs_info, u64 type) { - u64 alloc_flags = get_alloc_profile(root, type); - return do_chunk_alloc(trans, root, alloc_flags, - CHUNK_ALLOC_FORCE); + u64 alloc_flags = get_alloc_profile(fs_info, type); + + return do_chunk_alloc(trans, fs_info, alloc_flags, CHUNK_ALLOC_FORCE); } /* @@ -9696,8 +9482,7 @@ u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) return free_bytes; } -void btrfs_dec_block_group_ro(struct btrfs_root *root, - struct btrfs_block_group_cache *cache) +void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache) { struct btrfs_space_info *sinfo = cache->space_info; u64 num_bytes; @@ -9723,11 +9508,12 @@ void btrfs_dec_block_group_ro(struct btrfs_root *root, * @return - -1 if it's not a good idea to relocate this block group, 0 if its * ok to go ahead and try. 
*/ -int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) +int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr) { + struct btrfs_root *root = fs_info->extent_root; struct btrfs_block_group_cache *block_group; struct btrfs_space_info *space_info; - struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; struct btrfs_trans_handle *trans; u64 min_free; @@ -9739,14 +9525,14 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) int full = 0; int ret = 0; - debug = btrfs_test_opt(root->fs_info, ENOSPC_DEBUG); + debug = btrfs_test_opt(fs_info, ENOSPC_DEBUG); - block_group = btrfs_lookup_block_group(root->fs_info, bytenr); + block_group = btrfs_lookup_block_group(fs_info, bytenr); /* odd, couldn't find the block group, leave it alone */ if (!block_group) { if (debug) - btrfs_warn(root->fs_info, + btrfs_warn(fs_info, "can't find block group for bytenr %llu", bytenr); return -1; @@ -9796,7 +9582,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) * 3: raid0 * 4: single */ - target = get_restripe_target(root->fs_info, block_group->flags); + target = get_restripe_target(fs_info, block_group->flags); if (target) { index = __get_raid_index(extended_to_chunk(target)); } else { @@ -9806,9 +9592,9 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) */ if (full) { if (debug) - btrfs_warn(root->fs_info, - "no space to alloc new chunk for block group %llu", - block_group->key.objectid); + btrfs_warn(fs_info, + "no space to alloc new chunk for block group %llu", + block_group->key.objectid); goto out; } @@ -9836,7 +9622,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) goto out; } - mutex_lock(&root->fs_info->chunk_mutex); + mutex_lock(&fs_info->chunk_mutex); list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { u64 dev_offset; @@ -9858,19 +9644,21 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) } } if (debug && ret == -1) - btrfs_warn(root->fs_info, - "no space to allocate a new chunk for block group %llu", - block_group->key.objectid); - mutex_unlock(&root->fs_info->chunk_mutex); - btrfs_end_transaction(trans, root); + btrfs_warn(fs_info, + "no space to allocate a new chunk for block group %llu", + block_group->key.objectid); + mutex_unlock(&fs_info->chunk_mutex); + btrfs_end_transaction(trans); out: btrfs_put_block_group(block_group); return ret; } -static int find_first_block_group(struct btrfs_root *root, - struct btrfs_path *path, struct btrfs_key *key) +static int find_first_block_group(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + struct btrfs_key *key) { + struct btrfs_root *root = fs_info->extent_root; int ret = 0; struct btrfs_key found_key; struct extent_buffer *leaf; @@ -9904,7 +9692,7 @@ static int find_first_block_group(struct btrfs_root *root, found_key.offset); read_unlock(&em_tree->lock); if (!em) { - btrfs_err(root->fs_info, + btrfs_err(fs_info, "logical %llu len %llu found bg but no related chunk", found_key.objectid, found_key.offset); ret = -ENOENT; @@ -9934,8 +9722,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info) if (block_group->iref) break; spin_unlock(&block_group->lock); - block_group = next_block_group(info->tree_root, - block_group); + block_group = next_block_group(info, block_group); } if (!block_group) { if (last == 0) @@ -10003,7 +9790,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) */ if (block_group->cached == BTRFS_CACHE_NO || 
block_group->cached == BTRFS_CACHE_ERROR) - free_excluded_extents(info->extent_root, block_group); + free_excluded_extents(info, block_group); btrfs_remove_free_space_cache(block_group); ASSERT(list_empty(&block_group->dirty_list)); @@ -10094,7 +9881,8 @@ out_err: } static struct btrfs_block_group_cache * -btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) +btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info, + u64 start, u64 size) { struct btrfs_block_group_cache *cache; @@ -10113,11 +9901,11 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) cache->key.offset = size; cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - cache->sectorsize = root->sectorsize; - cache->fs_info = root->fs_info; - cache->full_stripe_len = btrfs_full_stripe_len(root, - &root->fs_info->mapping_tree, - start); + cache->sectorsize = fs_info->sectorsize; + cache->fs_info = fs_info; + cache->full_stripe_len = btrfs_full_stripe_len(fs_info, + &fs_info->mapping_tree, + start); set_free_space_tree_thresholds(cache); atomic_set(&cache->count, 1); @@ -10136,12 +9924,11 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) return cache; } -int btrfs_read_block_groups(struct btrfs_root *root) +int btrfs_read_block_groups(struct btrfs_fs_info *info) { struct btrfs_path *path; int ret; struct btrfs_block_group_cache *cache; - struct btrfs_fs_info *info = root->fs_info; struct btrfs_space_info *space_info; struct btrfs_key key; struct btrfs_key found_key; @@ -10154,7 +9941,6 @@ int btrfs_read_block_groups(struct btrfs_root *root) feature = btrfs_super_incompat_flags(info->super_copy); mixed = !!(feature & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS); - root = info->extent_root; key.objectid = 0; key.offset = 0; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; @@ -10163,15 +9949,15 @@ int btrfs_read_block_groups(struct btrfs_root *root) return -ENOMEM; path->reada = READA_FORWARD; - cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); - if (btrfs_test_opt(root->fs_info, SPACE_CACHE) && - btrfs_super_generation(root->fs_info->super_copy) != cache_gen) + cache_gen = btrfs_super_cache_generation(info->super_copy); + if (btrfs_test_opt(info, SPACE_CACHE) && + btrfs_super_generation(info->super_copy) != cache_gen) need_clear = 1; - if (btrfs_test_opt(root->fs_info, CLEAR_CACHE)) + if (btrfs_test_opt(info, CLEAR_CACHE)) need_clear = 1; while (1) { - ret = find_first_block_group(root, path, &key); + ret = find_first_block_group(info, path, &key); if (ret > 0) break; if (ret != 0) @@ -10180,7 +9966,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - cache = btrfs_create_block_group_cache(root, found_key.objectid, + cache = btrfs_create_block_group_cache(info, found_key.objectid, found_key.offset); if (!cache) { ret = -ENOMEM; @@ -10198,7 +9984,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) * b) Setting 'dirty flag' makes sure that we flush * the new space cache info onto disk. */ - if (btrfs_test_opt(root->fs_info, SPACE_CACHE)) + if (btrfs_test_opt(info, SPACE_CACHE)) cache->disk_cache_state = BTRFS_DC_CLEAR; } @@ -10224,13 +10010,13 @@ int btrfs_read_block_groups(struct btrfs_root *root) * info has super bytes accounted for, otherwise we'll think * we have more space than we actually do. */ - ret = exclude_super_stripes(root, cache); + ret = exclude_super_stripes(info, cache); if (ret) { /* * We may have excluded something, so call this just in * case. 
*/ - free_excluded_extents(root, cache); + free_excluded_extents(info, cache); btrfs_put_block_group(cache); goto error; } @@ -10245,25 +10031,25 @@ int btrfs_read_block_groups(struct btrfs_root *root) if (found_key.offset == btrfs_block_group_used(&cache->item)) { cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; - free_excluded_extents(root, cache); + free_excluded_extents(info, cache); } else if (btrfs_block_group_used(&cache->item) == 0) { cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; - add_new_free_space(cache, root->fs_info, + add_new_free_space(cache, info, found_key.objectid, found_key.objectid + found_key.offset); - free_excluded_extents(root, cache); + free_excluded_extents(info, cache); } - ret = btrfs_add_block_group_cache(root->fs_info, cache); + ret = btrfs_add_block_group_cache(info, cache); if (ret) { btrfs_remove_free_space_cache(cache); btrfs_put_block_group(cache); goto error; } - trace_btrfs_add_block_group(root->fs_info, cache, 0); + trace_btrfs_add_block_group(info, cache, 0); ret = update_space_info(info, cache->flags, found_key.offset, btrfs_block_group_used(&cache->item), cache->bytes_super, &space_info); @@ -10282,8 +10068,8 @@ int btrfs_read_block_groups(struct btrfs_root *root) __link_block_group(space_info, cache); - set_avail_alloc_bits(root->fs_info, cache->flags); - if (btrfs_chunk_readonly(root, cache->key.objectid)) { + set_avail_alloc_bits(info, cache->flags); + if (btrfs_chunk_readonly(info, cache->key.objectid)) { inc_block_group_ro(cache, 1); } else if (btrfs_block_group_used(&cache->item) == 0) { spin_lock(&info->unused_bgs_lock); @@ -10297,8 +10083,8 @@ int btrfs_read_block_groups(struct btrfs_root *root) } } - list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { - if (!(get_alloc_profile(root, space_info->flags) & + list_for_each_entry_rcu(space_info, &info->space_info, list) { + if (!(get_alloc_profile(info, space_info->flags) & (BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | @@ -10327,10 +10113,10 @@ error: } void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, - struct btrfs_root *root) + struct btrfs_fs_info *fs_info) { struct btrfs_block_group_cache *block_group, *tmp; - struct btrfs_root *extent_root = root->fs_info->extent_root; + struct btrfs_root *extent_root = fs_info->extent_root; struct btrfs_block_group_item item; struct btrfs_key key; int ret = 0; @@ -10350,11 +10136,11 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, sizeof(item)); if (ret) btrfs_abort_transaction(trans, ret); - ret = btrfs_finish_chunk_alloc(trans, extent_root, - key.objectid, key.offset); + ret = btrfs_finish_chunk_alloc(trans, fs_info, key.objectid, + key.offset); if (ret) btrfs_abort_transaction(trans, ret); - add_block_group_free_space(trans, root->fs_info, block_group); + add_block_group_free_space(trans, fs_info, block_group); /* already aborted the transaction if it failed. 
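In the same mount loop, a block group whose used bytes equal its size, or equal zero, is marked BTRFS_CACHE_FINISHED immediately, skipping the background caching pass; the empty case publishes its entire range as free space first. A toy classification of those shortcuts (types and values are illustrative):

#include <stdio.h>

enum cache_state { CACHE_NO, CACHE_FINISHED };

static enum cache_state classify(unsigned long long size,
                                 unsigned long long used,
                                 unsigned long long *free_start,
                                 unsigned long long *free_len)
{
        *free_len = 0;
        if (used == size)
                return CACHE_FINISHED;  /* fully used: nothing to publish */
        if (used == 0) {
                *free_start = 0;        /* fully free: whole range is free */
                *free_len = size;
                return CACHE_FINISHED;
        }
        return CACHE_NO;                /* mixed: needs a real caching pass */
}

int main(void)
{
        unsigned long long start, len;

        printf("%d\n", classify(1 << 20, 0, &start, &len)); /* 1, len = 1 MiB */
        return 0;
}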
*/ next: list_del_init(&block_group->bg_list); @@ -10363,18 +10149,16 @@ next: } int btrfs_make_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytes_used, + struct btrfs_fs_info *fs_info, u64 bytes_used, u64 type, u64 chunk_objectid, u64 chunk_offset, u64 size) { - int ret; - struct btrfs_root *extent_root; struct btrfs_block_group_cache *cache; - extent_root = root->fs_info->extent_root; + int ret; - btrfs_set_log_full_commit(root->fs_info, trans); + btrfs_set_log_full_commit(fs_info, trans); - cache = btrfs_create_block_group_cache(root, chunk_offset, size); + cache = btrfs_create_block_group_cache(fs_info, chunk_offset, size); if (!cache) return -ENOMEM; @@ -10386,28 +10170,27 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; cache->needs_free_space = 1; - ret = exclude_super_stripes(root, cache); + ret = exclude_super_stripes(fs_info, cache); if (ret) { /* * We may have excluded something, so call this just in * case. */ - free_excluded_extents(root, cache); + free_excluded_extents(fs_info, cache); btrfs_put_block_group(cache); return ret; } - add_new_free_space(cache, root->fs_info, chunk_offset, - chunk_offset + size); + add_new_free_space(cache, fs_info, chunk_offset, chunk_offset + size); - free_excluded_extents(root, cache); + free_excluded_extents(fs_info, cache); #ifdef CONFIG_BTRFS_DEBUG - if (btrfs_should_fragment_free_space(root, cache)) { + if (btrfs_should_fragment_free_space(cache)) { u64 new_bytes_used = size - bytes_used; bytes_used += new_bytes_used >> 1; - fragment_free_space(root, cache); + fragment_free_space(cache); } #endif /* @@ -10415,7 +10198,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, * assigned to our block group, but don't update its counters just yet. * We want our bg to be added to the rbtree with its ->space_info set. */ - ret = update_space_info(root->fs_info, cache->flags, 0, 0, 0, + ret = update_space_info(fs_info, cache->flags, 0, 0, 0, &cache->space_info); if (ret) { btrfs_remove_free_space_cache(cache); @@ -10423,7 +10206,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, return ret; } - ret = btrfs_add_block_group_cache(root->fs_info, cache); + ret = btrfs_add_block_group_cache(fs_info, cache); if (ret) { btrfs_remove_free_space_cache(cache); btrfs_put_block_group(cache); @@ -10434,26 +10217,26 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, * Now that our block group has its ->space_info set and is inserted in * the rbtree, update the space info's counters. 
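The CONFIG_BTRFS_DEBUG branch in btrfs_make_block_group() above inflates bytes_used by half of the new group's free space before calling fragment_free_space(), so allocator stress paths get exercised. The arithmetic, as a standalone sketch:

#include <stdio.h>

/* Pretend half of the free space in a new block group is already used. */
static unsigned long long fragment_bytes_used(unsigned long long size,
                                              unsigned long long bytes_used)
{
        unsigned long long new_bytes_used = size - bytes_used;

        return bytes_used + (new_bytes_used >> 1);
}

int main(void)
{
        /* 1 GiB group with 0 bytes used is reported as 512 MiB used */
        printf("%llu\n", fragment_bytes_used(1ULL << 30, 0));
        return 0;
}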
*/ - trace_btrfs_add_block_group(root->fs_info, cache, 1); - ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, + trace_btrfs_add_block_group(fs_info, cache, 1); + ret = update_space_info(fs_info, cache->flags, size, bytes_used, cache->bytes_super, &cache->space_info); if (ret) { btrfs_remove_free_space_cache(cache); - spin_lock(&root->fs_info->block_group_cache_lock); + spin_lock(&fs_info->block_group_cache_lock); rb_erase(&cache->cache_node, - &root->fs_info->block_group_cache_tree); + &fs_info->block_group_cache_tree); RB_CLEAR_NODE(&cache->cache_node); - spin_unlock(&root->fs_info->block_group_cache_lock); + spin_unlock(&fs_info->block_group_cache_lock); btrfs_put_block_group(cache); return ret; } - update_global_block_rsv(root->fs_info); + update_global_block_rsv(fs_info); __link_block_group(cache->space_info, cache); list_add_tail(&cache->bg_list, &trans->new_bgs); - set_avail_alloc_bits(extent_root->fs_info, type); + set_avail_alloc_bits(fs_info, type); return 0; } @@ -10473,13 +10256,14 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) } int btrfs_remove_block_group(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 group_start, + struct btrfs_fs_info *fs_info, u64 group_start, struct extent_map *em) { + struct btrfs_root *root = fs_info->extent_root; struct btrfs_path *path; struct btrfs_block_group_cache *block_group; struct btrfs_free_cluster *cluster; - struct btrfs_root *tree_root = root->fs_info->tree_root; + struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_key key; struct inode *inode; struct kobject *kobj = NULL; @@ -10489,9 +10273,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_caching_control *caching_ctl = NULL; bool remove_em; - root = root->fs_info->extent_root; - - block_group = btrfs_lookup_block_group(root->fs_info, group_start); + block_group = btrfs_lookup_block_group(fs_info, group_start); BUG_ON(!block_group); BUG_ON(!block_group->ro); @@ -10499,7 +10281,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * Free the reserved super bytes from this block group before * remove it. 
*/ - free_excluded_extents(root, block_group); + free_excluded_extents(fs_info, block_group); memcpy(&key, &block_group->key, sizeof(key)); index = get_block_group_index(block_group); @@ -10511,7 +10293,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, factor = 1; /* make sure this block group isn't part of an allocation cluster */ - cluster = &root->fs_info->data_alloc_cluster; + cluster = &fs_info->data_alloc_cluster; spin_lock(&cluster->refill_lock); btrfs_return_cluster_to_free_space(block_group, cluster); spin_unlock(&cluster->refill_lock); @@ -10520,7 +10302,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * make sure this block group isn't part of a metadata * allocation cluster */ - cluster = &root->fs_info->meta_alloc_cluster; + cluster = &fs_info->meta_alloc_cluster; spin_lock(&cluster->refill_lock); btrfs_return_cluster_to_free_space(block_group, cluster); spin_unlock(&cluster->refill_lock); @@ -10549,9 +10331,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode); spin_unlock(&trans->transaction->dirty_bgs_lock); - btrfs_wait_cache_io(root, trans, block_group, - &block_group->io_ctl, path, - block_group->key.objectid); + btrfs_wait_cache_io(trans, block_group, path); btrfs_put_block_group(block_group); spin_lock(&trans->transaction->dirty_bgs_lock); } @@ -10600,14 +10380,14 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, btrfs_release_path(path); } - spin_lock(&root->fs_info->block_group_cache_lock); + spin_lock(&fs_info->block_group_cache_lock); rb_erase(&block_group->cache_node, - &root->fs_info->block_group_cache_tree); + &fs_info->block_group_cache_tree); RB_CLEAR_NODE(&block_group->cache_node); - if (root->fs_info->first_logical_byte == block_group->key.objectid) - root->fs_info->first_logical_byte = (u64)-1; - spin_unlock(&root->fs_info->block_group_cache_lock); + if (fs_info->first_logical_byte == block_group->key.objectid) + fs_info->first_logical_byte = (u64)-1; + spin_unlock(&fs_info->block_group_cache_lock); down_write(&block_group->space_info->groups_sem); /* @@ -10618,7 +10398,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, if (list_empty(&block_group->space_info->block_groups[index])) { kobj = block_group->space_info->block_group_kobjs[index]; block_group->space_info->block_group_kobjs[index] = NULL; - clear_avail_alloc_bits(root->fs_info, block_group->flags); + clear_avail_alloc_bits(fs_info, block_group->flags); } up_write(&block_group->space_info->groups_sem); if (kobj) { @@ -10631,12 +10411,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, if (block_group->cached == BTRFS_CACHE_STARTED) wait_block_group_cache_done(block_group); if (block_group->has_caching_ctl) { - down_write(&root->fs_info->commit_root_sem); + down_write(&fs_info->commit_root_sem); if (!caching_ctl) { struct btrfs_caching_control *ctl; list_for_each_entry(ctl, - &root->fs_info->caching_block_groups, list) + &fs_info->caching_block_groups, list) if (ctl->block_group == block_group) { caching_ctl = ctl; atomic_inc(&caching_ctl->count); @@ -10645,7 +10425,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, } if (caching_ctl) list_del_init(&caching_ctl->list); - up_write(&root->fs_info->commit_root_sem); + up_write(&fs_info->commit_root_sem); if (caching_ctl) { /* Once for the caching bgs list and once for us. 
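btrfs_remove_block_group() above hands both the data and metadata allocation clusters back to the free-space cache under each cluster's refill lock before the group is deleted. A toy userspace skeleton of that locked hand-back, with a pthread mutex standing in for the kernel spinlock (compile with -pthread):

#include <pthread.h>
#include <stdio.h>

struct cluster {
        pthread_mutex_t refill_lock;
        unsigned long long window_start, window_size;
};

static void return_cluster_to_free_space(struct cluster *c)
{
        pthread_mutex_lock(&c->refill_lock);
        /* drop the reserved window; the real code re-inserts the range
         * into the free-space cache of the owning block group */
        c->window_size = 0;
        pthread_mutex_unlock(&c->refill_lock);
}

int main(void)
{
        struct cluster c = { PTHREAD_MUTEX_INITIALIZER, 4096, 65536 };

        return_cluster_to_free_space(&c);
        printf("%llu\n", c.window_size); /* 0: nothing left reserved */
        return 0;
}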
*/ put_caching_control(caching_ctl); @@ -10666,7 +10446,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_lock(&block_group->space_info->lock); list_del_init(&block_group->ro_list); - if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) { + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { WARN_ON(block_group->space_info->total_bytes < block_group->key.offset); WARN_ON(block_group->space_info->bytes_readonly @@ -10682,7 +10462,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, memcpy(&key, &block_group->key, sizeof(key)); - lock_chunks(root); + mutex_lock(&fs_info->chunk_mutex); if (!list_empty(&em->list)) { /* We're in the transaction->pending_chunks list. */ free_extent_map(em); @@ -10730,14 +10510,14 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * sees the em, either in the pending_chunks list or in the * pinned_chunks list. */ - list_move_tail(&em->list, &root->fs_info->pinned_chunks); + list_move_tail(&em->list, &fs_info->pinned_chunks); } spin_unlock(&block_group->lock); if (remove_em) { struct extent_map_tree *em_tree; - em_tree = &root->fs_info->mapping_tree.map_tree; + em_tree = &fs_info->mapping_tree.map_tree; write_lock(&em_tree->lock); /* * The em might be in the pending_chunks list, so make sure the @@ -10750,9 +10530,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, free_extent_map(em); } - unlock_chunks(root); + mutex_unlock(&fs_info->chunk_mutex); - ret = remove_block_group_free_space(trans, root->fs_info, block_group); + ret = remove_block_group_free_space(trans, fs_info, block_group); if (ret) goto out; @@ -10820,7 +10600,6 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) { struct btrfs_block_group_cache *block_group; struct btrfs_space_info *space_info; - struct btrfs_root *root = fs_info->extent_root; struct btrfs_trans_handle *trans; int ret = 0; @@ -10881,7 +10660,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) trans = btrfs_start_trans_remove_block_group(fs_info, block_group->key.objectid); if (IS_ERR(trans)) { - btrfs_dec_block_group_ro(root, block_group); + btrfs_dec_block_group_ro(block_group); ret = PTR_ERR(trans); goto next; } @@ -10908,14 +10687,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) EXTENT_DIRTY); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); - btrfs_dec_block_group_ro(root, block_group); + btrfs_dec_block_group_ro(block_group); goto end_trans; } ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, EXTENT_DIRTY); if (ret) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); - btrfs_dec_block_group_ro(root, block_group); + btrfs_dec_block_group_ro(block_group); goto end_trans; } mutex_unlock(&fs_info->unused_bg_unpin_mutex); @@ -10934,7 +10713,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_unlock(&space_info->lock); /* DISCARD can flip during remount */ - trimming = btrfs_test_opt(root->fs_info, DISCARD); + trimming = btrfs_test_opt(fs_info, DISCARD); /* Implicit trim during transaction commit. */ if (trimming) @@ -10944,7 +10723,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * Btrfs_remove_chunk will abort the transaction if things go * horribly wrong. 
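The same function drops the lock_chunks()/unlock_chunks() wrappers in favor of taking fs_info->chunk_mutex directly, so the locked object is visible at the call site. A reduced before/after sketch, with a userspace mutex for illustration:

#include <pthread.h>

struct fs_info { pthread_mutex_t chunk_mutex; };

/* Before: a one-line wrapper that hid which lock was taken. */
static inline void lock_chunks(struct fs_info *fs_info)
{
        pthread_mutex_lock(&fs_info->chunk_mutex);
}

int main(void)
{
        struct fs_info fi = { PTHREAD_MUTEX_INITIALIZER };

        lock_chunks(&fi);                       /* old style */
        pthread_mutex_unlock(&fi.chunk_mutex);  /* new style: open-coded */
        return 0;
}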
*/ - ret = btrfs_remove_chunk(trans, root, + ret = btrfs_remove_chunk(trans, fs_info, block_group->key.objectid); if (ret) { @@ -10971,7 +10750,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) btrfs_get_block_group(block_group); } end_trans: - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); next: mutex_unlock(&fs_info->delete_unused_bgs_mutex); btrfs_put_block_group(block_group); @@ -11018,9 +10797,10 @@ out: return ret; } -int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) +int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, + u64 start, u64 end) { - return unpin_extent_range(root, start, end, false); + return unpin_extent_range(fs_info, start, end, false); } /* @@ -11060,7 +10840,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, ret = 0; while (1) { - struct btrfs_fs_info *fs_info = device->dev_root->fs_info; + struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_transaction *trans; u64 bytes; @@ -11110,9 +10890,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, return ret; } -int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) +int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_group_cache *cache = NULL; struct btrfs_device *device; struct list_head *devices; @@ -11167,11 +10946,11 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) } } - cache = next_block_group(fs_info->tree_root, cache); + cache = next_block_group(fs_info, cache); } - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); - devices = &root->fs_info->fs_devices->alloc_list; + mutex_lock(&fs_info->fs_devices->device_list_mutex); + devices = &fs_info->fs_devices->alloc_list; list_for_each_entry(device, devices, dev_alloc_list) { ret = btrfs_trim_free_extents(device, range->minlen, &group_trimmed); @@ -11180,7 +10959,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) trimmed += group_trimmed; } - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); range->len = trimmed; return ret; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8ed05d95584a..4ac383a3a649 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -127,7 +127,7 @@ struct extent_page_data { */ unsigned int extent_locked:1; - /* tells the submit_bio code to use a WRITE_SYNC */ + /* tells the submit_bio code to use REQ_SYNC */ unsigned int sync_io:1; }; @@ -2029,7 +2029,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, * read repair operation. 
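The comment above, "DISCARD can flip during remount", is why btrfs_delete_unused_bgs() samples the mount option once into a local trimming flag before acting on it. A loose userspace analogue of sampling a concurrently mutable option word; the C11 atomic here is an assumption of the sketch, not the kernel's mechanism:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long mount_opts;
#define OPT_DISCARD (1ul << 0)

int main(void)
{
        int trimming;

        atomic_fetch_or(&mount_opts, OPT_DISCARD);
        /* sample once; a remount flipping the bit after this point
         * cannot split the decision between two reads */
        trimming = !!(atomic_load(&mount_opts) & OPT_DISCARD);
        printf("%d\n", trimming);
        return 0;
}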
*/ btrfs_bio_counter_inc_blocked(fs_info); - ret = btrfs_map_block(fs_info, WRITE, logical, + ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, &bbio, mirror_num); if (ret) { btrfs_bio_counter_dec(fs_info); @@ -2047,7 +2047,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, return -EIO; } bio->bi_bdev = dev->bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC); + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; bio_add_page(bio, page, length, pg_offset); if (btrfsic_submit_bio_wait(bio)) { @@ -2067,20 +2067,20 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, return 0; } -int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, - int mirror_num) +int repair_eb_io_failure(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int mirror_num) { u64 start = eb->start; unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); int ret = 0; - if (root->fs_info->sb->s_flags & MS_RDONLY) + if (fs_info->sb->s_flags & MS_RDONLY) return -EROFS; for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i]; - ret = repair_io_failure(root->fs_info->btree_inode, start, + ret = repair_io_failure(fs_info->btree_inode, start, PAGE_SIZE, start, p, start - page_offset(p), mirror_num); if (ret) @@ -2341,6 +2341,7 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, struct page *page, int pg_offset, int icsum, bio_end_io_t *endio_func, void *data) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct bio *bio; struct btrfs_io_bio *btrfs_failed_bio; struct btrfs_io_bio *btrfs_bio; @@ -2351,13 +2352,12 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, bio->bi_end_io = endio_func; bio->bi_iter.bi_sector = failrec->logical >> 9; - bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + bio->bi_bdev = fs_info->fs_devices->latest_bdev; bio->bi_iter.bi_size = 0; bio->bi_private = data; btrfs_failed_bio = btrfs_io_bio(failed_bio); if (btrfs_failed_bio->csum) { - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); btrfs_bio = btrfs_io_bio(bio); @@ -2388,7 +2388,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, struct inode *inode = page->mapping->host; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct bio *bio; - int read_mode; + int read_mode = 0; int ret; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -2404,9 +2404,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, } if (failed_bio->bi_vcnt > 1) - read_mode = READ_SYNC | REQ_FAILFAST_DEV; - else - read_mode = READ_SYNC; + read_mode |= REQ_FAILFAST_DEV; phy_offset >>= inode->i_sb->s_blocksize_bits; bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, @@ -2476,6 +2474,8 @@ static void end_bio_extent_writepage(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; + struct inode *inode = page->mapping->host; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); /* We always issue full-page reads, but if some block * in a page fails to read, blk_update_request() will @@ -2484,11 +2484,11 @@ static void end_bio_extent_writepage(struct bio *bio) * if they don't add up to a full page. 
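The extent_io.c hunks above track a block layer rename: WRITE_SYNC is gone, the operation and flags are combined into bio->bi_opf as REQ_OP_WRITE | REQ_SYNC instead of going through bio_set_op_attrs(), and read repair now builds read_mode by OR-ing REQ_FAILFAST_DEV into 0. A sketch of that flag composition; the bit values below are made up for illustration and are not the real block-layer encodings:

#include <stdio.h>

#define REQ_OP_WRITE  (1u << 0)   /* illustrative values only */
#define REQ_SYNC      (1u << 8)
#define REQ_META      (1u << 9)

struct bio { unsigned int bi_opf; };

int main(void)
{
        struct bio bio;
        int sync_io = 1;

        /* op and hints live in one word, as in write_one_eb()'s
         * (epd->sync_io ? REQ_SYNC : 0) | REQ_META */
        bio.bi_opf = REQ_OP_WRITE | (sync_io ? REQ_SYNC : 0) | REQ_META;
        printf("%#x\n", bio.bi_opf);
        return 0;
}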
*/ if (bvec->bv_offset || bvec->bv_len != PAGE_SIZE) { if (bvec->bv_offset + bvec->bv_len != PAGE_SIZE) - btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info, + btrfs_err(fs_info, "partial page write in btrfs with offset %u and length %u", bvec->bv_offset, bvec->bv_len); else - btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info, + btrfs_info(fs_info, "incomplete page write in btrfs with offset %u and length %u", bvec->bv_offset, bvec->bv_len); } @@ -3484,7 +3484,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, unsigned long nr_written = 0; if (wbc->sync_mode == WB_SYNC_ALL) - write_flags = WRITE_SYNC; + write_flags = REQ_SYNC; trace___extent_writepage(page, inode, wbc); @@ -3729,7 +3729,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, unsigned long i, num_pages; unsigned long bio_flags = 0; unsigned long start, end; - int write_flags = (epd->sync_io ? WRITE_SYNC : 0) | REQ_META; + int write_flags = (epd->sync_io ? REQ_SYNC : 0) | REQ_META; int ret = 0; clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); @@ -3743,16 +3743,15 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, if (btrfs_header_level(eb) > 0) { end = btrfs_node_key_ptr_offset(nritems); - memset_extent_buffer(eb, 0, end, eb->len - end); + memzero_extent_buffer(eb, end, eb->len - end); } else { /* * leaf: * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 */ start = btrfs_item_nr_offset(nritems); - end = btrfs_leaf_data(eb) + - leaf_data_end(fs_info->tree_root, eb); - memset_extent_buffer(eb, 0, start, end - start); + end = btrfs_leaf_data(eb) + leaf_data_end(fs_info, eb); + memzero_extent_buffer(eb, start, end - start); } for (i = 0; i < num_pages; i++) { @@ -4076,7 +4075,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd) int ret; bio_set_op_attrs(epd->bio, REQ_OP_WRITE, - epd->sync_io ? WRITE_SYNC : 0); + epd->sync_io ? REQ_SYNC : 0); ret = submit_one_bio(epd->bio, 0, epd->bio_flags); BUG_ON(ret < 0); /* -ENOMEM */ @@ -4343,7 +4342,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, u64 last, get_extent_t *get_extent) { - u64 sectorsize = BTRFS_I(inode)->root->sectorsize; + u64 sectorsize = btrfs_inode_sectorsize(inode); struct extent_map *em; u64 len; @@ -4404,8 +4403,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return -ENOMEM; path->leave_spinning = 1; - start = round_down(start, BTRFS_I(inode)->root->sectorsize); - len = round_up(max, BTRFS_I(inode)->root->sectorsize) - start; + start = round_down(start, btrfs_inode_sectorsize(inode)); + len = round_up(max, btrfs_inode_sectorsize(inode)) - start; /* * lookup the last file extent. 
We're not using i_size here @@ -4539,7 +4538,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, root->objectid, btrfs_ino(inode), bytenr); if (trans) - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); if (ret < 0) goto out_free; if (ret) @@ -4720,9 +4719,9 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) WARN_ON(PageDirty(p)); SetPageUptodate(p); new->pages[i] = p; + copy_page(page_address(p), page_address(src->pages[i])); } - copy_extent_buffer(new, src, 0, 0, src->len); set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags); set_bit(EXTENT_BUFFER_DUMMY, &new->bflags); @@ -4760,21 +4759,9 @@ err: } struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, u32 nodesize) + u64 start) { - unsigned long len; - - if (!fs_info) { - /* - * Called only from tests that don't always have a fs_info - * available - */ - len = nodesize; - } else { - len = fs_info->tree_root->nodesize; - } - - return __alloc_dummy_extent_buffer(fs_info, start, len); + return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize); } static void check_buffer_tree_ref(struct extent_buffer *eb) @@ -4865,7 +4852,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, u32 nodesize) + u64 start) { struct extent_buffer *eb, *exists = NULL; int ret; @@ -4873,7 +4860,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, eb = find_extent_buffer(fs_info, start); if (eb) return eb; - eb = alloc_dummy_extent_buffer(fs_info, start, nodesize); + eb = alloc_dummy_extent_buffer(fs_info, start); if (!eb) return NULL; eb->fs_info = fs_info; @@ -4913,7 +4900,7 @@ free_eb: struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start) { - unsigned long len = fs_info->tree_root->nodesize; + unsigned long len = fs_info->nodesize; unsigned long num_pages = num_extent_pages(start, len); unsigned long i; unsigned long index = start >> PAGE_SHIFT; @@ -4924,7 +4911,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, int uptodate = 1; int ret; - if (!IS_ALIGNED(start, fs_info->tree_root->sectorsize)) { + if (!IS_ALIGNED(start, fs_info->sectorsize)) { btrfs_err(fs_info, "bad tree block start %llu", start); return ERR_PTR(-EINVAL); } @@ -5465,6 +5452,27 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, return ret; } +void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb, + const void *srcv) +{ + char *kaddr; + + WARN_ON(!PageUptodate(eb->pages[0])); + kaddr = page_address(eb->pages[0]); + memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv, + BTRFS_FSID_SIZE); +} + +void write_extent_buffer_fsid(struct extent_buffer *eb, const void *srcv) +{ + char *kaddr; + + WARN_ON(!PageUptodate(eb->pages[0])); + kaddr = page_address(eb->pages[0]); + memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv, + BTRFS_FSID_SIZE); +} + void write_extent_buffer(struct extent_buffer *eb, const void *srcv, unsigned long start, unsigned long len) { @@ -5496,8 +5504,8 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv, } } -void memset_extent_buffer(struct extent_buffer *eb, char c, - unsigned long start, unsigned long len) +void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start, + unsigned long len) { size_t cur; size_t offset; @@ -5517,7 +5525,7 @@ void 
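write_extent_buffer_fsid() and write_extent_buffer_chunk_tree_uuid(), added above, are plain memcpy()s into page 0 of the extent buffer at offsetof() positions within the on-disk header. A standalone sketch with a toy header layout; the real struct btrfs_header has different fields and sizes:

#include <stddef.h>
#include <string.h>
#include <stdio.h>

#define FSID_SIZE 16

/* Toy header: only the relative layout matters for the technique. */
struct header {
        unsigned char csum[32];
        unsigned char fsid[FSID_SIZE];
        unsigned long long bytenr;
        unsigned char chunk_tree_uuid[FSID_SIZE];
};

static void write_fsid(unsigned char *page0, const void *srcv)
{
        /* fixed offset within the first page of the buffer */
        memcpy(page0 + offsetof(struct header, fsid), srcv, FSID_SIZE);
}

int main(void)
{
        unsigned char page[256] = {0};
        unsigned char fsid[FSID_SIZE] = "0123456789abcdef";

        write_fsid(page, fsid);
        printf("%c\n", page[offsetof(struct header, fsid)]); /* '0' */
        return 0;
}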
memset_extent_buffer(struct extent_buffer *eb, char c, cur = min(len, PAGE_SIZE - offset); kaddr = page_address(page); - memset(kaddr + offset, c, cur); + memset(kaddr + offset, 0, cur); len -= cur; offset = 0; @@ -5525,6 +5533,20 @@ void memset_extent_buffer(struct extent_buffer *eb, char c, } } +void copy_extent_buffer_full(struct extent_buffer *dst, + struct extent_buffer *src) +{ + int i; + unsigned num_pages; + + ASSERT(dst->len == src->len); + + num_pages = num_extent_pages(dst->start, dst->len); + for (i = 0; i < num_pages; i++) + copy_page(page_address(dst->pages[i]), + page_address(src->pages[i])); +} + void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, unsigned long len) @@ -5766,6 +5788,7 @@ static void copy_pages(struct page *dst_page, struct page *src_page, void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { + struct btrfs_fs_info *fs_info = dst->fs_info; size_t cur; size_t dst_off_in_page; size_t src_off_in_page; @@ -5774,13 +5797,13 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_i; if (src_offset + len > dst->len) { - btrfs_err(dst->fs_info, + btrfs_err(fs_info, "memmove bogus src_offset %lu move len %lu dst len %lu", src_offset, len, dst->len); BUG_ON(1); } if (dst_offset + len > dst->len) { - btrfs_err(dst->fs_info, + btrfs_err(fs_info, "memmove bogus dst_offset %lu move len %lu dst len %lu", dst_offset, len, dst->len); BUG_ON(1); @@ -5812,6 +5835,7 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { + struct btrfs_fs_info *fs_info = dst->fs_info; size_t cur; size_t dst_off_in_page; size_t src_off_in_page; @@ -5822,13 +5846,13 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_i; if (src_offset + len > dst->len) { - btrfs_err(dst->fs_info, + btrfs_err(fs_info, "memmove bogus src_offset %lu move len %lu len %lu", src_offset, len, dst->len); BUG_ON(1); } if (dst_offset + len > dst->len) { - btrfs_err(dst->fs_info, + btrfs_err(fs_info, "memmove bogus dst_offset %lu move len %lu len %lu", dst_offset, len, dst->len); BUG_ON(1); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index ab31d145227e..17f9ce479ed7 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -371,7 +371,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, unsigned long len); struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, u32 nodesize); + u64 start); struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); @@ -405,8 +405,13 @@ void read_extent_buffer(struct extent_buffer *eb, void *dst, int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dst, unsigned long start, unsigned long len); +void write_extent_buffer_fsid(struct extent_buffer *eb, const void *src); +void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb, + const void *src); void write_extent_buffer(struct extent_buffer *eb, const void *src, unsigned long start, unsigned long len); +void copy_extent_buffer_full(struct extent_buffer *dst, + 
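memset_extent_buffer() is specialized into memzero_extent_buffer() above because every caller passed c == 0. Its page-crossing walk, reduced to a compilable toy with a two-page buffer:

#include <string.h>
#include <stdio.h>

#define PAGE_SIZE 4096

static void memzero_eb(char pages[][PAGE_SIZE], unsigned long start,
                       unsigned long len)
{
        while (len) {
                unsigned long page = start / PAGE_SIZE;
                unsigned long offset = start % PAGE_SIZE;
                unsigned long cur = len < PAGE_SIZE - offset ?
                                    len : PAGE_SIZE - offset;

                memset(pages[page] + offset, 0, cur);
                start += cur;
                len -= cur;
        }
}

int main(void)
{
        static char pages[2][PAGE_SIZE];

        memset(pages, 0xff, sizeof(pages));
        memzero_eb(pages, 4000, 200);   /* range crosses the page boundary */
        printf("%d %d\n", pages[0][4000], pages[1][103]); /* 0 0 */
        return 0;
}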
struct extent_buffer *src); void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, unsigned long dst_offset, unsigned long src_offset, unsigned long len); @@ -414,8 +419,8 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len); void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len); -void memset_extent_buffer(struct extent_buffer *eb, char c, - unsigned long start, unsigned long len); +void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start, + unsigned long len); int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, unsigned long pos); void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, @@ -452,8 +457,8 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, int clean_io_failure(struct inode *inode, u64 start, struct page *page, unsigned int pg_offset); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); -int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, - int mirror_num); +int repair_eb_io_failure(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, int mirror_num); /* * When IO fails, either with EIO or csum verification fails, we @@ -491,5 +496,5 @@ noinline u64 find_lock_delalloc_range(struct inode *inode, u64 *end, u64 max_bytes); #endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, u32 nodesize); + u64 start); #endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index d0d571c47d33..e97e322c28f0 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -34,9 +34,9 @@ #define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \ PAGE_SIZE)) -#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ +#define MAX_ORDERED_SUM_BYTES(fs_info) ((PAGE_SIZE - \ sizeof(struct btrfs_ordered_sum)) / \ - sizeof(u32) * (r)->sectorsize) + sizeof(u32) * (fs_info)->sectorsize) int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -90,13 +90,14 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 bytenr, int cow) { + struct btrfs_fs_info *fs_info = root->fs_info; int ret; struct btrfs_key file_key; struct btrfs_key found_key; struct btrfs_csum_item *item; struct extent_buffer *leaf; u64 csum_offset = 0; - u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); int csums_in_item; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; @@ -116,7 +117,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans, goto fail; csum_offset = (bytenr - found_key.offset) >> - root->fs_info->sb->s_blocksize_bits; + fs_info->sb->s_blocksize_bits; csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); csums_in_item /= csum_size; @@ -159,11 +160,11 @@ static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err) kfree(bio->csum_allocated); } -static int __btrfs_lookup_bio_sums(struct btrfs_root *root, - struct inode *inode, struct bio *bio, +static int __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u64 logical_offset, u32 *dst, int dio) { - struct bio_vec *bvec = bio->bi_io_vec; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct bio_vec *bvec; struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio); struct btrfs_csum_item *item = NULL; struct extent_io_tree *io_tree = 
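The reworked MAX_ORDERED_SUM_BYTES() macro in file-item.c sizes one allocation: a page minus the btrfs_ordered_sum header, divided into u32 checksums, each covering one sector of data. The same arithmetic with toy sizes:

#include <stdio.h>

/* Toy stand-in for the btrfs_ordered_sum header. */
struct ordered_sum_hdr { unsigned long long bytenr; int len; };

int main(void)
{
        unsigned long page_size = 4096, sectorsize = 4096;
        unsigned long max_bytes =
                (page_size - sizeof(struct ordered_sum_hdr)) /
                sizeof(unsigned int) * sectorsize;

        /* data bytes whose csums fit in one page-sized allocation */
        printf("%lu\n", max_bytes);
        return 0;
}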
&BTRFS_I(inode)->io_tree; @@ -176,9 +177,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, u64 page_bytes_left; u32 diff; int nblocks; - int bio_index = 0; - int count; - u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + int count = 0, i; + u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); path = btrfs_alloc_path(); if (!path) @@ -223,8 +223,11 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, if (dio) offset = logical_offset; - page_bytes_left = bvec->bv_len; - while (bio_index < bio->bi_vcnt) { + bio_for_each_segment_all(bvec, bio, i) { + page_bytes_left = bvec->bv_len; + if (count) + goto next; + if (!dio) offset = page_offset(bvec->bv_page) + bvec->bv_offset; count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, @@ -239,7 +242,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, if (item) btrfs_release_path(path); - item = btrfs_lookup_csum(NULL, root->fs_info->csum_root, + item = btrfs_lookup_csum(NULL, fs_info->csum_root, path, disk_bytenr, 0); if (IS_ERR(item)) { count = 1; @@ -247,10 +250,10 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, if (BTRFS_I(inode)->root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { set_extent_bits(io_tree, offset, - offset + root->sectorsize - 1, + offset + fs_info->sectorsize - 1, EXTENT_NODATASUM); } else { - btrfs_info_rl(BTRFS_I(inode)->root->fs_info, + btrfs_info_rl(fs_info, "no csum found for inode %llu start %llu", btrfs_ino(inode), offset); } @@ -266,7 +269,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, path->slots[0]); item_last_offset = item_start_offset + (item_size / csum_size) * - root->sectorsize; + fs_info->sectorsize; item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_csum_item); } @@ -275,7 +278,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, * a single leaf so it will also fit inside a u32 */ diff = disk_bytenr - item_start_offset; - diff = diff / root->sectorsize; + diff = diff / fs_info->sectorsize; diff = diff * csum_size; count = min_t(int, nblocks, (item_last_offset - disk_bytenr) >> inode->i_sb->s_blocksize_bits); @@ -285,48 +288,35 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, found: csum += count * csum_size; nblocks -= count; - +next: while (count--) { - disk_bytenr += root->sectorsize; - offset += root->sectorsize; - page_bytes_left -= root->sectorsize; - if (!page_bytes_left) { - bio_index++; - /* - * make sure we're still inside the - * bio before we update page_bytes_left - */ - if (bio_index >= bio->bi_vcnt) { - WARN_ON_ONCE(count); - goto done; - } - bvec++; - page_bytes_left = bvec->bv_len; - } - + disk_bytenr += fs_info->sectorsize; + offset += fs_info->sectorsize; + page_bytes_left -= fs_info->sectorsize; + if (!page_bytes_left) + break; /* move to next bio */ } } -done: + WARN_ON_ONCE(count); btrfs_free_path(path); return 0; } -int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, - struct bio *bio, u32 *dst) +int btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst) { - return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0); + return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0); } -int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, - struct bio *bio, u64 offset) +int btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 offset) { - return __btrfs_lookup_bio_sums(root, inode, bio, offset, NULL, 1); + return __btrfs_lookup_bio_sums(inode, bio, offset, NULL, 1); } int 
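__btrfs_lookup_bio_sums() above replaces the hand-rolled bio_index/bvec pointer walk with bio_for_each_segment_all(). A self-contained toy version of that iterator shape; the macro here is a simplified stand-in for the kernel's, which iterates a bio's completed segment array the same way:

#include <stdio.h>

struct bio_vec { unsigned int bv_len; };
struct bio { int bi_vcnt; struct bio_vec vec[8]; };

/* Simplified analogue of the kernel macro. */
#define bio_for_each_segment_all(bvl, bio, i) \
        for (i = 0, bvl = &(bio)->vec[0]; i < (bio)->bi_vcnt; i++, bvl++)

int main(void)
{
        struct bio bio = { 3, { {4096}, {4096}, {2048} } };
        struct bio_vec *bvec;
        unsigned int total = 0;
        int i;

        bio_for_each_segment_all(bvec, &bio, i)
                total += bvec->bv_len;
        printf("%u\n", total); /* 10240 */
        return 0;
}

The conversion removes the manual bounds checks (bio_index >= bio->bi_vcnt) that the old loop needed when advancing across segments.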
btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; struct btrfs_path *path; struct extent_buffer *leaf; @@ -337,10 +327,10 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, int ret; size_t size; u64 csum_end; - u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); - ASSERT(IS_ALIGNED(start, root->sectorsize) && - IS_ALIGNED(end + 1, root->sectorsize)); + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(end + 1, fs_info->sectorsize)); path = btrfs_alloc_path(); if (!path) @@ -365,7 +355,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && key.type == BTRFS_EXTENT_CSUM_KEY) { offset = (start - key.offset) >> - root->fs_info->sb->s_blocksize_bits; + fs_info->sb->s_blocksize_bits; if (offset * csum_size < btrfs_item_size_nr(leaf, path->slots[0] - 1)) path->slots[0]--; @@ -393,7 +383,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, start = key.offset; size = btrfs_item_size_nr(leaf, path->slots[0]); - csum_end = key.offset + (size / csum_size) * root->sectorsize; + csum_end = key.offset + (size / csum_size) * fs_info->sectorsize; if (csum_end <= start) { path->slots[0]++; continue; @@ -404,8 +394,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_csum_item); while (start < csum_end) { size = min_t(size_t, csum_end - start, - MAX_ORDERED_SUM_BYTES(root)); - sums = kzalloc(btrfs_ordered_sum_size(root, size), + MAX_ORDERED_SUM_BYTES(fs_info)); + sums = kzalloc(btrfs_ordered_sum_size(fs_info, size), GFP_NOFS); if (!sums) { ret = -ENOMEM; @@ -416,16 +406,16 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, sums->len = (int)size; offset = (start - key.offset) >> - root->fs_info->sb->s_blocksize_bits; + fs_info->sb->s_blocksize_bits; offset *= csum_size; - size >>= root->fs_info->sb->s_blocksize_bits; + size >>= fs_info->sb->s_blocksize_bits; read_extent_buffer(path->nodes[0], sums->sums, ((unsigned long)item) + offset, csum_size * size); - start += root->sectorsize * size; + start += fs_info->sectorsize * size; list_add_tail(&sums->list, &tmplist); } path->slots[0]++; @@ -443,23 +433,23 @@ fail: return ret; } -int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, - struct bio *bio, u64 file_start, int contig) +int btrfs_csum_one_bio(struct inode *inode, struct bio *bio, + u64 file_start, int contig) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_sum *sums; - struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_extent *ordered = NULL; char *data; - struct bio_vec *bvec = bio->bi_io_vec; - int bio_index = 0; + struct bio_vec *bvec; int index; int nr_sectors; - int i; + int i, j; unsigned long total_bytes = 0; unsigned long this_sum_bytes = 0; u64 offset; WARN_ON(bio->bi_vcnt <= 0); - sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_iter.bi_size), + sums = kzalloc(btrfs_ordered_sum_size(fs_info, bio->bi_iter.bi_size), GFP_NOFS); if (!sums) return -ENOMEM; @@ -470,22 +460,25 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, if (contig) offset = file_start; else - offset = page_offset(bvec->bv_page) + bvec->bv_offset; + offset = 0; /* shut up gcc */ - ordered = btrfs_lookup_ordered_extent(inode, offset); - 
BUG_ON(!ordered); /* Logic error */ sums->bytenr = (u64)bio->bi_iter.bi_sector << 9; index = 0; - while (bio_index < bio->bi_vcnt) { + bio_for_each_segment_all(bvec, bio, j) { if (!contig) offset = page_offset(bvec->bv_page) + bvec->bv_offset; + if (!ordered) { + ordered = btrfs_lookup_ordered_extent(inode, offset); + BUG_ON(!ordered); /* Logic error */ + } + data = kmap_atomic(bvec->bv_page); - nr_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, - bvec->bv_len + root->sectorsize - - 1); + nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, + bvec->bv_len + fs_info->sectorsize + - 1); for (i = 0; i < nr_sectors; i++) { if (offset >= ordered->file_offset + ordered->len || @@ -500,8 +493,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, bytes_left = bio->bi_iter.bi_size - total_bytes; - sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), - GFP_NOFS); + sums = kzalloc(btrfs_ordered_sum_size(fs_info, bytes_left), + GFP_NOFS); BUG_ON(!sums); /* -ENOMEM */ sums->len = bytes_left; ordered = btrfs_lookup_ordered_extent(inode, @@ -517,21 +510,18 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, sums->sums[index] = ~(u32)0; sums->sums[index] = btrfs_csum_data(data + bvec->bv_offset - + (i * root->sectorsize), + + (i * fs_info->sectorsize), sums->sums[index], - root->sectorsize); + fs_info->sectorsize); btrfs_csum_final(sums->sums[index], (char *)(sums->sums + index)); index++; - offset += root->sectorsize; - this_sum_bytes += root->sectorsize; - total_bytes += root->sectorsize; + offset += fs_info->sectorsize; + this_sum_bytes += fs_info->sectorsize; + total_bytes += fs_info->sectorsize; } kunmap_atomic(data); - - bio_index++; - bvec++; } this_sum_bytes = 0; btrfs_add_ordered_sum(inode, ordered, sums); @@ -550,20 +540,20 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, * This calls btrfs_truncate_item with the correct args based on the * overlap, and fixes up the key as required. 
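btrfs_csum_one_bio() above rounds each segment up to whole sectors, nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bv_len + sectorsize - 1), and checksums sector by sector while advancing offset and the running byte counts. A standalone sketch of that split, with a toy xor in place of btrfs_csum_data():

#include <stdio.h>

static unsigned int toy_csum(const unsigned char *data, unsigned int len)
{
        unsigned int sum = ~0u;

        while (len--)
                sum ^= *data++;
        return sum;
}

int main(void)
{
        unsigned char seg[9000] = {0};
        unsigned int sectorsize = 4096, bv_len = sizeof(seg);
        /* round the segment up to whole sectors */
        unsigned int nr_sectors = (bv_len + sectorsize - 1) / sectorsize;
        unsigned int i;

        for (i = 0; i < nr_sectors; i++) {
                unsigned int len = bv_len - i * sectorsize;

                if (len > sectorsize)
                        len = sectorsize;
                printf("sector %u csum %#x\n", i,
                       toy_csum(seg + i * sectorsize, len));
        }
        return 0;
}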
*/ -static noinline void truncate_one_csum(struct btrfs_root *root, +static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, struct btrfs_path *path, struct btrfs_key *key, u64 bytenr, u64 len) { struct extent_buffer *leaf; - u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); u64 csum_end; u64 end_byte = bytenr + len; - u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; + u32 blocksize_bits = fs_info->sb->s_blocksize_bits; leaf = path->nodes[0]; csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; - csum_end <<= root->fs_info->sb->s_blocksize_bits; + csum_end <<= fs_info->sb->s_blocksize_bits; csum_end += key->offset; if (key->offset < bytenr && csum_end <= end_byte) { @@ -575,7 +565,7 @@ static noinline void truncate_one_csum(struct btrfs_root *root, */ u32 new_size = (bytenr - key->offset) >> blocksize_bits; new_size *= csum_size; - btrfs_truncate_item(root, path, new_size, 1); + btrfs_truncate_item(fs_info, path, new_size, 1); } else if (key->offset >= bytenr && csum_end > end_byte && end_byte > key->offset) { /* @@ -587,10 +577,10 @@ static noinline void truncate_one_csum(struct btrfs_root *root, u32 new_size = (csum_end - end_byte) >> blocksize_bits; new_size *= csum_size; - btrfs_truncate_item(root, path, new_size, 0); + btrfs_truncate_item(fs_info, path, new_size, 0); key->offset = end_byte; - btrfs_set_item_key_safe(root->fs_info, path, key); + btrfs_set_item_key_safe(fs_info, path, key); } else { BUG(); } @@ -601,18 +591,17 @@ static noinline void truncate_one_csum(struct btrfs_root *root, * range of bytes. */ int btrfs_del_csums(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, u64 len) + struct btrfs_fs_info *fs_info, u64 bytenr, u64 len) { + struct btrfs_root *root = fs_info->csum_root; struct btrfs_path *path; struct btrfs_key key; u64 end_byte = bytenr + len; u64 csum_end; struct extent_buffer *leaf; int ret; - u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); - int blocksize_bits = root->fs_info->sb->s_blocksize_bits; - - root = root->fs_info->csum_root; + u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); + int blocksize_bits = fs_info->sb->s_blocksize_bits; path = btrfs_alloc_path(); if (!path) @@ -689,7 +678,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, item_offset = btrfs_item_ptr_offset(leaf, path->slots[0]); - memset_extent_buffer(leaf, 0, item_offset + offset, + memzero_extent_buffer(leaf, item_offset + offset, shift_len); key.offset = bytenr; @@ -705,7 +694,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, key.offset = end_byte - 1; } else { - truncate_one_csum(root, path, &key, bytenr, len); + truncate_one_csum(fs_info, path, &key, bytenr, len); if (key.offset < bytenr) break; } @@ -721,6 +710,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key file_key; struct btrfs_key found_key; struct btrfs_path *path; @@ -736,7 +726,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, int index = 0; int found_next; int ret; - u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); path = btrfs_alloc_path(); if (!path) @@ -769,7 +759,7 @@ again: leaf = path->nodes[0]; item_size = btrfs_item_size_nr(leaf, path->slots[0]); if ((item_size / csum_size) >= - MAX_CSUM_ITEMS(root, 
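truncate_one_csum() above trims a csum item from the tail when the deleted byte range covers its end, or from the head otherwise, converting byte counts to checksum counts via blocksize_bits. The tail-trim arithmetic as a worked example with the diff's variable names:

#include <stdio.h>

int main(void)
{
        unsigned long long key_offset = 0, csum_end = 1 << 20; /* item covers 1 MiB */
        unsigned long long bytenr = 1 << 19, len = 1 << 19;    /* delete the tail half */
        unsigned int blocksize_bits = 12, csum_size = 4;       /* 4 KiB blocks, crc32 */

        if (key_offset < bytenr && csum_end <= bytenr + len) {
                /* keep only csums for [key_offset, bytenr) */
                unsigned int new_size =
                        ((bytenr - key_offset) >> blocksize_bits) * csum_size;
                printf("truncate item to %u bytes\n", new_size); /* 512 */
        }
        return 0;
}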
csum_size)) { + MAX_CSUM_ITEMS(fs_info, csum_size)) { /* already at max size, make a new one */ goto insert; } @@ -815,11 +805,11 @@ again: leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); csum_offset = (bytenr - found_key.offset) >> - root->fs_info->sb->s_blocksize_bits; + fs_info->sb->s_blocksize_bits; if (found_key.type != BTRFS_EXTENT_CSUM_KEY || found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || - csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) { + csum_offset >= MAX_CSUM_ITEMS(fs_info, csum_size)) { goto insert; } @@ -830,26 +820,27 @@ again: u32 diff; u32 free_space; - if (btrfs_leaf_free_space(root, leaf) < + if (btrfs_leaf_free_space(fs_info, leaf) < sizeof(struct btrfs_item) + csum_size * 2) goto insert; - free_space = btrfs_leaf_free_space(root, leaf) - + free_space = btrfs_leaf_free_space(fs_info, leaf) - sizeof(struct btrfs_item) - csum_size; tmp = sums->len - total_bytes; - tmp >>= root->fs_info->sb->s_blocksize_bits; + tmp >>= fs_info->sb->s_blocksize_bits; WARN_ON(tmp < 1); extend_nr = max_t(int, 1, (int)tmp); diff = (csum_offset + extend_nr) * csum_size; - diff = min(diff, MAX_CSUM_ITEMS(root, csum_size) * csum_size); + diff = min(diff, + MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size); diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); diff = min(free_space, diff); diff /= csum_size; diff *= csum_size; - btrfs_extend_item(root, path, diff); + btrfs_extend_item(fs_info, path, diff); ret = 0; goto csum; } @@ -861,12 +852,12 @@ insert: u64 tmp; tmp = sums->len - total_bytes; - tmp >>= root->fs_info->sb->s_blocksize_bits; + tmp >>= fs_info->sb->s_blocksize_bits; tmp = min(tmp, (next_offset - file_key.offset) >> - root->fs_info->sb->s_blocksize_bits); + fs_info->sb->s_blocksize_bits); tmp = max((u64)1, tmp); - tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size)); + tmp = min(tmp, (u64)MAX_CSUM_ITEMS(fs_info, csum_size)); ins_size = csum_size * tmp; } else { ins_size = csum_size; @@ -888,7 +879,7 @@ csum: csum_offset * csum_size); found: ins_size = (u32)(sums->len - total_bytes) >> - root->fs_info->sb->s_blocksize_bits; + fs_info->sb->s_blocksize_bits; ins_size *= csum_size; ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item, ins_size); @@ -896,7 +887,7 @@ found: ins_size); ins_size /= csum_size; - total_bytes += ins_size * root->sectorsize; + total_bytes += ins_size * fs_info->sectorsize; index += ins_size; btrfs_mark_buffer_dirty(path->nodes[0]); @@ -919,6 +910,7 @@ void btrfs_extent_item_to_extent_map(struct inode *inode, const bool new_inline, struct extent_map *em) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_buffer *leaf = path->nodes[0]; const int slot = path->slots[0]; @@ -928,7 +920,7 @@ void btrfs_extent_item_to_extent_map(struct inode *inode, u8 type = btrfs_file_extent_type(leaf, fi); int compress_type = btrfs_file_extent_compression(leaf, fi); - em->bdev = root->fs_info->fs_devices->latest_bdev; + em->bdev = fs_info->fs_devices->latest_bdev; btrfs_item_key_to_cpu(leaf, &key, slot); extent_start = key.offset; @@ -939,7 +931,8 @@ void btrfs_extent_item_to_extent_map(struct inode *inode, } else if (type == BTRFS_FILE_EXTENT_INLINE) { size_t size; size = btrfs_file_extent_inline_len(leaf, slot, fi); - extent_end = ALIGN(extent_start + size, root->sectorsize); + extent_end = ALIGN(extent_start + size, + fs_info->sectorsize); } em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); @@ -982,7 +975,7 @@ void 
btrfs_extent_item_to_extent_map(struct inode *inode, em->compress_type = compress_type; } } else { - btrfs_err(root->fs_info, + btrfs_err(fs_info, "unknown file extent item type %d, inode %llu, offset %llu, root %llu", type, btrfs_ino(inode), extent_start, root->root_key.objectid); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 3a14c87d9c92..b5c5da215d05 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -27,7 +27,6 @@ #include <linux/falloc.h> #include <linux/swap.h> #include <linux/writeback.h> -#include <linux/statfs.h> #include <linux/compat.h> #include <linux/slab.h> #include <linux/btrfs.h> @@ -96,13 +95,13 @@ static int __compare_inode_defrag(struct inode_defrag *defrag1, static int __btrfs_add_inode_defrag(struct inode *inode, struct inode_defrag *defrag) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct inode_defrag *entry; struct rb_node **p; struct rb_node *parent = NULL; int ret; - p = &root->fs_info->defrag_inodes.rb_node; + p = &fs_info->defrag_inodes.rb_node; while (*p) { parent = *p; entry = rb_entry(parent, struct inode_defrag, rb_node); @@ -126,16 +125,16 @@ static int __btrfs_add_inode_defrag(struct inode *inode, } set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); rb_link_node(&defrag->rb_node, parent, p); - rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); + rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes); return 0; } -static inline int __need_auto_defrag(struct btrfs_root *root) +static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) { - if (!btrfs_test_opt(root->fs_info, AUTO_DEFRAG)) + if (!btrfs_test_opt(fs_info, AUTO_DEFRAG)) return 0; - if (btrfs_fs_closing(root->fs_info)) + if (btrfs_fs_closing(fs_info)) return 0; return 1; @@ -148,12 +147,13 @@ static inline int __need_auto_defrag(struct btrfs_root *root) int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct inode *inode) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct inode_defrag *defrag; u64 transid; int ret; - if (!__need_auto_defrag(root)) + if (!__need_auto_defrag(fs_info)) return 0; if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) @@ -172,7 +172,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, defrag->transid = transid; defrag->root = root->root_key.objectid; - spin_lock(&root->fs_info->defrag_inodes_lock); + spin_lock(&fs_info->defrag_inodes_lock); if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) { /* * If we set IN_DEFRAG flag and evict the inode from memory, @@ -185,7 +185,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, } else { kmem_cache_free(btrfs_inode_defrag_cachep, defrag); } - spin_unlock(&root->fs_info->defrag_inodes_lock); + spin_unlock(&fs_info->defrag_inodes_lock); return 0; } @@ -197,19 +197,19 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, static void btrfs_requeue_inode_defrag(struct inode *inode, struct inode_defrag *defrag) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int ret; - if (!__need_auto_defrag(root)) + if (!__need_auto_defrag(fs_info)) goto out; /* * Here we don't check the IN_DEFRAG flag, because we need merge * them together. 
*/ - spin_lock(&root->fs_info->defrag_inodes_lock); + spin_lock(&fs_info->defrag_inodes_lock); ret = __btrfs_add_inode_defrag(inode, defrag); - spin_unlock(&root->fs_info->defrag_inodes_lock); + spin_unlock(&fs_info->defrag_inodes_lock); if (ret) goto out; return; @@ -373,7 +373,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) &fs_info->fs_state)) break; - if (!__need_auto_defrag(fs_info->tree_root)) + if (!__need_auto_defrag(fs_info)) break; /* find an inode to defrag */ @@ -485,11 +485,11 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages) * this also makes the decision about creating an inline extent vs * doing real data extents, marking pages dirty and delalloc as required. */ -int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, - struct page **pages, size_t num_pages, - loff_t pos, size_t write_bytes, - struct extent_state **cached) +int btrfs_dirty_pages(struct inode *inode, struct page **pages, + size_t num_pages, loff_t pos, size_t write_bytes, + struct extent_state **cached) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int err = 0; int i; u64 num_bytes; @@ -498,8 +498,9 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, u64 end_pos = pos + write_bytes; loff_t isize = i_size_read(inode); - start_pos = pos & ~((u64)root->sectorsize - 1); - num_bytes = round_up(write_bytes + pos - start_pos, root->sectorsize); + start_pos = pos & ~((u64) fs_info->sectorsize - 1); + num_bytes = round_up(write_bytes + pos - start_pos, + fs_info->sectorsize); end_of_last_block = start_pos + num_bytes - 1; err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, @@ -696,6 +697,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, u32 extent_item_size, int *key_inserted) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; struct btrfs_key key; @@ -706,6 +708,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, u64 num_bytes = 0; u64 extent_offset = 0; u64 extent_end = 0; + u64 last_end = start; int del_nr = 0; int del_slot = 0; int extent_type; @@ -723,7 +726,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, modify_tree = 0; update_refs = (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || - root == root->fs_info->tree_root); + root == fs_info->tree_root); while (1) { recow = 0; ret = btrfs_lookup_file_extent(trans, root, path, ino, @@ -797,8 +800,10 @@ next_slot: * extent item in the call to setup_items_for_insert() later * in this function. */ - if (extent_end == key.offset && extent_end >= search_start) + if (extent_end == key.offset && extent_end >= search_start) { + last_end = extent_end; goto delete_extent_item; + } if (extent_end <= search_start) { path->slots[0]++; @@ -851,7 +856,7 @@ next_slot: btrfs_mark_buffer_dirty(leaf); if (update_refs && disk_bytenr > 0) { - ret = btrfs_inc_extent_ref(trans, root, + ret = btrfs_inc_extent_ref(trans, fs_info, disk_bytenr, num_bytes, 0, root->root_key.objectid, new_key.objectid, @@ -861,6 +866,12 @@ next_slot: key.offset = start; } /* + * From here on out we will have actually dropped something, so + * last_end can be updated. 
+ */ + last_end = extent_end; + + /* * | ---- range to drop ----- | * | -------- extent -------- | */ @@ -872,7 +883,7 @@ next_slot: memcpy(&new_key, &key, sizeof(new_key)); new_key.offset = end; - btrfs_set_item_key_safe(root->fs_info, path, &new_key); + btrfs_set_item_key_safe(fs_info, path, &new_key); extent_offset += end - key.offset; btrfs_set_file_extent_offset(leaf, fi, extent_offset); @@ -927,9 +938,9 @@ delete_extent_item: inode_sub_bytes(inode, extent_end - key.offset); extent_end = ALIGN(extent_end, - root->sectorsize); + fs_info->sectorsize); } else if (update_refs && disk_bytenr > 0) { - ret = btrfs_free_extent(trans, root, + ret = btrfs_free_extent(trans, fs_info, disk_bytenr, num_bytes, 0, root->root_key.objectid, key.objectid, key.offset - @@ -986,7 +997,7 @@ delete_extent_item: if (!ret && replace_extent && leafs_visited == 1 && (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING || path->locks[0] == BTRFS_WRITE_LOCK) && - btrfs_leaf_free_space(root, leaf) >= + btrfs_leaf_free_space(fs_info, leaf) >= sizeof(struct btrfs_item) + extent_item_size) { key.objectid = ino; @@ -1010,7 +1021,7 @@ delete_extent_item: if (!replace_extent || !(*key_inserted)) btrfs_release_path(path); if (drop_end) - *drop_end = found ? min(end, extent_end) : end; + *drop_end = found ? min(end, last_end) : end; return ret; } @@ -1073,6 +1084,7 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot, int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct inode *inode, u64 start, u64 end) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_buffer *leaf; struct btrfs_path *path; @@ -1142,7 +1154,7 @@ again: ino, bytenr, orig_offset, &other_start, &other_end)) { new_key.offset = end; - btrfs_set_item_key_safe(root->fs_info, path, &new_key); + btrfs_set_item_key_safe(fs_info, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, @@ -1176,7 +1188,7 @@ again: trans->transid); path->slots[0]++; new_key.offset = start; - btrfs_set_item_key_safe(root->fs_info, path, &new_key); + btrfs_set_item_key_safe(fs_info, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -1222,8 +1234,8 @@ again: extent_end - split); btrfs_mark_buffer_dirty(leaf); - ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, - root->root_key.objectid, + ret = btrfs_inc_extent_ref(trans, fs_info, bytenr, num_bytes, + 0, root->root_key.objectid, ino, orig_offset); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1256,7 +1268,7 @@ again: extent_end = other_end; del_slot = path->slots[0] + 1; del_nr++; - ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes, 0, root->root_key.objectid, ino, orig_offset); if (ret) { @@ -1276,7 +1288,7 @@ again: key.offset = other_start; del_slot = path->slots[0]; del_nr++; - ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes, 0, root->root_key.objectid, ino, orig_offset); if (ret) { @@ -1409,15 +1421,16 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, u64 *lockstart, u64 *lockend, struct extent_state **cached_state) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 start_pos; u64 last_pos; int i; int ret = 0; - start_pos = round_down(pos, root->sectorsize); + start_pos = 
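The last_end additions above are the one behavioral fix in this stretch of file.c: *drop_end used to report min(end, extent_end) even when trailing extents were only examined, not dropped, so a caller could account for a larger range than it actually modified. A compilable sketch of the clamped result:

#include <stdio.h>

static unsigned long long drop_end(unsigned long long end,
                                   unsigned long long last_end, int found)
{
        /* new behaviour: never report past the last extent actually
         * dropped; old code used the last extent merely visited */
        return found ? (end < last_end ? end : last_end) : end;
}

int main(void)
{
        /* asked to drop up to 64K, but the last dropped extent ended at 16K */
        printf("%llu\n", drop_end(65536, 16384, 1)); /* 16384 */
        return 0;
}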
round_down(pos, fs_info->sectorsize); last_pos = start_pos - + round_up(pos + write_bytes - start_pos, root->sectorsize) - 1; + + round_up(pos + write_bytes - start_pos, + fs_info->sectorsize) - 1; if (start_pos < inode->i_size) { struct btrfs_ordered_extent *ordered; @@ -1464,6 +1477,7 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, static noinline int check_can_nocow(struct inode *inode, loff_t pos, size_t *write_bytes) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ordered_extent *ordered; u64 lockstart, lockend; @@ -1474,8 +1488,9 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos, if (!ret) return -ENOSPC; - lockstart = round_down(pos, root->sectorsize); - lockend = round_up(pos + *write_bytes, root->sectorsize) - 1; + lockstart = round_down(pos, fs_info->sectorsize); + lockend = round_up(pos + *write_bytes, + fs_info->sectorsize) - 1; while (1) { lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend); @@ -1509,6 +1524,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, loff_t pos) { struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct page **pages = NULL; struct extent_state *cached_state = NULL; @@ -1555,9 +1571,9 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, break; } - sector_offset = pos & (root->sectorsize - 1); + sector_offset = pos & (fs_info->sectorsize - 1); reserve_bytes = round_up(write_bytes + sector_offset, - root->sectorsize); + fs_info->sectorsize); ret = btrfs_check_data_free_space(inode, pos, write_bytes); if (ret < 0) { @@ -1577,7 +1593,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, PAGE_SIZE); reserve_bytes = round_up(write_bytes + sector_offset, - root->sectorsize); + fs_info->sectorsize); } else { break; } @@ -1621,12 +1637,10 @@ again: copied = btrfs_copy_from_user(pos, write_bytes, pages, i); - num_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, - reserve_bytes); + num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes); dirty_sectors = round_up(copied + sector_offset, - root->sectorsize); - dirty_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, - dirty_sectors); + fs_info->sectorsize); + dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors); /* * if we have trouble faulting in the pages, fall @@ -1654,11 +1668,9 @@ again: * managed to copy. 
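Most hunks in this file only change where the sector size comes from (fs_info->sectorsize instead of root->sectorsize); the rounding arithmetic in lock_and_cleanup_extent_if_need() and check_can_nocow() is unchanged. A self-contained sketch of that arithmetic, assuming a power-of-two sector size as btrfs requires:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Same semantics as the kernel's round_down()/round_up() for powers of two. */
#define ROUND_DOWN(x, a)  ((x) & ~((uint64_t)(a) - 1))
#define ROUND_UP(x, a)    (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

int main(void)
{
        uint64_t sectorsize = 4096;     /* must be a power of two */
        uint64_t pos = 5000, write_bytes = 100;

        /* Sector-aligned region a 100-byte write at offset 5000 touches. */
        uint64_t start_pos = ROUND_DOWN(pos, sectorsize);
        uint64_t last_pos  = start_pos +
                             ROUND_UP(pos + write_bytes - start_pos,
                                      sectorsize) - 1;

        assert(start_pos == 4096 && last_pos == 8191);
        printf("locked range: [%llu, %llu]\n",
               (unsigned long long)start_pos, (unsigned long long)last_pos);
        return 0;
}
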
*/ if (num_sectors > dirty_sectors) { - /* release everything except the sectors we dirtied */ release_bytes -= dirty_sectors << - root->fs_info->sb->s_blocksize_bits; - + fs_info->sb->s_blocksize_bits; if (copied > 0) { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->outstanding_extents++; @@ -1670,7 +1682,8 @@ again: } else { u64 __pos; - __pos = round_down(pos, root->sectorsize) + + __pos = round_down(pos, + fs_info->sectorsize) + (dirty_pages << PAGE_SHIFT); btrfs_delalloc_release_space(inode, __pos, release_bytes); @@ -1678,12 +1691,11 @@ again: } release_bytes = round_up(copied + sector_offset, - root->sectorsize); + fs_info->sectorsize); if (copied > 0) - ret = btrfs_dirty_pages(root, inode, pages, - dirty_pages, pos, copied, - NULL); + ret = btrfs_dirty_pages(inode, pages, dirty_pages, + pos, copied, NULL); if (need_unlock) unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state, @@ -1698,8 +1710,10 @@ again: btrfs_end_write_no_snapshoting(root); if (only_release_metadata && copied > 0) { - lockstart = round_down(pos, root->sectorsize); - lockend = round_up(pos + copied, root->sectorsize) - 1; + lockstart = round_down(pos, + fs_info->sectorsize); + lockend = round_up(pos + copied, + fs_info->sectorsize) - 1; set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, EXTENT_NORESERVE, NULL, @@ -1712,8 +1726,8 @@ again: cond_resched(); balance_dirty_pages_ratelimited(inode->i_mapping); - if (dirty_pages < (root->nodesize >> PAGE_SHIFT) + 1) - btrfs_btree_balance_dirty(root); + if (dirty_pages < (fs_info->nodesize >> PAGE_SHIFT) + 1) + btrfs_btree_balance_dirty(fs_info); pos += copied; num_written += copied; @@ -1727,7 +1741,7 @@ again: btrfs_delalloc_release_metadata(inode, release_bytes); } else { btrfs_delalloc_release_space(inode, - round_down(pos, root->sectorsize), + round_down(pos, fs_info->sectorsize), release_bytes); } } @@ -1798,6 +1812,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; u64 start_pos; u64 end_pos; @@ -1829,7 +1844,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, * although we have opened a file as writable, we have * to stop this write operation to ensure FS consistency. 
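When copy_from_user() comes up short in __btrfs_buffered_write(), the code above keeps the reservation only for the sectors that were actually dirtied and releases the rest. A worked version of that accounting, with BTRFS_BYTES_TO_BLKS() approximated as a plain shift by s_blocksize_bits (an assumption for the example):

#include <stdint.h>
#include <stdio.h>

#define ROUND_UP(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

int main(void)
{
        uint64_t sectorsize = 4096, blocksize_bits = 12;
        uint64_t pos = 5000, write_bytes = 10000;

        uint64_t sector_offset = pos & (sectorsize - 1);
        uint64_t reserve_bytes = ROUND_UP(write_bytes + sector_offset,
                                          sectorsize);
        uint64_t release_bytes = reserve_bytes;

        /* Suppose copy_from_user() faulted and copied only 3000 bytes. */
        uint64_t copied        = 3000;
        uint64_t num_sectors   = reserve_bytes >> blocksize_bits;
        uint64_t dirty_sectors = ROUND_UP(copied + sector_offset, sectorsize)
                                 >> blocksize_bits;

        if (num_sectors > dirty_sectors) {
                /* Keep the reservation for the dirtied sectors only. */
                release_bytes -= dirty_sectors << blocksize_bits;
                printf("release %llu of %llu reserved bytes\n",
                       (unsigned long long)release_bytes,
                       (unsigned long long)reserve_bytes);
        }
        return 0;
}
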
*/ - if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { inode_unlock(inode); err = -EROFS; goto out; @@ -1845,17 +1860,18 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, pos = iocb->ki_pos; count = iov_iter_count(from); - start_pos = round_down(pos, root->sectorsize); + start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); if (start_pos > oldsize) { /* Expand hole size to cover write data, preventing empty gap */ - end_pos = round_up(pos + count, root->sectorsize); + end_pos = round_up(pos + count, + fs_info->sectorsize); err = btrfs_cont_expand(inode, oldsize, end_pos); if (err) { inode_unlock(inode); goto out; } - if (start_pos > round_up(oldsize, root->sectorsize)) + if (start_pos > round_up(oldsize, fs_info->sectorsize)) clean_page = 1; } @@ -1935,6 +1951,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; @@ -2045,12 +2062,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * commit does not start nor waits for ordered extents to complete. */ smp_mb(); - if (btrfs_inode_in_log(inode, root->fs_info->generation) || + if (btrfs_inode_in_log(inode, fs_info->generation) || (full_sync && BTRFS_I(inode)->last_trans <= - root->fs_info->last_trans_committed) || + fs_info->last_trans_committed) || (!btrfs_have_ordered_extents_in_range(inode, start, len) && BTRFS_I(inode)->last_trans - <= root->fs_info->last_trans_committed)) { + <= fs_info->last_trans_committed)) { /* * We've had everything committed since the last time we were * modified so clear this flag in case it was set for whatever @@ -2129,7 +2146,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * which are indicated by ctx.io_err. */ if (ctx.io_err) { - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); ret = ctx.io_err; goto out; } @@ -2138,20 +2155,20 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (!ret) { ret = btrfs_sync_log(trans, root, &ctx); if (!ret) { - ret = btrfs_end_transaction(trans, root); + ret = btrfs_end_transaction(trans); goto out; } } if (!full_sync) { ret = btrfs_wait_ordered_range(inode, start, len); if (ret) { - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); goto out; } } - ret = btrfs_commit_transaction(trans, root); + ret = btrfs_commit_transaction(trans); } else { - ret = btrfs_end_transaction(trans, root); + ret = btrfs_end_transaction(trans); } out: return ret > 0 ? 
-EIO : ret; @@ -2208,6 +2225,7 @@ static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf, static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *path, u64 offset, u64 end) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; @@ -2216,7 +2234,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_key key; int ret; - if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) + if (btrfs_fs_incompat(fs_info, NO_HOLES)) goto out; key.objectid = btrfs_ino(inode); @@ -2224,9 +2242,15 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, key.offset = offset; ret = btrfs_search_slot(trans, root, &key, path, 0, 1); - if (ret < 0) + if (ret <= 0) { + /* + * We should have dropped this offset, so if we find it then + * something has gone horribly wrong. + */ + if (ret == 0) + ret = -EINVAL; return ret; - BUG_ON(!ret); + } leaf = path->nodes[0]; if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) { @@ -2248,7 +2272,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, u64 num_bytes; key.offset = offset; - btrfs_set_item_key_safe(root->fs_info, path, &key); + btrfs_set_item_key_safe(fs_info, path, &key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - @@ -2284,7 +2308,7 @@ out: hole_em->block_start = EXTENT_MAP_HOLE; hole_em->block_len = 0; hole_em->orig_block_len = 0; - hole_em->bdev = root->fs_info->fs_devices->latest_bdev; + hole_em->bdev = fs_info->fs_devices->latest_bdev; hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = trans->transid; @@ -2336,6 +2360,7 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_state *cached_state = NULL; struct btrfs_path *path; @@ -2347,13 +2372,13 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) u64 tail_len; u64 orig_start = offset; u64 cur_offset; - u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); + u64 min_size = btrfs_calc_trans_metadata_size(fs_info, 1); u64 drop_end; int ret = 0; int err = 0; unsigned int rsv_count; bool same_block; - bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); + bool no_holes = btrfs_fs_incompat(fs_info, NO_HOLES); u64 ino_size; bool truncated_block = false; bool updated_inode = false; @@ -2363,7 +2388,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) return ret; inode_lock(inode); - ino_size = round_up(inode->i_size, root->sectorsize); + ino_size = round_up(inode->i_size, fs_info->sectorsize); ret = find_first_non_hole(inode, &offset, &len); if (ret < 0) goto out_only_mutex; @@ -2373,11 +2398,11 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out_only_mutex; } - lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize); + lockstart = round_up(offset, btrfs_inode_sectorsize(inode)); lockend = round_down(offset + len, - BTRFS_I(inode)->root->sectorsize) - 1; - same_block = (BTRFS_BYTES_TO_BLKS(root->fs_info, offset)) - == (BTRFS_BYTES_TO_BLKS(root->fs_info, offset + len - 1)); + btrfs_inode_sectorsize(inode)) - 1; + same_block = 
(BTRFS_BYTES_TO_BLKS(fs_info, offset)) + == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)); /* * We needn't truncate any block which is beyond the end of the file * because we are sure there is no data there. @@ -2386,7 +2411,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) * Only do this if we are in the same block and we aren't doing the * entire block. */ - if (same_block && len < root->sectorsize) { + if (same_block && len < fs_info->sectorsize) { if (offset < ino_size) { truncated_block = true; ret = btrfs_truncate_block(inode, offset, len, 0); @@ -2489,12 +2514,12 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out; } - rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); + rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); if (!rsv) { ret = -ENOMEM; goto out_free; } - rsv->size = btrfs_calc_trunc_metadata_size(root, 1); + rsv->size = btrfs_calc_trans_metadata_size(fs_info, 1); rsv->failfast = 1; /* @@ -2509,7 +2534,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out_free; } - ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, + ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, 0); BUG_ON(ret); trans->block_rsv = rsv; @@ -2523,12 +2548,19 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (ret != -ENOSPC) break; - trans->block_rsv = &root->fs_info->trans_block_rsv; + trans->block_rsv = &fs_info->trans_block_rsv; - if (cur_offset < ino_size) { + if (cur_offset < drop_end && cur_offset < ino_size) { ret = fill_holes(trans, inode, path, cur_offset, drop_end); if (ret) { + /* + * If we failed then we didn't insert our hole + * entries for the area we dropped, so now the + * fs is corrupted, so we must abort the + * transaction. + */ + btrfs_abort_transaction(trans, ret); err = ret; break; } @@ -2542,8 +2574,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) break; } - btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root); + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(fs_info); trans = btrfs_start_transaction(root, rsv_count); if (IS_ERR(trans)) { @@ -2552,7 +2584,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) break; } - ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, + ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, 0); BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; @@ -2571,7 +2603,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out_trans; } - trans->block_rsv = &root->fs_info->trans_block_rsv; + trans->block_rsv = &fs_info->trans_block_rsv; /* * If we are using the NO_HOLES feature we might have had already an * hole that overlaps a part of the region [lockstart, lockend] and @@ -2593,6 +2625,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (cur_offset < ino_size && cur_offset < drop_end) { ret = fill_holes(trans, inode, path, cur_offset, drop_end); if (ret) { + /* Same comment as above. 
*/ + btrfs_abort_transaction(trans, ret); err = ret; goto out_trans; } @@ -2605,14 +2639,14 @@ out_trans: inode_inc_iversion(inode); inode->i_mtime = inode->i_ctime = current_time(inode); - trans->block_rsv = &root->fs_info->trans_block_rsv; + trans->block_rsv = &fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); updated_inode = true; - btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root); + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(fs_info); out_free: btrfs_free_path(path); - btrfs_free_block_rsv(root, rsv); + btrfs_free_block_rsv(fs_info, rsv); out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state, GFP_NOFS); @@ -2630,7 +2664,7 @@ out_only_mutex: err = PTR_ERR(trans); } else { err = btrfs_update_inode(trans, root, inode); - ret = btrfs_end_transaction(trans, root); + ret = btrfs_end_transaction(trans); } } inode_unlock(inode); @@ -2695,7 +2729,7 @@ static long btrfs_fallocate(struct file *file, int mode, u64 locked_end; u64 actual_end = 0; struct extent_map *em; - int blocksize = BTRFS_I(inode)->root->sectorsize; + int blocksize = btrfs_inode_sectorsize(inode); int ret; alloc_start = round_down(offset, blocksize); @@ -2872,9 +2906,9 @@ static long btrfs_fallocate(struct file *file, int mode, btrfs_ordered_update_i_size(inode, actual_end, NULL); ret = btrfs_update_inode(trans, root, inode); if (ret) - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); else - ret = btrfs_end_transaction(trans, root); + ret = btrfs_end_transaction(trans); } } out_unlock: @@ -2891,7 +2925,7 @@ out: static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em = NULL; struct extent_state *cached_state = NULL; u64 lockstart; @@ -2909,10 +2943,11 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) */ start = max_t(loff_t, 0, *offset); - lockstart = round_down(start, root->sectorsize); - lockend = round_up(i_size_read(inode), root->sectorsize); + lockstart = round_down(start, fs_info->sectorsize); + lockend = round_up(i_size_read(inode), + fs_info->sectorsize); if (lockend <= lockstart) - lockend = lockstart + root->sectorsize; + lockend = lockstart + fs_info->sectorsize; lockend--; len = lockend - lockstart + 1; @@ -2998,7 +3033,6 @@ const struct file_operations btrfs_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_compat_ioctl, #endif - .copy_file_range = btrfs_copy_file_range, .clone_file_range = btrfs_clone_file_range, .dedupe_file_range = btrfs_dedupe_file_range, }; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index e4b48f377d3a..7015892c9ee8 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -42,11 +42,16 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info); static void unlink_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info); +static int btrfs_wait_cache_io_root(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_io_ctl *io_ctl, + struct btrfs_path *path); static struct inode *__lookup_free_space_inode(struct btrfs_root *root, struct btrfs_path *path, u64 offset) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; struct btrfs_key location; struct btrfs_disk_key disk_key; @@ -74,9 +79,7 @@ static struct inode *__lookup_free_space_inode(struct 
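Two related fixes run through the btrfs_punch_hole()/fill_holes() hunks above: the old BUG_ON(!ret) after btrfs_search_slot() becomes a returned -EINVAL, and any fill_holes() failure now aborts the transaction because the dropped range would otherwise be left without hole entries. A compressed sketch of that error-propagation shape, using hypothetical stand-ins for the kernel helpers:

#include <errno.h>
#include <stdio.h>

/* Hypothetical stand-ins for the kernel helpers used above. */
static int search_slot(void)
{
        return 0;       /* exact match found - should never happen here */
}

static void abort_transaction(int err)
{
        fprintf(stderr, "aborting transaction: %d\n", err);
}

static int fill_holes(void)
{
        int ret = search_slot();

        /*
         * The offset was just dropped, so an exact match (ret == 0)
         * means the tree is inconsistent: report it instead of BUG_ON().
         */
        if (ret <= 0)
                return ret == 0 ? -EINVAL : ret;
        /* ... insert or merge the hole extent here ... */
        return 0;
}

int main(void)
{
        int ret = fill_holes();

        if (ret) {
                /*
                 * The hole entries for the dropped range were not
                 * inserted, so the fs would be corrupted: abort rather
                 * than continue.
                 */
                abort_transaction(ret);
        }
        return ret ? 1 : 0;
}
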
btrfs_root *root, btrfs_disk_key_to_cpu(&location, &disk_key); btrfs_release_path(path); - inode = btrfs_iget(root->fs_info->sb, &location, root, NULL); - if (!inode) - return ERR_PTR(-ENOENT); + inode = btrfs_iget(fs_info->sb, &location, root, NULL); if (IS_ERR(inode)) return inode; if (is_bad_inode(inode)) { @@ -96,6 +99,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, *block_group, struct btrfs_path *path) { struct inode *inode = NULL; + struct btrfs_fs_info *fs_info = root->fs_info; u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; spin_lock(&block_group->lock); @@ -112,8 +116,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root, spin_lock(&block_group->lock); if (!((BTRFS_I(inode)->flags & flags) == flags)) { - btrfs_info(root->fs_info, - "Old style space inode found, converting."); + btrfs_info(fs_info, "Old style space inode found, converting."); BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW; block_group->disk_cache_state = BTRFS_DC_CLEAR; @@ -153,7 +156,7 @@ static int __create_free_space_inode(struct btrfs_root *root, inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); btrfs_item_key(leaf, &disk_key, path->slots[0]); - memset_extent_buffer(leaf, 0, (unsigned long)inode_item, + memzero_extent_buffer(leaf, (unsigned long)inode_item, sizeof(*inode_item)); btrfs_set_inode_generation(leaf, inode_item, trans->transid); btrfs_set_inode_size(leaf, inode_item, 0); @@ -181,7 +184,7 @@ static int __create_free_space_inode(struct btrfs_root *root, leaf = path->nodes[0]; header = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_free_space_header); - memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header)); + memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header)); btrfs_set_free_space_key(leaf, header, &disk_key); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); @@ -205,15 +208,15 @@ int create_free_space_inode(struct btrfs_root *root, block_group->key.objectid); } -int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, +int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv) { u64 needed_bytes; int ret; /* 1 for slack space, 1 for updating the inode */ - needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) + - btrfs_calc_trans_metadata_size(root, 1); + needed_bytes = btrfs_calc_trunc_metadata_size(fs_info, 1) + + btrfs_calc_trans_metadata_size(fs_info, 1); spin_lock(&rsv->lock); if (rsv->reserved < needed_bytes) @@ -244,9 +247,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, if (!list_empty(&block_group->io_list)) { list_del_init(&block_group->io_list); - btrfs_wait_cache_io(root, trans, block_group, - &block_group->io_ctl, path, - block_group->key.objectid); + btrfs_wait_cache_io(trans, block_group, path); btrfs_put_block_group(block_group); } @@ -305,7 +306,7 @@ static int readahead_cache(struct inode *inode) } static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, - struct btrfs_root *root, int write) + int write) { int num_pages; int check_crcs = 0; @@ -327,7 +328,7 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, return -ENOMEM; io_ctl->num_pages = num_pages; - io_ctl->root = root; + io_ctl->fs_info = btrfs_sb(inode->i_sb); io_ctl->check_crcs = check_crcs; io_ctl->inode = inode; @@ -450,7 +451,7 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) gen = io_ctl->cur; if (le64_to_cpu(*gen) != generation) { - 
btrfs_err_rl(io_ctl->root->fs_info, + btrfs_err_rl(io_ctl->fs_info, "space cache generation (%llu) does not match inode (%llu)", *gen, generation); io_ctl_unmap_page(io_ctl); @@ -476,7 +477,7 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) crc = btrfs_csum_data(io_ctl->orig + offset, crc, PAGE_SIZE - offset); - btrfs_csum_final(crc, (char *)&crc); + btrfs_csum_final(crc, (u8 *)&crc); io_ctl_unmap_page(io_ctl); tmp = page_address(io_ctl->pages[0]); tmp += index; @@ -504,9 +505,9 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) io_ctl_map_page(io_ctl, 0); crc = btrfs_csum_data(io_ctl->orig + offset, crc, PAGE_SIZE - offset); - btrfs_csum_final(crc, (char *)&crc); + btrfs_csum_final(crc, (u8 *)&crc); if (val != crc) { - btrfs_err_rl(io_ctl->root->fs_info, + btrfs_err_rl(io_ctl->fs_info, "csum mismatch on free space cache"); io_ctl_unmap_page(io_ctl); return -EIO; @@ -669,6 +670,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_path *path, u64 offset) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_free_space_header *header; struct extent_buffer *leaf; struct btrfs_io_ctl io_ctl; @@ -708,23 +710,23 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, btrfs_release_path(path); if (!BTRFS_I(inode)->generation) { - btrfs_info(root->fs_info, + btrfs_info(fs_info, "The free space cache file (%llu) is invalid. skip it\n", offset); return 0; } if (BTRFS_I(inode)->generation != generation) { - btrfs_err(root->fs_info, - "free space inode generation (%llu) did not match free space cache generation (%llu)", - BTRFS_I(inode)->generation, generation); + btrfs_err(fs_info, + "free space inode generation (%llu) did not match free space cache generation (%llu)", + BTRFS_I(inode)->generation, generation); return 0; } if (!num_entries) return 0; - ret = io_ctl_init(&io_ctl, inode, root, 0); + ret = io_ctl_init(&io_ctl, inode, 0); if (ret) return ret; @@ -766,7 +768,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, ret = link_free_space(ctl, e); spin_unlock(&ctl->tree_lock); if (ret) { - btrfs_err(root->fs_info, + btrfs_err(fs_info, "Duplicate entries in free space cache, dumping"); kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; @@ -786,7 +788,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, ctl->op->recalc_thresholds(ctl); spin_unlock(&ctl->tree_lock); if (ret) { - btrfs_err(root->fs_info, + btrfs_err(fs_info, "Duplicate entries in free space cache, dumping"); kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; @@ -1033,7 +1035,7 @@ fail: } static noinline_for_stack int -write_pinned_extent_entries(struct btrfs_root *root, +write_pinned_extent_entries(struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_io_ctl *io_ctl, int *entries) @@ -1052,7 +1054,7 @@ write_pinned_extent_entries(struct btrfs_root *root, * We shouldn't have switched the pinned extents yet so this is the * right one */ - unpin = root->fs_info->pinned_extents; + unpin = fs_info->pinned_extents; start = block_group->key.objectid; @@ -1135,20 +1137,20 @@ cleanup_write_cache_enospc(struct inode *inode, GFP_NOFS); } -int btrfs_wait_cache_io(struct btrfs_root *root, - struct btrfs_trans_handle *trans, - struct btrfs_block_group_cache *block_group, - struct btrfs_io_ctl *io_ctl, - struct btrfs_path *path, u64 offset) +static int 
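The io_ctl hunks above only retype the csum buffer (u8 instead of char) and fetch fs_info from the inode; the checksum layout itself is unchanged: every page of the free-space cache file is summed, the per-page sums sit in an array at the front of page 0 (tmp = page_address(pages[0]); tmp += index), and page 0 is therefore summed at an offset. A rough model of that layout, with a toy hash standing in for the kernel's crc32c (assumed, not implemented here):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096u
#define NUM_PAGES 4

/* Stand-in for btrfs_csum_data(); the kernel uses crc32c here. */
static uint32_t toy_csum(const uint8_t *data, size_t len)
{
        uint32_t c = 5381;
        while (len--)
                c = (c << 5) + c + *data++;
        return c;
}

int main(void)
{
        static uint8_t cache[NUM_PAGES * PAGE_SIZE]; /* cache file image */
        uint32_t *sums = (uint32_t *)cache; /* csum array heads page 0 */
        size_t hdr = NUM_PAGES * sizeof(uint32_t);

        memset(cache + hdr, 0xab, sizeof(cache) - hdr);

        /* Write side: sum each page, skipping the csum array on page 0. */
        for (int i = 0; i < NUM_PAGES; i++) {
                size_t off = (i == 0) ? hdr : 0;
                sums[i] = toy_csum(cache + i * PAGE_SIZE + off,
                                   PAGE_SIZE - off);
        }

        /* Read side: recompute and compare, as io_ctl_check_crc() does. */
        uint32_t got = toy_csum(cache + hdr, PAGE_SIZE - hdr);
        printf("page 0 csum %s\n", got == sums[0] ? "ok" : "mismatch");
        return 0;
}
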
__btrfs_wait_cache_io(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_io_ctl *io_ctl, + struct btrfs_path *path, u64 offset) { int ret; struct inode *inode = io_ctl->inode; + struct btrfs_fs_info *fs_info; if (!inode) return 0; - if (block_group) - root = root->fs_info->tree_root; + fs_info = btrfs_sb(inode->i_sb); /* Flush the dirty pages in the cache file. */ ret = flush_dirty_cache(inode); @@ -1165,9 +1167,9 @@ out: BTRFS_I(inode)->generation = 0; if (block_group) { #ifdef DEBUG - btrfs_err(root->fs_info, - "failed to write free space cache for block group %llu", - block_group->key.objectid); + btrfs_err(fs_info, + "failed to write free space cache for block group %llu", + block_group->key.objectid); #endif } } @@ -1200,6 +1202,23 @@ out: } +static int btrfs_wait_cache_io_root(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_io_ctl *io_ctl, + struct btrfs_path *path) +{ + return __btrfs_wait_cache_io(root, trans, NULL, io_ctl, path, 0); +} + +int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + return __btrfs_wait_cache_io(block_group->fs_info->tree_root, trans, + block_group, &block_group->io_ctl, + path, block_group->key.objectid); +} + /** * __btrfs_write_out_cache - write out cached info to an inode * @root - the root the inode belongs to @@ -1220,6 +1239,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 offset) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_state *cached_state = NULL; LIST_HEAD(bitmap_list); int entries = 0; @@ -1231,7 +1251,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, return -EIO; WARN_ON(io_ctl->pages); - ret = io_ctl_init(io_ctl, inode, root, 1); + ret = io_ctl_init(io_ctl, inode, 1); if (ret) return ret; @@ -1277,7 +1297,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * If this changes while we are working we'll get added back to * the dirty list and redo it. No locking needed */ - ret = write_pinned_extent_entries(root, block_group, io_ctl, &entries); + ret = write_pinned_extent_entries(fs_info, block_group, + io_ctl, &entries); if (ret) goto out_nospc_locked; @@ -1296,8 +1317,8 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, io_ctl_zero_remaining_pages(io_ctl); /* Everything is written out, now we dirty the pages in the file. 
*/ - ret = btrfs_dirty_pages(root, inode, io_ctl->pages, io_ctl->num_pages, - 0, i_size_read(inode), &cached_state); + ret = btrfs_dirty_pages(inode, io_ctl->pages, io_ctl->num_pages, 0, + i_size_read(inode), &cached_state); if (ret) goto out_nospc; @@ -1352,17 +1373,16 @@ out_nospc: goto out; } -int btrfs_write_out_cache(struct btrfs_root *root, +int btrfs_write_out_cache(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path) { + struct btrfs_root *root = fs_info->tree_root; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct inode *inode; int ret = 0; - root = root->fs_info->tree_root; - spin_lock(&block_group->lock); if (block_group->disk_cache_state < BTRFS_DC_SETUP) { spin_unlock(&block_group->lock); @@ -1379,9 +1399,9 @@ int btrfs_write_out_cache(struct btrfs_root *root, path, block_group->key.objectid); if (ret) { #ifdef DEBUG - btrfs_err(root->fs_info, - "failed to write free space cache for block group %llu", - block_group->key.objectid); + btrfs_err(fs_info, + "failed to write free space cache for block group %llu", + block_group->key.objectid); #endif spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_ERROR; @@ -1968,11 +1988,11 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info) { struct btrfs_block_group_cache *block_group = ctl->private; + struct btrfs_fs_info *fs_info = block_group->fs_info; bool forced = false; #ifdef CONFIG_BTRFS_DEBUG - if (btrfs_should_fragment_free_space(block_group->fs_info->extent_root, - block_group)) + if (btrfs_should_fragment_free_space(block_group)) forced = true; #endif @@ -1988,7 +2008,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, * of cache left then go ahead an dadd them, no sense in adding * the overhead of a bitmap if we don't have to. */ - if (info->bytes <= block_group->sectorsize * 4) { + if (info->bytes <= fs_info->sectorsize * 4) { if (ctl->free_extents * 2 <= ctl->extents_thresh) return false; } else { @@ -2447,6 +2467,7 @@ out: void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, u64 bytes) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *info; struct rb_node *n; @@ -2456,23 +2477,23 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, info = rb_entry(n, struct btrfs_free_space, offset_index); if (info->bytes >= bytes && !block_group->ro) count++; - btrfs_crit(block_group->fs_info, - "entry offset %llu, bytes %llu, bitmap %s", + btrfs_crit(fs_info, "entry offset %llu, bytes %llu, bitmap %s", info->offset, info->bytes, (info->bitmap) ? "yes" : "no"); } - btrfs_info(block_group->fs_info, "block group has cluster?: %s", + btrfs_info(fs_info, "block group has cluster?: %s", list_empty(&block_group->cluster_list) ? 
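use_bitmap() above (only its sector-size source changes here) decides when free space should be tracked as a bitmap instead of extent entries: ranges of at most four sectors stay as extent entries while less than half of the entry threshold is in use. A condensed restatement of the branch visible in the hunk; the surrounding threshold check is paraphrased from context, not quoted:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Small free ranges (<= 4 sectors) stay as extent entries unless half
 * the threshold is already consumed; larger ranges stay as extent
 * entries while we are under the threshold at all.
 */
static bool use_bitmap(uint64_t bytes, uint64_t sectorsize,
                       int free_extents, int extents_thresh)
{
        if (free_extents < extents_thresh) {
                if (bytes <= sectorsize * 4) {
                        if (free_extents * 2 <= extents_thresh)
                                return false;
                } else {
                        return false;
                }
        }
        return true;
}

int main(void)
{
        /* 8 KiB free chunk, 100 of 256 entry slots used: keep an extent. */
        printf("%d\n", use_bitmap(8192, 4096, 100, 256)); /* 0 */
        /* 8 KiB free chunk, 200 of 256 slots used: switch to a bitmap. */
        printf("%d\n", use_bitmap(8192, 4096, 200, 256)); /* 1 */
        return 0;
}
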
"no" : "yes"); - btrfs_info(block_group->fs_info, + btrfs_info(fs_info, "%d blocks of free space at or bigger than bytes is", count); } void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; spin_lock_init(&ctl->tree_lock); - ctl->unit = block_group->sectorsize; + ctl->unit = fs_info->sectorsize; ctl->start = block_group->key.objectid; ctl->private = block_group; ctl->op = &free_space_op; @@ -3011,7 +3032,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group, * returns zero and sets up cluster if things worked out, otherwise * it returns -enospc */ -int btrfs_find_space_cluster(struct btrfs_root *root, +int btrfs_find_space_cluster(struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, u64 empty_size) @@ -3029,14 +3050,14 @@ int btrfs_find_space_cluster(struct btrfs_root *root, * For metadata, allow allocates with smaller extents. For * data, keep it dense. */ - if (btrfs_test_opt(root->fs_info, SSD_SPREAD)) { + if (btrfs_test_opt(fs_info, SSD_SPREAD)) { cont1_bytes = min_bytes = bytes + empty_size; } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { cont1_bytes = bytes; - min_bytes = block_group->sectorsize; + min_bytes = fs_info->sectorsize; } else { cont1_bytes = max(bytes, (bytes + empty_size) >> 2); - min_bytes = block_group->sectorsize; + min_bytes = fs_info->sectorsize; } spin_lock(&ctl->tree_lock); @@ -3124,8 +3145,7 @@ static int do_trimming(struct btrfs_block_group_cache *block_group, spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); - ret = btrfs_discard_extent(fs_info->extent_root, - start, bytes, &trimmed); + ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed); if (!ret) *total_trimmed += trimmed; @@ -3321,6 +3341,7 @@ void btrfs_get_block_group_trimming(struct btrfs_block_group_cache *cache) void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct extent_map_tree *em_tree; struct extent_map *em; bool cleanup; @@ -3331,8 +3352,8 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group) spin_unlock(&block_group->lock); if (cleanup) { - lock_chunks(block_group->fs_info->chunk_root); - em_tree = &block_group->fs_info->mapping_tree.map_tree; + mutex_lock(&fs_info->chunk_mutex); + em_tree = &fs_info->mapping_tree.map_tree; write_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, block_group->key.objectid, 1); @@ -3343,7 +3364,7 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group) */ remove_extent_mapping(em_tree, em); write_unlock(&em_tree->lock); - unlock_chunks(block_group->fs_info->chunk_root); + mutex_unlock(&fs_info->chunk_mutex); /* once for us and once for the tree */ free_extent_map(em); @@ -3473,7 +3494,7 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root) int ret = 0; u64 root_gen = btrfs_root_generation(&root->root_item); - if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) + if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE)) return 0; /* @@ -3512,12 +3533,13 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, struct btrfs_path *path, struct inode *inode) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; int ret; struct btrfs_io_ctl io_ctl; bool release_metadata = 
true; - if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) + if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE)) return 0; memset(&io_ctl, 0, sizeof(io_ctl)); @@ -3531,16 +3553,16 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, * with or without an error. */ release_metadata = false; - ret = btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0); + ret = btrfs_wait_cache_io_root(root, trans, &io_ctl, path); } if (ret) { if (release_metadata) btrfs_delalloc_release_metadata(inode, inode->i_size); #ifdef DEBUG - btrfs_err(root->fs_info, - "failed to write free ino cache for root %llu", - root->root_key.objectid); + btrfs_err(fs_info, + "failed to write free ino cache for root %llu", + root->root_key.objectid); #endif } diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 363fdd955e5d..6f3c025a2c6c 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -59,7 +59,7 @@ int create_free_space_inode(struct btrfs_root *root, struct btrfs_block_group_cache *block_group, struct btrfs_path *path); -int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, +int btrfs_check_trunc_cache_free_space(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *rsv); int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans, @@ -67,12 +67,10 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct inode *inode); int load_free_space_cache(struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group); -int btrfs_wait_cache_io(struct btrfs_root *root, - struct btrfs_trans_handle *trans, +int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, - struct btrfs_io_ctl *io_ctl, - struct btrfs_path *path, u64 offset); -int btrfs_write_out_cache(struct btrfs_root *root, + struct btrfs_path *path); +int btrfs_write_out_cache(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path); @@ -111,7 +109,7 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root); void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, u64 bytes); -int btrfs_find_space_cluster(struct btrfs_root *root, +int btrfs_find_space_cluster(struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group, struct btrfs_free_cluster *cluster, u64 offset, u64 bytes, u64 empty_size); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 57401b474ec6..ff0c55337c2e 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -39,7 +39,7 @@ void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache) * We convert to bitmaps when the disk space required for using extents * exceeds that required for using bitmaps. 
*/ - bitmap_range = cache->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS; + bitmap_range = cache->fs_info->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS; num_bitmaps = div_u64(cache->key.offset + bitmap_range - 1, bitmap_range); bitmap_size = sizeof(struct btrfs_item) + BTRFS_FREE_SPACE_BITMAP_SIZE; @@ -189,7 +189,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, int ret; bitmap_size = free_space_bitmap_size(block_group->key.offset, - block_group->sectorsize); + fs_info->sectorsize); bitmap = alloc_bitmap(bitmap_size); if (!bitmap) { ret = -ENOMEM; @@ -227,9 +227,9 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, ASSERT(found_key.objectid + found_key.offset <= end); first = div_u64(found_key.objectid - start, - block_group->sectorsize); + fs_info->sectorsize); last = div_u64(found_key.objectid + found_key.offset - start, - block_group->sectorsize); + fs_info->sectorsize); le_bitmap_set(bitmap, first, last - first); extent_count++; @@ -270,7 +270,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, } bitmap_cursor = bitmap; - bitmap_range = block_group->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS; + bitmap_range = fs_info->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS; i = start; while (i < end) { unsigned long ptr; @@ -279,7 +279,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, extent_size = min(end - i, bitmap_range); data_size = free_space_bitmap_size(extent_size, - block_group->sectorsize); + fs_info->sectorsize); key.objectid = i; key.type = BTRFS_FREE_SPACE_BITMAP_KEY; @@ -330,7 +330,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, int ret; bitmap_size = free_space_bitmap_size(block_group->key.offset, - block_group->sectorsize); + fs_info->sectorsize); bitmap = alloc_bitmap(bitmap_size); if (!bitmap) { ret = -ENOMEM; @@ -370,11 +370,11 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, ASSERT(found_key.objectid + found_key.offset <= end); bitmap_pos = div_u64(found_key.objectid - start, - block_group->sectorsize * + fs_info->sectorsize * BITS_PER_BYTE); bitmap_cursor = bitmap + bitmap_pos; data_size = free_space_bitmap_size(found_key.offset, - block_group->sectorsize); + fs_info->sectorsize); ptr = btrfs_item_ptr_offset(leaf, path->slots[0] - 1); read_extent_buffer(leaf, bitmap_cursor, ptr, @@ -425,7 +425,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, extent_count++; } prev_bit = bit; - offset += block_group->sectorsize; + offset += fs_info->sectorsize; bitnr++; } if (prev_bit == 1) { @@ -517,7 +517,8 @@ int free_space_test_bit(struct btrfs_block_group_cache *block_group, ASSERT(offset >= found_start && offset < found_end); ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); - i = div_u64(offset - found_start, block_group->sectorsize); + i = div_u64(offset - found_start, + block_group->fs_info->sectorsize); return !!extent_buffer_test_bit(leaf, ptr, i); } @@ -525,6 +526,7 @@ static void free_space_set_bits(struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 *start, u64 *size, int bit) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct extent_buffer *leaf; struct btrfs_key key; u64 end = *start + *size; @@ -544,8 +546,8 @@ static void free_space_set_bits(struct btrfs_block_group_cache *block_group, end = found_end; ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); - first = div_u64(*start - found_start, block_group->sectorsize); - last = div_u64(end - found_start, block_group->sectorsize); + first = 
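set_free_space_tree_thresholds() above sizes the free-space-tree bitmaps from the sector size: one bitmap item describes sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS bytes, and a block group needs a ceiling divide's worth of items. Worked numbers, assuming 4 KiB sectors and a 256-byte bitmap payload (the constant is recalled from free-space-tree.h, not shown in this hunk):

#include <stdint.h>
#include <stdio.h>

#define BITS_PER_BYTE 8
#define BITMAP_SIZE   256   /* assumed bitmap payload per item, bytes */
#define BITMAP_BITS   (BITMAP_SIZE * BITS_PER_BYTE)

int main(void)
{
        uint64_t sectorsize = 4096;
        uint64_t bg_size    = 1024ull << 20;    /* 1 GiB block group */

        /* Bytes of disk space one bitmap item can describe. */
        uint64_t bitmap_range = sectorsize * BITMAP_BITS;

        /* Ceiling divide, as div_u64(offset + bitmap_range - 1, ...). */
        uint64_t num_bitmaps = (bg_size + bitmap_range - 1) / bitmap_range;

        printf("one bitmap covers %llu MiB; a 1 GiB group needs %llu\n",
               (unsigned long long)(bitmap_range >> 20),
               (unsigned long long)num_bitmaps); /* 8 MiB, 128 bitmaps */
        return 0;
}
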
div_u64(*start - found_start, fs_info->sectorsize); + last = div_u64(end - found_start, fs_info->sectorsize); if (bit) extent_buffer_bitmap_set(leaf, ptr, first, last - first); else @@ -606,7 +608,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, * that block is within the block group. */ if (start > block_group->key.objectid) { - u64 prev_block = start - block_group->sectorsize; + u64 prev_block = start - block_group->fs_info->sectorsize; key.objectid = prev_block; key.type = (u8)-1; @@ -1121,7 +1123,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, } start = key.objectid; if (key.type == BTRFS_METADATA_ITEM_KEY) - start += fs_info->tree_root->nodesize; + start += fs_info->nodesize; else start += key.offset; } else if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { @@ -1187,7 +1189,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID); clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); - ret = btrfs_commit_transaction(trans, tree_root); + ret = btrfs_commit_transaction(trans); if (ret) return ret; @@ -1196,7 +1198,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) abort: clear_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags); btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans, tree_root); + btrfs_end_transaction(trans); return ret; } @@ -1267,7 +1269,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) list_del(&free_space_root->dirty_list); btrfs_tree_lock(free_space_root->node); - clean_tree_block(trans, tree_root->fs_info, free_space_root->node); + clean_tree_block(trans, fs_info, free_space_root->node); btrfs_tree_unlock(free_space_root->node); btrfs_free_tree_block(trans, free_space_root, free_space_root->node, 0, 1); @@ -1276,7 +1278,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) free_extent_buffer(free_space_root->commit_root); kfree(free_space_root); - ret = btrfs_commit_transaction(trans, tree_root); + ret = btrfs_commit_transaction(trans); if (ret) return ret; @@ -1284,7 +1286,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) abort: btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans, tree_root); + btrfs_end_transaction(trans); return ret; } @@ -1473,7 +1475,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, extent_count++; } prev_bit = bit; - offset += block_group->sectorsize; + offset += fs_info->sectorsize; } } if (prev_bit == 1) { diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index b8acc07ac6c2..39c968f80157 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -182,7 +182,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, memmove_extent_buffer(leaf, ptr, ptr + del_len, item_size - (ptr + del_len - item_start)); - btrfs_truncate_item(root, path, item_size - del_len, 1); + btrfs_truncate_item(root->fs_info, path, item_size - del_len, 1); out: btrfs_free_path(path); @@ -245,7 +245,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_size - (ptr + sub_item_len - item_start)); - btrfs_truncate_item(root, path, item_size - sub_item_len, 1); + btrfs_truncate_item(root->fs_info, path, item_size - sub_item_len, 1); out: btrfs_free_path(path); @@ -297,7 +297,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, name, name_len, 
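free_space_set_bits() above maps a byte range onto bit positions inside one bitmap item: clamp the range to the item, then divide both distances from the item start by the sector size. A small sketch of that mapping, again assuming 4 KiB sectors:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t sectorsize  = 4096;
        uint64_t found_start = 1 << 20;      /* item covers from 1 MiB */
        uint64_t found_end   = found_start + (128 << 10);

        /* Mark [1 MiB + 8 KiB, 1 MiB + 24 KiB) with this item. */
        uint64_t start = found_start + 8192;
        uint64_t end   = found_start + 24576;

        if (end > found_end)
                end = found_end;             /* clamp to the item, as above */

        uint64_t first = (start - found_start) / sectorsize;
        uint64_t last  = (end - found_start) / sectorsize;

        /* extent_buffer_bitmap_set(leaf, ptr, first, last - first) analogue: */
        printf("set bits [%llu, %llu): %llu bits\n",
               (unsigned long long)first, (unsigned long long)last,
               (unsigned long long)(last - first)); /* [2, 6): 4 bits */
        return 0;
}
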
NULL)) goto out; - btrfs_extend_item(root, path, ins_len); + btrfs_extend_item(root->fs_info, path, ins_len); ret = 0; } if (ret < 0) @@ -328,6 +328,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, const char *name, int name_len, u64 inode_objectid, u64 ref_objectid, u64 index) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct btrfs_key key; struct btrfs_inode_ref *ref; @@ -354,7 +355,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, goto out; old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); - btrfs_extend_item(root, path, ins_len); + btrfs_extend_item(fs_info, path, ins_len); ref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); @@ -384,7 +385,7 @@ out: btrfs_free_path(path); if (ret == -EMLINK) { - struct btrfs_super_block *disk_super = root->fs_info->super_copy; + struct btrfs_super_block *disk_super = fs_info->super_copy; /* We ran out of space in the ref array. Need to * add an extended ref. */ if (btrfs_super_incompat_flags(disk_super) diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index d27014b8bf72..144b119ff43f 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -38,7 +38,7 @@ static int caching_kthread(void *data) int slot; int ret; - if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) + if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE)) return 0; path = btrfs_alloc_path(); @@ -180,7 +180,7 @@ static void start_caching(struct btrfs_root *root) if (IS_ERR(tsk)) { btrfs_warn(fs_info, "failed to start inode caching task"); btrfs_clear_pending_and_info(fs_info, INODE_MAP_CACHE, - "disabling inode map caching"); + "disabling inode map caching"); } } @@ -395,6 +395,7 @@ void btrfs_init_free_ino_ctl(struct btrfs_root *root) int btrfs_save_ino_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; struct btrfs_path *path; struct inode *inode; @@ -415,7 +416,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root, if (btrfs_root_refs(&root->root_item) == 0) return 0; - if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE)) + if (!btrfs_test_opt(fs_info, INODE_MAP_CACHE)) return 0; path = btrfs_alloc_path(); @@ -423,7 +424,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root, return -ENOMEM; rsv = trans->block_rsv; - trans->block_rsv = &root->fs_info->trans_block_rsv; + trans->block_rsv = &fs_info->trans_block_rsv; num_bytes = trans->bytes_reserved; /* @@ -433,14 +434,14 @@ int btrfs_save_ino_cache(struct btrfs_root *root, * 1 item for free space object * 3 items for pre-allocation */ - trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 10); + trans->bytes_reserved = btrfs_calc_trans_metadata_size(fs_info, 10); ret = btrfs_block_rsv_add(root, trans->block_rsv, trans->bytes_reserved, BTRFS_RESERVE_NO_FLUSH); if (ret) goto out; - trace_btrfs_space_reservation(root->fs_info, "ino_cache", - trans->transid, trans->bytes_reserved, 1); + trace_btrfs_space_reservation(fs_info, "ino_cache", trans->transid, + trans->bytes_reserved, 1); again: inode = lookup_free_ino_inode(root, path); if (IS_ERR(inode) && (PTR_ERR(inode) != -ENOENT || retry)) { @@ -506,9 +507,10 @@ again: out_put: iput(inode); out_release: - trace_btrfs_space_reservation(root->fs_info, "ino_cache", - trans->transid, trans->bytes_reserved, 0); - btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved); + 
trace_btrfs_space_reservation(fs_info, "ino_cache", trans->transid, + trans->bytes_reserved, 0); + btrfs_block_rsv_release(fs_info, trans->block_rsv, + trans->bytes_reserved); out: trans->block_rsv = rsv; trans->bytes_reserved = num_bytes; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8e3a5a266917..1e861a063721 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -30,7 +30,6 @@ #include <linux/mpage.h> #include <linux/swap.h> #include <linux/writeback.h> -#include <linux/statfs.h> #include <linux/compat.h> #include <linux/bit_spinlock.h> #include <linux/xattr.h> @@ -250,11 +249,12 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, int compress_type, struct page **compressed_pages) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; u64 isize = i_size_read(inode); u64 actual_end = min(end + 1, isize); u64 inline_len = actual_end - start; - u64 aligned_end = ALIGN(end, root->sectorsize); + u64 aligned_end = ALIGN(end, fs_info->sectorsize); u64 data_len = inline_len; int ret; struct btrfs_path *path; @@ -265,12 +265,12 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, data_len = compressed_size; if (start > 0 || - actual_end > root->sectorsize || - data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) || + actual_end > fs_info->sectorsize || + data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) || (!compressed_size && - (actual_end & (root->sectorsize - 1)) == 0) || + (actual_end & (fs_info->sectorsize - 1)) == 0) || end + 1 < isize || - data_len > root->fs_info->max_inline) { + data_len > fs_info->max_inline) { return 1; } @@ -283,7 +283,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, btrfs_free_path(path); return PTR_ERR(trans); } - trans->block_rsv = &root->fs_info->delalloc_block_rsv; + trans->block_rsv = &fs_info->delalloc_block_rsv; if (compressed_size && compressed_pages) extent_item_size = btrfs_file_extent_calc_inline_size( @@ -326,7 +326,7 @@ out: */ btrfs_qgroup_free_data(inode, 0, PAGE_SIZE); btrfs_free_path(path); - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); return ret; } @@ -373,15 +373,15 @@ static noinline int add_async_extent(struct async_cow *cow, static inline int inode_need_compress(struct inode *inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); /* force compress */ - if (btrfs_test_opt(root->fs_info, FORCE_COMPRESS)) + if (btrfs_test_opt(fs_info, FORCE_COMPRESS)) return 1; /* bad compression ratios */ if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) return 0; - if (btrfs_test_opt(root->fs_info, COMPRESS) || + if (btrfs_test_opt(fs_info, COMPRESS) || BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS || BTRFS_I(inode)->force_compress) return 1; @@ -411,9 +411,10 @@ static noinline void compress_file_range(struct inode *inode, struct async_cow *async_cow, int *num_added) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; u64 num_bytes; - u64 blocksize = root->sectorsize; + u64 blocksize = fs_info->sectorsize; u64 actual_end; u64 isize = i_size_read(inode); int ret = 0; @@ -426,7 +427,7 @@ static noinline void compress_file_range(struct inode *inode, unsigned long max_uncompressed = SZ_128K; int i; int will_compress; - int compress_type = root->fs_info->compress_type; + int compress_type = fs_info->compress_type; int redirty = 0; /* if this is a small write inside eof, kick off a defrag */ @@ -625,7 +626,7 @@ cont: nr_pages_ret = 0; /* flag 
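cow_file_range_inline() above opens with an eligibility gate, and the hunk only swaps root->sectorsize for fs_info->sectorsize inside it: inline extents are limited to small, whole files that start at offset 0. A restatement of the gate as a predicate; max_inline_item stands in for BTRFS_MAX_INLINE_DATA_SIZE(fs_info) and the concrete limits in main() are example values, not the kernel's:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* true means the range can NOT be stored as an inline extent. */
static bool cannot_inline(uint64_t start, uint64_t actual_end,
                          uint64_t data_len, uint64_t compressed_size,
                          uint64_t end, uint64_t isize,
                          uint64_t sectorsize, uint64_t max_inline_item,
                          uint64_t max_inline)
{
        return start > 0 ||
               actual_end > sectorsize ||
               data_len > max_inline_item ||
               (!compressed_size &&
                (actual_end & (sectorsize - 1)) == 0) ||
               end + 1 < isize ||
               data_len > max_inline;
}

int main(void)
{
        /* A 100-byte file starting at offset 0 may be inlined... */
        printf("%d\n", cannot_inline(0, 100, 100, 0, 99, 100,
                                     4096, 4000, 2048));    /* 0 */
        /* ...but a range that starts past offset 0 never is. */
        printf("%d\n", cannot_inline(4096, 4196, 100, 0, 4195, 4196,
                                     4096, 4000, 2048));    /* 1 */
        return 0;
}
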
the file so we don't compress in the future */ - if (!btrfs_test_opt(root->fs_info, FORCE_COMPRESS) && + if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !(BTRFS_I(inode)->force_compress)) { BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; } @@ -683,6 +684,7 @@ static void free_async_extent_pages(struct async_extent *async_extent) static noinline void submit_compressed_extents(struct inode *inode, struct async_cow *async_cow) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct async_extent *async_extent; u64 alloc_hint = 0; struct btrfs_key ins; @@ -795,7 +797,7 @@ retry: em->block_len = ins.offset; em->orig_block_len = ins.offset; em->ram_bytes = async_extent->ram_size; - em->bdev = root->fs_info->fs_devices->latest_bdev; + em->bdev = fs_info->fs_devices->latest_bdev; em->compress_type = async_extent->compress_type; set_bit(EXTENT_FLAG_PINNED, &em->flags); set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); @@ -830,7 +832,7 @@ retry: async_extent->ram_size - 1, 0); goto out_free_reserve; } - btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); + btrfs_dec_block_group_reservations(fs_info, ins.objectid); /* * clear dirty, set writeback and unlock the pages. @@ -871,8 +873,8 @@ retry: } return; out_free_reserve: - btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); - btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_free: extent_clear_unlock_delalloc(inode, async_extent->start, async_extent->start + @@ -940,13 +942,14 @@ static noinline int cow_file_range(struct inode *inode, int *page_started, unsigned long *nr_written, int unlock, struct btrfs_dedupe_hash *hash) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; u64 alloc_hint = 0; u64 num_bytes; unsigned long ram_size; u64 disk_num_bytes; u64 cur_alloc_size; - u64 blocksize = root->sectorsize; + u64 blocksize = fs_info->sectorsize; struct btrfs_key ins; struct extent_map *em; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; @@ -990,7 +993,7 @@ static noinline int cow_file_range(struct inode *inode, } BUG_ON(disk_num_bytes > - btrfs_super_total_bytes(root->fs_info->super_copy)); + btrfs_super_total_bytes(fs_info->super_copy)); alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); @@ -1000,7 +1003,7 @@ static noinline int cow_file_range(struct inode *inode, cur_alloc_size = disk_num_bytes; ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, - root->sectorsize, 0, alloc_hint, + fs_info->sectorsize, 0, alloc_hint, &ins, 1, 1); if (ret < 0) goto out_unlock; @@ -1021,7 +1024,7 @@ static noinline int cow_file_range(struct inode *inode, em->block_len = ins.offset; em->orig_block_len = ins.offset; em->ram_bytes = ram_size; - em->bdev = root->fs_info->fs_devices->latest_bdev; + em->bdev = fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); em->generation = -1; @@ -1053,7 +1056,7 @@ static noinline int cow_file_range(struct inode *inode, goto out_drop_extent_cache; } - btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); + btrfs_dec_block_group_reservations(fs_info, ins.objectid); if (disk_num_bytes < cur_alloc_size) break; @@ -1084,8 +1087,8 @@ out: out_drop_extent_cache: btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); out_reserve: - 
btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); - btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_unlock: extent_clear_unlock_delalloc(inode, start, end, delalloc_end, locked_page, @@ -1119,6 +1122,7 @@ static noinline void async_cow_start(struct btrfs_work *work) */ static noinline void async_cow_submit(struct btrfs_work *work) { + struct btrfs_fs_info *fs_info; struct async_cow *async_cow; struct btrfs_root *root; unsigned long nr_pages; @@ -1126,16 +1130,17 @@ static noinline void async_cow_submit(struct btrfs_work *work) async_cow = container_of(work, struct async_cow, work); root = async_cow->root; + fs_info = root->fs_info; nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >> PAGE_SHIFT; /* * atomic_sub_return implies a barrier for waitqueue_active */ - if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) < + if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) < 5 * SZ_1M && - waitqueue_active(&root->fs_info->async_submit_wait)) - wake_up(&root->fs_info->async_submit_wait); + waitqueue_active(&fs_info->async_submit_wait)) + wake_up(&fs_info->async_submit_wait); if (async_cow->inode) submit_compressed_extents(async_cow->inode, async_cow); @@ -1154,6 +1159,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, unsigned long *nr_written) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct async_cow *async_cow; struct btrfs_root *root = BTRFS_I(inode)->root; unsigned long nr_pages; @@ -1171,7 +1177,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->start = start; if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS && - !btrfs_test_opt(root->fs_info, FORCE_COMPRESS)) + !btrfs_test_opt(fs_info, FORCE_COMPRESS)) cur_end = end; else cur_end = min(end, start + SZ_512K - 1); @@ -1186,22 +1192,21 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, nr_pages = (cur_end - start + PAGE_SIZE) >> PAGE_SHIFT; - atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); + atomic_add(nr_pages, &fs_info->async_delalloc_pages); - btrfs_queue_work(root->fs_info->delalloc_workers, - &async_cow->work); + btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work); - if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) { - wait_event(root->fs_info->async_submit_wait, - (atomic_read(&root->fs_info->async_delalloc_pages) < - limit)); + if (atomic_read(&fs_info->async_delalloc_pages) > limit) { + wait_event(fs_info->async_submit_wait, + (atomic_read(&fs_info->async_delalloc_pages) < + limit)); } - while (atomic_read(&root->fs_info->async_submit_draining) && - atomic_read(&root->fs_info->async_delalloc_pages)) { - wait_event(root->fs_info->async_submit_wait, - (atomic_read(&root->fs_info->async_delalloc_pages) == - 0)); + while (atomic_read(&fs_info->async_submit_draining) && + atomic_read(&fs_info->async_delalloc_pages)) { + wait_event(fs_info->async_submit_wait, + (atomic_read(&fs_info->async_delalloc_pages) == + 0)); } *nr_written += nr_pages; @@ -1211,14 +1216,14 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, return 0; } -static noinline int csum_exist_in_range(struct btrfs_root *root, +static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes) { int ret; struct 
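cow_file_range_async() above charges (end - start + PAGE_SIZE) >> PAGE_SHIFT pages to fs_info->async_delalloc_pages and sleeps once the counter passes a limit, while async_cow_submit() subtracts its pages and wakes waiters when the count drops under 5 * SZ_1M, exactly the comparison the hunk shows. A single-threaded sketch of that accounting (the limit value is an assumption, since its definition is outside this hunk, and the waitqueue is elided):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1ull << PAGE_SHIFT)
#define SZ_1M      (1ull << 20)

int main(void)
{
        uint64_t start = 0, end = (512ull << 10) - 1; /* one 512 KiB chunk */
        uint64_t limit = 32768;                       /* assumed example */
        uint64_t async_delalloc_pages = 0;

        /* Page count charged per async_cow chunk, as in the hunk above. */
        uint64_t nr_pages = (end - start + PAGE_SIZE) >> PAGE_SHIFT;

        async_delalloc_pages += nr_pages;             /* atomic_add() analogue */
        if (async_delalloc_pages > limit)
                puts("writer would sleep until the backlog drains");

        /* Completion side (async_cow_submit): subtract, then wake waiters. */
        async_delalloc_pages -= nr_pages;             /* atomic_sub_return() */
        if (async_delalloc_pages < 5 * SZ_1M)         /* same test as the hunk */
                puts("below the watermark: wake throttled writers");

        printf("charged and released %llu pages\n",
               (unsigned long long)nr_pages);
        return 0;
}
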
@@ -1211,14 +1216,14 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 	return 0;
 }
 
-static noinline int csum_exist_in_range(struct btrfs_root *root,
+static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
 					u64 bytenr, u64 num_bytes)
 {
 	int ret;
 	struct btrfs_ordered_sum *sums;
 	LIST_HEAD(list);
 
-	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
+	ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
 				       bytenr + num_bytes - 1, &list, 0);
 	if (ret == 0 && list_empty(&list))
 		return 0;
@@ -1243,6 +1248,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 			      u64 start, u64 end, int *page_started, int force,
 			      unsigned long *nr_written)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 	struct extent_buffer *leaf;
@@ -1298,7 +1304,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 		return PTR_ERR(trans);
 	}
 
-	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+	trans->block_rsv = &fs_info->delalloc_block_rsv;
 
 	cow_start = (u64)-1;
 	cur_offset = start;
@@ -1374,7 +1380,7 @@ next_slot:
 				goto out_check;
 			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
 				goto out_check;
-			if (btrfs_extent_readonly(root, disk_bytenr))
+			if (btrfs_extent_readonly(fs_info, disk_bytenr))
 				goto out_check;
 			if (btrfs_cross_ref_exist(trans, root, ino,
 						  found_key.offset -
@@ -1397,17 +1403,18 @@ next_slot:
 			 * this ensure that csum for a given extent are
 			 * either valid or do not exist.
 			 */
-			if (csum_exist_in_range(root, disk_bytenr, num_bytes))
+			if (csum_exist_in_range(fs_info, disk_bytenr,
+						num_bytes))
 				goto out_check;
-			if (!btrfs_inc_nocow_writers(root->fs_info,
-						     disk_bytenr))
+			if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
 				goto out_check;
 			nocow = 1;
 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 			extent_end = found_key.offset +
 				btrfs_file_extent_inline_len(leaf,
 						     path->slots[0], fi);
-			extent_end = ALIGN(extent_end, root->sectorsize);
+			extent_end = ALIGN(extent_end,
+					   fs_info->sectorsize);
 		} else {
 			BUG_ON(1);
 		}
@@ -1417,8 +1424,7 @@ out_check:
 			if (!nolock && nocow)
 				btrfs_end_write_no_snapshoting(root);
 			if (nocow)
-				btrfs_dec_nocow_writers(root->fs_info,
-							disk_bytenr);
+				btrfs_dec_nocow_writers(fs_info, disk_bytenr);
 			goto next_slot;
 		}
 		if (!nocow) {
@@ -1441,7 +1447,7 @@ out_check:
 			if (!nolock && nocow)
 				btrfs_end_write_no_snapshoting(root);
 			if (nocow)
-				btrfs_dec_nocow_writers(root->fs_info,
+				btrfs_dec_nocow_writers(fs_info,
 							disk_bytenr);
 			goto error;
 		}
@@ -1461,7 +1467,7 @@ out_check:
 			em->block_start = disk_bytenr;
 			em->orig_block_len = disk_num_bytes;
 			em->ram_bytes = ram_bytes;
-			em->bdev = root->fs_info->fs_devices->latest_bdev;
+			em->bdev = fs_info->fs_devices->latest_bdev;
 			em->mod_start = em->start;
 			em->mod_len = em->len;
 			set_bit(EXTENT_FLAG_PINNED, &em->flags);
@@ -1486,7 +1492,7 @@ out_check:
 		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
 					       num_bytes, num_bytes, type);
 		if (nocow)
-			btrfs_dec_nocow_writers(root->fs_info, disk_bytenr);
+			btrfs_dec_nocow_writers(fs_info, disk_bytenr);
 		BUG_ON(ret); /* -ENOMEM */
 
 		if (root->root_key.objectid ==
@@ -1528,7 +1534,7 @@ out_check:
 	}
 
 error:
-	err = btrfs_end_transaction(trans, root);
+	err = btrfs_end_transaction(trans);
 	if (!ret)
 		ret = err;
@@ -1693,6 +1699,8 @@ static void btrfs_merge_extent_hook(struct inode *inode,
 static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
 				      struct inode *inode)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+
 	spin_lock(&root->delalloc_lock);
 	if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
 		list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
@@ -1701,11 +1709,11 @@ static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
 			&BTRFS_I(inode)->runtime_flags);
 		root->nr_delalloc_inodes++;
 		if (root->nr_delalloc_inodes == 1) {
-			spin_lock(&root->fs_info->delalloc_root_lock);
+			spin_lock(&fs_info->delalloc_root_lock);
 			BUG_ON(!list_empty(&root->delalloc_root));
 			list_add_tail(&root->delalloc_root,
-				      &root->fs_info->delalloc_roots);
+				      &fs_info->delalloc_roots);
-			spin_unlock(&root->fs_info->delalloc_root_lock);
+			spin_unlock(&fs_info->delalloc_root_lock);
 		}
 	}
 	spin_unlock(&root->delalloc_lock);
@@ -1714,6 +1722,8 @@ static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
 static void btrfs_del_delalloc_inode(struct btrfs_root *root,
 				     struct inode *inode)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+
 	spin_lock(&root->delalloc_lock);
 	if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
 		list_del_init(&BTRFS_I(inode)->delalloc_inodes);
@@ -1721,10 +1731,10 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root,
 			  &BTRFS_I(inode)->runtime_flags);
 		root->nr_delalloc_inodes--;
 		if (!root->nr_delalloc_inodes) {
-			spin_lock(&root->fs_info->delalloc_root_lock);
+			spin_lock(&fs_info->delalloc_root_lock);
 			BUG_ON(list_empty(&root->delalloc_root));
 			list_del_init(&root->delalloc_root);
-			spin_unlock(&root->fs_info->delalloc_root_lock);
+			spin_unlock(&fs_info->delalloc_root_lock);
 		}
 	}
 	spin_unlock(&root->delalloc_lock);
@@ -1739,6 +1749,8 @@ static void btrfs_set_bit_hook(struct inode *inode,
 			       struct extent_state *state,
 			       unsigned *bits)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+
 	if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
 		WARN_ON(1);
 	/*
@@ -1760,11 +1772,11 @@ static void btrfs_set_bit_hook(struct inode *inode,
 		}
 
 		/* For sanity tests */
-		if (btrfs_is_testing(root->fs_info))
+		if (btrfs_is_testing(fs_info))
 			return;
 
-		__percpu_counter_add(&root->fs_info->delalloc_bytes, len,
-				     root->fs_info->delalloc_batch);
+		__percpu_counter_add(&fs_info->delalloc_bytes, len,
+				     fs_info->delalloc_batch);
 		spin_lock(&BTRFS_I(inode)->lock);
 		BTRFS_I(inode)->delalloc_bytes += len;
 		if (*bits & EXTENT_DEFRAG)
@@ -1783,6 +1795,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
 				 struct extent_state *state,
 				 unsigned *bits)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	u64 len = state->end + 1 - state->start;
 	u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE -1,
 				    BTRFS_MAX_EXTENT_SIZE);
@@ -1815,11 +1828,11 @@ static void btrfs_clear_bit_hook(struct inode *inode,
 		 * error.
 		 */
 		if (*bits & EXTENT_DO_ACCOUNTING &&
-		    root != root->fs_info->tree_root)
+		    root != fs_info->tree_root)
 			btrfs_delalloc_release_metadata(inode, len);
 
 		/* For sanity tests. */
-		if (btrfs_is_testing(root->fs_info))
+		if (btrfs_is_testing(fs_info))
 			return;
 
 		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
@@ -1829,8 +1842,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
 			btrfs_free_reserved_data_space_noquota(inode,
 					state->start, len);
 
-		__percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
-				     root->fs_info->delalloc_batch);
+		__percpu_counter_add(&fs_info->delalloc_bytes, -len,
+				     fs_info->delalloc_batch);
 		spin_lock(&BTRFS_I(inode)->lock);
 		BTRFS_I(inode)->delalloc_bytes -= len;
 		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
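The btrfs_set_bit_hook()/btrfs_clear_bit_hook() hunks above account delalloc bytes through __percpu_counter_add() with fs_info->delalloc_batch as the batch (in later kernels this interface is named percpu_counter_add_batch). The batch argument is the point: small deltas accumulate in a per-CPU slot and only get folded into the shared total once they cross the threshold, keeping the contended cacheline mostly untouched. A rough single-threaded userspace model of just the batching idea, with one local slot standing in for the per-CPU array (hypothetical types, not the kernel's):

#include <stdio.h>

struct batched_counter {
	long long total;	/* shared; expensive to touch */
	long long local;	/* per-CPU in the kernel; one slot here */
	long long batch;	/* fold threshold, like delalloc_batch */
};

static void counter_add(struct batched_counter *c, long long delta)
{
	c->local += delta;
	if (c->local >= c->batch || c->local <= -c->batch) {
		c->total += c->local;	/* kernel: done under a spinlock */
		c->local = 0;
	}
}

int main(void)
{
	struct batched_counter delalloc = { 0, 0, 4096 };
	int i;

	for (i = 0; i < 10; i++)
		counter_add(&delalloc, 512);	/* small deltas stay local */

	/* Prints total=4096 local=1024: only one fold happened. */
	printf("total=%lld local=%lld\n", delalloc.total, delalloc.local);
	return 0;
}

The trade-off is that reading the exact value requires summing every slot, which is why such counters suit statistics like delalloc_bytes where an approximate fast-path read is acceptable.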
@@ -1853,7 +1866,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 			 size_t size, struct bio *bio,
 			 unsigned long bio_flags)
 {
-	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+	struct inode *inode = page->mapping->host;
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
 	u64 length = 0;
 	u64 map_length;
@@ -1864,8 +1878,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 	length = bio->bi_iter.bi_size;
 	map_length = length;
-	ret = btrfs_map_block(root->fs_info, bio_op(bio), logical,
-			      &map_length, NULL, 0);
+	ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
+			      NULL, 0);
 	if (ret < 0)
 		return ret;
 	if (map_length < length + size)
@@ -1885,10 +1899,9 @@ static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
 				    int mirror_num, unsigned long bio_flags,
 				    u64 bio_offset)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
 
-	ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
+	ret = btrfs_csum_one_bio(inode, bio, 0, 0);
 	BUG_ON(ret); /* -ENOMEM */
 	return 0;
 }
@@ -1905,10 +1918,10 @@ static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio,
 				   int mirror_num, unsigned long bio_flags,
 				   u64 bio_offset)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	int ret;
 
-	ret = btrfs_map_bio(root, bio, mirror_num, 1);
+	ret = btrfs_map_bio(fs_info, bio, mirror_num, 1);
 	if (ret) {
 		bio->bi_error = ret;
 		bio_endio(bio);
@@ -1924,6 +1937,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags,
 				 u64 bio_offset)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
 	int ret = 0;
@@ -1936,7 +1950,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
 		metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
 
 	if (bio_op(bio) != REQ_OP_WRITE) {
-		ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
+		ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
 		if (ret)
 			goto out;
 
@@ -1946,7 +1960,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
 						   bio_flags);
 			goto out;
 		} else if (!skip_sum) {
-			ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
+			ret = btrfs_lookup_bio_sums(inode, bio, NULL);
 			if (ret)
 				goto out;
 		}
@@ -1956,20 +1970,19 @@ static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
 		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
 			goto mapit;
 		/* we're doing a write, do the async checksumming */
-		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
-					  inode, bio, mirror_num,
-					  bio_flags, bio_offset,
-					  __btrfs_submit_bio_start,
-					  __btrfs_submit_bio_done);
+		ret = btrfs_wq_submit_bio(fs_info, inode, bio, mirror_num,
+					  bio_flags, bio_offset,
+					  __btrfs_submit_bio_start,
+					  __btrfs_submit_bio_done);
 		goto out;
 	} else if (!skip_sum) {
-		ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
+		ret = btrfs_csum_one_bio(inode, bio, 0, 0);
 		if (ret)
 			goto out;
 	}
 
 mapit:
-	ret = btrfs_map_bio(root, bio, mirror_num, 0);
+	ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
 
 out:
 	if (ret < 0) {
@@ -2090,8 +2103,8 @@ out_page:
 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
 {
 	struct inode *inode = page->mapping->host;
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_writepage_fixup *fixup;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
 
 	/* this page is properly in the ordered list */
 	if (TestClearPagePrivate2(page))
@@ -2109,7 +2122,7 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
 	btrfs_init_work(&fixup->work, btrfs_fixup_helper,
 			btrfs_writepage_fixup_worker, NULL, NULL);
 	fixup->page = page;
-	btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
+	btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
 	return -EBUSY;
 }
 
@@ -2180,10 +2193,9 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	ins.objectid = disk_bytenr;
 	ins.offset = disk_num_bytes;
 	ins.type = BTRFS_EXTENT_ITEM_KEY;
-	ret = btrfs_alloc_reserved_file_extent(trans, root,
-					       root->root_key.objectid,
-					       btrfs_ino(inode), file_pos,
-					       ram_bytes, &ins);
+	ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid,
+					       btrfs_ino(inode), file_pos,
+					       ram_bytes, &ins);
 	/*
 	 * Release the reserved range from inode dirty range map, as it is
 	 * already moved into delayed_ref_head
@@ -2293,7 +2305,6 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
 				       void *ctx)
 {
 	struct btrfs_file_extent_item *extent;
-	struct btrfs_fs_info *fs_info;
 	struct old_sa_defrag_extent *old = ctx;
 	struct new_sa_defrag_extent *new = old->new;
 	struct btrfs_path *path = new->path;
@@ -2302,6 +2313,7 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
 	struct sa_defrag_extent_backref *backref;
 	struct extent_buffer *leaf;
 	struct inode *inode = new->inode;
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	int slot;
 	int ret;
 	u64 extent_offset;
@@ -2315,7 +2327,6 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
 	key.type = BTRFS_ROOT_ITEM_KEY;
 	key.offset = (u64)-1;
 
-	fs_info = BTRFS_I(inode)->root->fs_info;
 	root = btrfs_read_fs_root_no_name(fs_info, &key);
 	if (IS_ERR(root)) {
 		if (PTR_ERR(root) == -ENOENT)
@@ -2413,7 +2424,7 @@ out:
 static noinline bool record_extent_backrefs(struct btrfs_path *path,
 				   struct new_sa_defrag_extent *new)
 {
-	struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
+	struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
 	struct old_sa_defrag_extent *old, *tmp;
 	int ret;
 
@@ -2471,13 +2482,12 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
 	struct btrfs_file_extent_item *item;
 	struct btrfs_ordered_extent *ordered;
 	struct btrfs_trans_handle *trans;
-	struct btrfs_fs_info *fs_info;
 	struct btrfs_root *root;
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
 	struct old_sa_defrag_extent *old = backref->old;
 	struct new_sa_defrag_extent *new = old->new;
-	struct inode *src_inode = new->inode;
+	struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
 	struct inode *inode;
 	struct extent_state *cached = NULL;
 	int ret = 0;
@@ -2498,7 +2508,6 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
 	key.type = BTRFS_ROOT_ITEM_KEY;
 	key.offset = (u64)-1;
 
-	fs_info = BTRFS_I(src_inode)->root->fs_info;
 	index = srcu_read_lock(&fs_info->subvol_srcu);
 
 	root = btrfs_read_fs_root_no_name(fs_info, &key);
@@ -2643,7 +2652,7 @@ again:
 	inode_add_bytes(inode, len);
 	btrfs_release_path(path);
 
-	ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
+	ret = btrfs_inc_extent_ref(trans, fs_info, new->bytenr,
 			new->disk_len, 0,
 			backref->root_id, backref->inum,
 			new->file_pos);	/* start - extent_offset */
@@ -2656,7 +2665,7 @@ again:
 out_free_path:
 	btrfs_release_path(path);
 	path->leave_spinning = 0;
-	btrfs_end_transaction(trans, root);
+	btrfs_end_transaction(trans);
 out_unlock:
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
 			     &cached, GFP_NOFS);
@@ -2679,6 +2688,7 @@ static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
 
 static void relink_file_extents(struct new_sa_defrag_extent *new)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
 	struct btrfs_path *path;
 	struct sa_defrag_extent_backref *backref;
 	struct sa_defrag_extent_backref *prev = NULL;
@@ -2725,14 +2735,15 @@ static void relink_file_extents(struct new_sa_defrag_extent *new)
 out:
 	free_sa_defrag_extent(new);
 
-	atomic_dec(&root->fs_info->defrag_running);
-	wake_up(&root->fs_info->transaction_wait);
+	atomic_dec(&fs_info->defrag_running);
+	wake_up(&fs_info->transaction_wait);
 }
 
 static struct new_sa_defrag_extent *
 record_old_file_extents(struct inode *inode,
 			struct btrfs_ordered_extent *ordered)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_path *path;
 	struct btrfs_key key;
@@ -2831,7 +2842,7 @@ next:
 	}
 
 	btrfs_free_path(path);
-	atomic_inc(&root->fs_info->defrag_running);
+	atomic_inc(&fs_info->defrag_running);
 
 	return new;
 
@@ -2842,12 +2853,12 @@ out_kfree:
 	return NULL;
 }
 
-static void btrfs_release_delalloc_bytes(struct btrfs_root *root,
+static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
 					 u64 start, u64 len)
 {
 	struct btrfs_block_group_cache *cache;
 
-	cache = btrfs_lookup_block_group(root->fs_info, start);
+	cache = btrfs_lookup_block_group(fs_info, start);
 	ASSERT(cache);
 
 	spin_lock(&cache->lock);
@@ -2864,6 +2875,7 @@ static void btrfs_release_delalloc_bytes(struct btrfs_root *root,
 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 {
 	struct inode *inode = ordered_extent->inode;
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans = NULL;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -2914,7 +2926,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 			trans = NULL;
 			goto out;
 		}
-		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+		trans->block_rsv = &fs_info->delalloc_block_rsv;
 		ret = btrfs_update_inode_fallback(trans, root, inode);
 		if (ret) /* -ENOMEM or corruption */
 			btrfs_abort_transaction(trans, ret);
@@ -2949,7 +2961,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 		goto out_unlock;
 	}
 
-	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+	trans->block_rsv = &fs_info->delalloc_block_rsv;
 
 	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
 		compress_type = ordered_extent->compress_type;
@@ -2960,7 +2972,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 						ordered_extent->file_offset +
 						logical_len);
 	} else {
-		BUG_ON(root == root->fs_info->tree_root);
+		BUG_ON(root == fs_info->tree_root);
 		ret = insert_reserved_file_extent(trans, inode,
 						ordered_extent->file_offset,
 						ordered_extent->start,
@@ -2969,7 +2981,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 						compress_type, 0, 0,
 						BTRFS_FILE_EXTENT_REG);
 		if (!ret)
-			btrfs_release_delalloc_bytes(root,
+			btrfs_release_delalloc_bytes(fs_info,
 						     ordered_extent->start,
 						     ordered_extent->disk_len);
 	}
@@ -2996,10 +3008,10 @@ out_unlock:
 			     ordered_extent->file_offset +
 			     ordered_extent->len - 1, &cached_state, GFP_NOFS);
 out:
-	if (root != root->fs_info->tree_root)
+	if (root != fs_info->tree_root)
 		btrfs_delalloc_release_metadata(inode, ordered_extent->len);
 	if (trans)
-		btrfs_end_transaction(trans, root);
+		btrfs_end_transaction(trans);
 
 	if (ret || truncated) {
 		u64 start, end;
@@ -3023,7 +3035,8 @@ out:
 		if ((ret || !logical_len) &&
 		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
 		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
-			btrfs_free_reserved_extent(root, ordered_extent->start,
+			btrfs_free_reserved_extent(fs_info,
+						   ordered_extent->start,
 						   ordered_extent->disk_len, 1);
 	}
 
@@ -3038,7 +3051,7 @@ out:
 	if (new) {
 		if (ret) {
 			free_sa_defrag_extent(new);
-			atomic_dec(&root->fs_info->defrag_running);
+			atomic_dec(&fs_info->defrag_running);
 		} else {
 			relink_file_extents(new);
 		}
@@ -3063,7 +3076,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 				struct extent_state *state, int uptodate)
 {
 	struct inode *inode = page->mapping->host;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_ordered_extent *ordered_extent = NULL;
 	struct btrfs_workqueue *wq;
 	btrfs_work_func_t func;
@@ -3076,10 +3089,10 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 		return 0;
 
 	if (btrfs_is_free_space_inode(inode)) {
-		wq = root->fs_info->endio_freespace_worker;
+		wq = fs_info->endio_freespace_worker;
 		func = btrfs_freespace_write_helper;
 	} else {
-		wq = root->fs_info->endio_write_workers;
+		wq = fs_info->endio_write_workers;
 		func = btrfs_endio_write_helper;
 	}
 
@@ -3103,7 +3116,7 @@ static int __readpage_endio_check(struct inode *inode,
 
 	kaddr = kmap_atomic(page);
 	csum = btrfs_csum_data(kaddr + pgoff, csum,  len);
-	btrfs_csum_final(csum, (char *)&csum);
+	btrfs_csum_final(csum, (u8 *)&csum);
 	if (csum != csum_expected)
 		goto zeroit;
 
@@ -3156,7 +3169,7 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 
 void btrfs_add_delayed_iput(struct inode *inode)
 {
-	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_inode *binode = BTRFS_I(inode);
 
 	if (atomic_add_unless(&inode->i_count, -1, 1))
@@ -3172,9 +3185,8 @@ void btrfs_add_delayed_iput(struct inode *inode)
 	spin_unlock(&fs_info->delayed_iput_lock);
 }
 
-void btrfs_run_delayed_iputs(struct btrfs_root *root)
+void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 
 	spin_lock(&fs_info->delayed_iput_lock);
 	while (!list_empty(&fs_info->delayed_iputs)) {
@@ -3204,6 +3216,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root)
 {
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_block_rsv *block_rsv;
 	int ret;
 
@@ -3228,7 +3241,7 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
 
 	if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
 	    btrfs_root_refs(&root->root_item) > 0) {
-		ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
+		ret = btrfs_del_orphan_item(trans, fs_info->tree_root,
 					    root->root_key.objectid);
 		if (ret)
 			btrfs_abort_transaction(trans, ret);
@@ -3239,7 +3252,7 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
 
 	if (block_rsv) {
 		WARN_ON(block_rsv->size > 0);
-		btrfs_free_block_rsv(root, block_rsv);
+		btrfs_free_block_rsv(fs_info, block_rsv);
 	}
 }
 
@@ -3252,6 +3265,7 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
  */
 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_block_rsv *block_rsv = NULL;
 	int reserve = 0;
@@ -3259,7 +3273,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 	int ret;
 
 	if (!root->orphan_block_rsv) {
-		block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
+		block_rsv = btrfs_alloc_block_rsv(fs_info,
+						  BTRFS_BLOCK_RSV_TEMP);
 		if (!block_rsv)
 			return -ENOMEM;
 	}
@@ -3268,7 +3283,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 	if (!root->orphan_block_rsv) {
 		root->orphan_block_rsv = block_rsv;
 	} else if (block_rsv) {
-		btrfs_free_block_rsv(root, block_rsv);
+		btrfs_free_block_rsv(fs_info, block_rsv);
 		block_rsv = NULL;
 	}
 
@@ -3331,7 +3346,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 
 	/* insert an orphan item to track subvolume contains orphan files */
 	if (insert >= 2) {
-		ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
+		ret = btrfs_insert_orphan_item(trans, fs_info->tree_root,
 					       root->root_key.objectid);
 		if (ret && ret != -EEXIST) {
 			btrfs_abort_transaction(trans, ret);
@@ -3382,6 +3397,7 @@ static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
  */
 int btrfs_orphan_cleanup(struct btrfs_root *root)
 {
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
 	struct btrfs_key key, found_key;
@@ -3441,8 +3457,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 		 */
 
 		if (found_key.offset == last_objectid) {
-			btrfs_err(root->fs_info,
-				  "Error removing orphan entry, stopping orphan cleanup");
+			btrfs_err(fs_info,
+				  "Error removing orphan entry, stopping orphan cleanup");
 			ret = -EINVAL;
 			goto out;
 		}
@@ -3452,12 +3468,12 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 		found_key.objectid = found_key.offset;
 		found_key.type = BTRFS_INODE_ITEM_KEY;
 		found_key.offset = 0;
-		inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
+		inode = btrfs_iget(fs_info->sb, &found_key, root, NULL);
 		ret = PTR_ERR_OR_ZERO(inode);
 		if (ret && ret != -ENOENT)
 			goto out;
 
-		if (ret == -ENOENT && root == root->fs_info->tree_root) {
+		if (ret == -ENOENT && root == fs_info->tree_root) {
 			struct btrfs_root *dead_root;
 			struct btrfs_fs_info *fs_info = root->fs_info;
 			int is_dead_root = 0;
@@ -3499,11 +3515,11 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 				ret = PTR_ERR(trans);
 				goto out;
 			}
-			btrfs_debug(root->fs_info, "auto deleting %Lu",
-				found_key.objectid);
+			btrfs_debug(fs_info, "auto deleting %Lu",
+				    found_key.objectid);
 			ret = btrfs_del_orphan_item(trans, root,
 						    found_key.objectid);
-			btrfs_end_transaction(trans, root);
+			btrfs_end_transaction(trans);
 			if (ret)
 				goto out;
 			continue;
@@ -3533,7 +3549,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 				goto out;
 			}
 			ret = btrfs_orphan_add(trans, inode);
-			btrfs_end_transaction(trans, root);
+			btrfs_end_transaction(trans);
 			if (ret) {
 				iput(inode);
 				goto out;
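A detail worth calling out from the btrfs_add_delayed_iput() hunk further up: it drops a reference with atomic_add_unless(&inode->i_count, -1, 1), i.e. "subtract one unless this is the last reference", and only when it would have been the last reference does it queue the inode on fs_info->delayed_iputs for btrfs_run_delayed_iputs() to finish later, outside the I/O completion context. A single-threaded userspace model of that decision (plain ints instead of atomics, and all scaffolding here is hypothetical):

#include <stdio.h>

/* Mirrors atomic_add_unless(v, a, u): add a unless *v equals u;
 * returns nonzero when the add happened. */
static int add_unless(int *v, int a, int u)
{
	if (*v == u)
		return 0;
	*v += a;
	return 1;
}

struct inode { int i_count; const char *name; };

static struct inode *deferred[16];
static int nr_deferred;

static void add_delayed_iput(struct inode *inode)
{
	/* Drop one reference now unless it is the last one... */
	if (add_unless(&inode->i_count, -1, 1))
		return;
	/* ...otherwise defer the final put to a list that a safer
	 * context drains later. */
	deferred[nr_deferred++] = inode;
}

int main(void)
{
	struct inode a = { 2, "busy" }, b = { 1, "last-ref" };

	add_delayed_iput(&a);	/* drops to 1 immediately */
	add_delayed_iput(&b);	/* last reference: deferred */
	printf("a.i_count=%d, deferred=%d (%s)\n",
	       a.i_count, nr_deferred, deferred[0]->name);
	return 0;
}

This also explains why btrfs_run_delayed_iputs() could switch from taking a root to taking the fs_info directly in this series: the deferred list has always been filesystem-global state.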
@@ -3557,25 +3573,24 @@
 	root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
 
 	if (root->orphan_block_rsv)
-		btrfs_block_rsv_release(root, root->orphan_block_rsv,
+		btrfs_block_rsv_release(fs_info, root->orphan_block_rsv,
 					(u64)-1);
 
 	if (root->orphan_block_rsv ||
 	    test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
 		trans = btrfs_join_transaction(root);
 		if (!IS_ERR(trans))
-			btrfs_end_transaction(trans, root);
+			btrfs_end_transaction(trans);
 	}
 
 	if (nr_unlink)
-		btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink);
+		btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
 	if (nr_truncate)
-		btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate);
+		btrfs_debug(fs_info, "truncated %d orphans", nr_truncate);
 
 out:
 	if (ret)
-		btrfs_err(root->fs_info,
-			  "could not do orphan cleanup %d", ret);
+		btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -3654,6 +3669,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
  */
 static int btrfs_read_locked_inode(struct inode *inode)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
 	struct btrfs_inode_item *inode_item;
@@ -3734,7 +3750,7 @@ cache_index:
 	 * This is required for both inode re-read from disk and delayed inode
	 * in delayed_nodes_tree.
	 */
-	if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
+	if (BTRFS_I(inode)->last_trans == fs_info->generation)
 		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
 			&BTRFS_I(inode)->runtime_flags);
 
@@ -3800,7 +3816,7 @@ cache_acl:
 		path->slots[0] = first_xattr_slot;
 		ret = btrfs_load_inode_props(inode, path);
 		if (ret)
-			btrfs_err(root->fs_info,
+			btrfs_err(fs_info,
 				  "error loading props for ino %llu (root %llu): %d",
 				  btrfs_ino(inode),
 				  root->root_key.objectid, ret);
@@ -3819,10 +3835,7 @@ cache_acl:
 		break;
 	case S_IFDIR:
 		inode->i_fop = &btrfs_dir_file_operations;
-		if (root == root->fs_info->tree_root)
-			inode->i_op = &btrfs_dir_ro_inode_operations;
-		else
-			inode->i_op = &btrfs_dir_inode_operations;
+		inode->i_op = &btrfs_dir_inode_operations;
 		break;
 	case S_IFLNK:
 		inode->i_op = &btrfs_symlink_inode_operations;
@@ -3937,6 +3950,7 @@ failed:
 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root, struct inode *inode)
 {
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret;
 
 	/*
@@ -3948,7 +3962,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
 	 */
 	if (!btrfs_is_free_space_inode(inode)
 	    && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
-	    && !test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) {
+	    && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
 		btrfs_update_root_times(trans, root);
 
 		ret = btrfs_delayed_update_inode(trans, root, inode);
@@ -3982,6 +3996,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 				struct inode *dir, struct inode *inode,
 				const char *name, int name_len)
 {
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_path *path;
 	int ret = 0;
 	struct extent_buffer *leaf;
@@ -4036,14 +4051,14 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 	ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
 				  dir_ino, &index);
 	if (ret) {
-		btrfs_info(root->fs_info,
+		btrfs_info(fs_info,
 			"failed to delete reference to %.*s, inode %llu parent %llu",
 			name_len, name, ino, dir_ino);
 		btrfs_abort_transaction(trans, ret);
 		goto err;
 	}
 skip_backref:
-	ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
+	ret = btrfs_delete_delayed_dir_index(trans, fs_info, dir, index);
 	if (ret) {
 		btrfs_abort_transaction(trans, ret);
 		goto err;
@@ -4138,8 +4153,8 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
 	}
 
 out:
-	btrfs_end_transaction(trans, root);
-	btrfs_btree_balance_dirty(root);
+	btrfs_end_transaction(trans);
+	btrfs_btree_balance_dirty(root->fs_info);
 	return ret;
 }
 
@@ -4148,6 +4163,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
 			struct inode *dir, u64 objectid,
 			const char *name, int name_len)
 {
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
 	struct btrfs_dir_item *di;
@@ -4180,9 +4196,9 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
 	}
 	btrfs_release_path(path);
 
-	ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
-				 objectid, root->root_key.objectid,
-				 dir_ino, &index, name, name_len);
+	ret = btrfs_del_root_ref(trans, fs_info, objectid,
+				 root->root_key.objectid, dir_ino,
				 &index, name, name_len);
 	if (ret < 0) {
 		if (ret != -ENOENT) {
 			btrfs_abort_transaction(trans, ret);
@@ -4206,7 +4222,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
 	}
 	btrfs_release_path(path);
 
-	ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
+	ret = btrfs_delete_delayed_dir_index(trans, fs_info, dir, index);
 	if (ret) {
 		btrfs_abort_transaction(trans, ret);
 		goto out;
@@ -4274,8 +4290,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 		BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
 	}
 out:
-	btrfs_end_transaction(trans, root);
-	btrfs_btree_balance_dirty(root);
+	btrfs_end_transaction(trans);
+	btrfs_btree_balance_dirty(root->fs_info);
 
 	return err;
 }
@@ -4284,18 +4300,19 @@ static int truncate_space_check(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 bytes_deleted)
 {
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret;
 
 	/*
	 * This is only used to apply pressure to the enospc system, we don't
	 * intend to use this reservation at all.
	 */
-	bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted);
-	bytes_deleted *= root->nodesize;
-	ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv,
+	bytes_deleted = btrfs_csum_bytes_to_leaves(fs_info, bytes_deleted);
+	bytes_deleted *= fs_info->nodesize;
+	ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv,
 				  bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
 	if (!ret) {
-		trace_btrfs_space_reservation(root->fs_info, "transaction",
+		trace_btrfs_space_reservation(fs_info, "transaction",
 					      trans->transid,
 					      bytes_deleted, 1);
 		trans->bytes_reserved += bytes_deleted;
@@ -4338,7 +4355,7 @@ static int truncate_inline_extent(struct inode *inode,
 
 	btrfs_set_file_extent_ram_bytes(leaf, fi, size);
 	size = btrfs_file_extent_calc_inline_size(size);
-	btrfs_truncate_item(root, path, size, 1);
+	btrfs_truncate_item(root->fs_info, path, size, 1);
 
 	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
 		inode_sub_bytes(inode, item_end + 1 - new_size);
@@ -4362,6 +4379,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       struct inode *inode,
 			       u64 new_size, u32 min_type)
 {
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
 	struct btrfs_file_extent_item *fi;
@@ -4407,9 +4425,10 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	 * extent just the way it is.
	 */
 	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
-	    root == root->fs_info->tree_root)
+	    root == fs_info->tree_root)
 		btrfs_drop_extent_cache(inode, ALIGN(new_size,
-					root->sectorsize), (u64)-1, 0);
+						     fs_info->sectorsize),
+					(u64)-1, 0);
 
 	/*
	 * This function is also used to drop the items in the log tree before
@@ -4431,7 +4450,7 @@ search_again:
 	 * bytes_deleted is > 0, it will be huge by the time we get here
	 */
 	if (be_nice && bytes_deleted > SZ_32M) {
-		if (btrfs_should_end_transaction(trans, root)) {
+		if (btrfs_should_end_transaction(trans)) {
 			err = -EAGAIN;
 			goto error;
 		}
@@ -4483,8 +4502,19 @@ search_again:
 		if (found_type > min_type) {
 			del_item = 1;
 		} else {
-			if (item_end < new_size)
+			if (item_end < new_size) {
+				/*
				 * With NO_HOLES mode, for the following mapping
				 *
				 * [0-4k][hole][8k-12k]
				 *
				 * if truncating isize down to 6k, it ends up
				 * isize being 8k.
				 */
+				if (btrfs_fs_incompat(root->fs_info, NO_HOLES))
+					last_size = new_size;
 				break;
+			}
 			if (found_key.offset >= new_size)
 				del_item = 1;
 			else
@@ -4508,7 +4538,7 @@ search_again:
 					btrfs_file_extent_num_bytes(leaf, fi);
 				extent_num_bytes = ALIGN(new_size -
 						found_key.offset,
-						root->sectorsize);
+						fs_info->sectorsize);
 				btrfs_set_file_extent_num_bytes(leaf, fi,
 							 extent_num_bytes);
 				num_dec = (orig_num_bytes -
@@ -4595,16 +4625,16 @@ delete:
 
 		if (found_extent &&
 		    (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
-		     root == root->fs_info->tree_root)) {
+		     root == fs_info->tree_root)) {
 			btrfs_set_path_blocking(path);
 			bytes_deleted += extent_num_bytes;
-			ret = btrfs_free_extent(trans, root, extent_start,
+			ret = btrfs_free_extent(trans, fs_info, extent_start,
 						extent_num_bytes, 0,
 						btrfs_header_owner(leaf),
 						ino, extent_offset);
 			BUG_ON(ret);
-			if (btrfs_should_throttle_delayed_refs(trans, root))
-				btrfs_async_run_delayed_refs(root,
+			if (btrfs_should_throttle_delayed_refs(trans, fs_info))
+				btrfs_async_run_delayed_refs(fs_info,
 					trans->delayed_ref_updates * 2,
 					trans->transid, 0);
 			if (be_nice) {
@@ -4613,9 +4643,8 @@ delete:
 					should_end = 1;
 				}
 				if (btrfs_should_throttle_delayed_refs(trans,
-								       root)) {
+								       fs_info))
 					should_throttle = 1;
-				}
 			}
 		}
 
@@ -4640,7 +4669,9 @@ delete:
 				unsigned long updates = trans->delayed_ref_updates;
 				if (updates) {
 					trans->delayed_ref_updates = 0;
-					ret = btrfs_run_delayed_refs(trans, root, updates * 2);
+					ret = btrfs_run_delayed_refs(trans,
+								     fs_info,
								     updates * 2);
 					if (ret && !err)
 						err = ret;
 				}
@@ -4675,7 +4706,8 @@ error:
 		unsigned long updates = trans->delayed_ref_updates;
 		if (updates) {
 			trans->delayed_ref_updates = 0;
-			ret = btrfs_run_delayed_refs(trans, root, updates * 2);
+			ret = btrfs_run_delayed_refs(trans, fs_info,
						     updates * 2);
 			if (ret && !err)
 				err = ret;
 		}
@@ -4697,13 +4729,13 @@ error:
 int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
 			int front)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct address_space *mapping = inode->i_mapping;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_ordered_extent *ordered;
 	struct extent_state *cached_state = NULL;
 	char *kaddr;
-	u32 blocksize = root->sectorsize;
+	u32 blocksize = fs_info->sectorsize;
 	pgoff_t index = from >> PAGE_SHIFT;
 	unsigned offset = from & (blocksize - 1);
 	struct page *page;
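The btrfs_truncate_inode_items() hunk above adds a comment with a concrete layout, [0-4k][hole][8k-12k], and the fix records last_size = new_size under NO_HOLES. The arithmetic behind it is the kernel's ALIGN() rounding: the truncation boundary snaps up to the next sectorsize multiple, and with NO_HOLES there is no hole item between 6k and 8k to anchor the smaller size. A tiny standalone check of that rounding (the ALIGN_UP macro below is my own stand-in for the kernel's ALIGN, which requires a power-of-two alignment):

#include <stdio.h>
#include <stdint.h>

/* Round x up to the next multiple of a (a must be a power of two). */
#define ALIGN_UP(x, a)	(((x) + ((uint64_t)(a) - 1)) & ~((uint64_t)(a) - 1))

int main(void)
{
	uint64_t sectorsize = 4096;	/* fs_info->sectorsize stand-in */
	uint64_t new_size = 6 * 1024;	/* truncate target from the comment */

	/* Prints 8192: with [0-4k][hole][8k-12k], nothing exists between
	 * 6k and the aligned 8k boundary, so unless new_size is recorded
	 * explicitly the resulting isize snaps to 8k. */
	printf("ALIGN(%llu) = %llu\n",
	       (unsigned long long)new_size,
	       (unsigned long long)ALIGN_UP(new_size, sectorsize));
	return 0;
}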
@@ -4807,6 +4839,7 @@ out:
 static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
 			     u64 offset, u64 len)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_trans_handle *trans;
 	int ret;
 
@@ -4814,8 +4847,8 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
 	 * Still need to make sure the inode looks like it's been updated so
	 * that any holes get logged if we fsync.
	 */
-	if (btrfs_fs_incompat(root->fs_info, NO_HOLES)) {
-		BTRFS_I(inode)->last_trans = root->fs_info->generation;
+	if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
+		BTRFS_I(inode)->last_trans = fs_info->generation;
 		BTRFS_I(inode)->last_sub_trans = root->log_transid;
 		BTRFS_I(inode)->last_log_commit = root->last_log_commit;
 		return 0;
@@ -4833,7 +4866,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
 	ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
 	if (ret) {
 		btrfs_abort_transaction(trans, ret);
-		btrfs_end_transaction(trans, root);
+		btrfs_end_transaction(trans);
 		return ret;
 	}
 
@@ -4843,7 +4876,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
 		btrfs_abort_transaction(trans, ret);
 	else
 		btrfs_update_inode(trans, root, inode);
-	btrfs_end_transaction(trans, root);
+	btrfs_end_transaction(trans);
 	return ret;
 }
 
@@ -4855,13 +4888,14 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
  */
 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_map *em = NULL;
 	struct extent_state *cached_state = NULL;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
-	u64 hole_start = ALIGN(oldsize, root->sectorsize);
-	u64 block_end = ALIGN(size, root->sectorsize);
+	u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
+	u64 block_end = ALIGN(size, fs_info->sectorsize);
 	u64 last_byte;
 	u64 cur_offset;
 	u64 hole_size;
@@ -4904,7 +4938,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 			break;
 		}
 		last_byte = min(extent_map_end(em), block_end);
-		last_byte = ALIGN(last_byte , root->sectorsize);
+		last_byte = ALIGN(last_byte, fs_info->sectorsize);
 		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
 			struct extent_map *hole_em;
 			hole_size = last_byte - cur_offset;
@@ -4929,9 +4963,9 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 			hole_em->block_len = 0;
 			hole_em->orig_block_len = 0;
 			hole_em->ram_bytes = hole_size;
-			hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
+			hole_em->bdev = fs_info->fs_devices->latest_bdev;
 			hole_em->compress_type = BTRFS_COMPRESS_NONE;
-			hole_em->generation = root->fs_info->generation;
+			hole_em->generation = fs_info->generation;
 
 			while (1) {
 				write_lock(&em_tree->lock);
@@ -5006,7 +5040,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 		pagecache_isize_extended(inode, oldsize, newsize);
 		ret = btrfs_update_inode(trans, root, inode);
 		btrfs_end_write_no_snapshoting(root);
-		btrfs_end_transaction(trans, root);
+		btrfs_end_transaction(trans);
 	} else {
 
 		/*
@@ -5037,7 +5071,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 		 * will be consistent.
		 */
 		ret = btrfs_orphan_add(trans, inode);
-		btrfs_end_transaction(trans, root);
+		btrfs_end_transaction(trans);
 		if (ret)
 			return ret;
 
@@ -5068,7 +5102,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 			err = btrfs_orphan_del(trans, inode);
 			if (err)
 				btrfs_abort_transaction(trans, err);
-			btrfs_end_transaction(trans, root);
+			btrfs_end_transaction(trans);
 		}
 	}
 
@@ -5201,6 +5235,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
 
 void btrfs_evict_inode(struct inode *inode)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_block_rsv *rsv, *global_rsv;
@@ -5215,7 +5250,7 @@ void btrfs_evict_inode(struct inode *inode)
 		return;
 	}
 
-	min_size = btrfs_calc_trunc_metadata_size(root, 1);
+	min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
 
 	evict_inode_truncate_pages(inode);
 
@@ -5235,7 +5270,7 @@ void btrfs_evict_inode(struct inode *inode)
 
 	btrfs_free_io_failure_record(inode, 0, (u64)-1);
 
-	if (test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) {
+	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
 		BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
 				 &BTRFS_I(inode)->runtime_flags));
 		goto no_delete;
@@ -5253,14 +5288,14 @@ void btrfs_evict_inode(struct inode *inode)
 		goto no_delete;
 	}
 
-	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
+	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
 	if (!rsv) {
 		btrfs_orphan_del(NULL, inode);
 		goto no_delete;
 	}
 	rsv->size = min_size;
 	rsv->failfast = 1;
-	global_rsv = &root->fs_info->global_block_rsv;
+	global_rsv = &fs_info->global_block_rsv;
 
 	btrfs_i_size_write(inode, 0);
 
@@ -5294,18 +5329,18 @@ void btrfs_evict_inode(struct inode *inode)
 		 * steal_from_global == 3: abandon all hope!
		 */
 		if (steal_from_global > 2) {
-			btrfs_warn(root->fs_info,
-				"Could not get space for a delete, will truncate on mount %d",
-				ret);
+			btrfs_warn(fs_info,
+				   "Could not get space for a delete, will truncate on mount %d",
+				   ret);
 			btrfs_orphan_del(NULL, inode);
-			btrfs_free_block_rsv(root, rsv);
+			btrfs_free_block_rsv(fs_info, rsv);
 			goto no_delete;
 		}
 
 		trans = btrfs_join_transaction(root);
 		if (IS_ERR(trans)) {
 			btrfs_orphan_del(NULL, inode);
-			btrfs_free_block_rsv(root, rsv);
+			btrfs_free_block_rsv(fs_info, rsv);
 			goto no_delete;
 		}
 
@@ -5315,7 +5350,7 @@ void btrfs_evict_inode(struct inode *inode)
 		 * again.
		 */
 		if (steal_from_global) {
-			if (!btrfs_check_space_for_delayed_refs(trans, root))
+			if (!btrfs_check_space_for_delayed_refs(trans, fs_info))
 				ret = btrfs_block_rsv_migrate(global_rsv, rsv,
 							      min_size, 0);
 			else
@@ -5328,10 +5363,10 @@ void btrfs_evict_inode(struct inode *inode)
 		 * again.
		 */
 		if (ret) {
-			ret = btrfs_commit_transaction(trans, root);
+			ret = btrfs_commit_transaction(trans);
 			if (ret) {
 				btrfs_orphan_del(NULL, inode);
-				btrfs_free_block_rsv(root, rsv);
+				btrfs_free_block_rsv(fs_info, rsv);
 				goto no_delete;
 			}
 			continue;
@@ -5345,13 +5380,13 @@ void btrfs_evict_inode(struct inode *inode)
 		if (ret != -ENOSPC && ret != -EAGAIN)
 			break;
 
-		trans->block_rsv = &root->fs_info->trans_block_rsv;
-		btrfs_end_transaction(trans, root);
+		trans->block_rsv = &fs_info->trans_block_rsv;
+		btrfs_end_transaction(trans);
 		trans = NULL;
-		btrfs_btree_balance_dirty(root);
+		btrfs_btree_balance_dirty(fs_info);
 	}
 
-	btrfs_free_block_rsv(root, rsv);
+	btrfs_free_block_rsv(fs_info, rsv);
 
 	/*
	 * Errors here aren't a big deal, it just means we leave orphan items
@@ -5364,13 +5399,13 @@ void btrfs_evict_inode(struct inode *inode)
 		btrfs_orphan_del(NULL, inode);
 	}
 
-	trans->block_rsv = &root->fs_info->trans_block_rsv;
-	if (!(root == root->fs_info->tree_root ||
+	trans->block_rsv = &fs_info->trans_block_rsv;
+	if (!(root == fs_info->tree_root ||
 	      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
 		btrfs_return_ino(root, btrfs_ino(inode));
 
-	btrfs_end_transaction(trans, root);
-	btrfs_btree_balance_dirty(root);
+	btrfs_end_transaction(trans);
+	btrfs_btree_balance_dirty(fs_info);
 no_delete:
 	btrfs_remove_delayed_node(inode);
 	clear_inode(inode);
@@ -5416,7 +5451,7 @@ out_err:
  * needs to be changed to reflect the root directory of the tree root. This
 * is kind of like crossing a mount point.
 */
-static int fixup_tree_root_location(struct btrfs_root *root,
+static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
 				    struct inode *dir,
 				    struct dentry *dentry,
 				    struct btrfs_key *location,
@@ -5441,8 +5476,7 @@ static int fixup_tree_root_location(struct btrfs_root *root,
 	key.type = BTRFS_ROOT_REF_KEY;
 	key.offset = location->objectid;
 
-	ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, path,
-				0, 0);
+	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
 	if (ret) {
 		if (ret < 0)
 			err = ret;
@@ -5463,7 +5497,7 @@ static int fixup_tree_root_location(struct btrfs_root *root,
 
 	btrfs_release_path(path);
 
-	new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
+	new_root = btrfs_read_fs_root_no_name(fs_info, location);
 	if (IS_ERR(new_root)) {
 		err = PTR_ERR(new_root);
 		goto out;
@@ -5517,6 +5551,7 @@ static void inode_tree_add(struct inode *inode)
 
 static void inode_tree_del(struct inode *inode)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int empty = 0;
 
@@ -5529,7 +5564,7 @@ static void inode_tree_del(struct inode *inode)
 	spin_unlock(&root->inode_lock);
 
 	if (empty && btrfs_root_refs(&root->root_item) == 0) {
-		synchronize_srcu(&root->fs_info->subvol_srcu);
+		synchronize_srcu(&fs_info->subvol_srcu);
 		spin_lock(&root->inode_lock);
 		empty = RB_EMPTY_ROOT(&root->inode_tree);
 		spin_unlock(&root->inode_lock);
@@ -5540,13 +5575,14 @@ static void inode_tree_del(struct inode *inode)
 
 void btrfs_invalidate_inodes(struct btrfs_root *root)
 {
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct rb_node *node;
 	struct rb_node *prev;
 	struct btrfs_inode *entry;
 	struct inode *inode;
 	u64 objectid = 0;
 
-	if (!test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
+	if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
 		WARN_ON(btrfs_root_refs(&root->root_item) != 0);
 
 	spin_lock(&root->inode_lock);
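The btrfs_evict_inode() hunks above preserve an escalation loop whose stages the original comment labels steal_from_global == 0 through 3 ("abandon all hope!"): try a normal reservation, then steal from the global reserve, then commit the transaction to free space, and only then give up and leave the truncate for the next mount. A compressed userspace model of just that control flow, with every helper below a hypothetical stand-in:

#include <stdio.h>

/* Pretend the reservation only succeeds once we have escalated twice. */
static int try_reserve(int attempt)
{
	return attempt >= 2 ? 0 : -1;
}

int main(void)
{
	int steal_from_global = 0;

	while (1) {
		if (steal_from_global > 2) {
			/* Matches the "will truncate on mount" warning. */
			puts("abandon all hope: truncate on next mount");
			return 1;
		}
		if (try_reserve(steal_from_global) == 0)
			break;
		steal_from_global++;	/* escalate and retry */
	}
	puts("reservation acquired, truncating inode items");
	return 0;
}

The design point is that each escalation stage is strictly more expensive (a transaction commit stalls the whole filesystem), so the loop pays for it only after the cheaper options have failed.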
@@ -5682,6 +5718,7 @@ static struct inode *new_simple_dir(struct super_block *s,
 	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
 	inode->i_op = &btrfs_dir_ro_inode_operations;
+	inode->i_opflags &= ~IOP_XATTR;
 	inode->i_fop = &simple_dir_operations;
 	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
 	inode->i_mtime = current_time(inode);
@@ -5694,6 +5731,7 @@ static struct inode *new_simple_dir(struct super_block *s,
 
 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
 	struct inode *inode;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct btrfs_root *sub_root = root;
@@ -5718,8 +5756,8 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 
 	BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
 
-	index = srcu_read_lock(&root->fs_info->subvol_srcu);
-	ret = fixup_tree_root_location(root, dir, dentry,
+	index = srcu_read_lock(&fs_info->subvol_srcu);
+	ret = fixup_tree_root_location(fs_info, dir, dentry,
 				       &location, &sub_root);
 	if (ret < 0) {
 		if (ret != -ENOENT)
@@ -5729,13 +5767,13 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 	} else {
 		inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
 	}
-	srcu_read_unlock(&root->fs_info->subvol_srcu, index);
+	srcu_read_unlock(&fs_info->subvol_srcu, index);
 
 	if (!IS_ERR(inode) && root != sub_root) {
-		down_read(&root->fs_info->cleanup_work_sem);
+		down_read(&fs_info->cleanup_work_sem);
 		if (!(inode->i_sb->s_flags & MS_RDONLY))
 			ret = btrfs_orphan_cleanup(sub_root);
-		up_read(&root->fs_info->cleanup_work_sem);
+		up_read(&fs_info->cleanup_work_sem);
 		if (ret) {
 			iput(inode);
 			inode = ERR_PTR(ret);
@@ -5792,6 +5830,7 @@ unsigned char btrfs_filetype_table[] = {
 static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct inode *inode = file_inode(file);
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_item *item;
 	struct btrfs_dir_item *di;
@@ -5805,20 +5844,11 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 	int slot;
 	unsigned char d_type;
 	int over = 0;
-	u32 di_cur;
-	u32 di_total;
-	u32 di_len;
-	int key_type = BTRFS_DIR_INDEX_KEY;
 	char tmp_name[32];
 	char *name_ptr;
 	int name_len;
-	int is_curr = 0;	/* ctx->pos points to the current index? */
-	bool emitted;
 	bool put = false;
-
-	/* FIXME, use a real flag for deciding about the key type */
-	if (root->fs_info->tree_root == root)
-		key_type = BTRFS_DIR_ITEM_KEY;
+	struct btrfs_key location;
 
 	if (!dir_emit_dots(file, ctx))
 		return 0;
@@ -5829,14 +5859,11 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 
 	path->reada = READA_FORWARD;
 
-	if (key_type == BTRFS_DIR_INDEX_KEY) {
-		INIT_LIST_HEAD(&ins_list);
-		INIT_LIST_HEAD(&del_list);
-		put = btrfs_readdir_get_delayed_items(inode, &ins_list,
-						      &del_list);
-	}
+	INIT_LIST_HEAD(&ins_list);
+	INIT_LIST_HEAD(&del_list);
+	put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list);
 
-	key.type = key_type;
+	key.type = BTRFS_DIR_INDEX_KEY;
 	key.offset = ctx->pos;
 	key.objectid = btrfs_ino(inode);
 
@@ -5844,7 +5871,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 	if (ret < 0)
 		goto err;
 
-	emitted = false;
 	while (1) {
 		leaf = path->nodes[0];
 		slot = path->slots[0];
@@ -5862,98 +5888,52 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 
 		if (found_key.objectid != key.objectid)
 			break;
-		if (found_key.type != key_type)
+		if (found_key.type != BTRFS_DIR_INDEX_KEY)
 			break;
 		if (found_key.offset < ctx->pos)
 			goto next;
-		if (key_type == BTRFS_DIR_INDEX_KEY &&
-		    btrfs_should_delete_dir_index(&del_list,
-						  found_key.offset))
+		if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
 			goto next;
 
 		ctx->pos = found_key.offset;
-		is_curr = 1;
 
 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
-		di_cur = 0;
-		di_total = btrfs_item_size(leaf, item);
-
-		while (di_cur < di_total) {
-			struct btrfs_key location;
-
-			if (verify_dir_item(root, leaf, di))
-				break;
+		if (verify_dir_item(fs_info, leaf, di))
+			goto next;
 
-			name_len = btrfs_dir_name_len(leaf, di);
-			if (name_len <= sizeof(tmp_name)) {
-				name_ptr = tmp_name;
-			} else {
-				name_ptr = kmalloc(name_len, GFP_KERNEL);
-				if (!name_ptr) {
-					ret = -ENOMEM;
-					goto err;
-				}
+		name_len = btrfs_dir_name_len(leaf, di);
+		if (name_len <= sizeof(tmp_name)) {
+			name_ptr = tmp_name;
+		} else {
+			name_ptr = kmalloc(name_len, GFP_KERNEL);
+			if (!name_ptr) {
+				ret = -ENOMEM;
+				goto err;
 			}
-			read_extent_buffer(leaf, name_ptr,
-					   (unsigned long)(di + 1), name_len);
-
-			d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
-			btrfs_dir_item_key_to_cpu(leaf, di, &location);
+		}
+		read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1),
+				   name_len);
+		d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
+		btrfs_dir_item_key_to_cpu(leaf, di, &location);
 
-			/* is this a reference to our own snapshot? If so
-			 * skip it.
			 *
			 * In contrast to old kernels, we insert the snapshot's
			 * dir item and dir index after it has been created, so
			 * we won't find a reference to our own snapshot. We
			 * still keep the following code for backward
			 * compatibility.
-			 */
-			if (location.type == BTRFS_ROOT_ITEM_KEY &&
-			    location.objectid == root->root_key.objectid) {
-				over = 0;
-				goto skip;
-			}
-			over = !dir_emit(ctx, name_ptr, name_len,
-				       location.objectid, d_type);
+		over = !dir_emit(ctx, name_ptr, name_len, location.objectid,
+				 d_type);
 
-skip:
-			if (name_ptr != tmp_name)
-				kfree(name_ptr);
+		if (name_ptr != tmp_name)
+			kfree(name_ptr);
 
-			if (over)
-				goto nopos;
-			emitted = true;
-			di_len = btrfs_dir_name_len(leaf, di) +
-				 btrfs_dir_data_len(leaf, di) + sizeof(*di);
-			di_cur += di_len;
-			di = (struct btrfs_dir_item *)((char *)di + di_len);
-		}
+		if (over)
+			goto nopos;
+		ctx->pos++;
next:
 		path->slots[0]++;
 	}
 
-	if (key_type == BTRFS_DIR_INDEX_KEY) {
-		if (is_curr)
-			ctx->pos++;
-		ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list, &emitted);
-		if (ret)
-			goto nopos;
-	}
-
-	/*
	 * If we haven't emitted any dir entry, we must not touch ctx->pos as
	 * it was was set to the termination value in previous call. We assume
	 * that "." and ".." were emitted if we reach this point and set the
	 * termination value as well for an empty directory.
	 */
-	if (ctx->pos > 2 && !emitted)
+	ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
+	if (ret)
 		goto nopos;
 
-	/* Reached end of directory/root. Bump pos past the last item. */
-	ctx->pos++;
-
 	/*
	 * Stop new entries from being returned after we return the last
	 * entry.
@@ -5971,12 +5951,10 @@ next:
	 * last entry requires it because doing so has broken 32bit apps
	 * in the past.
	 */
-	if (key_type == BTRFS_DIR_INDEX_KEY) {
-		if (ctx->pos >= INT_MAX)
-			ctx->pos = LLONG_MAX;
-		else
-			ctx->pos = INT_MAX;
-	}
+	if (ctx->pos >= INT_MAX)
+		ctx->pos = LLONG_MAX;
+	else
+		ctx->pos = INT_MAX;
nopos:
 	ret = 0;
err:
@@ -6006,7 +5984,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 		trans = btrfs_join_transaction(root);
 		if (IS_ERR(trans))
 			return PTR_ERR(trans);
-		ret = btrfs_commit_transaction(trans, root);
+		ret = btrfs_commit_transaction(trans);
 	}
 	return ret;
 }
@@ -6019,6 +5997,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 */
 static int btrfs_dirty_inode(struct inode *inode)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 	int ret;
@@ -6033,16 +6012,16 @@ static int btrfs_dirty_inode(struct inode *inode)
 	ret = btrfs_update_inode(trans, root, inode);
 	if (ret && ret == -ENOSPC) {
 		/* whoops, lets try again with the full transaction */
-		btrfs_end_transaction(trans, root);
+		btrfs_end_transaction(trans);
 		trans = btrfs_start_transaction(root, 1);
 		if (IS_ERR(trans))
 			return PTR_ERR(trans);
 
 		ret = btrfs_update_inode(trans, root, inode);
 	}
-	btrfs_end_transaction(trans, root);
+	btrfs_end_transaction(trans);
 	if (BTRFS_I(inode)->delayed_node)
-		btrfs_balance_delayed_items(root);
+		btrfs_balance_delayed_items(fs_info);
 
 	return ret;
 }
@@ -6168,6 +6147,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 				     u64 ref_objectid, u64 objectid,
 				     umode_t mode, u64 *index)
 {
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct inode *inode;
 	struct btrfs_inode_item *inode_item;
 	struct btrfs_key *location;
@@ -6183,7 +6163,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	if (!path)
 		return ERR_PTR(-ENOMEM);
 
-	inode = new_inode(root->fs_info->sb);
+	inode = new_inode(fs_info->sb);
 	if (!inode) {
 		btrfs_free_path(path);
 		return ERR_PTR(-ENOMEM);
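The btrfs_real_readdir() rewrite above ends by clamping ctx->pos once every entry has been emitted: INT_MAX when possible, LLONG_MAX only when the directory's real offsets already exceed INT_MAX, because (per the retained comment) larger directory offsets have broken 32-bit applications in the past. The logic is small enough to check in isolation; a standalone sketch of just that clamp:

#include <stdio.h>
#include <limits.h>

/* Models the final ctx->pos adjustment in btrfs_real_readdir(). */
static long long end_of_dir_pos(long long pos)
{
	/* Bump pos past every real index so the next getdents call sees
	 * EOF, but stay at INT_MAX when we can for 32-bit userspace. */
	return pos >= INT_MAX ? LLONG_MAX : INT_MAX;
}

int main(void)
{
	printf("%lld\n", end_of_dir_pos(42));		/* 2147483647 */
	printf("%lld\n", end_of_dir_pos(3000000000LL));	/* LLONG_MAX */
	return 0;
}

Pinning the terminal position to a fixed sentinel (rather than "last index + 1") is also what lets the rewrite delete the old emitted/is_curr bookkeeping: any pos at or beyond the sentinel unambiguously means end-of-directory.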
@@ -6277,7 +6257,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				  struct btrfs_inode_item);
-	memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
+	memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
 			     sizeof(*inode_item));
 	fill_inode_item(trans, path->nodes[0], inode_item, inode);
 
@@ -6296,9 +6276,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	btrfs_inherit_iflags(inode, dir);
 
 	if (S_ISREG(mode)) {
-		if (btrfs_test_opt(root->fs_info, NODATASUM))
+		if (btrfs_test_opt(fs_info, NODATASUM))
 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
-		if (btrfs_test_opt(root->fs_info, NODATACOW))
+		if (btrfs_test_opt(fs_info, NODATACOW))
 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
 				BTRFS_INODE_NODATASUM;
 	}
@@ -6312,7 +6292,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 
 	ret = btrfs_inode_inherit_props(trans, inode, dir);
 	if (ret)
-		btrfs_err(root->fs_info,
+		btrfs_err(fs_info,
 			  "error inheriting props for ino %llu (root %llu): %d",
 			  btrfs_ino(inode), root->root_key.objectid, ret);
 
@@ -6343,6 +6323,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
 		   struct inode *parent_inode, struct inode *inode,
 		   const char *name, int name_len, int add_backref, u64 index)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	int ret = 0;
 	struct btrfs_key key;
 	struct btrfs_root *root = BTRFS_I(parent_inode)->root;
@@ -6358,9 +6339,9 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
 	}
 
 	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
-		ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
-					 key.objectid, root->root_key.objectid,
-					 parent_ino, index, name, name_len);
+		ret = btrfs_add_root_ref(trans, fs_info, key.objectid,
					 root->root_key.objectid, parent_ino,
					 index, name, name_len);
 	} else if (add_backref) {
 		ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
 					     parent_ino, index);
@@ -6394,9 +6375,9 @@ fail_dir_item:
 	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
 		u64 local_index;
 		int err;
-		err = btrfs_del_root_ref(trans, root->fs_info->tree_root,
-				 key.objectid, root->root_key.objectid,
-				 parent_ino, &local_index, name, name_len);
+		err = btrfs_del_root_ref(trans, fs_info, key.objectid,
					 root->root_key.objectid, parent_ino,
					 &local_index, name, name_len);
 
 	} else if (add_backref) {
 		u64 local_index;
@@ -6423,6 +6404,7 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
 static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 			umode_t mode, dev_t rdev)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct inode *inode = NULL;
@@ -6475,9 +6457,9 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 	}
 
out_unlock:
-	btrfs_end_transaction(trans, root);
-	btrfs_balance_delayed_items(root);
-	btrfs_btree_balance_dirty(root);
+	btrfs_end_transaction(trans);
+	btrfs_balance_delayed_items(fs_info);
+	btrfs_btree_balance_dirty(fs_info);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -6494,6 +6476,7 @@ out_unlock_inode:
 static int btrfs_create(struct inode *dir, struct dentry *dentry,
 			umode_t mode, bool excl)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct inode *inode = NULL;
@@ -6550,13 +6533,13 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	d_instantiate(dentry, inode);
 
out_unlock:
-	btrfs_end_transaction(trans, root);
+	btrfs_end_transaction(trans);
 	if (err && drop_inode_on_err) {
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
-	btrfs_balance_delayed_items(root);
-	btrfs_btree_balance_dirty(root);
+	btrfs_balance_delayed_items(fs_info);
+	btrfs_btree_balance_dirty(fs_info);
 	return err;
 
out_unlock_inode:
@@ -6571,6 +6554,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	struct btrfs_trans_handle *trans = NULL;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct inode *inode = d_inode(old_dentry);
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	u64 index;
 	int err;
 	int drop_inode = 0;
@@ -6628,20 +6612,21 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 		btrfs_log_new_name(trans, inode, NULL, parent);
 	}
 
-	btrfs_balance_delayed_items(root);
+	btrfs_balance_delayed_items(fs_info);
fail:
 	if (trans)
-		btrfs_end_transaction(trans, root);
+		btrfs_end_transaction(trans);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
-	btrfs_btree_balance_dirty(root);
+	btrfs_btree_balance_dirty(fs_info);
 	return err;
 }
 
 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
 	struct inode *inode = NULL;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
@@ -6699,13 +6684,13 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	drop_on_err = 0;
 
out_fail:
-	btrfs_end_transaction(trans, root);
+	btrfs_end_transaction(trans);
 	if (drop_on_err) {
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
-	btrfs_balance_delayed_items(root);
-	btrfs_btree_balance_dirty(root);
+	btrfs_balance_delayed_items(fs_info);
+	btrfs_btree_balance_dirty(fs_info);
 	return err;
 
out_fail_inode:
@@ -6820,6 +6805,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
 				    size_t pg_offset, u64 start, u64 len,
 				    int create)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	int ret;
 	int err = 0;
 	u64 extent_start = 0;
@@ -6841,7 +6827,7 @@ again:
 	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, start, len);
 	if (em)
-		em->bdev = root->fs_info->fs_devices->latest_bdev;
+		em->bdev = fs_info->fs_devices->latest_bdev;
 	read_unlock(&em_tree->lock);
 
 	if (em) {
@@ -6857,7 +6843,7 @@ again:
 		err = -ENOMEM;
 		goto out;
 	}
-	em->bdev = root->fs_info->fs_devices->latest_bdev;
+	em->bdev = fs_info->fs_devices->latest_bdev;
 	em->start = EXTENT_MAP_HOLE;
 	em->orig_start = EXTENT_MAP_HOLE;
 	em->len = (u64)-1;
@@ -6916,7 +6902,8 @@ again:
 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 		size_t size;
 		size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
-		extent_end = ALIGN(extent_start + size, root->sectorsize);
+		extent_end = ALIGN(extent_start + size,
+				   fs_info->sectorsize);
 	}
next:
 	if (start >= extent_end) {
@@ -6965,7 +6952,7 @@ next:
 		copy_size = min_t(u64, PAGE_SIZE - pg_offset,
 				  size - extent_offset);
 		em->start = extent_start + extent_offset;
-		em->len = ALIGN(copy_size, root->sectorsize);
+		em->len = ALIGN(copy_size, fs_info->sectorsize);
 		em->orig_block_len = em->len;
 		em->orig_start = em->start;
 		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
@@ -7024,7 +7011,7 @@ not_found_em:
insert:
 	btrfs_release_path(path);
 	if (em->start > start || extent_map_end(em) <= start) {
-		btrfs_err(root->fs_info,
+		btrfs_err(fs_info,
 			  "bad extent! em: [%llu %llu] passed [%llu %llu]",
 			  em->start, em->len, start, len);
 		err = -EIO;
@@ -7049,11 +7036,11 @@ insert:
 		 * extent causing the -EEXIST.
		 */
 		if (existing->start == em->start &&
-		    extent_map_end(existing) == extent_map_end(em) &&
+		    extent_map_end(existing) >= extent_map_end(em) &&
 		    em->block_start == existing->block_start) {
 			/*
-			 * these two extents are the same, it happens
-			 * with inlines especially
+			 * The existing extent map already encompasses the
+			 * entire extent map we tried to add.
			 */
 			free_extent_map(em);
 			em = existing;
@@ -7081,11 +7068,11 @@ insert:
 	write_unlock(&em_tree->lock);
out:
 
-	trace_btrfs_get_extent(root, em);
+	trace_btrfs_get_extent(root, inode, em);
 
 	btrfs_free_path(path);
 	if (trans) {
-		ret = btrfs_end_transaction(trans, root);
+		ret = btrfs_end_transaction(trans);
 		if (!err)
 			err = ret;
 	}
@@ -7237,7 +7224,6 @@ static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
 	struct extent_map *em = NULL;
 	int ret;
 
-	down_read(&BTRFS_I(inode)->dio_sem);
 	if (type != BTRFS_ORDERED_NOCOW) {
 		em = create_pinned_em(inode, start, len, orig_start,
 				      block_start, block_len, orig_block_len,
@@ -7256,7 +7242,6 @@ static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
 		em = ERR_PTR(ret);
 	}
out:
-	up_read(&BTRFS_I(inode)->dio_sem);
 
 	return em;
 }
@@ -7264,6 +7249,7 @@ out:
 static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
 						  u64 start, u64 len)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_map *em;
 	struct btrfs_key ins;
@@ -7271,17 +7257,18 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
 	int ret;
 
 	alloc_hint = get_extent_allocation_hint(inode, start, len);
-	ret = btrfs_reserve_extent(root, len, len, root->sectorsize, 0,
-				   alloc_hint, &ins, 1, 1);
+	ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
+				   0, alloc_hint, &ins, 1, 1);
 	if (ret)
 		return ERR_PTR(ret);
 
 	em = btrfs_create_dio_extent(inode, start, ins.offset, start,
 				     ins.objectid, ins.offset, ins.offset,
 				     ins.offset, 0);
-	btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
+	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
 	if (IS_ERR(em))
-		btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
+		btrfs_free_reserved_extent(fs_info, ins.objectid,
+					   ins.offset, 1);
 
 	return em;
 }
@@ -7294,6 +7281,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
 			      u64 *orig_start, u64 *orig_block_len,
 			      u64 *ram_bytes)
 {
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	struct btrfs_trans_handle *trans;
 	struct btrfs_path *path;
 	int ret;
@@ -7374,14 +7362,15 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
 		*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
 	}
 
-	if (btrfs_extent_readonly(root, disk_bytenr))
+	if (btrfs_extent_readonly(fs_info, disk_bytenr))
 		goto out;
 
 	num_bytes = min(offset + *len, extent_end) - offset;
 	if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 		u64 range_end;
 
-		range_end = round_up(offset + num_bytes, root->sectorsize) - 1;
+		range_end = round_up(offset + num_bytes,
+				     root->fs_info->sectorsize) - 1;
 		ret = test_range_bit(io_tree, offset, range_end,
 				     EXTENT_DELALLOC, 0, NULL);
 		if (ret) {
@@ -7404,7 +7393,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
 	ret = btrfs_cross_ref_exist(trans, root, btrfs_ino(inode),
 				    key.offset - backref_offset, disk_bytenr);
-	btrfs_end_transaction(trans, root);
+	btrfs_end_transaction(trans);
 	if (ret) {
 		ret = 0;
 		goto out;
@@ -7418,8 +7407,8 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 
*len, */ disk_bytenr += backref_offset; disk_bytenr += offset - key.offset; - if (csum_exist_in_range(root, disk_bytenr, num_bytes)) - goto out; + if (csum_exist_in_range(fs_info, disk_bytenr, num_bytes)) + goto out; /* * all of the above have passed, it is safe to overwrite this extent * without cow @@ -7641,11 +7630,18 @@ static void adjust_dio_outstanding_extents(struct inode *inode, * within our reservation, otherwise we need to adjust our inode * counter appropriately. */ - if (dio_data->outstanding_extents) { + if (dio_data->outstanding_extents >= num_extents) { dio_data->outstanding_extents -= num_extents; } else { + /* + * If dio write length has been split due to no large enough + * contiguous space, we need to compensate our inode counter + * appropriately. + */ + u64 num_needed = num_extents - dio_data->outstanding_extents; + spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->outstanding_extents += num_extents; + BTRFS_I(inode)->outstanding_extents += num_needed; spin_unlock(&BTRFS_I(inode)->lock); } } @@ -7653,8 +7649,8 @@ static void adjust_dio_outstanding_extents(struct inode *inode, static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em; - struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_state *cached_state = NULL; struct btrfs_dio_data *dio_data = NULL; u64 start = iblock << inode->i_blkbits; @@ -7666,7 +7662,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (create) unlock_bits |= EXTENT_DIRTY; else - len = min_t(u64, len, root->sectorsize); + len = min_t(u64, len, fs_info->sectorsize); lockstart = start; lockend = start + len - 1; @@ -7755,14 +7751,14 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, if (can_nocow_extent(inode, start, &len, &orig_start, &orig_block_len, &ram_bytes) == 1 && - btrfs_inc_nocow_writers(root->fs_info, block_start)) { + btrfs_inc_nocow_writers(fs_info, block_start)) { struct extent_map *em2; em2 = btrfs_create_dio_extent(inode, start, len, orig_start, block_start, len, orig_block_len, ram_bytes, type); - btrfs_dec_nocow_writers(root->fs_info, block_start); + btrfs_dec_nocow_writers(fs_info, block_start); if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); em = em2; @@ -7855,19 +7851,18 @@ err: static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int ret; BUG_ON(bio_op(bio) == REQ_OP_WRITE); bio_get(bio); - ret = btrfs_bio_wq_end_io(root->fs_info, bio, - BTRFS_WQ_ENDIO_DIO_REPAIR); + ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DIO_REPAIR); if (ret) goto err; - ret = btrfs_map_bio(root, bio, mirror_num, 0); + ret = btrfs_map_bio(fs_info, bio, mirror_num, 0); err: bio_put(bio); return ret; @@ -7917,7 +7912,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, struct io_failure_record *failrec; struct bio *bio; int isector; - int read_mode; + int read_mode = 0; int ret; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -7935,10 +7930,8 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, if ((failed_bio->bi_vcnt > 1) || (failed_bio->bi_io_vec->bv_len - > BTRFS_I(inode)->root->sectorsize)) - read_mode = READ_SYNC | REQ_FAILFAST_DEV; - else - read_mode = READ_SYNC; + > btrfs_inode_sectorsize(inode))) + read_mode |= REQ_FAILFAST_DEV; 
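The adjust_dio_outstanding_extents() hunk above swaps a plain non-zero test for a >= comparison: the per-dio reservation is drawn down only when it actually covers the number of extents this chunk of the write turned into; otherwise the shortfall is charged to the inode's own counter. A toy userspace model of that accounting (stand-in fields, not the kernel structs):

        #include <stdio.h>

        /* dio_reserved mirrors dio_data->outstanding_extents, inode_extents
         * mirrors BTRFS_I(inode)->outstanding_extents. */
        static void adjust(unsigned long *dio_reserved,
                           unsigned long *inode_extents,
                           unsigned long num_extents)
        {
                if (*dio_reserved >= num_extents) {
                        /* Reservation fully covers this chunk. */
                        *dio_reserved -= num_extents;
                } else {
                        /* The write split into more extents than were
                         * reserved; charge the shortfall to the inode. */
                        unsigned long num_needed =
                                num_extents - *dio_reserved;

                        *inode_extents += num_needed;
                }
        }

        int main(void)
        {
                unsigned long dio = 2, ino = 0;

                adjust(&dio, &ino, 1); /* covered: dio 2 -> 1 */
                adjust(&dio, &ino, 3); /* split: shortfall of 2 hits inode */
                printf("dio=%lu inode=%lu\n", dio, ino);
                return 0;
        }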
isector = start - btrfs_io_bio(failed_bio)->logical; isector >>= inode->i_sb->s_blocksize_bits; @@ -7982,7 +7975,7 @@ static void btrfs_retry_endio_nocsum(struct bio *bio) ASSERT(bio->bi_vcnt == 1); inode = bio->bi_io_vec->bv_page->mapping->host; - ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize); + ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode)); done->uptodate = 1; bio_for_each_segment_all(bvec, bio, i) @@ -8006,7 +7999,7 @@ static int __btrfs_correct_data_nocsum(struct inode *inode, int ret; fs_info = BTRFS_I(inode)->root->fs_info; - sectorsize = BTRFS_I(inode)->root->sectorsize; + sectorsize = fs_info->sectorsize; start = io_bio->logical; done.inode = inode; @@ -8065,7 +8058,7 @@ static void btrfs_retry_endio(struct bio *bio) ASSERT(bio->bi_vcnt == 1); inode = bio->bi_io_vec->bv_page->mapping->host; - ASSERT(bio->bi_io_vec->bv_len == BTRFS_I(inode)->root->sectorsize); + ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(inode)); bio_for_each_segment_all(bvec, bio, i) { ret = __readpage_endio_check(done->inode, io_bio, i, @@ -8100,7 +8093,7 @@ static int __btrfs_subio_endio_read(struct inode *inode, int ret; fs_info = BTRFS_I(inode)->root->fs_info; - sectorsize = BTRFS_I(inode)->root->sectorsize; + sectorsize = fs_info->sectorsize; err = 0; start = io_bio->logical; @@ -8197,7 +8190,7 @@ static void btrfs_endio_direct_write_update_ordered(struct inode *inode, const u64 bytes, const int uptodate) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_extent *ordered = NULL; u64 ordered_offset = offset; u64 ordered_bytes = bytes; @@ -8213,8 +8206,7 @@ again: btrfs_init_work(&ordered->work, btrfs_endio_write_helper, finish_ordered_fn, NULL, NULL); - btrfs_queue_work(root->fs_info->endio_write_workers, - &ordered->work); + btrfs_queue_work(fs_info->endio_write_workers, &ordered->work); out_test: /* * our bio might span multiple ordered extents. If we haven't @@ -8249,8 +8241,7 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, unsigned long bio_flags, u64 offset) { int ret; - struct btrfs_root *root = BTRFS_I(inode)->root; - ret = btrfs_csum_one_bio(root, inode, bio, offset, 1); + ret = btrfs_csum_one_bio(inode, bio, offset, 1); BUG_ON(ret); /* -ENOMEM */ return 0; } @@ -8304,8 +8295,7 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, return bio; } -static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root, - struct inode *inode, +static inline int btrfs_lookup_and_bind_dio_csum(struct inode *inode, struct btrfs_dio_private *dip, struct bio *bio, u64 file_offset) @@ -8320,7 +8310,7 @@ static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root, * contention. 
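The comment ending here belongs to btrfs_lookup_and_bind_dio_csum(): checksums for a direct-IO read are looked up once, when the bio being submitted starts at the dip's logical offset, and later split bios reuse that data. A minimal sketch of the one-lookup-then-index idea; the names and csum layout are illustrative, not the kernel's:

        #include <stdint.h>
        #include <stdio.h>

        #define CSUM_SIZE 4u          /* bytes of csum per block, illustrative */
        #define BLOCKSIZE 4096u

        struct dio_private {
                uint64_t logical_offset;   /* start of the whole direct-IO */
                uint8_t  csums[64];        /* filled once for the full range */
        };

        /* Bind a split bio to its slice of the already-loaded csum array. */
        static const uint8_t *bind_csums(struct dio_private *dip,
                                         uint64_t file_offset)
        {
                uint64_t bio_offset = file_offset - dip->logical_offset;

                return dip->csums + (bio_offset / BLOCKSIZE) * CSUM_SIZE;
        }

        int main(void)
        {
                struct dio_private dip = { .logical_offset = 8192 };

                /* first split bio at the dip offset, second one block later */
                printf("%td %td\n",
                       bind_csums(&dip, 8192)  - dip.csums,
                       bind_csums(&dip, 12288) - dip.csums);
                return 0;
        }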
*/ if (dip->logical_offset == file_offset) { - ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio, + ret = btrfs_lookup_bio_sums_dio(inode, dip->orig_bio, file_offset); if (ret) return ret; @@ -8340,9 +8330,9 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset, int skip_sum, int async_submit) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_private *dip = bio->bi_private; bool write = bio_op(bio) == REQ_OP_WRITE; - struct btrfs_root *root = BTRFS_I(inode)->root; int ret; if (async_submit) @@ -8351,8 +8341,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, bio_get(bio); if (!write) { - ret = btrfs_bio_wq_end_io(root->fs_info, bio, - BTRFS_WQ_ENDIO_DATA); + ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); if (ret) goto err; } @@ -8361,27 +8350,27 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, goto map; if (write && async_submit) { - ret = btrfs_wq_submit_bio(root->fs_info, - inode, bio, 0, 0, file_offset, - __btrfs_submit_bio_start_direct_io, - __btrfs_submit_bio_done); + ret = btrfs_wq_submit_bio(fs_info, inode, bio, 0, 0, + file_offset, + __btrfs_submit_bio_start_direct_io, + __btrfs_submit_bio_done); goto err; } else if (write) { /* * If we aren't doing async submit, calculate the csum of the * bio now. */ - ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1); + ret = btrfs_csum_one_bio(inode, bio, file_offset, 1); if (ret) goto err; } else { - ret = btrfs_lookup_and_bind_dio_csum(root, inode, dip, bio, + ret = btrfs_lookup_and_bind_dio_csum(inode, dip, bio, file_offset); if (ret) goto err; } map: - ret = btrfs_map_bio(root, bio, 0, async_submit); + ret = btrfs_map_bio(fs_info, bio, 0, async_submit); err: bio_put(bio); return ret; @@ -8391,23 +8380,24 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, int skip_sum) { struct inode *inode = dip->inode; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct bio *bio; struct bio *orig_bio = dip->orig_bio; - struct bio_vec *bvec = orig_bio->bi_io_vec; + struct bio_vec *bvec; u64 start_sector = orig_bio->bi_iter.bi_sector; u64 file_offset = dip->logical_offset; u64 submit_len = 0; u64 map_length; - u32 blocksize = root->sectorsize; + u32 blocksize = fs_info->sectorsize; int async_submit = 0; int nr_sectors; int ret; - int i; + int i, j; map_length = orig_bio->bi_iter.bi_size; - ret = btrfs_map_block(root->fs_info, bio_op(orig_bio), - start_sector << 9, &map_length, NULL, 0); + ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9, + &map_length, NULL, 0); if (ret) return -EIO; @@ -8427,14 +8417,14 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, if (!bio) return -ENOMEM; - bio_set_op_attrs(bio, bio_op(orig_bio), bio_flags(orig_bio)); + bio->bi_opf = orig_bio->bi_opf; bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; atomic_inc(&dip->pending_bios); - while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { - nr_sectors = BTRFS_BYTES_TO_BLKS(root->fs_info, bvec->bv_len); + bio_for_each_segment_all(bvec, orig_bio, j) { + nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len); i = 0; next_block: if (unlikely(map_length < submit_len + blocksize || @@ -8465,14 +8455,13 @@ next_block: start_sector, GFP_NOFS); if (!bio) goto out_err; - bio_set_op_attrs(bio, bio_op(orig_bio), - bio_flags(orig_bio)); + bio->bi_opf = 
orig_bio->bi_opf; bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; map_length = orig_bio->bi_iter.bi_size; - ret = btrfs_map_block(root->fs_info, bio_op(orig_bio), + ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9, &map_length, NULL, 0); if (ret) { @@ -8487,7 +8476,6 @@ next_block: i++; goto next_block; } - bvec++; } } @@ -8619,12 +8607,13 @@ free_ordered: kfree(dip); } -static ssize_t check_direct_IO(struct btrfs_root *root, struct kiocb *iocb, - const struct iov_iter *iter, loff_t offset) +static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, + struct kiocb *iocb, + const struct iov_iter *iter, loff_t offset) { int seg; int i; - unsigned blocksize_mask = root->sectorsize - 1; + unsigned int blocksize_mask = fs_info->sectorsize - 1; ssize_t retval = -EINVAL; if (offset & blocksize_mask) @@ -8656,7 +8645,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_data dio_data = { 0 }; loff_t offset = iocb->ki_pos; size_t count = 0; @@ -8665,7 +8654,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) bool relock = false; ssize_t ret; - if (check_direct_IO(BTRFS_I(inode)->root, iocb, iter, offset)) + if (check_direct_IO(fs_info, iocb, iter, offset)) return 0; inode_dio_begin(inode); @@ -8705,10 +8694,12 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) * do the accounting properly if we go over the number we * originally calculated. Abuse current->journal_info for this. */ - dio_data.reserve = round_up(count, root->sectorsize); + dio_data.reserve = round_up(count, + fs_info->sectorsize); dio_data.unsubmitted_oe_range_start = (u64)offset; dio_data.unsubmitted_oe_range_end = (u64)offset; current->journal_info = &dio_data; + down_read(&BTRFS_I(inode)->dio_sem); } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags)) { inode_dio_end(inode); @@ -8717,10 +8708,11 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } ret = __blockdev_direct_IO(iocb, inode, - BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, + fs_info->fs_devices->latest_bdev, iter, btrfs_get_blocks_direct, NULL, btrfs_submit_direct, flags); if (iov_iter_rw(iter) == WRITE) { + up_read(&BTRFS_I(inode)->dio_sem); current->journal_info = NULL; if (ret < 0 && ret != -EIOCBQUEUED) { if (dio_data.reserve) @@ -8976,7 +8968,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *page = vmf->page; struct inode *inode = file_inode(vma->vm_file); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; @@ -9051,7 +9043,8 @@ again: } if (page->index == ((size - 1) >> PAGE_SHIFT)) { - reserved_space = round_up(size - page_start, root->sectorsize); + reserved_space = round_up(size - page_start, + fs_info->sectorsize); if (reserved_space < PAGE_SIZE) { end = page_start + reserved_space - 1; spin_lock(&BTRFS_I(inode)->lock); @@ -9100,7 +9093,7 @@ again: set_page_dirty(page); SetPageUptodate(page); - BTRFS_I(inode)->last_trans = root->fs_info->generation; + BTRFS_I(inode)->last_trans = fs_info->generation; 
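Two related hunks in this stretch move BTRFS_I(inode)->dio_sem outward: the down_read/up_read pair deleted from btrfs_create_dio_extent() earlier is replaced by a single down_read in btrfs_direct_IO() around __blockdev_direct_IO() for writes, so the read side of the semaphore covers the entire submission rather than each extent creation. A compilable model of that widening, using a pthread rwlock as a stand-in:

        #include <pthread.h>
        #include <stdio.h>

        static pthread_rwlock_t dio_sem = PTHREAD_RWLOCK_INITIALIZER;

        static void create_one_extent(int i)
        {
                printf("extent %d\n", i); /* per-extent work, unlocked now */
        }

        static void submit_dio_write(int nr_extents)
        {
                pthread_rwlock_rdlock(&dio_sem); /* was taken per extent */
                for (int i = 0; i < nr_extents; i++)
                        create_one_extent(i);
                pthread_rwlock_unlock(&dio_sem);
        }

        int main(void)
        {
                submit_dio_write(3);
                return 0;
        }

Holding the lock across the whole write keeps any writer-side exclusion (the semaphore's down_write path) from slipping in between the split extents of one direct-IO.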
BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; @@ -9121,13 +9114,14 @@ out_noreserve: static int btrfs_truncate(struct inode *inode) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv; int ret = 0; int err = 0; struct btrfs_trans_handle *trans; - u64 mask = root->sectorsize - 1; - u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); + u64 mask = fs_info->sectorsize - 1; + u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1); ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); @@ -9170,7 +9164,7 @@ static int btrfs_truncate(struct inode *inode) * 3) fs_info->trans_block_rsv - this will have 1 items worth left for * updating the inode. */ - rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); + rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); if (!rsv) return -ENOMEM; rsv->size = min_size; @@ -9187,7 +9181,7 @@ static int btrfs_truncate(struct inode *inode) } /* Migrate the slack space for the truncate to our reserve */ - ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv, + ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, 0); BUG_ON(ret); @@ -9210,15 +9204,15 @@ static int btrfs_truncate(struct inode *inode) break; } - trans->block_rsv = &root->fs_info->trans_block_rsv; + trans->block_rsv = &fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); if (ret) { err = ret; break; } - btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root); + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(fs_info); trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) { @@ -9227,7 +9221,8 @@ static int btrfs_truncate(struct inode *inode) break; } - ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, + btrfs_block_rsv_release(fs_info, rsv, -1); + ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, 0); BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; @@ -9241,16 +9236,16 @@ static int btrfs_truncate(struct inode *inode) } if (trans) { - trans->block_rsv = &root->fs_info->trans_block_rsv; + trans->block_rsv = &fs_info->trans_block_rsv; ret = btrfs_update_inode(trans, root, inode); if (ret && !err) err = ret; - ret = btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root); + ret = btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(fs_info); } out: - btrfs_free_block_rsv(root, rsv); + btrfs_free_block_rsv(fs_info, rsv); if (ret && !err) err = ret; @@ -9366,6 +9361,7 @@ static void btrfs_i_callback(struct rcu_head *head) void btrfs_destroy_inode(struct inode *inode) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_extent *ordered; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -9387,8 +9383,8 @@ void btrfs_destroy_inode(struct inode *inode) if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &BTRFS_I(inode)->runtime_flags)) { - btrfs_info(root->fs_info, "inode %llu still on the orphan list", - btrfs_ino(inode)); + btrfs_info(fs_info, "inode %llu still on the orphan list", + btrfs_ino(inode)); atomic_dec(&root->orphan_inodes); } @@ -9397,7 +9393,7 @@ void btrfs_destroy_inode(struct inode *inode) if (!ordered) break; else { - btrfs_err(root->fs_info, + btrfs_err(fs_info, "found ordered extent %llu %llu on inode cleanup", ordered->file_offset, ordered->len); btrfs_remove_ordered_extent(inode, ordered); @@ -9509,6 +9505,7 @@ static 
int btrfs_rename_exchange(struct inode *old_dir, struct inode *new_dir, struct dentry *new_dentry) { + struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(old_dir)->root; struct btrfs_root *dest = BTRFS_I(new_dir)->root; @@ -9531,9 +9528,9 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* close the race window with snapshot create/destroy ioctl */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) - down_read(&root->fs_info->subvol_sem); + down_read(&fs_info->subvol_sem); if (new_ino == BTRFS_FIRST_FREE_OBJECTID) - down_read(&dest->fs_info->subvol_sem); + down_read(&fs_info->subvol_sem); /* * We want to reserve the absolute worst case amount of items. So if @@ -9566,7 +9563,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* Reference for the source. */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { /* force full log commit if subvolume involved. */ - btrfs_set_log_full_commit(root->fs_info, trans); + btrfs_set_log_full_commit(fs_info, trans); } else { btrfs_pin_log_trans(root); root_log_pinned = true; @@ -9582,7 +9579,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* And now for the dest. */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { /* force full log commit if subvolume involved. */ - btrfs_set_log_full_commit(dest->fs_info, trans); + btrfs_set_log_full_commit(fs_info, trans); } else { btrfs_pin_log_trans(dest); dest_log_pinned = true; @@ -9696,12 +9693,12 @@ out_fail: * allow the tasks to sync it. */ if (ret && (root_log_pinned || dest_log_pinned)) { - if (btrfs_inode_in_log(old_dir, root->fs_info->generation) || - btrfs_inode_in_log(new_dir, root->fs_info->generation) || - btrfs_inode_in_log(old_inode, root->fs_info->generation) || + if (btrfs_inode_in_log(old_dir, fs_info->generation) || + btrfs_inode_in_log(new_dir, fs_info->generation) || + btrfs_inode_in_log(old_inode, fs_info->generation) || (new_inode && - btrfs_inode_in_log(new_inode, root->fs_info->generation))) - btrfs_set_log_full_commit(root->fs_info, trans); + btrfs_inode_in_log(new_inode, fs_info->generation))) + btrfs_set_log_full_commit(fs_info, trans); if (root_log_pinned) { btrfs_end_log_trans(root); @@ -9712,12 +9709,12 @@ out_fail: dest_log_pinned = false; } } - ret = btrfs_end_transaction(trans, root); + ret = btrfs_end_transaction(trans); out_notrans: if (new_ino == BTRFS_FIRST_FREE_OBJECTID) - up_read(&dest->fs_info->subvol_sem); + up_read(&fs_info->subvol_sem); if (old_ino == BTRFS_FIRST_FREE_OBJECTID) - up_read(&root->fs_info->subvol_sem); + up_read(&fs_info->subvol_sem); return ret; } @@ -9777,6 +9774,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { + struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); struct btrfs_trans_handle *trans; unsigned int trans_num_items; struct btrfs_root *root = BTRFS_I(old_dir)->root; @@ -9833,7 +9831,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* close the racy window with snapshot create/destroy ioctl */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) - down_read(&root->fs_info->subvol_sem); + down_read(&fs_info->subvol_sem); /* * We want to reserve the absolute worst case amount of items. 
So if * both inodes are subvols and we need to unlink them then that would @@ -9864,7 +9862,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, BTRFS_I(old_inode)->dir_index = 0ULL; if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { /* force full log commit if subvolume involved. */ - btrfs_set_log_full_commit(root->fs_info, trans); + btrfs_set_log_full_commit(fs_info, trans); } else { btrfs_pin_log_trans(root); log_pinned = true; @@ -9971,20 +9969,20 @@ out_fail: * allow the tasks to sync it. */ if (ret && log_pinned) { - if (btrfs_inode_in_log(old_dir, root->fs_info->generation) || - btrfs_inode_in_log(new_dir, root->fs_info->generation) || - btrfs_inode_in_log(old_inode, root->fs_info->generation) || + if (btrfs_inode_in_log(old_dir, fs_info->generation) || + btrfs_inode_in_log(new_dir, fs_info->generation) || + btrfs_inode_in_log(old_inode, fs_info->generation) || (new_inode && - btrfs_inode_in_log(new_inode, root->fs_info->generation))) - btrfs_set_log_full_commit(root->fs_info, trans); + btrfs_inode_in_log(new_inode, fs_info->generation))) + btrfs_set_log_full_commit(fs_info, trans); btrfs_end_log_trans(root); log_pinned = false; } - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) - up_read(&root->fs_info->subvol_sem); + up_read(&fs_info->subvol_sem); return ret; } @@ -10119,9 +10117,10 @@ out: int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) { + struct btrfs_fs_info *fs_info = root->fs_info; int ret; - if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; ret = __start_delalloc_inodes(root, delay_iput, -1); @@ -10132,14 +10131,14 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) * we have to make sure the IO is actually started and that * ordered extents get created before we return */ - atomic_inc(&root->fs_info->async_submit_draining); - while (atomic_read(&root->fs_info->nr_async_submits) || - atomic_read(&root->fs_info->async_delalloc_pages)) { - wait_event(root->fs_info->async_submit_wait, - (atomic_read(&root->fs_info->nr_async_submits) == 0 && - atomic_read(&root->fs_info->async_delalloc_pages) == 0)); - } - atomic_dec(&root->fs_info->async_submit_draining); + atomic_inc(&fs_info->async_submit_draining); + while (atomic_read(&fs_info->nr_async_submits) || + atomic_read(&fs_info->async_delalloc_pages)) { + wait_event(fs_info->async_submit_wait, + (atomic_read(&fs_info->nr_async_submits) == 0 && + atomic_read(&fs_info->async_delalloc_pages) == 0)); + } + atomic_dec(&fs_info->async_submit_draining); return ret; } @@ -10202,6 +10201,7 @@ out: static int btrfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { + struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_path *path; @@ -10218,7 +10218,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, struct extent_buffer *leaf; name_len = strlen(symname); - if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) + if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) return -ENAMETOOLONG; /* @@ -10312,12 +10312,12 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, d_instantiate(dentry, inode); out_unlock: - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); if (drop_inode) { inode_dec_link_count(inode); iput(inode); } - 
btrfs_btree_balance_dirty(root); + btrfs_btree_balance_dirty(fs_info); return err; out_unlock_inode: @@ -10331,6 +10331,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, loff_t actual_len, u64 *alloc_hint, struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_map *em; struct btrfs_root *root = BTRFS_I(inode)->root; @@ -10367,10 +10368,10 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, min_size, 0, *alloc_hint, &ins, 1, 0); if (ret) { if (own_trans) - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); break; } - btrfs_dec_block_group_reservations(root->fs_info, ins.objectid); + btrfs_dec_block_group_reservations(fs_info, ins.objectid); last_alloc = ins.offset; ret = insert_reserved_file_extent(trans, inode, @@ -10379,11 +10380,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, ins.offset, 0, 0, 0, BTRFS_FILE_EXTENT_PREALLOC); if (ret) { - btrfs_free_reserved_extent(root, ins.objectid, + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); btrfs_abort_transaction(trans, ret); if (own_trans) - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); break; } @@ -10404,7 +10405,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, em->block_len = ins.offset; em->orig_block_len = ins.offset; em->ram_bytes = ins.offset; - em->bdev = root->fs_info->fs_devices->latest_bdev; + em->bdev = fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PREALLOC, &em->flags); em->generation = trans->transid; @@ -10443,12 +10444,12 @@ next: if (ret) { btrfs_abort_transaction(trans, ret); if (own_trans) - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); break; } if (own_trans) - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); } if (cur_offset < end) btrfs_free_reserved_data_space(inode, cur_offset, @@ -10496,6 +10497,7 @@ static int btrfs_permission(struct inode *inode, int mask) static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { + struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = NULL; @@ -10552,11 +10554,11 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) mark_inode_dirty(inode); out: - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); if (ret) iput(inode); - btrfs_balance_delayed_items(root); - btrfs_btree_balance_dirty(root); + btrfs_balance_delayed_items(fs_info); + btrfs_btree_balance_dirty(fs_info); return ret; out_inode: @@ -10587,8 +10589,6 @@ static const struct inode_operations btrfs_dir_inode_operations = { static const struct inode_operations btrfs_dir_ro_inode_operations = { .lookup = btrfs_lookup, .permission = btrfs_permission, - .get_acl = btrfs_get_acl, - .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; @@ -10668,7 +10668,6 @@ static const struct inode_operations btrfs_special_inode_operations = { .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = page_get_link, .getattr = btrfs_getattr, .setattr = btrfs_setattr, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7acbd2cf6192..21e51b0ba188 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -33,7 +33,6 @@ #include <linux/namei.h> #include 
<linux/swap.h> #include <linux/writeback.h> -#include <linux/statfs.h> #include <linux/compat.h> #include <linux/bit_spinlock.h> #include <linux/security.h> @@ -216,6 +215,7 @@ static int check_flags(unsigned int flags) static int btrfs_ioctl_setflags(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_inode *ip = BTRFS_I(inode); struct btrfs_root *root = ip->root; struct btrfs_trans_handle *trans; @@ -325,7 +325,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) ip->flags |= BTRFS_INODE_COMPRESS; ip->flags &= ~BTRFS_INODE_NOCOMPRESS; - if (root->fs_info->compress_type == BTRFS_COMPRESS_LZO) + if (fs_info->compress_type == BTRFS_COMPRESS_LZO) comp = "lzo"; else comp = "zlib"; @@ -352,7 +352,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) inode->i_ctime = current_time(inode); ret = btrfs_update_inode(trans, root, inode); - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); out_drop: if (ret) { ip->flags = ip_oldflags; @@ -374,7 +374,8 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg) static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) { - struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb); + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_device *device; struct request_queue *q; struct fstrim_range range; @@ -410,7 +411,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) range.len = min(range.len, total_bytes - range.start); range.minlen = max(range.minlen, minlen); - ret = btrfs_trim_fs(fs_info->tree_root, &range); + ret = btrfs_trim_fs(fs_info, &range); if (ret < 0) return ret; @@ -437,6 +438,7 @@ static noinline int create_subvol(struct inode *dir, u64 *async_transid, struct btrfs_qgroup_inherit *inherit) { + struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_trans_handle *trans; struct btrfs_key key; struct btrfs_root_item *root_item; @@ -459,7 +461,7 @@ static noinline int create_subvol(struct inode *dir, if (!root_item) return -ENOMEM; - ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); + ret = btrfs_find_free_objectid(fs_info->tree_root, &objectid); if (ret) goto fail_free; @@ -485,14 +487,14 @@ static noinline int create_subvol(struct inode *dir, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_subvolume_release_metadata(root, &block_rsv, + btrfs_subvolume_release_metadata(fs_info, &block_rsv, qgroup_reserved); goto fail_free; } trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; - ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, inherit); + ret = btrfs_qgroup_inherit(trans, fs_info, 0, objectid, inherit); if (ret) goto fail; @@ -502,24 +504,22 @@ static noinline int create_subvol(struct inode *dir, goto fail; } - memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header)); + memzero_extent_buffer(leaf, 0, sizeof(struct btrfs_header)); btrfs_set_header_bytenr(leaf, leaf->start); btrfs_set_header_generation(leaf, trans->transid); btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(leaf, objectid); - write_extent_buffer(leaf, root->fs_info->fsid, btrfs_header_fsid(), - BTRFS_FSID_SIZE); - write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, - btrfs_header_chunk_tree_uuid(leaf), - BTRFS_UUID_SIZE); + write_extent_buffer_fsid(leaf, 
fs_info->fsid); + write_extent_buffer_chunk_tree_uuid(leaf, fs_info->chunk_tree_uuid); btrfs_mark_buffer_dirty(leaf); inode_item = &root_item->inode; btrfs_set_stack_inode_generation(inode_item, 1); btrfs_set_stack_inode_size(inode_item, 3); btrfs_set_stack_inode_nlink(inode_item, 1); - btrfs_set_stack_inode_nbytes(inode_item, root->nodesize); + btrfs_set_stack_inode_nbytes(inode_item, + fs_info->nodesize); btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); btrfs_set_root_flags(root_item, 0); @@ -552,13 +552,13 @@ static noinline int create_subvol(struct inode *dir, key.objectid = objectid; key.offset = 0; key.type = BTRFS_ROOT_ITEM_KEY; - ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, + ret = btrfs_insert_root(trans, fs_info->tree_root, &key, root_item); if (ret) goto fail; key.offset = (u64)-1; - new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); + new_root = btrfs_read_fs_root_no_name(fs_info, &key); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); btrfs_abort_transaction(trans, ret); @@ -599,14 +599,13 @@ static noinline int create_subvol(struct inode *dir, ret = btrfs_update_inode(trans, root, dir); BUG_ON(ret); - ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, + ret = btrfs_add_root_ref(trans, fs_info, objectid, root->root_key.objectid, btrfs_ino(dir), index, name, namelen); BUG_ON(ret); - ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root, - root_item->uuid, BTRFS_UUID_KEY_SUBVOL, - objectid); + ret = btrfs_uuid_tree_add(trans, fs_info, root_item->uuid, + BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) btrfs_abort_transaction(trans, ret); @@ -614,15 +613,15 @@ fail: kfree(root_item); trans->block_rsv = NULL; trans->bytes_reserved = 0; - btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); + btrfs_subvolume_release_metadata(fs_info, &block_rsv, qgroup_reserved); if (async_transid) { *async_transid = trans->transid; - err = btrfs_commit_transaction_async(trans, root, 1); + err = btrfs_commit_transaction_async(trans, 1); if (err) - err = btrfs_commit_transaction(trans, root); + err = btrfs_commit_transaction(trans); } else { - err = btrfs_commit_transaction(trans, root); + err = btrfs_commit_transaction(trans); } if (err && !ret) ret = err; @@ -662,6 +661,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, u64 *async_transid, bool readonly, struct btrfs_qgroup_inherit *inherit) { + struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; struct btrfs_trans_handle *trans; @@ -721,19 +721,17 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, goto fail; } - spin_lock(&root->fs_info->trans_lock); + spin_lock(&fs_info->trans_lock); list_add(&pending_snapshot->list, &trans->transaction->pending_snapshots); - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); if (async_transid) { *async_transid = trans->transid; - ret = btrfs_commit_transaction_async(trans, - root->fs_info->extent_root, 1); + ret = btrfs_commit_transaction_async(trans, 1); if (ret) - ret = btrfs_commit_transaction(trans, root); + ret = btrfs_commit_transaction(trans); } else { - ret = btrfs_commit_transaction(trans, - root->fs_info->extent_root); + ret = btrfs_commit_transaction(trans); } if (ret) goto fail; @@ -755,7 +753,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, d_instantiate(dentry, inode); ret = 0; fail: - btrfs_subvolume_release_metadata(BTRFS_I(dir)->root, + 
btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv, pending_snapshot->qgroup_reserved); dec_and_free: @@ -836,13 +834,14 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) * sys_mkdirat and vfs_mkdir, but we only do a single component lookup * inside this filesystem so it's quite a bit simpler. */ -static noinline int btrfs_mksubvol(struct path *parent, +static noinline int btrfs_mksubvol(const struct path *parent, char *name, int namelen, struct btrfs_root *snap_src, u64 *async_transid, bool readonly, struct btrfs_qgroup_inherit *inherit) { - struct inode *dir = d_inode(parent->dentry); + struct inode *dir = d_inode(parent->dentry); + struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct dentry *dentry; int error; @@ -869,7 +868,7 @@ static noinline int btrfs_mksubvol(struct path *parent, if (error) goto out_dput; - down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); + down_read(&fs_info->subvol_sem); if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0) goto out_up_read; @@ -884,7 +883,7 @@ static noinline int btrfs_mksubvol(struct path *parent, if (!error) fsnotify_mkdir(dir, dentry); out_up_read: - up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem); + up_read(&fs_info->subvol_sem); out_dput: dput(dentry); out_unlock: @@ -1268,6 +1267,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct file_ra_state *ra = NULL; unsigned long last_index; @@ -1365,8 +1365,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (!(inode->i_sb->s_flags & MS_ACTIVE)) break; - if (btrfs_defrag_cancelled(root->fs_info)) { - btrfs_debug(root->fs_info, "defrag_file cancelled"); + if (btrfs_defrag_cancelled(fs_info)) { + btrfs_debug(fs_info, "defrag_file cancelled"); ret = -EAGAIN; break; } @@ -1454,18 +1454,18 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, * we have to make sure the IO is actually started and that * ordered extents get created before we return */ - atomic_inc(&root->fs_info->async_submit_draining); - while (atomic_read(&root->fs_info->nr_async_submits) || - atomic_read(&root->fs_info->async_delalloc_pages)) { - wait_event(root->fs_info->async_submit_wait, - (atomic_read(&root->fs_info->nr_async_submits) == 0 && - atomic_read(&root->fs_info->async_delalloc_pages) == 0)); + atomic_inc(&fs_info->async_submit_draining); + while (atomic_read(&fs_info->nr_async_submits) || + atomic_read(&fs_info->async_delalloc_pages)) { + wait_event(fs_info->async_submit_wait, + (atomic_read(&fs_info->nr_async_submits) == 0 && + atomic_read(&fs_info->async_delalloc_pages) == 0)); } - atomic_dec(&root->fs_info->async_submit_draining); + atomic_dec(&fs_info->async_submit_draining); } if (range->compress_type == BTRFS_COMPRESS_LZO) { - btrfs_set_fs_incompat(root->fs_info, COMPRESS_LZO); + btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); } ret = defrag_count; @@ -1485,10 +1485,12 @@ out_ra: static noinline int btrfs_ioctl_resize(struct file *file, void __user *arg) { + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 new_size; u64 old_size; u64 devid = 1; - struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_vol_args *vol_args; struct btrfs_trans_handle *trans; struct btrfs_device *device = 
NULL; @@ -1505,13 +1507,12 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (ret) return ret; - if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, - 1)) { + if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { mnt_drop_write_file(file); return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } - mutex_lock(&root->fs_info->volume_mutex); + mutex_lock(&fs_info->volume_mutex); vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); @@ -1533,19 +1534,19 @@ static noinline int btrfs_ioctl_resize(struct file *file, ret = -EINVAL; goto out_free; } - btrfs_info(root->fs_info, "resizing devid %llu", devid); + btrfs_info(fs_info, "resizing devid %llu", devid); } - device = btrfs_find_device(root->fs_info, devid, NULL, NULL); + device = btrfs_find_device(fs_info, devid, NULL, NULL); if (!device) { - btrfs_info(root->fs_info, "resizer unable to find device %llu", - devid); + btrfs_info(fs_info, "resizer unable to find device %llu", + devid); ret = -ENODEV; goto out_free; } if (!device->writeable) { - btrfs_info(root->fs_info, + btrfs_info(fs_info, "resizer unable to apply on readonly device %llu", devid); ret = -EPERM; @@ -1599,11 +1600,11 @@ static noinline int btrfs_ioctl_resize(struct file *file, goto out_free; } - new_size = div_u64(new_size, root->sectorsize); - new_size *= root->sectorsize; + new_size = div_u64(new_size, fs_info->sectorsize); + new_size *= fs_info->sectorsize; - btrfs_info_in_rcu(root->fs_info, "new size for %s is %llu", - rcu_str_deref(device->name), new_size); + btrfs_info_in_rcu(fs_info, "new size for %s is %llu", + rcu_str_deref(device->name), new_size); if (new_size > old_size) { trans = btrfs_start_transaction(root, 0); @@ -1612,7 +1613,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, goto out_free; } ret = btrfs_grow_device(trans, device, new_size); - btrfs_commit_transaction(trans, root); + btrfs_commit_transaction(trans); } else if (new_size < old_size) { ret = btrfs_shrink_device(device, new_size); } /* equal, nothing need to do */ @@ -1620,8 +1621,8 @@ static noinline int btrfs_ioctl_resize(struct file *file, out_free: kfree(vol_args); out: - mutex_unlock(&root->fs_info->volume_mutex); - atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + mutex_unlock(&fs_info->volume_mutex); + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); mnt_drop_write_file(file); return ret; } @@ -1774,6 +1775,7 @@ static noinline int btrfs_ioctl_subvol_getflags(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; int ret = 0; u64 flags = 0; @@ -1781,10 +1783,10 @@ static noinline int btrfs_ioctl_subvol_getflags(struct file *file, if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) return -EINVAL; - down_read(&root->fs_info->subvol_sem); + down_read(&fs_info->subvol_sem); if (btrfs_root_readonly(root)) flags |= BTRFS_SUBVOL_RDONLY; - up_read(&root->fs_info->subvol_sem); + up_read(&fs_info->subvol_sem); if (copy_to_user(arg, &flags, sizeof(flags))) ret = -EFAULT; @@ -1796,6 +1798,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, void __user *arg) { struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; u64 root_flags; @@ -1829,7 +1832,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file 
*file, goto out_drop_write; } - down_write(&root->fs_info->subvol_sem); + down_write(&fs_info->subvol_sem); /* nothing to do */ if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root)) @@ -1851,9 +1854,9 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, spin_unlock(&root->root_item_lock); } else { spin_unlock(&root->root_item_lock); - btrfs_warn(root->fs_info, - "Attempt to set subvolume %llu read-write during send", - root->root_key.objectid); + btrfs_warn(fs_info, + "Attempt to set subvolume %llu read-write during send", + root->root_key.objectid); ret = -EPERM; goto out_drop_sem; } @@ -1865,15 +1868,15 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, goto out_reset; } - ret = btrfs_update_root(trans, root->fs_info->tree_root, + ret = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); - btrfs_commit_transaction(trans, root); + btrfs_commit_transaction(trans); out_reset: if (ret) btrfs_set_root_flags(&root->root_item, root_flags); out_drop_sem: - up_write(&root->fs_info->subvol_sem); + up_write(&fs_info->subvol_sem); out_drop_write: mnt_drop_write_file(file); out: @@ -1885,6 +1888,7 @@ out: */ static noinline int may_destroy_subvol(struct btrfs_root *root) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct btrfs_dir_item *di; struct btrfs_key key; @@ -1896,14 +1900,14 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) return -ENOMEM; /* Make sure this root isn't set as the default subvol */ - dir_id = btrfs_super_root_dir(root->fs_info->super_copy); - di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, path, + dir_id = btrfs_super_root_dir(fs_info->super_copy); + di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path, dir_id, "default", 7, 0); if (di && !IS_ERR(di)) { btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); if (key.objectid == root->root_key.objectid) { ret = -EPERM; - btrfs_err(root->fs_info, + btrfs_err(fs_info, "deleting default subvolume %llu is not allowed", key.objectid); goto out; @@ -1915,8 +1919,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) key.type = BTRFS_ROOT_REF_KEY; key.offset = (u64)-1; - ret = btrfs_search_slot(NULL, root->fs_info->tree_root, - &key, path, 0, 0); + ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) goto out; BUG_ON(ret == 0); @@ -2087,10 +2090,10 @@ static noinline int search_ioctl(struct inode *inode, size_t *buf_size, char __user *ubuf) { + struct btrfs_fs_info *info = btrfs_sb(inode->i_sb); struct btrfs_root *root; struct btrfs_key key; struct btrfs_path *path; - struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info; int ret; int num_found = 0; unsigned long sk_offset = 0; @@ -2353,6 +2356,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, void __user *arg) { struct dentry *parent = file->f_path.dentry; + struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb); struct dentry *dentry; struct inode *dir = d_inode(parent); struct inode *inode; @@ -2418,7 +2422,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, * rmdir(2). 
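The lines that follow keep the -EPERM default for unprivileged subvolume deletion unless the filesystem was mounted with user_subvol_rm_allowed, now tested through the cached fs_info. A small model of the gate; the option bit here is a stand-in, not the kernel's BTRFS_MOUNT_* value:

        #include <errno.h>
        #include <stdio.h>

        #define MNT_USER_SUBVOL_RM_ALLOWED (1u << 0) /* stand-in option bit */

        /* Shape of the gate in btrfs_ioctl_snap_destroy(): refuse an
         * unprivileged delete unless the mount option was set. */
        static int may_user_destroy(unsigned int mount_opts)
        {
                if (!(mount_opts & MNT_USER_SUBVOL_RM_ALLOWED))
                        return -EPERM;
                return 0;
        }

        int main(void)
        {
                printf("%d %d\n", may_user_destroy(0),
                       may_user_destroy(MNT_USER_SUBVOL_RM_ALLOWED));
                return 0;
        }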
*/ err = -EPERM; - if (!btrfs_test_opt(root->fs_info, USER_SUBVOL_RM_ALLOWED)) + if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED)) goto out_dput; /* @@ -2462,14 +2466,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, spin_unlock(&dest->root_item_lock); } else { spin_unlock(&dest->root_item_lock); - btrfs_warn(root->fs_info, - "Attempt to delete subvolume %llu during send", - dest->root_key.objectid); + btrfs_warn(fs_info, + "Attempt to delete subvolume %llu during send", + dest->root_key.objectid); err = -EPERM; goto out_unlock_inode; } - down_write(&root->fs_info->subvol_sem); + down_write(&fs_info->subvol_sem); err = may_destroy_subvol(dest); if (err) @@ -2514,7 +2518,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { ret = btrfs_insert_orphan_item(trans, - root->fs_info->tree_root, + fs_info->tree_root, dest->root_key.objectid); if (ret) { btrfs_abort_transaction(trans, ret); @@ -2523,8 +2527,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, } } - ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root, - dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL, + ret = btrfs_uuid_tree_rem(trans, fs_info, dest->root_item.uuid, + BTRFS_UUID_KEY_SUBVOL, dest->root_key.objectid); if (ret && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); @@ -2532,7 +2536,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out_end_trans; } if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) { - ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root, + ret = btrfs_uuid_tree_rem(trans, fs_info, dest->root_item.received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, dest->root_key.objectid); @@ -2546,14 +2550,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, out_end_trans: trans->block_rsv = NULL; trans->bytes_reserved = 0; - ret = btrfs_end_transaction(trans, root); + ret = btrfs_end_transaction(trans); if (ret && !err) err = ret; inode->i_flags |= S_DEAD; out_release: - btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); + btrfs_subvolume_release_metadata(fs_info, &block_rsv, qgroup_reserved); out_up_write: - up_write(&root->fs_info->subvol_sem); + up_write(&fs_info->subvol_sem); if (err) { spin_lock(&dest->root_item_lock); root_flags = btrfs_root_flags(&dest->root_item); @@ -2655,7 +2659,7 @@ out: return ret; } -static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_vol_args *vol_args; int ret; @@ -2663,12 +2667,10 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, - 1)) { + if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; - } - mutex_lock(&root->fs_info->volume_mutex); + mutex_lock(&fs_info->volume_mutex); vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); @@ -2676,21 +2678,22 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - ret = btrfs_init_new_device(root, vol_args->name); + ret = btrfs_init_new_device(fs_info, vol_args->name); if (!ret) - btrfs_info(root->fs_info, "disk added %s",vol_args->name); + btrfs_info(fs_info, "disk added %s", vol_args->name); kfree(vol_args); out: - 
mutex_unlock(&root->fs_info->volume_mutex); - atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + mutex_unlock(&fs_info->volume_mutex); + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); return ret; } static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args_v2 *vol_args; int ret; @@ -2711,28 +2714,27 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) return -EOPNOTSUPP; - if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, - 1)) { + if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out; } - mutex_lock(&root->fs_info->volume_mutex); + mutex_lock(&fs_info->volume_mutex); if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { - ret = btrfs_rm_device(root, NULL, vol_args->devid); + ret = btrfs_rm_device(fs_info, NULL, vol_args->devid); } else { vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; - ret = btrfs_rm_device(root, vol_args->name, 0); + ret = btrfs_rm_device(fs_info, vol_args->name, 0); } - mutex_unlock(&root->fs_info->volume_mutex); - atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + mutex_unlock(&fs_info->volume_mutex); + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); if (!ret) { if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) - btrfs_info(root->fs_info, "device deleted: id %llu", + btrfs_info(fs_info, "device deleted: id %llu", vol_args->devid); else - btrfs_info(root->fs_info, "device deleted: %s", + btrfs_info(fs_info, "device deleted: %s", vol_args->name); } out: @@ -2744,7 +2746,8 @@ err_drop: static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args *vol_args; int ret; @@ -2755,8 +2758,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (ret) return ret; - if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, - 1)) { + if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out_drop_write; } @@ -2768,26 +2770,27 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - mutex_lock(&root->fs_info->volume_mutex); - ret = btrfs_rm_device(root, vol_args->name, 0); - mutex_unlock(&root->fs_info->volume_mutex); + mutex_lock(&fs_info->volume_mutex); + ret = btrfs_rm_device(fs_info, vol_args->name, 0); + mutex_unlock(&fs_info->volume_mutex); if (!ret) - btrfs_info(root->fs_info, "disk deleted %s",vol_args->name); + btrfs_info(fs_info, "disk deleted %s", vol_args->name); kfree(vol_args); out: - atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); out_drop_write: mnt_drop_write_file(file); return ret; } -static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, + void __user *arg) { struct btrfs_ioctl_fs_info_args *fi_args; struct btrfs_device *device; - struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + struct btrfs_fs_devices 
*fs_devices = fs_info->fs_devices; int ret = 0; fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL); @@ -2796,7 +2799,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) mutex_lock(&fs_devices->device_list_mutex); fi_args->num_devices = fs_devices->num_devices; - memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid)); + memcpy(&fi_args->fsid, fs_info->fsid, sizeof(fi_args->fsid)); list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->devid > fi_args->max_id) @@ -2804,9 +2807,9 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) } mutex_unlock(&fs_devices->device_list_mutex); - fi_args->nodesize = root->fs_info->super_copy->nodesize; - fi_args->sectorsize = root->fs_info->super_copy->sectorsize; - fi_args->clone_alignment = root->fs_info->super_copy->sectorsize; + fi_args->nodesize = fs_info->super_copy->nodesize; + fi_args->sectorsize = fs_info->super_copy->sectorsize; + fi_args->clone_alignment = fs_info->super_copy->sectorsize; if (copy_to_user(arg, fi_args, sizeof(*fi_args))) ret = -EFAULT; @@ -2815,11 +2818,12 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) return ret; } -static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, + void __user *arg) { struct btrfs_ioctl_dev_info_args *di_args; struct btrfs_device *dev; - struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; int ret = 0; char *s_uuid = NULL; @@ -2831,7 +2835,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) s_uuid = di_args->uuid; mutex_lock(&fs_devices->device_list_mutex); - dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL); + dev = btrfs_find_device(fs_info, di_args->devid, s_uuid, NULL); if (!dev) { ret = -ENODEV; @@ -3305,10 +3309,10 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, root, inode); if (ret) { btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); goto out; } - ret = btrfs_end_transaction(trans, root); + ret = btrfs_end_transaction(trans); out: return ret; } @@ -3406,9 +3410,10 @@ static int clone_copy_inline_extent(struct inode *src, const u64 size, char *inline_data) { + struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb); struct btrfs_root *root = BTRFS_I(dst)->root; const u64 aligned_end = ALIGN(new_key->offset + datal, - root->sectorsize); + fs_info->sectorsize); int ret; struct btrfs_key key; @@ -3529,6 +3534,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, const u64 off, const u64 olen, const u64 olen_aligned, const u64 destoff, int no_time_update) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path = NULL; struct extent_buffer *leaf; @@ -3542,9 +3548,9 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u64 last_dest_end = destoff; ret = -ENOMEM; - buf = kmalloc(root->nodesize, GFP_KERNEL | __GFP_NOWARN); + buf = kmalloc(fs_info->nodesize, GFP_KERNEL | __GFP_NOWARN); if (!buf) { - buf = vmalloc(root->nodesize); + buf = vmalloc(fs_info->nodesize); if (!buf) return ret; } @@ -3707,7 +3713,7 @@ process_slot: if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); goto out; } @@ -3715,7 +3721,7 @@ 
process_slot: &new_key, size); if (ret) { btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); goto out; } @@ -3739,7 +3745,8 @@ process_slot: if (disko) { inode_add_bytes(inode, datal); - ret = btrfs_inc_extent_ref(trans, root, + ret = btrfs_inc_extent_ref(trans, + fs_info, disko, diskl, 0, root->root_key.objectid, btrfs_ino(inode), @@ -3747,8 +3754,7 @@ process_slot: if (ret) { btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans, - root); + btrfs_end_transaction(trans); goto out; } @@ -3767,7 +3773,7 @@ process_slot: if (comp && (skip || trim)) { ret = -EINVAL; - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); goto out; } size -= skip + trim; @@ -3783,7 +3789,7 @@ process_slot: if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); goto out; } leaf = path->nodes[0]; @@ -3802,7 +3808,7 @@ process_slot: btrfs_release_path(path); last_dest_end = ALIGN(new_key.offset + datal, - root->sectorsize); + fs_info->sectorsize); ret = clone_finish_inode_update(trans, inode, last_dest_end, destoff, olen, @@ -3843,7 +3849,7 @@ process_slot: if (ret) { if (ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); goto out; } clone_update_extent_map(inode, trans, NULL, last_dest_end, @@ -3863,10 +3869,11 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, { struct inode *inode = file_inode(file); struct inode *src = file_inode(file_src); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; int ret; u64 len = olen; - u64 bs = root->fs_info->sb->s_blocksize; + u64 bs = fs_info->sb->s_blocksize; int same_inode = src == inode; /* @@ -3980,18 +3987,6 @@ out_unlock: return ret; } -ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - size_t len, unsigned int flags) -{ - ssize_t ret; - - ret = btrfs_clone_files(file_out, file_in, pos_in, len, pos_out); - if (ret == 0) - ret = len; - return ret; -} - int btrfs_clone_file_range(struct file *src_file, loff_t off, struct file *dst_file, loff_t destoff, u64 len) { @@ -4007,6 +4002,7 @@ int btrfs_clone_file_range(struct file *src_file, loff_t off, static long btrfs_ioctl_trans_start(struct file *file) { struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; int ret; @@ -4027,7 +4023,7 @@ static long btrfs_ioctl_trans_start(struct file *file) if (ret) goto out; - atomic_inc(&root->fs_info->open_ioctl_trans); + atomic_inc(&fs_info->open_ioctl_trans); ret = -ENOMEM; trans = btrfs_start_ioctl_transaction(root); @@ -4038,7 +4034,7 @@ static long btrfs_ioctl_trans_start(struct file *file) return 0; out_drop: - atomic_dec(&root->fs_info->open_ioctl_trans); + atomic_dec(&fs_info->open_ioctl_trans); mnt_drop_write_file(file); out: return ret; @@ -4047,6 +4043,7 @@ out: static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) { struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root *new_root; struct btrfs_dir_item *di; @@ -4077,7 +4074,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) location.type = BTRFS_ROOT_ITEM_KEY; 
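The conversions running through these ioctl hunks all follow one shape: resolve struct btrfs_fs_info once from the file's superblock and use it directly, instead of chasing root->fs_info at every call site. A minimal sketch of that shape, built only from helpers visible in this diff (the handler name itself is hypothetical):

    /* Sketch only: the fs_info-first pattern this series applies. */
    static long btrfs_ioctl_example(struct file *file, void __user *arg)
    {
            struct inode *inode = file_inode(file);
            struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

            /* Per-filesystem state now comes straight from fs_info ... */
            mutex_lock(&fs_info->volume_mutex);
            /* ... rather than through the old root->fs_info chain. */
            mutex_unlock(&fs_info->volume_mutex);
            return 0;
    }

Besides being shorter, this drops one pointer hop per access and lets handlers that never need a root stop taking one at all.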
location.offset = (u64)-1; - new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); + new_root = btrfs_read_fs_root_no_name(fs_info, &location); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); goto out; @@ -4097,13 +4094,13 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) goto out; } - dir_id = btrfs_super_root_dir(root->fs_info->super_copy); - di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, + dir_id = btrfs_super_root_dir(fs_info->super_copy); + di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path, dir_id, "default", 7, 1); if (IS_ERR_OR_NULL(di)) { btrfs_free_path(path); - btrfs_end_transaction(trans, root); - btrfs_err(new_root->fs_info, + btrfs_end_transaction(trans); + btrfs_err(fs_info, "Umm, you don't have the default diritem, this isn't going to work"); ret = -ENOENT; goto out; @@ -4114,8 +4111,8 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL); - btrfs_end_transaction(trans, root); + btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL); + btrfs_end_transaction(trans); out: mnt_drop_write_file(file); return ret; @@ -4137,7 +4134,8 @@ void btrfs_get_block_group_info(struct list_head *groups_list, } } -static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, + void __user *arg) { struct btrfs_ioctl_space_args space_args; struct btrfs_ioctl_space_info space; @@ -4165,7 +4163,7 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) info = NULL; rcu_read_lock(); - list_for_each_entry_rcu(tmp, &root->fs_info->space_info, + list_for_each_entry_rcu(tmp, &fs_info->space_info, list) { if (tmp->flags == types[i]) { info = tmp; @@ -4221,7 +4219,7 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) info = NULL; rcu_read_lock(); - list_for_each_entry_rcu(tmp, &root->fs_info->space_info, + list_for_each_entry_rcu(tmp, &fs_info->space_info, list) { if (tmp->flags == types[i]) { info = tmp; @@ -4252,7 +4250,7 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) * Add global block reserve */ if (slot_count) { - struct btrfs_block_rsv *block_rsv = &root->fs_info->global_block_rsv; + struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; spin_lock(&block_rsv->lock); space.total_bytes = block_rsv->size; @@ -4294,7 +4292,7 @@ long btrfs_ioctl_trans_end(struct file *file) return -EINVAL; file->private_data = NULL; - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); atomic_dec(&root->fs_info->open_ioctl_trans); @@ -4319,9 +4317,9 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, goto out; } transid = trans->transid; - ret = btrfs_commit_transaction_async(trans, root, 0); + ret = btrfs_commit_transaction_async(trans, 0); if (ret) { - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); return ret; } out: @@ -4331,7 +4329,7 @@ out: return 0; } -static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root, +static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info, void __user *argp) { u64 transid; @@ -4342,12 +4340,12 @@ static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root, } else { transid = 0; /* current trans */ } - return btrfs_wait_for_commit(root, transid); + return btrfs_wait_for_commit(fs_info, transid); } static long 
btrfs_ioctl_scrub(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb); struct btrfs_ioctl_scrub_args *sa; int ret; @@ -4364,7 +4362,7 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg) goto out; } - ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end, + ret = btrfs_scrub_dev(fs_info, sa->devid, sa->start, sa->end, &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, 0); @@ -4378,15 +4376,15 @@ out: return ret; } -static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_scrub_cancel(struct btrfs_fs_info *fs_info) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; - return btrfs_scrub_cancel(root->fs_info); + return btrfs_scrub_cancel(fs_info); } -static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, +static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_scrub_args *sa; @@ -4399,7 +4397,7 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, if (IS_ERR(sa)) return PTR_ERR(sa); - ret = btrfs_scrub_progress(root, sa->devid, &sa->progress); + ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress); if (copy_to_user(arg, sa, sizeof(*sa))) ret = -EFAULT; @@ -4408,7 +4406,7 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, return ret; } -static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root, +static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_get_dev_stats *sa; @@ -4423,7 +4421,7 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root, return -EPERM; } - ret = btrfs_get_dev_stats(root, sa); + ret = btrfs_get_dev_stats(fs_info, sa); if (copy_to_user(arg, sa, sizeof(*sa))) ret = -EFAULT; @@ -4432,7 +4430,8 @@ static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root, return ret; } -static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg) +static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, + void __user *arg) { struct btrfs_ioctl_dev_replace_args *p; int ret; @@ -4446,27 +4445,25 @@ static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg) switch (p->cmd) { case BTRFS_IOCTL_DEV_REPLACE_CMD_START: - if (root->fs_info->sb->s_flags & MS_RDONLY) { + if (fs_info->sb->s_flags & MS_RDONLY) { ret = -EROFS; goto out; } if (atomic_xchg( - &root->fs_info->mutually_exclusive_operation_running, - 1)) { + &fs_info->mutually_exclusive_operation_running, 1)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } else { - ret = btrfs_dev_replace_by_ioctl(root, p); + ret = btrfs_dev_replace_by_ioctl(fs_info, p); atomic_set( - &root->fs_info->mutually_exclusive_operation_running, - 0); + &fs_info->mutually_exclusive_operation_running, 0); } break; case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: - btrfs_dev_replace_status(root->fs_info, p); + btrfs_dev_replace_status(fs_info, p); ret = 0; break; case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL: - ret = btrfs_dev_replace_cancel(root->fs_info, p); + ret = btrfs_dev_replace_cancel(fs_info, p); break; default: ret = -EINVAL; @@ -4559,7 +4556,7 @@ static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) return 0; } -static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, +static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, void __user *arg) { int ret = 0; @@ -4572,11 +4569,8 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, 
return -EPERM; loi = memdup_user(arg, sizeof(*loi)); - if (IS_ERR(loi)) { - ret = PTR_ERR(loi); - loi = NULL; - goto out; - } + if (IS_ERR(loi)) + return PTR_ERR(loi); path = btrfs_alloc_path(); if (!path) { @@ -4592,7 +4586,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, goto out; } - ret = iterate_inodes_from_logical(loi->logical, root->fs_info, path, + ret = iterate_inodes_from_logical(loi->logical, fs_info, path, build_ino_list, inodes); if (ret == -EINVAL) ret = -ENOENT; @@ -4788,25 +4782,24 @@ out: return ret; } -static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd) +static long btrfs_ioctl_balance_ctl(struct btrfs_fs_info *fs_info, int cmd) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; switch (cmd) { case BTRFS_BALANCE_CTL_PAUSE: - return btrfs_pause_balance(root->fs_info); + return btrfs_pause_balance(fs_info); case BTRFS_BALANCE_CTL_CANCEL: - return btrfs_cancel_balance(root->fs_info); + return btrfs_cancel_balance(fs_info); } return -EINVAL; } -static long btrfs_ioctl_balance_progress(struct btrfs_root *root, +static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info, void __user *arg) { - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ioctl_balance_args *bargs; int ret = 0; @@ -4838,7 +4831,8 @@ out: static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_quota_ctl_args *sa; struct btrfs_trans_handle *trans = NULL; int ret; @@ -4857,8 +4851,8 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) goto drop_write; } - down_write(&root->fs_info->subvol_sem); - trans = btrfs_start_transaction(root->fs_info->tree_root, 2); + down_write(&fs_info->subvol_sem); + trans = btrfs_start_transaction(fs_info->tree_root, 2); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; @@ -4866,22 +4860,22 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) switch (sa->cmd) { case BTRFS_QUOTA_CTL_ENABLE: - ret = btrfs_quota_enable(trans, root->fs_info); + ret = btrfs_quota_enable(trans, fs_info); break; case BTRFS_QUOTA_CTL_DISABLE: - ret = btrfs_quota_disable(trans, root->fs_info); + ret = btrfs_quota_disable(trans, fs_info); break; default: ret = -EINVAL; break; } - err = btrfs_commit_transaction(trans, root->fs_info->tree_root); + err = btrfs_commit_transaction(trans); if (err && !ret) ret = err; out: kfree(sa); - up_write(&root->fs_info->subvol_sem); + up_write(&fs_info->subvol_sem); drop_write: mnt_drop_write_file(file); return ret; @@ -4889,7 +4883,9 @@ drop_write: static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_qgroup_assign_args *sa; struct btrfs_trans_handle *trans; int ret; @@ -4916,19 +4912,19 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) /* FIXME: check if the IDs really exist */ if (sa->assign) { - ret = btrfs_add_qgroup_relation(trans, root->fs_info, + ret = btrfs_add_qgroup_relation(trans, fs_info, sa->src, sa->dst); } else { - ret = btrfs_del_qgroup_relation(trans, root->fs_info, + ret = btrfs_del_qgroup_relation(trans, fs_info, sa->src, sa->dst); } /* update qgroup status and info */ - err = 
btrfs_run_qgroups(trans, root->fs_info); + err = btrfs_run_qgroups(trans, fs_info); if (err < 0) - btrfs_handle_fs_error(root->fs_info, err, - "failed to update qgroup status and info"); - err = btrfs_end_transaction(trans, root); + btrfs_handle_fs_error(fs_info, err, + "failed to update qgroup status and info"); + err = btrfs_end_transaction(trans); if (err && !ret) ret = err; @@ -4941,7 +4937,9 @@ drop_write: static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_qgroup_create_args *sa; struct btrfs_trans_handle *trans; int ret; @@ -4973,12 +4971,12 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) /* FIXME: check if the IDs really exist */ if (sa->create) { - ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid); + ret = btrfs_create_qgroup(trans, fs_info, sa->qgroupid); } else { - ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid); + ret = btrfs_remove_qgroup(trans, fs_info, sa->qgroupid); } - err = btrfs_end_transaction(trans, root); + err = btrfs_end_transaction(trans); if (err && !ret) ret = err; @@ -4991,7 +4989,9 @@ drop_write: static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_qgroup_limit_args *sa; struct btrfs_trans_handle *trans; int ret; @@ -5024,9 +5024,9 @@ static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg) } /* FIXME: check if the IDs really exist */ - ret = btrfs_limit_qgroup(trans, root->fs_info, qgroupid, &sa->lim); + ret = btrfs_limit_qgroup(trans, fs_info, qgroupid, &sa->lim); - err = btrfs_end_transaction(trans, root); + err = btrfs_end_transaction(trans); if (err && !ret) ret = err; @@ -5039,7 +5039,8 @@ drop_write: static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_quota_rescan_args *qsa; int ret; @@ -5061,7 +5062,7 @@ static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg) goto out; } - ret = btrfs_qgroup_rescan(root->fs_info); + ret = btrfs_qgroup_rescan(fs_info); out: kfree(qsa); @@ -5072,7 +5073,8 @@ drop_write: static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_quota_rescan_args *qsa; int ret = 0; @@ -5083,9 +5085,9 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg) if (!qsa) return -ENOMEM; - if (root->fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { qsa->flags = 1; - qsa->progress = root->fs_info->qgroup_rescan_progress.objectid; + qsa->progress = fs_info->qgroup_rescan_progress.objectid; } if (copy_to_user(arg, qsa, sizeof(*qsa))) @@ -5097,18 +5099,20 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg) static long 
btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); if (!capable(CAP_SYS_ADMIN)) return -EPERM; - return btrfs_qgroup_wait_for_completion(root->fs_info, true); + return btrfs_qgroup_wait_for_completion(fs_info, true); } static long _btrfs_ioctl_set_received_subvol(struct file *file, struct btrfs_ioctl_received_subvol_args *sa) { struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_root_item *root_item = &root->root_item; struct btrfs_trans_handle *trans; @@ -5123,7 +5127,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, if (ret < 0) return ret; - down_write(&root->fs_info->subvol_sem); + down_write(&fs_info->subvol_sem); if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { ret = -EINVAL; @@ -5154,8 +5158,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, BTRFS_UUID_SIZE); if (received_uuid_changed && !btrfs_is_empty_uuid(root_item->received_uuid)) - btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root, - root_item->received_uuid, + btrfs_uuid_tree_rem(trans, fs_info, root_item->received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, root->root_key.objectid); memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE); @@ -5166,15 +5169,14 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, btrfs_set_stack_timespec_sec(&root_item->rtime, sa->rtime.sec); btrfs_set_stack_timespec_nsec(&root_item->rtime, sa->rtime.nsec); - ret = btrfs_update_root(trans, root->fs_info->tree_root, + ret = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); if (ret < 0) { - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); goto out; } if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) { - ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root, - sa->uuid, + ret = btrfs_uuid_tree_add(trans, fs_info, sa->uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, root->root_key.objectid); if (ret < 0 && ret != -EEXIST) { @@ -5182,14 +5184,14 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, goto out; } } - ret = btrfs_commit_transaction(trans, root); + ret = btrfs_commit_transaction(trans); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; } out: - up_write(&root->fs_info->subvol_sem); + up_write(&fs_info->subvol_sem); mnt_drop_write_file(file); return ret; } @@ -5203,11 +5205,8 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file, int ret = 0; args32 = memdup_user(arg, sizeof(*args32)); - if (IS_ERR(args32)) { - ret = PTR_ERR(args32); - args32 = NULL; - goto out; - } + if (IS_ERR(args32)) + return PTR_ERR(args32); args64 = kmalloc(sizeof(*args64), GFP_KERNEL); if (!args64) { @@ -5255,11 +5254,8 @@ static long btrfs_ioctl_set_received_subvol(struct file *file, int ret = 0; sa = memdup_user(arg, sizeof(*sa)); - if (IS_ERR(sa)) { - ret = PTR_ERR(sa); - sa = NULL; - goto out; - } + if (IS_ERR(sa)) + return PTR_ERR(sa); ret = _btrfs_ioctl_set_received_subvol(file, sa); @@ -5277,20 +5273,22 @@ out: static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); size_t len; int ret; char label[BTRFS_LABEL_SIZE]; - spin_lock(&root->fs_info->super_lock); - 
memcpy(label, root->fs_info->super_copy->label, BTRFS_LABEL_SIZE); - spin_unlock(&root->fs_info->super_lock); + spin_lock(&fs_info->super_lock); + memcpy(label, fs_info->super_copy->label, BTRFS_LABEL_SIZE); + spin_unlock(&fs_info->super_lock); len = strnlen(label, BTRFS_LABEL_SIZE); if (len == BTRFS_LABEL_SIZE) { - btrfs_warn(root->fs_info, - "label is too long, return the first %zu bytes", --len); + btrfs_warn(fs_info, + "label is too long, return the first %zu bytes", + --len); } ret = copy_to_user(arg, label, len); @@ -5300,8 +5298,10 @@ static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(file_inode(file))->root; - struct btrfs_super_block *super_block = root->fs_info->super_copy; + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_super_block *super_block = fs_info->super_copy; struct btrfs_trans_handle *trans; char label[BTRFS_LABEL_SIZE]; int ret; @@ -5313,7 +5313,7 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) return -EFAULT; if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) { - btrfs_err(root->fs_info, + btrfs_err(fs_info, "unable to set label with more than %d bytes", BTRFS_LABEL_SIZE - 1); return -EINVAL; @@ -5329,10 +5329,10 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) goto out_unlock; } - spin_lock(&root->fs_info->super_lock); + spin_lock(&fs_info->super_lock); strcpy(super_block->label, label); - spin_unlock(&root->fs_info->super_lock); - ret = btrfs_commit_transaction(trans, root); + spin_unlock(&fs_info->super_lock); + ret = btrfs_commit_transaction(trans); out_unlock: mnt_drop_write_file(file); @@ -5360,8 +5360,9 @@ int btrfs_ioctl_get_supported_features(void __user *arg) static int btrfs_ioctl_get_features(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(file_inode(file))->root; - struct btrfs_super_block *super_block = root->fs_info->super_copy; + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_super_block *super_block = fs_info->super_copy; struct btrfs_ioctl_feature_flags features; features.compat_flags = btrfs_super_compat_flags(super_block); @@ -5374,7 +5375,7 @@ static int btrfs_ioctl_get_features(struct file *file, void __user *arg) return 0; } -static int check_feature_bits(struct btrfs_root *root, +static int check_feature_bits(struct btrfs_fs_info *fs_info, enum btrfs_feature_set set, u64 change_mask, u64 flags, u64 supported_flags, u64 safe_set, u64 safe_clear) @@ -5389,14 +5390,14 @@ static int check_feature_bits(struct btrfs_root *root, if (unsupported) { names = btrfs_printable_features(set, unsupported); if (names) { - btrfs_warn(root->fs_info, - "this kernel does not support the %s feature bit%s", - names, strchr(names, ',') ? "s" : ""); + btrfs_warn(fs_info, + "this kernel does not support the %s feature bit%s", + names, strchr(names, ',') ? 
"s" : ""); kfree(names); } else - btrfs_warn(root->fs_info, - "this kernel does not support %s bits 0x%llx", - type, unsupported); + btrfs_warn(fs_info, + "this kernel does not support %s bits 0x%llx", + type, unsupported); return -EOPNOTSUPP; } @@ -5404,14 +5405,14 @@ static int check_feature_bits(struct btrfs_root *root, if (disallowed) { names = btrfs_printable_features(set, disallowed); if (names) { - btrfs_warn(root->fs_info, - "can't set the %s feature bit%s while mounted", - names, strchr(names, ',') ? "s" : ""); + btrfs_warn(fs_info, + "can't set the %s feature bit%s while mounted", + names, strchr(names, ',') ? "s" : ""); kfree(names); } else - btrfs_warn(root->fs_info, - "can't set %s bits 0x%llx while mounted", - type, disallowed); + btrfs_warn(fs_info, + "can't set %s bits 0x%llx while mounted", + type, disallowed); return -EPERM; } @@ -5419,30 +5420,32 @@ static int check_feature_bits(struct btrfs_root *root, if (disallowed) { names = btrfs_printable_features(set, disallowed); if (names) { - btrfs_warn(root->fs_info, - "can't clear the %s feature bit%s while mounted", - names, strchr(names, ',') ? "s" : ""); + btrfs_warn(fs_info, + "can't clear the %s feature bit%s while mounted", + names, strchr(names, ',') ? "s" : ""); kfree(names); } else - btrfs_warn(root->fs_info, - "can't clear %s bits 0x%llx while mounted", - type, disallowed); + btrfs_warn(fs_info, + "can't clear %s bits 0x%llx while mounted", + type, disallowed); return -EPERM; } return 0; } -#define check_feature(root, change_mask, flags, mask_base) \ -check_feature_bits(root, FEAT_##mask_base, change_mask, flags, \ +#define check_feature(fs_info, change_mask, flags, mask_base) \ +check_feature_bits(fs_info, FEAT_##mask_base, change_mask, flags, \ BTRFS_FEATURE_ ## mask_base ## _SUPP, \ BTRFS_FEATURE_ ## mask_base ## _SAFE_SET, \ BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR) static int btrfs_ioctl_set_features(struct file *file, void __user *arg) { - struct btrfs_root *root = BTRFS_I(file_inode(file))->root; - struct btrfs_super_block *super_block = root->fs_info->super_copy; + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_super_block *super_block = fs_info->super_copy; struct btrfs_ioctl_feature_flags flags[2]; struct btrfs_trans_handle *trans; u64 newflags; @@ -5459,17 +5462,17 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg) !flags[0].incompat_flags) return 0; - ret = check_feature(root, flags[0].compat_flags, + ret = check_feature(fs_info, flags[0].compat_flags, flags[1].compat_flags, COMPAT); if (ret) return ret; - ret = check_feature(root, flags[0].compat_ro_flags, + ret = check_feature(fs_info, flags[0].compat_ro_flags, flags[1].compat_ro_flags, COMPAT_RO); if (ret) return ret; - ret = check_feature(root, flags[0].incompat_flags, + ret = check_feature(fs_info, flags[0].incompat_flags, flags[1].incompat_flags, INCOMPAT); if (ret) return ret; @@ -5484,7 +5487,7 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg) goto out_drop_write; } - spin_lock(&root->fs_info->super_lock); + spin_lock(&fs_info->super_lock); newflags = btrfs_super_compat_flags(super_block); newflags |= flags[0].compat_flags & flags[1].compat_flags; newflags &= ~(flags[0].compat_flags & ~flags[1].compat_flags); @@ -5499,9 +5502,9 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg) newflags |= flags[0].incompat_flags & flags[1].incompat_flags; newflags &= 
~(flags[0].incompat_flags & ~flags[1].incompat_flags); btrfs_set_super_incompat_flags(super_block, newflags); - spin_unlock(&root->fs_info->super_lock); + spin_unlock(&fs_info->super_lock); - ret = btrfs_commit_transaction(trans, root); + ret = btrfs_commit_transaction(trans); out_drop_write: mnt_drop_write_file(file); @@ -5511,7 +5514,9 @@ out_drop_write: long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct inode *inode = file_inode(file); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_root *root = BTRFS_I(inode)->root; void __user *argp = (void __user *)arg; switch (cmd) { @@ -5546,15 +5551,15 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_RESIZE: return btrfs_ioctl_resize(file, argp); case BTRFS_IOC_ADD_DEV: - return btrfs_ioctl_add_dev(root, argp); + return btrfs_ioctl_add_dev(fs_info, argp); case BTRFS_IOC_RM_DEV: return btrfs_ioctl_rm_dev(file, argp); case BTRFS_IOC_RM_DEV_V2: return btrfs_ioctl_rm_dev_v2(file, argp); case BTRFS_IOC_FS_INFO: - return btrfs_ioctl_fs_info(root, argp); + return btrfs_ioctl_fs_info(fs_info, argp); case BTRFS_IOC_DEV_INFO: - return btrfs_ioctl_dev_info(root, argp); + return btrfs_ioctl_dev_info(fs_info, argp); case BTRFS_IOC_BALANCE: return btrfs_ioctl_balance(file, NULL); case BTRFS_IOC_TRANS_START: @@ -5570,40 +5575,40 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_INO_PATHS: return btrfs_ioctl_ino_to_path(root, argp); case BTRFS_IOC_LOGICAL_INO: - return btrfs_ioctl_logical_to_ino(root, argp); + return btrfs_ioctl_logical_to_ino(fs_info, argp); case BTRFS_IOC_SPACE_INFO: - return btrfs_ioctl_space_info(root, argp); + return btrfs_ioctl_space_info(fs_info, argp); case BTRFS_IOC_SYNC: { int ret; - ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1); + ret = btrfs_start_delalloc_roots(fs_info, 0, -1); if (ret) return ret; - ret = btrfs_sync_fs(file_inode(file)->i_sb, 1); + ret = btrfs_sync_fs(inode->i_sb, 1); /* * The transaction thread may want to do more work, * namely it pokes the cleaner kthread that will start * processing uncleaned subvols. 
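For reference, the BTRFS_IOC_SYNC case above is reachable from user space with a plain ioctl on any open file or directory inside the filesystem; a minimal sketch with error handling trimmed (the helper name is ours):

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/btrfs.h>        /* BTRFS_IOC_SYNC */

    int sync_btrfs(const char *path)
    {
            int fd = open(path, O_RDONLY);
            int ret;

            if (fd < 0)
                    return -1;
            /* Drives btrfs_start_delalloc_roots() + btrfs_sync_fs() above. */
            ret = ioctl(fd, BTRFS_IOC_SYNC);
            close(fd);
            return ret;
    }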
*/ - wake_up_process(root->fs_info->transaction_kthread); + wake_up_process(fs_info->transaction_kthread); return ret; } case BTRFS_IOC_START_SYNC: return btrfs_ioctl_start_sync(root, argp); case BTRFS_IOC_WAIT_SYNC: - return btrfs_ioctl_wait_sync(root, argp); + return btrfs_ioctl_wait_sync(fs_info, argp); case BTRFS_IOC_SCRUB: return btrfs_ioctl_scrub(file, argp); case BTRFS_IOC_SCRUB_CANCEL: - return btrfs_ioctl_scrub_cancel(root, argp); + return btrfs_ioctl_scrub_cancel(fs_info); case BTRFS_IOC_SCRUB_PROGRESS: - return btrfs_ioctl_scrub_progress(root, argp); + return btrfs_ioctl_scrub_progress(fs_info, argp); case BTRFS_IOC_BALANCE_V2: return btrfs_ioctl_balance(file, argp); case BTRFS_IOC_BALANCE_CTL: - return btrfs_ioctl_balance_ctl(root, arg); + return btrfs_ioctl_balance_ctl(fs_info, arg); case BTRFS_IOC_BALANCE_PROGRESS: - return btrfs_ioctl_balance_progress(root, argp); + return btrfs_ioctl_balance_progress(fs_info, argp); case BTRFS_IOC_SET_RECEIVED_SUBVOL: return btrfs_ioctl_set_received_subvol(file, argp); #ifdef CONFIG_64BIT @@ -5613,7 +5618,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SEND: return btrfs_ioctl_send(file, argp); case BTRFS_IOC_GET_DEV_STATS: - return btrfs_ioctl_get_dev_stats(root, argp); + return btrfs_ioctl_get_dev_stats(fs_info, argp); case BTRFS_IOC_QUOTA_CTL: return btrfs_ioctl_quota_ctl(file, argp); case BTRFS_IOC_QGROUP_ASSIGN: @@ -5629,7 +5634,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_QUOTA_RESCAN_WAIT: return btrfs_ioctl_quota_rescan_wait(file, argp); case BTRFS_IOC_DEV_REPLACE: - return btrfs_ioctl_dev_replace(root, argp); + return btrfs_ioctl_dev_replace(fs_info, argp); case BTRFS_IOC_GET_FSLABEL: return btrfs_ioctl_get_fslabel(file, argp); case BTRFS_IOC_SET_FSLABEL: @@ -5648,6 +5653,10 @@ long btrfs_ioctl(struct file *file, unsigned int #ifdef CONFIG_COMPAT long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { + /* + * These all access 32-bit values anyway so no further + * handling is necessary. 
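With the default: -ENOIOCTLCMD branch gone, a 32-bit process on a 64-bit kernel is no longer limited to the three translated FS_IOC32_* commands; every other command now falls through to btrfs_ioctl() unchanged. A sketch of what that enables, assuming fd is already open on a btrfs mount:

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/btrfs.h>

    /* Native btrfs command; its fixed-size fields need no compat fixup. */
    struct btrfs_ioctl_fs_info_args args = {0};

    if (ioctl(fd, BTRFS_IOC_FS_INFO, &args) == 0)
            printf("devices: %llu\n",
                   (unsigned long long)args.num_devices);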
+ */ switch (cmd) { case FS_IOC32_GETFLAGS: cmd = FS_IOC_GETFLAGS; @@ -5658,8 +5667,6 @@ long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FS_IOC32_GETVERSION: cmd = FS_IOC_GETVERSION; break; - default: - return -ENOIOCTLCMD; } return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 48655da0f4ca..45d26980caf9 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -254,25 +254,21 @@ out: return ret; } -static int lzo_decompress_biovec(struct list_head *ws, +static int lzo_decompress_bio(struct list_head *ws, struct page **pages_in, u64 disk_start, - struct bio_vec *bvec, - int vcnt, + struct bio *orig_bio, size_t srclen) { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0, ret2; char *data_in; unsigned long page_in_index = 0; - unsigned long page_out_index = 0; unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; unsigned long buf_offset = 0; unsigned long bytes; unsigned long working_bytes; - unsigned long pg_offset; - size_t in_len; size_t out_len; unsigned long in_offset; @@ -292,7 +288,6 @@ static int lzo_decompress_biovec(struct list_head *ws, in_page_bytes_left = PAGE_SIZE - LZO_LEN; tot_out = 0; - pg_offset = 0; while (tot_in < tot_len) { in_len = read_compress_length(data_in + in_offset); @@ -365,16 +360,14 @@ cont: tot_out += out_len; ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start, - tot_out, disk_start, - bvec, vcnt, - &page_out_index, &pg_offset); + tot_out, disk_start, orig_bio); if (ret2 == 0) break; } done: kunmap(pages_in[page_in_index]); if (!ret) - btrfs_clear_biovec_end(bvec, vcnt, page_out_index, pg_offset); + zero_fill_bio(orig_bio); return ret; } @@ -438,6 +431,6 @@ const struct btrfs_compress_op btrfs_lzo_compress = { .alloc_workspace = lzo_alloc_workspace, .free_workspace = lzo_free_workspace, .compress_pages = lzo_compress_pages, - .decompress_biovec = lzo_decompress_biovec, + .decompress_bio = lzo_decompress_bio, .decompress = lzo_decompress, }; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index b2d1e95de7be..041c3326d109 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -186,6 +186,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, u64 start, u64 len, u64 disk_len, int type, int dio, int compress_type) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ordered_inode_tree *tree; struct rb_node *node; @@ -234,11 +235,10 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, &root->ordered_extents); root->nr_ordered_extents++; if (root->nr_ordered_extents == 1) { - spin_lock(&root->fs_info->ordered_root_lock); + spin_lock(&fs_info->ordered_root_lock); BUG_ON(!list_empty(&root->ordered_root)); - list_add_tail(&root->ordered_root, - &root->fs_info->ordered_roots); - spin_unlock(&root->fs_info->ordered_root_lock); + list_add_tail(&root->ordered_root, &fs_info->ordered_roots); + spin_unlock(&fs_info->ordered_root_lock); } spin_unlock(&root->ordered_extent_lock); @@ -303,6 +303,7 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, struct btrfs_ordered_extent **cached, u64 *file_offset, u64 io_size, int uptodate) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; @@ -331,14 +332,14 @@ int 
btrfs_dec_test_first_ordered_pending(struct inode *inode, entry->len); *file_offset = dec_end; if (dec_start > dec_end) { - btrfs_crit(BTRFS_I(inode)->root->fs_info, - "bad ordering dec_start %llu end %llu", dec_start, dec_end); + btrfs_crit(fs_info, "bad ordering dec_start %llu end %llu", + dec_start, dec_end); } to_dec = dec_end - dec_start; if (to_dec > entry->bytes_left) { - btrfs_crit(BTRFS_I(inode)->root->fs_info, - "bad ordered accounting left %llu size %llu", - entry->bytes_left, to_dec); + btrfs_crit(fs_info, + "bad ordered accounting left %llu size %llu", + entry->bytes_left, to_dec); } entry->bytes_left -= to_dec; if (!uptodate) @@ -588,6 +589,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) void btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_inode_tree *tree; struct btrfs_root *root = BTRFS_I(inode)->root; struct rb_node *node; @@ -618,11 +620,11 @@ void btrfs_remove_ordered_extent(struct inode *inode, * lock, so be nice and check if trans is set, but ASSERT() so * if it isn't set a developer will notice. */ - spin_lock(&root->fs_info->trans_lock); - trans = root->fs_info->running_transaction; + spin_lock(&fs_info->trans_lock); + trans = fs_info->running_transaction; if (trans) atomic_inc(&trans->use_count); - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); ASSERT(trans); if (trans) { @@ -639,10 +641,10 @@ void btrfs_remove_ordered_extent(struct inode *inode, trace_btrfs_ordered_extent_remove(inode, entry); if (!root->nr_ordered_extents) { - spin_lock(&root->fs_info->ordered_root_lock); + spin_lock(&fs_info->ordered_root_lock); BUG_ON(list_empty(&root->ordered_root)); list_del_init(&root->ordered_root); - spin_unlock(&root->fs_info->ordered_root_lock); + spin_unlock(&fs_info->ordered_root_lock); } spin_unlock(&root->ordered_extent_lock); wake_up(&entry->wait); @@ -664,6 +666,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, const u64 range_start, const u64 range_len) { + struct btrfs_fs_info *fs_info = root->fs_info; LIST_HEAD(splice); LIST_HEAD(skipped); LIST_HEAD(works); @@ -694,8 +697,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, btrfs_flush_delalloc_helper, btrfs_run_ordered_extent_work, NULL, NULL); list_add_tail(&ordered->work_list, &works); - btrfs_queue_work(root->fs_info->flush_workers, - &ordered->flush_work); + btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work); cond_resched(); spin_lock(&root->ordered_extent_lock); @@ -978,7 +980,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, ordered->file_offset + ordered->truncated_len); } else { - offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); + offset = ALIGN(offset, btrfs_inode_sectorsize(inode)); } disk_i_size = BTRFS_I(inode)->disk_i_size; @@ -1087,7 +1089,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; unsigned long num_sectors; unsigned long i; - u32 sectorsize = BTRFS_I(inode)->root->sectorsize; + u32 sectorsize = btrfs_inode_sectorsize(inode); int index = 0; ordered = btrfs_lookup_ordered_extent(inode, offset); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 451507776ff5..5f2b0ca28705 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -145,10 +145,10 @@ struct 
btrfs_ordered_extent { * calculates the total size you need to allocate for an ordered sum * structure spanning 'bytes' in the file */ -static inline int btrfs_ordered_sum_size(struct btrfs_root *root, +static inline int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes) { - int num_sectors = (int)DIV_ROUND_UP(bytes, root->sectorsize); + int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize); return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32); } diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 438575ea8d25..cdafbf92ef0c 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -161,7 +161,7 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset, } } -void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) +void btrfs_print_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *l) { int i; u32 type, nr; @@ -182,8 +182,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) nr = btrfs_header_nritems(l); - btrfs_info(root->fs_info, "leaf %llu total ptrs %d free space %d", - btrfs_header_bytenr(l), nr, btrfs_leaf_free_space(root, l)); + btrfs_info(fs_info, "leaf %llu total ptrs %d free space %d", + btrfs_header_bytenr(l), nr, + btrfs_leaf_free_space(fs_info, l)); for (i = 0 ; i < nr ; i++) { item = btrfs_item_nr(i); btrfs_item_key_to_cpu(l, &key, i); @@ -314,7 +315,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) } } -void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) +void btrfs_print_tree(struct btrfs_fs_info *fs_info, struct extent_buffer *c) { int i; u32 nr; struct btrfs_key key; @@ -325,13 +326,13 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) nr = btrfs_header_nritems(c); level = btrfs_header_level(c); if (level == 0) { - btrfs_print_leaf(root, c); + btrfs_print_leaf(fs_info, c); return; } - btrfs_info(root->fs_info, + btrfs_info(fs_info, "node %llu level %d total ptrs %d free spc %u", btrfs_header_bytenr(c), level, nr, - (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); + (u32)BTRFS_NODEPTRS_PER_BLOCK(fs_info) - nr); for (i = 0; i < nr; i++) { btrfs_node_key_to_cpu(c, &key, i); pr_info("\tkey %d (%llu %u %llu) block %llu\n", @@ -339,7 +340,7 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) btrfs_node_blockptr(c, i)); } for (i = 0; i < nr; i++) { - struct extent_buffer *next = read_tree_block(root, + struct extent_buffer *next = read_tree_block(fs_info, btrfs_node_blockptr(c, i), btrfs_node_ptr_generation(c, i)); if (IS_ERR(next)) { @@ -355,7 +356,7 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) if (btrfs_header_level(next) != level - 1) BUG(); - btrfs_print_tree(root, next); + btrfs_print_tree(fs_info, next); free_extent_buffer(next); } } diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index 7faddfacc5bd..4f2e0ea0e95a 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -18,6 +18,6 @@ #ifndef __PRINT_TREE_ #define __PRINT_TREE_ -void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l); -void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c); +void btrfs_print_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *l); +void btrfs_print_tree(struct btrfs_fs_info *fs_info, struct extent_buffer *c); #endif diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index cf0b444ac4f3..f2621e330954 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -301,6 +301,7 @@ static int 
inherit_props(struct btrfs_trans_handle *trans,
 			 struct inode *parent)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret;
 	int i;
@@ -320,14 +321,14 @@ static int inherit_props(struct btrfs_trans_handle *trans,
 		if (!value)
 			continue;
 
-		num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+		num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
 		ret = btrfs_block_rsv_add(root, trans->block_rsv,
 					  num_bytes, BTRFS_RESERVE_NO_FLUSH);
 		if (ret)
 			goto out;
 		ret = __btrfs_set_prop(trans, inode, h->xattr_name,
 				       value, strlen(value), 0);
-		btrfs_block_rsv_release(root, trans->block_rsv, num_bytes);
+		btrfs_block_rsv_release(fs_info, trans->block_rsv, num_bytes);
 		if (ret)
 			goto out;
 	}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 11f4fffe503e..662821f1252c 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -131,8 +131,15 @@ struct btrfs_qgroup_list {
 	struct btrfs_qgroup *member;
 };
 
-#define ptr_to_u64(x) ((u64)(uintptr_t)x)
-#define u64_to_ptr(x) ((struct btrfs_qgroup *)(uintptr_t)x)
+static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg)
+{
+	return (u64)(uintptr_t)qg;
+}
+
+static inline struct btrfs_qgroup* unode_aux_to_qgroup(struct ulist_node *n)
+{
+	return (struct btrfs_qgroup *)(uintptr_t)n->aux;
+}
 
 static int
 qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
@@ -1012,7 +1019,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
 	list_del(&quota_root->dirty_list);
 
 	btrfs_tree_lock(quota_root->node);
-	clean_tree_block(trans, tree_root->fs_info, quota_root->node);
+	clean_tree_block(trans, fs_info, quota_root->node);
 	btrfs_tree_unlock(quota_root->node);
 	btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1);
@@ -1066,7 +1073,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
 	/* Get all of the parent groups that contain this qgroup */
 	list_for_each_entry(glist, &qgroup->groups, next_group) {
 		ret = ulist_add(tmp, glist->group->qgroupid,
-				ptr_to_u64(glist->group), GFP_ATOMIC);
+				qgroup_to_aux(glist->group), GFP_ATOMIC);
 		if (ret < 0)
 			goto out;
 	}
@@ -1074,7 +1081,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
 	/* Iterate all of the parents and adjust their reference counts */
 	ULIST_ITER_INIT(&uiter);
 	while ((unode = ulist_next(tmp, &uiter))) {
-		qgroup = u64_to_ptr(unode->aux);
+		qgroup = unode_aux_to_qgroup(unode);
 		qgroup->rfer += sign * num_bytes;
 		qgroup->rfer_cmpr += sign * num_bytes;
 		WARN_ON(sign < 0 && qgroup->excl < num_bytes);
@@ -1087,7 +1094,7 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
 		/* Add any parents of the parents */
 		list_for_each_entry(glist, &qgroup->groups, next_group) {
 			ret = ulist_add(tmp, glist->group->qgroupid,
-					ptr_to_u64(glist->group), GFP_ATOMIC);
+					qgroup_to_aux(glist->group), GFP_ATOMIC);
 			if (ret < 0)
 				goto out;
 		}
@@ -1185,7 +1192,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
 	}
 
 	spin_lock(&fs_info->qgroup_lock);
-	ret = add_relation_rb(quota_root->fs_info, src, dst);
+	ret = add_relation_rb(fs_info, src, dst);
 	if (ret < 0) {
 		spin_unlock(&fs_info->qgroup_lock);
 		goto out;
@@ -1333,7 +1340,7 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
 	}
 	spin_lock(&fs_info->qgroup_lock);
-	del_qgroup_rb(quota_root->fs_info, qgroupid);
+	del_qgroup_rb(fs_info, qgroupid);
 	spin_unlock(&fs_info->qgroup_lock);
 out:
 	mutex_unlock(&fs_info->qgroup_ioctl_lock);
@@ -1450,7 +1457,7 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-int btrfs_qgroup_insert_dirty_extent_nolock(struct btrfs_fs_info *fs_info,
+int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
 				struct btrfs_delayed_ref_root *delayed_refs,
 				struct btrfs_qgroup_extent_record *record)
 {
@@ -1460,7 +1467,7 @@ int btrfs_qgroup_insert_dirty_extent_nolock(struct btrfs_fs_info *fs_info,
 	u64 bytenr = record->bytenr;
 
 	assert_spin_locked(&delayed_refs->lock);
-	trace_btrfs_qgroup_insert_dirty_extent(fs_info, record);
+	trace_btrfs_qgroup_trace_extent(fs_info, record);
 
 	while (*p) {
 		parent_node = *p;
@@ -1479,7 +1486,7 @@ int btrfs_qgroup_insert_dirty_extent_nolock(struct btrfs_fs_info *fs_info,
 	return 0;
 }
 
-int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans,
+int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans,
 		struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes,
 		gfp_t gfp_flag)
 {
@@ -1502,14 +1509,228 @@ int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans,
 	record->old_roots = NULL;
 
 	spin_lock(&delayed_refs->lock);
-	ret = btrfs_qgroup_insert_dirty_extent_nolock(fs_info, delayed_refs,
-						      record);
+	ret = btrfs_qgroup_trace_extent_nolock(fs_info, delayed_refs, record);
 	spin_unlock(&delayed_refs->lock);
 	if (ret > 0)
 		kfree(record);
 	return 0;
 }
 
+int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
+				  struct btrfs_fs_info *fs_info,
+				  struct extent_buffer *eb)
+{
+	int nr = btrfs_header_nritems(eb);
+	int i, extent_type, ret;
+	struct btrfs_key key;
+	struct btrfs_file_extent_item *fi;
+	u64 bytenr, num_bytes;
+
+	/* We can be called directly from walk_up_proc() */
+	if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+		return 0;
+
+	for (i = 0; i < nr; i++) {
+		btrfs_item_key_to_cpu(eb, &key, i);
+
+		if (key.type != BTRFS_EXTENT_DATA_KEY)
+			continue;
+
+		fi = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+		/* filter out non qgroup-accountable extents */
+		extent_type = btrfs_file_extent_type(eb, fi);
+
+		if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+			continue;
+
+		bytenr = btrfs_file_extent_disk_bytenr(eb, fi);
+		if (!bytenr)
+			continue;
+
+		num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
+
+		ret = btrfs_qgroup_trace_extent(trans, fs_info, bytenr,
+						num_bytes, GFP_NOFS);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+/*
+ * Walk up the tree from the bottom, freeing leaves and any interior
+ * nodes which have had all slots visited. If a node (leaf or
+ * interior) is freed, the node above it will have its slot
+ * incremented. The root node will never be freed.
+ *
+ * At the end of this function, we should have a path which has all
+ * slots incremented to the next position for a search. If we need to
+ * read a new node it will be NULL and the node above it will have the
+ * correct slot selected for a later read.
+ *
+ * If we increment the root node's slot counter past the number of
+ * elements, 1 is returned to signal completion of the search.
+ */
+static int adjust_slots_upwards(struct btrfs_root *root,
+				struct btrfs_path *path, int root_level)
+{
+	int level = 0;
+	int nr, slot;
+	struct extent_buffer *eb;
+
+	if (root_level == 0)
+		return 1;
+
+	while (level <= root_level) {
+		eb = path->nodes[level];
+		nr = btrfs_header_nritems(eb);
+		path->slots[level]++;
+		slot = path->slots[level];
+		if (slot >= nr || level == 0) {
+			/*
+			 * Don't free the root - we will detect this
+			 * condition after our loop and return a
+			 * positive value for caller to stop walking the tree.
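Taken together, the renamed helpers above form a small caller-facing API: trace an extent first, and the actual accounting happens later at transaction commit. A hedged sketch of the caller side, using the signature from this diff (the wrapper name and values are illustrative only):

    /*
     * Sketch: an extent whose ownership is about to change is traced so
     * qgroup accounting will pick it up at transaction commit.
     */
    static int trace_before_modify(struct btrfs_trans_handle *trans,
                                   struct btrfs_fs_info *fs_info,
                                   u64 bytenr, u64 num_bytes)
    {
            /* GFP_NOFS: this path may already hold filesystem locks. */
            return btrfs_qgroup_trace_extent(trans, fs_info, bytenr,
                                             num_bytes, GFP_NOFS);
    }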
+ */ + if (level != root_level) { + btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; + + free_extent_buffer(eb); + path->nodes[level] = NULL; + path->slots[level] = 0; + } + } else { + /* + * We have a valid slot to walk back down + * from. Stop here so caller can process these + * new nodes. + */ + break; + } + + level++; + } + + eb = path->nodes[root_level]; + if (path->slots[root_level] >= btrfs_header_nritems(eb)) + return 1; + + return 0; +} + +int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *root_eb, + u64 root_gen, int root_level) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + int ret = 0; + int level; + struct extent_buffer *eb = root_eb; + struct btrfs_path *path = NULL; + + BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL); + BUG_ON(root_eb == NULL); + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return 0; + + if (!extent_buffer_uptodate(root_eb)) { + ret = btrfs_read_buffer(root_eb, root_gen); + if (ret) + goto out; + } + + if (root_level == 0) { + ret = btrfs_qgroup_trace_leaf_items(trans, fs_info, root_eb); + goto out; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * Walk down the tree. Missing extent blocks are filled in as + * we go. Metadata is accounted every time we read a new + * extent block. + * + * When we reach a leaf, we account for file extent items in it, + * walk back up the tree (adjusting slot pointers as we go) + * and restart the search process. + */ + extent_buffer_get(root_eb); /* For path */ + path->nodes[root_level] = root_eb; + path->slots[root_level] = 0; + path->locks[root_level] = 0; /* so release_path doesn't try to unlock */ +walk_down: + level = root_level; + while (level >= 0) { + if (path->nodes[level] == NULL) { + int parent_slot; + u64 child_gen; + u64 child_bytenr; + + /* + * We need to get child blockptr/gen from parent before + * we can read it. 
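Passing child_gen to read_tree_block() below is not decorative: the read path compares the generation recorded in the parent's pointer against the header of the block actually read, so a stale or misdirected block is rejected rather than walked. Conceptually the check amounts to the following sketch (not the literal verification helper):

    /* Conceptual: reject a child block whose header generation does not
     * match what the parent's pointer recorded for it. */
    if (btrfs_header_generation(eb) != child_gen) {
            free_extent_buffer(eb);
            return -EIO;    /* treated like any other failed read */
    }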
+ */ + eb = path->nodes[level + 1]; + parent_slot = path->slots[level + 1]; + child_bytenr = btrfs_node_blockptr(eb, parent_slot); + child_gen = btrfs_node_ptr_generation(eb, parent_slot); + + eb = read_tree_block(fs_info, child_bytenr, child_gen); + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + goto out; + } else if (!extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); + ret = -EIO; + goto out; + } + + path->nodes[level] = eb; + path->slots[level] = 0; + + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + path->locks[level] = BTRFS_READ_LOCK_BLOCKING; + + ret = btrfs_qgroup_trace_extent(trans, fs_info, + child_bytenr, + fs_info->nodesize, + GFP_NOFS); + if (ret) + goto out; + } + + if (level == 0) { + ret = btrfs_qgroup_trace_leaf_items(trans,fs_info, + path->nodes[level]); + if (ret) + goto out; + + /* Nonzero return here means we completed our search */ + ret = adjust_slots_upwards(root, path, root_level); + if (ret) + break; + + /* Restart search with new slots */ + goto walk_down; + } + + level--; + } + + ret = 0; +out: + btrfs_free_path(path); + + return ret; +} + #define UPDATE_NEW 0 #define UPDATE_OLD 1 /* @@ -1535,30 +1756,30 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, continue; ulist_reinit(tmp); - ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), + ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC); if (ret < 0) return ret; - ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), GFP_ATOMIC); + ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC); if (ret < 0) return ret; ULIST_ITER_INIT(&tmp_uiter); while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { struct btrfs_qgroup_list *glist; - qg = u64_to_ptr(tmp_unode->aux); + qg = unode_aux_to_qgroup(tmp_unode); if (update_old) btrfs_qgroup_update_old_refcnt(qg, seq, 1); else btrfs_qgroup_update_new_refcnt(qg, seq, 1); list_for_each_entry(glist, &qg->groups, next_group) { ret = ulist_add(qgroups, glist->group->qgroupid, - ptr_to_u64(glist->group), + qgroup_to_aux(glist->group), GFP_ATOMIC); if (ret < 0) return ret; ret = ulist_add(tmp, glist->group->qgroupid, - ptr_to_u64(glist->group), + qgroup_to_aux(glist->group), GFP_ATOMIC); if (ret < 0) return ret; @@ -1619,7 +1840,7 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, while ((unode = ulist_next(qgroups, &uiter))) { bool dirty = false; - qg = u64_to_ptr(unode->aux); + qg = unode_aux_to_qgroup(unode); cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); @@ -1950,7 +2171,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, } rcu_read_lock(); - level_size = srcroot->nodesize; + level_size = fs_info->nodesize; rcu_read_unlock(); } @@ -2034,8 +2255,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, i_qgroups = (u64 *)(inherit + 1); for (i = 0; i < inherit->num_qgroups; ++i) { if (*i_qgroups) { - ret = add_relation_rb(quota_root->fs_info, objectid, - *i_qgroups); + ret = add_relation_rb(fs_info, objectid, *i_qgroups); if (ret) goto unlock; } @@ -2125,7 +2345,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes) struct btrfs_qgroup *qg; struct btrfs_qgroup_list *glist; - qg = u64_to_ptr(unode->aux); + qg = unode_aux_to_qgroup(unode); if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) && qg->reserved + (s64)qg->rfer + num_bytes > @@ -2157,7 +2377,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes) while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { struct btrfs_qgroup 
*qg; - qg = u64_to_ptr(unode->aux); + qg = unode_aux_to_qgroup(unode); qg->reserved += num_bytes; } @@ -2202,7 +2422,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qg; struct btrfs_qgroup_list *glist; - qg = u64_to_ptr(unode->aux); + qg = unode_aux_to_qgroup(unode); qg->reserved -= num_bytes; @@ -2302,7 +2522,7 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, found.type != BTRFS_METADATA_ITEM_KEY) continue; if (found.type == BTRFS_METADATA_ITEM_KEY) - num_bytes = fs_info->extent_root->nodesize; + num_bytes = fs_info->nodesize; else num_bytes = found.offset; @@ -2335,10 +2555,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) int err = -ENOMEM; int ret = 0; - mutex_lock(&fs_info->qgroup_rescan_lock); - fs_info->qgroup_rescan_running = true; - mutex_unlock(&fs_info->qgroup_rescan_lock); - path = btrfs_alloc_path(); if (!path) goto out; @@ -2356,9 +2572,9 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) err = qgroup_rescan_leaf(fs_info, path, trans); } if (err > 0) - btrfs_commit_transaction(trans, fs_info->fs_root); + btrfs_commit_transaction(trans); else - btrfs_end_transaction(trans, fs_info->fs_root); + btrfs_end_transaction(trans); } out: @@ -2393,7 +2609,7 @@ out: err = ret; btrfs_err(fs_info, "fail to update qgroup status: %d", err); } - btrfs_end_transaction(trans, fs_info->quota_root); + btrfs_end_transaction(trans); if (btrfs_fs_closing(fs_info)) { btrfs_info(fs_info, "qgroup scan paused"); @@ -2449,6 +2665,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, sizeof(fs_info->qgroup_rescan_progress)); fs_info->qgroup_rescan_progress.objectid = progress_objectid; init_completion(&fs_info->qgroup_rescan_completion); + fs_info->qgroup_rescan_running = true; spin_unlock(&fs_info->qgroup_lock); mutex_unlock(&fs_info->qgroup_rescan_lock); @@ -2512,7 +2729,7 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; return PTR_ERR(trans); } - ret = btrfs_commit_transaction(trans, fs_info->fs_root); + ret = btrfs_commit_transaction(trans); if (ret) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; return ret; @@ -2677,13 +2894,14 @@ int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len) int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes) { + struct btrfs_fs_info *fs_info = root->fs_info; int ret; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) || + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || !is_fstree(root->objectid) || num_bytes == 0) return 0; - BUG_ON(num_bytes != round_down(num_bytes, root->nodesize)); + BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); ret = qgroup_reserve(root, num_bytes); if (ret < 0) return ret; @@ -2693,9 +2911,10 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes) void btrfs_qgroup_free_meta_all(struct btrfs_root *root) { + struct btrfs_fs_info *fs_info = root->fs_info; int reserved; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) || + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || !is_fstree(root->objectid)) return; @@ -2707,11 +2926,13 @@ void btrfs_qgroup_free_meta_all(struct btrfs_root *root) void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) { - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) || + struct btrfs_fs_info *fs_info = root->fs_info; + + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || 
	    !is_fstree(root->objectid))
 		return;
 
-	BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
+	BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
 	WARN_ON(atomic_read(&root->qgroup_meta_rsv) < num_bytes);
 	atomic_sub(num_bytes, &root->qgroup_meta_rsv);
 	qgroup_free(root, num_bytes);
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 1bc64c864b62..416ae8e1d23c 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -23,6 +23,34 @@
 #include "delayed-ref.h"
 
 /*
+ * Btrfs qgroup overview
+ *
+ * Btrfs qgroup splits into 3 main parts:
+ * 1) Reserve
+ *    Reserve metadata/data space for incoming operations
+ *    Affects how the qgroup limit works
+ *
+ * 2) Trace
+ *    Tell btrfs qgroup to trace dirty extents.
+ *
+ *    Dirty extents include:
+ *    - Newly allocated extents
+ *    - Extents going to be deleted (in this trans)
+ *    - Extents whose owner is going to be modified
+ *
+ *    This is the main part that affects whether qgroup numbers will stay
+ *    consistent.
+ *    Btrfs qgroup can trace clean extents without causing any problem,
+ *    but it consumes extra CPU time, so it should be avoided if possible.
+ *
+ * 3) Account
+ *    Btrfs qgroup updates its numbers based on the dirty extents traced
+ *    in the previous step.
+ *
+ *    Normally at qgroup rescan and transaction commit time.
+ */
+
+/*
  * Record a dirty extent, and info qgroup to update quota on it
  * TODO: Use kmem cache to alloc it.
  */
@@ -65,8 +93,8 @@ struct btrfs_delayed_extent_op;
 int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
 					 struct btrfs_fs_info *fs_info);
 /*
- * Insert one dirty extent record into @delayed_refs, informing qgroup to
- * account that extent at commit trans time.
+ * Inform qgroup to trace one dirty extent; its info is recorded in @record
+ * so qgroup can account it at commit trans time.
 *
 * No lock version, caller must acquire delayed ref lock and allocate memory.
 *
@@ -74,14 +102,15 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
 * Return >0 for existing record, caller can free @record safely.
 * Error is not possible
 */
-int btrfs_qgroup_insert_dirty_extent_nolock(
+int btrfs_qgroup_trace_extent_nolock(
 		struct btrfs_fs_info *fs_info,
 		struct btrfs_delayed_ref_root *delayed_refs,
 		struct btrfs_qgroup_extent_record *record);
 
 /*
- * Insert one dirty extent record into @delayed_refs, informing qgroup to
- * account that extent at commit trans time.
+ * Inform qgroup to trace one dirty extent, specified by @bytenr and
+ * @num_bytes,
+ * so qgroup can account it at commit trans time.
 *
 * Better encapsulated version.
 *
@@ -89,10 +118,33 @@ int btrfs_qgroup_insert_dirty_extent_nolock(
 * Return <0 for error, like memory allocation failure or invalid parameter
 * (NULL trans)
 */
-int btrfs_qgroup_insert_dirty_extent(struct btrfs_trans_handle *trans,
+int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans,
 		struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes,
 		gfp_t gfp_flag);
+/*
+ * Inform qgroup to trace all the data extent items in leaf @eb.
+ *
+ * Return 0 for success
+ * Return <0 for error (ENOMEM)
+ */
+int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
+				  struct btrfs_fs_info *fs_info,
+				  struct extent_buffer *eb);
+/*
+ * Inform qgroup to trace a whole subtree, including all its child tree
+ * blocks and data.
+ * The root tree block is specified by @root_eb.
+ *
+ * Normally used by relocation (tree block swap) and subvolume deletion.
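As a rough illustration of the trace-then-account split described in the comment above, here is a minimal user-space C model; the record table, the dedup rule, and the commit-time pass are hypothetical stand-ins for the kernel's delayed-ref list and transaction commit, none of this is btrfs API:

#include <stdint.h>
#include <stdio.h>

/* Toy dirty-extent record, keyed by bytenr like the kernel's struct. */
struct record { uint64_t bytenr; uint64_t num_bytes; };

static struct record traced[64];
static int nr_traced;

/* "Trace": remember a dirty extent; return 1 if it was already traced
 * (mirrors the "Return >0 for existing record" convention above). */
static int trace_extent(uint64_t bytenr, uint64_t num_bytes)
{
	int i;

	for (i = 0; i < nr_traced; i++)
		if (traced[i].bytenr == bytenr)
			return 1;
	traced[nr_traced].bytenr = bytenr;
	traced[nr_traced].num_bytes = num_bytes;
	nr_traced++;
	return 0;
}

/* "Account": consume the traced records once, at "commit time". */
static uint64_t account_all(void)
{
	uint64_t total = 0;

	while (nr_traced > 0)
		total += traced[--nr_traced].num_bytes;
	return total;
}

int main(void)
{
	trace_extent(4096, 4096);
	trace_extent(4096, 4096);	/* re-tracing a traced extent is harmless */
	trace_extent(8192, 16384);
	printf("accounted %llu bytes\n", (unsigned long long)account_all());
	return 0;
}

Tracing is deliberately idempotent, which is why the comment can say tracing a clean extent only costs CPU time: a duplicate record is detected and dropped rather than double-counted.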
+ * + * Return 0 for success + * Return <0 for error(ENOMEM or tree search error) + */ +int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *root_eb, + u64 root_gen, int root_level); int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index d016d4a79864..d2a9a1ee5361 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -969,8 +969,9 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) * allocation and initial setup for the btrfs_raid_bio. Not * this does not allocate any pages for rbio->pages. */ -static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, - struct btrfs_bio *bbio, u64 stripe_len) +static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, + struct btrfs_bio *bbio, + u64 stripe_len) { struct btrfs_raid_bio *rbio; int nr_data = 0; @@ -991,7 +992,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, INIT_LIST_HEAD(&rbio->stripe_cache); INIT_LIST_HEAD(&rbio->hash_list); rbio->bbio = bbio; - rbio->fs_info = root->fs_info; + rbio->fs_info = fs_info; rbio->stripe_len = stripe_len; rbio->nr_pages = num_pages; rbio->real_stripes = real_stripes; @@ -1144,10 +1145,10 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) static void index_rbio_pages(struct btrfs_raid_bio *rbio) { struct bio *bio; + struct bio_vec *bvec; u64 start; unsigned long stripe_offset; unsigned long page_index; - struct page *p; int i; spin_lock_irq(&rbio->bio_list_lock); @@ -1156,10 +1157,8 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) stripe_offset = start - rbio->bbio->raid_map[0]; page_index = stripe_offset >> PAGE_SHIFT; - for (i = 0; i < bio->bi_vcnt; i++) { - p = bio->bi_io_vec[i].bv_page; - rbio->bio_pages[page_index + i] = p; - } + bio_for_each_segment_all(bvec, bio, i) + rbio->bio_pages[page_index + i] = bvec->bv_page; } spin_unlock_irq(&rbio->bio_list_lock); } @@ -1433,13 +1432,11 @@ static int fail_bio_stripe(struct btrfs_raid_bio *rbio, */ static void set_bio_pages_uptodate(struct bio *bio) { + struct bio_vec *bvec; int i; - struct page *p; - for (i = 0; i < bio->bi_vcnt; i++) { - p = bio->bi_io_vec[i].bv_page; - SetPageUptodate(p); - } + bio_for_each_segment_all(bvec, bio, i) + SetPageUptodate(bvec->bv_page); } /* @@ -1482,11 +1479,8 @@ cleanup: static void async_rmw_stripe(struct btrfs_raid_bio *rbio) { - btrfs_init_work(&rbio->work, btrfs_rmw_helper, - rmw_work, NULL, NULL); - - btrfs_queue_work(rbio->fs_info->rmw_workers, - &rbio->work); + btrfs_init_work(&rbio->work, btrfs_rmw_helper, rmw_work, NULL, NULL); + btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); } static void async_read_rebuild(struct btrfs_raid_bio *rbio) @@ -1494,8 +1488,7 @@ static void async_read_rebuild(struct btrfs_raid_bio *rbio) btrfs_init_work(&rbio->work, btrfs_rmw_helper, read_rebuild_work, NULL, NULL); - btrfs_queue_work(rbio->fs_info->rmw_workers, - &rbio->work); + btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); } /* @@ -1577,8 +1570,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) bio->bi_end_io = raid_rmw_end_io; bio_set_op_attrs(bio, REQ_OP_READ, 0); - btrfs_bio_wq_end_io(rbio->fs_info, bio, - BTRFS_WQ_ENDIO_RAID56); + btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); submit_bio(bio); } @@ -1743,7 +1735,7 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) /* * our main entry point for writes 
from the rest of the FS. */ -int raid56_parity_write(struct btrfs_root *root, struct bio *bio, +int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio, struct btrfs_bio *bbio, u64 stripe_len) { struct btrfs_raid_bio *rbio; @@ -1751,7 +1743,7 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio, struct blk_plug_cb *cb; int ret; - rbio = alloc_rbio(root, bbio, stripe_len); + rbio = alloc_rbio(fs_info, bbio, stripe_len); if (IS_ERR(rbio)) { btrfs_put_bbio(bbio); return PTR_ERR(rbio); @@ -1760,7 +1752,7 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio, rbio->bio_list_bytes = bio->bi_iter.bi_size; rbio->operation = BTRFS_RBIO_WRITE; - btrfs_bio_counter_inc_noblocked(root->fs_info); + btrfs_bio_counter_inc_noblocked(fs_info); rbio->generic_bio_cnt = 1; /* @@ -1770,16 +1762,15 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio, if (rbio_is_full(rbio)) { ret = full_stripe_write(rbio); if (ret) - btrfs_bio_counter_dec(root->fs_info); + btrfs_bio_counter_dec(fs_info); return ret; } - cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, - sizeof(*plug)); + cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug)); if (cb) { plug = container_of(cb, struct btrfs_plug_cb, cb); if (!plug->info) { - plug->info = root->fs_info; + plug->info = fs_info; INIT_LIST_HEAD(&plug->rbio_list); } list_add_tail(&rbio->plug_list, &plug->rbio_list); @@ -1787,7 +1778,7 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio, } else { ret = __raid56_parity_write(rbio); if (ret) - btrfs_bio_counter_dec(root->fs_info); + btrfs_bio_counter_dec(fs_info); } return ret; } @@ -2102,8 +2093,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) bio->bi_end_io = raid_recover_end_io; bio_set_op_attrs(bio, REQ_OP_READ, 0); - btrfs_bio_wq_end_io(rbio->fs_info, bio, - BTRFS_WQ_ENDIO_RAID56); + btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); submit_bio(bio); } @@ -2123,14 +2113,14 @@ cleanup: * so we assume the bio they send down corresponds to a failed part * of the drive. 
*/ -int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, +int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio, struct btrfs_bio *bbio, u64 stripe_len, int mirror_num, int generic_io) { struct btrfs_raid_bio *rbio; int ret; - rbio = alloc_rbio(root, bbio, stripe_len); + rbio = alloc_rbio(fs_info, bbio, stripe_len); if (IS_ERR(rbio)) { if (generic_io) btrfs_put_bbio(bbio); @@ -2143,7 +2133,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { - btrfs_warn(root->fs_info, + btrfs_warn(fs_info, "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)", __func__, (u64)bio->bi_iter.bi_sector << 9, (u64)bio->bi_iter.bi_size, bbio->map_type); @@ -2154,7 +2144,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, } if (generic_io) { - btrfs_bio_counter_inc_noblocked(root->fs_info); + btrfs_bio_counter_inc_noblocked(fs_info); rbio->generic_bio_cnt = 1; } else { btrfs_get_bbio(bbio); @@ -2212,7 +2202,7 @@ static void read_rebuild_work(struct btrfs_work *work) */ struct btrfs_raid_bio * -raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, +raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, struct btrfs_bio *bbio, u64 stripe_len, struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors) @@ -2220,7 +2210,7 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, struct btrfs_raid_bio *rbio; int i; - rbio = alloc_rbio(root, bbio, stripe_len); + rbio = alloc_rbio(fs_info, bbio, stripe_len); if (IS_ERR(rbio)) return NULL; bio_list_add(&rbio->bio_list, bio); @@ -2239,7 +2229,7 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, } /* Now we just support the sectorsize equals to page size */ - ASSERT(root->sectorsize == PAGE_SIZE); + ASSERT(fs_info->sectorsize == PAGE_SIZE); ASSERT(rbio->stripe_npages == stripe_nsectors); bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); @@ -2621,8 +2611,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) bio->bi_end_io = raid56_parity_scrub_end_io; bio_set_op_attrs(bio, REQ_OP_READ, 0); - btrfs_bio_wq_end_io(rbio->fs_info, bio, - BTRFS_WQ_ENDIO_RAID56); + btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); submit_bio(bio); } @@ -2650,8 +2639,7 @@ static void async_scrub_parity(struct btrfs_raid_bio *rbio) btrfs_init_work(&rbio->work, btrfs_rmw_helper, scrub_parity_work, NULL, NULL); - btrfs_queue_work(rbio->fs_info->rmw_workers, - &rbio->work); + btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); } void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) @@ -2663,12 +2651,12 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) /* The following code is used for dev replace of a missing RAID 5/6 device. 
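The raid56 signature changes in the hunks above all follow the refactor this merge applies across btrfs: functions that only ever used root->fs_info now take the struct btrfs_fs_info * directly. A generic sketch of that narrowing, using hypothetical stand-in types rather than the kernel's:

#include <stdio.h>

struct fs_info { unsigned int sectorsize; };
struct root { struct fs_info *fs_info; /* ...many unrelated fields... */ };

/* Before: accepts the wide type, touches only one member of it. */
static unsigned int sectorsize_old(const struct root *root)
{
	return root->fs_info->sectorsize;
}

/* After: the parameter now states the real dependency. */
static unsigned int sectorsize_new(const struct fs_info *fs_info)
{
	return fs_info->sectorsize;
}

int main(void)
{
	struct fs_info fi = { .sectorsize = 4096 };
	struct root r = { .fs_info = &fi };

	/* Call sites change from f(root) to f(root->fs_info). */
	printf("%u %u\n", sectorsize_old(&r), sectorsize_new(r.fs_info));
	return 0;
}

Besides documenting the dependency, this removes a pointer hop per access and lets callers that never held a root, such as the scrub and reada paths later in this diff, call these helpers directly.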
*/ struct btrfs_raid_bio * -raid56_alloc_missing_rbio(struct btrfs_root *root, struct bio *bio, +raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, struct btrfs_bio *bbio, u64 length) { struct btrfs_raid_bio *rbio; - rbio = alloc_rbio(root, bbio, length); + rbio = alloc_rbio(fs_info, bbio, length); if (IS_ERR(rbio)) return NULL; diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 8b694699d502..4ee4fe346838 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -42,24 +42,24 @@ static inline int nr_data_stripes(struct map_lookup *map) struct btrfs_raid_bio; struct btrfs_device; -int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, +int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio, struct btrfs_bio *bbio, u64 stripe_len, int mirror_num, int generic_io); -int raid56_parity_write(struct btrfs_root *root, struct bio *bio, +int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio, struct btrfs_bio *bbio, u64 stripe_len); void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, u64 logical); struct btrfs_raid_bio * -raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, +raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, struct btrfs_bio *bbio, u64 stripe_len, struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors); void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); struct btrfs_raid_bio * -raid56_alloc_missing_rbio(struct btrfs_root *root, struct bio *bio, +raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, struct btrfs_bio *bbio, u64 length); void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 75bab76739be..e88bca87f5d2 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -107,18 +107,14 @@ static int reada_add_block(struct reada_control *rc, u64 logical, /* in case of err, eb might be NULL */ static void __readahead_hook(struct btrfs_fs_info *fs_info, struct reada_extent *re, struct extent_buffer *eb, - u64 start, int err) + int err) { - int level = 0; int nritems; int i; u64 bytenr; u64 generation; struct list_head list; - if (eb) - level = btrfs_header_level(eb); - spin_lock(&re->lock); /* * just take the full list from the extent. afterwards we @@ -143,7 +139,7 @@ static void __readahead_hook(struct btrfs_fs_info *fs_info, * trigger more readahead depending from the content, e.g. * fetch the checksums for the extents in the leaf. 
*/ - if (!level) + if (!btrfs_header_level(eb)) goto cleanup; nritems = btrfs_header_nritems(eb); @@ -213,12 +209,8 @@ cleanup: return; } -/* - * start is passed separately in case eb in NULL, which may be the case with - * failed I/O - */ int btree_readahead_hook(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, u64 start, int err) + struct extent_buffer *eb, int err) { int ret = 0; struct reada_extent *re; @@ -226,7 +218,7 @@ int btree_readahead_hook(struct btrfs_fs_info *fs_info, /* find extent */ spin_lock(&fs_info->reada_lock); re = radix_tree_lookup(&fs_info->reada_tree, - start >> PAGE_SHIFT); + eb->start >> PAGE_SHIFT); if (re) re->refcnt++; spin_unlock(&fs_info->reada_lock); @@ -235,7 +227,7 @@ int btree_readahead_hook(struct btrfs_fs_info *fs_info, goto start_machine; } - __readahead_hook(fs_info, re, eb, start, err); + __readahead_hook(fs_info, re, eb, err); reada_extent_put(fs_info, re); /* our ref */ start_machine: @@ -311,14 +303,13 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, return zone; } -static struct reada_extent *reada_find_extent(struct btrfs_root *root, +static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, u64 logical, struct btrfs_key *top) { int ret; struct reada_extent *re = NULL; struct reada_extent *re_exist = NULL; - struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_bio *bbio = NULL; struct btrfs_device *dev; struct btrfs_device *prev_dev; @@ -343,7 +334,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, if (!re) return NULL; - blocksize = root->nodesize; + blocksize = fs_info->nodesize; re->logical = logical; re->top = *top; INIT_LIST_HEAD(&re->extctl); @@ -354,13 +345,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, * map block */ length = blocksize; - ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, &length, - &bbio, 0); + ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, + &length, &bbio, 0); if (ret || !bbio || length < blocksize) goto error; if (bbio->num_stripes > BTRFS_MAX_MIRRORS) { - btrfs_err(root->fs_info, + btrfs_err(fs_info, "readahead: more than %d copies not supported", BTRFS_MAX_MIRRORS); goto error; @@ -401,7 +392,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, ret = radix_tree_insert(&fs_info->reada_tree, index, re); if (ret == -EEXIST) { re_exist = radix_tree_lookup(&fs_info->reada_tree, index); - BUG_ON(!re_exist); re_exist->refcnt++; spin_unlock(&fs_info->reada_lock); btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); @@ -448,7 +438,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, /* ignore whether the entry was inserted */ radix_tree_delete(&dev->reada_extents, index); } - BUG_ON(fs_info == NULL); radix_tree_delete(&fs_info->reada_tree, index); spin_unlock(&fs_info->reada_lock); btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); @@ -554,17 +543,18 @@ static void reada_control_release(struct kref *kref) static int reada_add_block(struct reada_control *rc, u64 logical, struct btrfs_key *top, u64 generation) { - struct btrfs_root *root = rc->root; + struct btrfs_fs_info *fs_info = rc->fs_info; struct reada_extent *re; struct reada_extctl *rec; - re = reada_find_extent(root, logical, top); /* takes one ref */ + /* takes one ref */ + re = reada_find_extent(fs_info, logical, top); if (!re) return -1; rec = kzalloc(sizeof(*rec), GFP_KERNEL); if (!rec) { - reada_extent_put(root->fs_info, re); + reada_extent_put(fs_info, re); return 
-ENOMEM; } @@ -688,7 +678,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, spin_unlock(&fs_info->reada_lock); return 0; } - dev->reada_next = re->logical + fs_info->tree_root->nodesize; + dev->reada_next = re->logical + fs_info->nodesize; re->refcnt++; spin_unlock(&fs_info->reada_lock); @@ -714,12 +704,11 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, logical = re->logical; atomic_inc(&dev->reada_in_flight); - ret = reada_tree_block_flagged(fs_info->extent_root, logical, - mirror_num, &eb); + ret = reada_tree_block_flagged(fs_info, logical, mirror_num, &eb); if (ret) - __readahead_hook(fs_info, re, NULL, logical, ret); + __readahead_hook(fs_info, re, NULL, ret); else if (eb) - __readahead_hook(fs_info, re, eb, eb->start, ret); + __readahead_hook(fs_info, re, eb, ret); if (eb) free_extent_buffer(eb); @@ -852,7 +841,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) if (ret == 0) break; pr_debug(" re: logical %llu size %u empty %d scheduled %d", - re->logical, fs_info->tree_root->nodesize, + re->logical, fs_info->nodesize, list_empty(&re->extctl), re->scheduled); for (i = 0; i < re->nzones; ++i) { @@ -885,7 +874,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all) continue; } pr_debug("re: logical %llu size %u list empty %d scheduled %d", - re->logical, fs_info->tree_root->nodesize, + re->logical, fs_info->nodesize, list_empty(&re->extctl), re->scheduled); for (i = 0; i < re->nzones; ++i) { pr_cont(" zone %llu-%llu devs", @@ -924,7 +913,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, if (!rc) return ERR_PTR(-ENOMEM); - rc->root = root; + rc->fs_info = root->fs_info; rc->key_start = *key_start; rc->key_end = *key_end; atomic_set(&rc->elems, 0); @@ -952,18 +941,17 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, int btrfs_reada_wait(void *handle) { struct reada_control *rc = handle; - struct btrfs_fs_info *fs_info = rc->root->fs_info; + struct btrfs_fs_info *fs_info = rc->fs_info; while (atomic_read(&rc->elems)) { if (!atomic_read(&fs_info->reada_works_cnt)) reada_start_machine(fs_info); wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0, 5 * HZ); - dump_devs(rc->root->fs_info, - atomic_read(&rc->elems) < 10 ? 1 : 0); + dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0); } - dump_devs(rc->root->fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0); + dump_devs(fs_info, atomic_read(&rc->elems) < 10 ? 
1 : 0); kref_put(&rc->refcnt, reada_control_release); @@ -973,7 +961,7 @@ int btrfs_reada_wait(void *handle) int btrfs_reada_wait(void *handle) { struct reada_control *rc = handle; - struct btrfs_fs_info *fs_info = rc->root->fs_info; + struct btrfs_fs_info *fs_info = rc->fs_info; while (atomic_read(&rc->elems)) { if (!atomic_read(&fs_info->reada_works_cnt)) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index c4af0cdb783d..379711048fb0 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1288,9 +1288,10 @@ fail: */ static int __must_check __add_reloc_root(struct btrfs_root *root) { + struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *rb_node; struct mapping_node *node; - struct reloc_control *rc = root->fs_info->reloc_ctl; + struct reloc_control *rc = fs_info->reloc_ctl; node = kmalloc(sizeof(*node), GFP_NOFS); if (!node) @@ -1304,7 +1305,7 @@ static int __must_check __add_reloc_root(struct btrfs_root *root) node->bytenr, &node->rb_node); spin_unlock(&rc->reloc_root_tree.lock); if (rb_node) { - btrfs_panic(root->fs_info, -EEXIST, + btrfs_panic(fs_info, -EEXIST, "Duplicate root found for start=%llu while inserting into relocation tree", node->bytenr); kfree(node); @@ -1321,9 +1322,10 @@ static int __must_check __add_reloc_root(struct btrfs_root *root) */ static void __del_reloc_root(struct btrfs_root *root) { + struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *rb_node; struct mapping_node *node = NULL; - struct reloc_control *rc = root->fs_info->reloc_ctl; + struct reloc_control *rc = fs_info->reloc_ctl; spin_lock(&rc->reloc_root_tree.lock); rb_node = tree_search(&rc->reloc_root_tree.rb_root, @@ -1338,9 +1340,9 @@ static void __del_reloc_root(struct btrfs_root *root) return; BUG_ON((struct btrfs_root *)node->data != root); - spin_lock(&root->fs_info->trans_lock); + spin_lock(&fs_info->trans_lock); list_del_init(&root->root_list); - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); kfree(node); } @@ -1350,9 +1352,10 @@ static void __del_reloc_root(struct btrfs_root *root) */ static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr) { + struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *rb_node; struct mapping_node *node = NULL; - struct reloc_control *rc = root->fs_info->reloc_ctl; + struct reloc_control *rc = fs_info->reloc_ctl; spin_lock(&rc->reloc_root_tree.lock); rb_node = tree_search(&rc->reloc_root_tree.rb_root, @@ -1380,11 +1383,11 @@ static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr) static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *reloc_root; struct extent_buffer *eb; struct btrfs_root_item *root_item; struct btrfs_key root_key; - u64 last_snap = 0; int ret; root_item = kmalloc(sizeof(*root_item), GFP_NOFS); @@ -1395,14 +1398,22 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, root_key.offset = objectid; if (root->root_key.objectid == objectid) { + u64 commit_root_gen; + /* called by btrfs_init_reloc_root */ ret = btrfs_copy_root(trans, root, root->commit_root, &eb, BTRFS_TREE_RELOC_OBJECTID); BUG_ON(ret); - - last_snap = btrfs_root_last_snapshot(&root->root_item); - btrfs_set_root_last_snapshot(&root->root_item, - trans->transid - 1); + /* + * Set the last_snapshot field to the generation of the commit + * root - like this ctree.c:btrfs_block_can_be_shared() behaves + * correctly (returns 
true) when the relocation root is created + * either inside the critical section of a transaction commit + * (through transaction.c:qgroup_account_snapshot()) and when + * it's created before the transaction commit is started. + */ + commit_root_gen = btrfs_header_generation(root->commit_root); + btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen); } else { /* * called by btrfs_reloc_post_snapshot_hook. @@ -1426,23 +1437,17 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); root_item->drop_level = 0; - /* - * abuse rtransid, it is safe because it is impossible to - * receive data into a relocation tree. - */ - btrfs_set_root_rtransid(root_item, last_snap); - btrfs_set_root_otransid(root_item, trans->transid); } btrfs_tree_unlock(eb); free_extent_buffer(eb); - ret = btrfs_insert_root(trans, root->fs_info->tree_root, + ret = btrfs_insert_root(trans, fs_info->tree_root, &root_key, root_item); BUG_ON(ret); kfree(root_item); - reloc_root = btrfs_read_fs_root(root->fs_info->tree_root, &root_key); + reloc_root = btrfs_read_fs_root(fs_info->tree_root, &root_key); BUG_ON(IS_ERR(reloc_root)); reloc_root->last_trans = trans->transid; return reloc_root; @@ -1455,8 +1460,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *reloc_root; - struct reloc_control *rc = root->fs_info->reloc_ctl; + struct reloc_control *rc = fs_info->reloc_ctl; struct btrfs_block_rsv *rsv; int clear_rsv = 0; int ret; @@ -1492,6 +1498,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *reloc_root; struct btrfs_root_item *root_item; int ret; @@ -1502,7 +1509,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, reloc_root = root->reloc_root; root_item = &reloc_root->root_item; - if (root->fs_info->reloc_ctl->merge_reloc_tree && + if (fs_info->reloc_ctl->merge_reloc_tree && btrfs_root_refs(root_item) == 0) { root->reloc_root = NULL; __del_reloc_root(reloc_root); @@ -1514,7 +1521,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, reloc_root->commit_root = btrfs_root_node(reloc_root); } - ret = btrfs_update_root(trans, root->fs_info->tree_root, + ret = btrfs_update_root(trans, fs_info->tree_root, &reloc_root->root_key, root_item); BUG_ON(ret); @@ -1642,6 +1649,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *leaf) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; struct btrfs_file_extent_item *fi; struct inode *inode = NULL; @@ -1698,8 +1706,8 @@ int replace_file_extents(struct btrfs_trans_handle *trans, end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); WARN_ON(!IS_ALIGNED(key.offset, - root->sectorsize)); - WARN_ON(!IS_ALIGNED(end, root->sectorsize)); + fs_info->sectorsize)); + WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); end--; ret = try_lock_extent(&BTRFS_I(inode)->io_tree, key.offset, end); @@ -1727,7 +1735,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, dirty = 1; key.offset -= btrfs_file_extent_offset(leaf, fi); - ret = btrfs_inc_extent_ref(trans, root, new_bytenr, + ret = btrfs_inc_extent_ref(trans, fs_info, new_bytenr, 
num_bytes, parent, btrfs_header_owner(leaf), key.objectid, key.offset); @@ -1736,7 +1744,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, break; } - ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + ret = btrfs_free_extent(trans, fs_info, bytenr, num_bytes, parent, btrfs_header_owner(leaf), key.objectid, key.offset); if (ret) { @@ -1777,6 +1785,7 @@ int replace_path(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_key *next_key, int lowest_level, int max_level) { + struct btrfs_fs_info *fs_info = dest->fs_info; struct extent_buffer *eb; struct extent_buffer *parent; struct btrfs_key key; @@ -1834,7 +1843,7 @@ again: btrfs_node_key_to_cpu(parent, next_key, slot + 1); old_bytenr = btrfs_node_blockptr(parent, slot); - blocksize = dest->nodesize; + blocksize = fs_info->nodesize; old_ptr_gen = btrfs_node_ptr_generation(parent, slot); if (level <= max_level) { @@ -1860,7 +1869,7 @@ again: break; } - eb = read_tree_block(dest, old_bytenr, old_ptr_gen); + eb = read_tree_block(fs_info, old_bytenr, old_ptr_gen); if (IS_ERR(eb)) { ret = PTR_ERR(eb); break; @@ -1901,6 +1910,29 @@ again: BUG_ON(ret); /* + * Info qgroup to trace both subtrees. + * + * We must trace both trees. + * 1) Tree reloc subtree + * If not traced, we will leak data numbers + * 2) Fs subtree + * If not traced, we will double count old data + * and tree block numbers, if current trans doesn't free + * data reloc tree inode. + */ + ret = btrfs_qgroup_trace_subtree(trans, src, parent, + btrfs_header_generation(parent), + btrfs_header_level(parent)); + if (ret < 0) + break; + ret = btrfs_qgroup_trace_subtree(trans, dest, + path->nodes[level], + btrfs_header_generation(path->nodes[level]), + btrfs_header_level(path->nodes[level])); + if (ret < 0) + break; + + /* * swap blocks in fs tree and reloc tree. 
*/ btrfs_set_node_blockptr(parent, slot, new_bytenr); @@ -1913,21 +1945,21 @@ again: path->slots[level], old_ptr_gen); btrfs_mark_buffer_dirty(path->nodes[level]); - ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, - path->nodes[level]->start, + ret = btrfs_inc_extent_ref(trans, fs_info, old_bytenr, + blocksize, path->nodes[level]->start, src->root_key.objectid, level - 1, 0); BUG_ON(ret); - ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, - 0, dest->root_key.objectid, level - 1, - 0); + ret = btrfs_inc_extent_ref(trans, fs_info, new_bytenr, + blocksize, 0, dest->root_key.objectid, + level - 1, 0); BUG_ON(ret); - ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, + ret = btrfs_free_extent(trans, fs_info, new_bytenr, blocksize, path->nodes[level]->start, src->root_key.objectid, level - 1, 0); BUG_ON(ret); - ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, + ret = btrfs_free_extent(trans, fs_info, old_bytenr, blocksize, 0, dest->root_key.objectid, level - 1, 0); BUG_ON(ret); @@ -1986,6 +2018,7 @@ static noinline_for_stack int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, int *level) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *eb = NULL; int i; u64 bytenr; @@ -2016,7 +2049,7 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, } bytenr = btrfs_node_blockptr(eb, path->slots[i]); - eb = read_tree_block(root, bytenr, ptr_gen); + eb = read_tree_block(fs_info, bytenr, ptr_gen); if (IS_ERR(eb)) { return PTR_ERR(eb); } else if (!extent_buffer_uptodate(eb)) { @@ -2038,6 +2071,7 @@ static int invalidate_extent_cache(struct btrfs_root *root, struct btrfs_key *min_key, struct btrfs_key *max_key) { + struct btrfs_fs_info *fs_info = root->fs_info; struct inode *inode = NULL; u64 objectid; u64 start, end; @@ -2072,7 +2106,7 @@ static int invalidate_extent_cache(struct btrfs_root *root, start = 0; else { start = min_key->offset; - WARN_ON(!IS_ALIGNED(start, root->sectorsize)); + WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize)); } } else { start = 0; @@ -2087,7 +2121,7 @@ static int invalidate_extent_cache(struct btrfs_root *root, if (max_key->offset == 0) continue; end = max_key->offset; - WARN_ON(!IS_ALIGNED(end, root->sectorsize)); + WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); end--; } } else { @@ -2127,6 +2161,7 @@ static int find_next_key(struct btrfs_path *path, int level, static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, struct btrfs_root *root) { + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; LIST_HEAD(inode_list); struct btrfs_key key; struct btrfs_key next_key; @@ -2175,7 +2210,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, btrfs_unlock_up_safe(path, 0); } - min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; + min_reserved = fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; memset(&next_key, 0, sizeof(next_key)); while (1) { @@ -2236,10 +2271,10 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, path->slots[level]); root_item->drop_level = level; - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction_throttle(trans); trans = NULL; - btrfs_btree_balance_dirty(root); + btrfs_btree_balance_dirty(fs_info); if (replaced && rc->stage == UPDATE_DATA_PTRS) invalidate_extent_cache(root, &key, &next_key); @@ -2267,9 +2302,9 @@ out: } if (trans) - btrfs_end_transaction_throttle(trans, root); + btrfs_end_transaction_throttle(trans); - btrfs_btree_balance_dirty(root); + 
btrfs_btree_balance_dirty(fs_info); if (replaced && rc->stage == UPDATE_DATA_PTRS) invalidate_extent_cache(root, &key, &next_key); @@ -2281,16 +2316,17 @@ static noinline_for_stack int prepare_to_merge(struct reloc_control *rc, int err) { struct btrfs_root *root = rc->extent_root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *reloc_root; struct btrfs_trans_handle *trans; LIST_HEAD(reloc_roots); u64 num_bytes = 0; int ret; - mutex_lock(&root->fs_info->reloc_mutex); - rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; + mutex_lock(&fs_info->reloc_mutex); + rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; rc->merging_rsv_size += rc->nodes_relocated * 2; - mutex_unlock(&root->fs_info->reloc_mutex); + mutex_unlock(&fs_info->reloc_mutex); again: if (!err) { @@ -2304,16 +2340,16 @@ again: trans = btrfs_join_transaction(rc->extent_root); if (IS_ERR(trans)) { if (!err) - btrfs_block_rsv_release(rc->extent_root, - rc->block_rsv, num_bytes); + btrfs_block_rsv_release(fs_info, rc->block_rsv, + num_bytes); return PTR_ERR(trans); } if (!err) { if (num_bytes != rc->merging_rsv_size) { - btrfs_end_transaction(trans, rc->extent_root); - btrfs_block_rsv_release(rc->extent_root, - rc->block_rsv, num_bytes); + btrfs_end_transaction(trans); + btrfs_block_rsv_release(fs_info, rc->block_rsv, + num_bytes); goto again; } } @@ -2325,8 +2361,7 @@ again: struct btrfs_root, root_list); list_del_init(&reloc_root->root_list); - root = read_fs_root(reloc_root->fs_info, - reloc_root->root_key.offset); + root = read_fs_root(fs_info, reloc_root->root_key.offset); BUG_ON(IS_ERR(root)); BUG_ON(root->reloc_root != reloc_root); @@ -2344,9 +2379,9 @@ again: list_splice(&reloc_roots, &rc->reloc_roots); if (!err) - btrfs_commit_transaction(trans, rc->extent_root); + btrfs_commit_transaction(trans); else - btrfs_end_transaction(trans, rc->extent_root); + btrfs_end_transaction(trans); return err; } @@ -2369,11 +2404,9 @@ void free_reloc_roots(struct list_head *list) static noinline_for_stack void merge_reloc_roots(struct reloc_control *rc) { + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; struct btrfs_root *root; struct btrfs_root *reloc_root; - u64 last_snap; - u64 otransid; - u64 objectid; LIST_HEAD(reloc_roots); int found = 0; int ret = 0; @@ -2386,9 +2419,9 @@ again: * adding their roots to the list while we are * doing this splice */ - mutex_lock(&root->fs_info->reloc_mutex); + mutex_lock(&fs_info->reloc_mutex); list_splice_init(&rc->reloc_roots, &reloc_roots); - mutex_unlock(&root->fs_info->reloc_mutex); + mutex_unlock(&fs_info->reloc_mutex); while (!list_empty(&reloc_roots)) { found = 1; @@ -2396,7 +2429,7 @@ again: struct btrfs_root, root_list); if (btrfs_root_refs(&reloc_root->root_item) > 0) { - root = read_fs_root(reloc_root->fs_info, + root = read_fs_root(fs_info, reloc_root->root_key.offset); BUG_ON(IS_ERR(root)); BUG_ON(root->reloc_root != reloc_root); @@ -2412,14 +2445,6 @@ again: list_del_init(&reloc_root->root_list); } - /* - * we keep the old last snapshot transid in rtranid when we - * created the relocation tree. 
- */ - last_snap = btrfs_root_rtransid(&reloc_root->root_item); - otransid = btrfs_root_otransid(&reloc_root->root_item); - objectid = reloc_root->root_key.offset; - ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1); if (ret < 0) { if (list_empty(&reloc_root->root_list)) @@ -2435,14 +2460,14 @@ again: } out: if (ret) { - btrfs_handle_fs_error(root->fs_info, ret, NULL); + btrfs_handle_fs_error(fs_info, ret, NULL); if (!list_empty(&reloc_roots)) free_reloc_roots(&reloc_roots); /* new reloc root may be added */ - mutex_lock(&root->fs_info->reloc_mutex); + mutex_lock(&fs_info->reloc_mutex); list_splice_init(&rc->reloc_roots, &reloc_roots); - mutex_unlock(&root->fs_info->reloc_mutex); + mutex_unlock(&fs_info->reloc_mutex); if (!list_empty(&reloc_roots)) free_reloc_roots(&reloc_roots); } @@ -2464,12 +2489,13 @@ static void free_block_list(struct rb_root *blocks) static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *reloc_root) { + struct btrfs_fs_info *fs_info = reloc_root->fs_info; struct btrfs_root *root; if (reloc_root->last_trans == trans->transid) return 0; - root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset); + root = read_fs_root(fs_info, reloc_root->root_key.offset); BUG_ON(IS_ERR(root)); BUG_ON(root->reloc_root != reloc_root); @@ -2579,6 +2605,7 @@ static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc, struct backref_node *node, int reserve) { + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; struct backref_node *next = node; struct backref_edge *edge; struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; @@ -2593,7 +2620,7 @@ u64 calcu_metadata_size(struct reloc_control *rc, if (next->processed && (reserve || next != node)) break; - num_bytes += rc->extent_root->nodesize; + num_bytes += fs_info->nodesize; if (list_empty(&next->upper)) break; @@ -2613,6 +2640,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, struct backref_node *node) { struct btrfs_root *root = rc->extent_root; + struct btrfs_fs_info *fs_info = root->fs_info; u64 num_bytes; int ret; u64 tmp; @@ -2630,7 +2658,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes, BTRFS_RESERVE_FLUSH_LIMIT); if (ret) { - tmp = rc->extent_root->nodesize * RELOCATION_RESERVED_NODES; + tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES; while (tmp <= rc->reserved_bytes) tmp <<= 1; /* @@ -2640,8 +2668,8 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans, * space for relocation and we will return eailer in * enospc case. 
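The refill-failure path here sizes the next reservation geometrically: the loop just above doubles tmp until it exceeds rc->reserved_bytes, and the assignment just below adds one more reserved-nodes worth of headroom before returning -EAGAIN. A small user-space model of that sizing rule; the node size and node count are illustrative constants, not the kernel's tuning:

#include <stdint.h>
#include <stdio.h>

#define NODESIZE	16384ULL	/* assumed nodesize */
#define RESERVED_NODES	64ULL		/* stands in for RELOCATION_RESERVED_NODES */

/* Pick the next block_rsv size after a refill failure. */
static uint64_t next_rsv_size(uint64_t reserved_bytes)
{
	uint64_t tmp = NODESIZE * RESERVED_NODES;

	/* Double until we outgrow what is already reserved... */
	while (tmp <= reserved_bytes)
		tmp <<= 1;
	/* ...then keep headroom for another batch of reserved nodes. */
	return tmp + NODESIZE * RESERVED_NODES;
}

int main(void)
{
	/* Repeated failures at larger reserved_bytes pick larger targets. */
	printf("%llu\n", (unsigned long long)next_rsv_size(0));		 /* 2 MiB */
	printf("%llu\n", (unsigned long long)next_rsv_size(4ULL << 20));  /* 9 MiB */
	printf("%llu\n", (unsigned long long)next_rsv_size(64ULL << 20)); /* 129 MiB */
	return 0;
}

Geometric growth keeps the number of -EAGAIN round trips logarithmic in the amount of metadata the relocation ends up touching.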
*/ - rc->block_rsv->size = tmp + rc->extent_root->nodesize * - RELOCATION_RESERVED_NODES; + rc->block_rsv->size = tmp + fs_info->nodesize * + RELOCATION_RESERVED_NODES; return -EAGAIN; } @@ -2661,6 +2689,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, struct btrfs_key *key, struct btrfs_path *path, int lowest) { + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; struct backref_node *upper; struct backref_edge *edge; struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; @@ -2741,9 +2770,9 @@ static int do_relocation(struct btrfs_trans_handle *trans, goto next; } - blocksize = root->nodesize; + blocksize = root->fs_info->nodesize; generation = btrfs_node_ptr_generation(upper->eb, slot); - eb = read_tree_block(root, bytenr, generation); + eb = read_tree_block(fs_info, bytenr, generation); if (IS_ERR(eb)) { err = PTR_ERR(eb); goto next; @@ -2772,7 +2801,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, trans->transid); btrfs_mark_buffer_dirty(upper->eb); - ret = btrfs_inc_extent_ref(trans, root, + ret = btrfs_inc_extent_ref(trans, root->fs_info, node->eb->start, blocksize, upper->eb->start, btrfs_header_owner(upper->eb), @@ -2854,7 +2883,7 @@ static void __mark_block_processed(struct reloc_control *rc, u32 blocksize; if (node->level == 0 || in_block_group(node->bytenr, rc->block_group)) { - blocksize = rc->extent_root->nodesize; + blocksize = rc->extent_root->fs_info->nodesize; mark_block_processed(rc, node->bytenr, blocksize); } node->processed = 1; @@ -2894,7 +2923,7 @@ static void update_processed_blocks(struct reloc_control *rc, static int tree_block_processed(u64 bytenr, struct reloc_control *rc) { - u32 blocksize = rc->extent_root->nodesize; + u32 blocksize = rc->extent_root->fs_info->nodesize; if (test_range_bit(&rc->processed_blocks, bytenr, bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) @@ -2902,14 +2931,13 @@ static int tree_block_processed(u64 bytenr, struct reloc_control *rc) return 0; } -static int get_tree_block_key(struct reloc_control *rc, +static int get_tree_block_key(struct btrfs_fs_info *fs_info, struct tree_block *block) { struct extent_buffer *eb; BUG_ON(block->key_ready); - eb = read_tree_block(rc->extent_root, block->bytenr, - block->key.offset); + eb = read_tree_block(fs_info, block->bytenr, block->key.offset); if (IS_ERR(eb)) { return PTR_ERR(eb); } else if (!extent_buffer_uptodate(eb)) { @@ -2988,6 +3016,7 @@ static noinline_for_stack int relocate_tree_blocks(struct btrfs_trans_handle *trans, struct reloc_control *rc, struct rb_root *blocks) { + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; struct backref_node *node; struct btrfs_path *path; struct tree_block *block; @@ -3005,7 +3034,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, while (rb_node) { block = rb_entry(rb_node, struct tree_block, rb_node); if (!block->key_ready) - readahead_tree_block(rc->extent_root, block->bytenr); + readahead_tree_block(fs_info, block->bytenr); rb_node = rb_next(rb_node); } @@ -3013,7 +3042,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, while (rb_node) { block = rb_entry(rb_node, struct tree_block, rb_node); if (!block->key_ready) { - err = get_tree_block_key(rc, block); + err = get_tree_block_key(fs_info, block); if (err) goto out_free_path; } @@ -3107,7 +3136,7 @@ static noinline_for_stack int setup_extent_mapping(struct inode *inode, u64 start, u64 end, u64 block_start) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map_tree 
*em_tree = &BTRFS_I(inode)->extent_tree; struct extent_map *em; int ret = 0; @@ -3120,7 +3149,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end, em->len = end + 1 - start; em->block_len = em->len; em->block_start = block_start; - em->bdev = root->fs_info->fs_devices->latest_bdev; + em->bdev = fs_info->fs_devices->latest_bdev; set_bit(EXTENT_FLAG_PINNED, &em->flags); lock_extent(&BTRFS_I(inode)->io_tree, start, end); @@ -3141,6 +3170,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end, static int relocate_file_extent_cluster(struct inode *inode, struct file_extent_cluster *cluster) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 page_start; u64 page_end; u64 offset = BTRFS_I(inode)->index_cnt; @@ -3236,7 +3266,7 @@ static int relocate_file_extent_cluster(struct inode *inode, index++; balance_dirty_pages_ratelimited(inode->i_mapping); - btrfs_throttle(BTRFS_I(inode)->root); + btrfs_throttle(fs_info); } WARN_ON(nr != cluster->nr); out: @@ -3376,7 +3406,7 @@ static int add_tree_block(struct reloc_control *rc, return -ENOMEM; block->bytenr = extent_key->objectid; - block->key.objectid = rc->extent_root->nodesize; + block->key.objectid = rc->extent_root->fs_info->nodesize; block->key.offset = generation; block->level = level; block->key_ready = 0; @@ -3395,11 +3425,11 @@ static int __add_tree_block(struct reloc_control *rc, u64 bytenr, u32 blocksize, struct rb_root *blocks) { + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; struct btrfs_path *path; struct btrfs_key key; int ret; - bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info, - SKINNY_METADATA); + bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA); if (tree_block_processed(bytenr, rc)) return 0; @@ -3465,7 +3495,7 @@ static int block_use_full_backref(struct reloc_control *rc, btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) return 1; - ret = btrfs_lookup_extent_info(NULL, rc->extent_root, + ret = btrfs_lookup_extent_info(NULL, rc->extent_root->fs_info, eb->start, btrfs_header_level(eb), 1, NULL, &flags); BUG_ON(ret); @@ -3502,7 +3532,7 @@ static int delete_block_group_cache(struct btrfs_fs_info *fs_info, } truncate: - ret = btrfs_check_trunc_cache_free_space(root, + ret = btrfs_check_trunc_cache_free_space(fs_info, &fs_info->global_block_rsv); if (ret) goto out; @@ -3515,8 +3545,8 @@ truncate: ret = btrfs_truncate_free_space_cache(root, trans, block_group, inode); - btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root); + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(fs_info); out: iput(inode); return ret; @@ -3532,6 +3562,7 @@ static int find_data_references(struct reloc_control *rc, struct btrfs_extent_data_ref *ref, struct rb_root *blocks) { + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; struct btrfs_path *path; struct tree_block *block; struct btrfs_root *root; @@ -3558,8 +3589,7 @@ static int find_data_references(struct reloc_control *rc, * it and redo the search. 
*/ if (ref_root == BTRFS_ROOT_TREE_OBJECTID) { - ret = delete_block_group_cache(rc->extent_root->fs_info, - rc->block_group, + ret = delete_block_group_cache(fs_info, rc->block_group, NULL, ref_objectid); if (ret != -ENOENT) return ret; @@ -3571,7 +3601,7 @@ static int find_data_references(struct reloc_control *rc, return -ENOMEM; path->reada = READA_FORWARD; - root = read_fs_root(rc->extent_root->fs_info, ref_root); + root = read_fs_root(fs_info, ref_root); if (IS_ERR(root)) { err = PTR_ERR(root); goto out; @@ -3706,7 +3736,7 @@ int add_data_references(struct reloc_control *rc, struct btrfs_extent_inline_ref *iref; unsigned long ptr; unsigned long end; - u32 blocksize = rc->extent_root->nodesize; + u32 blocksize = rc->extent_root->fs_info->nodesize; int ret = 0; int err = 0; @@ -3797,6 +3827,7 @@ static noinline_for_stack int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, struct btrfs_key *extent_key) { + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; struct btrfs_key key; struct extent_buffer *leaf; u64 start, end, last; @@ -3848,7 +3879,7 @@ next: } if (key.type == BTRFS_METADATA_ITEM_KEY && - key.objectid + rc->extent_root->nodesize <= + key.objectid + fs_info->nodesize <= rc->search_start) { path->slots[0]++; goto next; @@ -3866,7 +3897,7 @@ next: rc->search_start = key.objectid + key.offset; else rc->search_start = key.objectid + - rc->extent_root->nodesize; + fs_info->nodesize; memcpy(extent_key, &key, sizeof(key)); return 0; } @@ -3913,7 +3944,7 @@ int prepare_to_relocate(struct reloc_control *rc) struct btrfs_trans_handle *trans; int ret; - rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root, + rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info, BTRFS_BLOCK_RSV_TEMP); if (!rc->block_rsv) return -ENOMEM; @@ -3924,7 +3955,7 @@ int prepare_to_relocate(struct reloc_control *rc) rc->nodes_relocated = 0; rc->merging_rsv_size = 0; rc->reserved_bytes = 0; - rc->block_rsv->size = rc->extent_root->nodesize * + rc->block_rsv->size = rc->extent_root->fs_info->nodesize * RELOCATION_RESERVED_NODES; ret = btrfs_block_rsv_refill(rc->extent_root, rc->block_rsv, rc->block_rsv->size, @@ -3945,96 +3976,13 @@ int prepare_to_relocate(struct reloc_control *rc) */ return PTR_ERR(trans); } - btrfs_commit_transaction(trans, rc->extent_root); + btrfs_commit_transaction(trans); return 0; } -/* - * Qgroup fixer for data chunk relocation. - * The data relocation is done in the following steps - * 1) Copy data extents into data reloc tree - * 2) Create tree reloc tree(special snapshot) for related subvolumes - * 3) Modify file extents in tree reloc tree - * 4) Merge tree reloc tree with original fs tree, by swapping tree blocks - * - * The problem is, data and tree reloc tree are not accounted to qgroup, - * and 4) will only info qgroup to track tree blocks change, not file extents - * in the tree blocks. - * - * The good news is, related data extents are all in data reloc tree, so we - * only need to info qgroup to track all file extents in data reloc tree - * before commit trans. 
- */ -static int qgroup_fix_relocated_data_extents(struct btrfs_trans_handle *trans, - struct reloc_control *rc) -{ - struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - struct inode *inode = rc->data_inode; - struct btrfs_root *data_reloc_root = BTRFS_I(inode)->root; - struct btrfs_path *path; - struct btrfs_key key; - int ret = 0; - - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) - return 0; - - /* - * Only for stage where we update data pointers the qgroup fix is - * valid. - * For MOVING_DATA stage, we will miss the timing of swapping tree - * blocks, and won't fix it. - */ - if (!(rc->stage == UPDATE_DATA_PTRS && rc->extents_found)) - return 0; - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - key.objectid = btrfs_ino(inode); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = 0; - - ret = btrfs_search_slot(NULL, data_reloc_root, &key, path, 0, 0); - if (ret < 0) - goto out; - - lock_extent(&BTRFS_I(inode)->io_tree, 0, (u64)-1); - while (1) { - struct btrfs_file_extent_item *fi; - - btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid > btrfs_ino(inode)) - break; - if (key.type != BTRFS_EXTENT_DATA_KEY) - goto next; - fi = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - if (btrfs_file_extent_type(path->nodes[0], fi) != - BTRFS_FILE_EXTENT_REG) - goto next; - ret = btrfs_qgroup_insert_dirty_extent(trans, fs_info, - btrfs_file_extent_disk_bytenr(path->nodes[0], fi), - btrfs_file_extent_disk_num_bytes(path->nodes[0], fi), - GFP_NOFS); - if (ret < 0) - break; -next: - ret = btrfs_next_item(data_reloc_root, path); - if (ret < 0) - break; - if (ret > 0) { - ret = 0; - break; - } - } - unlock_extent(&BTRFS_I(inode)->io_tree, 0 , (u64)-1); -out: - btrfs_free_path(path); - return ret; -} - static noinline_for_stack int relocate_block_group(struct reloc_control *rc) { + struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; struct rb_root blocks = RB_ROOT; struct btrfs_key key; struct btrfs_trans_handle *trans = NULL; @@ -4075,7 +4023,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) } restart: if (update_backref_cache(trans, &rc->backref_cache)) { - btrfs_end_transaction(trans, rc->extent_root); + btrfs_end_transaction(trans); continue; } @@ -4163,8 +4111,8 @@ restart: } } - btrfs_end_transaction_throttle(trans, rc->extent_root); - btrfs_btree_balance_dirty(rc->extent_root); + btrfs_end_transaction_throttle(trans); + btrfs_btree_balance_dirty(fs_info); trans = NULL; if (rc->stage == MOVE_DATA_EXTENTS && @@ -4179,7 +4127,7 @@ restart: } } if (trans && progress && err == -ENOSPC) { - ret = btrfs_force_chunk_alloc(trans, rc->extent_root, + ret = btrfs_force_chunk_alloc(trans, fs_info, rc->block_group->flags); if (ret == 1) { err = 0; @@ -4192,8 +4140,8 @@ restart: clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); if (trans) { - btrfs_end_transaction_throttle(trans, rc->extent_root); - btrfs_btree_balance_dirty(rc->extent_root); + btrfs_end_transaction_throttle(trans); + btrfs_btree_balance_dirty(fs_info); } if (!err) { @@ -4207,7 +4155,7 @@ restart: set_reloc_control(rc); backref_cache_cleanup(&rc->backref_cache); - btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1); + btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1); err = prepare_to_merge(rc, err); @@ -4215,7 +4163,7 @@ restart: rc->merge_reloc_tree = 0; unset_reloc_control(rc); - btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1); + btrfs_block_rsv_release(fs_info, 
					      rc->block_rsv, (u64)-1);
 
 	/* get rid of pinned extents */
 	trans = btrfs_join_transaction(rc->extent_root);
@@ -4223,16 +4171,9 @@ restart:
 		err = PTR_ERR(trans);
 		goto out_free;
 	}
-	ret = qgroup_fix_relocated_data_extents(trans, rc);
-	if (ret < 0) {
-		btrfs_abort_transaction(trans, ret);
-		if (!err)
-			err = ret;
-		goto out_free;
-	}
-	btrfs_commit_transaction(trans, rc->extent_root);
+	btrfs_commit_transaction(trans);
out_free:
-	btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
+	btrfs_free_block_rsv(fs_info, rc->block_rsv);
 	btrfs_free_path(path);
 	return err;
 }
@@ -4255,7 +4196,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
 	leaf = path->nodes[0];
 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
-	memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
+	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
 	btrfs_set_inode_generation(leaf, item, 1);
 	btrfs_set_inode_size(leaf, item, 0);
 	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
@@ -4300,14 +4241,14 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
 	key.objectid = objectid;
 	key.type = BTRFS_INODE_ITEM_KEY;
 	key.offset = 0;
-	inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
+	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
 	BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
 	BTRFS_I(inode)->index_cnt = group->key.objectid;
 
 	err = btrfs_orphan_add(trans, inode);
out:
-	btrfs_end_transaction(trans, root);
-	btrfs_btree_balance_dirty(root);
+	btrfs_end_transaction(trans);
+	btrfs_btree_balance_dirty(fs_info);
 	if (err) {
 		if (inode)
 			iput(inode);
@@ -4333,11 +4274,50 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
 }
 
 /*
+ * Print the block group being relocated
+ */
+static void describe_relocation(struct btrfs_fs_info *fs_info,
+				struct btrfs_block_group_cache *block_group)
+{
+	char buf[128];		/* prefixed by a '|' that'll be dropped */
+	u64 flags = block_group->flags;
+
+	/* Shouldn't happen */
+	if (!flags) {
+		strcpy(buf, "|NONE");
+	} else {
+		char *bp = buf;
+
+#define DESCRIBE_FLAG(f, d) \
+		if (flags & BTRFS_BLOCK_GROUP_##f) { \
+			bp += snprintf(bp, buf - bp + sizeof(buf), "|%s", d); \
+			flags &= ~BTRFS_BLOCK_GROUP_##f; \
+		}
+		DESCRIBE_FLAG(DATA,     "data");
+		DESCRIBE_FLAG(SYSTEM,   "system");
+		DESCRIBE_FLAG(METADATA, "metadata");
+		DESCRIBE_FLAG(RAID0,    "raid0");
+		DESCRIBE_FLAG(RAID1,    "raid1");
+		DESCRIBE_FLAG(DUP,      "dup");
+		DESCRIBE_FLAG(RAID10,   "raid10");
+		DESCRIBE_FLAG(RAID5,    "raid5");
+		DESCRIBE_FLAG(RAID6,    "raid6");
+		if (flags)
+			snprintf(bp, buf - bp + sizeof(buf), "|0x%llx", flags);
+#undef DESCRIBE_FLAG
+	}
+
+	btrfs_info(fs_info,
+		   "relocating block group %llu flags %s",
+		   block_group->key.objectid, buf + 1);
+}
+
+/*
 * function to relocate all extents in a block group.
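describe_relocation above builds its string with the usual remaining-space idiom: bp advances past what has been written, and buf - bp + sizeof(buf), that is buf + sizeof(buf) - bp, is exactly the space left. A runnable user-space reduction of the same pattern, with illustrative flag bits standing in for the BTRFS_BLOCK_GROUP_* constants:

#include <stdint.h>
#include <stdio.h>

#define F_DATA		(1ULL << 0)
#define F_METADATA	(1ULL << 1)
#define F_RAID1		(1ULL << 4)

static void describe(uint64_t flags, char *out, size_t outlen)
{
	char buf[128] = "|NONE";	/* fallback; leading '|' is dropped below */

	if (flags) {
		char *bp = buf;

#define DESCRIBE_FLAG(f, d)						\
		if (flags & (f)) {					\
			bp += snprintf(bp, buf - bp + sizeof(buf),	\
				       "|%s", d);			\
			flags &= ~(uint64_t)(f);			\
		}
		DESCRIBE_FLAG(F_DATA,     "data");
		DESCRIBE_FLAG(F_METADATA, "metadata");
		DESCRIBE_FLAG(F_RAID1,    "raid1");
		/* Leftover bits are appended at bp, not at buf, so they
		 * extend the string instead of overwriting its start. */
		if (flags)
			snprintf(bp, buf - bp + sizeof(buf), "|0x%llx",
				 (unsigned long long)flags);
#undef DESCRIBE_FLAG
	}
	snprintf(out, outlen, "%s", buf + 1);
}

int main(void)
{
	char s[128];

	describe(F_DATA | F_RAID1 | (1ULL << 40), s, sizeof(s));
	puts(s);	/* data|raid1|0x10000000000 */
	return 0;
}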
*/ -int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) +int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) { - struct btrfs_fs_info *fs_info = extent_root->fs_info; + struct btrfs_root *extent_root = fs_info->extent_root; struct reloc_control *rc; struct inode *inode; struct btrfs_path *path; @@ -4388,9 +4368,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) goto out; } - btrfs_info(extent_root->fs_info, - "relocating block group %llu flags %llu", - rc->block_group->key.objectid, rc->block_group->flags); + describe_relocation(fs_info, rc->block_group); btrfs_wait_block_group_reservations(rc->block_group); btrfs_wait_nocow_writers(rc->block_group); @@ -4410,8 +4388,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) if (rc->extents_found == 0) break; - btrfs_info(extent_root->fs_info, "found %llu extents", - rc->extents_found); + btrfs_info(fs_info, "found %llu extents", rc->extents_found); if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) { ret = btrfs_wait_ordered_range(rc->data_inode, 0, @@ -4431,7 +4408,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); out: if (err && rw) - btrfs_dec_block_group_ro(extent_root, rc->block_group); + btrfs_dec_block_group_ro(rc->block_group); iput(rc->data_inode); btrfs_put_block_group(rc->block_group); kfree(rc); @@ -4440,10 +4417,11 @@ out: static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; int ret, err; - trans = btrfs_start_transaction(root->fs_info->tree_root, 0); + trans = btrfs_start_transaction(fs_info->tree_root, 0); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -4451,10 +4429,10 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) sizeof(root->root_item.drop_progress)); root->root_item.drop_level = 0; btrfs_set_root_refs(&root->root_item, 0); - ret = btrfs_update_root(trans, root->fs_info->tree_root, + ret = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); - err = btrfs_end_transaction(trans, root->fs_info->tree_root); + err = btrfs_end_transaction(trans); if (err) return err; return ret; @@ -4468,6 +4446,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root) */ int btrfs_recover_relocation(struct btrfs_root *root) { + struct btrfs_fs_info *fs_info = root->fs_info; LIST_HEAD(reloc_roots); struct btrfs_key key; struct btrfs_root *fs_root; @@ -4489,7 +4468,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) key.offset = (u64)-1; while (1) { - ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, + ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) { err = ret; @@ -4517,7 +4496,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) list_add(&reloc_root->root_list, &reloc_roots); if (btrfs_root_refs(&reloc_root->root_item) > 0) { - fs_root = read_fs_root(root->fs_info, + fs_root = read_fs_root(fs_info, reloc_root->root_key.offset); if (IS_ERR(fs_root)) { ret = PTR_ERR(fs_root); @@ -4543,13 +4522,13 @@ int btrfs_recover_relocation(struct btrfs_root *root) if (list_empty(&reloc_roots)) goto out; - rc = alloc_reloc_control(root->fs_info); + rc = alloc_reloc_control(fs_info); if (!rc) { err = -ENOMEM; goto out; } - rc->extent_root = root->fs_info->extent_root; + rc->extent_root = 
fs_info->extent_root; set_reloc_control(rc); @@ -4573,8 +4552,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) continue; } - fs_root = read_fs_root(root->fs_info, - reloc_root->root_key.offset); + fs_root = read_fs_root(fs_info, reloc_root->root_key.offset); if (IS_ERR(fs_root)) { err = PTR_ERR(fs_root); goto out_free; @@ -4585,7 +4563,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) fs_root->reloc_root = reloc_root; } - err = btrfs_commit_transaction(trans, rc->extent_root); + err = btrfs_commit_transaction(trans); if (err) goto out_free; @@ -4598,12 +4576,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) err = PTR_ERR(trans); goto out_free; } - err = qgroup_fix_relocated_data_extents(trans, rc); - if (err < 0) { - btrfs_abort_transaction(trans, err); - goto out_free; - } - err = btrfs_commit_transaction(trans, rc->extent_root); + err = btrfs_commit_transaction(trans); out_free: kfree(rc); out: @@ -4614,8 +4587,7 @@ out: if (err == 0) { /* cleanup orphan inode in data relocation tree */ - fs_root = read_fs_root(root->fs_info, - BTRFS_DATA_RELOC_TREE_OBJECTID); + fs_root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID); if (IS_ERR(fs_root)) err = PTR_ERR(fs_root); else @@ -4632,9 +4604,9 @@ out: */ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_sum *sums; struct btrfs_ordered_extent *ordered; - struct btrfs_root *root = BTRFS_I(inode)->root; int ret; u64 disk_bytenr; u64 new_bytenr; @@ -4644,7 +4616,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) BUG_ON(ordered->file_offset != file_pos || ordered->len != len); disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; - ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, + ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr, disk_bytenr + len - 1, &list, 0); if (ret) goto out; @@ -4679,13 +4651,14 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *cow) { + struct btrfs_fs_info *fs_info = root->fs_info; struct reloc_control *rc; struct backref_node *node; int first_cow = 0; int level; int ret = 0; - rc = root->fs_info->reloc_ctl; + rc = fs_info->reloc_ctl; if (!rc) return 0; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index edae751e870c..4c6735491ee0 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -132,6 +132,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_root_item *item) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct extent_buffer *l; int ret; @@ -150,9 +151,8 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root } if (ret != 0) { - btrfs_print_leaf(root, path->nodes[0]); - btrfs_crit(root->fs_info, - "unable to update root key %llu %u %llu", + btrfs_print_leaf(fs_info, path->nodes[0]); + btrfs_crit(fs_info, "unable to update root key %llu %u %llu", key->objectid, key->type, key->offset); BUG_ON(1); } @@ -216,8 +216,9 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, return btrfs_insert_item(trans, root, key, item, sizeof(*item)); } -int btrfs_find_orphan_roots(struct btrfs_root *tree_root) +int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) { + struct btrfs_root *tree_root = fs_info->tree_root; struct extent_buffer *leaf; struct btrfs_path *path; struct btrfs_key 
key; @@ -227,7 +228,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) int ret; bool can_recover = true; - if (tree_root->fs_info->sb->s_flags & MS_RDONLY) + if (fs_info->sb->s_flags & MS_RDONLY) can_recover = false; path = btrfs_alloc_path(); @@ -275,8 +276,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) * in turn reads and inserts fs roots while doing backref * walking. */ - root = btrfs_lookup_fs_root(tree_root->fs_info, - root_key.objectid); + root = btrfs_lookup_fs_root(fs_info, root_key.objectid); if (root) { WARN_ON(!test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)); @@ -297,15 +297,15 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) trans = btrfs_join_transaction(tree_root); if (IS_ERR(trans)) { err = PTR_ERR(trans); - btrfs_handle_fs_error(tree_root->fs_info, err, + btrfs_handle_fs_error(fs_info, err, "Failed to start trans to delete orphan item"); break; } err = btrfs_del_orphan_item(trans, tree_root, root_key.objectid); - btrfs_end_transaction(trans, tree_root); + btrfs_end_transaction(trans); if (err) { - btrfs_handle_fs_error(tree_root->fs_info, err, + btrfs_handle_fs_error(fs_info, err, "Failed to delete root orphan item"); break; } @@ -320,7 +320,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root) set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state); - err = btrfs_insert_fs_root(root->fs_info, root); + err = btrfs_insert_fs_root(fs_info, root); if (err) { BUG_ON(err == -EEXIST); btrfs_free_fs_root(root); @@ -358,11 +358,12 @@ out: } int btrfs_del_root_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *tree_root, + struct btrfs_fs_info *fs_info, u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, const char *name, int name_len) { + struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_path *path; struct btrfs_root_ref *ref; struct extent_buffer *leaf; @@ -429,10 +430,11 @@ out: * Will return 0, -ENOMEM, or anything from the CoW path */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *tree_root, + struct btrfs_fs_info *fs_info, u64 root_id, u64 ref_id, u64 dirid, u64 sequence, const char *name, int name_len) { + struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_key key; int ret; struct btrfs_path *path; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index fffb9ab8526e..9a94670536a6 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -171,7 +171,7 @@ struct scrub_wr_ctx { struct scrub_ctx { struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX]; - struct btrfs_root *dev_root; + struct btrfs_fs_info *fs_info; int first_free; int curr; atomic_t bios_in_flight; @@ -356,7 +356,7 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) */ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) { - struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + struct btrfs_fs_info *fs_info = sctx->fs_info; atomic_inc(&sctx->refs); /* @@ -388,7 +388,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) /* used for workers that require transaction commits */ static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx) { - struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + struct btrfs_fs_info *fs_info = sctx->fs_info; /* * see scrub_pending_trans_workers_inc() why we're pretending @@ -458,7 +458,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) { struct scrub_ctx *sctx; int i; - struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; + struct btrfs_fs_info *fs_info = dev->fs_info; int 
ret; sctx = kzalloc(sizeof(*sctx), GFP_KERNEL); @@ -468,7 +468,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sctx->is_dev_replace = is_dev_replace; sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; sctx->curr = -1; - sctx->dev_root = dev->dev_root; + sctx->fs_info = dev->fs_info; for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) { struct scrub_bio *sbio; @@ -489,8 +489,8 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sctx->bios[i]->next_free = -1; } sctx->first_free = 0; - sctx->nodesize = dev->dev_root->nodesize; - sctx->sectorsize = dev->dev_root->sectorsize; + sctx->nodesize = fs_info->nodesize; + sctx->sectorsize = fs_info->sectorsize; atomic_set(&sctx->bios_in_flight, 0); atomic_set(&sctx->workers_pending, 0); atomic_set(&sctx->cancel_req, 0); @@ -524,7 +524,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, struct extent_buffer *eb; struct btrfs_inode_item *inode_item; struct scrub_warning *swarn = warn_ctx; - struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; + struct btrfs_fs_info *fs_info = swarn->dev->fs_info; struct inode_fs_paths *ipath = NULL; struct btrfs_root *local_root; struct btrfs_key root_key; @@ -618,7 +618,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) WARN_ON(sblock->page_count < 1); dev = sblock->pagev[0]->dev; - fs_info = sblock->sctx->dev_root->fs_info; + fs_info = sblock->sctx->fs_info; path = btrfs_alloc_path(); if (!path) @@ -789,6 +789,7 @@ out: static void scrub_fixup_nodatasum(struct btrfs_work *work) { + struct btrfs_fs_info *fs_info; int ret; struct scrub_fixup_nodatasum *fixup; struct scrub_ctx *sctx; @@ -798,6 +799,7 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work) fixup = container_of(work, struct scrub_fixup_nodatasum, work); sctx = fixup->sctx; + fs_info = fixup->root->fs_info; path = btrfs_alloc_path(); if (!path) { @@ -823,9 +825,8 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work) * (once it's finished) and rewrite the failed sector if a good copy * can be found. */ - ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info, - path, scrub_fixup_readpage, - fixup); + ret = iterate_inodes_from_logical(fixup->logical, fs_info, path, + scrub_fixup_readpage, fixup); if (ret < 0) { uncorrectable = 1; goto out; @@ -838,15 +839,14 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work) out: if (trans && !IS_ERR(trans)) - btrfs_end_transaction(trans, fixup->root); + btrfs_end_transaction(trans); if (uncorrectable) { spin_lock(&sctx->stat_lock); ++sctx->stat.uncorrectable_errors; spin_unlock(&sctx->stat_lock); btrfs_dev_replace_stats_inc( - &sctx->dev_root->fs_info->dev_replace. - num_uncorrectable_read_errors); - btrfs_err_rl_in_rcu(sctx->dev_root->fs_info, + &fs_info->dev_replace.num_uncorrectable_read_errors); + btrfs_err_rl_in_rcu(fs_info, "unable to fixup (nodatasum) error at logical %llu on dev %s", fixup->logical, rcu_str_deref(fixup->dev->name)); } @@ -898,7 +898,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) DEFAULT_RATELIMIT_BURST); BUG_ON(sblock_to_check->page_count < 1); - fs_info = sctx->dev_root->fs_info; + fs_info = sctx->fs_info; if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { /* * if we find an error in a super block, we just report it. @@ -1177,9 +1177,7 @@ nodatasum_case: if (scrub_write_page_to_dev_replace(sblock_other, page_num) != 0) { btrfs_dev_replace_stats_inc( - &sctx->dev_root-> - fs_info->dev_replace. 
- num_write_errors); + &fs_info->dev_replace.num_write_errors); success = 0; } } else if (sblock_other) { @@ -1302,7 +1300,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, struct scrub_block *sblocks_for_recheck) { struct scrub_ctx *sctx = original_sblock->sctx; - struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + struct btrfs_fs_info *fs_info = sctx->fs_info; u64 length = original_sblock->page_count * PAGE_SIZE; u64 logical = original_sblock->pagev[0]->logical; u64 generation = original_sblock->pagev[0]->generation; @@ -1334,8 +1332,8 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, * with a length of PAGE_SIZE, each returned stripe * represents one mirror */ - ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical, - &mapped_length, &bbio, 0, 1); + ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, + logical, &mapped_length, &bbio, 0, 1); if (ret || !bbio || mapped_length < sublen) { btrfs_put_bbio(bbio); return -EIO; @@ -1452,7 +1450,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, bio->bi_private = &done; bio->bi_end_io = scrub_bio_wait_endio; - ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio, + ret = raid56_parity_recover(fs_info, bio, page->recover->bbio, page->recover->map_length, page->mirror_num, 0); if (ret) @@ -1565,6 +1563,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, { struct scrub_page *page_bad = sblock_bad->pagev[page_num]; struct scrub_page *page_good = sblock_good->pagev[page_num]; + struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info; BUG_ON(page_bad->page == NULL); BUG_ON(page_good->page == NULL); @@ -1574,7 +1573,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, int ret; if (!page_bad->dev->bdev) { - btrfs_warn_rl(sblock_bad->sctx->dev_root->fs_info, + btrfs_warn_rl(fs_info, "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected"); return -EIO; } @@ -1596,8 +1595,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, btrfs_dev_stat_inc_and_print(page_bad->dev, BTRFS_DEV_STAT_WRITE_ERRS); btrfs_dev_replace_stats_inc( - &sblock_bad->sctx->dev_root->fs_info-> - dev_replace.num_write_errors); + &fs_info->dev_replace.num_write_errors); bio_put(bio); return -EIO; } @@ -1609,6 +1607,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) { + struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; int page_num; /* @@ -1624,8 +1623,7 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) ret = scrub_write_page_to_dev_replace(sblock, page_num); if (ret) btrfs_dev_replace_stats_inc( - &sblock->sctx->dev_root->fs_info->dev_replace. 
- num_write_errors); + &fs_info->dev_replace.num_write_errors); } } @@ -1740,7 +1738,7 @@ static void scrub_wr_submit(struct scrub_ctx *sctx) static void scrub_wr_bio_end_io(struct bio *bio) { struct scrub_bio *sbio = bio->bi_private; - struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; + struct btrfs_fs_info *fs_info = sbio->dev->fs_info; sbio->err = bio->bi_error; sbio->bio = bio; @@ -1759,7 +1757,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO); if (sbio->err) { struct btrfs_dev_replace *dev_replace = - &sbio->sctx->dev_root->fs_info->dev_replace; + &sbio->sctx->fs_info->dev_replace; for (i = 0; i < sbio->page_count; i++) { struct scrub_page *spage = sbio->pagev[i]; @@ -1859,8 +1857,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) { struct scrub_ctx *sctx = sblock->sctx; struct btrfs_header *h; - struct btrfs_root *root = sctx->dev_root; - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info = sctx->fs_info; u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; struct page *page; @@ -2126,7 +2123,7 @@ again: static void scrub_missing_raid56_end_io(struct bio *bio) { struct scrub_block *sblock = bio->bi_private; - struct btrfs_fs_info *fs_info = sblock->sctx->dev_root->fs_info; + struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; if (bio->bi_error) sblock->no_io_error_seen = 0; @@ -2140,6 +2137,7 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work) { struct scrub_block *sblock = container_of(work, struct scrub_block, work); struct scrub_ctx *sctx = sblock->sctx; + struct btrfs_fs_info *fs_info = sctx->fs_info; u64 logical; struct btrfs_device *dev; @@ -2153,14 +2151,14 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work) spin_lock(&sctx->stat_lock); sctx->stat.read_errors++; spin_unlock(&sctx->stat_lock); - btrfs_err_rl_in_rcu(sctx->dev_root->fs_info, + btrfs_err_rl_in_rcu(fs_info, "IO error rebuilding logical %llu for dev %s", logical, rcu_str_deref(dev->name)); } else if (sblock->header_error || sblock->checksum_error) { spin_lock(&sctx->stat_lock); sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); - btrfs_err_rl_in_rcu(sctx->dev_root->fs_info, + btrfs_err_rl_in_rcu(fs_info, "failed to rebuild valid logical %llu for dev %s", logical, rcu_str_deref(dev->name)); } else { @@ -2182,7 +2180,7 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work) static void scrub_missing_raid56_pages(struct scrub_block *sblock) { struct scrub_ctx *sctx = sblock->sctx; - struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + struct btrfs_fs_info *fs_info = sctx->fs_info; u64 length = sblock->page_count * PAGE_SIZE; u64 logical = sblock->pagev[0]->logical; struct btrfs_bio *bbio = NULL; @@ -2191,8 +2189,8 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) int ret; int i; - ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical, &length, - &bbio, 0, 1); + ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, + &length, &bbio, 0, 1); if (ret || !bbio || !bbio->raid_map) goto bbio_out; @@ -2215,7 +2213,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) bio->bi_private = sblock; bio->bi_end_io = scrub_missing_raid56_end_io; - rbio = raid56_alloc_missing_rbio(sctx->dev_root, bio, bbio, length); + rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length); if (!rbio) goto rbio_out; @@ -2334,7 +2332,7 @@ leave_nomem: static void 
scrub_bio_end_io(struct bio *bio) { struct scrub_bio *sbio = bio->bi_private; - struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info; + struct btrfs_fs_info *fs_info = sbio->dev->fs_info; sbio->err = bio->bi_error; sbio->bio = bio; @@ -2391,7 +2389,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, { u32 offset; int nsectors; - int sectorsize = sparity->sctx->dev_root->sectorsize; + int sectorsize = sparity->sctx->fs_info->sectorsize; if (len >= sparity->stripe_len) { bitmap_set(bitmap, 0, sparity->nsectors); @@ -2750,6 +2748,7 @@ static void scrub_parity_bio_endio_worker(struct btrfs_work *work) static void scrub_parity_bio_endio(struct bio *bio) { struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; + struct btrfs_fs_info *fs_info = sparity->sctx->fs_info; if (bio->bi_error) bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, @@ -2759,13 +2758,13 @@ static void scrub_parity_bio_endio(struct bio *bio) btrfs_init_work(&sparity->work, btrfs_scrubparity_helper, scrub_parity_bio_endio_worker, NULL, NULL); - btrfs_queue_work(sparity->sctx->dev_root->fs_info->scrub_parity_workers, - &sparity->work); + btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work); } static void scrub_parity_check_and_repair(struct scrub_parity *sparity) { struct scrub_ctx *sctx = sparity->sctx; + struct btrfs_fs_info *fs_info = sctx->fs_info; struct bio *bio; struct btrfs_raid_bio *rbio; struct scrub_page *spage; @@ -2778,8 +2777,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) goto out; length = sparity->logic_end - sparity->logic_start; - ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE, - sparity->logic_start, + ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start, &length, &bbio, 0, 1); if (ret || !bbio || !bbio->raid_map) goto bbio_out; @@ -2792,7 +2790,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) bio->bi_private = sparity; bio->bi_end_io = scrub_parity_bio_endio; - rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio, + rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio, length, sparity->scrub_dev, sparity->dbitmap, sparity->nsectors); @@ -2844,7 +2842,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, u64 logic_start, u64 logic_end) { - struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_root *root = fs_info->extent_root; struct btrfs_root *csum_root = fs_info->csum_root; struct btrfs_extent_item *extent; @@ -2866,7 +2864,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, int extent_mirror_num; int stop_loop = 0; - nsectors = div_u64(map->stripe_len, root->sectorsize); + nsectors = div_u64(map->stripe_len, fs_info->sectorsize); bitmap_len = scrub_calc_parity_bitmap_len(nsectors); sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len, GFP_NOFS); @@ -2937,7 +2935,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, goto next; if (key.type == BTRFS_METADATA_ITEM_KEY) - bytes = root->nodesize; + bytes = fs_info->nodesize; else bytes = key.offset; @@ -2988,8 +2986,9 @@ again: mapped_length = extent_len; bbio = NULL; - ret = btrfs_map_block(fs_info, READ, extent_logical, - &mapped_length, &bbio, 0); + ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, + extent_logical, &mapped_length, &bbio, + 0); if (!ret) { if (!bbio || mapped_length < extent_len) ret = -EIO; @@ -3068,7 +3067,7 @@ static 
noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, int is_dev_replace) { struct btrfs_path *path, *ppath; - struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_root *root = fs_info->extent_root; struct btrfs_root *csum_root = fs_info->csum_root; struct btrfs_extent_item *extent; @@ -3289,7 +3288,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, goto next; if (key.type == BTRFS_METADATA_ITEM_KEY) - bytes = root->nodesize; + bytes = fs_info->nodesize; else bytes = key.offset; @@ -3442,8 +3441,8 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, struct btrfs_block_group_cache *cache, int is_dev_replace) { - struct btrfs_mapping_tree *map_tree = - &sctx->dev_root->fs_info->mapping_tree; + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct map_lookup *map; struct extent_map *em; int i; @@ -3496,8 +3495,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, { struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_path *path; - struct btrfs_root *root = sctx->dev_root; - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct btrfs_root *root = fs_info->dev_root; u64 length; u64 chunk_offset; int ret = 0; @@ -3617,8 +3616,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (IS_ERR(trans)) ret = PTR_ERR(trans); else - ret = btrfs_commit_transaction(trans, - root); + ret = btrfs_commit_transaction(trans); if (ret) { scrub_pause_off(fs_info); btrfs_put_block_group(cache); @@ -3693,7 +3691,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, btrfs_dev_replace_unlock(&fs_info->dev_replace, 1); if (ro_set) - btrfs_dec_block_group_ro(root, cache); + btrfs_dec_block_group_ro(cache); /* * We might have prevented the cleaner kthread from deleting @@ -3746,16 +3744,16 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, u64 bytenr; u64 gen; int ret; - struct btrfs_root *root = sctx->dev_root; + struct btrfs_fs_info *fs_info = sctx->fs_info; - if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EIO; /* Seed devices of a new filesystem has their own generation. */ - if (scrub_dev->fs_devices != root->fs_info->fs_devices) + if (scrub_dev->fs_devices != fs_info->fs_devices) gen = scrub_dev->generation; else - gen = root->fs_info->last_trans_committed; + gen = fs_info->last_trans_committed; for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); @@ -3847,7 +3845,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (btrfs_fs_closing(fs_info)) return -EINVAL; - if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) { + if (fs_info->nodesize > BTRFS_STRIPE_LEN) { /* * in this case scrub is unable to calculate the checksum * the way scrub is implemented. 
Do not handle this @@ -3855,31 +3853,31 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, */ btrfs_err(fs_info, "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails", - fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN); + fs_info->nodesize, + BTRFS_STRIPE_LEN); return -EINVAL; } - if (fs_info->chunk_root->sectorsize != PAGE_SIZE) { + if (fs_info->sectorsize != PAGE_SIZE) { /* not supported for data w/o checksums */ btrfs_err_rl(fs_info, "scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails", - fs_info->chunk_root->sectorsize, PAGE_SIZE); + fs_info->sectorsize, PAGE_SIZE); return -EINVAL; } - if (fs_info->chunk_root->nodesize > + if (fs_info->nodesize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK || - fs_info->chunk_root->sectorsize > - PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) { + fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) { /* * would exhaust the array bounds of pagev member in * struct scrub_block */ btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails", - fs_info->chunk_root->nodesize, + fs_info->nodesize, SCRUB_MAX_PAGES_PER_BLOCK, - fs_info->chunk_root->sectorsize, + fs_info->sectorsize, SCRUB_MAX_PAGES_PER_BLOCK); return -EINVAL; } @@ -3979,10 +3977,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, return ret; } -void btrfs_scrub_pause(struct btrfs_root *root) +void btrfs_scrub_pause(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = root->fs_info; - mutex_lock(&fs_info->scrub_lock); atomic_inc(&fs_info->scrub_pause_req); while (atomic_read(&fs_info->scrubs_paused) != @@ -3996,10 +3992,8 @@ void btrfs_scrub_pause(struct btrfs_root *root) mutex_unlock(&fs_info->scrub_lock); } -void btrfs_scrub_continue(struct btrfs_root *root) +void btrfs_scrub_continue(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = root->fs_info; - atomic_dec(&fs_info->scrub_pause_req); wake_up(&fs_info->scrub_pause_wait); } @@ -4048,19 +4042,19 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info, return 0; } -int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, +int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, struct btrfs_scrub_progress *progress) { struct btrfs_device *dev; struct scrub_ctx *sctx = NULL; - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(root->fs_info, devid, NULL, NULL); + mutex_lock(&fs_info->fs_devices->device_list_mutex); + dev = btrfs_find_device(fs_info, devid, NULL, NULL); if (dev) sctx = dev->scrub_device; if (sctx) memcpy(progress, &sctx->stat, sizeof(*progress)); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); return dev ? (sctx ? 
0 : -ENOTCONN) : -ENODEV; } @@ -4076,7 +4070,7 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info, int ret; mapped_length = extent_len; - ret = btrfs_map_block(fs_info, READ, extent_logical, + ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical, &mapped_length, &bbio, 0); if (ret || !bbio || mapped_length < extent_len || !bbio->stripes[0].dev->bdev) { @@ -4122,7 +4116,7 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len, int mirror_num, u64 physical_for_dev_replace) { struct scrub_copy_nocow_ctx *nocow_ctx; - struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; + struct btrfs_fs_info *fs_info = sctx->fs_info; nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS); if (!nocow_ctx) { @@ -4170,20 +4164,17 @@ static void copy_nocow_pages_worker(struct btrfs_work *work) struct scrub_copy_nocow_ctx *nocow_ctx = container_of(work, struct scrub_copy_nocow_ctx, work); struct scrub_ctx *sctx = nocow_ctx->sctx; + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct btrfs_root *root = fs_info->extent_root; u64 logical = nocow_ctx->logical; u64 len = nocow_ctx->len; int mirror_num = nocow_ctx->mirror_num; u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; int ret; struct btrfs_trans_handle *trans = NULL; - struct btrfs_fs_info *fs_info; struct btrfs_path *path; - struct btrfs_root *root; int not_written = 0; - fs_info = sctx->dev_root->fs_info; - root = fs_info->extent_root; - path = btrfs_alloc_path(); if (!path) { spin_lock(&sctx->stat_lock); @@ -4210,7 +4201,7 @@ static void copy_nocow_pages_worker(struct btrfs_work *work) goto out; } - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); trans = NULL; while (!list_empty(&nocow_ctx->inodes)) { struct scrub_nocow_inode *entry; @@ -4238,7 +4229,7 @@ out: kfree(entry); } if (trans && !IS_ERR(trans)) - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); if (not_written) btrfs_dev_replace_stats_inc(&fs_info->dev_replace. 
num_uncorrectable_read_errors); @@ -4296,7 +4287,7 @@ out_unlock: static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, struct scrub_copy_nocow_ctx *nocow_ctx) { - struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; + struct btrfs_fs_info *fs_info = nocow_ctx->sctx->fs_info; struct btrfs_key key; struct inode *inode; struct page *page; @@ -4426,7 +4417,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, if (!dev) return -EIO; if (!dev->bdev) { - btrfs_warn_rl(dev->dev_root->fs_info, + btrfs_warn_rl(dev->fs_info, "scrub write_page_nocow(bdev == NULL) is unexpected"); return -EIO; } @@ -4440,7 +4431,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; bio->bi_bdev = dev->bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC); + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; ret = bio_add_page(bio, page, PAGE_SIZE, 0); if (ret != PAGE_SIZE) { leave_with_eio: diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 71261b459863..d145ce804620 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1054,7 +1054,8 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, ret = -ENAMETOOLONG; goto out; } - if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)) { + if (name_len + data_len > + BTRFS_MAX_XATTR_SIZE(root->fs_info)) { ret = -E2BIG; goto out; } @@ -1430,9 +1431,9 @@ static int find_extent_clone(struct send_ctx *sctx, extent_item_pos = logical - found_key.objectid; else extent_item_pos = 0; - ret = iterate_extent_inodes(fs_info, - found_key.objectid, extent_item_pos, 1, - __iterate_backrefs, backref_ctx); + ret = iterate_extent_inodes(fs_info, found_key.objectid, + extent_item_pos, 1, __iterate_backrefs, + backref_ctx); if (ret < 0) goto out; @@ -3434,6 +3435,7 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, struct recorded_ref *parent_ref, const bool is_orphan) { + struct btrfs_fs_info *fs_info = sctx->parent_root->fs_info; struct btrfs_path *path; struct btrfs_key key; struct btrfs_key di_key; @@ -3462,8 +3464,8 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, goto out; } - di = btrfs_match_dir_item_name(sctx->parent_root, path, - parent_ref->name, parent_ref->name_len); + di = btrfs_match_dir_item_name(fs_info, path, parent_ref->name, + parent_ref->name_len); if (!di) { ret = 0; goto out; @@ -5264,7 +5266,7 @@ static int get_last_extent(struct send_ctx *sctx, u64 offset) u64 size = btrfs_file_extent_inline_len(path->nodes[0], path->slots[0], fi); extent_end = ALIGN(key.offset + size, - sctx->send_root->sectorsize); + sctx->send_root->fs_info->sectorsize); } else { extent_end = key.offset + btrfs_file_extent_num_bytes(path->nodes[0], fi); @@ -5299,7 +5301,7 @@ static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path, u64 size = btrfs_file_extent_inline_len(path->nodes[0], path->slots[0], fi); extent_end = ALIGN(key->offset + size, - sctx->send_root->sectorsize); + sctx->send_root->fs_info->sectorsize); } else { extent_end = key->offset + btrfs_file_extent_num_bytes(path->nodes[0], fi); @@ -6110,7 +6112,7 @@ again: goto commit_trans; if (trans) - return btrfs_end_transaction(trans, sctx->send_root); + return btrfs_end_transaction(trans); return 0; @@ -6123,7 +6125,7 @@ commit_trans: goto again; } - return btrfs_commit_transaction(trans, sctx->send_root); + return btrfs_commit_transaction(trans); } static void btrfs_root_dec_send_in_progress(struct btrfs_root* root) @@ -6136,17 +6138,17 @@ static void 
btrfs_root_dec_send_in_progress(struct btrfs_root* root) */ if (root->send_in_progress < 0) btrfs_err(root->fs_info, - "send_in_progres unbalanced %d root %llu", - root->send_in_progress, root->root_key.objectid); + "send_in_progres unbalanced %d root %llu", + root->send_in_progress, root->root_key.objectid); spin_unlock(&root->root_item_lock); } long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) { int ret = 0; - struct btrfs_root *send_root; + struct btrfs_root *send_root = BTRFS_I(file_inode(mnt_file))->root; + struct btrfs_fs_info *fs_info = send_root->fs_info; struct btrfs_root *clone_root; - struct btrfs_fs_info *fs_info; struct btrfs_ioctl_send_args *arg = NULL; struct btrfs_key key; struct send_ctx *sctx = NULL; @@ -6160,9 +6162,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - send_root = BTRFS_I(file_inode(mnt_file))->root; - fs_info = send_root->fs_info; - /* * The subvolume must remain read-only during send, protect against * making it RW. This also protects against deletion. diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 74ed5aae6cea..b5ae7d3d1896 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -202,27 +202,25 @@ static struct ratelimit_state printk_limits[] = { void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { struct super_block *sb = fs_info->sb; - char lvl[4]; + char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; struct va_format vaf; va_list args; - const char *type = logtypes[4]; int kern_level; - struct ratelimit_state *ratelimit; + const char *type = logtypes[4]; + struct ratelimit_state *ratelimit = &printk_limits[4]; va_start(args, fmt); - kern_level = printk_get_level(fmt); - if (kern_level) { + while ((kern_level = printk_get_level(fmt)) != 0) { size_t size = printk_skip_level(fmt) - fmt; - memcpy(lvl, fmt, size); - lvl[size] = '\0'; + + if (kern_level >= '0' && kern_level <= '7') { + memcpy(lvl, fmt, size); + lvl[size] = '\0'; + type = logtypes[kern_level - '0']; + ratelimit = &printk_limits[kern_level - '0']; + } fmt += size; - type = logtypes[kern_level - '0']; - ratelimit = &printk_limits[kern_level - '0']; - } else { - *lvl = '\0'; - /* Default to debug output */ - ratelimit = &printk_limits[7]; } vaf.fmt = fmt; @@ -305,7 +303,7 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, static void btrfs_put_super(struct super_block *sb) { - close_ctree(btrfs_sb(sb)->tree_root); + close_ctree(btrfs_sb(sb)); } enum { @@ -396,10 +394,9 @@ static const match_table_t tokens = { * reading in a new superblock is parsed here. * XXX JDM: This needs to be cleaned up for remount. 
*/ -int btrfs_parse_options(struct btrfs_root *root, char *options, +int btrfs_parse_options(struct btrfs_fs_info *info, char *options, unsigned long new_flags) { - struct btrfs_fs_info *info = root->fs_info; substring_t args[MAX_OPT_ARGS]; char *p, *num, *orig = NULL; u64 cache_gen; @@ -411,8 +408,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, bool saved_compress_force; int no_compress = 0; - cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); - if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE)) + cache_gen = btrfs_super_cache_generation(info->super_copy); + if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE)) btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE); else if (cache_gen) btrfs_set_opt(info->mount_opt, SPACE_CACHE); @@ -442,7 +439,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, token = match_token(p, tokens, args); switch (token) { case Opt_degraded: - btrfs_info(root->fs_info, "allowing degraded mounts"); + btrfs_info(info, "allowing degraded mounts"); btrfs_set_opt(info->mount_opt, DEGRADED); break; case Opt_subvol: @@ -461,11 +458,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, case Opt_datasum: if (btrfs_test_opt(info, NODATASUM)) { if (btrfs_test_opt(info, NODATACOW)) - btrfs_info(root->fs_info, + btrfs_info(info, "setting datasum, datacow enabled"); else - btrfs_info(root->fs_info, - "setting datasum"); + btrfs_info(info, "setting datasum"); } btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); @@ -474,11 +470,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, if (!btrfs_test_opt(info, NODATACOW)) { if (!btrfs_test_opt(info, COMPRESS) || !btrfs_test_opt(info, FORCE_COMPRESS)) { - btrfs_info(root->fs_info, + btrfs_info(info, "setting nodatacow, compression disabled"); } else { - btrfs_info(root->fs_info, - "setting nodatacow"); + btrfs_info(info, "setting nodatacow"); } } btrfs_clear_opt(info->mount_opt, COMPRESS); @@ -545,8 +540,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, compress_force != saved_compress_force)) || (!btrfs_test_opt(info, COMPRESS) && no_compress == 1)) { - btrfs_info(root->fs_info, - "%s %s compression", + btrfs_info(info, "%s %s compression", (compress_force) ? 
"force" : "use", compress_type); } @@ -594,10 +588,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, if (info->max_inline) { info->max_inline = min_t(u64, info->max_inline, - root->sectorsize); + info->sectorsize); } - btrfs_info(root->fs_info, "max_inline at %llu", - info->max_inline); + btrfs_info(info, "max_inline at %llu", + info->max_inline); } else { ret = -ENOMEM; goto out; @@ -610,8 +604,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, info->alloc_start = memparse(num, NULL); mutex_unlock(&info->chunk_mutex); kfree(num); - btrfs_info(root->fs_info, - "allocations start at %llu", + btrfs_info(info, "allocations start at %llu", info->alloc_start); } else { ret = -ENOMEM; @@ -620,16 +613,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, break; case Opt_acl: #ifdef CONFIG_BTRFS_FS_POSIX_ACL - root->fs_info->sb->s_flags |= MS_POSIXACL; + info->sb->s_flags |= MS_POSIXACL; break; #else - btrfs_err(root->fs_info, - "support for ACL not compiled in!"); + btrfs_err(info, "support for ACL not compiled in!"); ret = -EINVAL; goto out; #endif case Opt_noacl: - root->fs_info->sb->s_flags &= ~MS_POSIXACL; + info->sb->s_flags &= ~MS_POSIXACL; break; case Opt_notreelog: btrfs_set_and_info(info, NOTREELOG, @@ -658,8 +650,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, goto out; } else if (intarg >= 0) { info->metadata_ratio = intarg; - btrfs_info(root->fs_info, "metadata ratio %d", - info->metadata_ratio); + btrfs_info(info, "metadata ratio %d", + info->metadata_ratio); } else { ret = -EINVAL; goto out; @@ -677,15 +669,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, case Opt_space_cache_version: if (token == Opt_space_cache || strcmp(args[0].from, "v1") == 0) { - btrfs_clear_opt(root->fs_info->mount_opt, + btrfs_clear_opt(info->mount_opt, FREE_SPACE_TREE); btrfs_set_and_info(info, SPACE_CACHE, - "enabling disk space caching"); + "enabling disk space caching"); } else if (strcmp(args[0].from, "v2") == 0) { - btrfs_clear_opt(root->fs_info->mount_opt, + btrfs_clear_opt(info->mount_opt, SPACE_CACHE); - btrfs_set_and_info(info, - FREE_SPACE_TREE, + btrfs_set_and_info(info, FREE_SPACE_TREE, "enabling free space tree"); } else { ret = -EINVAL; @@ -697,14 +688,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, break; case Opt_no_space_cache: if (btrfs_test_opt(info, SPACE_CACHE)) { - btrfs_clear_and_info(info, - SPACE_CACHE, - "disabling disk space caching"); + btrfs_clear_and_info(info, SPACE_CACHE, + "disabling disk space caching"); } if (btrfs_test_opt(info, FREE_SPACE_TREE)) { - btrfs_clear_and_info(info, - FREE_SPACE_TREE, - "disabling free space tree"); + btrfs_clear_and_info(info, FREE_SPACE_TREE, + "disabling free space tree"); } break; case Opt_inode_cache: @@ -737,10 +726,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, "disabling auto defrag"); break; case Opt_recovery: - btrfs_warn(root->fs_info, + btrfs_warn(info, "'recovery' is deprecated, use 'usebackuproot' instead"); case Opt_usebackuproot: - btrfs_info(root->fs_info, + btrfs_info(info, "trying to use backup root at mount time"); btrfs_set_opt(info->mount_opt, USEBACKUPROOT); break; @@ -749,14 +738,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, break; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY case Opt_check_integrity_including_extent_data: - btrfs_info(root->fs_info, + btrfs_info(info, "enabling check integrity including extent data"); btrfs_set_opt(info->mount_opt, 
CHECK_INTEGRITY_INCLUDING_EXTENT_DATA); btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); break; case Opt_check_integrity: - btrfs_info(root->fs_info, "enabling check integrity"); + btrfs_info(info, "enabling check integrity"); btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); break; case Opt_check_integrity_print_mask: @@ -765,7 +754,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, goto out; } else if (intarg >= 0) { info->check_integrity_print_mask = intarg; - btrfs_info(root->fs_info, + btrfs_info(info, "check_integrity_print_mask 0x%x", info->check_integrity_print_mask); } else { @@ -777,8 +766,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, case Opt_check_integrity_including_extent_data: case Opt_check_integrity: case Opt_check_integrity_print_mask: - btrfs_err(root->fs_info, - "support for check_integrity* not compiled in!"); + btrfs_err(info, + "support for check_integrity* not compiled in!"); ret = -EINVAL; goto out; #endif @@ -798,20 +787,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, intarg = 0; ret = match_int(&args[0], &intarg); if (ret < 0) { - btrfs_err(root->fs_info, - "invalid commit interval"); + btrfs_err(info, "invalid commit interval"); ret = -EINVAL; goto out; } if (intarg > 0) { if (intarg > 300) { - btrfs_warn(root->fs_info, + btrfs_warn(info, "excessive commit interval %d", intarg); } info->commit_interval = intarg; } else { - btrfs_info(root->fs_info, + btrfs_info(info, "using default commit interval %ds", BTRFS_DEFAULT_COMMIT_INTERVAL); info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; @@ -819,23 +807,22 @@ int btrfs_parse_options(struct btrfs_root *root, char *options, break; #ifdef CONFIG_BTRFS_DEBUG case Opt_fragment_all: - btrfs_info(root->fs_info, "fragmenting all space"); + btrfs_info(info, "fragmenting all space"); btrfs_set_opt(info->mount_opt, FRAGMENT_DATA); btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA); break; case Opt_fragment_metadata: - btrfs_info(root->fs_info, "fragmenting metadata"); + btrfs_info(info, "fragmenting metadata"); btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA); break; case Opt_fragment_data: - btrfs_info(root->fs_info, "fragmenting data"); + btrfs_info(info, "fragmenting data"); btrfs_set_opt(info->mount_opt, FRAGMENT_DATA); break; #endif case Opt_err: - btrfs_info(root->fs_info, - "unrecognized mount option '%s'", p); + btrfs_info(info, "unrecognized mount option '%s'", p); ret = -EINVAL; goto out; default: @@ -847,22 +834,22 @@ check: * Extra check for current option against current flag */ if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) { - btrfs_err(root->fs_info, + btrfs_err(info, "nologreplay must be used with ro mount option"); ret = -EINVAL; } out: - if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) && + if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) && !btrfs_test_opt(info, FREE_SPACE_TREE) && !btrfs_test_opt(info, CLEAR_CACHE)) { - btrfs_err(root->fs_info, "cannot disable free space tree"); + btrfs_err(info, "cannot disable free space tree"); ret = -EINVAL; } if (!ret && btrfs_test_opt(info, SPACE_CACHE)) - btrfs_info(root->fs_info, "disk space caching is enabled"); + btrfs_info(info, "disk space caching is enabled"); if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE)) - btrfs_info(root->fs_info, "using free space tree"); + btrfs_info(info, "using free space tree"); kfree(orig); return ret; } @@ -1173,7 +1160,7 @@ static int btrfs_fill_super(struct super_block *sb, return 0; fail_close: - close_ctree(fs_info->tree_root); + 
close_ctree(fs_info); return err; } @@ -1217,13 +1204,12 @@ int btrfs_sync_fs(struct super_block *sb, int wait) if (IS_ERR(trans)) return PTR_ERR(trans); } - return btrfs_commit_transaction(trans, root); + return btrfs_commit_transaction(trans); } static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) { struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb); - struct btrfs_root *root = info->tree_root; char *compress_type; if (btrfs_test_opt(info, DEGRADED)) @@ -1265,7 +1251,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",flushoncommit"); if (btrfs_test_opt(info, DISCARD)) seq_puts(seq, ",discard"); - if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) + if (!(info->sb->s_flags & MS_POSIXACL)) seq_puts(seq, ",noacl"); if (btrfs_test_opt(info, SPACE_CACHE)) seq_puts(seq, ",space_cache"); @@ -1744,7 +1730,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } } - ret = btrfs_parse_options(root, data, *flags); + ret = btrfs_parse_options(fs_info, data, *flags); if (ret) { ret = -EINVAL; goto restore; @@ -1784,11 +1770,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) btrfs_scrub_cancel(fs_info); btrfs_pause_balance(fs_info); - ret = btrfs_commit_super(root); + ret = btrfs_commit_super(fs_info); if (ret) goto restore; } else { - if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { btrfs_err(fs_info, "Remounting read-write after error is not allowed"); ret = -EINVAL; @@ -1901,9 +1887,10 @@ static inline void btrfs_descending_sort_devices( * The helper to calc the free space on the devices that can be used to store * file data. */ -static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) +static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, + u64 *free_bytes) { - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *root = fs_info->tree_root; struct btrfs_device_info *devices_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; @@ -2086,10 +2073,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 thresh = 0; int mixed = 0; - /* - * holding chunk_mutex to avoid allocating new chunks, holding - * device_list_mutex to avoid the device being removed - */ rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) { @@ -2141,7 +2124,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) spin_unlock(&block_rsv->lock); buf->f_bavail = div_u64(total_free_data, factor); - ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); + ret = btrfs_calc_avail_data_space(fs_info, &total_free_data); if (ret) return ret; buf->f_bavail += div_u64(total_free_data, factor); @@ -2249,9 +2232,10 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, static int btrfs_freeze(struct super_block *sb) { struct btrfs_trans_handle *trans; - struct btrfs_root *root = btrfs_sb(sb)->tree_root; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_root *root = fs_info->tree_root; - root->fs_info->fs_frozen = 1; + fs_info->fs_frozen = 1; /* * We don't need a barrier here, we'll wait for any transaction that * could be in progress on other threads (and do delayed iputs that @@ -2265,14 +2249,12 @@ static int btrfs_freeze(struct super_block *sb) return 0; return PTR_ERR(trans); } - return btrfs_commit_transaction(trans, root); 
+ return btrfs_commit_transaction(trans); } static int btrfs_unfreeze(struct super_block *sb) { - struct btrfs_root *root = btrfs_sb(sb)->tree_root; - - root->fs_info->fs_frozen = 0; + btrfs_sb(sb)->fs_frozen = 0; return 0; } diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index bf62ad919a95..ea272432c930 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -79,7 +79,7 @@ static void btrfs_destroy_test_fs(void) unregister_filesystem(&test_type); } -struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void) +struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) { struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL); @@ -100,6 +100,9 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void) return NULL; } + fs_info->nodesize = nodesize; + fs_info->sectorsize = sectorsize; + if (init_srcu_struct(&fs_info->subvol_srcu)) { kfree(fs_info->fs_devices); kfree(fs_info->super_copy); @@ -162,6 +165,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) slot = radix_tree_iter_retry(&iter); continue; } + slot = radix_tree_iter_resume(slot, &iter); spin_unlock(&fs_info->buffer_lock); free_extent_buffer_stale(eb); spin_lock(&fs_info->buffer_lock); @@ -189,7 +193,8 @@ void btrfs_free_dummy_root(struct btrfs_root *root) } struct btrfs_block_group_cache * -btrfs_alloc_dummy_block_group(unsigned long length, u32 sectorsize) +btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, + unsigned long length) { struct btrfs_block_group_cache *cache; @@ -206,8 +211,9 @@ btrfs_alloc_dummy_block_group(unsigned long length, u32 sectorsize) cache->key.objectid = 0; cache->key.offset = length; cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - cache->sectorsize = sectorsize; - cache->full_stripe_len = sectorsize; + cache->sectorsize = fs_info->sectorsize; + cache->full_stripe_len = fs_info->sectorsize; + cache->fs_info = fs_info; INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index b17ffbe8f9f3..266f1e3d1784 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -34,11 +34,11 @@ int btrfs_test_inodes(u32 sectorsize, u32 nodesize); int btrfs_test_qgroups(u32 sectorsize, u32 nodesize); int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize); struct inode *btrfs_new_test_inode(void); -struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void); +struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize); void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info); void btrfs_free_dummy_root(struct btrfs_root *root); struct btrfs_block_group_cache * -btrfs_alloc_dummy_block_group(unsigned long length, u32 sectorsize); +btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long length); void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache); void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans); #else diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index 199569174637..b9142c614114 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -41,13 +41,13 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) test_msg("Running btrfs_split_item tests\n"); - fs_info = btrfs_alloc_dummy_fs_info(); + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { test_msg("Could not allocate fs_info\n"); return -ENOMEM; } - root = 
btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { test_msg("Could not allocate root\n"); ret = PTR_ERR(root); @@ -61,8 +61,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) goto out; } - path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, nodesize, - nodesize); + path->nodes[0] = eb = alloc_dummy_extent_buffer(fs_info, nodesize); if (!eb) { test_msg("Could not allocate dummy buffer\n"); ret = -ENOMEM; diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index caad80bb9bd0..133753232a94 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -306,7 +306,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, int ret; memset(bitmap, 0, len); - memset_extent_buffer(eb, 0, 0, len); + memzero_extent_buffer(eb, 0, len); if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { test_msg("Bitmap was not zeroed\n"); return -EINVAL; @@ -383,6 +383,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) { + struct btrfs_fs_info *fs_info; unsigned long len; unsigned long *bitmap; struct extent_buffer *eb; @@ -397,13 +398,15 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) len = (sectorsize < BTRFS_MAX_METADATA_BLOCKSIZE) ? sectorsize * 4 : sectorsize; + fs_info = btrfs_alloc_dummy_fs_info(len, len); + bitmap = kmalloc(len, GFP_KERNEL); if (!bitmap) { test_msg("Couldn't allocate test bitmap\n"); return -ENOMEM; } - eb = __alloc_dummy_extent_buffer(NULL, 0, len); + eb = __alloc_dummy_extent_buffer(fs_info, 0, len); if (!eb) { test_msg("Couldn't allocate test extent buffer\n"); kfree(bitmap); diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 3221c8dee272..eca6412d42bd 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -843,33 +843,31 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize) int ret = -ENOMEM; test_msg("Running btrfs free space cache tests\n"); + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); + if (!fs_info) + return -ENOMEM; + /* * For ppc64 (with 64k page size), bytes per bitmap might be * larger than 1G. To make bitmap test available in ppc64, * alloc dummy block group whose size cross bitmaps. 
*/ - cache = btrfs_alloc_dummy_block_group(BITS_PER_BITMAP * sectorsize - + PAGE_SIZE, sectorsize); + cache = btrfs_alloc_dummy_block_group(fs_info, + BITS_PER_BITMAP * sectorsize + PAGE_SIZE); if (!cache) { test_msg("Couldn't run the tests\n"); + btrfs_free_dummy_fs_info(fs_info); return 0; } - fs_info = btrfs_alloc_dummy_fs_info(); - if (!fs_info) { - ret = -ENOMEM; - goto out; - } - - root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out; } root->fs_info->extent_root = root; - cache->fs_info = root->fs_info; ret = test_extents(cache); if (ret) diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 6e144048a72e..b29954c01673 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -455,14 +455,14 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, struct btrfs_path *path = NULL; int ret; - fs_info = btrfs_alloc_dummy_fs_info(); + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { test_msg("Couldn't allocate dummy fs info\n"); ret = -ENOMEM; goto out; } - root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { test_msg("Couldn't allocate dummy root\n"); ret = PTR_ERR(root); @@ -474,8 +474,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, root->fs_info->free_space_root = root; root->fs_info->tree_root = root; - root->node = alloc_test_extent_buffer(root->fs_info, - nodesize, nodesize); + root->node = alloc_test_extent_buffer(root->fs_info, nodesize); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); ret = -ENOMEM; @@ -485,7 +484,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize, btrfs_set_header_nritems(root->node, 0); root->alloc_bytenr += 2 * nodesize; - cache = btrfs_alloc_dummy_block_group(8 * alignment, sectorsize); + cache = btrfs_alloc_dummy_block_group(fs_info, 8 * alignment); if (!cache) { test_msg("Couldn't allocate dummy block group cache\n"); ret = -ENOMEM; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 0bf46808ce8f..4d0f038e14f1 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -249,19 +249,19 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; - fs_info = btrfs_alloc_dummy_fs_info(); + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { test_msg("Couldn't allocate dummy fs info\n"); goto out; } - root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { test_msg("Couldn't allocate root\n"); goto out; } - root->node = alloc_dummy_extent_buffer(NULL, nodesize, nodesize); + root->node = alloc_dummy_extent_buffer(fs_info, nodesize); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); goto out; @@ -854,19 +854,19 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; - fs_info = btrfs_alloc_dummy_fs_info(); + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { test_msg("Couldn't allocate dummy fs info\n"); goto out; } - root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + root = 
btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { test_msg("Couldn't allocate root\n"); goto out; } - root->node = alloc_dummy_extent_buffer(NULL, nodesize, nodesize); + root->node = alloc_dummy_extent_buffer(fs_info, nodesize); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); goto out; @@ -950,13 +950,13 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) return ret; } - fs_info = btrfs_alloc_dummy_fs_info(); + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { test_msg("Couldn't allocate dummy fs info\n"); goto out; } - root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { test_msg("Couldn't allocate root\n"); goto out; diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index ca7cb5e6d385..0f4ce970d195 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -458,13 +458,13 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) struct btrfs_root *tmp_root; int ret = 0; - fs_info = btrfs_alloc_dummy_fs_info(); + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); if (!fs_info) { test_msg("Couldn't allocate dummy fs info\n"); return -ENOMEM; } - root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(root)) { test_msg("Couldn't allocate root\n"); ret = PTR_ERR(root); @@ -486,8 +486,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) * Can't use bytenr 0, some things freak out * *cough*backref walking code*cough* */ - root->node = alloc_test_extent_buffer(root->fs_info, nodesize, - nodesize); + root->node = alloc_test_extent_buffer(root->fs_info, nodesize); if (!root->node) { test_msg("Couldn't allocate dummy buffer\n"); ret = -ENOMEM; @@ -497,7 +496,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) btrfs_set_header_nritems(root->node, 0); root->alloc_bytenr += 2 * nodesize; - tmp_root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + tmp_root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(tmp_root)) { test_msg("Couldn't allocate a fs root\n"); ret = PTR_ERR(tmp_root); @@ -512,7 +511,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize) goto out; } - tmp_root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize); + tmp_root = btrfs_alloc_dummy_root(fs_info); if (IS_ERR(tmp_root)) { test_msg("Couldn't allocate a fs root\n"); ret = PTR_ERR(tmp_root); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 9517de0e668c..0e0508f488b2 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -184,10 +184,10 @@ static inline int extwriter_counter_read(struct btrfs_transaction *trans) /* * either allocate a new transaction or hop into the existing one */ -static noinline int join_transaction(struct btrfs_root *root, unsigned int type) +static noinline int join_transaction(struct btrfs_fs_info *fs_info, + unsigned int type) { struct btrfs_transaction *cur_trans; - struct btrfs_fs_info *fs_info = root->fs_info; spin_lock(&fs_info->trans_lock); loop: @@ -314,9 +314,11 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root, int force) { + struct btrfs_fs_info *fs_info = root->fs_info; + if ((test_bit(BTRFS_ROOT_REF_COWS, &root->state) && root->last_trans < trans->transid) || force) { - WARN_ON(root == root->fs_info->extent_root); + WARN_ON(root == fs_info->extent_root); WARN_ON(root->commit_root != root->node); /* @@ -331,15 +333,15 @@ static int record_root_in_trans(struct 
btrfs_trans_handle *trans, */ smp_wmb(); - spin_lock(&root->fs_info->fs_roots_radix_lock); + spin_lock(&fs_info->fs_roots_radix_lock); if (root->last_trans == trans->transid && !force) { - spin_unlock(&root->fs_info->fs_roots_radix_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); return 0; } - radix_tree_tag_set(&root->fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, - BTRFS_ROOT_TRANS_TAG); - spin_unlock(&root->fs_info->fs_roots_radix_lock); + radix_tree_tag_set(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + spin_unlock(&fs_info->fs_roots_radix_lock); root->last_trans = trans->transid; /* this is pretty tricky. We don't want to @@ -372,6 +374,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; /* Add ourselves to the transaction dropped list */ @@ -380,16 +383,18 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, spin_unlock(&cur_trans->dropped_roots_lock); /* Make sure we don't try to update the root at commit time */ - spin_lock(&root->fs_info->fs_roots_radix_lock); - radix_tree_tag_clear(&root->fs_info->fs_roots_radix, + spin_lock(&fs_info->fs_roots_radix_lock); + radix_tree_tag_clear(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); - spin_unlock(&root->fs_info->fs_roots_radix_lock); + spin_unlock(&fs_info->fs_roots_radix_lock); } int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { + struct btrfs_fs_info *fs_info = root->fs_info; + if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return 0; @@ -402,9 +407,9 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state)) return 0; - mutex_lock(&root->fs_info->reloc_mutex); + mutex_lock(&fs_info->reloc_mutex); record_root_in_trans(trans, root, 0); - mutex_unlock(&root->fs_info->reloc_mutex); + mutex_unlock(&fs_info->reloc_mutex); return 0; } @@ -420,35 +425,35 @@ static inline int is_transaction_blocked(struct btrfs_transaction *trans) * when this is done, it is safe to start a new transaction, but the current * transaction might not be fully on disk. 
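/*
 * [Editorial sketch, not part of the patch. wait_current_trans() above keeps
 * its logic and only changes its argument to fs_info: pin the running
 * transaction with a reference under the lock, drop the lock, sleep until
 * the state passes UNBLOCKED, then drop the reference. A minimal pthread
 * model of that wait; all names are illustrative, this is not kernel code.]
 */
#include <pthread.h>

enum demo_state { DEMO_RUNNING, DEMO_BLOCKED, DEMO_UNBLOCKED, DEMO_COMPLETED };

struct demo_trans {
	enum demo_state state;
	int use_count;			/* refcount, protected by lock here */
	pthread_mutex_t lock;
	pthread_cond_t wait;		/* broadcast whenever state advances */
};

static void demo_wait_current_trans(struct demo_trans *t)
{
	pthread_mutex_lock(&t->lock);
	if (t->state == DEMO_BLOCKED) {
		t->use_count++;		/* pin it so it cannot vanish while we sleep */
		while (t->state < DEMO_UNBLOCKED)
			pthread_cond_wait(&t->wait, &t->lock);
		t->use_count--;		/* stand-in for btrfs_put_transaction() */
	}
	pthread_mutex_unlock(&t->lock);
}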
*/ -static void wait_current_trans(struct btrfs_root *root) +static void wait_current_trans(struct btrfs_fs_info *fs_info) { struct btrfs_transaction *cur_trans; - spin_lock(&root->fs_info->trans_lock); - cur_trans = root->fs_info->running_transaction; + spin_lock(&fs_info->trans_lock); + cur_trans = fs_info->running_transaction; if (cur_trans && is_transaction_blocked(cur_trans)) { atomic_inc(&cur_trans->use_count); - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); - wait_event(root->fs_info->transaction_wait, + wait_event(fs_info->transaction_wait, cur_trans->state >= TRANS_STATE_UNBLOCKED || cur_trans->aborted); btrfs_put_transaction(cur_trans); } else { - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); } } -static int may_wait_transaction(struct btrfs_root *root, int type) +static int may_wait_transaction(struct btrfs_fs_info *fs_info, int type) { - if (test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) return 0; if (type == TRANS_USERSPACE) return 1; if (type == TRANS_START && - !atomic_read(&root->fs_info->open_ioctl_trans)) + !atomic_read(&fs_info->open_ioctl_trans)) return 1; return 0; @@ -456,7 +461,9 @@ static int may_wait_transaction(struct btrfs_root *root, int type) static inline bool need_reserve_reloc_root(struct btrfs_root *root) { - if (!root->fs_info->reloc_ctl || + struct btrfs_fs_info *fs_info = root->fs_info; + + if (!fs_info->reloc_ctl || !test_bit(BTRFS_ROOT_REF_COWS, &root->state) || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || root->reloc_root) @@ -469,6 +476,8 @@ static struct btrfs_trans_handle * start_transaction(struct btrfs_root *root, unsigned int num_items, unsigned int type, enum btrfs_reserve_flush_enum flush) { + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; u64 num_bytes = 0; @@ -479,7 +488,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, /* Send isn't supposed to start transactions. */ ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB); - if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) + if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return ERR_PTR(-EROFS); if (current->journal_info) { @@ -496,23 +505,22 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, * Do the reservation before we join the transaction so we can do all * the appropriate flushing if need be. */ - if (num_items > 0 && root != root->fs_info->chunk_root) { - qgroup_reserved = num_items * root->nodesize; + if (num_items > 0 && root != fs_info->chunk_root) { + qgroup_reserved = num_items * fs_info->nodesize; ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved); if (ret) return ERR_PTR(ret); - num_bytes = btrfs_calc_trans_metadata_size(root, num_items); + num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items); /* * Do the reservation for the relocation root creation */ if (need_reserve_reloc_root(root)) { - num_bytes += root->nodesize; + num_bytes += fs_info->nodesize; reloc_reserved = true; } - ret = btrfs_block_rsv_add(root, - &root->fs_info->trans_block_rsv, + ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv, num_bytes, flush); if (ret) goto reserve_fail; @@ -535,15 +543,15 @@ again: * transaction and commit it, so we needn't do sb_start_intwrite(). 
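/*
 * [Editorial sketch, not part of the patch. The reservation above is now
 * sized from fs_info: the qgroup reservation is num_items * nodesize, the
 * metadata reservation comes from btrfs_calc_trans_metadata_size(), and one
 * extra node is added when a relocation root may have to be created. The
 * cost formula below is a made-up stand-in, not the real helper.]
 */
#include <stdbool.h>
#include <stdint.h>

#define DEMO_MAX_LEVEL 8	/* assumed tree depth, illustrative only */

static uint64_t demo_calc_trans_metadata_size(uint32_t nodesize, unsigned int num_items)
{
	/* pretend each item can dirty a full path of DEMO_MAX_LEVEL nodes */
	return (uint64_t)nodesize * DEMO_MAX_LEVEL * num_items;
}

static uint64_t demo_reservation(uint32_t nodesize, unsigned int num_items,
				 bool need_reloc_root)
{
	uint64_t num_bytes = demo_calc_trans_metadata_size(nodesize, num_items);

	if (need_reloc_root)
		num_bytes += nodesize;	/* room for the relocation root's node */
	return num_bytes;
}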
*/ if (type & __TRANS_FREEZABLE) - sb_start_intwrite(root->fs_info->sb); + sb_start_intwrite(fs_info->sb); - if (may_wait_transaction(root, type)) - wait_current_trans(root); + if (may_wait_transaction(fs_info, type)) + wait_current_trans(fs_info); do { - ret = join_transaction(root, type); + ret = join_transaction(fs_info, type); if (ret == -EBUSY) { - wait_current_trans(root); + wait_current_trans(fs_info); if (unlikely(type == TRANS_ATTACH)) ret = -ENOENT; } @@ -552,7 +560,7 @@ again: if (ret < 0) goto join_fail; - cur_trans = root->fs_info->running_transaction; + cur_trans = fs_info->running_transaction; h->transid = cur_trans->transid; h->transaction = cur_trans; @@ -567,16 +575,16 @@ again: smp_mb(); if (cur_trans->state >= TRANS_STATE_BLOCKED && - may_wait_transaction(root, type)) { + may_wait_transaction(fs_info, type)) { current->journal_info = h; - btrfs_commit_transaction(h, root); + btrfs_commit_transaction(h); goto again; } if (num_bytes) { - trace_btrfs_space_reservation(root->fs_info, "transaction", + trace_btrfs_space_reservation(fs_info, "transaction", h->transid, num_bytes, 1); - h->block_rsv = &root->fs_info->trans_block_rsv; + h->block_rsv = &fs_info->trans_block_rsv; h->bytes_reserved = num_bytes; h->reloc_reserved = reloc_reserved; } @@ -590,11 +598,11 @@ got_it: join_fail: if (type & __TRANS_FREEZABLE) - sb_end_intwrite(root->fs_info->sb); + sb_end_intwrite(fs_info->sb); kmem_cache_free(btrfs_trans_handle_cachep, h); alloc_fail: if (num_bytes) - btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, + btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv, num_bytes); reserve_fail: btrfs_qgroup_free_meta(root, qgroup_reserved); @@ -612,6 +620,7 @@ struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv( unsigned int num_items, int min_factor) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; u64 num_bytes; int ret; @@ -624,19 +633,17 @@ struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv( if (IS_ERR(trans)) return trans; - num_bytes = btrfs_calc_trans_metadata_size(root, num_items); - ret = btrfs_cond_migrate_bytes(root->fs_info, - &root->fs_info->trans_block_rsv, - num_bytes, - min_factor); + num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items); + ret = btrfs_cond_migrate_bytes(fs_info, &fs_info->trans_block_rsv, + num_bytes, min_factor); if (ret) { - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); return ERR_PTR(ret); } - trans->block_rsv = &root->fs_info->trans_block_rsv; + trans->block_rsv = &fs_info->trans_block_rsv; trans->bytes_reserved = num_bytes; - trace_btrfs_space_reservation(root->fs_info, "transaction", + trace_btrfs_space_reservation(fs_info, "transaction", trans->transid, num_bytes, 1); return trans; @@ -702,30 +709,29 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root) trans = start_transaction(root, 0, TRANS_ATTACH, BTRFS_RESERVE_NO_FLUSH); if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT) - btrfs_wait_for_commit(root, 0); + btrfs_wait_for_commit(root->fs_info, 0); return trans; } /* wait for a transaction commit to be fully complete */ -static noinline void wait_for_commit(struct btrfs_root *root, - struct btrfs_transaction *commit) +static noinline void wait_for_commit(struct btrfs_transaction *commit) { wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED); } -int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) +int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) { struct 
btrfs_transaction *cur_trans = NULL, *t; int ret = 0; if (transid) { - if (transid <= root->fs_info->last_trans_committed) + if (transid <= fs_info->last_trans_committed) goto out; /* find specified transaction */ - spin_lock(&root->fs_info->trans_lock); - list_for_each_entry(t, &root->fs_info->trans_list, list) { + spin_lock(&fs_info->trans_lock); + list_for_each_entry(t, &fs_info->trans_list, list) { if (t->transid == transid) { cur_trans = t; atomic_inc(&cur_trans->use_count); @@ -737,21 +743,21 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) break; } } - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); /* * The specified transaction doesn't exist, or we * raced with btrfs_commit_transaction */ if (!cur_trans) { - if (transid > root->fs_info->last_trans_committed) + if (transid > fs_info->last_trans_committed) ret = -EINVAL; goto out; } } else { /* find newest transaction that is committing | committed */ - spin_lock(&root->fs_info->trans_lock); - list_for_each_entry_reverse(t, &root->fs_info->trans_list, + spin_lock(&fs_info->trans_lock); + list_for_each_entry_reverse(t, &fs_info->trans_list, list) { if (t->state >= TRANS_STATE_COMMIT_START) { if (t->state == TRANS_STATE_COMPLETED) @@ -761,37 +767,38 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) break; } } - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); if (!cur_trans) goto out; /* nothing committing|committed */ } - wait_for_commit(root, cur_trans); + wait_for_commit(cur_trans); btrfs_put_transaction(cur_trans); out: return ret; } -void btrfs_throttle(struct btrfs_root *root) +void btrfs_throttle(struct btrfs_fs_info *fs_info) { - if (!atomic_read(&root->fs_info->open_ioctl_trans)) - wait_current_trans(root); + if (!atomic_read(&fs_info->open_ioctl_trans)) + wait_current_trans(fs_info); } -static int should_end_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static int should_end_transaction(struct btrfs_trans_handle *trans) { - if (root->fs_info->global_block_rsv.space_info->full && - btrfs_check_space_for_delayed_refs(trans, root)) + struct btrfs_fs_info *fs_info = trans->fs_info; + + if (fs_info->global_block_rsv.space_info->full && + btrfs_check_space_for_delayed_refs(trans, fs_info)) return 1; - return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5); + return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5); } -int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +int btrfs_should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_transaction *cur_trans = trans->transaction; + struct btrfs_fs_info *fs_info = trans->fs_info; int updates; int err; @@ -803,19 +810,19 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, updates = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (updates) { - err = btrfs_run_delayed_refs(trans, root, updates * 2); + err = btrfs_run_delayed_refs(trans, fs_info, updates * 2); if (err) /* Error code will also eval true */ return err; } - return should_end_transaction(trans, root); + return should_end_transaction(trans); } static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int throttle) + int throttle) { + struct btrfs_fs_info *info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; - struct btrfs_fs_info *info = root->fs_info; u64 transid = trans->transid; unsigned long cur = 
trans->delayed_ref_updates; int lock = (trans->type != TRANS_JOIN_NOLOCK); @@ -828,16 +835,16 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, return 0; } - btrfs_trans_release_metadata(trans, root); + btrfs_trans_release_metadata(trans, info); trans->block_rsv = NULL; if (!list_empty(&trans->new_bgs)) - btrfs_create_pending_block_groups(trans, root); + btrfs_create_pending_block_groups(trans, info); trans->delayed_ref_updates = 0; if (!trans->sync) { must_run_delayed_refs = - btrfs_should_throttle_delayed_refs(trans, root); + btrfs_should_throttle_delayed_refs(trans, info); cur = max_t(unsigned long, cur, 32); /* @@ -849,16 +856,16 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, must_run_delayed_refs = 2; } - btrfs_trans_release_metadata(trans, root); + btrfs_trans_release_metadata(trans, info); trans->block_rsv = NULL; if (!list_empty(&trans->new_bgs)) - btrfs_create_pending_block_groups(trans, root); + btrfs_create_pending_block_groups(trans, info); btrfs_trans_release_chunk_metadata(trans); - if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && - should_end_transaction(trans, root) && + if (lock && !atomic_read(&info->open_ioctl_trans) && + should_end_transaction(trans) && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) { spin_lock(&info->trans_lock); if (cur_trans->state == TRANS_STATE_RUNNING) @@ -868,13 +875,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) { if (throttle) - return btrfs_commit_transaction(trans, root); + return btrfs_commit_transaction(trans); else wake_up_process(info->transaction_kthread); } if (trans->type & __TRANS_FREEZABLE) - sb_end_intwrite(root->fs_info->sb); + sb_end_intwrite(info->sb); WARN_ON(cur_trans != info->running_transaction); WARN_ON(atomic_read(&cur_trans->num_writers) < 1); @@ -893,10 +900,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, current->journal_info = NULL; if (throttle) - btrfs_run_delayed_iputs(root); + btrfs_run_delayed_iputs(info); if (trans->aborted || - test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { + test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) { wake_up_process(info->transaction_kthread); err = -EIO; } @@ -904,22 +911,20 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_trans_handle_cachep, trans); if (must_run_delayed_refs) { - btrfs_async_run_delayed_refs(root, cur, transid, + btrfs_async_run_delayed_refs(info, cur, transid, must_run_delayed_refs == 1); } return err; } -int btrfs_end_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +int btrfs_end_transaction(struct btrfs_trans_handle *trans) { - return __btrfs_end_transaction(trans, root, 0); + return __btrfs_end_transaction(trans, 0); } -int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans) { - return __btrfs_end_transaction(trans, root, 1); + return __btrfs_end_transaction(trans, 1); } /* @@ -927,12 +932,12 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, * them in one of two extent_io trees. 
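/*
 * [Editorial sketch, not part of the patch. The running theme of these
 * hunks: the handle records its fs_info when it is started, so
 * __btrfs_end_transaction() and friends can drop the root parameter and
 * read trans->fs_info instead. A minimal model of that API narrowing;
 * names are illustrative.]
 */
struct demo_fs;				/* opaque here, like btrfs_fs_info */

struct demo_trans_handle {
	struct demo_fs *fs_info;	/* filled in when the handle is started */
	unsigned long delayed_ref_updates;
};

/* old shape: int demo_end_transaction(struct demo_trans_handle *t, struct demo_root *root); */
static int demo_end_transaction(struct demo_trans_handle *t)
{
	struct demo_fs *info = t->fs_info;	/* one hop, no root argument needed */

	(void)info;
	return 0;
}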
This is used to make sure all of * those extents are sent to disk but does not wait on them */ -int btrfs_write_marked_extents(struct btrfs_root *root, +int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages, int mark) { int err = 0; int werr = 0; - struct address_space *mapping = root->fs_info->btree_inode->i_mapping; + struct address_space *mapping = fs_info->btree_inode->i_mapping; struct extent_state *cached_state = NULL; u64 start = 0; u64 end; @@ -949,11 +954,11 @@ int btrfs_write_marked_extents(struct btrfs_root *root, * time a temporary error. So when it happens, ignore the error * and wait for writeback of this range to finish - because we * failed to set the bit EXTENT_NEED_WAIT for the range, a call - * to btrfs_wait_marked_extents() would not know that writeback - * for this range started and therefore wouldn't wait for it to - * finish - we don't want to commit a superblock that points to - * btree nodes/leafs for which writeback hasn't finished yet - * (and without errors). + * to __btrfs_wait_marked_extents() would not know that + * writeback for this range started and therefore wouldn't + * wait for it to finish - we don't want to commit a + * superblock that points to btree nodes/leafs for which + * writeback hasn't finished yet (and without errors). * We cleanup any entries left in the io tree when committing * the transaction (through clear_btree_io_tree()). */ @@ -981,16 +986,15 @@ int btrfs_write_marked_extents(struct btrfs_root *root, * those extents are on disk for transaction or log commit. We wait * on all the pages and clear them from the dirty pages state tree */ -int btrfs_wait_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages, int mark) +static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, + struct extent_io_tree *dirty_pages) { int err = 0; int werr = 0; - struct address_space *mapping = root->fs_info->btree_inode->i_mapping; + struct address_space *mapping = fs_info->btree_inode->i_mapping; struct extent_state *cached_state = NULL; u64 start = 0; u64 end; - bool errors = false; while (!find_first_extent_bit(dirty_pages, start, &start, &end, EXTENT_NEED_WAIT, &cached_state)) { @@ -1018,27 +1022,45 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, } if (err) werr = err; + return werr; +} - if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { - if ((mark & EXTENT_DIRTY) && - test_and_clear_bit(BTRFS_FS_LOG1_ERR, - &root->fs_info->flags)) - errors = true; +int btrfs_wait_extents(struct btrfs_fs_info *fs_info, + struct extent_io_tree *dirty_pages) +{ + bool errors = false; + int err; - if ((mark & EXTENT_NEW) && - test_and_clear_bit(BTRFS_FS_LOG2_ERR, - &root->fs_info->flags)) - errors = true; - } else { - if (test_and_clear_bit(BTRFS_FS_BTREE_ERR, - &root->fs_info->flags)) - errors = true; - } + err = __btrfs_wait_marked_extents(fs_info, dirty_pages); + if (test_and_clear_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags)) + errors = true; - if (errors && !werr) - werr = -EIO; + if (errors && !err) + err = -EIO; + return err; +} - return werr; +int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark) +{ + struct btrfs_fs_info *fs_info = log_root->fs_info; + struct extent_io_tree *dirty_pages = &log_root->dirty_log_pages; + bool errors = false; + int err; + + ASSERT(log_root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + + err = __btrfs_wait_marked_extents(fs_info, dirty_pages); + if ((mark & EXTENT_DIRTY) && + test_and_clear_bit(BTRFS_FS_LOG1_ERR, 
&fs_info->flags)) + errors = true; + + if ((mark & EXTENT_NEW) && + test_and_clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags)) + errors = true; + + if (errors && !err) + err = -EIO; + return err; } /* @@ -1046,7 +1068,7 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, * them in one of two extent_io trees. This is used to make sure all of * those extents are on disk for transaction or log commit */ -static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, +static int btrfs_write_and_wait_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages, int mark) { int ret; @@ -1054,9 +1076,9 @@ static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, struct blk_plug plug; blk_start_plug(&plug); - ret = btrfs_write_marked_extents(root, dirty_pages, mark); + ret = btrfs_write_marked_extents(fs_info, dirty_pages, mark); blk_finish_plug(&plug); - ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); + ret2 = btrfs_wait_extents(fs_info, dirty_pages); if (ret) return ret; @@ -1066,11 +1088,11 @@ static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, } static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root) + struct btrfs_fs_info *fs_info) { int ret; - ret = btrfs_write_and_wait_marked_extents(root, + ret = btrfs_write_and_wait_marked_extents(fs_info, &trans->transaction->dirty_pages, EXTENT_DIRTY); clear_btree_io_tree(&trans->transaction->dirty_pages); @@ -1094,7 +1116,8 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, int ret; u64 old_root_bytenr; u64 old_root_used; - struct btrfs_root *tree_root = root->fs_info->tree_root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *tree_root = fs_info->tree_root; old_root_used = btrfs_root_used(&root->root_item); @@ -1125,9 +1148,8 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, * to clean up the delayed refs. 
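/*
 * [Editorial sketch, not part of the patch. The hunk above splits the old
 * waiter into a common __btrfs_wait_marked_extents() plus thin wrappers:
 * each wrapper test-and-clears its own sticky error bit (BTREE_ERR here,
 * LOG1/LOG2 for tree logs) and folds it into -EIO exactly once. A
 * user-space model with C11 atomics; names are illustrative.]
 */
#include <errno.h>
#include <stdatomic.h>

static atomic_int demo_btree_err;	/* sticky flag set at writeback failure */

static int demo_wait_marked(void)
{
	/* ...wait for the marked pages; return errors seen while waiting... */
	return 0;
}

static int demo_wait_extents(void)
{
	int err = demo_wait_marked();

	/* read the flag and reset it in one atomic step (test_and_clear_bit) */
	if (atomic_exchange(&demo_btree_err, 0) && !err)
		err = -EIO;
	return err;
}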
*/ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, - struct btrfs_root *root) + struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = root->fs_info; struct list_head *dirty_bgs = &trans->transaction->dirty_bgs; struct list_head *io_bgs = &trans->transaction->io_bgs; struct list_head *next; @@ -1143,30 +1165,31 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); if (ret) return ret; - ret = btrfs_run_dev_stats(trans, root->fs_info); + ret = btrfs_run_dev_stats(trans, fs_info); if (ret) return ret; - ret = btrfs_run_dev_replace(trans, root->fs_info); + ret = btrfs_run_dev_replace(trans, fs_info); if (ret) return ret; - ret = btrfs_run_qgroups(trans, root->fs_info); + ret = btrfs_run_qgroups(trans, fs_info); if (ret) return ret; - ret = btrfs_setup_space_cache(trans, root); + ret = btrfs_setup_space_cache(trans, fs_info); if (ret) return ret; /* run_qgroups might have added some more refs */ - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); if (ret) return ret; again: while (!list_empty(&fs_info->dirty_cowonly_roots)) { + struct btrfs_root *root; next = fs_info->dirty_cowonly_roots.next; list_del_init(next); root = list_entry(next, struct btrfs_root, dirty_list); @@ -1178,16 +1201,16 @@ again: ret = update_cowonly_root(trans, root); if (ret) return ret; - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); if (ret) return ret; } while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) { - ret = btrfs_write_dirty_block_groups(trans, root); + ret = btrfs_write_dirty_block_groups(trans, fs_info); if (ret) return ret; - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); if (ret) return ret; } @@ -1209,20 +1232,21 @@ again: */ void btrfs_add_dead_root(struct btrfs_root *root) { - spin_lock(&root->fs_info->trans_lock); + struct btrfs_fs_info *fs_info = root->fs_info; + + spin_lock(&fs_info->trans_lock); if (list_empty(&root->root_list)) - list_add_tail(&root->root_list, &root->fs_info->dead_roots); - spin_unlock(&root->fs_info->trans_lock); + list_add_tail(&root->root_list, &fs_info->dead_roots); + spin_unlock(&fs_info->trans_lock); } /* * update all the cowonly tree roots on disk */ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, - struct btrfs_root *root) + struct btrfs_fs_info *fs_info) { struct btrfs_root *gang[8]; - struct btrfs_fs_info *fs_info = root->fs_info; int i; int ret; int err = 0; @@ -1236,7 +1260,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, if (ret == 0) break; for (i = 0; i < ret; i++) { - root = gang[i]; + struct btrfs_root *root = gang[i]; radix_tree_tag_clear(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); @@ -1292,8 +1316,8 @@ int btrfs_defrag_root(struct btrfs_root *root) ret = btrfs_defrag_leaves(trans, root); - btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(info->tree_root); + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(info); cond_resched(); if (btrfs_fs_closing(info) || ret != -EAGAIN) @@ -1343,7 +1367,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, */ 
mutex_lock(&fs_info->tree_log_mutex); - ret = commit_fs_roots(trans, src); + ret = commit_fs_roots(trans, fs_info); if (ret) goto out; ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); @@ -1372,11 +1396,11 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, * like chunk and root tree, as they won't affect qgroup. * And we don't write super to avoid half committed status. */ - ret = commit_cowonly_roots(trans, src); + ret = commit_cowonly_roots(trans, fs_info); if (ret) goto out; switch_commit_roots(trans->transaction, fs_info); - ret = btrfs_write_and_wait_transaction(trans, src); + ret = btrfs_write_and_wait_transaction(trans, fs_info); if (ret) btrfs_handle_fs_error(fs_info, ret, "Error while writing out transaction for qgroup"); @@ -1462,7 +1486,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, rsv = trans->block_rsv; trans->block_rsv = &pending->block_rsv; trans->bytes_reserved = trans->block_rsv->reserved; - trace_btrfs_space_reservation(root->fs_info, "transaction", + trace_btrfs_space_reservation(fs_info, "transaction", trans->transid, trans->bytes_reserved, 1); dentry = pending->dentry; @@ -1499,7 +1523,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * otherwise we corrupt the FS during * snapshot */ - ret = btrfs_run_delayed_items(trans, root); + ret = btrfs_run_delayed_items(trans, fs_info); if (ret) { /* Transaction aborted */ btrfs_abort_transaction(trans, ret); goto fail; @@ -1572,7 +1596,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* * insert root back/forward references */ - ret = btrfs_add_root_ref(trans, tree_root, objectid, + ret = btrfs_add_root_ref(trans, fs_info, objectid, parent_root->root_key.objectid, btrfs_ino(parent_inode), index, dentry->d_name.name, dentry->d_name.len); @@ -1582,7 +1606,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } key.offset = (u64)-1; - pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); + pending->snap = btrfs_read_fs_root_no_name(fs_info, &key); if (IS_ERR(pending->snap)) { ret = PTR_ERR(pending->snap); btrfs_abort_transaction(trans, ret); @@ -1595,7 +1619,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1632,14 +1656,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); goto fail; } - ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, new_uuid.b, + ret = btrfs_uuid_tree_add(trans, fs_info, new_uuid.b, BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) { - ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, + ret = btrfs_uuid_tree_add(trans, fs_info, new_root_item->received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, objectid); @@ -1649,7 +1673,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } } - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1690,25 +1714,25 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, return ret; } -static void 
update_super_roots(struct btrfs_root *root) +static void update_super_roots(struct btrfs_fs_info *fs_info) { struct btrfs_root_item *root_item; struct btrfs_super_block *super; - super = root->fs_info->super_copy; + super = fs_info->super_copy; - root_item = &root->fs_info->chunk_root->root_item; + root_item = &fs_info->chunk_root->root_item; super->chunk_root = root_item->bytenr; super->chunk_root_generation = root_item->generation; super->chunk_root_level = root_item->level; - root_item = &root->fs_info->tree_root->root_item; + root_item = &fs_info->tree_root->root_item; super->root = root_item->bytenr; super->generation = root_item->generation; super->root_level = root_item->level; - if (btrfs_test_opt(root->fs_info, SPACE_CACHE)) + if (btrfs_test_opt(fs_info, SPACE_CACHE)) super->cache_generation = root_item->generation; - if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &root->fs_info->flags)) + if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags)) super->uuid_tree_generation = root_item->generation; } @@ -1742,24 +1766,23 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info) * wait for the current transaction commit to start and block subsequent * transaction joins */ -static void wait_current_trans_commit_start(struct btrfs_root *root, +static void wait_current_trans_commit_start(struct btrfs_fs_info *fs_info, struct btrfs_transaction *trans) { - wait_event(root->fs_info->transaction_blocked_wait, - trans->state >= TRANS_STATE_COMMIT_START || - trans->aborted); + wait_event(fs_info->transaction_blocked_wait, + trans->state >= TRANS_STATE_COMMIT_START || trans->aborted); } /* * wait for the current transaction to start and then become unblocked. * caller holds ref. */ -static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, - struct btrfs_transaction *trans) +static void wait_current_trans_commit_start_and_unblock( + struct btrfs_fs_info *fs_info, + struct btrfs_transaction *trans) { - wait_event(root->fs_info->transaction_wait, - trans->state >= TRANS_STATE_UNBLOCKED || - trans->aborted); + wait_event(fs_info->transaction_wait, + trans->state >= TRANS_STATE_UNBLOCKED || trans->aborted); } /* @@ -1768,7 +1791,6 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, */ struct btrfs_async_commit { struct btrfs_trans_handle *newtrans; - struct btrfs_root *root; struct work_struct work; }; @@ -1782,18 +1804,18 @@ static void do_async_commit(struct work_struct *work) * Tell lockdep about it. 
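/*
 * [Editorial sketch, not part of the patch. btrfs_commit_transaction_async()
 * above now reaches fs_info through the handle and no longer stores a root
 * in the work item. The overall shape, with a pthread standing in for the
 * workqueue: allocate a work item, hand the new handle to a worker, and let
 * the worker own it from then on. All names are illustrative.]
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_handle { int id; };
struct demo_async_commit { struct demo_handle *newtrans; };

static int demo_commit(struct demo_handle *h)
{
	printf("committing handle %d from worker\n", h->id);
	return 0;
}

static void *demo_do_async_commit(void *arg)
{
	struct demo_async_commit *ac = arg;

	demo_commit(ac->newtrans);	/* the worker owns the handle from here */
	free(ac->newtrans);
	free(ac);
	return NULL;
}

int main(void)
{
	struct demo_async_commit *ac = malloc(sizeof(*ac));
	pthread_t worker;

	if (!ac)
		return 1;
	ac->newtrans = malloc(sizeof(*ac->newtrans));
	if (!ac->newtrans) {
		free(ac);
		return 1;
	}
	ac->newtrans->id = 1;
	pthread_create(&worker, NULL, demo_do_async_commit, ac);
	pthread_join(worker, NULL);	/* stand-in for waiting on commit start */
	return 0;
}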
*/ if (ac->newtrans->type & __TRANS_FREEZABLE) - __sb_writers_acquired(ac->root->fs_info->sb, SB_FREEZE_FS); + __sb_writers_acquired(ac->newtrans->fs_info->sb, SB_FREEZE_FS); current->journal_info = ac->newtrans; - btrfs_commit_transaction(ac->newtrans, ac->root); + btrfs_commit_transaction(ac->newtrans); kfree(ac); } int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int wait_for_unblock) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_async_commit *ac; struct btrfs_transaction *cur_trans; @@ -1802,8 +1824,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, return -ENOMEM; INIT_WORK(&ac->work, do_async_commit); - ac->root = root; - ac->newtrans = btrfs_join_transaction(root); + ac->newtrans = btrfs_join_transaction(trans->root); if (IS_ERR(ac->newtrans)) { int err = PTR_ERR(ac->newtrans); kfree(ac); @@ -1814,22 +1835,22 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, cur_trans = trans->transaction; atomic_inc(&cur_trans->use_count); - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); /* * Tell lockdep we've released the freeze rwsem, since the * async commit thread will be the one to unlock it. */ if (ac->newtrans->type & __TRANS_FREEZABLE) - __sb_writers_release(root->fs_info->sb, SB_FREEZE_FS); + __sb_writers_release(fs_info->sb, SB_FREEZE_FS); schedule_work(&ac->work); /* wait for transaction to start and unblock */ if (wait_for_unblock) - wait_current_trans_commit_start_and_unblock(root, cur_trans); + wait_current_trans_commit_start_and_unblock(fs_info, cur_trans); else - wait_current_trans_commit_start(root, cur_trans); + wait_current_trans_commit_start(fs_info, cur_trans); if (current->journal_info == trans) current->journal_info = NULL; @@ -1842,6 +1863,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, static void cleanup_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, int err) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; DEFINE_WAIT(wait); @@ -1849,7 +1871,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, err); - spin_lock(&root->fs_info->trans_lock); + spin_lock(&fs_info->trans_lock); /* * If the transaction is removed from the list, it means this @@ -1859,25 +1881,25 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, BUG_ON(list_empty(&cur_trans->list)); list_del_init(&cur_trans->list); - if (cur_trans == root->fs_info->running_transaction) { + if (cur_trans == fs_info->running_transaction) { cur_trans->state = TRANS_STATE_COMMIT_DOING; - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); - spin_lock(&root->fs_info->trans_lock); + spin_lock(&fs_info->trans_lock); } - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); - btrfs_cleanup_one_transaction(trans->transaction, root); + btrfs_cleanup_one_transaction(trans->transaction, fs_info); - spin_lock(&root->fs_info->trans_lock); - if (cur_trans == root->fs_info->running_transaction) - root->fs_info->running_transaction = NULL; - spin_unlock(&root->fs_info->trans_lock); + spin_lock(&fs_info->trans_lock); + if (cur_trans == fs_info->running_transaction) + fs_info->running_transaction = NULL; + spin_unlock(&fs_info->trans_lock); if (trans->type & __TRANS_FREEZABLE) - sb_end_intwrite(root->fs_info->sb); 
+ sb_end_intwrite(fs_info->sb); btrfs_put_transaction(cur_trans); btrfs_put_transaction(cur_trans); @@ -1885,7 +1907,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, if (current->journal_info == trans) current->journal_info = NULL; - btrfs_scrub_cancel(root->fs_info); + btrfs_scrub_cancel(fs_info); kmem_cache_free(btrfs_trans_handle_cachep, trans); } @@ -1910,9 +1932,9 @@ btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans) atomic_read(&cur_trans->pending_ordered) == 0); } -int btrfs_commit_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +int btrfs_commit_transaction(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_transaction *prev_trans = NULL; int ret; @@ -1920,20 +1942,20 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, /* Stop the commit early if ->aborted is set */ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); return ret; } /* make a pass through all the delayed refs we have so far * any runnings procs may add more while we are here */ - ret = btrfs_run_delayed_refs(trans, root, 0); + ret = btrfs_run_delayed_refs(trans, fs_info, 0); if (ret) { - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); return ret; } - btrfs_trans_release_metadata(trans, root); + btrfs_trans_release_metadata(trans, fs_info); trans->block_rsv = NULL; cur_trans = trans->transaction; @@ -1946,11 +1968,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, smp_wmb(); if (!list_empty(&trans->new_bgs)) - btrfs_create_pending_block_groups(trans, root); + btrfs_create_pending_block_groups(trans, fs_info); - ret = btrfs_run_delayed_refs(trans, root, 0); + ret = btrfs_run_delayed_refs(trans, fs_info, 0); if (ret) { - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); return ret; } @@ -1970,27 +1992,27 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, * hurt to have more than one go through, but there's no * real advantage to it either. 
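/*
 * [Editorial sketch, not part of the patch. The logic above is unchanged by
 * the refactor: exactly one task gets to start the dirty block group
 * writeout, decided by test_and_set_bit() on a per-transaction flag. The
 * same idiom with a C11 atomic_flag; names are illustrative.]
 */
#include <stdatomic.h>

static atomic_flag demo_dirty_bg_run = ATOMIC_FLAG_INIT;

static void demo_maybe_start_dirty_bgs(void)
{
	if (!atomic_flag_test_and_set(&demo_dirty_bg_run)) {
		/* first caller wins; every later caller sees the flag set */
	}
}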
*/ - mutex_lock(&root->fs_info->ro_block_group_mutex); + mutex_lock(&fs_info->ro_block_group_mutex); if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) run_it = 1; - mutex_unlock(&root->fs_info->ro_block_group_mutex); + mutex_unlock(&fs_info->ro_block_group_mutex); if (run_it) - ret = btrfs_start_dirty_block_groups(trans, root); + ret = btrfs_start_dirty_block_groups(trans, fs_info); } if (ret) { - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); return ret; } - spin_lock(&root->fs_info->trans_lock); + spin_lock(&fs_info->trans_lock); if (cur_trans->state >= TRANS_STATE_COMMIT_START) { - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); atomic_inc(&cur_trans->use_count); - ret = btrfs_end_transaction(trans, root); + ret = btrfs_end_transaction(trans); - wait_for_commit(root, cur_trans); + wait_for_commit(cur_trans); if (unlikely(cur_trans->aborted)) ret = cur_trans->aborted; @@ -2001,35 +2023,35 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, } cur_trans->state = TRANS_STATE_COMMIT_START; - wake_up(&root->fs_info->transaction_blocked_wait); + wake_up(&fs_info->transaction_blocked_wait); - if (cur_trans->list.prev != &root->fs_info->trans_list) { + if (cur_trans->list.prev != &fs_info->trans_list) { prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); if (prev_trans->state != TRANS_STATE_COMPLETED) { atomic_inc(&prev_trans->use_count); - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); - wait_for_commit(root, prev_trans); + wait_for_commit(prev_trans); ret = prev_trans->aborted; btrfs_put_transaction(prev_trans); if (ret) goto cleanup_transaction; } else { - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); } } else { - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); } extwriter_counter_dec(cur_trans, trans->type); - ret = btrfs_start_delalloc_flush(root->fs_info); + ret = btrfs_start_delalloc_flush(fs_info); if (ret) goto cleanup_transaction; - ret = btrfs_run_delayed_items(trans, root); + ret = btrfs_run_delayed_items(trans, fs_info); if (ret) goto cleanup_transaction; @@ -2037,23 +2059,23 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, extwriter_counter_read(cur_trans) == 0); /* some pending stuffs might be added after the previous flush. */ - ret = btrfs_run_delayed_items(trans, root); + ret = btrfs_run_delayed_items(trans, fs_info); if (ret) goto cleanup_transaction; - btrfs_wait_delalloc_flush(root->fs_info); + btrfs_wait_delalloc_flush(fs_info); btrfs_wait_pending_ordered(cur_trans); - btrfs_scrub_pause(root); + btrfs_scrub_pause(fs_info); /* * Ok now we need to make sure to block out any other joins while we * commit the transaction. We could have started a join before setting * COMMIT_DOING so make sure to wait for num_writers to == 1 again. 
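/*
 * [Editorial sketch, not part of the patch. Right above, the committer
 * moves the transaction to COMMIT_DOING and then sleeps until it is the
 * only writer left (num_writers == 1, since it counts itself). A pthread
 * model of that drain; names are illustrative.]
 */
#include <pthread.h>

struct demo_txn {
	int num_writers;		/* includes the committer itself */
	pthread_mutex_t lock;
	pthread_cond_t writer_wait;	/* signalled whenever a writer leaves */
};

static void demo_wait_for_writers(struct demo_txn *t)
{
	pthread_mutex_lock(&t->lock);
	while (t->num_writers > 1)	/* > 1: everyone but us must be gone */
		pthread_cond_wait(&t->writer_wait, &t->lock);
	pthread_mutex_unlock(&t->lock);
}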
*/ - spin_lock(&root->fs_info->trans_lock); + spin_lock(&fs_info->trans_lock); cur_trans->state = TRANS_STATE_COMMIT_DOING; - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); @@ -2067,16 +2089,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, * the balancing code from coming in and moving * extents around in the middle of the commit */ - mutex_lock(&root->fs_info->reloc_mutex); + mutex_lock(&fs_info->reloc_mutex); /* * We needn't worry about the delayed items because we will * deal with them in create_pending_snapshot(), which is the * core function of the snapshot creation. */ - ret = create_pending_snapshots(trans, root->fs_info); + ret = create_pending_snapshots(trans, fs_info); if (ret) { - mutex_unlock(&root->fs_info->reloc_mutex); + mutex_unlock(&fs_info->reloc_mutex); goto scrub_continue; } @@ -2090,22 +2112,22 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, * because all the tree which are snapshoted will be forced to COW * the nodes and leaves. */ - ret = btrfs_run_delayed_items(trans, root); + ret = btrfs_run_delayed_items(trans, fs_info); if (ret) { - mutex_unlock(&root->fs_info->reloc_mutex); + mutex_unlock(&fs_info->reloc_mutex); goto scrub_continue; } - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); if (ret) { - mutex_unlock(&root->fs_info->reloc_mutex); + mutex_unlock(&fs_info->reloc_mutex); goto scrub_continue; } /* Reocrd old roots for later qgroup accounting */ - ret = btrfs_qgroup_prepare_account_extents(trans, root->fs_info); + ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); if (ret) { - mutex_unlock(&root->fs_info->reloc_mutex); + mutex_unlock(&fs_info->reloc_mutex); goto scrub_continue; } @@ -2113,7 +2135,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, * make sure none of the code above managed to slip in a * delayed item */ - btrfs_assert_delayed_root_empty(root); + btrfs_assert_delayed_root_empty(fs_info); WARN_ON(cur_trans != trans->transaction); @@ -2130,12 +2152,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, * from now until after the super is written, we avoid races * with the tree-log code. */ - mutex_lock(&root->fs_info->tree_log_mutex); + mutex_lock(&fs_info->tree_log_mutex); - ret = commit_fs_roots(trans, root); + ret = commit_fs_roots(trans, fs_info); if (ret) { - mutex_unlock(&root->fs_info->tree_log_mutex); - mutex_unlock(&root->fs_info->reloc_mutex); + mutex_unlock(&fs_info->tree_log_mutex); + mutex_unlock(&fs_info->reloc_mutex); goto scrub_continue; } @@ -2143,28 +2165,28 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, * Since the transaction is done, we can apply the pending changes * before the next transaction. */ - btrfs_apply_pending_changes(root->fs_info); + btrfs_apply_pending_changes(fs_info); /* commit_fs_roots gets rid of all the tree log roots, it is now * safe to free the root of tree log roots */ - btrfs_free_log_root_tree(trans, root->fs_info); + btrfs_free_log_root_tree(trans, fs_info); /* * Since fs roots are all committed, we can get a quite accurate * new_roots. So let's do quota accounting. 
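/*
 * [Editorial sketch, not part of the patch. Every failure in the span above
 * has to release tree_log_mutex and reloc_mutex before jumping to the
 * cleanup label. The classic kernel goto-unwind shape, in miniature; names
 * are illustrative, and the real code holds tree_log_mutex across more of
 * the success path than shown here.]
 */
#include <pthread.h>

static pthread_mutex_t demo_reloc_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t demo_tree_log_mutex = PTHREAD_MUTEX_INITIALIZER;

static int demo_step(void) { return 0; }	/* pretend work that can fail */

static int demo_commit_middle(void)
{
	int ret;

	pthread_mutex_lock(&demo_reloc_mutex);
	pthread_mutex_lock(&demo_tree_log_mutex);

	ret = demo_step();
	if (ret)
		goto out_unlock;	/* one label, locks dropped in reverse order */

	ret = demo_step();
out_unlock:
	pthread_mutex_unlock(&demo_tree_log_mutex);
	pthread_mutex_unlock(&demo_reloc_mutex);
	return ret;
}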
*/ - ret = btrfs_qgroup_account_extents(trans, root->fs_info); + ret = btrfs_qgroup_account_extents(trans, fs_info); if (ret < 0) { - mutex_unlock(&root->fs_info->tree_log_mutex); - mutex_unlock(&root->fs_info->reloc_mutex); + mutex_unlock(&fs_info->tree_log_mutex); + mutex_unlock(&fs_info->reloc_mutex); goto scrub_continue; } - ret = commit_cowonly_roots(trans, root); + ret = commit_cowonly_roots(trans, fs_info); if (ret) { - mutex_unlock(&root->fs_info->tree_log_mutex); - mutex_unlock(&root->fs_info->reloc_mutex); + mutex_unlock(&fs_info->tree_log_mutex); + mutex_unlock(&fs_info->reloc_mutex); goto scrub_continue; } @@ -2174,64 +2196,64 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, */ if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; - mutex_unlock(&root->fs_info->tree_log_mutex); - mutex_unlock(&root->fs_info->reloc_mutex); + mutex_unlock(&fs_info->tree_log_mutex); + mutex_unlock(&fs_info->reloc_mutex); goto scrub_continue; } - btrfs_prepare_extent_commit(trans, root); + btrfs_prepare_extent_commit(trans, fs_info); - cur_trans = root->fs_info->running_transaction; + cur_trans = fs_info->running_transaction; - btrfs_set_root_node(&root->fs_info->tree_root->root_item, - root->fs_info->tree_root->node); - list_add_tail(&root->fs_info->tree_root->dirty_list, + btrfs_set_root_node(&fs_info->tree_root->root_item, + fs_info->tree_root->node); + list_add_tail(&fs_info->tree_root->dirty_list, &cur_trans->switch_commits); - btrfs_set_root_node(&root->fs_info->chunk_root->root_item, - root->fs_info->chunk_root->node); - list_add_tail(&root->fs_info->chunk_root->dirty_list, + btrfs_set_root_node(&fs_info->chunk_root->root_item, + fs_info->chunk_root->node); + list_add_tail(&fs_info->chunk_root->dirty_list, &cur_trans->switch_commits); - switch_commit_roots(cur_trans, root->fs_info); + switch_commit_roots(cur_trans, fs_info); assert_qgroups_uptodate(trans); ASSERT(list_empty(&cur_trans->dirty_bgs)); ASSERT(list_empty(&cur_trans->io_bgs)); - update_super_roots(root); + update_super_roots(fs_info); - btrfs_set_super_log_root(root->fs_info->super_copy, 0); - btrfs_set_super_log_root_level(root->fs_info->super_copy, 0); - memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, - sizeof(*root->fs_info->super_copy)); + btrfs_set_super_log_root(fs_info->super_copy, 0); + btrfs_set_super_log_root_level(fs_info->super_copy, 0); + memcpy(fs_info->super_for_commit, fs_info->super_copy, + sizeof(*fs_info->super_copy)); - btrfs_update_commit_device_size(root->fs_info); - btrfs_update_commit_device_bytes_used(root, cur_trans); + btrfs_update_commit_device_size(fs_info); + btrfs_update_commit_device_bytes_used(fs_info, cur_trans); - clear_bit(BTRFS_FS_LOG1_ERR, &root->fs_info->flags); - clear_bit(BTRFS_FS_LOG2_ERR, &root->fs_info->flags); + clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags); + clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags); btrfs_trans_release_chunk_metadata(trans); - spin_lock(&root->fs_info->trans_lock); + spin_lock(&fs_info->trans_lock); cur_trans->state = TRANS_STATE_UNBLOCKED; - root->fs_info->running_transaction = NULL; - spin_unlock(&root->fs_info->trans_lock); - mutex_unlock(&root->fs_info->reloc_mutex); + fs_info->running_transaction = NULL; + spin_unlock(&fs_info->trans_lock); + mutex_unlock(&fs_info->reloc_mutex); - wake_up(&root->fs_info->transaction_wait); + wake_up(&fs_info->transaction_wait); - ret = btrfs_write_and_wait_transaction(trans, root); + ret = btrfs_write_and_wait_transaction(trans, fs_info); if (ret) { - 
btrfs_handle_fs_error(root->fs_info, ret, - "Error while writing out transaction"); - mutex_unlock(&root->fs_info->tree_log_mutex); + btrfs_handle_fs_error(fs_info, ret, + "Error while writing out transaction"); + mutex_unlock(&fs_info->tree_log_mutex); goto scrub_continue; } - ret = write_ctree_super(trans, root, 0); + ret = write_ctree_super(trans, fs_info, 0); if (ret) { - mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_unlock(&fs_info->tree_log_mutex); goto scrub_continue; } @@ -2239,14 +2261,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, * the super is written, we can safely allow the tree-loggers * to go about their business */ - mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_unlock(&fs_info->tree_log_mutex); - btrfs_finish_extent_commit(trans, root); + btrfs_finish_extent_commit(trans, fs_info); if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags)) - btrfs_clear_space_info_full(root->fs_info); + btrfs_clear_space_info_full(fs_info); - root->fs_info->last_trans_committed = cur_trans->transid; + fs_info->last_trans_committed = cur_trans->transid; /* * We needn't acquire the lock here because there is no other task * which can change it. @@ -2254,19 +2276,19 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, cur_trans->state = TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); - spin_lock(&root->fs_info->trans_lock); + spin_lock(&fs_info->trans_lock); list_del_init(&cur_trans->list); - spin_unlock(&root->fs_info->trans_lock); + spin_unlock(&fs_info->trans_lock); btrfs_put_transaction(cur_trans); btrfs_put_transaction(cur_trans); if (trans->type & __TRANS_FREEZABLE) - sb_end_intwrite(root->fs_info->sb); + sb_end_intwrite(fs_info->sb); - trace_btrfs_transaction_commit(root); + trace_btrfs_transaction_commit(trans->root); - btrfs_scrub_continue(root); + btrfs_scrub_continue(fs_info); if (current->journal_info == trans) current->journal_info = NULL; @@ -2277,23 +2299,22 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, * If fs has been frozen, we can not handle delayed iputs, otherwise * it'll result in deadlock about SB_FREEZE_FS. */ - if (current != root->fs_info->transaction_kthread && - current != root->fs_info->cleaner_kthread && - !root->fs_info->fs_frozen) - btrfs_run_delayed_iputs(root); + if (current != fs_info->transaction_kthread && + current != fs_info->cleaner_kthread && !fs_info->fs_frozen) + btrfs_run_delayed_iputs(fs_info); return ret; scrub_continue: - btrfs_scrub_continue(root); + btrfs_scrub_continue(fs_info); cleanup_transaction: - btrfs_trans_release_metadata(trans, root); + btrfs_trans_release_metadata(trans, fs_info); btrfs_trans_release_chunk_metadata(trans); trans->block_rsv = NULL; - btrfs_warn(root->fs_info, "Skipping commit of aborted transaction."); + btrfs_warn(fs_info, "Skipping commit of aborted transaction."); if (current->journal_info == trans) current->journal_info = NULL; - cleanup_transaction(trans, root, ret); + cleanup_transaction(trans, trans->root, ret); return ret; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 6cf0d37d4f76..5dfb5590fff6 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -123,11 +123,6 @@ struct btrfs_trans_handle { bool sync; bool dirty; unsigned int type; - /* - * this root is only needed to validate that the root passed to - * start_transaction is the same as the one passed to end_transaction. 
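/*
 * [Editorial note, not part of the patch: the shape of the header change,
 * restated as plain prototypes. The struct keeps its root member, but the
 * comment tying it to start/end validation is dropped along with the
 * two-argument prototypes, since fs_info now lives in the handle.]
 *
 * old: int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 *                                struct btrfs_root *root);
 * new: int btrfs_end_transaction(struct btrfs_trans_handle *trans);
 *
 * old: int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 *                                   struct btrfs_root *root);
 * new: int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
 */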
- * Subvolume quota depends on this - */ struct btrfs_root *root; struct btrfs_fs_info *fs_info; struct seq_list delayed_ref_elem; @@ -185,8 +180,7 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) delayed_refs->qgroup_to_skip = 0; } -int btrfs_end_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root); +int btrfs_end_transaction(struct btrfs_trans_handle *trans); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, unsigned int num_items); struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv( @@ -202,27 +196,24 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); struct btrfs_trans_handle *btrfs_attach_transaction_barrier( struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); -int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); +int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid); void btrfs_add_dead_root(struct btrfs_root *root); int btrfs_defrag_root(struct btrfs_root *root); int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root); -int btrfs_commit_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root); +int btrfs_commit_transaction(struct btrfs_trans_handle *trans); int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, - struct btrfs_root *root, int wait_for_unblock); -int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -void btrfs_throttle(struct btrfs_root *root); +int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans); +int btrfs_should_end_transaction(struct btrfs_trans_handle *trans); +void btrfs_throttle(struct btrfs_fs_info *fs_info); int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_write_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages, int mark); -int btrfs_wait_marked_extents(struct btrfs_root *root, +int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages, int mark); +int btrfs_wait_extents(struct btrfs_fs_info *fs_info, + struct extent_io_tree *dirty_pages); +int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark); int btrfs_transaction_blocked(struct btrfs_fs_info *info); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); void btrfs_put_transaction(struct btrfs_transaction *transaction); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 3d33c4e41e5f..eeffff84f280 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -37,6 +37,7 @@ */ #define LOG_INODE_ALL 0 #define LOG_INODE_EXISTS 1 +#define LOG_OTHER_INODE 2 /* * directory trouble cases @@ -142,12 +143,13 @@ static int start_log_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_log_ctx *ctx) { + struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; mutex_lock(&root->log_mutex); if (root->log_root) { - if (btrfs_need_log_full_commit(root->fs_info, trans)) { + if (btrfs_need_log_full_commit(fs_info, trans)) { ret = -EAGAIN; goto out; } @@ -159,10 +161,10 @@ static int start_log_trans(struct btrfs_trans_handle *trans, set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state); } } else { - mutex_lock(&root->fs_info->tree_log_mutex); - if (!root->fs_info->log_root_tree) - ret = btrfs_init_log_root_tree(trans, root->fs_info); - 
mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_lock(&fs_info->tree_log_mutex); + if (!fs_info->log_root_tree) + ret = btrfs_init_log_root_tree(trans, fs_info); + mutex_unlock(&fs_info->tree_log_mutex); if (ret) goto out; @@ -292,25 +294,26 @@ static int process_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, struct walk_control *wc, u64 gen) { + struct btrfs_fs_info *fs_info = log->fs_info; int ret = 0; /* * If this fs is mixed then we need to be able to process the leaves to * pin down any logged extents, so we have to read the block. */ - if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) { + if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { ret = btrfs_read_buffer(eb, gen); if (ret) return ret; } if (wc->pin) - ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root, - eb->start, eb->len); + ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start, + eb->len); if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) { if (wc->pin && btrfs_header_level(eb) == 0) - ret = btrfs_exclude_logged_extents(log, eb); + ret = btrfs_exclude_logged_extents(fs_info, eb); if (wc->write) btrfs_write_tree_block(eb); if (wc->wait) @@ -339,6 +342,7 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { + struct btrfs_fs_info *fs_info = root->fs_info; int ret; u32 item_size; u64 saved_i_size = 0; @@ -459,9 +463,9 @@ insert: found_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); if (found_size > item_size) - btrfs_truncate_item(root, path, item_size, 1); + btrfs_truncate_item(fs_info, path, item_size, 1); else if (found_size < item_size) - btrfs_extend_item(root, path, + btrfs_extend_item(fs_info, path, item_size - found_size); } else if (ret) { return ret; @@ -582,6 +586,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { + struct btrfs_fs_info *fs_info = root->fs_info; int found_type; u64 extent_end; u64 start = key->offset; @@ -608,7 +613,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { size = btrfs_file_extent_inline_len(eb, slot, item); nbytes = btrfs_file_extent_ram_bytes(eb, item); - extent_end = ALIGN(start + size, root->sectorsize); + extent_end = ALIGN(start + size, + fs_info->sectorsize); } else { ret = 0; goto out; @@ -689,7 +695,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, * as the owner of the file extent changed from log tree * (doesn't affect qgroup) to fs/file tree(affects qgroup) */ - ret = btrfs_qgroup_insert_dirty_extent(trans, root->fs_info, + ret = btrfs_qgroup_trace_extent(trans, fs_info, btrfs_file_extent_disk_bytenr(eb, item), btrfs_file_extent_disk_num_bytes(eb, item), GFP_NOFS); @@ -704,10 +710,10 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, * is this extent already allocated in the extent * allocation tree? 
If so, just add a reference */ - ret = btrfs_lookup_data_extent(root, ins.objectid, + ret = btrfs_lookup_data_extent(fs_info, ins.objectid, ins.offset); if (ret == 0) { - ret = btrfs_inc_extent_ref(trans, root, + ret = btrfs_inc_extent_ref(trans, fs_info, ins.objectid, ins.offset, 0, root->root_key.objectid, key->objectid, offset); @@ -719,7 +725,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, * allocation tree */ ret = btrfs_alloc_logged_file_extent(trans, - root, root->root_key.objectid, + fs_info, + root->root_key.objectid, key->objectid, offset, &ins); if (ret) goto out; @@ -796,14 +803,12 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum, list); if (!ret) - ret = btrfs_del_csums(trans, - root->fs_info->csum_root, - sums->bytenr, - sums->len); + ret = btrfs_del_csums(trans, fs_info, + sums->bytenr, + sums->len); if (!ret) ret = btrfs_csum_file_blocks(trans, - root->fs_info->csum_root, - sums); + fs_info->csum_root, sums); list_del(&sums->list); kfree(sums); } @@ -841,6 +846,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, struct inode *dir, struct btrfs_dir_item *di) { + struct btrfs_fs_info *fs_info = root->fs_info; struct inode *inode; char *name; int name_len; @@ -873,7 +879,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, if (ret) goto out; else - ret = btrfs_run_delayed_items(trans, root); + ret = btrfs_run_delayed_items(trans, fs_info); out: kfree(name); iput(inode); @@ -991,6 +997,7 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, u64 ref_index, char *name, int namelen, int *search_done) { + struct btrfs_fs_info *fs_info = root->fs_info; int ret; char *victim_name; int victim_name_len; @@ -1049,7 +1056,7 @@ again: kfree(victim_name); if (ret) return ret; - ret = btrfs_run_delayed_items(trans, root); + ret = btrfs_run_delayed_items(trans, fs_info); if (ret) return ret; *search_done = 1; @@ -1120,7 +1127,8 @@ again: victim_name_len); if (!ret) ret = btrfs_run_delayed_items( - trans, root); + trans, + fs_info); } iput(victim_parent); kfree(victim_name); @@ -1811,6 +1819,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { + struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; u32 item_size = btrfs_item_size_nr(eb, slot); struct btrfs_dir_item *di; @@ -1823,7 +1832,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, ptr_end = ptr + item_size; while (ptr < ptr_end) { di = (struct btrfs_dir_item *)ptr; - if (verify_dir_item(root, eb, di)) + if (verify_dir_item(fs_info, eb, di)) return -EIO; name_len = btrfs_dir_name_len(eb, di); ret = replay_one_name(trans, root, path, eb, di, key); @@ -1940,12 +1949,11 @@ static noinline int find_dir_range(struct btrfs_root *root, next: /* check the next slot in the tree to see if it is a valid item */ nritems = btrfs_header_nritems(path->nodes[0]); + path->slots[0]++; if (path->slots[0] >= nritems) { ret = btrfs_next_leaf(root, path); if (ret) goto out; - } else { - path->slots[0]++; } btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); @@ -1978,6 +1986,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct inode *dir, struct btrfs_key *dir_key) { + struct btrfs_fs_info *fs_info = root->fs_info; int ret; struct extent_buffer *eb; int slot; @@ -1999,7 +2008,7 @@ again: ptr_end = ptr + item_size; while (ptr < ptr_end) { di = (struct 
btrfs_dir_item *)ptr; - if (verify_dir_item(root, eb, di)) { + if (verify_dir_item(fs_info, eb, di)) { ret = -EIO; goto out; } @@ -2046,7 +2055,7 @@ again: ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); if (!ret) - ret = btrfs_run_delayed_items(trans, root); + ret = btrfs_run_delayed_items(trans, fs_info); kfree(name); iput(inode); if (ret) @@ -2407,6 +2416,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, struct btrfs_path *path, int *level, struct walk_control *wc) { + struct btrfs_fs_info *fs_info = root->fs_info; u64 root_owner; u64 bytenr; u64 ptr_gen; @@ -2432,12 +2442,12 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(cur, path->slots[*level]); ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); - blocksize = root->nodesize; + blocksize = fs_info->nodesize; parent = path->nodes[*level]; root_owner = btrfs_header_owner(parent); - next = btrfs_find_create_tree_block(root, bytenr); + next = btrfs_find_create_tree_block(fs_info, bytenr); if (IS_ERR(next)) return PTR_ERR(next); @@ -2459,16 +2469,16 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - clean_tree_block(trans, root->fs_info, - next); + clean_tree_block(trans, fs_info, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); } WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_and_pin_reserved_extent(root, - bytenr, blocksize); + ret = btrfs_free_and_pin_reserved_extent( + fs_info, bytenr, + blocksize); if (ret) { free_extent_buffer(next); return ret; @@ -2505,6 +2515,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, struct btrfs_path *path, int *level, struct walk_control *wc) { + struct btrfs_fs_info *fs_info = root->fs_info; u64 root_owner; int i; int slot; @@ -2538,14 +2549,14 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - clean_tree_block(trans, root->fs_info, - next); + clean_tree_block(trans, fs_info, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); } WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_and_pin_reserved_extent(root, + ret = btrfs_free_and_pin_reserved_extent( + fs_info, path->nodes[*level]->start, path->nodes[*level]->len); if (ret) @@ -2567,6 +2578,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, static int walk_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct walk_control *wc) { + struct btrfs_fs_info *fs_info = log->fs_info; int ret = 0; int wret; int level; @@ -2615,15 +2627,15 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - clean_tree_block(trans, log->fs_info, next); + clean_tree_block(trans, fs_info, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); } WARN_ON(log->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_and_pin_reserved_extent(log, next->start, - next->len); + ret = btrfs_free_and_pin_reserved_extent(fs_info, + next->start, next->len); if (ret) goto out; } @@ -2641,14 +2653,15 @@ out: static int update_log_root(struct btrfs_trans_handle *trans, struct btrfs_root *log) { + struct btrfs_fs_info *fs_info = log->fs_info; int ret; if (log->log_transid == 1) { /* insert root item on the first sync */ - ret = btrfs_insert_root(trans, 
log->fs_info->log_root_tree, + ret = btrfs_insert_root(trans, fs_info->log_root_tree, &log->root_key, &log->root_item); } else { - ret = btrfs_update_root(trans, log->fs_info->log_root_tree, + ret = btrfs_update_root(trans, fs_info->log_root_tree, &log->root_key, &log->root_item); } return ret; @@ -2742,8 +2755,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, int index2; int mark; int ret; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *log = root->log_root; - struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; + struct btrfs_root *log_root_tree = fs_info->log_root_tree; int log_transid = 0; struct btrfs_log_ctx root_log_ctx; struct blk_plug plug; @@ -2771,7 +2785,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, while (1) { int batch = atomic_read(&root->log_batch); /* when we're on an ssd, just kick the log commit out */ - if (!btrfs_test_opt(root->fs_info, SSD) && + if (!btrfs_test_opt(fs_info, SSD) && test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) { mutex_unlock(&root->log_mutex); schedule_timeout_uninterruptible(1); @@ -2783,7 +2797,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } /* bail out if we need to do a full commit */ - if (btrfs_need_log_full_commit(root->fs_info, trans)) { + if (btrfs_need_log_full_commit(fs_info, trans)) { ret = -EAGAIN; btrfs_free_logged_extents(log, log_transid); mutex_unlock(&root->log_mutex); @@ -2799,12 +2813,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * wait for them until later. */ blk_start_plug(&plug); - ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); + ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark); if (ret) { blk_finish_plug(&plug); btrfs_abort_transaction(trans, ret); btrfs_free_logged_extents(log, log_transid); - btrfs_set_log_full_commit(root->fs_info, trans); + btrfs_set_log_full_commit(fs_info, trans); mutex_unlock(&root->log_mutex); goto out; } @@ -2849,14 +2863,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, list_del_init(&root_log_ctx.list); blk_finish_plug(&plug); - btrfs_set_log_full_commit(root->fs_info, trans); + btrfs_set_log_full_commit(fs_info, trans); if (ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); mutex_unlock(&log_root_tree->log_mutex); goto out; } - btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + btrfs_wait_tree_log_extents(log, mark); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); ret = -EAGAIN; @@ -2874,8 +2888,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, index2 = root_log_ctx.log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { blk_finish_plug(&plug); - ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, - mark); + ret = btrfs_wait_tree_log_extents(log, mark); btrfs_wait_logged_extents(trans, log, log_transid); wait_log_commit(log_root_tree, root_log_ctx.log_transid); @@ -2898,43 +2911,42 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * now that we've moved on to the tree of log tree roots, * check the full commit flag again */ - if (btrfs_need_log_full_commit(root->fs_info, trans)) { + if (btrfs_need_log_full_commit(fs_info, trans)) { blk_finish_plug(&plug); - btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + btrfs_wait_tree_log_extents(log, mark); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); ret = -EAGAIN; goto out_wake_log_root; } - ret = btrfs_write_marked_extents(log_root_tree, + ret = btrfs_write_marked_extents(fs_info, 
&log_root_tree->dirty_log_pages, EXTENT_DIRTY | EXTENT_NEW); blk_finish_plug(&plug); if (ret) { - btrfs_set_log_full_commit(root->fs_info, trans); + btrfs_set_log_full_commit(fs_info, trans); btrfs_abort_transaction(trans, ret); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; } - ret = btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); + ret = btrfs_wait_tree_log_extents(log, mark); if (!ret) - ret = btrfs_wait_marked_extents(log_root_tree, - &log_root_tree->dirty_log_pages, - EXTENT_NEW | EXTENT_DIRTY); + ret = btrfs_wait_tree_log_extents(log_root_tree, + EXTENT_NEW | EXTENT_DIRTY); if (ret) { - btrfs_set_log_full_commit(root->fs_info, trans); + btrfs_set_log_full_commit(fs_info, trans); btrfs_free_logged_extents(log, log_transid); mutex_unlock(&log_root_tree->log_mutex); goto out_wake_log_root; } btrfs_wait_logged_extents(trans, log, log_transid); - btrfs_set_super_log_root(root->fs_info->super_for_commit, - log_root_tree->node->start); - btrfs_set_super_log_root_level(root->fs_info->super_for_commit, - btrfs_header_level(log_root_tree->node)); + btrfs_set_super_log_root(fs_info->super_for_commit, + log_root_tree->node->start); + btrfs_set_super_log_root_level(fs_info->super_for_commit, + btrfs_header_level(log_root_tree->node)); log_root_tree->log_transid++; mutex_unlock(&log_root_tree->log_mutex); @@ -2946,9 +2958,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * the running transaction open, so a full commit can't hop * in and cause problems either. */ - ret = write_ctree_super(trans, root->fs_info->tree_root, 1); + ret = write_ctree_super(trans, fs_info, 1); if (ret) { - btrfs_set_log_full_commit(root->fs_info, trans); + btrfs_set_log_full_commit(fs_info, trans); btrfs_abort_transaction(trans, ret); goto out_wake_log_root; } @@ -3182,6 +3194,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, const char *name, int name_len, struct inode *inode, u64 dirid) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *log; u64 index; int ret; @@ -3199,7 +3212,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, dirid, &index); mutex_unlock(&BTRFS_I(inode)->log_mutex); if (ret == -ENOSPC) { - btrfs_set_log_full_commit(root->fs_info, trans); + btrfs_set_log_full_commit(fs_info, trans); ret = 0; } else if (ret < 0 && ret != -ENOENT) btrfs_abort_transaction(trans, ret); @@ -3606,6 +3619,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, int start_slot, int nr, int inode_only, u64 logged_isize) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); unsigned long src_offset; unsigned long dst_offset; struct btrfs_root *log = BTRFS_I(inode)->root->log_root; @@ -3716,7 +3730,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, } ret = btrfs_lookup_csums_range( - log->fs_info->csum_root, + fs_info->csum_root, ds + cs, ds + cs + cl - 1, &ordered_sums, 0); if (ret) { @@ -3789,7 +3803,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, src_path->slots[0], extent); *last_extent = ALIGN(key.offset + len, - log->sectorsize); + fs_info->sectorsize); } else { len = btrfs_file_extent_num_bytes(src, extent); *last_extent = key.offset + len; @@ -3852,7 +3866,8 @@ fill_holes: if (btrfs_file_extent_type(src, extent) == BTRFS_FILE_EXTENT_INLINE) { len = btrfs_file_extent_inline_len(src, i, extent); - extent_end = ALIGN(key.offset + len, log->sectorsize); + extent_end = ALIGN(key.offset + len, + fs_info->sectorsize); } else { len 
= btrfs_file_extent_num_bytes(src, extent); extent_end = key.offset + len; @@ -3902,6 +3917,7 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans, const struct list_head *logged_list, bool *ordered_io_error) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ordered_extent *ordered; struct btrfs_root *log = root->log_root; u64 mod_start = em->mod_start; @@ -4018,7 +4034,7 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans, } /* block start is already adjusted for the file extent offset. */ - ret = btrfs_lookup_csums_range(log->fs_info->csum_root, + ret = btrfs_lookup_csums_range(fs_info->csum_root, em->block_start + csum_offset, em->block_start + csum_offset + csum_len - 1, &ordered_sums, 0); @@ -4361,6 +4377,7 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *path) { + struct btrfs_fs_info *fs_info = root->fs_info; int ret; struct btrfs_key key; u64 hole_start; @@ -4370,7 +4387,7 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, const u64 ino = btrfs_ino(inode); const u64 i_size = i_size_read(inode); - if (!btrfs_fs_incompat(root->fs_info, NO_HOLES)) + if (!btrfs_fs_incompat(fs_info, NO_HOLES)) return 0; key.objectid = ino; @@ -4427,7 +4444,7 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, if (hole_size == 0) return 0; - hole_size = ALIGN(hole_size, root->sectorsize); + hole_size = ALIGN(hole_size, fs_info->sectorsize); ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0, hole_size, 0, hole_size, 0, 0, 0); return ret; @@ -4585,6 +4602,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, const loff_t end, struct btrfs_log_ctx *ctx) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct btrfs_path *dst_path; struct btrfs_key min_key; @@ -4624,7 +4642,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (S_ISDIR(inode->i_mode) || (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags) && - inode_only == LOG_INODE_EXISTS)) + inode_only >= LOG_INODE_EXISTS)) max_key.type = BTRFS_XATTR_ITEM_KEY; else max_key.type = (u8)-1; @@ -4637,7 +4655,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items). */ if (S_ISDIR(inode->i_mode) || - BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) + BTRFS_I(inode)->generation > fs_info->last_trans_committed) ret = btrfs_commit_inode_delayed_items(trans, inode); else ret = btrfs_commit_inode_delayed_inode(inode); @@ -4648,7 +4666,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, return ret; } - mutex_lock(&BTRFS_I(inode)->log_mutex); + if (inode_only == LOG_OTHER_INODE) { + inode_only = LOG_INODE_EXISTS; + mutex_lock_nested(&BTRFS_I(inode)->log_mutex, + SINGLE_DEPTH_NESTING); + } else { + mutex_lock(&BTRFS_I(inode)->log_mutex); + } /* * a brute force approach to making sure we get the most uptodate @@ -4774,7 +4798,7 @@ again: inode_key.objectid = other_ino; inode_key.type = BTRFS_INODE_ITEM_KEY; inode_key.offset = 0; - other_inode = btrfs_iget(root->fs_info->sb, + other_inode = btrfs_iget(fs_info->sb, &inode_key, root, NULL); /* @@ -4800,7 +4824,7 @@ again: * unpin it. 
*/ err = btrfs_log_inode(trans, root, other_inode, - LOG_INODE_EXISTS, + LOG_OTHER_INODE, 0, LLONG_MAX, ctx); iput(other_inode); if (err) @@ -5138,6 +5162,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, struct inode *start_inode, struct btrfs_log_ctx *ctx) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *log = root->log_root; struct btrfs_path *path; LIST_HEAD(dir_list); @@ -5205,8 +5230,8 @@ process_leaf: if (di_key.type == BTRFS_ROOT_ITEM_KEY) continue; - di_inode = btrfs_iget(root->fs_info->sb, &di_key, - root, NULL); + btrfs_release_path(path); + di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL); if (IS_ERR(di_inode)) { ret = PTR_ERR(di_inode); goto next_dir_inode; @@ -5214,13 +5239,12 @@ process_leaf: if (btrfs_inode_in_log(di_inode, trans->transid)) { iput(di_inode); - continue; + break; } ctx->log_new_dentries = false; if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) log_mode = LOG_INODE_ALL; - btrfs_release_path(path); ret = btrfs_log_inode(trans, root, di_inode, log_mode, 0, LLONG_MAX, ctx); if (!ret && @@ -5268,6 +5292,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_log_ctx *ctx) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int ret; struct btrfs_path *path; struct btrfs_key key; @@ -5332,7 +5357,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, cur_offset = item_size; } - dir_inode = btrfs_iget(root->fs_info->sb, &inode_key, + dir_inode = btrfs_iget(fs_info->sb, &inode_key, root, NULL); /* If parent inode was deleted, skip it. */ if (IS_ERR(dir_inode)) @@ -5374,17 +5399,18 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, int exists_only, struct btrfs_log_ctx *ctx) { + struct btrfs_fs_info *fs_info = root->fs_info; int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; struct super_block *sb; struct dentry *old_parent = NULL; int ret = 0; - u64 last_committed = root->fs_info->last_trans_committed; + u64 last_committed = fs_info->last_trans_committed; bool log_dentries = false; struct inode *orig_inode = inode; sb = inode->i_sb; - if (btrfs_test_opt(root->fs_info, NOTREELOG)) { + if (btrfs_test_opt(fs_info, NOTREELOG)) { ret = 1; goto end_no_trans; } @@ -5393,8 +5419,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, * The prev transaction commit doesn't complete, we need do * full commit by ourselves. 
*/ - if (root->fs_info->last_trans_log_full_commit > - root->fs_info->last_trans_committed) { + if (fs_info->last_trans_log_full_commit > + fs_info->last_trans_committed) { ret = 1; goto end_no_trans; } @@ -5515,7 +5541,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, end_trans: dput(old_parent); if (ret < 0) { - btrfs_set_log_full_commit(root->fs_info, trans); + btrfs_set_log_full_commit(fs_info, trans); ret = 1; } @@ -5675,7 +5701,7 @@ again: btrfs_free_path(path); /* step 4: commit the transaction, which also unpins the blocks */ - ret = btrfs_commit_transaction(trans, fs_info->tree_root); + ret = btrfs_commit_transaction(trans); if (ret) return ret; @@ -5687,7 +5713,7 @@ again: return 0; error: if (wc.trans) - btrfs_end_transaction(wc.trans, fs_info->tree_root); + btrfs_end_transaction(wc.trans); btrfs_free_path(path); return ret; } @@ -5786,6 +5812,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *old_dir, struct dentry *parent) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root * root = BTRFS_I(inode)->root; /* @@ -5800,9 +5827,9 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, * from hasn't been logged, we don't need to log it */ if (BTRFS_I(inode)->logged_trans <= - root->fs_info->last_trans_committed && + fs_info->last_trans_committed && (!old_dir || BTRFS_I(old_dir)->logged_trans <= - root->fs_info->last_trans_committed)) + fs_info->last_trans_committed)) return 0; return btrfs_log_inode_parent(trans, root, inode, parent, 0, diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 7fc89e4adb41..726f928238d0 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -92,9 +92,10 @@ out: } int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, - struct btrfs_root *uuid_root, u8 *uuid, u8 type, + struct btrfs_fs_info *fs_info, u8 *uuid, u8 type, u64 subid_cpu) { + struct btrfs_root *uuid_root = fs_info->uuid_root; int ret; struct btrfs_path *path = NULL; struct btrfs_key key; @@ -132,13 +133,13 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, * An item with that type already exists. * Extend the item and store the new subid at the end. 
*/ - btrfs_extend_item(uuid_root, path, sizeof(subid_le)); + btrfs_extend_item(fs_info, path, sizeof(subid_le)); eb = path->nodes[0]; slot = path->slots[0]; offset = btrfs_item_ptr_offset(eb, slot); offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le); } else if (ret < 0) { - btrfs_warn(uuid_root->fs_info, + btrfs_warn(fs_info, "insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!", ret, (unsigned long long)key.objectid, (unsigned long long)key.offset, type); @@ -156,9 +157,10 @@ out: } int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans, - struct btrfs_root *uuid_root, u8 *uuid, u8 type, + struct btrfs_fs_info *fs_info, u8 *uuid, u8 type, u64 subid) { + struct btrfs_root *uuid_root = fs_info->uuid_root; int ret; struct btrfs_path *path = NULL; struct btrfs_key key; @@ -185,8 +187,8 @@ int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, uuid_root, &key, path, -1, 1); if (ret < 0) { - btrfs_warn(uuid_root->fs_info, - "error %d while searching for uuid item!", ret); + btrfs_warn(fs_info, "error %d while searching for uuid item!", + ret); goto out; } if (ret > 0) { @@ -199,8 +201,7 @@ int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans, offset = btrfs_item_ptr_offset(eb, slot); item_size = btrfs_item_size_nr(eb, slot); if (!IS_ALIGNED(item_size, sizeof(u64))) { - btrfs_warn(uuid_root->fs_info, - "uuid item with illegal size %lu!", + btrfs_warn(fs_info, "uuid item with illegal size %lu!", (unsigned long)item_size); ret = -ENOENT; goto out; @@ -230,7 +231,7 @@ int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans, move_src = offset + sizeof(subid); move_len = item_size - (move_src - btrfs_item_ptr_offset(eb, slot)); memmove_extent_buffer(eb, move_dst, move_src, move_len); - btrfs_truncate_item(uuid_root, path, item_size - sizeof(subid), 1); + btrfs_truncate_item(fs_info, path, item_size - sizeof(subid), 1); out: btrfs_free_path(path); @@ -250,8 +251,8 @@ static int btrfs_uuid_iter_rem(struct btrfs_root *uuid_root, u8 *uuid, u8 type, goto out; } - ret = btrfs_uuid_tree_rem(trans, uuid_root, uuid, type, subid); - btrfs_end_transaction(trans, uuid_root); + ret = btrfs_uuid_tree_rem(trans, uuid_root->fs_info, uuid, type, subid); + btrfs_end_transaction(trans); out: return ret; @@ -351,7 +352,5 @@ skip: out: btrfs_free_path(path); - if (ret) - btrfs_warn(fs_info, "btrfs_uuid_tree_iterate failed %d", ret); - return 0; + return ret; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 71a60cc01451..3c3c69c0eee4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -134,9 +134,9 @@ const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = { }; static int init_first_rw_device(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, struct btrfs_device *device); -static int btrfs_relocate_sys_chunks(struct btrfs_root *root); +static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); static void __btrfs_reset_dev_stats(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); @@ -343,9 +343,9 @@ static void requeue_list(struct btrfs_pending_bios *pending_bios, */ static noinline void run_scheduled_bios(struct btrfs_device *device) { + struct btrfs_fs_info *fs_info = device->fs_info; struct bio *pending; struct backing_dev_info *bdi; - struct btrfs_fs_info *fs_info; struct btrfs_pending_bios *pending_bios; struct bio *tail; struct bio *cur; @@ -367,7 +367,6 @@ static noinline 
void run_scheduled_bios(struct btrfs_device *device) blk_start_plug(&plug); bdi = blk_get_backing_dev_info(device->bdev); - fs_info = device->dev_root->fs_info; limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; @@ -1179,7 +1178,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, u64 end, u64 *length) { struct btrfs_key key; - struct btrfs_root *root = device->dev_root; + struct btrfs_root *root = device->fs_info->dev_root; struct btrfs_dev_extent *dev_extent; struct btrfs_path *path; u64 extent_end; @@ -1262,7 +1261,7 @@ static int contains_pending_extent(struct btrfs_transaction *transaction, struct btrfs_device *device, u64 *start, u64 len) { - struct btrfs_fs_info *fs_info = device->dev_root->fs_info; + struct btrfs_fs_info *fs_info = device->fs_info; struct extent_map *em; struct list_head *search_list = &fs_info->pinned_chunks; int ret = 0; @@ -1338,8 +1337,9 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction, struct btrfs_device *device, u64 num_bytes, u64 search_start, u64 *start, u64 *len) { + struct btrfs_fs_info *fs_info = device->fs_info; + struct btrfs_root *root = fs_info->dev_root; struct btrfs_key key; - struct btrfs_root *root = device->dev_root; struct btrfs_dev_extent *dev_extent; struct btrfs_path *path; u64 hole_size; @@ -1357,7 +1357,7 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction, * used by the boot loader (grub for example), so we make sure to start * at an offset of at least 1MB. */ - min_search_start = max(root->fs_info->alloc_start, 1024ull * 1024); + min_search_start = max(fs_info->alloc_start, 1024ull * 1024); search_start = max(search_start, min_search_start); path = btrfs_alloc_path(); @@ -1508,9 +1508,10 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 start, u64 *dev_extent_len) { + struct btrfs_fs_info *fs_info = device->fs_info; + struct btrfs_root *root = fs_info->dev_root; int ret; struct btrfs_path *path; - struct btrfs_root *root = device->dev_root; struct btrfs_key key; struct btrfs_key found_key; struct extent_buffer *leaf = NULL; @@ -1544,7 +1545,7 @@ again: extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); } else { - btrfs_handle_fs_error(root->fs_info, ret, "Slot search failed"); + btrfs_handle_fs_error(fs_info, ret, "Slot search failed"); goto out; } @@ -1552,8 +1553,8 @@ again: ret = btrfs_del_item(trans, root, path); if (ret) { - btrfs_handle_fs_error(root->fs_info, ret, - "Failed to remove dev extent item"); + btrfs_handle_fs_error(fs_info, ret, + "Failed to remove dev extent item"); } else { set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); } @@ -1569,7 +1570,8 @@ static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, { int ret; struct btrfs_path *path; - struct btrfs_root *root = device->dev_root; + struct btrfs_fs_info *fs_info = device->fs_info; + struct btrfs_root *root = fs_info->dev_root; struct btrfs_dev_extent *extent; struct extent_buffer *leaf; struct btrfs_key key; @@ -1595,8 +1597,7 @@ static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); - write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, - btrfs_dev_extent_chunk_tree_uuid(extent), BTRFS_UUID_SIZE); + write_extent_buffer_chunk_tree_uuid(leaf, fs_info->chunk_tree_uuid); btrfs_set_dev_extent_length(leaf, extent, num_bytes); 
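The btrfs_add_device() and btrfs_rm_dev_item() hunks nearby show the conversion this whole series applies: functions that took an arbitrary struct btrfs_root * only to climb to root->fs_info, or to silently reassign root to a different tree, now take struct btrfs_fs_info * and derive the tree they actually need. A minimal user-space sketch of that shape change, with toy types standing in for the kernel structures (nothing below is kernel API):

	#include <stdio.h>

	/* Toy stand-ins for the kernel structures; fields are illustrative only. */
	struct fs_info;

	struct root {
		struct fs_info *fs_info;
		const char *name;
	};

	struct fs_info {
		struct root *chunk_root;
		unsigned int sectorsize;
	};

	/* Before: takes "a" root, then silently swaps it for the chunk root,
	 * so the parameter tells the caller nothing about what is used. */
	static void add_device_old(struct root *root)
	{
		struct fs_info *fs_info = root->fs_info;

		root = fs_info->chunk_root;	/* parameter repurposed */
		printf("old: item goes into %s tree, sectorsize %u\n",
		       root->name, fs_info->sectorsize);
	}

	/* After: takes fs_info and derives the tree it needs up front,
	 * the shape btrfs_add_device() and btrfs_rm_dev_item() now have. */
	static void add_device_new(struct fs_info *fs_info)
	{
		struct root *root = fs_info->chunk_root;

		printf("new: item goes into %s tree, sectorsize %u\n",
		       root->name, fs_info->sectorsize);
	}

	int main(void)
	{
		struct fs_info fs = { .sectorsize = 4096 };
		struct root chunk = { .fs_info = &fs, .name = "chunk" };

		fs.chunk_root = &chunk;

		add_device_old(&chunk);
		add_device_new(&fs);
		return 0;
	}

The payoff is the same as in the hunks above: callers pass the one object that really scopes the operation, and the helper picks fs_info->chunk_root or fs_info->dev_root explicitly instead of trusting whichever root happened to be handy.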
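A note on the earlier tree-log.c hunk that introduces LOG_OTHER_INODE: btrfs_log_inode() can take a second inode's log_mutex while already holding one of the same lock class, so the second acquisition uses mutex_lock_nested(..., SINGLE_DEPTH_NESTING) to keep lockdep from flagging a false self-deadlock. User space has no lockdep; the sketch below sidesteps the underlying AB-BA risk by ordering same-class locks by address, the classic alternative to a nesting annotation (toy types, not kernel code):

	#include <pthread.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Toy inode with a per-object log mutex, like BTRFS_I(inode)->log_mutex. */
	struct toy_inode {
		pthread_mutex_t log_mutex;
		long ino;
	};

	/*
	 * Two threads locking two objects of the same class in opposite
	 * orders can deadlock.  Here a total order by address prevents that;
	 * the kernel hunk instead keeps its natural order (current inode,
	 * then the other inode) and annotates the second lock for lockdep.
	 */
	static void lock_pair(struct toy_inode *a, struct toy_inode *b)
	{
		struct toy_inode *first = (uintptr_t)a < (uintptr_t)b ? a : b;
		struct toy_inode *second = (uintptr_t)a < (uintptr_t)b ? b : a;

		pthread_mutex_lock(&first->log_mutex);
		pthread_mutex_lock(&second->log_mutex);
	}

	static void unlock_pair(struct toy_inode *a, struct toy_inode *b)
	{
		pthread_mutex_unlock(&a->log_mutex);
		pthread_mutex_unlock(&b->log_mutex);
	}

	int main(void)
	{
		struct toy_inode cur = { PTHREAD_MUTEX_INITIALIZER, 257 };
		struct toy_inode other = { PTHREAD_MUTEX_INITIALIZER, 258 };

		lock_pair(&cur, &other);
		printf("holding log_mutex of %ld and %ld\n", cur.ino, other.ino);
		unlock_pair(&cur, &other);
		return 0;
	}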
btrfs_mark_buffer_dirty(leaf); @@ -1667,9 +1668,10 @@ error: * the btrfs_device struct should be fully filled in */ static int btrfs_add_device(struct btrfs_trans_handle *trans, - struct btrfs_root *root, + struct btrfs_fs_info *fs_info, struct btrfs_device *device) { + struct btrfs_root *root = fs_info->chunk_root; int ret; struct btrfs_path *path; struct btrfs_dev_item *dev_item; @@ -1677,8 +1679,6 @@ static int btrfs_add_device(struct btrfs_trans_handle *trans, struct btrfs_key key; unsigned long ptr; - root = root->fs_info->chunk_root; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -1713,7 +1713,7 @@ static int btrfs_add_device(struct btrfs_trans_handle *trans, ptr = btrfs_device_uuid(dev_item); write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); ptr = btrfs_device_fsid(dev_item); - write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); + write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_UUID_SIZE); btrfs_mark_buffer_dirty(leaf); ret = 0; @@ -1737,16 +1737,15 @@ static void update_dev_time(char *path_name) filp_close(filp, NULL); } -static int btrfs_rm_dev_item(struct btrfs_root *root, +static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info, struct btrfs_device *device) { + struct btrfs_root *root = fs_info->chunk_root; int ret; struct btrfs_path *path; struct btrfs_key key; struct btrfs_trans_handle *trans; - root = root->fs_info->chunk_root; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -1774,7 +1773,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root, goto out; out: btrfs_free_path(path); - btrfs_commit_transaction(trans, root); + btrfs_commit_transaction(trans); return ret; } @@ -1853,7 +1852,7 @@ void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info, fs_info->fs_devices->latest_bdev = next_device->bdev; } -int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid) +int btrfs_rm_device(struct btrfs_fs_info *fs_info, char *device_path, u64 devid) { struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; @@ -1863,20 +1862,20 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid) mutex_lock(&uuid_mutex); - num_devices = root->fs_info->fs_devices->num_devices; - btrfs_dev_replace_lock(&root->fs_info->dev_replace, 0); - if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { + num_devices = fs_info->fs_devices->num_devices; + btrfs_dev_replace_lock(&fs_info->dev_replace, 0); + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { WARN_ON(num_devices < 1); num_devices--; } - btrfs_dev_replace_unlock(&root->fs_info->dev_replace, 0); + btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); - ret = btrfs_check_raid_min_devices(root->fs_info, num_devices - 1); + ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); if (ret) goto out; - ret = btrfs_find_device_by_devspec(root, devid, device_path, - &device); + ret = btrfs_find_device_by_devspec(fs_info, devid, device_path, + &device); if (ret) goto out; @@ -1885,16 +1884,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid) goto out; } - if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { + if (device->writeable && fs_info->fs_devices->rw_devices == 1) { ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; goto out; } if (device->writeable) { - lock_chunks(root); + mutex_lock(&fs_info->chunk_mutex); list_del_init(&device->dev_alloc_list); device->fs_devices->rw_devices--; - unlock_chunks(root); + mutex_unlock(&fs_info->chunk_mutex); clear_super = true; } @@ -1909,12 
+1908,12 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid) * counter although write_all_supers() is not locked out. This * could give a filesystem state which requires a degraded mount. */ - ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); + ret = btrfs_rm_dev_item(fs_info, device); if (ret) goto error_undo; device->in_fs_metadata = 0; - btrfs_scrub_cancel_dev(root->fs_info, device); + btrfs_scrub_cancel_dev(fs_info, device); /* * the device list mutex makes sure that we don't change @@ -1927,7 +1926,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid) */ cur_devices = device->fs_devices; - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + mutex_lock(&fs_info->fs_devices->device_list_mutex); list_del_rcu(&device->dev_list); device->fs_devices->num_devices--; @@ -1936,17 +1935,17 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid) if (device->missing) device->fs_devices->missing_devices--; - btrfs_assign_next_active_device(root->fs_info, device, NULL); + btrfs_assign_next_active_device(fs_info, device, NULL); if (device->bdev) { device->fs_devices->open_devices--; /* remove sysfs entry */ - btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device); + btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); } - num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1; - btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; + btrfs_set_super_num_devices(fs_info->super_copy, num_devices); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); /* * at this point, the device is zero sized and detached from @@ -1961,7 +1960,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid) if (cur_devices->open_devices == 0) { struct btrfs_fs_devices *fs_devices; - fs_devices = root->fs_info->fs_devices; + fs_devices = fs_info->fs_devices; while (fs_devices) { if (fs_devices->seed == cur_devices) { fs_devices->seed = cur_devices->seed; @@ -1974,8 +1973,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid) free_fs_devices(cur_devices); } - root->fs_info->num_tolerated_disk_barrier_failures = - btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); + fs_info->num_tolerated_disk_barrier_failures = + btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); out: mutex_unlock(&uuid_mutex); @@ -1983,11 +1982,11 @@ out: error_undo: if (device->writeable) { - lock_chunks(root); + mutex_lock(&fs_info->chunk_mutex); list_add(&device->dev_alloc_list, - &root->fs_info->fs_devices->alloc_list); + &fs_info->fs_devices->alloc_list); device->fs_devices->rw_devices++; - unlock_chunks(root); + mutex_unlock(&fs_info->chunk_mutex); } goto out; } @@ -2092,7 +2091,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, call_rcu(&tgtdev->rcu, free_device); } -static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, +static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info, + char *device_path, struct btrfs_device **device) { int ret = 0; @@ -2104,14 +2104,13 @@ static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, *device = NULL; ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, - root->fs_info->bdev_holder, 0, &bdev, &bh); + fs_info->bdev_holder, 0, &bdev, &bh); if (ret) return ret; disk_super = (struct btrfs_super_block 
*)bh->b_data; devid = btrfs_stack_device_id(&disk_super->dev_item); dev_uuid = disk_super->dev_item.uuid; - *device = btrfs_find_device(root->fs_info, devid, dev_uuid, - disk_super->fsid); + *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid); brelse(bh); if (!*device) ret = -ENOENT; @@ -2119,7 +2118,7 @@ static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, return ret; } -int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, +int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, char *device_path, struct btrfs_device **device) { @@ -2128,7 +2127,7 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, struct list_head *devices; struct btrfs_device *tmp; - devices = &root->fs_info->fs_devices->devices; + devices = &fs_info->fs_devices->devices; /* * It is safe to read the devices since the volume_mutex * is held by the caller. @@ -2145,30 +2144,28 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, return 0; } else { - return btrfs_find_device_by_path(root, device_path, device); + return btrfs_find_device_by_path(fs_info, device_path, device); } } /* * Lookup a device given by device id, or the path if the id is 0. */ -int btrfs_find_device_by_devspec(struct btrfs_root *root, u64 devid, - char *devpath, - struct btrfs_device **device) +int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, + char *devpath, struct btrfs_device **device) { int ret; if (devid) { ret = 0; - *device = btrfs_find_device(root->fs_info, devid, NULL, - NULL); + *device = btrfs_find_device(fs_info, devid, NULL, NULL); if (!*device) ret = -ENOENT; } else { if (!devpath || !devpath[0]) return -EINVAL; - ret = btrfs_find_device_missing_or_by_path(root, devpath, + ret = btrfs_find_device_missing_or_by_path(fs_info, devpath, device); } return ret; @@ -2177,12 +2174,12 @@ int btrfs_find_device_by_devspec(struct btrfs_root *root, u64 devid, /* * does all the dirty work required for changing file system's UUID. 
*/ -static int btrfs_prepare_sprout(struct btrfs_root *root) +static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_fs_devices *old_devices; struct btrfs_fs_devices *seed_devices; - struct btrfs_super_block *disk_super = root->fs_info->super_copy; + struct btrfs_super_block *disk_super = fs_info->super_copy; struct btrfs_device *device; u64 super_flags; @@ -2208,15 +2205,15 @@ static int btrfs_prepare_sprout(struct btrfs_root *root) INIT_LIST_HEAD(&seed_devices->alloc_list); mutex_init(&seed_devices->device_list_mutex); - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + mutex_lock(&fs_info->fs_devices->device_list_mutex); list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, synchronize_rcu); list_for_each_entry(device, &seed_devices->devices, dev_list) device->fs_devices = seed_devices; - lock_chunks(root); + mutex_lock(&fs_info->chunk_mutex); list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); - unlock_chunks(root); + mutex_unlock(&fs_info->chunk_mutex); fs_devices->seeding = 0; fs_devices->num_devices = 0; @@ -2226,9 +2223,9 @@ static int btrfs_prepare_sprout(struct btrfs_root *root) fs_devices->seed = seed_devices; generate_random_uuid(fs_devices->fsid); - memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); + memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); super_flags = btrfs_super_flags(disk_super) & ~BTRFS_SUPER_FLAG_SEEDING; @@ -2241,8 +2238,9 @@ static int btrfs_prepare_sprout(struct btrfs_root *root) * Store the expected generation for seed devices in device items. 
*/ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, - struct btrfs_root *root) + struct btrfs_fs_info *fs_info) { + struct btrfs_root *root = fs_info->chunk_root; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_dev_item *dev_item; @@ -2257,7 +2255,6 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - root = root->fs_info->chunk_root; key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.offset = 0; key.type = BTRFS_DEV_ITEM_KEY; @@ -2293,8 +2290,7 @@ next_slot: BTRFS_UUID_SIZE); read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_UUID_SIZE); - device = btrfs_find_device(root->fs_info, devid, dev_uuid, - fs_uuid); + device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); BUG_ON(!device); /* Logic error */ if (device->fs_devices->seeding) { @@ -2312,28 +2308,29 @@ error: return ret; } -int btrfs_init_new_device(struct btrfs_root *root, char *device_path) +int btrfs_init_new_device(struct btrfs_fs_info *fs_info, char *device_path) { + struct btrfs_root *root = fs_info->dev_root; struct request_queue *q; struct btrfs_trans_handle *trans; struct btrfs_device *device; struct block_device *bdev; struct list_head *devices; - struct super_block *sb = root->fs_info->sb; + struct super_block *sb = fs_info->sb; struct rcu_string *name; u64 tmp; int seeding_dev = 0; int ret = 0; - if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) + if ((sb->s_flags & MS_RDONLY) && !fs_info->fs_devices->seeding) return -EROFS; bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, - root->fs_info->bdev_holder); + fs_info->bdev_holder); if (IS_ERR(bdev)) return PTR_ERR(bdev); - if (root->fs_info->fs_devices->seeding) { + if (fs_info->fs_devices->seeding) { seeding_dev = 1; down_write(&sb->s_umount); mutex_lock(&uuid_mutex); @@ -2341,20 +2338,20 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) filemap_write_and_wait(bdev->bd_inode->i_mapping); - devices = &root->fs_info->fs_devices->devices; + devices = &fs_info->fs_devices->devices; - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + mutex_lock(&fs_info->fs_devices->device_list_mutex); list_for_each_entry(device, devices, dev_list) { if (device->bdev == bdev) { ret = -EEXIST; mutex_unlock( - &root->fs_info->fs_devices->device_list_mutex); + &fs_info->fs_devices->device_list_mutex); goto error; } } - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); - device = btrfs_alloc_device(root->fs_info, NULL, NULL); + device = btrfs_alloc_device(fs_info, NULL, NULL); if (IS_ERR(device)) { /* we can safely leave the fs_devices entry around */ ret = PTR_ERR(device); @@ -2382,13 +2379,13 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) device->can_discard = 1; device->writeable = 1; device->generation = trans->transid; - device->io_width = root->sectorsize; - device->io_align = root->sectorsize; - device->sector_size = root->sectorsize; + device->io_width = fs_info->sectorsize; + device->io_align = fs_info->sectorsize; + device->sector_size = fs_info->sectorsize; device->total_bytes = i_size_read(bdev->bd_inode); device->disk_total_bytes = device->total_bytes; device->commit_total_bytes = device->total_bytes; - device->dev_root = root->fs_info->dev_root; + device->fs_info = fs_info; device->bdev = bdev; device->in_fs_metadata = 1; device->is_tgtdev_for_dev_replace = 0; @@ -2398,61 +2395,60 @@ int btrfs_init_new_device(struct btrfs_root 
*root, char *device_path) if (seeding_dev) { sb->s_flags &= ~MS_RDONLY; - ret = btrfs_prepare_sprout(root); + ret = btrfs_prepare_sprout(fs_info); BUG_ON(ret); /* -ENOMEM */ } - device->fs_devices = root->fs_info->fs_devices; + device->fs_devices = fs_info->fs_devices; - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); - lock_chunks(root); - list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices); + mutex_lock(&fs_info->fs_devices->device_list_mutex); + mutex_lock(&fs_info->chunk_mutex); + list_add_rcu(&device->dev_list, &fs_info->fs_devices->devices); list_add(&device->dev_alloc_list, - &root->fs_info->fs_devices->alloc_list); - root->fs_info->fs_devices->num_devices++; - root->fs_info->fs_devices->open_devices++; - root->fs_info->fs_devices->rw_devices++; - root->fs_info->fs_devices->total_devices++; - root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; + &fs_info->fs_devices->alloc_list); + fs_info->fs_devices->num_devices++; + fs_info->fs_devices->open_devices++; + fs_info->fs_devices->rw_devices++; + fs_info->fs_devices->total_devices++; + fs_info->fs_devices->total_rw_bytes += device->total_bytes; - spin_lock(&root->fs_info->free_chunk_lock); - root->fs_info->free_chunk_space += device->total_bytes; - spin_unlock(&root->fs_info->free_chunk_lock); + spin_lock(&fs_info->free_chunk_lock); + fs_info->free_chunk_space += device->total_bytes; + spin_unlock(&fs_info->free_chunk_lock); if (!blk_queue_nonrot(bdev_get_queue(bdev))) - root->fs_info->fs_devices->rotating = 1; + fs_info->fs_devices->rotating = 1; - tmp = btrfs_super_total_bytes(root->fs_info->super_copy); - btrfs_set_super_total_bytes(root->fs_info->super_copy, + tmp = btrfs_super_total_bytes(fs_info->super_copy); + btrfs_set_super_total_bytes(fs_info->super_copy, tmp + device->total_bytes); - tmp = btrfs_super_num_devices(root->fs_info->super_copy); - btrfs_set_super_num_devices(root->fs_info->super_copy, - tmp + 1); + tmp = btrfs_super_num_devices(fs_info->super_copy); + btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1); /* add sysfs device entry */ - btrfs_sysfs_add_device_link(root->fs_info->fs_devices, device); + btrfs_sysfs_add_device_link(fs_info->fs_devices, device); /* * we've got more storage, clear any full flags on the space * infos */ - btrfs_clear_space_info_full(root->fs_info); + btrfs_clear_space_info_full(fs_info); - unlock_chunks(root); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_info->chunk_mutex); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); if (seeding_dev) { - lock_chunks(root); - ret = init_first_rw_device(trans, root, device); - unlock_chunks(root); + mutex_lock(&fs_info->chunk_mutex); + ret = init_first_rw_device(trans, fs_info, device); + mutex_unlock(&fs_info->chunk_mutex); if (ret) { btrfs_abort_transaction(trans, ret); goto error_trans; } } - ret = btrfs_add_device(trans, root, device); + ret = btrfs_add_device(trans, fs_info, device); if (ret) { btrfs_abort_transaction(trans, ret); goto error_trans; @@ -2461,7 +2457,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if (seeding_dev) { char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; - ret = btrfs_finish_sprout(trans, root); + ret = btrfs_finish_sprout(trans, fs_info); if (ret) { btrfs_abort_transaction(trans, ret); goto error_trans; @@ -2471,16 +2467,15 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) * so rename the fsid on the sysfs */ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", - root->fs_info->fsid); - if 
(kobject_rename(&root->fs_info->fs_devices->fsid_kobj, - fsid_buf)) - btrfs_warn(root->fs_info, - "sysfs: failed to create fsid for sprout"); + fs_info->fsid); + if (kobject_rename(&fs_info->fs_devices->fsid_kobj, fsid_buf)) + btrfs_warn(fs_info, + "sysfs: failed to create fsid for sprout"); } - root->fs_info->num_tolerated_disk_barrier_failures = - btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); - ret = btrfs_commit_transaction(trans, root); + fs_info->num_tolerated_disk_barrier_failures = + btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); + ret = btrfs_commit_transaction(trans); if (seeding_dev) { mutex_unlock(&uuid_mutex); @@ -2489,9 +2484,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if (ret) /* transaction commit */ return ret; - ret = btrfs_relocate_sys_chunks(root); + ret = btrfs_relocate_sys_chunks(fs_info); if (ret < 0) - btrfs_handle_fs_error(root->fs_info, ret, + btrfs_handle_fs_error(fs_info, ret, "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command."); trans = btrfs_attach_transaction(root); if (IS_ERR(trans)) { @@ -2499,7 +2494,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) return 0; return PTR_ERR(trans); } - ret = btrfs_commit_transaction(trans, root); + ret = btrfs_commit_transaction(trans); } /* Update ctime/mtime for libblkid */ @@ -2507,9 +2502,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) return ret; error_trans: - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); rcu_string_free(device->name); - btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device); + btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); @@ -2520,14 +2515,14 @@ error: return ret; } -int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, +int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, + char *device_path, struct btrfs_device *srcdev, struct btrfs_device **device_out) { struct request_queue *q; struct btrfs_device *device; struct block_device *bdev; - struct btrfs_fs_info *fs_info = root->fs_info; struct list_head *devices; struct rcu_string *name; u64 devid = BTRFS_DEV_REPLACE_DEVID; @@ -2585,19 +2580,19 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, q = bdev_get_queue(bdev); if (blk_queue_discard(q)) device->can_discard = 1; - mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + mutex_lock(&fs_info->fs_devices->device_list_mutex); device->writeable = 1; device->generation = 0; - device->io_width = root->sectorsize; - device->io_align = root->sectorsize; - device->sector_size = root->sectorsize; + device->io_width = fs_info->sectorsize; + device->io_align = fs_info->sectorsize; + device->sector_size = fs_info->sectorsize; device->total_bytes = btrfs_device_get_total_bytes(srcdev); device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev); device->bytes_used = btrfs_device_get_bytes_used(srcdev); ASSERT(list_empty(&srcdev->resized_list)); device->commit_total_bytes = srcdev->commit_total_bytes; device->commit_bytes_used = device->bytes_used; - device->dev_root = fs_info->dev_root; + device->fs_info = fs_info; device->bdev = bdev; device->in_fs_metadata = 1; device->is_tgtdev_for_dev_replace = 1; @@ -2608,7 +2603,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, list_add(&device->dev_list, &fs_info->fs_devices->devices); 
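Also visible throughout btrfs_init_new_device(): the lock_chunks()/unlock_chunks() wrappers, which needed a root argument just to find the mutex, give way to explicit mutex_lock(&fs_info->chunk_mutex) pairs, so each critical section names the lock it holds. A rough pthread analogy of the before/after shape (illustrative names only; the kernel's chunk_mutex is a struct mutex, not a pthread mutex):

	#include <pthread.h>
	#include <stdio.h>

	struct fs_info {
		pthread_mutex_t chunk_mutex;
		unsigned long long free_chunk_space;
	};

	struct root {
		struct fs_info *fs_info;
	};

	/* Before: a wrapper that hides which lock is taken and needs a
	 * root only to reach fs_info. */
	static void lock_chunks(struct root *root)
	{
		pthread_mutex_lock(&root->fs_info->chunk_mutex);
	}

	static void unlock_chunks(struct root *root)
	{
		pthread_mutex_unlock(&root->fs_info->chunk_mutex);
	}

	/* After: the critical section takes the mutex by name, directly
	 * off fs_info, as the hunks above now do. */
	static void add_free_space(struct fs_info *fs_info, unsigned long long n)
	{
		pthread_mutex_lock(&fs_info->chunk_mutex);
		fs_info->free_chunk_space += n;
		pthread_mutex_unlock(&fs_info->chunk_mutex);
	}

	int main(void)
	{
		struct fs_info fs = { PTHREAD_MUTEX_INITIALIZER, 0 };
		struct root dev_root = { &fs };

		lock_chunks(&dev_root);		/* old shape */
		fs.free_chunk_space += 1 << 20;
		unlock_chunks(&dev_root);

		add_free_space(&fs, 1 << 20);	/* new shape */
		printf("free chunk space: %llu bytes\n", fs.free_chunk_space);
		return 0;
	}

Making the lock explicit also makes the pairing with the short spin_lock(&fs_info->free_chunk_lock) sections in these hunks easier to audit, since both locks are now reached the same way.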
fs_info->fs_devices->num_devices++; fs_info->fs_devices->open_devices++; - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); *device_out = device; return ret; @@ -2621,11 +2616,13 @@ error: void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev) { + u32 sectorsize = fs_info->sectorsize; + WARN_ON(fs_info->fs_devices->rw_devices == 0); - tgtdev->io_width = fs_info->dev_root->sectorsize; - tgtdev->io_align = fs_info->dev_root->sectorsize; - tgtdev->sector_size = fs_info->dev_root->sectorsize; - tgtdev->dev_root = fs_info->dev_root; + tgtdev->io_width = sectorsize; + tgtdev->io_align = sectorsize; + tgtdev->sector_size = sectorsize; + tgtdev->fs_info = fs_info; tgtdev->in_fs_metadata = 1; } @@ -2634,13 +2631,11 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, { int ret; struct btrfs_path *path; - struct btrfs_root *root; + struct btrfs_root *root = device->fs_info->chunk_root; struct btrfs_dev_item *dev_item; struct extent_buffer *leaf; struct btrfs_key key; - root = device->dev_root->fs_info->chunk_root; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2680,8 +2675,8 @@ out: int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size) { - struct btrfs_super_block *super_copy = - device->dev_root->fs_info->super_copy; + struct btrfs_fs_info *fs_info = device->fs_info; + struct btrfs_super_block *super_copy = fs_info->super_copy; struct btrfs_fs_devices *fs_devices; u64 old_total; u64 diff; @@ -2689,41 +2684,41 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, if (!device->writeable) return -EACCES; - lock_chunks(device->dev_root); + mutex_lock(&fs_info->chunk_mutex); old_total = btrfs_super_total_bytes(super_copy); diff = new_size - device->total_bytes; if (new_size <= device->total_bytes || device->is_tgtdev_for_dev_replace) { - unlock_chunks(device->dev_root); + mutex_unlock(&fs_info->chunk_mutex); return -EINVAL; } - fs_devices = device->dev_root->fs_info->fs_devices; + fs_devices = fs_info->fs_devices; btrfs_set_super_total_bytes(super_copy, old_total + diff); device->fs_devices->total_rw_bytes += diff; btrfs_device_set_total_bytes(device, new_size); btrfs_device_set_disk_total_bytes(device, new_size); - btrfs_clear_space_info_full(device->dev_root->fs_info); + btrfs_clear_space_info_full(device->fs_info); if (list_empty(&device->resized_list)) list_add_tail(&device->resized_list, &fs_devices->resized_devices); - unlock_chunks(device->dev_root); + mutex_unlock(&fs_info->chunk_mutex); return btrfs_update_device(trans, device); } static int btrfs_free_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 chunk_objectid, + struct btrfs_fs_info *fs_info, u64 chunk_objectid, u64 chunk_offset) { + struct btrfs_root *root = fs_info->chunk_root; int ret; struct btrfs_path *path; struct btrfs_key key; - root = root->fs_info->chunk_root; path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2736,25 +2731,25 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, if (ret < 0) goto out; else if (ret > 0) { /* Logic error or corruption */ - btrfs_handle_fs_error(root->fs_info, -ENOENT, - "Failed lookup while freeing chunk."); + btrfs_handle_fs_error(fs_info, -ENOENT, + "Failed lookup while freeing chunk."); ret = -ENOENT; goto out; } ret = btrfs_del_item(trans, root, path); if (ret < 0) - btrfs_handle_fs_error(root->fs_info, ret, - "Failed to delete chunk item."); + 
@@ -2634,13 +2631,11 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
 {
 	int ret;
 	struct btrfs_path *path;
-	struct btrfs_root *root;
+	struct btrfs_root *root = device->fs_info->chunk_root;
 	struct btrfs_dev_item *dev_item;
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
 
-	root = device->dev_root->fs_info->chunk_root;
-
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -2680,8 +2675,8 @@ out:
 int btrfs_grow_device(struct btrfs_trans_handle *trans,
 		      struct btrfs_device *device, u64 new_size)
 {
-	struct btrfs_super_block *super_copy =
-		device->dev_root->fs_info->super_copy;
+	struct btrfs_fs_info *fs_info = device->fs_info;
+	struct btrfs_super_block *super_copy = fs_info->super_copy;
 	struct btrfs_fs_devices *fs_devices;
 	u64 old_total;
 	u64 diff;
@@ -2689,41 +2684,41 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans,
 	if (!device->writeable)
 		return -EACCES;
 
-	lock_chunks(device->dev_root);
+	mutex_lock(&fs_info->chunk_mutex);
 	old_total = btrfs_super_total_bytes(super_copy);
 	diff = new_size - device->total_bytes;
 
 	if (new_size <= device->total_bytes ||
 	    device->is_tgtdev_for_dev_replace) {
-		unlock_chunks(device->dev_root);
+		mutex_unlock(&fs_info->chunk_mutex);
 		return -EINVAL;
 	}
 
-	fs_devices = device->dev_root->fs_info->fs_devices;
+	fs_devices = fs_info->fs_devices;
 
 	btrfs_set_super_total_bytes(super_copy, old_total + diff);
 	device->fs_devices->total_rw_bytes += diff;
 
 	btrfs_device_set_total_bytes(device, new_size);
 	btrfs_device_set_disk_total_bytes(device, new_size);
-	btrfs_clear_space_info_full(device->dev_root->fs_info);
+	btrfs_clear_space_info_full(device->fs_info);
 	if (list_empty(&device->resized_list))
 		list_add_tail(&device->resized_list,
 			      &fs_devices->resized_devices);
-	unlock_chunks(device->dev_root);
+	mutex_unlock(&fs_info->chunk_mutex);
 
 	return btrfs_update_device(trans, device);
 }
 
 static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root, u64 chunk_objectid,
+			    struct btrfs_fs_info *fs_info, u64 chunk_objectid,
 			    u64 chunk_offset)
 {
+	struct btrfs_root *root = fs_info->chunk_root;
 	int ret;
 	struct btrfs_path *path;
 	struct btrfs_key key;
 
-	root = root->fs_info->chunk_root;
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -2736,25 +2731,25 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
 	if (ret < 0)
 		goto out;
 	else if (ret > 0) { /* Logic error or corruption */
-		btrfs_handle_fs_error(root->fs_info, -ENOENT,
-				      "Failed lookup while freeing chunk.");
+		btrfs_handle_fs_error(fs_info, -ENOENT,
+				      "Failed lookup while freeing chunk.");
 		ret = -ENOENT;
 		goto out;
 	}
 
 	ret = btrfs_del_item(trans, root, path);
 	if (ret < 0)
-		btrfs_handle_fs_error(root->fs_info, ret,
-				      "Failed to delete chunk item.");
+		btrfs_handle_fs_error(fs_info, ret,
				      "Failed to delete chunk item.");
out:
 	btrfs_free_path(path);
 	return ret;
 }
 
-static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
-			       chunk_offset)
+static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info,
+			       u64 chunk_objectid, u64 chunk_offset)
 {
-	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
+	struct btrfs_super_block *super_copy = fs_info->super_copy;
 	struct btrfs_disk_key *disk_key;
 	struct btrfs_chunk *chunk;
 	u8 *ptr;
@@ -2765,7 +2760,7 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
 	u32 cur;
 	struct btrfs_key key;
 
-	lock_chunks(root);
+	mutex_lock(&fs_info->chunk_mutex);
 	array_size = btrfs_super_sys_array_size(super_copy);
 
 	ptr = super_copy->sys_chunk_array;
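[Editor's note: throughout this file the merge open-codes the old lock_chunks()/unlock_chunks() wrappers (deleted from volumes.h near the end of this diff) as direct operations on fs_info->chunk_mutex. A before/after sketch of the pattern, assuming only what the hunks themselves show:]

	/* 4.10: the wrapper hid which mutex was taken, behind a root pointer */
	lock_chunks(root);
	/* ... modify chunk/device geometry ... */
	unlock_chunks(root);

	/* 4.11: both the lock and its owner structure are explicit */
	mutex_lock(&fs_info->chunk_mutex);
	/* ... modify chunk/device geometry ... */
	mutex_unlock(&fs_info->chunk_mutex);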
@@ -2795,25 +2790,22 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
 			cur += len;
 		}
 	}
-	unlock_chunks(root);
+	mutex_unlock(&fs_info->chunk_mutex);
 	return ret;
 }
 
 int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root, u64 chunk_offset)
+		       struct btrfs_fs_info *fs_info, u64 chunk_offset)
 {
 	struct extent_map_tree *em_tree;
 	struct extent_map *em;
-	struct btrfs_root *extent_root = root->fs_info->extent_root;
 	struct map_lookup *map;
 	u64 dev_extent_len = 0;
 	u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
 	int i, ret = 0;
-	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 
-	/* Just in case */
-	root = root->fs_info->chunk_root;
-	em_tree = &root->fs_info->mapping_tree.map_tree;
+	em_tree = &fs_info->mapping_tree.map_tree;
 
 	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
@@ -2832,9 +2824,9 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
 		return -EINVAL;
 	}
 	map = em->map_lookup;
-	lock_chunks(root->fs_info->chunk_root);
-	check_system_chunk(trans, extent_root, map->type);
-	unlock_chunks(root->fs_info->chunk_root);
+	mutex_lock(&fs_info->chunk_mutex);
+	check_system_chunk(trans, fs_info, map->type);
+	mutex_unlock(&fs_info->chunk_mutex);
 
 	/*
 	 * Take the device list mutex to prevent races with the final phase of
@@ -2854,14 +2846,14 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
 		}
 
 		if (device->bytes_used > 0) {
-			lock_chunks(root);
+			mutex_lock(&fs_info->chunk_mutex);
 			btrfs_device_set_bytes_used(device,
 					device->bytes_used - dev_extent_len);
-			spin_lock(&root->fs_info->free_chunk_lock);
-			root->fs_info->free_chunk_space += dev_extent_len;
-			spin_unlock(&root->fs_info->free_chunk_lock);
-			btrfs_clear_space_info_full(root->fs_info);
-			unlock_chunks(root);
+			spin_lock(&fs_info->free_chunk_lock);
+			fs_info->free_chunk_space += dev_extent_len;
+			spin_unlock(&fs_info->free_chunk_lock);
+			btrfs_clear_space_info_full(fs_info);
+			mutex_unlock(&fs_info->chunk_mutex);
 		}
 
 		if (map->stripes[i].dev) {
@@ -2875,23 +2867,24 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
 	}
 	mutex_unlock(&fs_devices->device_list_mutex);
 
-	ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset);
+	ret = btrfs_free_chunk(trans, fs_info, chunk_objectid, chunk_offset);
 	if (ret) {
 		btrfs_abort_transaction(trans, ret);
 		goto out;
 	}
 
-	trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
+	trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
 
 	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
-		ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
+		ret = btrfs_del_sys_chunk(fs_info, chunk_objectid,
+					  chunk_offset);
 		if (ret) {
 			btrfs_abort_transaction(trans, ret);
 			goto out;
 		}
 	}
 
-	ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em);
+	ret = btrfs_remove_block_group(trans, fs_info, chunk_offset, em);
 	if (ret) {
 		btrfs_abort_transaction(trans, ret);
 		goto out;
@@ -2903,15 +2896,12 @@ out:
 	return ret;
 }
 
-static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
+static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
 {
-	struct btrfs_root *extent_root;
+	struct btrfs_root *root = fs_info->chunk_root;
 	struct btrfs_trans_handle *trans;
 	int ret;
 
-	root = root->fs_info->chunk_root;
-	extent_root = root->fs_info->extent_root;
-
 	/*
 	 * Prevent races with automatic removal of unused block groups.
 	 * After we relocate and before we remove the chunk with offset
@@ -2924,16 +2914,16 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
 	 * we release the path used to search the chunk/dev tree and before
 	 * the current task acquires this mutex and calls us.
 	 */
-	ASSERT(mutex_is_locked(&root->fs_info->delete_unused_bgs_mutex));
+	ASSERT(mutex_is_locked(&fs_info->delete_unused_bgs_mutex));
 
-	ret = btrfs_can_relocate(extent_root, chunk_offset);
+	ret = btrfs_can_relocate(fs_info, chunk_offset);
 	if (ret)
 		return -ENOSPC;
 
 	/* step one, relocate all the extents inside this chunk */
-	btrfs_scrub_pause(root);
-	ret = btrfs_relocate_block_group(extent_root, chunk_offset);
-	btrfs_scrub_continue(root);
+	btrfs_scrub_pause(fs_info);
+	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
+	btrfs_scrub_continue(fs_info);
 	if (ret)
 		return ret;
 
@@ -2949,14 +2939,14 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
 	 * step two, delete the device extents and the
 	 * chunk tree entries
 	 */
-	ret = btrfs_remove_chunk(trans, root, chunk_offset);
-	btrfs_end_transaction(trans, extent_root);
+	ret = btrfs_remove_chunk(trans, fs_info, chunk_offset);
+	btrfs_end_transaction(trans);
 	return ret;
 }
 
-static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
+static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
 {
-	struct btrfs_root *chunk_root = root->fs_info->chunk_root;
+	struct btrfs_root *chunk_root = fs_info->chunk_root;
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
 	struct btrfs_chunk *chunk;
@@ -2977,10 +2967,10 @@ again:
 	key.type = BTRFS_CHUNK_ITEM_KEY;
 
 	while (1) {
-		mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
+		mutex_lock(&fs_info->delete_unused_bgs_mutex);
 		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
 		if (ret < 0) {
-			mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
 			goto error;
 		}
 		BUG_ON(ret == 0); /* Corruption */
@@ -2988,7 +2978,7 @@ again:
 		ret = btrfs_previous_item(chunk_root, path, key.objectid,
 					  key.type);
 		if (ret)
-			mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
 		if (ret < 0)
 			goto error;
 		if (ret > 0)
			break;
@@ -3003,14 +2993,13 @@ again:
 		btrfs_release_path(path);
 
 		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
-			ret = btrfs_relocate_chunk(chunk_root,
-						   found_key.offset);
+			ret = btrfs_relocate_chunk(fs_info, found_key.offset);
 			if (ret == -ENOSPC)
 				failed++;
 			else
 				BUG_ON(ret);
 		}
-		mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
 
 		if (found_key.offset == 0)
 			break;
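[Editor's note: a recurring conversion in this diff is that btrfs_end_transaction() and btrfs_commit_transaction() drop their root argument; the handle already records which filesystem it belongs to (a later hunk in __btrfs_alloc_chunk() reads trans->fs_info directly). A sketch of a caller before and after, using only calls shown in this diff:]

	trans = btrfs_attach_transaction(root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);
	/* 4.10: the root had to be repeated on every call */
	ret = btrfs_commit_transaction(trans, root);
	/* 4.11: the handle alone identifies the filesystem */
	ret = btrfs_commit_transaction(trans);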
@@ -3029,9 +3018,10 @@ error:
 	return ret;
 }
 
-static int insert_balance_item(struct btrfs_root *root,
+static int insert_balance_item(struct btrfs_fs_info *fs_info,
 			       struct btrfs_balance_control *bctl)
 {
+	struct btrfs_root *root = fs_info->tree_root;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_balance_item *item;
 	struct btrfs_disk_balance_args disk_bargs;
@@ -3062,7 +3052,7 @@ static int insert_balance_item(struct btrfs_root *root,
 	leaf = path->nodes[0];
 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
 
-	memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
+	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
 
 	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
 	btrfs_set_balance_data(leaf, item, &disk_bargs);
@@ -3076,14 +3066,15 @@ static int insert_balance_item(struct btrfs_root *root,
 	btrfs_mark_buffer_dirty(leaf);
 out:
 	btrfs_free_path(path);
-	err = btrfs_commit_transaction(trans, root);
+	err = btrfs_commit_transaction(trans);
 	if (err && !ret)
 		ret = err;
 	return ret;
 }
 
-static int del_balance_item(struct btrfs_root *root)
+static int del_balance_item(struct btrfs_fs_info *fs_info)
 {
+	struct btrfs_root *root = fs_info->tree_root;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_path *path;
 	struct btrfs_key key;
@@ -3114,7 +3105,7 @@ static int del_balance_item(struct btrfs_root *root)
 	ret = btrfs_del_item(trans, root, path);
 out:
 	btrfs_free_path(path);
-	err = btrfs_commit_transaction(trans, root);
+	err = btrfs_commit_transaction(trans);
 	if (err && !ret)
 		ret = err;
 	return ret;
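[Editor's note: insert_balance_item() above also picks up the new memzero_extent_buffer() helper, replacing a zero-fill spelled as a general memset through the extent-buffer mapping. The two forms below are equivalent as far as these hunks show; the dedicated helper simply drops the constant:]

	/* 4.10 */
	memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
	/* 4.11: same effect, one fewer argument to get wrong */
	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));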
@@ -3369,11 +3360,11 @@ static int chunk_soft_convert_filter(u64 chunk_type,
 	return 0;
 }
 
-static int should_balance_chunk(struct btrfs_root *root,
+static int should_balance_chunk(struct btrfs_fs_info *fs_info,
 				struct extent_buffer *leaf,
 				struct btrfs_chunk *chunk, u64 chunk_offset)
 {
-	struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
 	struct btrfs_balance_args *bargs = NULL;
 	u64 chunk_type = btrfs_chunk_type(leaf, chunk);
 
@@ -3398,10 +3389,10 @@ static int should_balance_chunk(struct btrfs_root *root,
 
 	/* usage filter */
 	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
-	    chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
+	    chunk_usage_filter(fs_info, chunk_offset, bargs)) {
 		return 0;
 	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
-	    chunk_usage_range_filter(bctl->fs_info, chunk_offset, bargs)) {
+	    chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
 		return 0;
 	}
 
@@ -3521,7 +3512,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 
 		ret = btrfs_grow_device(trans, device, old_size);
 		if (ret) {
-			btrfs_end_transaction(trans, dev_root);
+			btrfs_end_transaction(trans);
 			/* btrfs_grow_device never returns ret > 0 */
 			WARN_ON(ret > 0);
 			btrfs_info_in_rcu(fs_info,
@@ -3531,7 +3522,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 			goto error;
 		}
 
-		btrfs_end_transaction(trans, dev_root);
+		btrfs_end_transaction(trans);
 	}
 
 	/* step two, relocate all the chunks */
@@ -3606,7 +3597,7 @@ again:
 			spin_unlock(&fs_info->balance_lock);
 		}
 
-		ret = should_balance_chunk(chunk_root, leaf, chunk,
+		ret = should_balance_chunk(fs_info, leaf, chunk,
 					   found_key.offset);
 
 		btrfs_release_path(path);
@@ -3659,9 +3650,9 @@ again:
 				goto error;
 			}
 
-			ret = btrfs_force_chunk_alloc(trans, chunk_root,
+			ret = btrfs_force_chunk_alloc(trans, fs_info,
 						      BTRFS_BLOCK_GROUP_DATA);
-			btrfs_end_transaction(trans, chunk_root);
+			btrfs_end_transaction(trans);
 			if (ret < 0) {
 				mutex_unlock(&fs_info->delete_unused_bgs_mutex);
 				goto error;
@@ -3669,8 +3660,7 @@ again:
 			chunk_reserved = 1;
 		}
 
-		ret = btrfs_relocate_chunk(chunk_root,
-					   found_key.offset);
+		ret = btrfs_relocate_chunk(fs_info, found_key.offset);
 		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
 		if (ret && ret != -ENOSPC)
 			goto error;
@@ -3741,7 +3731,7 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
 	int ret;
 
 	unset_balance_control(fs_info);
-	ret = del_balance_item(fs_info->tree_root);
+	ret = del_balance_item(fs_info);
 	if (ret)
 		btrfs_handle_fs_error(fs_info, ret, NULL);
 
@@ -3874,7 +3864,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 				bctl->sys.target));
 	}
 
-	ret = insert_balance_item(fs_info->tree_root, bctl);
+	ret = insert_balance_item(fs_info, bctl);
 	if (ret && ret != -EEXIST)
 		goto out;
 
@@ -4166,7 +4156,7 @@ static int btrfs_uuid_scan_kthread(void *data)
 		}
 update_tree:
 		if (!btrfs_is_empty_uuid(root_item.uuid)) {
-			ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
+			ret = btrfs_uuid_tree_add(trans, fs_info,
 						  root_item.uuid,
 						  BTRFS_UUID_KEY_SUBVOL,
 						  key.objectid);
@@ -4178,7 +4168,7 @@ update_tree:
 		}
 
 		if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
-			ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
+			ret = btrfs_uuid_tree_add(trans, fs_info,
 						  root_item.received_uuid,
 						 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
 						  key.objectid);
@@ -4191,7 +4181,7 @@ update_tree:
 
 skip:
 		if (trans) {
-			ret = btrfs_end_transaction(trans, fs_info->uuid_root);
+			ret = btrfs_end_transaction(trans);
 			trans = NULL;
 			if (ret)
 				break;
@@ -4216,7 +4206,7 @@ skip:
 out:
 	btrfs_free_path(path);
 	if (trans && !IS_ERR(trans))
-		btrfs_end_transaction(trans, fs_info->uuid_root);
+		btrfs_end_transaction(trans);
 	if (ret)
 		btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
 	else
@@ -4310,13 +4300,13 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
 	if (IS_ERR(uuid_root)) {
 		ret = PTR_ERR(uuid_root);
 		btrfs_abort_transaction(trans, ret);
-		btrfs_end_transaction(trans, tree_root);
+		btrfs_end_transaction(trans);
 		return ret;
 	}
 
 	fs_info->uuid_root = uuid_root;
 
-	ret = btrfs_commit_transaction(trans, tree_root);
+	ret = btrfs_commit_transaction(trans);
 	if (ret)
 		return ret;
 
@@ -4355,8 +4345,9 @@ int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
 */
 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 {
+	struct btrfs_fs_info *fs_info = device->fs_info;
+	struct btrfs_root *root = fs_info->dev_root;
 	struct btrfs_trans_handle *trans;
-	struct btrfs_root *root = device->dev_root;
 	struct btrfs_dev_extent *dev_extent = NULL;
 	struct btrfs_path *path;
 	u64 length;
@@ -4368,7 +4359,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	bool checked_pending_chunks = false;
 	struct extent_buffer *l;
 	struct btrfs_key key;
-	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
+	struct btrfs_super_block *super_copy = fs_info->super_copy;
 	u64 old_total = btrfs_super_total_bytes(super_copy);
 	u64 old_size = btrfs_device_get_total_bytes(device);
 	u64 diff = old_size - new_size;
@@ -4382,16 +4373,16 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 
 	path->reada = READA_FORWARD;
 
-	lock_chunks(root);
+	mutex_lock(&fs_info->chunk_mutex);
 
 	btrfs_device_set_total_bytes(device, new_size);
 	if (device->writeable) {
 		device->fs_devices->total_rw_bytes -= diff;
-		spin_lock(&root->fs_info->free_chunk_lock);
-		root->fs_info->free_chunk_space -= diff;
-		spin_unlock(&root->fs_info->free_chunk_lock);
+		spin_lock(&fs_info->free_chunk_lock);
+		fs_info->free_chunk_space -= diff;
+		spin_unlock(&fs_info->free_chunk_lock);
 	}
-	unlock_chunks(root);
+	mutex_unlock(&fs_info->chunk_mutex);
 
again:
 	key.objectid = device->devid;
@@ -4399,16 +4390,16 @@ again:
 	key.type = BTRFS_DEV_EXTENT_KEY;
 
 	do {
-		mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
+		mutex_lock(&fs_info->delete_unused_bgs_mutex);
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0) {
-			mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
 			goto done;
 		}
 
 		ret = btrfs_previous_item(root, path, 0, key.type);
 		if (ret)
-			mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
 		if (ret < 0)
 			goto done;
 		if (ret) {
@@ -4422,7 +4413,7 @@ again:
 		btrfs_item_key_to_cpu(l, &key, path->slots[0]);
 
 		if (key.objectid != device->devid) {
-			mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
 			btrfs_release_path(path);
 			break;
 		}
@@ -4431,7 +4422,7 @@ again:
 		length = btrfs_dev_extent_length(l, dev_extent);
 
 		if (key.offset + length <= new_size) {
-			mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+			mutex_unlock(&fs_info->delete_unused_bgs_mutex);
 			btrfs_release_path(path);
 			break;
 		}
@@ -4439,8 +4430,8 @@ again:
 		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
 		btrfs_release_path(path);
 
-		ret = btrfs_relocate_chunk(root, chunk_offset);
-		mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+		ret = btrfs_relocate_chunk(fs_info, chunk_offset);
+		mutex_unlock(&fs_info->delete_unused_bgs_mutex);
 		if (ret && ret != -ENOSPC)
 			goto done;
 		if (ret == -ENOSPC)
@@ -4463,7 +4454,7 @@ again:
 		goto done;
 	}
 
-	lock_chunks(root);
+	mutex_lock(&fs_info->chunk_mutex);
 
 	/*
 	 * We checked in the above loop all device extents that were already in
@@ -4483,11 +4474,11 @@ again:
 
 		if (contains_pending_extent(trans->transaction, device,
 					    &start, len)) {
-			unlock_chunks(root);
+			mutex_unlock(&fs_info->chunk_mutex);
 			checked_pending_chunks = true;
 			failed = 0;
 			retried = false;
-			ret = btrfs_commit_transaction(trans, root);
+			ret = btrfs_commit_transaction(trans);
 			if (ret)
 				goto done;
 			goto again;
@@ -4497,44 +4488,44 @@ again:
 	btrfs_device_set_disk_total_bytes(device, new_size);
 	if (list_empty(&device->resized_list))
 		list_add_tail(&device->resized_list,
-			      &root->fs_info->fs_devices->resized_devices);
+			      &fs_info->fs_devices->resized_devices);
 
 	WARN_ON(diff > old_total);
 	btrfs_set_super_total_bytes(super_copy, old_total - diff);
-	unlock_chunks(root);
+	mutex_unlock(&fs_info->chunk_mutex);
 	/* Now btrfs_update_device() will change the on-disk size. */
 	ret = btrfs_update_device(trans, device);
-	btrfs_end_transaction(trans, root);
+	btrfs_end_transaction(trans);
done:
 	btrfs_free_path(path);
 	if (ret) {
-		lock_chunks(root);
+		mutex_lock(&fs_info->chunk_mutex);
 		btrfs_device_set_total_bytes(device, old_size);
 		if (device->writeable)
 			device->fs_devices->total_rw_bytes += diff;
-		spin_lock(&root->fs_info->free_chunk_lock);
-		root->fs_info->free_chunk_space += diff;
-		spin_unlock(&root->fs_info->free_chunk_lock);
-		unlock_chunks(root);
+		spin_lock(&fs_info->free_chunk_lock);
+		fs_info->free_chunk_space += diff;
+		spin_unlock(&fs_info->free_chunk_lock);
+		mutex_unlock(&fs_info->chunk_mutex);
 	}
 	return ret;
 }
 
-static int btrfs_add_system_chunk(struct btrfs_root *root,
+static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
 				  struct btrfs_key *key,
 				  struct btrfs_chunk *chunk, int item_size)
 {
-	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
+	struct btrfs_super_block *super_copy = fs_info->super_copy;
 	struct btrfs_disk_key disk_key;
 	u32 array_size;
 	u8 *ptr;
 
-	lock_chunks(root);
+	mutex_lock(&fs_info->chunk_mutex);
 	array_size = btrfs_super_sys_array_size(super_copy);
 	if (array_size + item_size + sizeof(disk_key)
 			> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
-		unlock_chunks(root);
+		mutex_unlock(&fs_info->chunk_mutex);
 		return -EFBIG;
 	}
 
@@ -4545,7 +4536,7 @@ static int btrfs_add_system_chunk(struct btrfs_root *root,
 	memcpy(ptr, chunk, item_size);
 	item_size += sizeof(disk_key);
 	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
-	unlock_chunks(root);
+	mutex_unlock(&fs_info->chunk_mutex);
 
 	return 0;
 }
@@ -4583,7 +4574,7 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
 	btrfs_set_fs_incompat(info, RAID56);
 }
 
-#define BTRFS_MAX_DEVS(r) ((BTRFS_MAX_ITEM_SIZE(r)		\
+#define BTRFS_MAX_DEVS(r) ((BTRFS_MAX_ITEM_SIZE(r->fs_info)	\
 			- sizeof(struct btrfs_chunk))		\
 			/ sizeof(struct btrfs_stripe) + 1)
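[Editor's note: block geometry reads in this file now come from btrfs_fs_info rather than from a btrfs_root: fs_info->sectorsize in btrfs_init_dev_replace_tgtdev(), info->stripesize and info->sectorsize in the chunk allocator below, and fs_info->nodesize later in btrfs_read_sys_array(). A minimal sketch of the new pattern; the helper name is hypothetical and not in the patch:]

	/* hypothetical helper, assuming only fs_info->sectorsize exists */
	static inline bool btrfs_pos_sector_aligned(struct btrfs_fs_info *fs_info,
						    u64 pos)
	{
		/* geometry lives on fs_info, not on every btrfs_root */
		return IS_ALIGNED(pos, fs_info->sectorsize);
	}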
@@ -4593,10 +4584,10 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
 			/ sizeof(struct btrfs_stripe) + 1)
 
 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *extent_root, u64 start,
+			       struct btrfs_fs_info *fs_info, u64 start,
 			       u64 type)
 {
-	struct btrfs_fs_info *info = extent_root->fs_info;
+	struct btrfs_fs_info *info = trans->fs_info;
 	struct btrfs_fs_devices *fs_devices = info->fs_devices;
 	struct list_head *cur;
 	struct map_lookup *map = NULL;
@@ -4762,12 +4753,12 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 
 	if (type & BTRFS_BLOCK_GROUP_RAID5) {
 		raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
-							 extent_root->stripesize);
+							 info->stripesize);
 		data_stripes = num_stripes - 1;
 	}
 	if (type & BTRFS_BLOCK_GROUP_RAID6) {
 		raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
-							 extent_root->stripesize);
+							 info->stripesize);
 		data_stripes = num_stripes - 2;
 	}
@@ -4812,7 +4803,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 						   j * stripe_size;
 		}
 	}
-	map->sector_size = extent_root->sectorsize;
+	map->sector_size = info->sectorsize;
 	map->stripe_len = raid_stripe_len;
 	map->io_align = raid_stripe_len;
 	map->io_width = raid_stripe_len;
@@ -4821,7 +4812,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 
 	num_bytes = stripe_size * data_stripes;
 
-	trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
+	trace_btrfs_chunk_alloc(info, map, start, num_bytes);
 
 	em = alloc_extent_map();
 	if (!em) {
@@ -4837,7 +4828,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	em->block_len = em->len;
 	em->orig_block_len = stripe_size;
 
-	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
+	em_tree = &info->mapping_tree.map_tree;
 	write_lock(&em_tree->lock);
 	ret = add_extent_mapping(em_tree, em, 0);
 	if (!ret) {
@@ -4850,7 +4841,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		goto error;
 	}
 
-	ret = btrfs_make_block_group(trans, extent_root, 0, type,
+	ret = btrfs_make_block_group(trans, info, 0, type,
 				     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
 				     start, num_bytes);
 	if (ret)
@@ -4861,13 +4852,12 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
 	}
 
-	spin_lock(&extent_root->fs_info->free_chunk_lock);
-	extent_root->fs_info->free_chunk_space -= (stripe_size *
-						   map->num_stripes);
-	spin_unlock(&extent_root->fs_info->free_chunk_lock);
+	spin_lock(&info->free_chunk_lock);
+	info->free_chunk_space -= (stripe_size * map->num_stripes);
+	spin_unlock(&info->free_chunk_lock);
 
 	free_extent_map(em);
-	check_raid56_incompat_flag(extent_root->fs_info, type);
+	check_raid56_incompat_flag(info, type);
 
 	kfree(devices_info);
 	return 0;
@@ -4889,11 +4879,12 @@ error:
 }
 
 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
-				struct btrfs_root *extent_root,
+				struct btrfs_fs_info *fs_info,
 				u64 chunk_offset, u64 chunk_size)
 {
+	struct btrfs_root *extent_root = fs_info->extent_root;
+	struct btrfs_root *chunk_root = fs_info->chunk_root;
 	struct btrfs_key key;
-	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
 	struct btrfs_device *device;
 	struct btrfs_chunk *chunk;
 	struct btrfs_stripe *stripe;
@@ -4906,20 +4897,19 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 	int i = 0;
 	int ret = 0;
 
-	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
+	em_tree = &fs_info->mapping_tree.map_tree;
 	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
 	read_unlock(&em_tree->lock);
 
 	if (!em) {
-		btrfs_crit(extent_root->fs_info,
-			   "unable to find logical %Lu len %Lu",
+		btrfs_crit(fs_info, "unable to find logical %Lu len %Lu",
 			   chunk_offset, chunk_size);
 		return -EINVAL;
 	}
 
 	if (em->start != chunk_offset || em->len != chunk_size) {
-		btrfs_crit(extent_root->fs_info,
+		btrfs_crit(fs_info,
			   "found a bad mapping, wanted %Lu-%Lu, found %Lu-%Lu",
			    chunk_offset, chunk_size, em->start, em->len);
 		free_extent_map(em);
@@ -4943,7 +4933,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 	 * at any time during that final phase of the device replace operation
 	 * (dev-replace.c:btrfs_dev_replace_finishing()).
 	 */
-	mutex_lock(&chunk_root->fs_info->fs_devices->device_list_mutex);
+	mutex_lock(&fs_info->fs_devices->device_list_mutex);
 	for (i = 0; i < map->num_stripes; i++) {
 		device = map->stripes[i].dev;
 		dev_offset = map->stripes[i].physical;
@@ -4960,7 +4950,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 			break;
 	}
 	if (ret) {
-		mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex);
+		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 		goto out;
 	}
 
@@ -4974,7 +4964,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
 		stripe++;
 	}
-	mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex);
+	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 
 	btrfs_set_stack_chunk_length(chunk, chunk_size);
 	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
@@ -4983,7 +4973,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
 	btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
 	btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
-	btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
+	btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
 	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
 
 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
@@ -4996,8 +4986,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 		 * TODO: Cleanup of inserted chunk root in case of
 		 * failure.
 		 */
-		ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
-					     item_size);
+		ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
 	}
 
out:
@@ -5014,36 +5003,34 @@ out:
 * bootstrap process of adding storage to a seed btrfs.
 */
 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
-		      struct btrfs_root *extent_root, u64 type)
+		      struct btrfs_fs_info *fs_info, u64 type)
 {
 	u64 chunk_offset;
 
-	ASSERT(mutex_is_locked(&extent_root->fs_info->chunk_mutex));
-	chunk_offset = find_next_chunk(extent_root->fs_info);
-	return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
+	ASSERT(mutex_is_locked(&fs_info->chunk_mutex));
+	chunk_offset = find_next_chunk(fs_info);
+	return __btrfs_alloc_chunk(trans, fs_info, chunk_offset, type);
 }
 
 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
-					 struct btrfs_root *root,
+					 struct btrfs_fs_info *fs_info,
 					 struct btrfs_device *device)
 {
+	struct btrfs_root *extent_root = fs_info->extent_root;
 	u64 chunk_offset;
 	u64 sys_chunk_offset;
 	u64 alloc_profile;
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct btrfs_root *extent_root = fs_info->extent_root;
 	int ret;
 
 	chunk_offset = find_next_chunk(fs_info);
 	alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
-	ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset,
-				  alloc_profile);
+	ret = __btrfs_alloc_chunk(trans, fs_info, chunk_offset, alloc_profile);
 	if (ret)
 		return ret;
 
-	sys_chunk_offset = find_next_chunk(root->fs_info);
+	sys_chunk_offset = find_next_chunk(fs_info);
 	alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
-	ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
+	ret = __btrfs_alloc_chunk(trans, fs_info, sys_chunk_offset,
 				  alloc_profile);
 	return ret;
 }
@@ -5066,11 +5053,11 @@ static inline int btrfs_chunk_max_errors(struct map_lookup *map)
 	return max_errors;
 }
 
-int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
+int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
-	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
 	int readonly = 0;
 	int miss_ndevs = 0;
 	int i;
@@ -5182,14 +5169,14 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 	return ret;
 }
 
-unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
+unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
 				    struct btrfs_mapping_tree *map_tree,
 				    u64 logical)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
 	struct extent_map_tree *em_tree = &map_tree->map_tree;
-	unsigned long len = root->sectorsize;
+	unsigned long len = fs_info->sectorsize;
 
 	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, len);
@@ -5329,7 +5316,8 @@ void btrfs_put_bbio(struct btrfs_bio *bbio)
 	kfree(bbio);
 }
 
-static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
+static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
+			     enum btrfs_map_op op,
 			     u64 logical, u64 *length,
 			     struct btrfs_bio **bbio_ret,
 			     int mirror_num, int need_raid_map)
@@ -5414,7 +5402,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
 		raid56_full_stripe_start *= full_stripe_len;
 	}
 
-	if (op == REQ_OP_DISCARD) {
+	if (op == BTRFS_MAP_DISCARD) {
 		/* we don't discard raid56 yet */
 		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
 			ret = -EOPNOTSUPP;
@@ -5427,7 +5415,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
 		   For other RAID types and for RAID[56] reads, just allow a single
 		   stripe (on a single disk). */
 		if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
-		    (op == REQ_OP_WRITE)) {
+		    (op == BTRFS_MAP_WRITE)) {
 			max_len = stripe_len * nr_data_stripes(map) -
 				(offset - raid56_full_stripe_start);
 		} else {
@@ -5452,8 +5440,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
 		btrfs_dev_replace_set_lock_blocking(dev_replace);
 
 	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
-	    op != REQ_OP_WRITE && op != REQ_OP_DISCARD &&
-	    op != REQ_GET_READ_MIRRORS && dev_replace->tgtdev != NULL) {
+	    op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD &&
+	    op != BTRFS_MAP_GET_READ_MIRRORS && dev_replace->tgtdev != NULL) {
 		/*
 		 * in dev-replace case, for repair case (that's the only
 		 * case where the mirror is selected explicitly when
@@ -5474,7 +5462,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
 		int found = 0;
 		u64 physical_of_found = 0;
 
-		ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
+		ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
 			    logical, &tmp_length, &tmp_bbio, 0, 0);
 		if (ret) {
 			WARN_ON(tmp_bbio != NULL);
@@ -5484,7 +5472,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
 		tmp_num_stripes = tmp_bbio->num_stripes;
 		if (mirror_num > tmp_num_stripes) {
 			/*
-			 * REQ_GET_READ_MIRRORS does not contain this
+			 * BTRFS_MAP_GET_READ_MIRRORS does not contain this
 			 * mirror, that means that the requested area
 			 * is not left of the left cursor
 			 */
@@ -5540,17 +5528,17 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
 			     (offset + *length);
 
 	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
-		if (op == REQ_OP_DISCARD)
+		if (op == BTRFS_MAP_DISCARD)
 			num_stripes = min_t(u64, map->num_stripes,
 					    stripe_nr_end - stripe_nr_orig);
 		stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
 				&stripe_index);
-		if (op != REQ_OP_WRITE && op != REQ_OP_DISCARD &&
-		    op != REQ_GET_READ_MIRRORS)
+		if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD &&
+		    op != BTRFS_MAP_GET_READ_MIRRORS)
 			mirror_num = 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
-		if (op == REQ_OP_WRITE || op == REQ_OP_DISCARD ||
-		    op == REQ_GET_READ_MIRRORS)
+		if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD ||
+		    op == BTRFS_MAP_GET_READ_MIRRORS)
 			num_stripes = map->num_stripes;
 		else if (mirror_num)
 			stripe_index = mirror_num - 1;
@@ -5563,8 +5551,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
 		}
 
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
-		if (op == REQ_OP_WRITE || op == REQ_OP_DISCARD ||
-		    op == REQ_GET_READ_MIRRORS) {
+		if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD ||
+		    op == BTRFS_MAP_GET_READ_MIRRORS) {
 			num_stripes = map->num_stripes;
 		} else if (mirror_num) {
 			stripe_index = mirror_num - 1;
@@ -5578,9 +5566,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
 		stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
 		stripe_index *= map->sub_stripes;
 
-		if (op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS)
+		if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)
 			num_stripes = map->sub_stripes;
-		else if (op == REQ_OP_DISCARD)
+		else if (op == BTRFS_MAP_DISCARD)
 			num_stripes = min_t(u64, map->sub_stripes *
 					    (stripe_nr_end - stripe_nr_orig),
 					    map->num_stripes);
@@ -5598,7 +5586,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
 
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
 		if (need_raid_map &&
-		    (op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS ||
+		    (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS ||
 		     mirror_num > 1)) {
 			/* push stripe_nr back to the start of the full stripe */
 			stripe_nr = div_u64(raid56_full_stripe_start,
@@ -5626,8 +5614,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
 			/* We distribute the parity blocks across stripes */
 			div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
 					&stripe_index);
-			if ((op != REQ_OP_WRITE && op != REQ_OP_DISCARD &&
-			    op != REQ_GET_READ_MIRRORS) && mirror_num <= 1)
+			if ((op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD &&
+			    op != BTRFS_MAP_GET_READ_MIRRORS) && mirror_num <= 1)
 				mirror_num = 1;
 		}
 	} else {
@@ -5650,9 +5638,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
 
 	num_alloc_stripes = num_stripes;
 	if (dev_replace_is_ongoing) {
-		if (op == REQ_OP_WRITE || op == REQ_OP_DISCARD)
+		if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD)
 			num_alloc_stripes <<= 1;
-		if (op == REQ_GET_READ_MIRRORS)
+		if (op == BTRFS_MAP_GET_READ_MIRRORS)
 			num_alloc_stripes++;
 		tgtdev_indexes = num_stripes;
 	}
@@ -5668,7 +5656,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
 
 	/* build raid_map */
 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
-	    ((op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS) ||
+	    ((op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) ||
 	    mirror_num > 1)) {
 		u64 tmp;
 		unsigned rot;
@@ -5693,7 +5681,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
 				RAID6_Q_STRIPE;
 	}
 
-	if (op == REQ_OP_DISCARD) {
+	if (op == BTRFS_MAP_DISCARD) {
 		u32 factor = 0;
 		u32 sub_stripes = 0;
 		u64 stripes_per_dev = 0;
@@ -5773,7 +5761,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
 		}
 	}
 
-	if (op == REQ_OP_WRITE || op == REQ_GET_READ_MIRRORS)
+	if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)
 		max_errors = btrfs_chunk_max_errors(map);
 
 	if (bbio->raid_map)
@@ -5781,7 +5769,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
 
 	tgtdev_indexes = 0;
 	if (dev_replace_is_ongoing &&
-	    (op == REQ_OP_WRITE || op == REQ_OP_DISCARD) &&
+	    (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD) &&
 	    dev_replace->tgtdev != NULL) {
 		int index_where_to_add;
 		u64 srcdev_devid = dev_replace->srcdev->devid;
@@ -5816,7 +5804,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
 			}
 		}
 		num_stripes = index_where_to_add;
-	} else if (dev_replace_is_ongoing && (op == REQ_GET_READ_MIRRORS) &&
+	} else if (dev_replace_is_ongoing &&
+		   op == BTRFS_MAP_GET_READ_MIRRORS &&
 		   dev_replace->tgtdev != NULL) {
 		u64 srcdev_devid = dev_replace->srcdev->devid;
 		int index_srcdev = 0;
@@ -5888,7 +5877,7 @@ out:
 	return ret;
 }
 
-int btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
+int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 		      u64 logical, u64 *length,
 		      struct btrfs_bio **bbio_ret, int mirror_num)
 {
@@ -5897,7 +5886,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int op,
 }
 
 /* For Scrub/replace */
-int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int op,
+int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
 		     u64 logical, u64 *length,
 		     struct btrfs_bio **bbio_ret, int mirror_num,
 		     int need_raid_map)
@@ -6023,7 +6012,7 @@ static void btrfs_end_bio(struct bio *bio)
 				else
 					btrfs_dev_stat_inc(dev,
 						BTRFS_DEV_STAT_READ_ERRS);
-				if ((bio->bi_opf & WRITE_FLUSH) == WRITE_FLUSH)
+				if (bio->bi_opf & REQ_PREFLUSH)
 					btrfs_dev_stat_inc(dev,
 						BTRFS_DEV_STAT_FLUSH_ERRS);
 				btrfs_dev_stat_print_on_error(dev);
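[Editor's note: two block-layer flag cleanups land in the bio paths here. The composite WRITE_FLUSH mask had to be compared back against itself, while REQ_PREFLUSH is a single bit; the sync check likewise moves to the op_is_sync() helper in btrfs_schedule_bio() below. Side by side, as the hunks show:]

	/* 4.10: composite mask, full equality test required */
	if ((bio->bi_opf & WRITE_FLUSH) == WRITE_FLUSH)
		btrfs_dev_stat_inc(dev, BTRFS_DEV_STAT_FLUSH_ERRS);

	/* 4.11: preflush is one request flag */
	if (bio->bi_opf & REQ_PREFLUSH)
		btrfs_dev_stat_inc(dev, BTRFS_DEV_STAT_FLUSH_ERRS);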
@@ -6069,10 +6058,10 @@ static void btrfs_end_bio(struct bio *bio)
 * This will add one bio to the pending list for a device and make sure
 * the work struct is scheduled.
 */
-static noinline void btrfs_schedule_bio(struct btrfs_root *root,
-					struct btrfs_device *device,
+static noinline void btrfs_schedule_bio(struct btrfs_device *device,
					struct bio *bio)
 {
+	struct btrfs_fs_info *fs_info = device->fs_info;
 	int should_queue = 1;
 	struct btrfs_pending_bios *pending_bios;
 
@@ -6095,12 +6084,12 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root,
 	 * made progress against dirty pages when we've really just put it
	 * on a queue for later
	 */
-	atomic_inc(&root->fs_info->nr_async_bios);
+	atomic_inc(&fs_info->nr_async_bios);
 	WARN_ON(bio->bi_next);
 	bio->bi_next = NULL;
 
 	spin_lock(&device->io_lock);
-	if (bio->bi_opf & REQ_SYNC)
+	if (op_is_sync(bio->bi_opf))
 		pending_bios = &device->pending_sync_bios;
 	else
 		pending_bios = &device->pending_bios;
@@ -6117,15 +6106,14 @@ static noinline void btrfs_schedule_bio(struct btrfs_root *root,
 	spin_unlock(&device->io_lock);
 
 	if (should_queue)
-		btrfs_queue_work(root->fs_info->submit_workers,
-				 &device->work);
+		btrfs_queue_work(fs_info->submit_workers, &device->work);
 }
 
-static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
-			      struct bio *bio, u64 physical, int dev_nr,
-			      int async)
+static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
			      u64 physical, int dev_nr, int async)
 {
 	struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
+	struct btrfs_fs_info *fs_info = bbio->fs_info;
 
 	bio->bi_private = bbio;
 	btrfs_io_bio(bio)->stripe_index = dev_nr;
@@ -6148,10 +6136,10 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
#endif
 	bio->bi_bdev = dev->bdev;
 
-	btrfs_bio_counter_inc_noblocked(root->fs_info);
+	btrfs_bio_counter_inc_noblocked(fs_info);
 
 	if (async)
-		btrfs_schedule_bio(root, dev, bio);
+		btrfs_schedule_bio(dev, bio);
 	else
 		btrfsic_submit_bio(bio);
 }
@@ -6170,7 +6158,7 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
 	}
 }
 
-int btrfs_map_bio(struct btrfs_root *root, struct bio *bio,
+int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
		  int mirror_num, int async_submit)
 {
 	struct btrfs_device *dev;
@@ -6186,11 +6174,11 @@ int btrfs_map_bio(struct btrfs_root *root, struct bio *bio,
 	length = bio->bi_iter.bi_size;
 	map_length = length;
 
-	btrfs_bio_counter_inc_blocked(root->fs_info);
-	ret = __btrfs_map_block(root->fs_info, bio_op(bio), logical,
+	btrfs_bio_counter_inc_blocked(fs_info);
+	ret = __btrfs_map_block(fs_info, bio_op(bio), logical,
				&map_length, &bbio, mirror_num, 1);
 	if (ret) {
-		btrfs_bio_counter_dec(root->fs_info);
+		btrfs_bio_counter_dec(fs_info);
 		return ret;
 	}
 
@@ -6198,7 +6186,7 @@ int btrfs_map_bio(struct btrfs_root *root, struct bio *bio,
 	bbio->orig_bio = first_bio;
 	bbio->private = first_bio->bi_private;
 	bbio->end_io = first_bio->bi_end_io;
-	bbio->fs_info = root->fs_info;
+	bbio->fs_info = fs_info;
 	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
 
 	if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
@@ -6206,18 +6194,19 @@ int btrfs_map_bio(struct btrfs_root *root, struct bio *bio,
 		/* In this case, map_length has been set to the length of
		   a single stripe; not the whole write */
 		if (bio_op(bio) == REQ_OP_WRITE) {
-			ret = raid56_parity_write(root, bio, bbio, map_length);
+			ret = raid56_parity_write(fs_info, bio, bbio,
+						  map_length);
 		} else {
-			ret = raid56_parity_recover(root, bio, bbio, map_length,
-						    mirror_num, 1);
+			ret = raid56_parity_recover(fs_info, bio, bbio,
						    map_length, mirror_num, 1);
 		}
 
-		btrfs_bio_counter_dec(root->fs_info);
+		btrfs_bio_counter_dec(fs_info);
 		return ret;
 	}
 
 	if (map_length < length) {
-		btrfs_crit(root->fs_info,
+		btrfs_crit(fs_info,
			   "mapping failed logical %llu bio len %llu len %llu",
			   logical, length, map_length);
 		BUG();
@@ -6237,11 +6226,10 @@ int btrfs_map_bio(struct btrfs_root *root, struct bio *bio,
 		} else
 			bio = first_bio;
 
-		submit_stripe_bio(root, bbio, bio,
-				  bbio->stripes[dev_nr].physical, dev_nr,
-				  async_submit);
+		submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
+				  dev_nr, async_submit);
 	}
-	btrfs_bio_counter_dec(root->fs_info);
+	btrfs_bio_counter_dec(fs_info);
 	return 0;
 }
 
@@ -6265,8 +6253,7 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
 	return NULL;
 }
 
-static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
-					    struct btrfs_fs_devices *fs_devices,
+static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
					    u64 devid, u8 *dev_uuid)
 {
 	struct btrfs_device *device;
@@ -6337,7 +6324,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
 }
 /* Return -EIO if any error, otherwise return 0. */
-static int btrfs_check_chunk_valid(struct btrfs_root *root,
+static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
				   struct extent_buffer *leaf,
				   struct btrfs_chunk *chunk, u64 logical)
 {
@@ -6354,33 +6341,31 @@ static int btrfs_check_chunk_valid(struct btrfs_root *root,
 	type = btrfs_chunk_type(leaf, chunk);
 
 	if (!num_stripes) {
-		btrfs_err(root->fs_info, "invalid chunk num_stripes: %u",
+		btrfs_err(fs_info, "invalid chunk num_stripes: %u",
			  num_stripes);
 		return -EIO;
 	}
-	if (!IS_ALIGNED(logical, root->sectorsize)) {
-		btrfs_err(root->fs_info,
-			  "invalid chunk logical %llu", logical);
+	if (!IS_ALIGNED(logical, fs_info->sectorsize)) {
+		btrfs_err(fs_info, "invalid chunk logical %llu", logical);
 		return -EIO;
 	}
-	if (btrfs_chunk_sector_size(leaf, chunk) != root->sectorsize) {
-		btrfs_err(root->fs_info, "invalid chunk sectorsize %u",
+	if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) {
+		btrfs_err(fs_info, "invalid chunk sectorsize %u",
			  btrfs_chunk_sector_size(leaf, chunk));
 		return -EIO;
 	}
-	if (!length || !IS_ALIGNED(length, root->sectorsize)) {
-		btrfs_err(root->fs_info,
-			  "invalid chunk length %llu", length);
+	if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) {
+		btrfs_err(fs_info, "invalid chunk length %llu", length);
		return -EIO;
 	}
 	if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) {
-		btrfs_err(root->fs_info, "invalid chunk stripe length: %llu",
+		btrfs_err(fs_info, "invalid chunk stripe length: %llu",
			  stripe_len);
		return -EIO;
 	}
 	if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
 	    type) {
-		btrfs_err(root->fs_info, "unrecognized chunk type: %llu",
+		btrfs_err(fs_info, "unrecognized chunk type: %llu",
			  ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
			    BTRFS_BLOCK_GROUP_PROFILE_MASK) &
			  btrfs_chunk_type(leaf, chunk));
@@ -6393,7 +6378,7 @@ static int btrfs_check_chunk_valid(struct btrfs_root *root,
	    (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) ||
	    ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
	     num_stripes != 1)) {
-		btrfs_err(root->fs_info,
+		btrfs_err(fs_info,
			"invalid num_stripes:sub_stripes %u:%u for profile %llu",
			num_stripes, sub_stripes,
			type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
@@ -6403,11 +6388,11 @@ static int btrfs_check_chunk_valid(struct btrfs_root *root,
 	return 0;
 }
 
-static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
+static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
			  struct extent_buffer *leaf,
			  struct btrfs_chunk *chunk)
 {
-	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
 	struct map_lookup *map;
 	struct extent_map *em;
 	u64 logical;
@@ -6424,7 +6409,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
 	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
 
-	ret = btrfs_check_chunk_valid(root, leaf, chunk, logical);
+	ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical);
 	if (ret)
		return ret;
@@ -6471,23 +6456,22 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 		read_extent_buffer(leaf, uuid, (unsigned long)
				   btrfs_stripe_dev_uuid_nr(chunk, i),
				   BTRFS_UUID_SIZE);
-		map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
+		map->stripes[i].dev = btrfs_find_device(fs_info, devid,
							uuid, NULL);
 		if (!map->stripes[i].dev &&
-		    !btrfs_test_opt(root->fs_info, DEGRADED)) {
+		    !btrfs_test_opt(fs_info, DEGRADED)) {
 			free_extent_map(em);
 			return -EIO;
 		}
 		if (!map->stripes[i].dev) {
 			map->stripes[i].dev =
-				add_missing_dev(root, root->fs_info->fs_devices,
-						devid, uuid);
+				add_missing_dev(fs_info->fs_devices, devid,
+						uuid);
 			if (!map->stripes[i].dev) {
 				free_extent_map(em);
 				return -EIO;
 			}
-			btrfs_warn(root->fs_info,
-				   "devid %llu uuid %pU is missing",
+			btrfs_warn(fs_info, "devid %llu uuid %pU is missing",
				   devid, uuid);
 		}
 		map->stripes[i].dev->in_fs_metadata = 1;
@@ -6525,7 +6509,7 @@ static void fill_device_from_item(struct extent_buffer *leaf,
 	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
 }
 
-static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root,
+static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
						  u8 *fsid)
 {
 	struct btrfs_fs_devices *fs_devices;
@@ -6533,7 +6517,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root,
 	BUG_ON(!mutex_is_locked(&uuid_mutex));
 
-	fs_devices = root->fs_info->fs_devices->seed;
+	fs_devices = fs_info->fs_devices->seed;
 	while (fs_devices) {
 		if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE))
			return fs_devices;
@@ -6543,7 +6527,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root,
 
 	fs_devices = find_fsid(fsid);
 	if (!fs_devices) {
-		if (!btrfs_test_opt(root->fs_info, DEGRADED))
+		if (!btrfs_test_opt(fs_info, DEGRADED))
			return ERR_PTR(-ENOENT);
 
 		fs_devices = alloc_fs_devices(fsid);
@@ -6560,7 +6544,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root,
		return fs_devices;
 
 	ret = __btrfs_open_devices(fs_devices, FMODE_READ,
-				   root->fs_info->bdev_holder);
+				   fs_info->bdev_holder);
 	if (ret) {
 		free_fs_devices(fs_devices);
 		fs_devices = ERR_PTR(ret);
@@ -6574,17 +6558,17 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root,
 		goto out;
 	}
 
-	fs_devices->seed = root->fs_info->fs_devices->seed;
-	root->fs_info->fs_devices->seed = fs_devices;
+	fs_devices->seed = fs_info->fs_devices->seed;
+	fs_info->fs_devices->seed = fs_devices;
out:
 	return fs_devices;
 }
 
-static int read_one_dev(struct btrfs_root *root,
+static int read_one_dev(struct btrfs_fs_info *fs_info,
			struct extent_buffer *leaf,
			struct btrfs_dev_item *dev_item)
 {
-	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 	struct btrfs_device *device;
 	u64 devid;
 	int ret;
@@ -6597,24 +6581,24 @@ static int read_one_dev(struct btrfs_root *root,
 	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
			   BTRFS_UUID_SIZE);
 
-	if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
-		fs_devices = open_seed_devices(root, fs_uuid);
+	if (memcmp(fs_uuid, fs_info->fsid, BTRFS_UUID_SIZE)) {
+		fs_devices = open_seed_devices(fs_info, fs_uuid);
 		if (IS_ERR(fs_devices))
			return PTR_ERR(fs_devices);
 	}
 
-	device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
+	device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
 	if (!device) {
-		if (!btrfs_test_opt(root->fs_info, DEGRADED))
+		if (!btrfs_test_opt(fs_info, DEGRADED))
			return -EIO;
 
-		device = add_missing_dev(root, fs_devices, devid, dev_uuid);
+		device = add_missing_dev(fs_devices, devid, dev_uuid);
 		if (!device)
			return -ENOMEM;
-		btrfs_warn(root->fs_info, "devid %llu uuid %pU missing",
+		btrfs_warn(fs_info, "devid %llu uuid %pU missing",
			   devid, dev_uuid);
 	} else {
-		if (!device->bdev && !btrfs_test_opt(root->fs_info, DEGRADED))
+		if (!device->bdev && !btrfs_test_opt(fs_info, DEGRADED))
			return -EIO;
 
		if(!device->bdev && !device->missing) {
@@ -6643,7 +6627,7 @@ static int read_one_dev(struct btrfs_root *root,
 		}
 	}
-	if (device->fs_devices != root->fs_info->fs_devices) {
+	if (device->fs_devices != fs_info->fs_devices) {
 		BUG_ON(device->writeable);
 		if (device->generation !=
		    btrfs_device_generation(leaf, dev_item))
			return -EINVAL;
@@ -6654,18 +6638,18 @@ static int read_one_dev(struct btrfs_root *root,
 	device->in_fs_metadata = 1;
 	if (device->writeable && !device->is_tgtdev_for_dev_replace) {
 		device->fs_devices->total_rw_bytes += device->total_bytes;
-		spin_lock(&root->fs_info->free_chunk_lock);
-		root->fs_info->free_chunk_space += device->total_bytes -
+		spin_lock(&fs_info->free_chunk_lock);
+		fs_info->free_chunk_space += device->total_bytes -
			device->bytes_used;
-		spin_unlock(&root->fs_info->free_chunk_lock);
+		spin_unlock(&fs_info->free_chunk_lock);
 	}
 	ret = 0;
 	return ret;
 }
 
-int btrfs_read_sys_array(struct btrfs_root *root)
+int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_root *root = fs_info->tree_root;
 	struct btrfs_super_block *super_copy = fs_info->super_copy;
 	struct extent_buffer *sb;
 	struct btrfs_disk_key *disk_key;
@@ -6680,13 +6664,13 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 	u64 type;
 	struct btrfs_key key;
 
-	ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize);
+	ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
 	/*
	 * This will create extent buffer of nodesize, superblock size is
	 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
	 * overallocate but we can keep it as-is, only the first page is used.
	 */
-	sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET);
+	sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
 	if (IS_ERR(sb))
		return PTR_ERR(sb);
 	set_extent_buffer_uptodate(sb);
@@ -6757,7 +6741,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 			if (cur_offset + len > array_size)
				goto out_short_read;
 
-			ret = read_one_chunk(root, &key, sb, chunk);
+			ret = read_one_chunk(fs_info, &key, sb, chunk);
 			if (ret)
				break;
 		} else {
@@ -6783,8 +6767,9 @@ out_short_read:
 	return -EIO;
 }
 
-int btrfs_read_chunk_tree(struct btrfs_root *root)
+int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
 {
+	struct btrfs_root *root = fs_info->chunk_root;
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
@@ -6793,14 +6778,12 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
 	int slot;
 	u64 total_dev = 0;
 
-	root = root->fs_info->chunk_root;
-
 	path = btrfs_alloc_path();
 	if (!path)
		return -ENOMEM;
 
 	mutex_lock(&uuid_mutex);
-	lock_chunks(root);
+	mutex_lock(&fs_info->chunk_mutex);
 
 	/*
	 * Read all device items, and then all the chunk items. All
@@ -6830,14 +6813,14 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
 			struct btrfs_dev_item *dev_item;
 			dev_item = btrfs_item_ptr(leaf, slot,
						  struct btrfs_dev_item);
-			ret = read_one_dev(root, leaf, dev_item);
+			ret = read_one_dev(fs_info, leaf, dev_item);
 			if (ret)
				goto error;
 			total_dev++;
 		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
 			struct btrfs_chunk *chunk;
 			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
-			ret = read_one_chunk(root, &found_key, leaf, chunk);
+			ret = read_one_chunk(fs_info, &found_key, leaf, chunk);
 			if (ret)
				goto error;
 		}
@@ -6848,26 +6831,26 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
 	/*
	 * After loading chunk tree, we've got all device information,
	 * do another round of validation checks.
	 */
-	if (total_dev != root->fs_info->fs_devices->total_devices) {
-		btrfs_err(root->fs_info,
+	if (total_dev != fs_info->fs_devices->total_devices) {
+		btrfs_err(fs_info,
	   "super_num_devices %llu mismatch with num_devices %llu found here",
-			  btrfs_super_num_devices(root->fs_info->super_copy),
+			  btrfs_super_num_devices(fs_info->super_copy),
			  total_dev);
 		ret = -EINVAL;
 		goto error;
 	}
-	if (btrfs_super_total_bytes(root->fs_info->super_copy) <
-	    root->fs_info->fs_devices->total_rw_bytes) {
-		btrfs_err(root->fs_info,
+	if (btrfs_super_total_bytes(fs_info->super_copy) <
+	    fs_info->fs_devices->total_rw_bytes) {
+		btrfs_err(fs_info,
	"super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
-			  btrfs_super_total_bytes(root->fs_info->super_copy),
-			  root->fs_info->fs_devices->total_rw_bytes);
+			  btrfs_super_total_bytes(fs_info->super_copy),
+			  fs_info->fs_devices->total_rw_bytes);
 		ret = -EINVAL;
 		goto error;
 	}
 	ret = 0;
error:
-	unlock_chunks(root);
+	mutex_unlock(&fs_info->chunk_mutex);
 	mutex_unlock(&uuid_mutex);
 
 	btrfs_free_path(path);
@@ -6882,7 +6865,7 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
 	while (fs_devices) {
 		mutex_lock(&fs_devices->device_list_mutex);
 		list_for_each_entry(device, &fs_devices->devices, dev_list)
-			device->dev_root = fs_info->dev_root;
+			device->fs_info = fs_info;
 		mutex_unlock(&fs_devices->device_list_mutex);
 
 		fs_devices = fs_devices->seed;
@@ -6959,9 +6942,10 @@ out:
 }
 
 static int update_dev_stat_item(struct btrfs_trans_handle *trans,
-				struct btrfs_root *dev_root,
+				struct btrfs_fs_info *fs_info,
				struct btrfs_device *device)
 {
+	struct btrfs_root *dev_root = fs_info->dev_root;
 	struct btrfs_path *path;
 	struct btrfs_key key;
 	struct extent_buffer *eb;
@@ -6977,7 +6961,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
 	BUG_ON(!path);
 	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
 	if (ret < 0) {
-		btrfs_warn_in_rcu(dev_root->fs_info,
+		btrfs_warn_in_rcu(fs_info,
			"error %d while searching for dev_stats item for device %s",
			      ret, rcu_str_deref(device->name));
 		goto out;
@@ -6988,7 +6972,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
 		/* need to delete old one and insert a new one */
 		ret = btrfs_del_item(trans, dev_root, path);
 		if (ret != 0) {
-			btrfs_warn_in_rcu(dev_root->fs_info,
+			btrfs_warn_in_rcu(fs_info,
				"delete too small dev_stats item for device %s failed %d",
				      rcu_str_deref(device->name), ret);
			goto out;
@@ -7002,7 +6986,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
 		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
 		if (ret < 0) {
-			btrfs_warn_in_rcu(dev_root->fs_info,
+			btrfs_warn_in_rcu(fs_info,
				"insert dev_stats item for device %s failed %d",
				rcu_str_deref(device->name), ret);
			goto out;
@@ -7027,7 +7011,6 @@ out:
 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
			struct btrfs_fs_info *fs_info)
 {
-	struct btrfs_root *dev_root = fs_info->dev_root;
 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 	struct btrfs_device *device;
 	int stats_cnt;
@@ -7039,7 +7022,7 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
			continue;
 
 		stats_cnt = atomic_read(&device->dev_stats_ccnt);
-		ret = update_dev_stat_item(trans, dev_root, device);
+		ret = update_dev_stat_item(trans, fs_info, device);
 		if (!ret)
			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
 	}
@@ -7058,7 +7041,7 @@ static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
 {
 	if (!dev->dev_stats_valid)
		return;
-	btrfs_err_rl_in_rcu(dev->dev_root->fs_info,
+	btrfs_err_rl_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
			   rcu_str_deref(dev->name),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
@@ -7078,7 +7061,7 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
 	if (i == BTRFS_DEV_STAT_VALUES_MAX)
		return; /* all values == 0, suppress message */
 
-	btrfs_info_in_rcu(dev->dev_root->fs_info,
+	btrfs_info_in_rcu(dev->fs_info,
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
	       rcu_str_deref(dev->name),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
@@ -7088,24 +7071,22 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
 }
 
-int btrfs_get_dev_stats(struct btrfs_root *root,
+int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
			struct btrfs_ioctl_get_dev_stats *stats)
 {
 	struct btrfs_device *dev;
-	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 	int i;
 
 	mutex_lock(&fs_devices->device_list_mutex);
-	dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
+	dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL);
 	mutex_unlock(&fs_devices->device_list_mutex);
 
 	if (!dev) {
-		btrfs_warn(root->fs_info,
-			   "get dev_stats failed, device not found");
+		btrfs_warn(fs_info, "get dev_stats failed, device not found");
		return -ENODEV;
 	} else if (!dev->dev_stats_valid) {
-		btrfs_warn(root->fs_info,
-			   "get dev_stats failed, not yet valid");
+		btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
		return -ENODEV;
 	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
 		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
@@ -7168,18 +7149,18 @@ void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
		return;
 
 	mutex_lock(&fs_devices->device_list_mutex);
-	lock_chunks(fs_info->dev_root);
+	mutex_lock(&fs_info->chunk_mutex);
 	list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
				 resized_list) {
 		list_del_init(&curr->resized_list);
 		curr->commit_total_bytes = curr->disk_total_bytes;
 	}
-	unlock_chunks(fs_info->dev_root);
+	mutex_unlock(&fs_info->chunk_mutex);
 	mutex_unlock(&fs_devices->device_list_mutex);
 }
 
 /* Must be invoked during the transaction commit */
-void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
+void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info,
					struct btrfs_transaction *transaction)
 {
 	struct extent_map *em;
@@ -7191,7 +7172,7 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
		return;
 
 	/* In order to kick the device replace finish process */
-	lock_chunks(root);
+	mutex_lock(&fs_info->chunk_mutex);
 	list_for_each_entry(em, &transaction->pending_chunks, list) {
 		map = em->map_lookup;
@@ -7200,7 +7181,7 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
			dev->commit_bytes_used = dev->bytes_used;
 		}
 	}
-	unlock_chunks(root);
+	mutex_unlock(&fs_info->chunk_mutex);
 }
 
 void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
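[Editor's note: the volumes.h hunks below introduce enum btrfs_map_op, replacing the raw REQ_OP_* and REQ_GET_READ_MIRRORS values that __btrfs_map_block() compared against earlier in this diff. A hedged usage sketch built only from the declarations that follow; the variable setup is illustrative, not taken from the patch:]

	struct btrfs_bio *bbio = NULL;
	u64 length = map_length;              /* illustrative: length is in/out */
	enum btrfs_map_op op = btrfs_op(bio); /* translate the bio op once */

	ret = btrfs_map_block(fs_info, op, logical, &length, &bbio, mirror_num);

[Note the deliberate fallthrough in btrfs_op(): the WARN_ON_ONCE(1) default drops into the REQ_OP_READ case, so an unexpected bio op degrades to a read mapping instead of an out-of-range value.]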
bios */ struct btrfs_pending_bios pending_sync_bios; struct block_device *bdev; @@ -371,27 +370,48 @@ struct btrfs_balance_control { struct btrfs_balance_progress stat; }; +enum btrfs_map_op { + BTRFS_MAP_READ, + BTRFS_MAP_WRITE, + BTRFS_MAP_DISCARD, + BTRFS_MAP_GET_READ_MIRRORS, +}; + +static inline enum btrfs_map_op btrfs_op(struct bio *bio) +{ + switch (bio_op(bio)) { + case REQ_OP_DISCARD: + return BTRFS_MAP_DISCARD; + case REQ_OP_WRITE: + return BTRFS_MAP_WRITE; + default: + WARN_ON_ONCE(1); + case REQ_OP_READ: + return BTRFS_MAP_READ; + } +} + int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, u64 end, u64 *length); void btrfs_get_bbio(struct btrfs_bio *bbio); void btrfs_put_bbio(struct btrfs_bio *bbio); -int btrfs_map_block(struct btrfs_fs_info *fs_info, int op, +int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num); -int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int op, +int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_bio **bbio_ret, int mirror_num, int need_raid_map); int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, u64 physical, u64 devid, u64 **logical, int *naddrs, int *stripe_len); -int btrfs_read_sys_array(struct btrfs_root *root); -int btrfs_read_chunk_tree(struct btrfs_root *root); +int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); +int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, u64 type); + struct btrfs_fs_info *fs_info, u64 type); void btrfs_mapping_init(struct btrfs_mapping_tree *tree); void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); -int btrfs_map_bio(struct btrfs_root *root, struct bio *bio, +int btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num, int async_submit); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); @@ -401,16 +421,17 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step); void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info, struct btrfs_device *device, struct btrfs_device *this_dev); -int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, +int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info, char *device_path, struct btrfs_device **device); -int btrfs_find_device_by_devspec(struct btrfs_root *root, u64 devid, +int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, char *devpath, struct btrfs_device **device); struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, const u8 *uuid); -int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid); +int btrfs_rm_device(struct btrfs_fs_info *fs_info, + char *device_path, u64 devid); void btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, @@ -418,8 +439,9 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, u8 *uuid, u8 *fsid); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); -int btrfs_init_new_device(struct btrfs_root *root, char *path); -int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char 
*device_path, +int btrfs_init_new_device(struct btrfs_fs_info *fs_info, char *path); +int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, + char *device_path, struct btrfs_device *srcdev, struct btrfs_device **device_out); int btrfs_balance(struct btrfs_balance_control *bctl, @@ -430,7 +452,7 @@ int btrfs_pause_balance(struct btrfs_fs_info *fs_info); int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info); -int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); +int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset); int find_free_dev_extent_start(struct btrfs_transaction *transaction, struct btrfs_device *device, u64 num_bytes, u64 search_start, u64 *start, u64 *max_avail); @@ -438,7 +460,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); -int btrfs_get_dev_stats(struct btrfs_root *root, +int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_get_dev_stats *stats); void btrfs_init_devices_late(struct btrfs_fs_info *fs_info); int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info); @@ -455,14 +477,14 @@ void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path); int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len, int mirror_num); -unsigned long btrfs_full_stripe_len(struct btrfs_root *root, +unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, struct btrfs_mapping_tree *map_tree, u64 logical); int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, - struct btrfs_root *extent_root, + struct btrfs_fs_info *fs_info, u64 chunk_offset, u64 chunk_size); int btrfs_remove_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 chunk_offset); + struct btrfs_fs_info *fs_info, u64 chunk_offset); static inline int btrfs_dev_stats_dirty(struct btrfs_device *dev) { @@ -509,19 +531,9 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, } void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info); -void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, +void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info, struct btrfs_transaction *transaction); -static inline void lock_chunks(struct btrfs_root *root) -{ - mutex_lock(&root->fs_info->chunk_mutex); -} - -static inline void unlock_chunks(struct btrfs_root *root) -{ - mutex_unlock(&root->fs_info->chunk_mutex); -} - struct list_head *btrfs_get_fs_uuids(void); void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index fccbf5567e78..9621c7f2503e 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -94,11 +94,12 @@ static int do_setxattr(struct btrfs_trans_handle *trans, { struct btrfs_dir_item *di = NULL; struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; size_t name_len = strlen(name); int ret = 0; - if (name_len + size > BTRFS_MAX_XATTR_SIZE(root)) + if (name_len + size > BTRFS_MAX_XATTR_SIZE(root->fs_info)) return -ENOSPC; path = btrfs_alloc_path(); @@ -149,14 +150,14 @@ static int do_setxattr(struct btrfs_trans_handle 
*trans, */ ret = 0; btrfs_assert_tree_locked(path->nodes[0]); - di = btrfs_match_dir_item_name(root, path, name, name_len); + di = btrfs_match_dir_item_name(fs_info, path, name, name_len); if (!di && !(flags & XATTR_REPLACE)) { ret = -ENOSPC; goto out; } } else if (ret == -EEXIST) { ret = 0; - di = btrfs_match_dir_item_name(root, path, name, name_len); + di = btrfs_match_dir_item_name(fs_info, path, name, name_len); ASSERT(di); /* logic error */ } else if (ret) { goto out; @@ -185,7 +186,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans, char *ptr; if (size > old_data_len) { - if (btrfs_leaf_free_space(root, leaf) < + if (btrfs_leaf_free_space(fs_info, leaf) < (size - old_data_len)) { ret = -ENOSPC; goto out; @@ -195,16 +196,17 @@ static int do_setxattr(struct btrfs_trans_handle *trans, if (old_data_len + name_len + sizeof(*di) == item_size) { /* No other xattrs packed in the same leaf item. */ if (size > old_data_len) - btrfs_extend_item(root, path, + btrfs_extend_item(fs_info, path, size - old_data_len); else if (size < old_data_len) - btrfs_truncate_item(root, path, data_size, 1); + btrfs_truncate_item(fs_info, path, + data_size, 1); } else { /* There are other xattrs packed in the same item. */ ret = btrfs_delete_one_dir_name(trans, root, path, di); if (ret) goto out; - btrfs_extend_item(root, path, data_size); + btrfs_extend_item(fs_info, path, data_size); } item = btrfs_item_nr(slot); @@ -257,7 +259,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); out: - btrfs_end_transaction(trans, root); + btrfs_end_transaction(trans); return ret; } @@ -265,6 +267,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) { struct btrfs_key key; struct inode *inode = d_inode(dentry); + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; int ret = 0; @@ -333,7 +336,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) u32 this_len = sizeof(*di) + name_len + data_len; unsigned long name_ptr = (unsigned long)(di + 1); - if (verify_dir_item(root, leaf, di)) { + if (verify_dir_item(fs_info, leaf, di)) { ret = -EIO; goto err; } diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 441b81a3e545..da497f184ff4 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -210,10 +210,9 @@ out: return ret; } -static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, +static int zlib_decompress_bio(struct list_head *ws, struct page **pages_in, u64 disk_start, - struct bio_vec *bvec, - int vcnt, + struct bio *orig_bio, size_t srclen) { struct workspace *workspace = list_entry(ws, struct workspace, list); @@ -222,10 +221,8 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, char *data_in; size_t total_out = 0; unsigned long page_in_index = 0; - unsigned long page_out_index = 0; unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); unsigned long buf_start; - unsigned long pg_offset; data_in = kmap(pages_in[page_in_index]); workspace->strm.next_in = data_in; @@ -235,7 +232,6 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, workspace->strm.total_out = 0; workspace->strm.next_out = workspace->buf; workspace->strm.avail_out = PAGE_SIZE; - pg_offset = 0; /* If it's deflate, and it's got no preset dictionary, then we can tell zlib to skip the adler32 check. 
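The first zlib hunk below fixes a mapping leak: zlib_decompress_bio() kmaps the first input page before calling zlib_inflateInit2(), so the early -EIO return must kunmap it. The invariant is the usual one, every kmap() balanced on every exit path; a minimal sketch with hypothetical helpers (header_valid() and consume() are placeholders, not real APIs):

	/* Sketch: keep kmap()/kunmap() balanced on the error path too. */
	static int process_page(struct page *page)
	{
		char *data = kmap(page);	/* pins a temporary kernel mapping */
		int ret = 0;

		if (!header_valid(data))	/* hypothetical check */
			ret = -EIO;		/* fall through to the unmap */
		else
			consume(data);		/* hypothetical */

		kunmap(page);			/* exactly one unmap per map */
		return ret;
	}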
*/ @@ -250,6 +246,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) { pr_warn("BTRFS: inflateInit failed\n"); + kunmap(pages_in[page_in_index]); return -EIO; } while (workspace->strm.total_in < srclen) { @@ -266,8 +263,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start, total_out, disk_start, - bvec, vcnt, - &page_out_index, &pg_offset); + orig_bio); if (ret2 == 0) { ret = 0; goto done; @@ -300,7 +296,7 @@ done: if (data_in) kunmap(pages_in[page_in_index]); if (!ret) - btrfs_clear_biovec_end(bvec, vcnt, page_out_index, pg_offset); + zero_fill_bio(orig_bio); return ret; } @@ -407,6 +403,6 @@ const struct btrfs_compress_op btrfs_zlib_compress = { .alloc_workspace = zlib_alloc_workspace, .free_workspace = zlib_free_workspace, .compress_pages = zlib_compress_pages, - .decompress_biovec = zlib_decompress_biovec, + .decompress_bio = zlib_decompress_bio, .decompress = zlib_decompress, }; diff --git a/fs/buffer.c b/fs/buffer.c index b205a629001d..0e87401cf335 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -43,6 +43,7 @@ #include <linux/bitops.h> #include <linux/mpage.h> #include <linux/bit_spinlock.h> +#include <linux/pagevec.h> #include <trace/events/block.h> static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); @@ -753,7 +754,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) * still in flight on potentially older * contents. */ - write_dirty_buffer(bh, WRITE_SYNC); + write_dirty_buffer(bh, REQ_SYNC); /* * Kick off IO for the previous mapping. Note @@ -1604,37 +1605,80 @@ void create_empty_buffers(struct page *page, } EXPORT_SYMBOL(create_empty_buffers); -/* - * We are taking a block for data and we don't want any output from any - * buffer-cache aliases starting from return from that function and - * until the moment when something will explicitly mark the buffer - * dirty (hopefully that will not happen until we will free that block ;-) - * We don't even need to mark it not-uptodate - nobody can expect - * anything from a newly allocated buffer anyway. We used to used - * unmap_buffer() for such invalidation, but that was wrong. We definitely - * don't want to mark the alias unmapped, for example - it would confuse - * anyone who might pick it with bread() afterwards... - * - * Also.. Note that bforget() doesn't lock the buffer. So there can - * be writeout I/O going on against recently-freed buffers. We don't - * wait on that I/O in bforget() - it's more efficient to wait on the I/O - * only if we really need to. That happens here. - */ -void unmap_underlying_metadata(struct block_device *bdev, sector_t block) +/** + * clean_bdev_aliases: clean a range of buffers in block device + * @bdev: Block device to clean buffers in + * @block: Start of a range of blocks to clean + * @len: Number of blocks to clean + * + * We are taking a range of blocks for data and we don't want writeback of any + * buffer-cache aliases starting from return from this function and until the + * moment when something will explicitly mark the buffer dirty (hopefully that + * will not happen until we will free that block ;-) We don't even need to mark + * it not-uptodate - nobody can expect anything from a newly allocated buffer + * anyway. We used to use unmap_buffer() for such invalidation, but that was + * wrong. 
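The converted call sites further down use clean_bdev_bh_alias() rather than calling clean_bdev_aliases() directly. That wrapper is not part of this fs/ diff; assuming it was added alongside this change in buffer_head.h, it is presumably a one-block convenience of this shape:

	/* Sketch: clean the single block that a buffer_head maps. */
	static inline void clean_bdev_bh_alias(struct buffer_head *bh)
	{
		clean_bdev_aliases(bh->b_bdev, bh->b_blocknr, 1);
	}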
We definitely don't want to mark the alias unmapped, for example - it + * would confuse anyone who might pick it with bread() afterwards... + * + * Also.. Note that bforget() doesn't lock the buffer. So there can be + * writeout I/O going on against recently-freed buffers. We don't wait on that + * I/O in bforget() - it's more efficient to wait on the I/O only if we really + * need to. That happens here. + */ +void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) { - struct buffer_head *old_bh; + struct inode *bd_inode = bdev->bd_inode; + struct address_space *bd_mapping = bd_inode->i_mapping; + struct pagevec pvec; + pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits); + pgoff_t end; + int i; + struct buffer_head *bh; + struct buffer_head *head; - might_sleep(); + end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits); + pagevec_init(&pvec, 0); + while (index <= end && pagevec_lookup(&pvec, bd_mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; - old_bh = __find_get_block_slow(bdev, block); - if (old_bh) { - clear_buffer_dirty(old_bh); - wait_on_buffer(old_bh); - clear_buffer_req(old_bh); - __brelse(old_bh); + index = page->index; + if (index > end) + break; + if (!page_has_buffers(page)) + continue; + /* + * We use page lock instead of bd_mapping->private_lock + * to pin buffers here since we can afford to sleep and + * it scales better than a global spinlock lock. + */ + lock_page(page); + /* Recheck when the page is locked which pins bhs */ + if (!page_has_buffers(page)) + goto unlock_page; + head = page_buffers(page); + bh = head; + do { + if (!buffer_mapped(bh) || (bh->b_blocknr < block)) + goto next; + if (bh->b_blocknr >= block + len) + break; + clear_buffer_dirty(bh); + wait_on_buffer(bh); + clear_buffer_req(bh); +next: + bh = bh->b_this_page; + } while (bh != head); +unlock_page: + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + index++; } } -EXPORT_SYMBOL(unmap_underlying_metadata); +EXPORT_SYMBOL(clean_bdev_aliases); /* * Size is a power-of-two in the range 512..PAGE_SIZE, @@ -1684,7 +1728,7 @@ static struct buffer_head *create_page_buffers(struct page *page, struct inode * * prevents this contention from occurring. * * If block_write_full_page() is called with wbc->sync_mode == - * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this + * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this * causes the writes to be flagged as synchronous writes. */ int __block_write_full_page(struct inode *inode, struct page *page, @@ -1697,7 +1741,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, struct buffer_head *bh, *head; unsigned int blocksize, bbits; int nr_underway = 0; - int write_flags = (wbc->sync_mode == WB_SYNC_ALL ? 
WRITE_SYNC : 0); + int write_flags = wbc_to_write_flags(wbc); head = create_page_buffers(page, inode, (1 << BH_Dirty)|(1 << BH_Uptodate)); @@ -1745,8 +1789,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, if (buffer_new(bh)) { /* blockdev mappings never come here */ clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); } } bh = bh->b_this_page; @@ -1992,8 +2035,7 @@ int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, } if (buffer_new(bh)) { - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); if (PageUptodate(page)) { clear_buffer_new(bh); set_buffer_uptodate(bh); @@ -2633,7 +2675,7 @@ int nobh_write_begin(struct address_space *mapping, if (!buffer_mapped(bh)) is_mapped_to_disk = 0; if (buffer_new(bh)) - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + clean_bdev_bh_alias(bh); if (PageUptodate(page)) { set_buffer_uptodate(bh); continue; @@ -3118,7 +3160,7 @@ EXPORT_SYMBOL(submit_bh); /** * ll_rw_block: low-level access to block devices (DEPRECATED) * @op: whether to %READ or %WRITE - * @op_flags: rq_flag_bits + * @op_flags: req_flag_bits * @nr: number of &struct buffer_heads in the array * @bhs: array of pointers to &struct buffer_head * @@ -3210,7 +3252,7 @@ EXPORT_SYMBOL(__sync_dirty_buffer); int sync_dirty_buffer(struct buffer_head *bh) { - return __sync_dirty_buffer(bh, WRITE_SYNC); + return __sync_dirty_buffer(bh, REQ_SYNC); } EXPORT_SYMBOL(sync_dirty_buffer); @@ -3403,7 +3445,7 @@ void free_buffer_head(struct buffer_head *bh) } EXPORT_SYMBOL(free_buffer_head); -static void buffer_exit_cpu(int cpu) +static int buffer_exit_cpu_dead(unsigned int cpu) { int i; struct bh_lru *b = &per_cpu(bh_lrus, cpu); @@ -3414,14 +3456,7 @@ static void buffer_exit_cpu(int cpu) } this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr); per_cpu(bh_accounting, cpu).nr = 0; -} - -static int buffer_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) - buffer_exit_cpu((unsigned long)hcpu); - return NOTIFY_OK; + return 0; } /** @@ -3471,6 +3506,7 @@ EXPORT_SYMBOL(bh_submit_read); void __init buffer_init(void) { unsigned long nrpages; + int ret; bh_cachep = kmem_cache_create("buffer_head", sizeof(struct buffer_head), 0, @@ -3483,5 +3519,7 @@ void __init buffer_init(void) */ nrpages = (nr_free_buffer_pages() * 10) / 100; max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); - hotcpu_notifier(buffer_cpu_notify, 0); + ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead", + NULL, buffer_exit_cpu_dead); + WARN_ON(ret < 0); } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index ef3ebd780aff..e4b066cd912a 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -315,7 +315,32 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) struct page **pages; pgoff_t next_index; int nr_pages = 0; - int ret; + int got = 0; + int ret = 0; + + if (!current->journal_info) { + /* caller of readpages does not hold buffer and read caps + * (fadvise, madvise and readahead cases) */ + int want = CEPH_CAP_FILE_CACHE; + ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, &got); + if (ret < 0) { + dout("start_read %p, error getting cap\n", inode); + } else if (!(got & want)) { + dout("start_read %p, no cache cap\n", inode); + ret = 0; + } + if (ret <= 0) { + if (got) + ceph_put_cap_refs(ci, got); + while (!list_empty(page_list)) { + page = 
list_entry(page_list->prev, + struct page, lru); + list_del(&page->lru); + put_page(page); + } + return ret; + } + } off = (u64) page_offset(page); @@ -338,15 +363,18 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq, ci->i_truncate_size, false); - if (IS_ERR(req)) - return PTR_ERR(req); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out; + } /* build page vector */ nr_pages = calc_pages_for(0, len); pages = kmalloc(sizeof(*pages) * nr_pages, GFP_KERNEL); - ret = -ENOMEM; - if (!pages) - goto out; + if (!pages) { + ret = -ENOMEM; + goto out_put; + } for (i = 0; i < nr_pages; ++i) { page = list_entry(page_list->prev, struct page, lru); BUG_ON(PageLocked(page)); @@ -378,6 +406,12 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) if (ret < 0) goto out_pages; ceph_osdc_put_request(req); + + /* After adding locked pages to page cache, the inode holds cache cap. + * So we can drop our cap refs. */ + if (got) + ceph_put_cap_refs(ci, got); + return nr_pages; out_pages: @@ -386,8 +420,11 @@ out_pages: unlock_page(pages[i]); } ceph_put_page_vector(pages, nr_pages, false); -out: +out_put: ceph_osdc_put_request(req); +out: + if (got) + ceph_put_cap_refs(ci, got); return ret; } @@ -424,7 +461,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, rc = start_read(inode, page_list, max); if (rc < 0) goto out; - BUG_ON(rc == 0); } out: ceph_fscache_readpages_cancel(inode, page_list); @@ -438,7 +474,9 @@ out: * only snap context we are allowed to write back. */ static struct ceph_snap_context *get_oldest_context(struct inode *inode, - loff_t *snap_size) + loff_t *snap_size, + u64 *truncate_size, + u32 *truncate_seq) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc = NULL; @@ -452,6 +490,10 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode, snapc = ceph_get_snap_context(capsnap->context); if (snap_size) *snap_size = capsnap->size; + if (truncate_size) + *truncate_size = capsnap->truncate_size; + if (truncate_seq) + *truncate_seq = capsnap->truncate_seq; break; } } @@ -459,6 +501,10 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode, snapc = ceph_get_snap_context(ci->i_head_snapc); dout(" head snapc %p has %d dirty pages\n", snapc, ci->i_wrbuffer_ref_head); + if (truncate_size) + *truncate_size = ci->i_truncate_size; + if (truncate_seq) + *truncate_seq = ci->i_truncate_seq; } spin_unlock(&ci->i_ceph_lock); return snapc; @@ -501,7 +547,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) dout("writepage %p page %p not dirty?\n", inode, page); goto out; } - oldest = get_oldest_context(inode, &snap_size); + oldest = get_oldest_context(inode, &snap_size, + &truncate_size, &truncate_seq); if (snapc->seq > oldest->seq) { dout("writepage %p page %p snapc %p not writeable - noop\n", inode, page, snapc); @@ -512,12 +559,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) } ceph_put_snap_context(oldest); - spin_lock(&ci->i_ceph_lock); - truncate_seq = ci->i_truncate_seq; - truncate_size = ci->i_truncate_size; if (snap_size == -1) snap_size = i_size_read(inode); - spin_unlock(&ci->i_ceph_lock); /* is this a partial page at end of file? 
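The get_oldest_context() change above is about consistency, not just plumbing: truncate_size and truncate_seq are now sampled under the same i_ceph_lock hold that selects the snap context, so a racing truncate cannot leave the writeback path with a snapc from one epoch and truncate state from another. The locked sampling, reduced to its core (fields as in the diff):

	/* Sketch: sample snapc and truncate state under one lock so they agree. */
	spin_lock(&ci->i_ceph_lock);
	snapc = ceph_get_snap_context(ci->i_head_snapc);
	if (truncate_size)
		*truncate_size = ci->i_truncate_size;
	if (truncate_seq)
		*truncate_seq = ci->i_truncate_seq;
	spin_unlock(&ci->i_ceph_lock);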
*/ if (page_off >= snap_size) { @@ -764,7 +807,8 @@ retry: /* find oldest snap context with dirty data */ ceph_put_snap_context(snapc); snap_size = -1; - snapc = get_oldest_context(inode, &snap_size); + snapc = get_oldest_context(inode, &snap_size, + &truncate_size, &truncate_seq); if (!snapc) { /* hmm, why does writepages get called when there is no dirty data? */ @@ -774,11 +818,7 @@ retry: dout(" oldest snapc is %p seq %lld (%d snaps)\n", snapc, snapc->seq, snapc->num_snaps); - spin_lock(&ci->i_ceph_lock); - truncate_seq = ci->i_truncate_seq; - truncate_size = ci->i_truncate_size; i_size = i_size_read(inode); - spin_unlock(&ci->i_ceph_lock); if (last_snapc && snapc != last_snapc) { /* if we switched to a newer snapc, restart our scan at the @@ -1124,7 +1164,8 @@ out: static int context_is_writeable_or_written(struct inode *inode, struct ceph_snap_context *snapc) { - struct ceph_snap_context *oldest = get_oldest_context(inode, NULL); + struct ceph_snap_context *oldest = get_oldest_context(inode, NULL, + NULL, NULL); int ret = !oldest || snapc->seq <= oldest->seq; ceph_put_snap_context(oldest); @@ -1169,7 +1210,7 @@ retry_locked: * this page is already dirty in another (older) snap * context! is it writeable now? */ - oldest = get_oldest_context(inode, NULL); + oldest = get_oldest_context(inode, NULL, NULL, NULL); if (snapc->seq > oldest->seq) { ceph_put_snap_context(oldest); @@ -1276,25 +1317,27 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = file_inode(file); - unsigned from = pos & (PAGE_SIZE - 1); int check_cap = 0; dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, inode, page, (int)pos, (int)copied, (int)len); /* zero the stale part of the page if we did a short copy */ - if (copied < len) - zero_user_segment(page, from+copied, len); + if (!PageUptodate(page)) { + if (copied < len) { + copied = 0; + goto out; + } + SetPageUptodate(page); + } /* did file size increase? */ if (pos+copied > i_size_read(inode)) check_cap = ceph_inode_set_size(inode, pos+copied); - if (!PageUptodate(page)) - SetPageUptodate(page); - set_page_dirty(page); +out: unlock_page(page); put_page(page); @@ -1371,9 +1414,11 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got)); if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || - ci->i_inline_version == CEPH_INLINE_NONE) + ci->i_inline_version == CEPH_INLINE_NONE) { + current->journal_info = vma->vm_file; ret = filemap_fault(vma, vmf); - else + current->journal_info = NULL; + } else ret = -EAGAIN; dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n", @@ -1905,6 +1950,15 @@ int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) struct ceph_string *pool_ns; int ret, flags; + if (ci->i_vino.snap != CEPH_NOSNAP) { + /* + * Pool permission check needs to write to the first object. + * But for snapshot, head of the first object may have alread + * been deleted. Skip check to avoid creating orphan object. 
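ceph_write_end() above adopts the short-copy convention used across this merge for ->write_end(): when the copy from userspace came up short and the page was never read in, report 0 bytes copied so the generic write path re-faults the user buffer and retries, instead of zero-filling a hole in the middle of the page. The contract, roughly:

	/* Sketch of the ->write_end() short-copy handling. */
	if (!PageUptodate(page)) {
		if (copied < len) {
			copied = 0;		/* ask the caller to retry the copy */
			goto out;
		}
		SetPageUptodate(page);		/* a full-page copy makes it uptodate */
	}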
+ */ + return 0; + } + if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode), NOPOOLPERM)) return 0; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 16e6ded0b7f2..94fd76d04683 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -987,96 +987,127 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) __cap_delay_cancel(mdsc, ci); } +struct cap_msg_args { + struct ceph_mds_session *session; + u64 ino, cid, follows; + u64 flush_tid, oldest_flush_tid, size, max_size; + u64 xattr_version; + struct ceph_buffer *xattr_buf; + struct timespec atime, mtime, ctime; + int op, caps, wanted, dirty; + u32 seq, issue_seq, mseq, time_warp_seq; + u32 flags; + kuid_t uid; + kgid_t gid; + umode_t mode; + bool inline_data; +}; + /* * Build and send a cap message to the given MDS. * * Caller should be holding s_mutex. */ -static int send_cap_msg(struct ceph_mds_session *session, - u64 ino, u64 cid, int op, - int caps, int wanted, int dirty, - u32 seq, u64 flush_tid, u64 oldest_flush_tid, - u32 issue_seq, u32 mseq, u64 size, u64 max_size, - struct timespec *mtime, struct timespec *atime, - struct timespec *ctime, u32 time_warp_seq, - kuid_t uid, kgid_t gid, umode_t mode, - u64 xattr_version, - struct ceph_buffer *xattrs_buf, - u64 follows, bool inline_data) +static int send_cap_msg(struct cap_msg_args *arg) { struct ceph_mds_caps *fc; struct ceph_msg *msg; void *p; size_t extra_len; + struct timespec zerotime = {0}; dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu" - " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op), - cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted), - ceph_cap_string(dirty), - seq, issue_seq, flush_tid, oldest_flush_tid, - mseq, follows, size, max_size, - xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); + " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(arg->op), + arg->cid, arg->ino, ceph_cap_string(arg->caps), + ceph_cap_string(arg->wanted), ceph_cap_string(arg->dirty), + arg->seq, arg->issue_seq, arg->flush_tid, arg->oldest_flush_tid, + arg->mseq, arg->follows, arg->size, arg->max_size, + arg->xattr_version, + arg->xattr_buf ? 
(int)arg->xattr_buf->vec.iov_len : 0); /* flock buffer size + inline version + inline data size + * osd_epoch_barrier + oldest_flush_tid */ - extra_len = 4 + 8 + 4 + 4 + 8; + extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4; msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, GFP_NOFS, false); if (!msg) return -ENOMEM; - msg->hdr.version = cpu_to_le16(6); - msg->hdr.tid = cpu_to_le64(flush_tid); + msg->hdr.version = cpu_to_le16(10); + msg->hdr.tid = cpu_to_le64(arg->flush_tid); fc = msg->front.iov_base; memset(fc, 0, sizeof(*fc)); - fc->cap_id = cpu_to_le64(cid); - fc->op = cpu_to_le32(op); - fc->seq = cpu_to_le32(seq); - fc->issue_seq = cpu_to_le32(issue_seq); - fc->migrate_seq = cpu_to_le32(mseq); - fc->caps = cpu_to_le32(caps); - fc->wanted = cpu_to_le32(wanted); - fc->dirty = cpu_to_le32(dirty); - fc->ino = cpu_to_le64(ino); - fc->snap_follows = cpu_to_le64(follows); - - fc->size = cpu_to_le64(size); - fc->max_size = cpu_to_le64(max_size); - if (mtime) - ceph_encode_timespec(&fc->mtime, mtime); - if (atime) - ceph_encode_timespec(&fc->atime, atime); - if (ctime) - ceph_encode_timespec(&fc->ctime, ctime); - fc->time_warp_seq = cpu_to_le32(time_warp_seq); - - fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid)); - fc->gid = cpu_to_le32(from_kgid(&init_user_ns, gid)); - fc->mode = cpu_to_le32(mode); + fc->cap_id = cpu_to_le64(arg->cid); + fc->op = cpu_to_le32(arg->op); + fc->seq = cpu_to_le32(arg->seq); + fc->issue_seq = cpu_to_le32(arg->issue_seq); + fc->migrate_seq = cpu_to_le32(arg->mseq); + fc->caps = cpu_to_le32(arg->caps); + fc->wanted = cpu_to_le32(arg->wanted); + fc->dirty = cpu_to_le32(arg->dirty); + fc->ino = cpu_to_le64(arg->ino); + fc->snap_follows = cpu_to_le64(arg->follows); + + fc->size = cpu_to_le64(arg->size); + fc->max_size = cpu_to_le64(arg->max_size); + ceph_encode_timespec(&fc->mtime, &arg->mtime); + ceph_encode_timespec(&fc->atime, &arg->atime); + ceph_encode_timespec(&fc->ctime, &arg->ctime); + fc->time_warp_seq = cpu_to_le32(arg->time_warp_seq); + + fc->uid = cpu_to_le32(from_kuid(&init_user_ns, arg->uid)); + fc->gid = cpu_to_le32(from_kgid(&init_user_ns, arg->gid)); + fc->mode = cpu_to_le32(arg->mode); + + fc->xattr_version = cpu_to_le64(arg->xattr_version); + if (arg->xattr_buf) { + msg->middle = ceph_buffer_get(arg->xattr_buf); + fc->xattr_len = cpu_to_le32(arg->xattr_buf->vec.iov_len); + msg->hdr.middle_len = cpu_to_le32(arg->xattr_buf->vec.iov_len); + } p = fc + 1; - /* flock buffer size */ + /* flock buffer size (version 2) */ ceph_encode_32(&p, 0); - /* inline version */ - ceph_encode_64(&p, inline_data ? 0 : CEPH_INLINE_NONE); + /* inline version (version 4) */ + ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE); /* inline data size */ ceph_encode_32(&p, 0); - /* osd_epoch_barrier */ + /* osd_epoch_barrier (version 5) */ ceph_encode_32(&p, 0); - /* oldest_flush_tid */ - ceph_encode_64(&p, oldest_flush_tid); + /* oldest_flush_tid (version 6) */ + ceph_encode_64(&p, arg->oldest_flush_tid); - fc->xattr_version = cpu_to_le64(xattr_version); - if (xattrs_buf) { - msg->middle = ceph_buffer_get(xattrs_buf); - fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len); - msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len); - } + /* + * caller_uid/caller_gid (version 7) + * + * Currently, we don't properly track which caller dirtied the caps + * last, and force a flush of them when there is a conflict. For now, + * just set this to 0:0, to emulate how the MDS has worked up to now. 
+ */ + ceph_encode_32(&p, 0); + ceph_encode_32(&p, 0); + + /* pool namespace (version 8) (mds always ignores this) */ + ceph_encode_32(&p, 0); - ceph_con_send(&session->s_con, msg); + /* + * btime and change_attr (version 9) + * + * We just zero these out for now, as the MDS ignores them unless + * the requisite feature flags are set (which we don't do yet). + */ + ceph_encode_timespec(p, &zerotime); + p += sizeof(struct ceph_timespec); + ceph_encode_64(&p, 0); + + /* Advisory flags (version 10) */ + ceph_encode_32(&p, arg->flags); + + ceph_con_send(&arg->session->s_con, msg); return 0; } @@ -1115,27 +1146,17 @@ void ceph_queue_caps_release(struct inode *inode) * caller should hold snap_rwsem (read), s_mutex. */ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, - int op, int used, int want, int retain, int flushing, - u64 flush_tid, u64 oldest_flush_tid) + int op, bool sync, int used, int want, int retain, + int flushing, u64 flush_tid, u64 oldest_flush_tid) __releases(cap->ci->i_ceph_lock) { struct ceph_inode_info *ci = cap->ci; struct inode *inode = &ci->vfs_inode; - u64 cap_id = cap->cap_id; - int held, revoking, dropping, keep; - u64 follows, size, max_size; - u32 seq, issue_seq, mseq, time_warp_seq; - struct timespec mtime, atime, ctime; + struct cap_msg_args arg; + int held, revoking, dropping; int wake = 0; - umode_t mode; - kuid_t uid; - kgid_t gid; - struct ceph_mds_session *session; - u64 xattr_version = 0; - struct ceph_buffer *xattr_blob = NULL; int delayed = 0; int ret; - bool inline_data; held = cap->issued | cap->implemented; revoking = cap->implemented & ~cap->issued; @@ -1148,7 +1169,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, ceph_cap_string(revoking)); BUG_ON((retain & CEPH_CAP_PIN) == 0); - session = cap->session; + arg.session = cap->session; /* don't release wanted unless we've waited a bit. */ if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && @@ -1177,40 +1198,51 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, cap->implemented &= cap->issued | used; cap->mds_wanted = want; - follows = flushing ? ci->i_head_snapc->seq : 0; - - keep = cap->implemented; - seq = cap->seq; - issue_seq = cap->issue_seq; - mseq = cap->mseq; - size = inode->i_size; - ci->i_reported_size = size; - max_size = ci->i_wanted_max_size; - ci->i_requested_max_size = max_size; - mtime = inode->i_mtime; - atime = inode->i_atime; - ctime = inode->i_ctime; - time_warp_seq = ci->i_time_warp_seq; - uid = inode->i_uid; - gid = inode->i_gid; - mode = inode->i_mode; + arg.ino = ceph_vino(inode).ino; + arg.cid = cap->cap_id; + arg.follows = flushing ? 
ci->i_head_snapc->seq : 0; + arg.flush_tid = flush_tid; + arg.oldest_flush_tid = oldest_flush_tid; + + arg.size = inode->i_size; + ci->i_reported_size = arg.size; + arg.max_size = ci->i_wanted_max_size; + ci->i_requested_max_size = arg.max_size; if (flushing & CEPH_CAP_XATTR_EXCL) { __ceph_build_xattrs_blob(ci); - xattr_blob = ci->i_xattrs.blob; - xattr_version = ci->i_xattrs.version; + arg.xattr_version = ci->i_xattrs.version; + arg.xattr_buf = ci->i_xattrs.blob; + } else { + arg.xattr_buf = NULL; } - inline_data = ci->i_inline_version != CEPH_INLINE_NONE; + arg.mtime = inode->i_mtime; + arg.atime = inode->i_atime; + arg.ctime = inode->i_ctime; + + arg.op = op; + arg.caps = cap->implemented; + arg.wanted = want; + arg.dirty = flushing; + + arg.seq = cap->seq; + arg.issue_seq = cap->issue_seq; + arg.mseq = cap->mseq; + arg.time_warp_seq = ci->i_time_warp_seq; + + arg.uid = inode->i_uid; + arg.gid = inode->i_gid; + arg.mode = inode->i_mode; + + arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE; + arg.flags = 0; + if (sync) + arg.flags |= CEPH_CLIENT_CAPS_SYNC; spin_unlock(&ci->i_ceph_lock); - ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, - op, keep, want, flushing, seq, - flush_tid, oldest_flush_tid, issue_seq, mseq, - size, max_size, &mtime, &atime, &ctime, time_warp_seq, - uid, gid, mode, xattr_version, xattr_blob, - follows, inline_data); + ret = send_cap_msg(&arg); if (ret < 0) { dout("error sending cap msg, must requeue %p\n", inode); delayed = 1; @@ -1227,15 +1259,42 @@ static inline int __send_flush_snap(struct inode *inode, struct ceph_cap_snap *capsnap, u32 mseq, u64 oldest_flush_tid) { - return send_cap_msg(session, ceph_vino(inode).ino, 0, - CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, - capsnap->dirty, 0, capsnap->cap_flush.tid, - oldest_flush_tid, 0, mseq, capsnap->size, 0, - &capsnap->mtime, &capsnap->atime, - &capsnap->ctime, capsnap->time_warp_seq, - capsnap->uid, capsnap->gid, capsnap->mode, - capsnap->xattr_version, capsnap->xattr_blob, - capsnap->follows, capsnap->inline_data); + struct cap_msg_args arg; + + arg.session = session; + arg.ino = ceph_vino(inode).ino; + arg.cid = 0; + arg.follows = capsnap->follows; + arg.flush_tid = capsnap->cap_flush.tid; + arg.oldest_flush_tid = oldest_flush_tid; + + arg.size = capsnap->size; + arg.max_size = 0; + arg.xattr_version = capsnap->xattr_version; + arg.xattr_buf = capsnap->xattr_blob; + + arg.atime = capsnap->atime; + arg.mtime = capsnap->mtime; + arg.ctime = capsnap->ctime; + + arg.op = CEPH_CAP_OP_FLUSHSNAP; + arg.caps = capsnap->issued; + arg.wanted = 0; + arg.dirty = capsnap->dirty; + + arg.seq = 0; + arg.issue_seq = 0; + arg.mseq = mseq; + arg.time_warp_seq = capsnap->time_warp_seq; + + arg.uid = capsnap->uid; + arg.gid = capsnap->gid; + arg.mode = capsnap->mode; + + arg.inline_data = capsnap->inline_data; + arg.flags = 0; + + return send_cap_msg(&arg); } /* @@ -1858,9 +1917,9 @@ ack: sent++; /* __send_cap drops i_ceph_lock */ - delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used, - want, retain, flushing, - flush_tid, oldest_flush_tid); + delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, false, + cap_used, want, retain, flushing, + flush_tid, oldest_flush_tid); goto retry; /* retake i_ceph_lock and restart our cap scan. 
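The cap_msg_args refactor above trades send_cap_msg()'s roughly two dozen positional parameters for a struct that each caller fills field by field; __send_cap() and __send_flush_snap() now read as checklists, and adding a field no longer means touching every call chain. A minimal sketch of the pattern, with hypothetical names:

	struct foo_msg_args {		/* hypothetical */
		u64 ino;
		u32 seq;
		bool sync;
	};

	static int send_foo_msg(struct foo_msg_args *arg);

	/* caller: fill only what this message needs */
	struct foo_msg_args arg = {
		.ino  = ino,
		.seq  = seq,
		.sync = false,
	};
	ret = send_foo_msg(&arg);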
*/ } @@ -1924,9 +1983,9 @@ retry: &flush_tid, &oldest_flush_tid); /* __send_cap drops i_ceph_lock */ - delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, - (cap->issued | cap->implemented), - flushing, flush_tid, oldest_flush_tid); + delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, true, + used, want, (cap->issued | cap->implemented), + flushing, flush_tid, oldest_flush_tid); if (delayed) { spin_lock(&ci->i_ceph_lock); @@ -1996,7 +2055,7 @@ static int unsafe_request_wait(struct inode *inode) } spin_unlock(&ci->i_unsafe_lock); - dout("unsafe_requeset_wait %p wait on tid %llu %llu\n", + dout("unsafe_request_wait %p wait on tid %llu %llu\n", inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL); if (req1) { ret = !wait_for_completion_timeout(&req1->r_safe_completion, @@ -2119,7 +2178,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, inode, cap, cf->tid, ceph_cap_string(cf->caps)); ci->i_ceph_flags |= CEPH_I_NODELAY; ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, - __ceph_caps_used(ci), + false, __ceph_caps_used(ci), __ceph_caps_wanted(ci), cap->issued | cap->implemented, cf->caps, cf->tid, oldest_flush_tid); @@ -2479,6 +2538,27 @@ static void check_max_size(struct inode *inode, loff_t endoff) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); } +int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, int *got) +{ + int ret, err = 0; + + BUG_ON(need & ~CEPH_CAP_FILE_RD); + BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)); + ret = ceph_pool_perm_check(ci, need); + if (ret < 0) + return ret; + + ret = try_get_cap_refs(ci, need, want, 0, true, got, &err); + if (ret) { + if (err == -EAGAIN) { + ret = 0; + } else if (err < 0) { + ret = err; + } + } + return ret; +} + /* * Wait for caps, and take cap references. If we can't get a WR cap * due to a small max_size, make sure we check_max_size (and possibly @@ -2507,9 +2587,20 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, if (err < 0) ret = err; } else { - ret = wait_event_interruptible(ci->i_cap_wq, - try_get_cap_refs(ci, need, want, endoff, - true, &_got, &err)); + DEFINE_WAIT_FUNC(wait, woken_wake_function); + add_wait_queue(&ci->i_cap_wq, &wait); + + while (!try_get_cap_refs(ci, need, want, endoff, + true, &_got, &err)) { + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); + } + + remove_wait_queue(&ci->i_cap_wq, &wait); + if (err == -EAGAIN) continue; if (err < 0) @@ -3570,6 +3661,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, cap->cap_id = le64_to_cpu(h->cap_id); cap->mseq = mseq; cap->seq = seq; + cap->issue_seq = seq; spin_lock(&session->s_cap_lock); list_add_tail(&cap->session_caps, &session->s_cap_releases); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index a594c7879cc2..8ab1fdf0bd49 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -32,40 +32,19 @@ const struct dentry_operations ceph_dentry_ops; /* * Initialize ceph dentry state. 
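The ceph_get_caps() hunk above replaces wait_event_interruptible() with an open-coded wait_woken() loop. One motivation for this pattern (an assumption on my part about this particular change) is that the condition, here try_get_cap_refs(), may itself block, which the wait_event_interruptible() condition re-check does not tolerate; wait_woken() keeps the wait entry queued across the condition evaluation so wakeups are not lost. The general shape, with wq and condition() as placeholders:

	/* Sketch: wait_woken() loop with a condition that may sleep. */
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	add_wait_queue(&wq, &wait);
	while (!condition()) {			/* condition() may sleep here */
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
	}
	remove_wait_queue(&wq, &wait);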
*/ -int ceph_init_dentry(struct dentry *dentry) +static int ceph_d_init(struct dentry *dentry) { struct ceph_dentry_info *di; - if (dentry->d_fsdata) - return 0; - di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL); if (!di) return -ENOMEM; /* oh well */ - spin_lock(&dentry->d_lock); - if (dentry->d_fsdata) { - /* lost a race */ - kmem_cache_free(ceph_dentry_cachep, di); - goto out_unlock; - } - - if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP) - d_set_d_op(dentry, &ceph_dentry_ops); - else if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_SNAPDIR) - d_set_d_op(dentry, &ceph_snapdir_dentry_ops); - else - d_set_d_op(dentry, &ceph_snap_dentry_ops); - di->dentry = dentry; di->lease_session = NULL; di->time = jiffies; - /* avoid reordering d_fsdata setup so that the check above is safe */ - smp_mb(); dentry->d_fsdata = di; ceph_dentry_lru_add(dentry); -out_unlock: - spin_unlock(&dentry->d_lock); return 0; } @@ -737,10 +716,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); - err = ceph_init_dentry(dentry); - if (err < 0) - return ERR_PTR(err); - /* can we conclude ENOENT locally? */ if (d_really_is_negative(dentry)) { struct ceph_inode_info *ci = ceph_inode(dir); @@ -1255,7 +1230,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) struct ceph_mds_client *mdsc = ceph_sb_to_client(dir->i_sb)->mdsc; struct ceph_mds_request *req; - int op, mask, err; + int op, err; + u32 mask; if (flags & LOOKUP_RCU) return -ECHILD; @@ -1270,7 +1246,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; if (ceph_security_xattr_wanted(dir)) mask |= CEPH_CAP_XATTR_SHARED; - req->r_args.getattr.mask = mask; + req->r_args.getattr.mask = cpu_to_le32(mask); err = ceph_mdsc_do_request(mdsc, NULL, req); switch (err) { @@ -1323,16 +1299,6 @@ static void ceph_d_release(struct dentry *dentry) kmem_cache_free(ceph_dentry_cachep, di); } -static int ceph_snapdir_d_revalidate(struct dentry *dentry, - unsigned int flags) -{ - /* - * Eventually, we'll want to revalidate snapped metadata - * too... probably... - */ - return 1; -} - /* * When the VFS prunes a dentry from the cache, we need to clear the * complete flag on the parent directory. 
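The ceph_init_dentry() removals through this region all lean on one mechanism: ceph now sets sb->s_d_op (see the ceph_set_super() hunk later in this diff) and supplies a ->d_init() dentry operation, so the VFS sets up per-dentry state at allocation time and the old lock-and-recheck dance on d_fsdata becomes unnecessary. Roughly:

	/* Sketch: per-dentry state via ->d_init() instead of ad hoc init calls. */
	static const struct dentry_operations my_dentry_ops = {	/* hypothetical name */
		.d_init	= ceph_d_init,	/* sets up ->d_fsdata for every new dentry */
		/* ... the other d_ops ... */
	};

	/* at superblock setup time */
	s->s_d_op = &my_dentry_ops;	/* applied by the VFS to every dentry on this sb */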
@@ -1351,6 +1317,9 @@ static void ceph_d_prune(struct dentry *dentry) if (d_unhashed(dentry)) return; + if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_SNAPDIR) + return; + /* * we hold d_lock, so d_parent is stable, and d_fsdata is never * cleared until d_release @@ -1521,14 +1490,5 @@ const struct dentry_operations ceph_dentry_ops = { .d_revalidate = ceph_d_revalidate, .d_release = ceph_d_release, .d_prune = ceph_d_prune, -}; - -const struct dentry_operations ceph_snapdir_dentry_ops = { - .d_revalidate = ceph_snapdir_d_revalidate, - .d_release = ceph_d_release, -}; - -const struct dentry_operations ceph_snap_dentry_ops = { - .d_release = ceph_d_release, - .d_prune = ceph_d_prune, + .d_init = ceph_d_init, }; diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 1780218a48f0..180bbef760f2 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -62,7 +62,6 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) { struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; struct inode *inode; - struct dentry *dentry; struct ceph_vino vino; int err; @@ -94,16 +93,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) return ERR_PTR(-ESTALE); } - dentry = d_obtain_alias(inode); - if (IS_ERR(dentry)) - return dentry; - err = ceph_init_dentry(dentry); - if (err < 0) { - dput(dentry); - return ERR_PTR(err); - } - dout("__fh_to_dentry %llx %p dentry %p\n", ino, inode, dentry); - return dentry; + return d_obtain_alias(inode); } /* @@ -131,7 +121,6 @@ static struct dentry *__get_parent(struct super_block *sb, struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; struct ceph_mds_request *req; struct inode *inode; - struct dentry *dentry; int mask; int err; @@ -164,18 +153,7 @@ static struct dentry *__get_parent(struct super_block *sb, if (!inode) return ERR_PTR(-ENOENT); - dentry = d_obtain_alias(inode); - if (IS_ERR(dentry)) - return dentry; - err = ceph_init_dentry(dentry); - if (err < 0) { - dput(dentry); - return ERR_PTR(err); - } - dout("__get_parent ino %llx parent %p ino %llx.%llx\n", - child ? ceph_ino(d_inode(child)) : ino, - dentry, ceph_vinop(inode)); - return dentry; + return d_obtain_alias(inode); } static struct dentry *ceph_get_parent(struct dentry *child) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index f995e3528a33..045d30d26624 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -351,10 +351,6 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, if (dentry->d_name.len > NAME_MAX) return -ENAMETOOLONG; - err = ceph_init_dentry(dentry); - if (err < 0) - return err; - if (flags & O_CREAT) { err = ceph_pre_init_acls(dir, &mode, &acls); if (err < 0) @@ -458,71 +454,60 @@ enum { * only return a short read to the caller if we hit EOF. */ static int striped_read(struct inode *inode, - u64 off, u64 len, + u64 pos, u64 len, struct page **pages, int num_pages, - int *checkeof) + int page_align, int *checkeof) { struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); - u64 pos, this_len, left; + u64 this_len; loff_t i_size; - int page_align, pages_left; - int read, ret; - struct page **page_pos; + int page_idx; + int ret, read = 0; bool hit_stripe, was_short; /* * we may need to do multiple reads. not atomic, unfortunately. 
*/ - pos = off; - left = len; - page_pos = pages; - pages_left = num_pages; - read = 0; - more: - page_align = pos & ~PAGE_MASK; - this_len = left; + this_len = len; + page_idx = (page_align + read) >> PAGE_SHIFT; ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), &ci->i_layout, pos, &this_len, - ci->i_truncate_seq, - ci->i_truncate_size, - page_pos, pages_left, page_align); + ci->i_truncate_seq, ci->i_truncate_size, + pages + page_idx, num_pages - page_idx, + ((page_align + read) & ~PAGE_MASK)); if (ret == -ENOENT) ret = 0; - hit_stripe = this_len < left; + hit_stripe = this_len < len; was_short = ret >= 0 && ret < this_len; - dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read, + dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, len, read, ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); i_size = i_size_read(inode); if (ret >= 0) { - int didpages; if (was_short && (pos + ret < i_size)) { int zlen = min(this_len - ret, i_size - pos - ret); - int zoff = (off & ~PAGE_MASK) + read + ret; + int zoff = page_align + read + ret; dout(" zero gap %llu to %llu\n", - pos + ret, pos + ret + zlen); + pos + ret, pos + ret + zlen); ceph_zero_page_vector_range(zoff, zlen, pages); ret += zlen; } - didpages = (page_align + ret) >> PAGE_SHIFT; + read += ret; pos += ret; - read = pos - off; - left -= ret; - page_pos += didpages; - pages_left -= didpages; + len -= ret; /* hit stripe and need continue*/ - if (left && hit_stripe && pos < i_size) + if (len && hit_stripe && pos < i_size) goto more; } if (read > 0) { ret = read; /* did we bounce off eof? */ - if (pos + left > i_size) + if (pos + len > i_size) *checkeof = CHECK_EOF; } @@ -536,15 +521,16 @@ more: * * If the read spans object boundary, just do multiple reads. 
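ceph_sync_read(), just below, grows a dedicated branch for ITER_PIPE iterators (the splice path): instead of allocating a page vector and copying into it, it borrows page references straight from the pipe with iov_iter_get_pages_alloc() and advances the iterator by however much was read. A reduced sketch of that borrow/advance flow (to and len are the function's locals):

	/* Sketch: read into pages borrowed from a pipe-backed iov_iter. */
	struct page **pages;
	size_t page_off;
	ssize_t got;

	got = iov_iter_get_pages_alloc(to, &pages, len, &page_off);
	if (got > 0) {
		/* ... fill pages[] starting at offset page_off ... */
		iov_iter_advance(to, got);	/* consume what was produced */
		ceph_put_page_vector(pages,
				     DIV_ROUND_UP(got + page_off, PAGE_SIZE),
				     false);	/* drop the borrowed refs */
	}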
*/ -static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, - int *checkeof) +static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, + int *checkeof) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct page **pages; u64 off = iocb->ki_pos; - int num_pages, ret; - size_t len = iov_iter_count(i); + int num_pages; + ssize_t ret; + size_t len = iov_iter_count(to); dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, @@ -563,35 +549,56 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, if (ret < 0) return ret; - num_pages = calc_pages_for(off, len); - pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); - if (IS_ERR(pages)) - return PTR_ERR(pages); - ret = striped_read(inode, off, len, pages, - num_pages, checkeof); - if (ret > 0) { - int l, k = 0; - size_t left = ret; - - while (left) { - size_t page_off = off & ~PAGE_MASK; - size_t copy = min_t(size_t, left, - PAGE_SIZE - page_off); - l = copy_page_to_iter(pages[k++], page_off, copy, i); - off += l; - left -= l; - if (l < copy) - break; + if (unlikely(to->type & ITER_PIPE)) { + size_t page_off; + ret = iov_iter_get_pages_alloc(to, &pages, len, + &page_off); + if (ret <= 0) + return -ENOMEM; + num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE); + + ret = striped_read(inode, off, ret, pages, num_pages, + page_off, checkeof); + if (ret > 0) { + iov_iter_advance(to, ret); + off += ret; + } else { + iov_iter_advance(to, 0); } + ceph_put_page_vector(pages, num_pages, false); + } else { + num_pages = calc_pages_for(off, len); + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + ret = striped_read(inode, off, len, pages, num_pages, + (off & ~PAGE_MASK), checkeof); + if (ret > 0) { + int l, k = 0; + size_t left = ret; + + while (left) { + size_t page_off = off & ~PAGE_MASK; + size_t copy = min_t(size_t, left, + PAGE_SIZE - page_off); + l = copy_page_to_iter(pages[k++], page_off, + copy, to); + off += l; + left -= l; + if (l < copy) + break; + } + } + ceph_release_page_vector(pages, num_pages); } - ceph_release_page_vector(pages, num_pages); if (off > iocb->ki_pos) { ret = off - iocb->ki_pos; iocb->ki_pos = off; } - dout("sync_read result %d\n", ret); + dout("sync_read result %zd\n", ret); return ret; } @@ -853,7 +860,7 @@ void ceph_sync_write_wait(struct inode *inode) dout("sync_write_wait on tid %llu (until %llu)\n", req->r_tid, last_tid); - wait_for_completion(&req->r_safe_completion); + wait_for_completion(&req->r_done_completion); ceph_osdc_put_request(req); spin_lock(&ci->i_unsafe_lock); @@ -906,7 +913,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, pos >> PAGE_SHIFT, (pos + count) >> PAGE_SHIFT); if (ret2 < 0) - dout("invalidate_inode_pages2_range returned %d\n", ret); + dout("invalidate_inode_pages2_range returned %d\n", ret2); flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_ONDISK | @@ -1249,8 +1256,9 @@ again: dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, ceph_cap_string(got)); - + current->journal_info = filp; ret = generic_file_read_iter(iocb, to); + current->journal_info = NULL; } dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); @@ -1770,6 +1778,7 @@ const struct file_operations ceph_file_fops = { .fsync = ceph_fsync, .lock = ceph_lock, .flock = ceph_flock, + .splice_read = generic_file_splice_read, .splice_write = 
iter_file_splice_write, .unlocked_ioctl = ceph_ioctl, .compat_ioctl = ceph_ioctl, diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index ef4d04647325..5e659d054b40 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -305,7 +305,8 @@ static int frag_tree_split_cmp(const void *l, const void *r) { struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l; struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r; - return ceph_frag_compare(ls->frag, rs->frag); + return ceph_frag_compare(le32_to_cpu(ls->frag), + le32_to_cpu(rs->frag)); } static bool is_frag_child(u32 f, struct ceph_inode_frag *frag) @@ -1023,16 +1024,17 @@ static void update_dentry_lease(struct dentry *dentry, long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000; struct inode *dir; - /* only track leases on regular dentries */ - if (dentry->d_op != &ceph_dentry_ops) - return; - spin_lock(&dentry->d_lock); dout("update_dentry_lease %p duration %lu ms ttl %lu\n", dentry, duration, ttl); /* make lease_rdcache_gen match directory */ dir = d_inode(dentry->d_parent); + + /* only track leases on regular dentries */ + if (ceph_snap(dir) != CEPH_NOSNAP) + goto out_unlock; + di->lease_shared_gen = ceph_inode(dir)->i_shared_gen; if (duration == 0) @@ -1202,12 +1204,7 @@ retry_lookup: err = -ENOMEM; goto done; } - err = ceph_init_dentry(dn); - if (err < 0) { - dput(dn); - dput(parent); - goto done; - } + err = 0; } else if (d_really_is_positive(dn) && (ceph_ino(d_inode(dn)) != vino.ino || ceph_snap(d_inode(dn)) != vino.snap)) { @@ -1561,12 +1558,6 @@ retry_lookup: err = -ENOMEM; goto out; } - ret = ceph_init_dentry(dn); - if (ret < 0) { - dput(dn); - err = ret; - goto out; - } } else if (d_really_is_positive(dn) && (ceph_ino(d_inode(dn)) != vino.ino || ceph_snap(d_inode(dn)) != vino.snap)) { @@ -1879,7 +1870,6 @@ retry: * symlinks */ static const struct inode_operations ceph_symlink_iops = { - .readlink = generic_readlink, .get_link = simple_get_link, .setattr = ceph_setattr, .getattr = ceph_getattr, diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 815acd1a56d4..c9d2e553a6c4 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -288,12 +288,13 @@ static int parse_reply_info_extra(void **p, void *end, struct ceph_mds_reply_info_parsed *info, u64 features) { - if (info->head->op == CEPH_MDS_OP_GETFILELOCK) + u32 op = le32_to_cpu(info->head->op); + + if (op == CEPH_MDS_OP_GETFILELOCK) return parse_reply_info_filelock(p, end, info, features); - else if (info->head->op == CEPH_MDS_OP_READDIR || - info->head->op == CEPH_MDS_OP_LSSNAP) + else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP) return parse_reply_info_dir(p, end, info, features); - else if (info->head->op == CEPH_MDS_OP_CREATE) + else if (op == CEPH_MDS_OP_CREATE) return parse_reply_info_create(p, end, info, features); else return -EIO; @@ -2100,17 +2101,31 @@ static int __do_request(struct ceph_mds_client *mdsc, err = -EIO; goto finish; } + if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) { + if (mdsc->mdsmap_err) { + err = mdsc->mdsmap_err; + dout("do_request mdsmap err %d\n", err); + goto finish; + } + if (mdsc->mdsmap->m_epoch == 0) { + dout("do_request no mdsmap, waiting for map\n"); + list_add(&req->r_wait, &mdsc->waiting_for_map); + goto finish; + } + if (!(mdsc->fsc->mount_options->flags & + CEPH_MOUNT_OPT_MOUNTWAIT) && + !ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) { + err = -ENOENT; + pr_info("probably no mds server is up\n"); + goto finish; + } + } put_request_session(req); mds = 
__choose_mds(mdsc, req); if (mds < 0 || ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { - if (mdsc->mdsmap_err) { - err = mdsc->mdsmap_err; - dout("do_request mdsmap err %d\n", err); - goto finish; - } dout("do_request no mds or not active, waiting for map\n"); list_add(&req->r_wait, &mdsc->waiting_for_map); goto out; @@ -3943,13 +3958,13 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, } -static int verify_authorizer_reply(struct ceph_connection *con, int len) +static int verify_authorizer_reply(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; - return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer, len); + return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer); } static int invalidate_authorizer(struct ceph_connection *con) diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 8c3591a7fbae..5454e2327a5f 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -42,6 +42,60 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) return i; } +#define __decode_and_drop_type(p, end, type, bad) \ + do { \ + if (*p + sizeof(type) > end) \ + goto bad; \ + *p += sizeof(type); \ + } while (0) + +#define __decode_and_drop_set(p, end, type, bad) \ + do { \ + u32 n; \ + size_t need; \ + ceph_decode_32_safe(p, end, n, bad); \ + need = sizeof(type) * n; \ + ceph_decode_need(p, end, need, bad); \ + *p += need; \ + } while (0) + +#define __decode_and_drop_map(p, end, ktype, vtype, bad) \ + do { \ + u32 n; \ + size_t need; \ + ceph_decode_32_safe(p, end, n, bad); \ + need = (sizeof(ktype) + sizeof(vtype)) * n; \ + ceph_decode_need(p, end, need, bad); \ + *p += need; \ + } while (0) + + +static int __decode_and_drop_compat_set(void **p, void* end) +{ + int i; + /* compat, ro_compat, incompat*/ + for (i = 0; i < 3; i++) { + u32 n; + ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); + /* mask */ + *p += sizeof(u64); + /* names (map<u64, string>) */ + n = ceph_decode_32(p); + while (n-- > 0) { + u32 len; + ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), + bad); + *p += sizeof(u64); + len = ceph_decode_32(p); + ceph_decode_need(p, end, len, bad); + *p += len; + } + } + return 0; +bad: + return -1; +} + /* * Decode an MDS map * @@ -55,6 +109,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) int i, j, n; int err = -EINVAL; u8 mdsmap_v, mdsmap_cv; + u16 mdsmap_ev; m = kzalloc(sizeof(*m), GFP_NOFS); if (m == NULL) @@ -83,7 +138,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS); if (m->m_info == NULL) - goto badmem; + goto nomem; /* pick out active nodes from mds_info (state > 0) */ n = ceph_decode_32(p); @@ -166,7 +221,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) info->export_targets = kcalloc(num_export_targets, sizeof(u32), GFP_NOFS); if (info->export_targets == NULL) - goto badmem; + goto nomem; for (j = 0; j < num_export_targets; j++) info->export_targets[j] = ceph_decode_32(&pexport_targets); @@ -180,24 +235,104 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_num_data_pg_pools = n; m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS); if (!m->m_data_pg_pools) - goto badmem; + goto nomem; ceph_decode_need(p, end, sizeof(u64)*(n+1), bad); for (i = 0; i < n; i++) m->m_data_pg_pools[i] = ceph_decode_64(p); m->m_cas_pg_pool = ceph_decode_64(p); + m->m_enabled = 
m->m_epoch > 1; + + mdsmap_ev = 1; + if (mdsmap_v >= 2) { + ceph_decode_16_safe(p, end, mdsmap_ev, bad_ext); + } + if (mdsmap_ev >= 3) { + if (__decode_and_drop_compat_set(p, end) < 0) + goto bad_ext; + } + /* metadata_pool */ + if (mdsmap_ev < 5) { + __decode_and_drop_type(p, end, u32, bad_ext); + } else { + __decode_and_drop_type(p, end, u64, bad_ext); + } - /* ok, we don't care about the rest. */ + /* created + modified + tableserver */ + __decode_and_drop_type(p, end, struct ceph_timespec, bad_ext); + __decode_and_drop_type(p, end, struct ceph_timespec, bad_ext); + __decode_and_drop_type(p, end, u32, bad_ext); + + /* in */ + { + int num_laggy = 0; + ceph_decode_32_safe(p, end, n, bad_ext); + ceph_decode_need(p, end, sizeof(u32) * n, bad_ext); + + for (i = 0; i < n; i++) { + s32 mds = ceph_decode_32(p); + if (mds >= 0 && mds < m->m_max_mds) { + if (m->m_info[mds].laggy) + num_laggy++; + } + } + m->m_num_laggy = num_laggy; + } + + /* inc */ + __decode_and_drop_map(p, end, u32, u32, bad_ext); + /* up */ + __decode_and_drop_map(p, end, u32, u64, bad_ext); + /* failed */ + __decode_and_drop_set(p, end, u32, bad_ext); + /* stopped */ + __decode_and_drop_set(p, end, u32, bad_ext); + + if (mdsmap_ev >= 4) { + /* last_failure_osd_epoch */ + __decode_and_drop_type(p, end, u32, bad_ext); + } + if (mdsmap_ev >= 6) { + /* ever_allowed_snaps */ + __decode_and_drop_type(p, end, u8, bad_ext); + /* explicitly_allowed_snaps */ + __decode_and_drop_type(p, end, u8, bad_ext); + } + if (mdsmap_ev >= 7) { + /* inline_data_enabled */ + __decode_and_drop_type(p, end, u8, bad_ext); + } + if (mdsmap_ev >= 8) { + u32 name_len; + /* enabled */ + ceph_decode_8_safe(p, end, m->m_enabled, bad_ext); + ceph_decode_32_safe(p, end, name_len, bad_ext); + ceph_decode_need(p, end, name_len, bad_ext); + *p += name_len; + } + /* damaged */ + if (mdsmap_ev >= 9) { + size_t need; + ceph_decode_32_safe(p, end, n, bad_ext); + need = sizeof(u32) * n; + ceph_decode_need(p, end, need, bad_ext); + *p += need; + m->m_damaged = n > 0; + } else { + m->m_damaged = false; + } +bad_ext: *p = end; dout("mdsmap_decode success epoch %u\n", m->m_epoch); return m; - -badmem: +nomem: err = -ENOMEM; + goto out_err; bad: pr_err("corrupt mdsmap\n"); print_hex_dump(KERN_DEBUG, "mdsmap: ", DUMP_PREFIX_OFFSET, 16, 1, start, end - start, true); +out_err: ceph_mdsmap_destroy(m); return ERR_PTR(err); } @@ -212,3 +347,19 @@ void ceph_mdsmap_destroy(struct ceph_mdsmap *m) kfree(m->m_data_pg_pools); kfree(m); } + +bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m) +{ + int i, nr_active = 0; + if (!m->m_enabled) + return false; + if (m->m_damaged) + return false; + if (m->m_num_laggy > 0) + return false; + for (i = 0; i < m->m_max_mds; i++) { + if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE) + nr_active++; + } + return nr_active > 0; +} diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 9ff5219d849e..8f8b41c2ef0f 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -593,6 +593,8 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, capsnap->atime = inode->i_atime; capsnap->ctime = inode->i_ctime; capsnap->time_warp_seq = ci->i_time_warp_seq; + capsnap->truncate_size = ci->i_truncate_size; + capsnap->truncate_seq = ci->i_truncate_seq; if (capsnap->dirty_pages) { dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu " "still has %d dirty pages\n", inode, capsnap, diff --git a/fs/ceph/super.c b/fs/ceph/super.c index b382e5910eea..6bd20d707bfd 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -137,6 +137,8 @@ enum { Opt_nofscache, 
Opt_poolperm, Opt_nopoolperm, + Opt_require_active_mds, + Opt_norequire_active_mds, #ifdef CONFIG_CEPH_FS_POSIX_ACL Opt_acl, #endif @@ -171,6 +173,8 @@ static match_table_t fsopt_tokens = { {Opt_nofscache, "nofsc"}, {Opt_poolperm, "poolperm"}, {Opt_nopoolperm, "nopoolperm"}, + {Opt_require_active_mds, "require_active_mds"}, + {Opt_norequire_active_mds, "norequire_active_mds"}, #ifdef CONFIG_CEPH_FS_POSIX_ACL {Opt_acl, "acl"}, #endif @@ -287,6 +291,12 @@ static int parse_fsopt_token(char *c, void *private) case Opt_nopoolperm: fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM; break; + case Opt_require_active_mds: + fsopt->flags &= ~CEPH_MOUNT_OPT_MOUNTWAIT; + break; + case Opt_norequire_active_mds: + fsopt->flags |= CEPH_MOUNT_OPT_MOUNTWAIT; + break; #ifdef CONFIG_CEPH_FS_POSIX_ACL case Opt_acl: fsopt->sb_flags |= MS_POSIXACL; @@ -795,7 +805,6 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, root = ERR_PTR(-ENOMEM); goto out; } - ceph_init_dentry(root); dout("open_root_inode success, root dentry is %p\n", root); } else { root = ERR_PTR(err); @@ -879,6 +888,7 @@ static int ceph_set_super(struct super_block *s, void *data) fsc->sb = s; s->s_op = &ceph_super_ops; + s->s_d_op = &ceph_dentry_ops; s->s_export_op = &ceph_export_ops; s->s_time_gran = 1000; /* 1000 ns == 1 us */ diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 3e3fa9163059..3373b61faefd 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -36,6 +36,7 @@ #define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */ #define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ #define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */ +#define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */ #define CEPH_MOUNT_OPT_DEFAULT CEPH_MOUNT_OPT_DCACHE @@ -180,6 +181,8 @@ struct ceph_cap_snap { u64 size; struct timespec mtime, atime, ctime; u64 time_warp_seq; + u64 truncate_size; + u32 truncate_seq; int writing; /* a sync write is still in progress */ int dirty_pages; /* dirty pages awaiting writeback */ bool inline_data; @@ -905,6 +908,8 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn, extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, loff_t endoff, int *got, struct page **pinned_page); +extern int ceph_try_get_caps(struct ceph_inode_info *ci, + int need, int want, int *got); /* for counting open files by mode */ extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode); @@ -934,8 +939,7 @@ extern const struct file_operations ceph_dir_fops; extern const struct file_operations ceph_snapdir_fops; extern const struct inode_operations ceph_dir_iops; extern const struct inode_operations ceph_snapdir_iops; -extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, - ceph_snapdir_dentry_ops; +extern const struct dentry_operations ceph_dentry_ops; extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order); extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); @@ -951,13 +955,6 @@ extern void ceph_invalidate_dentry_lease(struct dentry *dentry); extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl); -/* - * our d_ops vary depending on whether the inode is live, - * snapshotted (read-only), or a virtual ".snap" directory. 
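The ceph changes in this region drop the per-dentry ceph_init_dentry() helper in favour of assigning sb->s_d_op once at mount time; d_alloc() then attaches those dentry_operations to every dentry created under the superblock. A minimal sketch of that idiom, using a hypothetical "examplefs" (not ceph's actual code):

    #include <linux/dcache.h>
    #include <linux/fs.h>

    static int example_d_revalidate(struct dentry *dentry, unsigned int flags)
    {
            return 1;       /* always valid in this sketch */
    }

    static const struct dentry_operations example_dentry_ops = {
            .d_revalidate = example_d_revalidate,
    };

    static int example_fill_super(struct super_block *s, void *data, int silent)
    {
            /* Inherited automatically by every dentry under this sb,
             * so no per-dentry init call is needed. */
            s->s_d_op = &example_dentry_ops;
            /* ... set s->s_op, allocate the root inode, d_make_root() ... */
            return 0;
    }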
- */ -int ceph_init_dentry(struct dentry *dentry); - - /* ioctl.c */ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg); diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 3d03e48a9213..9727e1dcacd5 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -24,7 +24,7 @@ #include <linux/ctype.h> #include <linux/module.h> #include <linux/proc_fs.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "cifspdu.h" #include "cifsglob.h" #include "cifsproto.h" diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 5eb04129f938..66bd7fa9b7a6 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -699,11 +699,15 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED) { if (!ses->domainName) { - rc = find_domain_name(ses, nls_cp); - if (rc) { - cifs_dbg(VFS, "error %d finding domain name\n", - rc); - goto setup_ntlmv2_rsp_ret; + if (ses->domainAuto) { + rc = find_domain_name(ses, nls_cp); + if (rc) { + cifs_dbg(VFS, "error %d finding domain name\n", + rc); + goto setup_ntlmv2_rsp_ret; + } + } else { + ses->domainName = kstrdup("", GFP_KERNEL); } } } else { diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 15261ba464c5..70f4e65fced2 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -615,7 +615,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) return dget(sb->s_root); full_path = cifs_build_path_to_root(vol, cifs_sb, - cifs_sb_master_tcon(cifs_sb)); + cifs_sb_master_tcon(cifs_sb), 0); if (full_path == NULL) return ERR_PTR(-ENOMEM); @@ -914,7 +914,6 @@ const struct inode_operations cifs_file_inode_ops = { }; const struct inode_operations cifs_symlink_inode_ops = { - .readlink = generic_readlink, .get_link = cifs_get_link, .permission = cifs_permission, .listxattr = cifs_listxattr, diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 1f17f6bd7a60..7ea8a3393936 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -514,6 +514,7 @@ struct smb_vol { bool persistent:1; bool nopersistent:1; bool resilient:1; /* noresilient not required since not fored for CA */ + bool domainauto:1; unsigned int rsize; unsigned int wsize; bool sockopt_tcp_nodelay:1; @@ -525,6 +526,7 @@ struct smb_vol { struct sockaddr_storage srcaddr; /* allow binding to a local IP */ struct nls_table *local_nls; unsigned int echo_interval; /* echo interval in secs */ + __u64 snapshot_time; /* needed for timewarp tokens */ unsigned int max_credits; /* smb3 max_credits 10 < credits < 60000 */ }; @@ -646,6 +648,8 @@ struct TCP_Server_Info { unsigned int max_read; unsigned int max_write; __u8 preauth_hash[512]; + struct delayed_work reconnect; /* reconnect workqueue job */ + struct mutex reconnect_mutex; /* prevent simultaneous reconnects */ #endif /* CONFIG_CIFS_SMB2 */ unsigned long echo_interval; }; @@ -827,6 +831,7 @@ struct cifs_ses { enum securityEnum sectype; /* what security flavor was specified? */ bool sign; /* is signing required? 
*/ bool need_reconnect:1; /* connection reset, uid now invalid */ + bool domainAuto:1; #ifdef CONFIG_CIFS_SMB2 __u16 session_flags; __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; @@ -849,6 +854,7 @@ cap_unix(struct cifs_ses *ses) struct cifs_tcon { struct list_head tcon_list; int tc_count; + struct list_head rlist; /* reconnect list */ struct list_head openFileList; spinlock_t open_file_lock; /* protects list above */ struct cifs_ses *ses; /* pointer to session associated with */ @@ -922,6 +928,7 @@ struct cifs_tcon { bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */ bool broken_sparse_sup; /* if server or share does not support sparse */ bool need_reconnect:1; /* connection reset, tid now invalid */ + bool need_reopen_files:1; /* need to reopen tcon file handles */ bool use_resilient:1; /* use resilient instead of durable handles */ bool use_persistent:1; /* use persistent instead of durable handles */ #ifdef CONFIG_CIFS_SMB2 @@ -932,6 +939,7 @@ struct cifs_tcon { __u32 maximal_access; __u32 vol_serial_number; __le64 vol_create_time; + __u64 snapshot_time; /* for timewarp tokens - timestamp of snapshot */ __u32 ss_flags; /* sector size flags */ __u32 perf_sector_size; /* best sector size for perf */ __u32 max_chunks; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index ced0e42ce460..c7b3c841e660 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -63,7 +63,8 @@ extern void exit_cifs_spnego(void); extern char *build_path_from_dentry(struct dentry *); extern char *cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, - struct cifs_tcon *tcon); + struct cifs_tcon *tcon, + int add_treename); extern char *build_wildcard_path_from_dentry(struct dentry *direntry); extern char *cifs_compose_mount_options(const char *sb_mountdata, const char *fullpath, const struct dfs_info3_param *ref, @@ -206,6 +207,9 @@ extern void cifs_add_pending_open_locked(struct cifs_fid *fid, struct tcon_link *tlink, struct cifs_pending_open *open); extern void cifs_del_pending_open(struct cifs_pending_open *open); +extern void cifs_put_tcp_session(struct TCP_Server_Info *server, + int from_reconnect); +extern void cifs_put_tcon(struct cifs_tcon *tcon); #if IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) extern void cifs_dfs_release_automount_timer(void); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index e3fed9249a04..b47261858e6d 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -35,7 +35,7 @@ #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/task_io_accounting_ops.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "cifspdu.h" #include "cifsglob.h" #include "cifsacl.h" diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 4547aeddd12b..35ae49ed1f76 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -34,13 +34,14 @@ #include <linux/pagevec.h> #include <linux/freezer.h> #include <linux/namei.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/processor.h> #include <linux/inet.h> #include <linux/module.h> #include <keys/user-type.h> #include <net/ipv6.h> #include <linux/parser.h> +#include <linux/bvec.h> #include "cifspdu.h" #include "cifsglob.h" @@ -52,6 +53,9 @@ #include "nterr.h" #include "rfc1002pdu.h" #include "fscache.h" +#ifdef CONFIG_CIFS_SMB2 +#include "smb2proto.h" +#endif #define CIFS_PORT 445 #define RFC1001_PORT 139 @@ -88,6 +92,7 @@ enum { Opt_multiuser, Opt_sloppy, Opt_nosharesock, Opt_persistent, Opt_nopersistent, Opt_resilient, Opt_noresilient, + Opt_domainauto, /* Mount options which take 
numeric value */ Opt_backupuid, Opt_backupgid, Opt_uid, @@ -95,6 +100,7 @@ enum { Opt_dirmode, Opt_port, Opt_rsize, Opt_wsize, Opt_actimeo, Opt_echo_interval, Opt_max_credits, + Opt_snapshot, /* Mount options which take string value */ Opt_user, Opt_pass, Opt_ip, @@ -176,6 +182,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_nopersistent, "nopersistenthandles"}, { Opt_resilient, "resilienthandles"}, { Opt_noresilient, "noresilienthandles"}, + { Opt_domainauto, "domainauto"}, { Opt_backupuid, "backupuid=%s" }, { Opt_backupgid, "backupgid=%s" }, @@ -191,6 +198,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_actimeo, "actimeo=%s" }, { Opt_echo_interval, "echo_interval=%s" }, { Opt_max_credits, "max_credits=%s" }, + { Opt_snapshot, "snapshot=%s" }, { Opt_blank_user, "user=" }, { Opt_blank_user, "username=" }, @@ -1499,6 +1507,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, case Opt_noresilient: vol->resilient = false; /* already the default */ break; + case Opt_domainauto: + vol->domainauto = true; + break; /* Numeric Values */ case Opt_backupuid: @@ -1601,6 +1612,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } vol->echo_interval = option; break; + case Opt_snapshot: + if (get_option_ul(args, &option)) { + cifs_dbg(VFS, "%s: Invalid snapshot time\n", + __func__); + goto cifs_parse_mount_err; + } + vol->snapshot_time = option; + break; case Opt_max_credits: if (get_option_ul(args, &option) || (option < 20) || (option > 60000)) { @@ -2100,8 +2119,8 @@ cifs_find_tcp_session(struct smb_vol *vol) return NULL; } -static void -cifs_put_tcp_session(struct TCP_Server_Info *server) +void +cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) { struct task_struct *task; @@ -2118,6 +2137,19 @@ cifs_put_tcp_session(struct TCP_Server_Info *server) cancel_delayed_work_sync(&server->echo); +#ifdef CONFIG_CIFS_SMB2 + if (from_reconnect) + /* + * Avoid deadlock here: reconnect work calls + * cifs_put_tcp_session() at its end. Need to be sure + * that reconnect work does nothing with server pointer after + * that step. 
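The comment being added here is worth unpacking: cancel_delayed_work_sync() waits for any running instance of the work item to finish, so calling it from a path that the work item itself executes would deadlock on our own execution. A condensed sketch of the pattern, with a hypothetical struct conn standing in for TCP_Server_Info:

    #include <linux/workqueue.h>

    struct conn {
            struct delayed_work reconnect;
            /* ... */
    };

    static void conn_shutdown(struct conn *c, bool from_reconnect)
    {
            if (from_reconnect)
                    /* We are on the reconnect work's own call chain:
                     * unqueue any pending run, but never wait for it. */
                    cancel_delayed_work(&c->reconnect);
            else
                    /* No reconnect work on our stack, waiting is safe. */
                    cancel_delayed_work_sync(&c->reconnect);
            /* ... drop reference, possibly free 'c' ... */
    }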
+ */ + cancel_delayed_work(&server->reconnect); + else + cancel_delayed_work_sync(&server->reconnect); +#endif + spin_lock(&GlobalMid_Lock); server->tcpStatus = CifsExiting; spin_unlock(&GlobalMid_Lock); @@ -2182,6 +2214,10 @@ cifs_get_tcp_session(struct smb_vol *volume_info) INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); INIT_LIST_HEAD(&tcp_ses->smb_ses_list); INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request); +#ifdef CONFIG_CIFS_SMB2 + INIT_DELAYED_WORK(&tcp_ses->reconnect, smb2_reconnect_server); + mutex_init(&tcp_ses->reconnect_mutex); +#endif memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr, sizeof(tcp_ses->srcaddr)); memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr, @@ -2340,7 +2376,7 @@ cifs_put_smb_ses(struct cifs_ses *ses) spin_unlock(&cifs_tcp_ses_lock); sesInfoFree(ses); - cifs_put_tcp_session(server); + cifs_put_tcp_session(server, 0); } #ifdef CONFIG_KEYS @@ -2514,7 +2550,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) mutex_unlock(&ses->session_mutex); /* existing SMB ses has a server reference already */ - cifs_put_tcp_session(server); + cifs_put_tcp_session(server, 0); free_xid(xid); return ses; } @@ -2548,6 +2584,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) if (!ses->domainName) goto get_ses_fail; } + if (volume_info->domainauto) + ses->domainAuto = volume_info->domainauto; ses->cred_uid = volume_info->cred_uid; ses->linux_uid = volume_info->linux_uid; @@ -2586,7 +2624,7 @@ static int match_tcon(struct cifs_tcon *tcon, const char *unc) } static struct cifs_tcon * -cifs_find_tcon(struct cifs_ses *ses, const char *unc) +cifs_find_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) { struct list_head *tmp; struct cifs_tcon *tcon; @@ -2594,8 +2632,14 @@ cifs_find_tcon(struct cifs_ses *ses, const char *unc) spin_lock(&cifs_tcp_ses_lock); list_for_each(tmp, &ses->tcon_list) { tcon = list_entry(tmp, struct cifs_tcon, tcon_list); - if (!match_tcon(tcon, unc)) + if (!match_tcon(tcon, volume_info->UNC)) continue; + +#ifdef CONFIG_CIFS_SMB2 + if (tcon->snapshot_time != volume_info->snapshot_time) + continue; +#endif /* CONFIG_CIFS_SMB2 */ + ++tcon->tc_count; spin_unlock(&cifs_tcp_ses_lock); return tcon; @@ -2604,7 +2648,7 @@ cifs_find_tcon(struct cifs_ses *ses, const char *unc) return NULL; } -static void +void cifs_put_tcon(struct cifs_tcon *tcon) { unsigned int xid; @@ -2636,7 +2680,7 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) int rc, xid; struct cifs_tcon *tcon; - tcon = cifs_find_tcon(ses, volume_info->UNC); + tcon = cifs_find_tcon(ses, volume_info); if (tcon) { cifs_dbg(FYI, "Found match on UNC path\n"); /* existing tcon already has a reference */ @@ -2657,6 +2701,22 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) goto out_fail; } + if (volume_info->snapshot_time) { +#ifdef CONFIG_CIFS_SMB2 + if (ses->server->vals->protocol_id == 0) { + cifs_dbg(VFS, + "Use SMB2 or later for snapshot mount option\n"); + rc = -EOPNOTSUPP; + goto out_fail; + } else + tcon->snapshot_time = volume_info->snapshot_time; +#else + cifs_dbg(VFS, "Snapshot mount option requires SMB2 support\n"); + rc = -EOPNOTSUPP; + goto out_fail; +#endif /* CONFIG_CIFS_SMB2 */ + } + tcon->ses = ses; if (volume_info->password) { tcon->password = kstrdup(volume_info->password, GFP_KERNEL); @@ -3706,7 +3766,8 @@ remote_path_check: /* * cifs_build_path_to_root works only when we have a valid tcon */ - full_path = cifs_build_path_to_root(volume_info, cifs_sb, tcon); + full_path = 
cifs_build_path_to_root(volume_info, cifs_sb, tcon, + tcon->Flags & SMB_SHARE_IS_IN_DFS); if (full_path == NULL) { rc = -ENOMEM; goto mount_fail_check; @@ -3792,7 +3853,7 @@ mount_fail_check: else if (ses) cifs_put_smb_ses(ses); else - cifs_put_tcp_session(server); + cifs_put_tcp_session(server, 0); bdi_destroy(&cifs_sb->bdi); } @@ -4103,7 +4164,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) ses = cifs_get_smb_ses(master_tcon->ses->server, vol_info); if (IS_ERR(ses)) { tcon = (struct cifs_tcon *)ses; - cifs_put_tcp_session(master_tcon->ses->server); + cifs_put_tcp_session(master_tcon->ses->server, 0); goto out; } diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 789ff1df2d8d..2c227a99f369 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -47,7 +47,7 @@ renew_parental_timestamps(struct dentry *direntry) char * cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, - struct cifs_tcon *tcon) + struct cifs_tcon *tcon, int add_treename) { int pplen = vol->prepath ? strlen(vol->prepath) + 1 : 0; int dfsplen; @@ -59,7 +59,7 @@ cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, return full_path; } - if (tcon->Flags & SMB_SHARE_IS_IN_DFS) + if (add_treename) dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); else dfsplen = 0; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 7f5f6176c6f1..18a1e1d6671f 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -777,6 +777,11 @@ cifs_reopen_persistent_handles(struct cifs_tcon *tcon) struct list_head *tmp1; struct list_head tmp_list; + if (!tcon->use_persistent || !tcon->need_reopen_files) + return; + + tcon->need_reopen_files = false; + cifs_dbg(FYI, "Reopen persistent handles"); INIT_LIST_HEAD(&tmp_list); @@ -793,7 +798,8 @@ cifs_reopen_persistent_handles(struct cifs_tcon *tcon) list_for_each_safe(tmp, tmp1, &tmp_list) { open_file = list_entry(tmp, struct cifsFileInfo, rlist); - cifs_reopen_file(open_file, false /* do not flush */); + if (cifs_reopen_file(open_file, false /* do not flush */)) + tcon->need_reopen_files = true; list_del_init(&open_file->rlist); cifsFileInfo_put(open_file); } diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 9f51b81119f2..001528781b6b 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -189,7 +189,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) xid = get_xid(); cifs_sb = CIFS_SB(inode->i_sb); - cifs_dbg(VFS, "cifs ioctl 0x%x\n", command); + cifs_dbg(FYI, "cifs ioctl 0x%x\n", command); switch (command) { case FS_IOC_GETFLAGS: if (pSMBFile == NULL) diff --git a/fs/cifs/link.c b/fs/cifs/link.c index d031af8d3d4d..c4d996f78e1c 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -45,13 +45,8 @@ (CIFS_MF_SYMLINK_LINK_OFFSET + CIFS_MF_SYMLINK_LINK_MAXLEN) #define CIFS_MF_SYMLINK_LEN_FORMAT "XSym\n%04u\n" -#define CIFS_MF_SYMLINK_MD5_FORMAT \ - "%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n" -#define CIFS_MF_SYMLINK_MD5_ARGS(md5_hash) \ - md5_hash[0], md5_hash[1], md5_hash[2], md5_hash[3], \ - md5_hash[4], md5_hash[5], md5_hash[6], md5_hash[7], \ - md5_hash[8], md5_hash[9], md5_hash[10], md5_hash[11],\ - md5_hash[12], md5_hash[13], md5_hash[14], md5_hash[15] +#define CIFS_MF_SYMLINK_MD5_FORMAT "%16phN\n" +#define CIFS_MF_SYMLINK_MD5_ARGS(md5_hash) md5_hash static int symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 8f6a2a5863b9..a27fc8791551 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -285,6 +285,7 @@ 
initiate_cifs_search(const unsigned int xid, struct file *file) rc = -ENOMEM; goto error_exit; } + spin_lock_init(&cifsFile->file_info_lock); file->private_data = cifsFile; cifsFile->tlink = cifs_get_tlink(tlink); tcon = tlink_tcon(tlink); diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index f9e766f464be..b2aff0c6f22c 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -260,7 +260,7 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile) * and check it for zero before using. */ max_buf = tlink_tcon(cfile->tlink)->ses->server->maxBuf; - if (!max_buf) { + if (max_buf < sizeof(struct smb2_lock_element)) { free_xid(xid); return -EINVAL; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 5ca5ea4668a1..87457227812c 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -250,16 +250,19 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) } cifs_mark_open_files_invalid(tcon); + if (tcon->use_persistent) + tcon->need_reopen_files = true; rc = SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nls_codepage); mutex_unlock(&tcon->ses->session_mutex); - if (tcon->use_persistent) - cifs_reopen_persistent_handles(tcon); - cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); if (rc) goto out; + + if (smb2_command != SMB2_INTERNAL_CMD) + queue_delayed_work(cifsiod_wq, &server->reconnect, 0); + atomic_inc(&tconInfoReconnectCount); out: /* @@ -280,7 +283,7 @@ out: case SMB2_CHANGE_NOTIFY: case SMB2_QUERY_INFO: case SMB2_SET_INFO: - return -EAGAIN; + rc = -EAGAIN; } unload_nls(nls_codepage); return rc; @@ -1972,6 +1975,55 @@ smb2_echo_callback(struct mid_q_entry *mid) add_credits(server, credits_received, CIFS_ECHO_OP); } +void smb2_reconnect_server(struct work_struct *work) +{ + struct TCP_Server_Info *server = container_of(work, + struct TCP_Server_Info, reconnect.work); + struct cifs_ses *ses; + struct cifs_tcon *tcon, *tcon2; + struct list_head tmp_list; + int tcon_exist = false; + + /* Prevent simultaneous reconnects that can corrupt tcon->rlist list */ + mutex_lock(&server->reconnect_mutex); + + INIT_LIST_HEAD(&tmp_list); + cifs_dbg(FYI, "Need negotiate, reconnecting tcons\n"); + + spin_lock(&cifs_tcp_ses_lock); + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { + if (tcon->need_reconnect || tcon->need_reopen_files) { + tcon->tc_count++; + list_add_tail(&tcon->rlist, &tmp_list); + tcon_exist = true; + } + } + } + /* + * Get the reference to server struct to be sure that the last call of + * cifs_put_tcon() in the loop below won't release the server pointer. 
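smb2_reconnect_server() in the hunk below uses a common SMP idiom: while holding cifs_tcp_ses_lock it only bumps reference counts and links matching tcons onto a private list through the new rlist member, then drops the lock before doing the blocking reconnect work. A generic sketch with hypothetical types:

    #include <linux/list.h>
    #include <linux/spinlock.h>

    struct item {
            struct list_head list;          /* shared, lock-protected list */
            struct list_head rlist;         /* private work-list linkage */
            int refcount;                   /* simplified take on tc_count */
    };

    static DEFINE_SPINLOCK(shared_lock);
    static LIST_HEAD(shared_list);

    static void process_flagged(void)
    {
            struct item *it, *next;
            LIST_HEAD(tmp);

            spin_lock(&shared_lock);
            list_for_each_entry(it, &shared_list, list) {
                    it->refcount++;                 /* keep alive unlocked */
                    list_add_tail(&it->rlist, &tmp);
            }
            spin_unlock(&shared_lock);

            list_for_each_entry_safe(it, next, &tmp, rlist) {
                    /* ... sleeping work on 'it' with the lock dropped ... */
                    list_del_init(&it->rlist);
                    /* ... drop the reference taken above ... */
            }
    }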
+ */ + if (tcon_exist) + server->srv_count++; + + spin_unlock(&cifs_tcp_ses_lock); + + list_for_each_entry_safe(tcon, tcon2, &tmp_list, rlist) { + if (!smb2_reconnect(SMB2_INTERNAL_CMD, tcon)) + cifs_reopen_persistent_handles(tcon); + list_del_init(&tcon->rlist); + cifs_put_tcon(tcon); + } + + cifs_dbg(FYI, "Reconnecting tcons finished\n"); + mutex_unlock(&server->reconnect_mutex); + + /* now we can safely release srv struct */ + if (tcon_exist) + cifs_put_tcp_session(server, 1); +} + int SMB2_echo(struct TCP_Server_Info *server) { @@ -1984,32 +2036,11 @@ SMB2_echo(struct TCP_Server_Info *server) cifs_dbg(FYI, "In echo request\n"); if (server->tcpStatus == CifsNeedNegotiate) { - struct list_head *tmp, *tmp2; - struct cifs_ses *ses; - struct cifs_tcon *tcon; - - cifs_dbg(FYI, "Need negotiate, reconnecting tcons\n"); - spin_lock(&cifs_tcp_ses_lock); - list_for_each(tmp, &server->smb_ses_list) { - ses = list_entry(tmp, struct cifs_ses, smb_ses_list); - list_for_each(tmp2, &ses->tcon_list) { - tcon = list_entry(tmp2, struct cifs_tcon, - tcon_list); - /* add check for persistent handle reconnect */ - if (tcon && tcon->need_reconnect) { - spin_unlock(&cifs_tcp_ses_lock); - rc = smb2_reconnect(SMB2_ECHO, tcon); - spin_lock(&cifs_tcp_ses_lock); - } - } - } - spin_unlock(&cifs_tcp_ses_lock); + /* No need to send echo on newly established connections */ + queue_delayed_work(cifsiod_wq, &server->reconnect, 0); + return rc; } - /* if no session, renegotiate failed above */ - if (server->tcpStatus == CifsNeedNegotiate) - return -EIO; - rc = small_smb2_init(SMB2_ECHO, NULL, (void **)&req); if (rc) return rc; diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index fd3709e8de33..dc0d141f33e2 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -80,6 +80,8 @@ #define SMB2_SET_INFO cpu_to_le16(SMB2_SET_INFO_HE) #define SMB2_OPLOCK_BREAK cpu_to_le16(SMB2_OPLOCK_BREAK_HE) +#define SMB2_INTERNAL_CMD cpu_to_le16(0xFFFF) + #define NUMBER_OF_SMB2_COMMANDS 0x0013 /* BB FIXME - analyze following length BB */ diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index eb2cde2f64ba..f2d511a6971b 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -96,6 +96,7 @@ extern int smb2_open_file(const unsigned int xid, extern int smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, const unsigned int xid); extern int smb2_push_mandatory_locks(struct cifsFileInfo *cfile); +extern void smb2_reconnect_server(struct work_struct *work); /* * SMB2 Worker functions - most of protocol specific implementation details diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index 699b7868108f..c12bffefa3c9 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -23,7 +23,7 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
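The SMB2_echo() rewrite in this hunk replaces an open-coded reconnect loop with queue_delayed_work(..., 0), the usual idiom for "run this soon, asynchronously, in process context", so the echo path itself never sleeps on a reconnect. A minimal sketch (the container_of() recovery of the server object is elided):

    #include <linux/workqueue.h>

    static void reconnect_fn(struct work_struct *work)
    {
            /* real code: container_of(work, struct TCP_Server_Info,
             * reconnect.work), then walk the ses/tcon lists */
    }

    static DECLARE_DELAYED_WORK(reconnect_work, reconnect_fn);

    static void kick_reconnect(struct workqueue_struct *wq)
    {
            /* delay of 0 jiffies: run as soon as a worker is free */
            queue_delayed_work(wq, &reconnect_work, 0);
    }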
*/ -#include <crypto/skcipher.h> +#include <linux/crypto.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/fs.h> @@ -69,46 +69,22 @@ str_to_key(unsigned char *str, unsigned char *key) static int smbhash(unsigned char *out, const unsigned char *in, unsigned char *key) { - int rc; unsigned char key2[8]; - struct crypto_skcipher *tfm_des; - struct scatterlist sgin, sgout; - struct skcipher_request *req; + struct crypto_cipher *tfm_des; str_to_key(key, key2); - tfm_des = crypto_alloc_skcipher("ecb(des)", 0, CRYPTO_ALG_ASYNC); + tfm_des = crypto_alloc_cipher("des", 0, 0); if (IS_ERR(tfm_des)) { - rc = PTR_ERR(tfm_des); - cifs_dbg(VFS, "could not allocate des crypto API\n"); - goto smbhash_err; - } - - req = skcipher_request_alloc(tfm_des, GFP_KERNEL); - if (!req) { - rc = -ENOMEM; cifs_dbg(VFS, "could not allocate des crypto API\n"); - goto smbhash_free_skcipher; + return PTR_ERR(tfm_des); } - crypto_skcipher_setkey(tfm_des, key2, 8); - - sg_init_one(&sgin, in, 8); - sg_init_one(&sgout, out, 8); + crypto_cipher_setkey(tfm_des, key2, 8); + crypto_cipher_encrypt_one(tfm_des, out, in); + crypto_free_cipher(tfm_des); - skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &sgin, &sgout, 8, NULL); - - rc = crypto_skcipher_encrypt(req); - if (rc) - cifs_dbg(VFS, "could not encrypt crypt key rc: %d\n", rc); - - skcipher_request_free(req); - -smbhash_free_skcipher: - crypto_free_skcipher(tfm_des); -smbhash_err: - return rc; + return 0; } static int diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 206a597b2293..fbb84c08e3cd 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -28,8 +28,9 @@ #include <linux/delay.h> #include <linux/freezer.h> #include <linux/tcp.h> +#include <linux/bvec.h> #include <linux/highmem.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/processor.h> #include <linux/mempool.h> #include "cifspdu.h" diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index 1bfb7ba4e85e..f13e09057c6b 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -17,7 +17,6 @@ static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2) } static const struct inode_operations coda_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = page_get_link, .setattr = coda_setattr, }; diff --git a/fs/compat.c b/fs/compat.c index bd064a2c3550..e50a2114f474 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -49,7 +49,7 @@ #include <linux/pagemap.h> #include <linux/aio.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/mmu_context.h> #include <asm/ioctls.h> #include "internal.h" @@ -253,9 +253,9 @@ COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf) { - if (sizeof ubuf->f_blocks == 4) { - if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail | - kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL) + if (sizeof(ubuf->f_bsize) == 4) { + if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen | + kbuf->f_frsize | kbuf->f_flags) & 0xffffffff00000000ULL) return -EOVERFLOW; /* f_files and f_ffree may be -1; it's okay * to stuff that into 32 bits */ @@ -487,45 +487,6 @@ COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, return compat_sys_fcntl64(fd, cmd, arg); } -COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_reqs, u32 __user *, ctx32p) -{ - long ret; - aio_context_t ctx64; - - mm_segment_t oldfs = get_fs(); - if (unlikely(get_user(ctx64, ctx32p))) - return -EFAULT; 
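The smbhash() conversion above is a good illustration of picking the lightest crypto interface that fits: for a single 8-byte DES block there is no need for scatterlists, request objects, or completion callbacks, so the skcipher machinery collapses into the synchronous single-block crypto_cipher API. A self-contained sketch of the same call sequence:

    #include <linux/crypto.h>
    #include <linux/err.h>
    #include <linux/types.h>

    static int des_encrypt_block(const u8 key[8], const u8 in[8], u8 out[8])
    {
            struct crypto_cipher *tfm = crypto_alloc_cipher("des", 0, 0);
            int rc;

            if (IS_ERR(tfm))
                    return PTR_ERR(tfm);

            rc = crypto_cipher_setkey(tfm, key, 8);
            if (!rc)
                    crypto_cipher_encrypt_one(tfm, out, in);
            crypto_free_cipher(tfm);
            return rc;
    }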
- - set_fs(KERNEL_DS); - /* The __user pointer cast is valid because of the set_fs() */ - ret = sys_io_setup(nr_reqs, (aio_context_t __user *) &ctx64); - set_fs(oldfs); - /* truncating is ok because it's a user address */ - if (!ret) - ret = put_user((u32) ctx64, ctx32p); - return ret; -} - -COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id, - compat_long_t, min_nr, - compat_long_t, nr, - struct io_event __user *, events, - struct compat_timespec __user *, timeout) -{ - struct timespec t; - struct timespec __user *ut = NULL; - - if (timeout) { - if (compat_get_timespec(&t, timeout)) - return -EFAULT; - - ut = compat_alloc_user_space(sizeof(*ut)); - if (copy_to_user(ut, &t, sizeof(t)) ) - return -EFAULT; - } - return sys_io_getevents(ctx_id, min_nr, nr, events, ut); -} - /* A write operation does a read from user space and vice versa */ #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) @@ -602,42 +563,6 @@ out: return ret; } -static inline long -copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) -{ - compat_uptr_t uptr; - int i; - - for (i = 0; i < nr; ++i) { - if (get_user(uptr, ptr32 + i)) - return -EFAULT; - if (put_user(compat_ptr(uptr), ptr64 + i)) - return -EFAULT; - } - return 0; -} - -#define MAX_AIO_SUBMITS (PAGE_SIZE/sizeof(struct iocb *)) - -COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, - int, nr, u32 __user *, iocb) -{ - struct iocb __user * __user *iocb64; - long ret; - - if (unlikely(nr < 0)) - return -EINVAL; - - if (nr > MAX_AIO_SUBMITS) - nr = MAX_AIO_SUBMITS; - - iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64)); - ret = copy_iocb(nr, iocb, iocb64); - if (!ret) - ret = do_io_submit(ctx_id, nr, iocb64, 1); - return ret; -} - struct compat_ncp_mount_data { compat_int_t version; compat_uint_t ncp_fd; diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index f2d7402abe02..11d087b2b28e 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -76,7 +76,7 @@ #include <scsi/sg.h> #endif -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/ethtool.h> #include <linux/mii.h> #include <linux/if_bonding.h> diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 2c6312db8516..39da1103d341 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -29,7 +29,7 @@ #include <linux/slab.h> #include <linux/mutex.h> #include <linux/vmalloc.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/configfs.h> #include "configfs_internal.h" diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index db6d69289608..a6ab012a2c6a 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -305,7 +305,6 @@ static const char *configfs_get_link(struct dentry *dentry, const struct inode_operations configfs_symlink_inode_operations = { .get_link = configfs_get_link, - .readlink = generic_readlink, .setattr = configfs_setattr, }; diff --git a/fs/coredump.c b/fs/coredump.c index eb9c92c9b20f..ae6b05629ca1 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -38,7 +38,7 @@ #include <linux/path.h> #include <linux/timekeeping.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/mmu_context.h> #include <asm/tlb.h> #include <asm/exec.h> @@ -833,3 +833,21 @@ int dump_align(struct coredump_params *cprm, int align) return mod ? dump_skip(cprm, align - mod) : 1; } EXPORT_SYMBOL(dump_align); + +/* + * Ensures that file size is big enough to contain the current file + * postion. 
This prevents gdb from complaining about a truncated file + * if the last "write" to the file was dump_skip. + */ +void dump_truncate(struct coredump_params *cprm) +{ + struct file *file = cprm->file; + loff_t offset; + + if (file->f_op->llseek && file->f_op->llseek != no_llseek) { + offset = file->f_op->llseek(file, 0, SEEK_CUR); + if (i_size_read(file->f_mapping->host) < offset) + do_truncate(file->f_path.dentry, offset, 0, file); + } +} +EXPORT_SYMBOL(dump_truncate); diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index 92348faf9865..f514978f6688 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -8,9 +8,7 @@ config FS_ENCRYPTION select CRYPTO_XTS select CRYPTO_CTS select CRYPTO_CTR - select CRYPTO_SHA256 select KEYS - select ENCRYPTED_KEYS help Enable encryption of files and directories. This feature is similar to ecryptfs, but it is more memory diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 98f87fe8f186..ac8e4f6a3773 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -27,7 +27,7 @@ #include <linux/bio.h> #include <linux/dcache.h> #include <linux/namei.h> -#include <linux/fscrypto.h> +#include "fscrypt_private.h" static unsigned int num_prealloc_crypto_pages = 32; static unsigned int num_prealloc_crypto_ctxs = 128; @@ -63,7 +63,7 @@ void fscrypt_release_ctx(struct fscrypt_ctx *ctx) { unsigned long flags; - if (ctx->flags & FS_WRITE_PATH_FL && ctx->w.bounce_page) { + if (ctx->flags & FS_CTX_HAS_BOUNCE_BUFFER_FL && ctx->w.bounce_page) { mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool); ctx->w.bounce_page = NULL; } @@ -88,7 +88,7 @@ EXPORT_SYMBOL(fscrypt_release_ctx); * Return: An allocated and initialized encryption context on success; error * value or NULL otherwise. */ -struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) +struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags) { struct fscrypt_ctx *ctx = NULL; struct fscrypt_info *ci = inode->i_crypt_info; @@ -121,7 +121,7 @@ struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) } else { ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL; } - ctx->flags &= ~FS_WRITE_PATH_FL; + ctx->flags &= ~FS_CTX_HAS_BOUNCE_BUFFER_FL; return ctx; } EXPORT_SYMBOL(fscrypt_get_ctx); @@ -146,9 +146,10 @@ typedef enum { FS_ENCRYPT, } fscrypt_direction_t; -static int do_page_crypto(struct inode *inode, - fscrypt_direction_t rw, pgoff_t index, +static int do_page_crypto(const struct inode *inode, + fscrypt_direction_t rw, u64 lblk_num, struct page *src_page, struct page *dest_page, + unsigned int len, unsigned int offs, gfp_t gfp_flags) { struct { @@ -162,6 +163,8 @@ static int do_page_crypto(struct inode *inode, struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; + BUG_ON(len == 0); + req = skcipher_request_alloc(tfm, gfp_flags); if (!req) { printk_ratelimited(KERN_ERR @@ -175,14 +178,14 @@ static int do_page_crypto(struct inode *inode, page_crypt_complete, &ecr); BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE); - xts_tweak.index = cpu_to_le64(index); + xts_tweak.index = cpu_to_le64(lblk_num); memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding)); sg_init_table(&dst, 1); - sg_set_page(&dst, dest_page, PAGE_SIZE, 0); + sg_set_page(&dst, dest_page, len, offs); sg_init_table(&src, 1); - sg_set_page(&src, src_page, PAGE_SIZE, 0); - skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, &xts_tweak); + sg_set_page(&src, src_page, len, offs); + skcipher_request_set_crypt(req, &src, &dst, len, &xts_tweak); if (rw == FS_DECRYPT) res = 
crypto_skcipher_decrypt(req); else @@ -207,34 +210,66 @@ static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags) ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags); if (ctx->w.bounce_page == NULL) return ERR_PTR(-ENOMEM); - ctx->flags |= FS_WRITE_PATH_FL; + ctx->flags |= FS_CTX_HAS_BOUNCE_BUFFER_FL; return ctx->w.bounce_page; } /** * fscypt_encrypt_page() - Encrypts a page - * @inode: The inode for which the encryption should take place - * @plaintext_page: The page to encrypt. Must be locked. - * @gfp_flags: The gfp flag for memory allocation + * @inode: The inode for which the encryption should take place + * @page: The page to encrypt. Must be locked for bounce-page + * encryption. + * @len: Length of data to encrypt in @page and encrypted + * data in returned page. + * @offs: Offset of data within @page and returned + * page holding encrypted data. + * @lblk_num: Logical block number. This must be unique for multiple + * calls with same inode, except when overwriting + * previously written data. + * @gfp_flags: The gfp flag for memory allocation * - * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx - * encryption context. + * Encrypts @page using the ctx encryption context. Performs encryption + * either in-place or into a newly allocated bounce page. + * Called on the page write path. * - * Called on the page write path. The caller must call + * Bounce page allocation is the default. + * In this case, the contents of @page are encrypted and stored in an + * allocated bounce page. @page has to be locked and the caller must call * fscrypt_restore_control_page() on the returned ciphertext page to * release the bounce buffer and the encryption context. * - * Return: An allocated page with the encrypted content on success. Else, an + * In-place encryption is used by setting the FS_CFLG_OWN_PAGES flag in + * fscrypt_operations. Here, the input-page is returned with its content + * encrypted. + * + * Return: A page with the encrypted content on success. Else, an * error value or NULL. */ -struct page *fscrypt_encrypt_page(struct inode *inode, - struct page *plaintext_page, gfp_t gfp_flags) +struct page *fscrypt_encrypt_page(const struct inode *inode, + struct page *page, + unsigned int len, + unsigned int offs, + u64 lblk_num, gfp_t gfp_flags) + { struct fscrypt_ctx *ctx; - struct page *ciphertext_page = NULL; + struct page *ciphertext_page = page; int err; - BUG_ON(!PageLocked(plaintext_page)); + BUG_ON(len % FS_CRYPTO_BLOCK_SIZE != 0); + + if (inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES) { + /* with inplace-encryption we just encrypt the page */ + err = do_page_crypto(inode, FS_ENCRYPT, lblk_num, + page, ciphertext_page, + len, offs, gfp_flags); + if (err) + return ERR_PTR(err); + + return ciphertext_page; + } + + BUG_ON(!PageLocked(page)); ctx = fscrypt_get_ctx(inode, gfp_flags); if (IS_ERR(ctx)) @@ -245,10 +280,10 @@ struct page *fscrypt_encrypt_page(struct inode *inode, if (IS_ERR(ciphertext_page)) goto errout; - ctx->w.control_page = plaintext_page; - err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index, - plaintext_page, ciphertext_page, - gfp_flags); + ctx->w.control_page = page; + err = do_page_crypto(inode, FS_ENCRYPT, lblk_num, + page, ciphertext_page, + len, offs, gfp_flags); if (err) { ciphertext_page = ERR_PTR(err); goto errout; @@ -265,8 +300,13 @@ errout: EXPORT_SYMBOL(fscrypt_encrypt_page); /** - * f2crypt_decrypt_page() - Decrypts a page in-place - * @page: The page to decrypt. 
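The new len and offs parameters threaded through do_page_crypto() above exist so a crypto request can cover one filesystem block inside a page rather than always a full PAGE_SIZE. The enabling primitive is that a scatterlist element can describe a sub-page region; a minimal sketch:

    #include <linux/scatterlist.h>

    static void sg_for_block(struct scatterlist *sg, struct page *page,
                             unsigned int len, unsigned int offs)
    {
            sg_init_table(sg, 1);
            /* maps only [offs, offs + len) of the page, not all of it */
            sg_set_page(sg, page, len, offs);
    }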
Must be locked. + * fscrypt_decrypt_page() - Decrypts a page in-place + * @inode: The corresponding inode for the page to decrypt. + * @page: The page to decrypt. Must be locked in case + * it is a writeback page (FS_CFLG_OWN_PAGES unset). + * @len: Number of bytes in @page to be decrypted. + * @offs: Start of data in @page. + * @lblk_num: Logical block number. * * Decrypts page in-place using the ctx encryption context. * @@ -274,16 +314,18 @@ EXPORT_SYMBOL(fscrypt_encrypt_page); * * Return: Zero on success, non-zero otherwise. */ -int fscrypt_decrypt_page(struct page *page) +int fscrypt_decrypt_page(const struct inode *inode, struct page *page, + unsigned int len, unsigned int offs, u64 lblk_num) { - BUG_ON(!PageLocked(page)); + if (!(inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES)) + BUG_ON(!PageLocked(page)); - return do_page_crypto(page->mapping->host, - FS_DECRYPT, page->index, page, page, GFP_NOFS); + return do_page_crypto(inode, FS_DECRYPT, lblk_num, page, page, len, + offs, GFP_NOFS); } EXPORT_SYMBOL(fscrypt_decrypt_page); -int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, +int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len) { struct fscrypt_ctx *ctx; @@ -306,7 +348,7 @@ int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, while (len--) { err = do_page_crypto(inode, FS_ENCRYPT, lblk, ZERO_PAGE(0), ciphertext_page, - GFP_NOFS); + PAGE_SIZE, 0, GFP_NOFS); if (err) goto errout; @@ -414,7 +456,8 @@ static void completion_pages(struct work_struct *work) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - int ret = fscrypt_decrypt_page(page); + int ret = fscrypt_decrypt_page(page->mapping->host, page, + PAGE_SIZE, 0, page->index); if (ret) { WARN_ON_ONCE(1); @@ -482,17 +525,22 @@ static void fscrypt_destroy(void) /** * fscrypt_initialize() - allocate major buffers for fs encryption. + * @cop_flags: fscrypt operations flags * * We only call this when we start accessing encrypted files, since it * results in memory getting allocated that wouldn't otherwise be used. * * Return: Zero on success, non-zero otherwise. */ -int fscrypt_initialize(void) +int fscrypt_initialize(unsigned int cop_flags) { int i, res = -ENOMEM; - if (fscrypt_bounce_page_pool) + /* + * No need to allocate a bounce page pool if there already is one or + * this FS won't use it. + */ + if (cop_flags & FS_CFLG_OWN_PAGES || fscrypt_bounce_page_pool) return 0; mutex_lock(&fscrypt_init_mutex); @@ -521,7 +569,6 @@ fail: mutex_unlock(&fscrypt_init_mutex); return res; } -EXPORT_SYMBOL(fscrypt_initialize); /** * fscrypt_init() - Set up for fs encryption. diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 9b774f4b50c8..56ad9d195f18 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -12,7 +12,7 @@ #include <linux/scatterlist.h> #include <linux/ratelimit.h> -#include <linux/fscrypto.h> +#include "fscrypt_private.h" /** * fname_crypt_complete() - completion callback for filename crypto @@ -209,7 +209,7 @@ static int digest_decode(const char *src, int len, char *dst) return cp - dst; } -u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen) +u32 fscrypt_fname_encrypted_size(const struct inode *inode, u32 ilen) { int padding = 32; struct fscrypt_info *ci = inode->i_crypt_info; @@ -227,7 +227,7 @@ EXPORT_SYMBOL(fscrypt_fname_encrypted_size); * Allocates an output buffer that is sufficient for the crypto operation * specified by the context and the direction. 
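completion_pages() above shows the read-side counterpart: once an encrypted read bio completes, each segment is decrypted in place from a workqueue before the page is marked up to date. A hedged sketch of that loop, using fscrypt_decrypt_page() with the (inode, page, len, offs, lblk_num) prototype this patch introduces:

    #include <linux/bio.h>
    #include <linux/fscrypto.h>
    #include <linux/pagemap.h>

    static void decrypt_read_bio(struct bio *bio)
    {
            struct bio_vec *bv;
            int i;

            bio_for_each_segment_all(bv, bio, i) {
                    struct page *page = bv->bv_page;
                    int ret = fscrypt_decrypt_page(page->mapping->host,
                                                   page, PAGE_SIZE, 0,
                                                   page->index);
                    if (ret)
                            SetPageError(page);
                    else
                            SetPageUptodate(page);
                    unlock_page(page);
            }
    }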
*/ -int fscrypt_fname_alloc_buffer(struct inode *inode, +int fscrypt_fname_alloc_buffer(const struct inode *inode, u32 ilen, struct fscrypt_str *crypto_str) { unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen); @@ -350,7 +350,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, fname->disk_name.len = iname->len; return 0; } - ret = get_crypt_info(dir); + ret = fscrypt_get_crypt_info(dir); if (ret && ret != -EOPNOTSUPP) return ret; diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h new file mode 100644 index 000000000000..aeab032d7d35 --- /dev/null +++ b/fs/crypto/fscrypt_private.h @@ -0,0 +1,93 @@ +/* + * fscrypt_private.h + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption key functions. + * + * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. + */ + +#ifndef _FSCRYPT_PRIVATE_H +#define _FSCRYPT_PRIVATE_H + +#include <linux/fscrypto.h> + +#define FS_FNAME_CRYPTO_DIGEST_SIZE 32 + +/* Encryption parameters */ +#define FS_XTS_TWEAK_SIZE 16 +#define FS_AES_128_ECB_KEY_SIZE 16 +#define FS_AES_256_GCM_KEY_SIZE 32 +#define FS_AES_256_CBC_KEY_SIZE 32 +#define FS_AES_256_CTS_KEY_SIZE 32 +#define FS_AES_256_XTS_KEY_SIZE 64 +#define FS_MAX_KEY_SIZE 64 + +#define FS_KEY_DESC_PREFIX "fscrypt:" +#define FS_KEY_DESC_PREFIX_SIZE 8 + +#define FS_KEY_DERIVATION_NONCE_SIZE 16 + +/** + * Encryption context for inode + * + * Protector format: + * 1 byte: Protector format (1 = this version) + * 1 byte: File contents encryption mode + * 1 byte: File names encryption mode + * 1 byte: Flags + * 8 bytes: Master Key descriptor + * 16 bytes: Encryption Key derivation nonce + */ +struct fscrypt_context { + u8 format; + u8 contents_encryption_mode; + u8 filenames_encryption_mode; + u8 flags; + u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; + u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; +} __packed; + +#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 + +/* This is passed in from userspace into the kernel keyring */ +struct fscrypt_key { + u32 mode; + u8 raw[FS_MAX_KEY_SIZE]; + u32 size; +} __packed; + +/* + * A pointer to this structure is stored in the file system's in-core + * representation of an inode. 
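Because fscrypt_context in the new header above is written verbatim into an xattr, its layout is ABI: the __packed attribute plus fixed-width u8 arrays guarantee there is no compiler padding. A compile-time check makes that assumption explicit; assuming FS_KEY_DESCRIPTOR_SIZE is 8, the struct must be 1 + 1 + 1 + 1 + 8 + 16 = 28 bytes:

    #include <linux/bug.h>

    static inline void fscrypt_context_layout_check(void)
    {
            /* fails the build if padding ever sneaks in */
            BUILD_BUG_ON(sizeof(struct fscrypt_context) != 28);
    }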
+ */ +struct fscrypt_info { + u8 ci_data_mode; + u8 ci_filename_mode; + u8 ci_flags; + struct crypto_skcipher *ci_ctfm; + struct key *ci_keyring_key; + u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE]; +}; + +#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 +#define FS_CTX_HAS_BOUNCE_BUFFER_FL 0x00000002 + +struct fscrypt_completion_result { + struct completion completion; + int res; +}; + +#define DECLARE_FS_COMPLETION_RESULT(ecr) \ + struct fscrypt_completion_result ecr = { \ + COMPLETION_INITIALIZER((ecr).completion), 0 } + + +/* crypto.c */ +int fscrypt_initialize(unsigned int cop_flags); + +/* keyinfo.c */ +extern int fscrypt_get_crypt_info(struct inode *); + +#endif /* _FSCRYPT_PRIVATE_H */ diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 67fb6d8876d0..95cd4c3b06c3 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -10,7 +10,7 @@ #include <keys/user-type.h> #include <linux/scatterlist.h> -#include <linux/fscrypto.h> +#include "fscrypt_private.h" static void derive_crypt_complete(struct crypto_async_request *req, int rc) { @@ -178,7 +178,7 @@ static void put_crypt_info(struct fscrypt_info *ci) kmem_cache_free(fscrypt_info_cachep, ci); } -int get_crypt_info(struct inode *inode) +int fscrypt_get_crypt_info(struct inode *inode) { struct fscrypt_info *crypt_info; struct fscrypt_context ctx; @@ -188,7 +188,7 @@ int get_crypt_info(struct inode *inode) u8 *raw_key = NULL; int res; - res = fscrypt_initialize(); + res = fscrypt_initialize(inode->i_sb->s_cop->flags); if (res) return res; @@ -248,7 +248,8 @@ retry: goto out; if (fscrypt_dummy_context_enabled(inode)) { - memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE); + memset(raw_key, 0x42, keysize/2); + memset(raw_key+keysize/2, 0x24, keysize - (keysize/2)); goto got_key; } @@ -327,7 +328,7 @@ int fscrypt_get_encryption_info(struct inode *inode) (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED) | (1 << KEY_FLAG_DEAD))))) - return get_crypt_info(inode); + return fscrypt_get_crypt_info(inode); return 0; } EXPORT_SYMBOL(fscrypt_get_encryption_info); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 6865663aac69..d6cd7ea4851d 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -10,8 +10,8 @@ #include <linux/random.h> #include <linux/string.h> -#include <linux/fscrypto.h> #include <linux/mount.h> +#include "fscrypt_private.h" static int inode_has_encryption_context(struct inode *inode) { @@ -93,16 +93,19 @@ static int create_encryption_context_from_policy(struct inode *inode, return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL); } -int fscrypt_process_policy(struct file *filp, - const struct fscrypt_policy *policy) +int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) { + struct fscrypt_policy policy; struct inode *inode = file_inode(filp); int ret; + if (copy_from_user(&policy, arg, sizeof(policy))) + return -EFAULT; + if (!inode_owner_or_capable(inode)) return -EACCES; - if (policy->version != 0) + if (policy.version != 0) return -EINVAL; ret = mnt_want_write_file(filp); @@ -120,9 +123,9 @@ int fscrypt_process_policy(struct file *filp, ret = -ENOTEMPTY; else ret = create_encryption_context_from_policy(inode, - policy); + &policy); } else if (!is_encryption_context_consistent_with_policy(inode, - policy)) { + &policy)) { printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n", __func__); @@ -134,11 +137,13 @@ int fscrypt_process_policy(struct file *filp, mnt_drop_write_file(filp); return ret; } 
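fscrypt_ioctl_set_policy() in the hunk below now owns the copy_from_user() itself, which is the conventional shape for an ioctl handler: pull the fixed-size argument into the kernel first, validate it, and bracket the modification with mnt_want_write_file()/mnt_drop_write_file() so read-only and frozen mounts are honoured. A generic sketch with a hypothetical struct my_policy:

    #include <linux/fs.h>
    #include <linux/mount.h>
    #include <linux/types.h>
    #include <linux/uaccess.h>

    struct my_policy {
            __u8 version;
            __u8 flags;
    };

    static long my_set_policy(struct file *filp, const void __user *arg)
    {
            struct my_policy pol;
            int ret;

            if (copy_from_user(&pol, arg, sizeof(pol)))
                    return -EFAULT;
            if (pol.version != 0)
                    return -EINVAL;

            ret = mnt_want_write_file(filp);
            if (ret)
                    return ret;
            /* ... apply 'pol' to file_inode(filp) ... */
            mnt_drop_write_file(filp);
            return ret;
    }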
-EXPORT_SYMBOL(fscrypt_process_policy); +EXPORT_SYMBOL(fscrypt_ioctl_set_policy); -int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy) +int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) { + struct inode *inode = file_inode(filp); struct fscrypt_context ctx; + struct fscrypt_policy policy; int res; if (!inode->i_sb->s_cop->get_context || @@ -151,15 +156,18 @@ int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy) if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) return -EINVAL; - policy->version = 0; - policy->contents_encryption_mode = ctx.contents_encryption_mode; - policy->filenames_encryption_mode = ctx.filenames_encryption_mode; - policy->flags = ctx.flags; - memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, + policy.version = 0; + policy.contents_encryption_mode = ctx.contents_encryption_mode; + policy.filenames_encryption_mode = ctx.filenames_encryption_mode; + policy.flags = ctx.flags; + memcpy(policy.master_key_descriptor, ctx.master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); + + if (copy_to_user(arg, &policy, sizeof(policy))) + return -EFAULT; return 0; } -EXPORT_SYMBOL(fscrypt_get_policy); +EXPORT_SYMBOL(fscrypt_ioctl_get_policy); int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { @@ -171,6 +179,11 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) BUG_ON(1); } + /* No restrictions on file types which are never encrypted */ + if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && + !S_ISLNK(child->i_mode)) + return 1; + /* no restrictions if the parent directory is not encrypted */ if (!parent->i_sb->s_cop->is_encrypted(parent)) return 1; @@ -31,28 +31,15 @@ #include <linux/vmstat.h> #include <linux/pfn_t.h> #include <linux/sizes.h> +#include <linux/mmu_notifier.h> #include <linux/iomap.h> #include "internal.h" -/* - * We use lowest available bit in exceptional entry for locking, other two - * bits to determine entry type. In total 3 special bits. - */ -#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 3) -#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) -#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) -#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD) -#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK) -#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT)) -#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ - RADIX_DAX_SHIFT | (pmd ? 
RADIX_DAX_PMD : RADIX_DAX_PTE) | \ - RADIX_TREE_EXCEPTIONAL_ENTRY)) - /* We choose 4096 entries - same as per-zone page wait tables */ #define DAX_WAIT_TABLE_BITS 12 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) -wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; +static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; static int __init init_dax_wait_table(void) { @@ -64,14 +51,6 @@ static int __init init_dax_wait_table(void) } fs_initcall(init_dax_wait_table); -static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, - pgoff_t index) -{ - unsigned long hash = hash_long((unsigned long)mapping ^ index, - DAX_WAIT_TABLE_BITS); - return wait_table + hash; -} - static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) { struct request_queue *q = bdev->bd_queue; @@ -98,209 +77,52 @@ static void dax_unmap_atomic(struct block_device *bdev, blk_queue_exit(bdev->bd_queue); } -struct page *read_dax_sector(struct block_device *bdev, sector_t n) +static int dax_is_pmd_entry(void *entry) { - struct page *page = alloc_pages(GFP_KERNEL, 0); - struct blk_dax_ctl dax = { - .size = PAGE_SIZE, - .sector = n & ~((((int) PAGE_SIZE) / 512) - 1), - }; - long rc; - - if (!page) - return ERR_PTR(-ENOMEM); - - rc = dax_map_atomic(bdev, &dax); - if (rc < 0) - return ERR_PTR(rc); - memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE); - dax_unmap_atomic(bdev, &dax); - return page; + return (unsigned long)entry & RADIX_DAX_PMD; } -static bool buffer_written(struct buffer_head *bh) +static int dax_is_pte_entry(void *entry) { - return buffer_mapped(bh) && !buffer_unwritten(bh); + return !((unsigned long)entry & RADIX_DAX_PMD); } -/* - * When ext4 encounters a hole, it returns without modifying the buffer_head - * which means that we can't trust b_size. To cope with this, we set b_state - * to 0 before calling get_block and, if any bit is set, we know we can trust - * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is - * and would save us time calling get_block repeatedly. 
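The dax_is_*_entry() helpers added in this region all test flag bits stored directly in the radix tree's exceptional entry word: a pointer-sized value has a few low bits to spare once the exceptional-entry marker is accounted for, so each entry can carry its own type tags. An illustrative sketch (bit positions here are made up, not the kernel's actual RADIX_DAX_* values):

    #define EX_PMD          (1UL << 1)      /* entry covers a huge page */
    #define EX_HZP          (1UL << 2)      /* huge zero page */
    #define EX_EMPTY        (1UL << 3)      /* placeholder, no block yet */

    static int entry_is_pmd(void *entry)
    {
            return (unsigned long)entry & EX_PMD;
    }

    static int entry_is_zero(void *entry)
    {
            return (unsigned long)entry & EX_HZP;
    }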
- */ -static bool buffer_size_valid(struct buffer_head *bh) +static int dax_is_zero_entry(void *entry) { - return bh->b_state != 0; + return (unsigned long)entry & RADIX_DAX_HZP; } - -static sector_t to_sector(const struct buffer_head *bh, - const struct inode *inode) +static int dax_is_empty_entry(void *entry) { - sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); - - return sector; + return (unsigned long)entry & RADIX_DAX_EMPTY; } -static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, - loff_t start, loff_t end, get_block_t get_block, - struct buffer_head *bh) +struct page *read_dax_sector(struct block_device *bdev, sector_t n) { - loff_t pos = start, max = start, bh_max = start; - bool hole = false; - struct block_device *bdev = NULL; - int rw = iov_iter_rw(iter), rc; - long map_len = 0; + struct page *page = alloc_pages(GFP_KERNEL, 0); struct blk_dax_ctl dax = { - .addr = ERR_PTR(-EIO), + .size = PAGE_SIZE, + .sector = n & ~((((int) PAGE_SIZE) / 512) - 1), }; - unsigned blkbits = inode->i_blkbits; - sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1) - >> blkbits; - - if (rw == READ) - end = min(end, i_size_read(inode)); - - while (pos < end) { - size_t len; - if (pos == max) { - long page = pos >> PAGE_SHIFT; - sector_t block = page << (PAGE_SHIFT - blkbits); - unsigned first = pos - (block << blkbits); - long size; - - if (pos == bh_max) { - bh->b_size = PAGE_ALIGN(end - pos); - bh->b_state = 0; - rc = get_block(inode, block, bh, rw == WRITE); - if (rc) - break; - if (!buffer_size_valid(bh)) - bh->b_size = 1 << blkbits; - bh_max = pos - first + bh->b_size; - bdev = bh->b_bdev; - /* - * We allow uninitialized buffers for writes - * beyond EOF as those cannot race with faults - */ - WARN_ON_ONCE( - (buffer_new(bh) && block < file_blks) || - (rw == WRITE && buffer_unwritten(bh))); - } else { - unsigned done = bh->b_size - - (bh_max - (pos - first)); - bh->b_blocknr += done >> blkbits; - bh->b_size -= done; - } - - hole = rw == READ && !buffer_written(bh); - if (hole) { - size = bh->b_size - first; - } else { - dax_unmap_atomic(bdev, &dax); - dax.sector = to_sector(bh, inode); - dax.size = bh->b_size; - map_len = dax_map_atomic(bdev, &dax); - if (map_len < 0) { - rc = map_len; - break; - } - dax.addr += first; - size = map_len - first; - } - /* - * pos + size is one past the last offset for IO, - * so pos + size can overflow loff_t at extreme offsets. - * Cast to u64 to catch this and get the true minimum. - */ - max = min_t(u64, pos + size, end); - } - - if (iov_iter_rw(iter) == WRITE) { - len = copy_from_iter_pmem(dax.addr, max - pos, iter); - } else if (!hole) - len = copy_to_iter((void __force *) dax.addr, max - pos, - iter); - else - len = iov_iter_zero(max - pos, iter); - - if (!len) { - rc = -EFAULT; - break; - } + long rc; - pos += len; - if (!IS_ERR(dax.addr)) - dax.addr += len; - } + if (!page) + return ERR_PTR(-ENOMEM); + rc = dax_map_atomic(bdev, &dax); + if (rc < 0) + return ERR_PTR(rc); + memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE); dax_unmap_atomic(bdev, &dax); - - return (pos == start) ? 
rc : pos - start; -} - -/** - * dax_do_io - Perform I/O to a DAX file - * @iocb: The control block for this I/O - * @inode: The file which the I/O is directed at - * @iter: The addresses to do I/O from or to - * @get_block: The filesystem method used to translate file offsets to blocks - * @end_io: A filesystem callback for I/O completion - * @flags: See below - * - * This function uses the same locking scheme as do_blockdev_direct_IO: - * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the - * caller for writes. For reads, we take and release the i_mutex ourselves. - * If DIO_LOCKING is not set, the filesystem takes care of its own locking. - * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O - * is in progress. - */ -ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, - struct iov_iter *iter, get_block_t get_block, - dio_iodone_t end_io, int flags) -{ - struct buffer_head bh; - ssize_t retval = -EINVAL; - loff_t pos = iocb->ki_pos; - loff_t end = pos + iov_iter_count(iter); - - memset(&bh, 0, sizeof(bh)); - bh.b_bdev = inode->i_sb->s_bdev; - - if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) - inode_lock(inode); - - /* Protects against truncate */ - if (!(flags & DIO_SKIP_DIO_COUNT)) - inode_dio_begin(inode); - - retval = dax_io(inode, iter, pos, end, get_block, &bh); - - if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) - inode_unlock(inode); - - if (end_io) { - int err; - - err = end_io(iocb, pos, retval, bh.b_private); - if (err) - retval = err; - } - - if (!(flags & DIO_SKIP_DIO_COUNT)) - inode_dio_end(inode); - return retval; + return page; } -EXPORT_SYMBOL_GPL(dax_do_io); /* * DAX radix tree locking */ struct exceptional_entry_key { struct address_space *mapping; - unsigned long index; + pgoff_t entry_start; }; struct wait_exceptional_entry_queue { @@ -308,6 +130,26 @@ struct wait_exceptional_entry_queue { struct exceptional_entry_key key; }; +static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, + pgoff_t index, void *entry, struct exceptional_entry_key *key) +{ + unsigned long hash; + + /* + * If 'entry' is a PMD, align the 'index' that we use for the wait + * queue to the start of that PMD. This ensures that all offsets in + * the range covered by the PMD map to the same bit lock. 
+ */ + if (dax_is_pmd_entry(entry)) + index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1); + + key->mapping = mapping; + key->entry_start = index; + + hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS); + return wait_table + hash; +} + static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode, int sync, void *keyp) { @@ -316,7 +158,7 @@ static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode, container_of(wait, struct wait_exceptional_entry_queue, wait); if (key->mapping != ewait->key.mapping || - key->index != ewait->key.index) + key->entry_start != ewait->key.entry_start) return 0; return autoremove_wake_function(wait, mode, sync, NULL); } @@ -342,7 +184,7 @@ static inline void *lock_slot(struct address_space *mapping, void **slot) radix_tree_deref_slot_protected(slot, &mapping->tree_lock); entry |= RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(slot, (void *)entry); + radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); return (void *)entry; } @@ -356,7 +198,7 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot) radix_tree_deref_slot_protected(slot, &mapping->tree_lock); entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(slot, (void *)entry); + radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); return (void *)entry; } @@ -372,24 +214,24 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot) static void *get_unlocked_mapping_entry(struct address_space *mapping, pgoff_t index, void ***slotp) { - void *ret, **slot; + void *entry, **slot; struct wait_exceptional_entry_queue ewait; - wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index); + wait_queue_head_t *wq; init_wait(&ewait.wait); ewait.wait.func = wake_exceptional_entry_func; - ewait.key.mapping = mapping; - ewait.key.index = index; for (;;) { - ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, + entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); - if (!ret || !radix_tree_exceptional_entry(ret) || + if (!entry || !radix_tree_exceptional_entry(entry) || !slot_locked(mapping, slot)) { if (slotp) *slotp = slot; - return ret; + return entry; } + + wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key); prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); spin_unlock_irq(&mapping->tree_lock); @@ -399,52 +241,173 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, } } +static void dax_unlock_mapping_entry(struct address_space *mapping, + pgoff_t index) +{ + void *entry, **slot; + + spin_lock_irq(&mapping->tree_lock); + entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); + if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || + !slot_locked(mapping, slot))) { + spin_unlock_irq(&mapping->tree_lock); + return; + } + unlock_slot(mapping, slot); + spin_unlock_irq(&mapping->tree_lock); + dax_wake_mapping_entry_waiter(mapping, index, entry, false); +} + +static void put_locked_mapping_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + if (!radix_tree_exceptional_entry(entry)) { + unlock_page(entry); + put_page(entry); + } else { + dax_unlock_mapping_entry(mapping, index); + } +} + +/* + * Called when we are done with radix tree entry we looked up via + * get_unlocked_mapping_entry() and which we didn't lock in the end. 
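
dax_entry_waitqueue() folds (mapping, index) into one of the 4096 wait queues, and for PMD entries first rounds the index down to the PMD boundary so every offset in the 2 MiB range blocks on the same queue. A user-space sketch of that calculation; hash_long() is re-implemented here as the usual 64-bit golden-ratio multiply, an assumption rather than a copy of the kernel's helper:

    #include <stdio.h>

    #define DAX_WAIT_TABLE_BITS 12
    #define PAGE_SHIFT 12
    #define PMD_SHIFT 21

    /* stand-in for the kernel's hash_long() on 64-bit */
    static unsigned long hash_long(unsigned long val, unsigned int bits)
    {
        return (unsigned long)(((unsigned long long)val *
                                0x61C8864680B583EBULL) >> (64 - bits));
    }

    static unsigned long wait_slot(unsigned long mapping, unsigned long index,
                                   int entry_is_pmd)
    {
        /* every offset inside one PMD must block on the same queue */
        if (entry_is_pmd)
            index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);
        return hash_long(mapping ^ index, DAX_WAIT_TABLE_BITS);
    }

    int main(void)
    {
        unsigned long mapping = 0xffff880012345678UL;

        /* indices 0x200 and 0x2ff share a 2 MiB PMD: identical slots */
        printf("%lu %lu\n",
               wait_slot(mapping, 0x200, 1),
               wait_slot(mapping, 0x2ff, 1));
        return 0;
    }
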
+ */ +static void put_unlocked_mapping_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + if (!radix_tree_exceptional_entry(entry)) + return; + + /* We have to wake up next waiter for the radix tree entry lock */ + dax_wake_mapping_entry_waiter(mapping, index, entry, false); +} + /* * Find radix tree entry at given index. If it points to a page, return with * the page locked. If it points to the exceptional entry, return with the * radix tree entry locked. If the radix tree doesn't contain given index, * create empty exceptional entry for the index and return with it locked. * + * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will + * either return that locked entry or will return an error. This error will + * happen if there are any 4k entries (either zero pages or DAX entries) + * within the 2MiB range that we are requesting. + * + * We always favor 4k entries over 2MiB entries. There isn't a flow where we + * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB + * insertion will fail if it finds any 4k entries already in the tree, and a + * 4k insertion will cause an existing 2MiB entry to be unmapped and + * downgraded to 4k entries. This happens for both 2MiB huge zero pages as + * well as 2MiB empty entries. + * + * The exception to this downgrade path is for 2MiB DAX PMD entries that have + * real storage backing them. We will leave these real 2MiB DAX entries in + * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry. + * * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For * persistent memory the benefit is doubtful. We can add that later if we can * show it helps. */ -static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index) +static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index, + unsigned long size_flag) { - void *ret, **slot; + bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */ + void *entry, **slot; restart: spin_lock_irq(&mapping->tree_lock); - ret = get_unlocked_mapping_entry(mapping, index, &slot); + entry = get_unlocked_mapping_entry(mapping, index, &slot); + + if (entry) { + if (size_flag & RADIX_DAX_PMD) { + if (!radix_tree_exceptional_entry(entry) || + dax_is_pte_entry(entry)) { + put_unlocked_mapping_entry(mapping, index, + entry); + entry = ERR_PTR(-EEXIST); + goto out_unlock; + } + } else { /* trying to grab a PTE entry */ + if (radix_tree_exceptional_entry(entry) && + dax_is_pmd_entry(entry) && + (dax_is_zero_entry(entry) || + dax_is_empty_entry(entry))) { + pmd_downgrade = true; + } + } + } + /* No entry for given index? Make sure radix tree is big enough. */ - if (!ret) { + if (!entry || pmd_downgrade) { int err; + if (pmd_downgrade) { + /* + * Make sure 'entry' remains valid while we drop + * mapping->tree_lock. + */ + entry = lock_slot(mapping, slot); + } + spin_unlock_irq(&mapping->tree_lock); + /* + * Besides huge zero pages the only other thing that gets + * downgraded are empty entries which don't need to be + * unmapped. 
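
The policy spelled out in the comment above grab_mapping_entry() (4k entries win, zero and empty PMDs get split, real PMD blocks stay put) reduces to a short decision ladder. A sketch with illustrative names, ignoring plain page-cache pages for brevity:

    #include <assert.h>
    #include <stdbool.h>

    enum grab_action {
        GRAB_CREATE,        /* no entry yet: insert an empty locked entry */
        GRAB_LOCK_EXISTING, /* reuse what is already in the tree */
        GRAB_ERR_EEXIST,    /* PMD grab blocked by a 4k entry in range */
        GRAB_DOWNGRADE,     /* split a zero/empty PMD into 4k entries */
    };

    static enum grab_action classify(bool have_entry, bool entry_is_pmd,
                                     bool entry_has_storage, bool want_pmd)
    {
        if (!have_entry)
            return GRAB_CREATE;
        if (want_pmd)
            return entry_is_pmd ? GRAB_LOCK_EXISTING : GRAB_ERR_EEXIST;
        if (entry_is_pmd && !entry_has_storage)
            return GRAB_DOWNGRADE;
        return GRAB_LOCK_EXISTING;
    }

    int main(void)
    {
        /* a PTE fault over a huge zero page forces the downgrade path */
        assert(classify(true, true, false, false) == GRAB_DOWNGRADE);
        /* a PMD fault over an existing 4k entry fails with -EEXIST */
        assert(classify(true, false, true, true) == GRAB_ERR_EEXIST);
        return 0;
    }
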
+ */ + if (pmd_downgrade && dax_is_zero_entry(entry)) + unmap_mapping_range(mapping, + (index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); + err = radix_tree_preload( mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); - if (err) + if (err) { + if (pmd_downgrade) + put_locked_mapping_entry(mapping, index, entry); return ERR_PTR(err); - ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | - RADIX_DAX_ENTRY_LOCK); + } spin_lock_irq(&mapping->tree_lock); - err = radix_tree_insert(&mapping->page_tree, index, ret); + + if (pmd_downgrade) { + radix_tree_delete(&mapping->page_tree, index); + mapping->nrexceptional--; + dax_wake_mapping_entry_waiter(mapping, index, entry, + true); + } + + entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY); + + err = __radix_tree_insert(&mapping->page_tree, index, + dax_radix_order(entry), entry); radix_tree_preload_end(); if (err) { spin_unlock_irq(&mapping->tree_lock); - /* Someone already created the entry? */ - if (err == -EEXIST) + /* + * Someone already created the entry? This is a + * normal failure when inserting PMDs in a range + * that already contains PTEs. In that case we want + * to return -EEXIST immediately. + */ + if (err == -EEXIST && !(size_flag & RADIX_DAX_PMD)) goto restart; + /* + * Our insertion of a DAX PMD entry failed, most + * likely because it collided with a PTE sized entry + * at a different index in the PMD range. We haven't + * inserted anything into the radix tree and have no + * waiters to wake. + */ return ERR_PTR(err); } /* Good, we have inserted empty locked entry into the tree. */ mapping->nrexceptional++; spin_unlock_irq(&mapping->tree_lock); - return ret; + return entry; } /* Normal page in radix tree? */ - if (!radix_tree_exceptional_entry(ret)) { - struct page *page = ret; + if (!radix_tree_exceptional_entry(entry)) { + struct page *page = entry; get_page(page); spin_unlock_irq(&mapping->tree_lock); @@ -457,15 +420,26 @@ restart: } return page; } - ret = lock_slot(mapping, slot); + entry = lock_slot(mapping, slot); + out_unlock: spin_unlock_irq(&mapping->tree_lock); - return ret; + return entry; } +/* + * We do not necessarily hold the mapping->tree_lock when we call this + * function so it is possible that 'entry' is no longer a valid item in the + * radix tree. This is okay because all we really need to do is to find the + * correct waitqueue where tasks might be waiting for that old 'entry' and + * wake them. + */ void dax_wake_mapping_entry_waiter(struct address_space *mapping, - pgoff_t index, bool wake_all) + pgoff_t index, void *entry, bool wake_all) { - wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index); + struct exceptional_entry_key key; + wait_queue_head_t *wq; + + wq = dax_entry_waitqueue(mapping, index, entry, &key); /* * Checking for locked entry and prepare_to_wait_exclusive() happens @@ -473,66 +447,41 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping, * So at this point all tasks that could have seen our entry locked * must be in the waitqueue and the following check will see them. */ - if (waitqueue_active(wq)) { - struct exceptional_entry_key key; - - key.mapping = mapping; - key.index = index; + if (waitqueue_active(wq)) __wake_up(wq, TASK_NORMAL, wake_all ? 
0 : 1, &key); - } } -void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index) +static int __dax_invalidate_mapping_entry(struct address_space *mapping, + pgoff_t index, bool trunc) { - void *ret, **slot; + int ret = 0; + void *entry; + struct radix_tree_root *page_tree = &mapping->page_tree; spin_lock_irq(&mapping->tree_lock); - ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); - if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) || - !slot_locked(mapping, slot))) { - spin_unlock_irq(&mapping->tree_lock); - return; - } - unlock_slot(mapping, slot); + entry = get_unlocked_mapping_entry(mapping, index, NULL); + if (!entry || !radix_tree_exceptional_entry(entry)) + goto out; + if (!trunc && + (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || + radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))) + goto out; + radix_tree_delete(page_tree, index); + mapping->nrexceptional--; + ret = 1; +out: + put_unlocked_mapping_entry(mapping, index, entry); spin_unlock_irq(&mapping->tree_lock); - dax_wake_mapping_entry_waiter(mapping, index, false); -} - -static void put_locked_mapping_entry(struct address_space *mapping, - pgoff_t index, void *entry) -{ - if (!radix_tree_exceptional_entry(entry)) { - unlock_page(entry); - put_page(entry); - } else { - dax_unlock_mapping_entry(mapping, index); - } -} - -/* - * Called when we are done with radix tree entry we looked up via - * get_unlocked_mapping_entry() and which we didn't lock in the end. - */ -static void put_unlocked_mapping_entry(struct address_space *mapping, - pgoff_t index, void *entry) -{ - if (!radix_tree_exceptional_entry(entry)) - return; - - /* We have to wake up next waiter for the radix tree entry lock */ - dax_wake_mapping_entry_waiter(mapping, index, false); + return ret; } - /* * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree * entry to get unlocked before deleting it. */ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) { - void *entry; + int ret = __dax_invalidate_mapping_entry(mapping, index, true); - spin_lock_irq(&mapping->tree_lock); - entry = get_unlocked_mapping_entry(mapping, index, NULL); /* * This gets called from truncate / punch_hole path. As such, the caller * must hold locks protecting against concurrent modifications of the @@ -540,16 +489,46 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) * caller has seen exceptional entry for this index, we better find it * at that index as well... */ - if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) { - spin_unlock_irq(&mapping->tree_lock); - return 0; - } - radix_tree_delete(&mapping->page_tree, index); + WARN_ON_ONCE(!ret); + return ret; +} + +/* + * Invalidate exceptional DAX entry if easily possible. This handles DAX + * entries for invalidate_inode_pages() so we evict the entry only if we can + * do so without blocking. 
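
__dax_invalidate_mapping_entry() and the non-blocking dax_invalidate_mapping_entry() below differ only in which guards apply: truncation removes the entry unconditionally, the sync variant spares dirty and towrite entries, and the non-blocking variant additionally refuses locked entries rather than wait. A condensed model of those guards; the struct and field names are illustrative:

    #include <assert.h>
    #include <stdbool.h>

    struct dax_slot { bool locked, dirty, towrite; };

    static bool can_remove(const struct dax_slot *s, bool truncating,
                           bool may_block)
    {
        if (!may_block && s->locked)
            return false;   /* invalidate_inode_pages() must not wait */
        if (!truncating && (s->dirty || s->towrite))
            return false;   /* fsync still needs this entry */
        return true;        /* truncate / punch hole always wins */
    }

    int main(void)
    {
        struct dax_slot dirty = { .dirty = true };

        assert(can_remove(&dirty, true, true));   /* truncate path */
        assert(!can_remove(&dirty, false, true)); /* sync invalidate skips */
        return 0;
    }
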
+ */ +int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index) +{ + int ret = 0; + void *entry, **slot; + struct radix_tree_root *page_tree = &mapping->page_tree; + + spin_lock_irq(&mapping->tree_lock); + entry = __radix_tree_lookup(page_tree, index, NULL, &slot); + if (!entry || !radix_tree_exceptional_entry(entry) || + slot_locked(mapping, slot)) + goto out; + if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || + radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) + goto out; + radix_tree_delete(page_tree, index); mapping->nrexceptional--; + ret = 1; +out: spin_unlock_irq(&mapping->tree_lock); - dax_wake_mapping_entry_waiter(mapping, index, true); + if (ret) + dax_wake_mapping_entry_waiter(mapping, index, entry, true); + return ret; +} - return 1; +/* + * Invalidate exceptional DAX entry if it is clean. + */ +int dax_invalidate_mapping_entry_sync(struct address_space *mapping, + pgoff_t index) +{ + return __dax_invalidate_mapping_entry(mapping, index, false); } /* @@ -560,26 +539,34 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) * otherwise it will simply fall out of the page cache under memory * pressure without ever having been dirtied. */ -static int dax_load_hole(struct address_space *mapping, void *entry, +static int dax_load_hole(struct address_space *mapping, void **entry, struct vm_fault *vmf) { struct page *page; + int ret; /* Hole page already exists? Return it... */ - if (!radix_tree_exceptional_entry(entry)) { - vmf->page = entry; - return VM_FAULT_LOCKED; + if (!radix_tree_exceptional_entry(*entry)) { + page = *entry; + goto out; } /* This will replace locked radix tree entry with a hole page */ page = find_or_create_page(mapping, vmf->pgoff, vmf->gfp_mask | __GFP_ZERO); - if (!page) { - put_locked_mapping_entry(mapping, vmf->pgoff, entry); + if (!page) return VM_FAULT_OOM; - } + out: vmf->page = page; - return VM_FAULT_LOCKED; + ret = finish_fault(vmf); + vmf->page = NULL; + *entry = page; + if (!ret) { + /* Grab reference for PTE that is now referencing the page */ + get_page(page); + return VM_FAULT_NOPAGE; + } + return ret; } static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size, @@ -600,11 +587,17 @@ static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size return 0; } -#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT)) - +/* + * By this point grab_mapping_entry() has ensured that we have a locked entry + * of the appropriate size so we don't have to worry about downgrading PMDs to + * PTEs. If we happen to be trying to insert a PTE and there is a PMD + * already in the tree, we will skip the insertion and just dirty the PMD as + * appropriate. 
+ */ static void *dax_insert_mapping_entry(struct address_space *mapping, struct vm_fault *vmf, - void *entry, sector_t sector) + void *entry, sector_t sector, + unsigned long flags) { struct radix_tree_root *page_tree = &mapping->page_tree; int error = 0; @@ -627,28 +620,43 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM); if (error) return ERR_PTR(error); + } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) { + /* replacing huge zero page with PMD block mapping */ + unmap_mapping_range(mapping, + (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); } spin_lock_irq(&mapping->tree_lock); - new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) | - RADIX_DAX_ENTRY_LOCK); + new_entry = dax_radix_locked_entry(sector, flags); + if (hole_fill) { __delete_from_page_cache(entry, NULL); /* Drop pagecache reference */ put_page(entry); - error = radix_tree_insert(page_tree, index, new_entry); + error = __radix_tree_insert(page_tree, index, + dax_radix_order(new_entry), new_entry); if (error) { new_entry = ERR_PTR(error); goto unlock; } mapping->nrexceptional++; - } else { + } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { + /* + * Only swap our new entry into the radix tree if the current + * entry is a zero page or an empty entry. If a normal PTE or + * PMD entry is already in the tree, we leave it alone. This + * means that if we are trying to insert a PTE and the + * existing entry is a PMD, we will just leave the PMD in the + * tree and dirty it if necessary. + */ + struct radix_tree_node *node; void **slot; void *ret; - ret = __radix_tree_lookup(page_tree, index, NULL, &slot); + ret = __radix_tree_lookup(page_tree, index, &node, &slot); WARN_ON_ONCE(ret != entry); - radix_tree_replace_slot(slot, new_entry); + __radix_tree_replace(page_tree, node, slot, + new_entry, NULL, NULL); } if (vmf->flags & FAULT_FLAG_WRITE) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); @@ -668,63 +676,171 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, return new_entry; } +static inline unsigned long +pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) +{ + unsigned long address; + + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); + return address; +} + +/* Walk all mappings of a given index of a file and writeprotect them */ +static void dax_mapping_entry_mkclean(struct address_space *mapping, + pgoff_t index, unsigned long pfn) +{ + struct vm_area_struct *vma; + pte_t pte, *ptep = NULL; + pmd_t *pmdp = NULL; + spinlock_t *ptl; + bool changed; + + i_mmap_lock_read(mapping); + vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { + unsigned long address; + + cond_resched(); + + if (!(vma->vm_flags & VM_SHARED)) + continue; + + address = pgoff_address(index, vma); + changed = false; + if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl)) + continue; + + if (pmdp) { +#ifdef CONFIG_FS_DAX_PMD + pmd_t pmd; + + if (pfn != pmd_pfn(*pmdp)) + goto unlock_pmd; + if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp)) + goto unlock_pmd; + + flush_cache_page(vma, address, pfn); + pmd = pmdp_huge_clear_flush(vma, address, pmdp); + pmd = pmd_wrprotect(pmd); + pmd = pmd_mkclean(pmd); + set_pmd_at(vma->vm_mm, address, pmdp, pmd); + changed = true; +unlock_pmd: + spin_unlock(ptl); +#endif + } else { + if (pfn != pte_pfn(*ptep)) + goto unlock_pte; + if (!pte_dirty(*ptep) && 
!pte_write(*ptep)) + goto unlock_pte; + + flush_cache_page(vma, address, pfn); + pte = ptep_clear_flush(vma, address, ptep); + pte = pte_wrprotect(pte); + pte = pte_mkclean(pte); + set_pte_at(vma->vm_mm, address, ptep, pte); + changed = true; +unlock_pte: + pte_unmap_unlock(ptep, ptl); + } + + if (changed) + mmu_notifier_invalidate_page(vma->vm_mm, address); + } + i_mmap_unlock_read(mapping); +} + static int dax_writeback_one(struct block_device *bdev, struct address_space *mapping, pgoff_t index, void *entry) { struct radix_tree_root *page_tree = &mapping->page_tree; - int type = RADIX_DAX_TYPE(entry); - struct radix_tree_node *node; struct blk_dax_ctl dax; - void **slot; + void *entry2, **slot; int ret = 0; - spin_lock_irq(&mapping->tree_lock); /* - * Regular page slots are stabilized by the page lock even - * without the tree itself locked. These unlocked entries - * need verification under the tree lock. + * A page got tagged dirty in DAX mapping? Something is seriously + * wrong. */ - if (!__radix_tree_lookup(page_tree, index, &node, &slot)) - goto unlock; - if (*slot != entry) - goto unlock; - - /* another fsync thread may have already written back this entry */ - if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) - goto unlock; + if (WARN_ON(!radix_tree_exceptional_entry(entry))) + return -EIO; - if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) { + spin_lock_irq(&mapping->tree_lock); + entry2 = get_unlocked_mapping_entry(mapping, index, &slot); + /* Entry got punched out / reallocated? */ + if (!entry2 || !radix_tree_exceptional_entry(entry2)) + goto put_unlocked; + /* + * Entry got reallocated elsewhere? No need to writeback. We have to + * compare sectors as we must not bail out due to difference in lockbit + * or entry type. + */ + if (dax_radix_sector(entry2) != dax_radix_sector(entry)) + goto put_unlocked; + if (WARN_ON_ONCE(dax_is_empty_entry(entry) || + dax_is_zero_entry(entry))) { ret = -EIO; - goto unlock; + goto put_unlocked; } - dax.sector = RADIX_DAX_SECTOR(entry); - dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE); + /* Another fsync thread may have already written back this entry */ + if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) + goto put_unlocked; + /* Lock the entry to serialize with page faults */ + entry = lock_slot(mapping, slot); + /* + * We can clear the tag now but we have to be careful so that concurrent + * dax_writeback_one() calls for the same index cannot finish before we + * actually flush the caches. This is achieved as the calls will look + * at the entry only under tree_lock and once they do that they will + * see the entry locked and wait for it to unlock. + */ + radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); spin_unlock_irq(&mapping->tree_lock); /* + * Even if dax_writeback_mapping_range() was given a wbc->range_start + * in the middle of a PMD, the 'index' we are given will be aligned to + * the start index of the PMD, as will the sector we pull from + * 'entry'. This allows us to flush for PMD_SIZE and not have to + * worry about partial PMD writebacks. + */ + dax.sector = dax_radix_sector(entry); + dax.size = PAGE_SIZE << dax_radix_order(entry); + + /* * We cannot hold tree_lock while calling dax_map_atomic() because it * eventually calls cond_resched(). 
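
The tag handling in dax_writeback_one() follows a strict order: with tree_lock held, lock the entry and clear TOWRITE; drop the lock, write-protect the mappings and flush; then retake the lock and clear DIRTY only after the flush is done. A toy of just that ordering, with a pthread mutex standing in for tree_lock; all names here are stand-ins:

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

    struct slot { bool locked, dirty, towrite; };

    static void writeback_one(struct slot *s)
    {
        pthread_mutex_lock(&tree_lock);
        if (!s->towrite) {          /* another thread already wrote it */
            pthread_mutex_unlock(&tree_lock);
            return;
        }
        s->locked = true;           /* serialize with page faults */
        s->towrite = false;         /* safe: the entry lock blocks racers */
        pthread_mutex_unlock(&tree_lock);

        /* write-protect user mappings, then flush the pmem range here */

        pthread_mutex_lock(&tree_lock);
        s->dirty = false;           /* only after the flush completed */
        s->locked = false;
        pthread_mutex_unlock(&tree_lock);
    }

    int main(void)
    {
        struct slot s = { .dirty = true, .towrite = true };

        writeback_one(&s);
        printf("dirty=%d towrite=%d\n", s.dirty, s.towrite);
        return 0;
    }
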
*/ ret = dax_map_atomic(bdev, &dax); - if (ret < 0) + if (ret < 0) { + put_locked_mapping_entry(mapping, index, entry); return ret; + } if (WARN_ON_ONCE(ret < dax.size)) { ret = -EIO; goto unmap; } + dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn)); wb_cache_pmem(dax.addr, dax.size); - + /* + * After we have flushed the cache, we can clear the dirty tag. There + * cannot be new dirty data in the pfn after the flush has completed as + * the pfn mappings are writeprotected and fault waits for mapping + * entry lock. + */ spin_lock_irq(&mapping->tree_lock); - radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); + radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); spin_unlock_irq(&mapping->tree_lock); unmap: dax_unmap_atomic(bdev, &dax); + put_locked_mapping_entry(mapping, index, entry); return ret; - unlock: + put_unlocked: + put_unlocked_mapping_entry(mapping, index, entry2); spin_unlock_irq(&mapping->tree_lock); return ret; } @@ -738,12 +854,11 @@ int dax_writeback_mapping_range(struct address_space *mapping, struct block_device *bdev, struct writeback_control *wbc) { struct inode *inode = mapping->host; - pgoff_t start_index, end_index, pmd_index; + pgoff_t start_index, end_index; pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; bool done = false; int i, ret = 0; - void *entry; if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) return -EIO; @@ -753,15 +868,6 @@ int dax_writeback_mapping_range(struct address_space *mapping, start_index = wbc->range_start >> PAGE_SHIFT; end_index = wbc->range_end >> PAGE_SHIFT; - pmd_index = DAX_PMD_INDEX(start_index); - - rcu_read_lock(); - entry = radix_tree_lookup(&mapping->page_tree, pmd_index); - rcu_read_unlock(); - - /* see if the start of our range is covered by a PMD entry */ - if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) - start_index = pmd_index; tag_pages_for_writeback(mapping, start_index, end_index); @@ -794,7 +900,7 @@ static int dax_insert_mapping(struct address_space *mapping, struct block_device *bdev, sector_t sector, size_t size, void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) { - unsigned long vaddr = (unsigned long)vmf->virtual_address; + unsigned long vaddr = vmf->address; struct blk_dax_ctl dax = { .sector = sector, .size = size, @@ -806,7 +912,7 @@ static int dax_insert_mapping(struct address_space *mapping, return PTR_ERR(dax.addr); dax_unmap_atomic(bdev, &dax); - ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector); + ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0); if (IS_ERR(ret)) return PTR_ERR(ret); *entryp = ret; @@ -815,323 +921,6 @@ static int dax_insert_mapping(struct address_space *mapping, } /** - * dax_fault - handle a page fault on a DAX file - * @vma: The virtual memory area where the fault occurred - * @vmf: The description of the fault - * @get_block: The filesystem method used to translate file offsets to blocks - * - * When a page fault occurs, filesystems may call this helper in their - * fault handler for DAX files. dax_fault() assumes the caller has done all - * the necessary locking for the page fault to proceed successfully. 
- */ -int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block) -{ - struct file *file = vma->vm_file; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - void *entry; - struct buffer_head bh; - unsigned long vaddr = (unsigned long)vmf->virtual_address; - unsigned blkbits = inode->i_blkbits; - sector_t block; - pgoff_t size; - int error; - int major = 0; - - /* - * Check whether offset isn't beyond end of file now. Caller is supposed - * to hold locks serializing us with truncate / punch hole so this is - * a reliable test. - */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (vmf->pgoff >= size) - return VM_FAULT_SIGBUS; - - memset(&bh, 0, sizeof(bh)); - block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); - bh.b_bdev = inode->i_sb->s_bdev; - bh.b_size = PAGE_SIZE; - - entry = grab_mapping_entry(mapping, vmf->pgoff); - if (IS_ERR(entry)) { - error = PTR_ERR(entry); - goto out; - } - - error = get_block(inode, block, &bh, 0); - if (!error && (bh.b_size < PAGE_SIZE)) - error = -EIO; /* fs corruption? */ - if (error) - goto unlock_entry; - - if (vmf->cow_page) { - struct page *new_page = vmf->cow_page; - if (buffer_written(&bh)) - error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode), - bh.b_size, new_page, vaddr); - else - clear_user_highpage(new_page, vaddr); - if (error) - goto unlock_entry; - if (!radix_tree_exceptional_entry(entry)) { - vmf->page = entry; - return VM_FAULT_LOCKED; - } - vmf->entry = entry; - return VM_FAULT_DAX_LOCKED; - } - - if (!buffer_mapped(&bh)) { - if (vmf->flags & FAULT_FLAG_WRITE) { - error = get_block(inode, block, &bh, 1); - count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); - major = VM_FAULT_MAJOR; - if (!error && (bh.b_size < PAGE_SIZE)) - error = -EIO; - if (error) - goto unlock_entry; - } else { - return dax_load_hole(mapping, entry, vmf); - } - } - - /* Filesystem should not return unwritten buffers to us! */ - WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); - error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode), - bh.b_size, &entry, vma, vmf); - unlock_entry: - put_locked_mapping_entry(mapping, vmf->pgoff, entry); - out: - if (error == -ENOMEM) - return VM_FAULT_OOM | major; - /* -EBUSY is fine, somebody else faulted on the same PTE */ - if ((error < 0) && (error != -EBUSY)) - return VM_FAULT_SIGBUS | major; - return VM_FAULT_NOPAGE | major; -} -EXPORT_SYMBOL_GPL(dax_fault); - -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) -/* - * The 'colour' (ie low bits) within a PMD of a page offset. This comes up - * more often than one might expect in the below function. 
- */ -#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) - -static void __dax_dbg(struct buffer_head *bh, unsigned long address, - const char *reason, const char *fn) -{ - if (bh) { - char bname[BDEVNAME_SIZE]; - bdevname(bh->b_bdev, bname); - pr_debug("%s: %s addr: %lx dev %s state %lx start %lld " - "length %zd fallback: %s\n", fn, current->comm, - address, bname, bh->b_state, (u64)bh->b_blocknr, - bh->b_size, reason); - } else { - pr_debug("%s: %s addr: %lx fallback: %s\n", fn, - current->comm, address, reason); - } -} - -#define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd") - -/** - * dax_pmd_fault - handle a PMD fault on a DAX file - * @vma: The virtual memory area where the fault occurred - * @vmf: The description of the fault - * @get_block: The filesystem method used to translate file offsets to blocks - * - * When a page fault occurs, filesystems may call this helper in their - * pmd_fault handler for DAX files. - */ -int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, get_block_t get_block) -{ - struct file *file = vma->vm_file; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - struct buffer_head bh; - unsigned blkbits = inode->i_blkbits; - unsigned long pmd_addr = address & PMD_MASK; - bool write = flags & FAULT_FLAG_WRITE; - struct block_device *bdev; - pgoff_t size, pgoff; - sector_t block; - int result = 0; - bool alloc = false; - - /* dax pmd mappings require pfn_t_devmap() */ - if (!IS_ENABLED(CONFIG_FS_DAX_PMD)) - return VM_FAULT_FALLBACK; - - /* Fall back to PTEs if we're going to COW */ - if (write && !(vma->vm_flags & VM_SHARED)) { - split_huge_pmd(vma, pmd, address); - dax_pmd_dbg(NULL, address, "cow write"); - return VM_FAULT_FALLBACK; - } - /* If the PMD would extend outside the VMA */ - if (pmd_addr < vma->vm_start) { - dax_pmd_dbg(NULL, address, "vma start unaligned"); - return VM_FAULT_FALLBACK; - } - if ((pmd_addr + PMD_SIZE) > vma->vm_end) { - dax_pmd_dbg(NULL, address, "vma end unaligned"); - return VM_FAULT_FALLBACK; - } - - pgoff = linear_page_index(vma, pmd_addr); - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (pgoff >= size) - return VM_FAULT_SIGBUS; - /* If the PMD would cover blocks out of the file */ - if ((pgoff | PG_PMD_COLOUR) >= size) { - dax_pmd_dbg(NULL, address, - "offset + huge page size > file size"); - return VM_FAULT_FALLBACK; - } - - memset(&bh, 0, sizeof(bh)); - bh.b_bdev = inode->i_sb->s_bdev; - block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); - - bh.b_size = PMD_SIZE; - - if (get_block(inode, block, &bh, 0) != 0) - return VM_FAULT_SIGBUS; - - if (!buffer_mapped(&bh) && write) { - if (get_block(inode, block, &bh, 1) != 0) - return VM_FAULT_SIGBUS; - alloc = true; - WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); - } - - bdev = bh.b_bdev; - - /* - * If the filesystem isn't willing to tell us the length of a hole, - * just fall back to PTEs. Calling get_block 512 times in a loop - * would be silly. 
- */ - if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) { - dax_pmd_dbg(&bh, address, "allocated block too small"); - return VM_FAULT_FALLBACK; - } - - /* - * If we allocated new storage, make sure no process has any - * zero pages covering this hole - */ - if (alloc) { - loff_t lstart = pgoff << PAGE_SHIFT; - loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */ - - truncate_pagecache_range(inode, lstart, lend); - } - - if (!write && !buffer_mapped(&bh)) { - spinlock_t *ptl; - pmd_t entry; - struct page *zero_page = mm_get_huge_zero_page(vma->vm_mm); - - if (unlikely(!zero_page)) { - dax_pmd_dbg(&bh, address, "no zero page"); - goto fallback; - } - - ptl = pmd_lock(vma->vm_mm, pmd); - if (!pmd_none(*pmd)) { - spin_unlock(ptl); - dax_pmd_dbg(&bh, address, "pmd already present"); - goto fallback; - } - - dev_dbg(part_to_dev(bdev->bd_part), - "%s: %s addr: %lx pfn: <zero> sect: %llx\n", - __func__, current->comm, address, - (unsigned long long) to_sector(&bh, inode)); - - entry = mk_pmd(zero_page, vma->vm_page_prot); - entry = pmd_mkhuge(entry); - set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry); - result = VM_FAULT_NOPAGE; - spin_unlock(ptl); - } else { - struct blk_dax_ctl dax = { - .sector = to_sector(&bh, inode), - .size = PMD_SIZE, - }; - long length = dax_map_atomic(bdev, &dax); - - if (length < 0) { - dax_pmd_dbg(&bh, address, "dax-error fallback"); - goto fallback; - } - if (length < PMD_SIZE) { - dax_pmd_dbg(&bh, address, "dax-length too small"); - dax_unmap_atomic(bdev, &dax); - goto fallback; - } - if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) { - dax_pmd_dbg(&bh, address, "pfn unaligned"); - dax_unmap_atomic(bdev, &dax); - goto fallback; - } - - if (!pfn_t_devmap(dax.pfn)) { - dax_unmap_atomic(bdev, &dax); - dax_pmd_dbg(&bh, address, "pfn not in memmap"); - goto fallback; - } - dax_unmap_atomic(bdev, &dax); - - /* - * For PTE faults we insert a radix tree entry for reads, and - * leave it clean. Then on the first write we dirty the radix - * tree entry via the dax_pfn_mkwrite() path. This sequence - * allows the dax_pfn_mkwrite() call to be simpler and avoid a - * call into get_block() to translate the pgoff to a sector in - * order to be able to create a new radix tree entry. - * - * The PMD path doesn't have an equivalent to - * dax_pfn_mkwrite(), though, so for a read followed by a - * write we traverse all the way through dax_pmd_fault() - * twice. This means we can just skip inserting a radix tree - * entry completely on the initial read and just wait until - * the write to insert a dirty entry. - */ - if (write) { - /* - * We should insert radix-tree entry and dirty it here. - * For now this is broken... 
- */ - } - - dev_dbg(part_to_dev(bdev->bd_part), - "%s: %s addr: %lx pfn: %lx sect: %llx\n", - __func__, current->comm, address, - pfn_t_to_pfn(dax.pfn), - (unsigned long long) dax.sector); - result |= vmf_insert_pfn_pmd(vma, address, pmd, - dax.pfn, write); - } - - out: - return result; - - fallback: - count_vm_event(THP_FAULT_FALLBACK); - result = VM_FAULT_FALLBACK; - goto out; -} -EXPORT_SYMBOL_GPL(dax_pmd_fault); -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - -/** * dax_pfn_mkwrite - handle first write to DAX page * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault @@ -1140,17 +929,27 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; - void *entry; + void *entry, **slot; pgoff_t index = vmf->pgoff; spin_lock_irq(&mapping->tree_lock); - entry = get_unlocked_mapping_entry(mapping, index, NULL); - if (!entry || !radix_tree_exceptional_entry(entry)) - goto out; + entry = get_unlocked_mapping_entry(mapping, index, &slot); + if (!entry || !radix_tree_exceptional_entry(entry)) { + if (entry) + put_unlocked_mapping_entry(mapping, index, entry); + spin_unlock_irq(&mapping->tree_lock); + return VM_FAULT_NOPAGE; + } radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); - put_unlocked_mapping_entry(mapping, index, entry); -out: + entry = lock_slot(mapping, slot); spin_unlock_irq(&mapping->tree_lock); + /* + * If we race with somebody updating the PTE and finish_mkwrite_fault() + * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry + * the fault in either case. + */ + finish_mkwrite_fault(vmf); + put_locked_mapping_entry(mapping, index, entry); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); @@ -1191,62 +990,13 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(__dax_zero_page_range); -/** - * dax_zero_page_range - zero a range within a page of a DAX file - * @inode: The file being truncated - * @from: The file offset that is being truncated to - * @length: The number of bytes to zero - * @get_block: The filesystem method used to translate file offsets to blocks - * - * This function can be called by a filesystem when it is zeroing part of a - * page in a DAX file. This is intended for hole-punch operations. If - * you are truncating a file, the helper function dax_truncate_page() may be - * more convenient. - */ -int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, - get_block_t get_block) +static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) { - struct buffer_head bh; - pgoff_t index = from >> PAGE_SHIFT; - unsigned offset = from & (PAGE_SIZE-1); - int err; - - /* Block boundary? 
Nothing to do */ - if (!length) - return 0; - BUG_ON((offset + length) > PAGE_SIZE); - - memset(&bh, 0, sizeof(bh)); - bh.b_bdev = inode->i_sb->s_bdev; - bh.b_size = PAGE_SIZE; - err = get_block(inode, index, &bh, 0); - if (err < 0 || !buffer_written(&bh)) - return err; - - return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode), - offset, length); + return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9); } -EXPORT_SYMBOL_GPL(dax_zero_page_range); -/** - * dax_truncate_page - handle a partial page being truncated in a DAX file - * @inode: The file being truncated - * @from: The file offset that is being truncated to - * @get_block: The filesystem method used to translate file offsets to blocks - * - * Similar to block_truncate_page(), this function can be called by a - * filesystem when it is truncating a DAX file to handle the partial page. - */ -int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) -{ - unsigned length = PAGE_ALIGN(from) - from; - return dax_zero_page_range(inode, from, length, get_block); -} -EXPORT_SYMBOL_GPL(dax_truncate_page); - -#ifdef CONFIG_FS_IOMAP static loff_t -iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data, +dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct iomap *iomap) { struct iov_iter *iter = data; @@ -1265,13 +1015,28 @@ iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED)) return -EIO; + /* + * Write can allocate block for an area which has a hole page mapped + * into page tables. We have to tear down these mappings so that data + * written by write(2) is visible in mmap. + */ + if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) { + invalidate_inode_pages2_range(inode->i_mapping, + pos >> PAGE_SHIFT, + (end - 1) >> PAGE_SHIFT); + } + while (pos < end) { unsigned offset = pos & (PAGE_SIZE - 1); struct blk_dax_ctl dax = { 0 }; ssize_t map_len; - dax.sector = iomap->blkno + - (((pos & PAGE_MASK) - iomap->offset) >> 9); + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } + + dax.sector = dax_iomap_sector(iomap, pos); dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK; map_len = dax_map_atomic(iomap->bdev, &dax); if (map_len < 0) { @@ -1303,7 +1068,7 @@ iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data, } /** - * iomap_dax_rw - Perform I/O to a DAX file + * dax_iomap_rw - Perform I/O to a DAX file * @iocb: The control block for this I/O * @iter: The addresses to do I/O from or to * @ops: iomap ops passed from the file system @@ -1313,7 +1078,7 @@ iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data, * and evicting any page cache pages in the region under I/O. */ ssize_t -iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter, +dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops) { struct address_space *mapping = iocb->ki_filp->f_mapping; @@ -1324,26 +1089,9 @@ iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter, if (iov_iter_rw(iter) == WRITE) flags |= IOMAP_WRITE; - /* - * Yes, even DAX files can have page cache attached to them: A zeroed - * page is inserted into the pagecache when we have to serve a write - * fault on a hole. It should never be dirtied and can simply be - * dropped from the pagecache once we get real data for the page. - * - * XXX: This is racy against mmap, and there's nothing we can do about - * it. 
We'll eventually need to shift this down even further so that - * we can check if we allocated blocks over a hole first. - */ - if (mapping->nrpages) { - ret = invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, - (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT); - WARN_ON_ONCE(ret); - } - while (iov_iter_count(iter)) { ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops, - iter, iomap_dax_actor); + iter, dax_iomap_actor); if (ret <= 0) break; pos += ret; @@ -1353,10 +1101,19 @@ iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter, iocb->ki_pos += done; return done ? done : ret; } -EXPORT_SYMBOL_GPL(iomap_dax_rw); +EXPORT_SYMBOL_GPL(dax_iomap_rw); + +static int dax_fault_return(int error) +{ + if (error == 0) + return VM_FAULT_NOPAGE; + if (error == -ENOMEM) + return VM_FAULT_OOM; + return VM_FAULT_SIGBUS; +} /** - * iomap_dax_fault - handle a page fault on a DAX file + * dax_iomap_fault - handle a page fault on a DAX file * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault * @ops: iomap ops passed from the file system @@ -1365,17 +1122,18 @@ EXPORT_SYMBOL_GPL(iomap_dax_rw); * or mkwrite handler for DAX files. Assumes the caller has done all the * necessary locking for the page fault to proceed successfully. */ -int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, +int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, struct iomap_ops *ops) { struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; - unsigned long vaddr = (unsigned long)vmf->virtual_address; + unsigned long vaddr = vmf->address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; sector_t sector; struct iomap iomap = { 0 }; - unsigned flags = 0; + unsigned flags = IOMAP_FAULT; int error, major = 0; + int vmf_ret = 0; void *entry; /* @@ -1386,12 +1144,6 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (pos >= i_size_read(inode)) return VM_FAULT_SIGBUS; - entry = grab_mapping_entry(mapping, vmf->pgoff); - if (IS_ERR(entry)) { - error = PTR_ERR(entry); - goto out; - } - if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) flags |= IOMAP_WRITE; @@ -1402,13 +1154,19 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, */ error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); if (error) - goto unlock_entry; + return dax_fault_return(error); if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { - error = -EIO; /* fs corruption? */ - goto unlock_entry; + vmf_ret = dax_fault_return(-EIO); /* fs corruption? 
*/ + goto finish_iomap; + } + + entry = grab_mapping_entry(mapping, vmf->pgoff, 0); + if (IS_ERR(entry)) { + vmf_ret = dax_fault_return(PTR_ERR(entry)); + goto finish_iomap; } - sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9); + sector = dax_iomap_sector(&iomap, pos); if (vmf->cow_page) { switch (iomap.type) { @@ -1427,13 +1185,13 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, } if (error) - goto unlock_entry; - if (!radix_tree_exceptional_entry(entry)) { - vmf->page = entry; - return VM_FAULT_LOCKED; - } - vmf->entry = entry; - return VM_FAULT_DAX_LOCKED; + goto error_unlock_entry; + + __SetPageUptodate(vmf->cow_page); + vmf_ret = finish_fault(vmf); + if (!vmf_ret) + vmf_ret = VM_FAULT_DONE_COW; + goto unlock_entry; } switch (iomap.type) { @@ -1445,11 +1203,16 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, } error = dax_insert_mapping(mapping, iomap.bdev, sector, PAGE_SIZE, &entry, vma, vmf); + /* -EBUSY is fine, somebody else faulted on the same PTE */ + if (error == -EBUSY) + error = 0; break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: - if (!(vmf->flags & FAULT_FLAG_WRITE)) - return dax_load_hole(mapping, entry, vmf); + if (!(vmf->flags & FAULT_FLAG_WRITE)) { + vmf_ret = dax_load_hole(mapping, &entry, vmf); + goto unlock_entry; + } /*FALLTHRU*/ default: WARN_ON_ONCE(1); @@ -1457,15 +1220,215 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, break; } + error_unlock_entry: + vmf_ret = dax_fault_return(error) | major; unlock_entry: put_locked_mapping_entry(mapping, vmf->pgoff, entry); - out: - if (error == -ENOMEM) - return VM_FAULT_OOM | major; - /* -EBUSY is fine, somebody else faulted on the same PTE */ - if (error < 0 && error != -EBUSY) - return VM_FAULT_SIGBUS | major; - return VM_FAULT_NOPAGE | major; + finish_iomap: + if (ops->iomap_end) { + int copied = PAGE_SIZE; + + if (vmf_ret & VM_FAULT_ERROR) + copied = 0; + /* + * The fault is done by now and there's no way back (other + * thread may be already happily using PTE we have installed). + * Just ignore error from ->iomap_end since we cannot do much + * with it. + */ + ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); + } + return vmf_ret; +} +EXPORT_SYMBOL_GPL(dax_iomap_fault); + +#ifdef CONFIG_FS_DAX_PMD +/* + * The 'colour' (ie low bits) within a PMD of a page offset. This comes up + * more often than one might expect in the below functions. 
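
PG_PMD_COLOUR, defined just below, is the set of low bits a page offset can have within one PMD (511 with 4 KiB pages and 2 MiB PMDs). The PMD fault path uses it twice: to reject PMDs that would map blocks past EOF and to reject storage whose pfn is not 2 MiB aligned. A sketch of both tests, with the page and PMD sizes assumed:

    #include <stdio.h>

    #define PAGE_SHIFT    12
    #define PMD_SIZE      (1UL << 21)
    #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)  /* 0x1ff */

    static int pmd_fault_possible(unsigned long pgoff, unsigned long pfn,
                                  unsigned long max_pgoff)
    {
        if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
            return 0;   /* PMD would map blocks past EOF */
        if (pfn & PG_PMD_COLOUR)
            return 0;   /* backing storage not 2 MiB aligned */
        return 1;
    }

    int main(void)
    {
        /* a 4 MiB file: max_pgoff = 1023, so pgoff 512 still fits a PMD */
        printf("%d %d\n",
               pmd_fault_possible(512, 0x200, 1023),
               pmd_fault_possible(512, 0x201, 1023));
        return 0;
    }
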
+ */ +#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) + +static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd, + struct vm_fault *vmf, unsigned long address, + struct iomap *iomap, loff_t pos, bool write, void **entryp) +{ + struct address_space *mapping = vma->vm_file->f_mapping; + struct block_device *bdev = iomap->bdev; + struct blk_dax_ctl dax = { + .sector = dax_iomap_sector(iomap, pos), + .size = PMD_SIZE, + }; + long length = dax_map_atomic(bdev, &dax); + void *ret; + + if (length < 0) /* dax_map_atomic() failed */ + return VM_FAULT_FALLBACK; + if (length < PMD_SIZE) + goto unmap_fallback; + if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) + goto unmap_fallback; + if (!pfn_t_devmap(dax.pfn)) + goto unmap_fallback; + + dax_unmap_atomic(bdev, &dax); + + ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector, + RADIX_DAX_PMD); + if (IS_ERR(ret)) + return VM_FAULT_FALLBACK; + *entryp = ret; + + return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write); + + unmap_fallback: + dax_unmap_atomic(bdev, &dax); + return VM_FAULT_FALLBACK; +} + +static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd, + struct vm_fault *vmf, unsigned long address, + struct iomap *iomap, void **entryp) +{ + struct address_space *mapping = vma->vm_file->f_mapping; + unsigned long pmd_addr = address & PMD_MASK; + struct page *zero_page; + spinlock_t *ptl; + pmd_t pmd_entry; + void *ret; + + zero_page = mm_get_huge_zero_page(vma->vm_mm); + + if (unlikely(!zero_page)) + return VM_FAULT_FALLBACK; + + ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0, + RADIX_DAX_PMD | RADIX_DAX_HZP); + if (IS_ERR(ret)) + return VM_FAULT_FALLBACK; + *entryp = ret; + + ptl = pmd_lock(vma->vm_mm, pmd); + if (!pmd_none(*pmd)) { + spin_unlock(ptl); + return VM_FAULT_FALLBACK; + } + + pmd_entry = mk_pmd(zero_page, vma->vm_page_prot); + pmd_entry = pmd_mkhuge(pmd_entry); + set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry); + spin_unlock(ptl); + return VM_FAULT_NOPAGE; +} + +int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd, unsigned int flags, struct iomap_ops *ops) +{ + struct address_space *mapping = vma->vm_file->f_mapping; + unsigned long pmd_addr = address & PMD_MASK; + bool write = flags & FAULT_FLAG_WRITE; + unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; + struct inode *inode = mapping->host; + int result = VM_FAULT_FALLBACK; + struct iomap iomap = { 0 }; + pgoff_t max_pgoff, pgoff; + struct vm_fault vmf; + void *entry; + loff_t pos; + int error; + + /* Fall back to PTEs if we're going to COW */ + if (write && !(vma->vm_flags & VM_SHARED)) + goto fallback; + + /* If the PMD would extend outside the VMA */ + if (pmd_addr < vma->vm_start) + goto fallback; + if ((pmd_addr + PMD_SIZE) > vma->vm_end) + goto fallback; + + /* + * Check whether offset isn't beyond end of file now. Caller is + * supposed to hold locks serializing us with truncate / punch hole so + * this is a reliable test. + */ + pgoff = linear_page_index(vma, pmd_addr); + max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; + + if (pgoff > max_pgoff) + return VM_FAULT_SIGBUS; + + /* If the PMD would extend beyond the file size */ + if ((pgoff | PG_PMD_COLOUR) > max_pgoff) + goto fallback; + + /* + * Note that we don't use iomap_apply here. We aren't doing I/O, only + * setting up a mapping, so really we're using iomap_begin() as a way + * to look up our filesystem block. 
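
Once iomap_begin() has filled in the extent, both fault paths derive the device sector with the same arithmetic as dax_iomap_sector(): the first 512-byte block of the extent plus the 512-byte offset of the page-aligned position within it. A freestanding version of that math, with a minimal stand-in for the iomap fields it reads:

    #include <stdio.h>

    #define PAGE_MASK (~0xfffULL)   /* assuming 4 KiB pages */

    /* stand-in for the iomap fields dax_iomap_sector() uses */
    struct iomap_ext {
        unsigned long long offset;  /* file offset of extent, in bytes */
        unsigned long long blkno;   /* first 512-byte block of extent */
    };

    static unsigned long long ext_sector(const struct iomap_ext *m,
                                         unsigned long long pos)
    {
        return m->blkno + (((pos & PAGE_MASK) - m->offset) >> 9);
    }

    int main(void)
    {
        struct iomap_ext m = { .offset = 0x200000, .blkno = 8192 };

        /* one page into the extent: 4096 / 512 = 8 sectors further on */
        printf("%llu\n", ext_sector(&m, 0x201000));
        return 0;
    }
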
+ */ + pos = (loff_t)pgoff << PAGE_SHIFT; + error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); + if (error) + goto fallback; + + if (iomap.offset + iomap.length < pos + PMD_SIZE) + goto finish_iomap; + + /* + * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX + * PMD or a HZP entry. If it can't (because a 4k page is already in + * the tree, for instance), it will return -EEXIST and we just fall + * back to 4k entries. + */ + entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); + if (IS_ERR(entry)) + goto finish_iomap; + + vmf.pgoff = pgoff; + vmf.flags = flags; + vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO; + + switch (iomap.type) { + case IOMAP_MAPPED: + result = dax_pmd_insert_mapping(vma, pmd, &vmf, address, + &iomap, pos, write, &entry); + break; + case IOMAP_UNWRITTEN: + case IOMAP_HOLE: + if (WARN_ON_ONCE(write)) + goto unlock_entry; + result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap, + &entry); + break; + default: + WARN_ON_ONCE(1); + break; + } + + unlock_entry: + put_locked_mapping_entry(mapping, pgoff, entry); + finish_iomap: + if (ops->iomap_end) { + int copied = PMD_SIZE; + + if (result == VM_FAULT_FALLBACK) + copied = 0; + /* + * The fault is done by now and there's no way back (other + * thread may be already happily using PMD we have installed). + * Just ignore error from ->iomap_end since we cannot do much + * with it. + */ + ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags, + &iomap); + } + fallback: + if (result == VM_FAULT_FALLBACK) { + split_huge_pmd(vma, pmd, address); + count_vm_event(THP_FAULT_FALLBACK); + } + return result; } -EXPORT_SYMBOL_GPL(iomap_dax_fault); -#endif /* CONFIG_FS_IOMAP */ +EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault); +#endif /* CONFIG_FS_DAX_PMD */ diff --git a/fs/dcache.c b/fs/dcache.c index 5c7cc953ac81..95d71eda8142 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -26,7 +26,7 @@ #include <linux/export.h> #include <linux/mount.h> #include <linux/file.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/security.h> #include <linux/seqlock.h> #include <linux/swap.h> @@ -1273,38 +1273,44 @@ rename_retry: goto again; } -/* - * Search for at least 1 mount point in the dentry's subdirs. - * We descend to the next level whenever the d_subdirs - * list is non-empty and continue searching. - */ +struct check_mount { + struct vfsmount *mnt; + unsigned int mounted; +}; -static enum d_walk_ret check_mount(void *data, struct dentry *dentry) +static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry) { - int *ret = data; - if (d_mountpoint(dentry)) { - *ret = 1; + struct check_mount *info = data; + struct path path = { .mnt = info->mnt, .dentry = dentry }; + + if (likely(!d_mountpoint(dentry))) + return D_WALK_CONTINUE; + if (__path_is_mountpoint(&path)) { + info->mounted = 1; return D_WALK_QUIT; } return D_WALK_CONTINUE; } /** - * have_submounts - check for mounts over a dentry - * @parent: dentry to check. + * path_has_submounts - check for mounts over a dentry in the + * current namespace. + * @parent: path to check. * * Return true if the parent or its subdirectories contain - * a mount point + * a mount point in the current namespace. 
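
path_has_submounts() below drives d_walk() with a small data struct and a callback that aborts the walk at the first mountpoint it finds. The shape is a visitor with early quit, sketched standalone here; the node type and names are illustrative:

    #include <stdio.h>

    enum walk_ret { WALK_CONTINUE, WALK_QUIT };

    struct node {
        int mounted;
        struct node *child, *next;
    };

    struct check { int found; };

    static enum walk_ret check_cb(void *data, struct node *n)
    {
        struct check *c = data;

        if (n->mounted) {
            c->found = 1;
            return WALK_QUIT;   /* first hit ends the whole walk */
        }
        return WALK_CONTINUE;
    }

    static enum walk_ret walk(struct node *n, void *data,
                              enum walk_ret (*cb)(void *, struct node *))
    {
        for (; n; n = n->next) {
            if (cb(data, n) == WALK_QUIT)
                return WALK_QUIT;
            if (walk(n->child, data, cb) == WALK_QUIT)
                return WALK_QUIT;
        }
        return WALK_CONTINUE;
    }

    int main(void)
    {
        struct node leaf = { .mounted = 1 };
        struct node root = { .child = &leaf };
        struct check c = { 0 };

        walk(&root, &c, check_cb);
        printf("found=%d\n", c.found);
        return 0;
    }
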
*/ -int have_submounts(struct dentry *parent) +int path_has_submounts(const struct path *parent) { - int ret = 0; + struct check_mount data = { .mnt = parent->mnt, .mounted = 0 }; - d_walk(parent, &ret, check_mount, NULL); + read_seqlock_excl(&mount_lock); + d_walk(parent->dentry, &data, path_check_mount, NULL); + read_sequnlock_excl(&mount_lock); - return ret; + return data.mounted; } -EXPORT_SYMBOL(have_submounts); +EXPORT_SYMBOL(path_has_submounts); /* * Called by mount code to set a mountpoint and check if the mountpoint is @@ -1330,8 +1336,11 @@ int d_set_mounted(struct dentry *dentry) } spin_lock(&dentry->d_lock); if (!d_unlinked(dentry)) { - dentry->d_flags |= DCACHE_MOUNTED; - ret = 0; + ret = -EBUSY; + if (!d_mountpoint(dentry)) { + dentry->d_flags |= DCACHE_MOUNTED; + ret = 0; + } } spin_unlock(&dentry->d_lock); out: diff --git a/fs/dcookies.c b/fs/dcookies.c index ac44a69fbea9..0d0461cf2431 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -26,7 +26,7 @@ #include <linux/mutex.h> #include <linux/path.h> #include <linux/compat.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* The dcookies are allocated from a kmem_cache and * hashed onto a small number of lists. None of the @@ -90,7 +90,7 @@ static void hash_dcookie(struct dcookie_struct * dcs) } -static struct dcookie_struct *alloc_dcookie(struct path *path) +static struct dcookie_struct *alloc_dcookie(const struct path *path) { struct dcookie_struct *dcs = kmem_cache_alloc(dcookie_cache, GFP_KERNEL); @@ -113,7 +113,7 @@ static struct dcookie_struct *alloc_dcookie(struct path *path) /* This is the main kernel-side routine that retrieves the cookie * value for a dentry/vfsmnt pair. */ -int get_dcookie(struct path *path, unsigned long *cookie) +int get_dcookie(const struct path *path, unsigned long *cookie) { int err = 0; struct dcookie_struct * dcs; diff --git a/fs/direct-io.c b/fs/direct-io.c index fb9aa16a7727..c87bae4376b8 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -457,7 +457,7 @@ static struct bio *dio_await_one(struct dio *dio) dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); if (!(dio->iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie)) + !blk_mq_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie)) io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); @@ -554,7 +554,7 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) * filesystems that don't need it and also allows us to create the workqueue * late enough so the we can include s_id in the name of the workqueue. */ -static int sb_init_dio_done_wq(struct super_block *sb) +int sb_init_dio_done_wq(struct super_block *sb) { struct workqueue_struct *old; struct workqueue_struct *wq = alloc_workqueue("dio/%s", @@ -843,24 +843,6 @@ out: } /* - * Clean any dirty buffers in the blockdev mapping which alias newly-created - * file blocks. Only called for S_ISREG files - blockdevs do not set - * buffer_new - */ -static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh) -{ - unsigned i; - unsigned nblocks; - - nblocks = map_bh->b_size >> dio->inode->i_blkbits; - - for (i = 0; i < nblocks; i++) { - unmap_underlying_metadata(map_bh->b_bdev, - map_bh->b_blocknr + i); - } -} - -/* * If we are not writing the entire block and get_block() allocated * the block for us, we need to fill-in the unused portion of the * block with zeros. 
This happens only if user-buffer, fileoffset or @@ -924,6 +906,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, struct buffer_head *map_bh) { const unsigned blkbits = sdio->blkbits; + const unsigned i_blkbits = blkbits + sdio->blkfactor; int ret = 0; while (sdio->block_in_file < sdio->final_block_in_request) { @@ -960,11 +943,15 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, goto do_holes; sdio->blocks_available = - map_bh->b_size >> sdio->blkbits; + map_bh->b_size >> blkbits; sdio->next_block_for_io = map_bh->b_blocknr << sdio->blkfactor; - if (buffer_new(map_bh)) - clean_blockdev_aliases(dio, map_bh); + if (buffer_new(map_bh)) { + clean_bdev_aliases( + map_bh->b_bdev, + map_bh->b_blocknr, + map_bh->b_size >> i_blkbits); + } if (!sdio->blkfactor) goto do_holes; @@ -1209,7 +1196,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, dio->inode = inode; if (iov_iter_rw(iter) == WRITE) { dio->op = REQ_OP_WRITE; - dio->op_flags = WRITE_ODIRECT; + dio->op_flags = REQ_SYNC | REQ_IDLE; } else { dio->op = REQ_OP_READ; } diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index dcea1e37a1b7..07fed838d8fd 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -268,7 +268,7 @@ void dlm_callback_work(struct work_struct *work) int dlm_callback_start(struct dlm_ls *ls) { ls->ls_callback_wq = alloc_workqueue("dlm_callback", - WQ_UNBOUND | WQ_MEM_RECLAIM, 0); + WQ_HIGHPRI | WQ_MEM_RECLAIM, 0); if (!ls->ls_callback_wq) { log_print("can't start dlm_callback workqueue"); return -ENOMEM; diff --git a/fs/dlm/config.c b/fs/dlm/config.c index df955d2209ce..7211e826d90d 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -12,7 +12,7 @@ ******************************************************************************/ #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/configfs.h> #include <linux/slab.h> #include <linux/in.h> diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 466f7d60edc2..ca7089aeadab 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -12,7 +12,7 @@ #include <linux/pagemap.h> #include <linux/seq_file.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/ctype.h> #include <linux/debugfs.h> #include <linux/slab.h> diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 216b61604ef9..748e8d59e611 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -18,7 +18,6 @@ * This is the main header file to be included in each DLM source file. 
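clean_bdev_aliases() folds the old per-block unmap_underlying_metadata() loop into one ranged call, which is why the clean_blockdev_aliases() helper is deleted above; callers now pass the starting block and a block count directly. A sketch of the conversion, matching the do_direct_IO() hunk (kernel-internal API):

#include <linux/buffer_head.h>

/* One ranged call replaces the removed per-block loop:
 *   for (i = 0; i < nblocks; i++)
 *           unmap_underlying_metadata(bdev, block + i);
 */
static void scrub_stale_aliases(struct block_device *bdev,
				sector_t block, sector_t nblocks)
{
	clean_bdev_aliases(bdev, block, nblocks);
}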
*/ -#include <linux/module.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/types.h> @@ -39,7 +38,7 @@ #include <linux/mutex.h> #include <linux/idr.h> #include <linux/ratelimit.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/dlm.h> #include "config.h" diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 35502d4046f5..6df332296c66 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1395,7 +1395,6 @@ static int nodeid_warned(int nodeid, int num_nodes, int *warned) void dlm_scan_waiters(struct dlm_ls *ls) { struct dlm_lkb *lkb; - ktime_t zero = ktime_set(0, 0); s64 us; s64 debug_maxus = 0; u32 debug_scanned = 0; @@ -1409,7 +1408,7 @@ void dlm_scan_waiters(struct dlm_ls *ls) mutex_lock(&ls->ls_waiters_mutex); list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { - if (ktime_equal(lkb->lkb_wait_time, zero)) + if (!lkb->lkb_wait_time) continue; debug_scanned++; @@ -1419,7 +1418,7 @@ void dlm_scan_waiters(struct dlm_ls *ls) if (us < dlm_config.ci_waitwarn_us) continue; - lkb->lkb_wait_time = zero; + lkb->lkb_wait_time = 0; debug_expired++; if (us > debug_maxus) diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index f3e72787e7f9..91592b75c309 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -11,6 +11,8 @@ ******************************************************************************* ******************************************************************************/ +#include <linux/module.h> + #include "dlm_internal.h" #include "lockspace.h" #include "member.h" diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 609998de533e..7d398d300e97 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -519,29 +519,25 @@ out: /* Note: sk_callback_lock must be locked before calling this function. */ static void save_callbacks(struct connection *con, struct sock *sk) { - lock_sock(sk); con->orig_data_ready = sk->sk_data_ready; con->orig_state_change = sk->sk_state_change; con->orig_write_space = sk->sk_write_space; con->orig_error_report = sk->sk_error_report; - release_sock(sk); } static void restore_callbacks(struct connection *con, struct sock *sk) { write_lock_bh(&sk->sk_callback_lock); - lock_sock(sk); sk->sk_user_data = NULL; sk->sk_data_ready = con->orig_data_ready; sk->sk_state_change = con->orig_state_change; sk->sk_write_space = con->orig_write_space; sk->sk_error_report = con->orig_error_report; - release_sock(sk); write_unlock_bh(&sk->sk_callback_lock); } /* Make a socket active */ -static void add_sock(struct socket *sock, struct connection *con) +static void add_sock(struct socket *sock, struct connection *con, bool save_cb) { struct sock *sk = sock->sk; @@ -549,7 +545,7 @@ static void add_sock(struct socket *sock, struct connection *con) con->sock = sock; sk->sk_user_data = con; - if (!test_bit(CF_IS_OTHERCON, &con->flags)) + if (save_cb) save_callbacks(con, sk); /* Install a data_ready callback */ sk->sk_data_ready = lowcomms_data_ready; @@ -806,7 +802,7 @@ static int tcp_accept_from_sock(struct connection *con) newcon->othercon = othercon; othercon->sock = newsock; newsock->sk->sk_user_data = othercon; - add_sock(newsock, othercon); + add_sock(newsock, othercon, false); addcon = othercon; } else { @@ -819,7 +815,10 @@ static int tcp_accept_from_sock(struct connection *con) else { newsock->sk->sk_user_data = newcon; newcon->rx_action = receive_from_sock; - add_sock(newsock, newcon); + /* accept copies the sk after we've saved the callbacks, so we + don't want to save them a second time or comm errors will + result in calling 
sk_error_report recursively. */ + add_sock(newsock, newcon, false); addcon = newcon; } @@ -880,7 +879,8 @@ static int sctp_accept_from_sock(struct connection *con) } make_sockaddr(&prim.ssp_addr, 0, &addr_len); - if (addr_to_nodeid(&prim.ssp_addr, &nodeid)) { + ret = addr_to_nodeid(&prim.ssp_addr, &nodeid); + if (ret) { unsigned char *b = (unsigned char *)&prim.ssp_addr; log_print("reject connect from unknown addr"); @@ -919,7 +919,7 @@ static int sctp_accept_from_sock(struct connection *con) newcon->othercon = othercon; othercon->sock = newsock; newsock->sk->sk_user_data = othercon; - add_sock(newsock, othercon); + add_sock(newsock, othercon, false); addcon = othercon; } else { printk("Extra connection from node %d attempted\n", nodeid); @@ -930,7 +930,7 @@ static int sctp_accept_from_sock(struct connection *con) } else { newsock->sk->sk_user_data = newcon; newcon->rx_action = receive_from_sock; - add_sock(newsock, newcon); + add_sock(newsock, newcon, false); addcon = newcon; } @@ -1058,7 +1058,7 @@ static void sctp_connect_to_sock(struct connection *con) sock->sk->sk_user_data = con; con->rx_action = receive_from_sock; con->connect_action = sctp_connect_to_sock; - add_sock(sock, con); + add_sock(sock, con, true); /* Bind to all addresses. */ if (sctp_bind_addrs(con, 0)) @@ -1146,7 +1146,7 @@ static void tcp_connect_to_sock(struct connection *con) sock->sk->sk_user_data = con; con->rx_action = receive_from_sock; con->connect_action = tcp_connect_to_sock; - add_sock(sock, con); + add_sock(sock, con, true); /* Bind to our cluster-known address connecting to avoid routing problems */ @@ -1366,7 +1366,7 @@ static int tcp_listen_for_all(void) sock = tcp_create_listen_sock(con, dlm_local_addr[0]); if (sock) { - add_sock(sock, con); + add_sock(sock, con, true); result = 0; } else { diff --git a/fs/dlm/main.c b/fs/dlm/main.c index 079c0bd71ab7..8e1b618891be 100644 --- a/fs/dlm/main.c +++ b/fs/dlm/main.c @@ -11,6 +11,8 @@ ******************************************************************************* ******************************************************************************/ +#include <linux/module.h> + #include "dlm_internal.h" #include "lockspace.h" #include "lock.h" diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index 1e6e227134d7..43a96c330570 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -16,11 +16,7 @@ static uint32_t dlm_nl_seqnum; static uint32_t listener_nlportid; -static struct genl_family family = { - .id = GENL_ID_GENERATE, - .name = DLM_GENL_NAME, - .version = DLM_GENL_VERSION, -}; +static struct genl_family family; static int prepare_data(u8 cmd, struct sk_buff **skbp, size_t size) { @@ -69,16 +65,24 @@ static int user_cmd(struct sk_buff *skb, struct genl_info *info) return 0; } -static struct genl_ops dlm_nl_ops[] = { +static const struct genl_ops dlm_nl_ops[] = { { .cmd = DLM_CMD_HELLO, .doit = user_cmd, }, }; +static struct genl_family family __ro_after_init = { + .name = DLM_GENL_NAME, + .version = DLM_GENL_VERSION, + .ops = dlm_nl_ops, + .n_ops = ARRAY_SIZE(dlm_nl_ops), + .module = THIS_MODULE, +}; + int __init dlm_netlink_init(void) { - return genl_register_family_with_ops(&family, dlm_nl_ops); + return genl_register_family(&family); } void dlm_netlink_exit(void) diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 58c2f4a21b7f..1ce908c2232c 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -9,7 +9,6 @@ #include <linux/miscdevice.h> #include <linux/init.h> #include <linux/wait.h> -#include <linux/module.h> #include <linux/file.h> #include <linux/fs.h> #include 
<linux/poll.h> diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index cf390dceddd2..e7413f82d27b 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -631,28 +631,23 @@ out_lock: static char *ecryptfs_readlink_lower(struct dentry *dentry, size_t *bufsiz) { + DEFINE_DELAYED_CALL(done); struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - char *lower_buf; + const char *link; char *buf; - mm_segment_t old_fs; int rc; - lower_buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!lower_buf) - return ERR_PTR(-ENOMEM); - old_fs = get_fs(); - set_fs(get_ds()); - rc = d_inode(lower_dentry)->i_op->readlink(lower_dentry, - (char __user *)lower_buf, - PATH_MAX); - set_fs(old_fs); - if (rc < 0) - goto out; + link = vfs_get_link(lower_dentry, &done); + if (IS_ERR(link)) + return ERR_CAST(link); + rc = ecryptfs_decode_and_decrypt_filename(&buf, bufsiz, dentry->d_sb, - lower_buf, rc); -out: - kfree(lower_buf); - return rc ? ERR_PTR(rc) : buf; + link, strlen(link)); + do_delayed_call(&done); + if (rc) + return ERR_PTR(rc); + + return buf; } static const char *ecryptfs_get_link(struct dentry *dentry, @@ -1089,7 +1084,6 @@ out: } const struct inode_operations ecryptfs_symlink_iops = { - .readlink = generic_readlink, .get_link = ecryptfs_get_link, .permission = ecryptfs_permission, .setattr = ecryptfs_setattr, diff --git a/fs/efs/efs.h b/fs/efs/efs.h index 5bbf9612140c..70f5d4f9a945 100644 --- a/fs/efs/efs.h +++ b/fs/efs/efs.h @@ -14,7 +14,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/fs.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define EFS_VERSION "1.0a" diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 10db91218933..bcb68fcc8445 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -34,7 +34,7 @@ #include <linux/mutex.h> #include <linux/anon_inodes.h> #include <linux/device.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/io.h> #include <asm/mman.h> #include <linux/atomic.h> diff --git a/fs/exec.c b/fs/exec.c index 4e497b9ee71e..e57946610733 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -19,7 +19,7 @@ * current->executable is only used by the procfs. This allows a dispatch * table to check for several different types of binary formats. We keep * trying until we recognize the file or we run out of supported binary - * formats. + * formats. */ #include <linux/slab.h> @@ -58,7 +58,7 @@ #include <linux/compat.h> #include <linux/vmalloc.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/mmu_context.h> #include <asm/tlb.h> @@ -209,7 +209,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, * doing the exec and bprm->mm is the new process's mm. */ ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags, - &page, NULL); + &page, NULL, NULL); if (ret <= 0) return NULL; @@ -1169,8 +1169,10 @@ no_thread_group: /* we have changed execution domain */ tsk->exit_signal = SIGCHLD; +#ifdef CONFIG_POSIX_TIMERS exit_itimers(sig); flush_itimer_signals(); +#endif if (atomic_read(&oldsighand->count) != 1) { struct sighand_struct *newsighand; @@ -1266,6 +1268,13 @@ int flush_old_exec(struct linux_binprm * bprm) flush_thread(); current->personality &= ~bprm->per_clear; + /* + * We have to apply CLOEXEC before we change whether the process is + * dumpable (in setup_new_exec) to avoid a race with a process in userspace + * trying to access the should-be-closed file descriptors of a process + * undergoing exec(2). 
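The ecryptfs hunk above replaces a set_fs()/->readlink() round trip through a fake userspace buffer with vfs_get_link(), which returns a kernel pointer that stays pinned until the delayed call runs. A sketch of the general pattern (the helper name and the strscpy() copy are illustrative, not part of this patch):

#include <linux/fs.h>
#include <linux/delayed_call.h>
#include <linux/string.h>

/* Read another dentry's symlink target without set_fs() tricks. */
static int copy_link_target(struct dentry *dentry, char *buf, size_t size)
{
	DEFINE_DELAYED_CALL(done);
	const char *link;

	link = vfs_get_link(dentry, &done);
	if (IS_ERR(link))
		return PTR_ERR(link);
	strscpy(buf, link, size);
	do_delayed_call(&done);	/* drop whatever ->get_link() pinned */
	return 0;
}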
+ */ + do_close_on_exec(current->files); return 0; out: @@ -1275,8 +1284,22 @@ EXPORT_SYMBOL(flush_old_exec); void would_dump(struct linux_binprm *bprm, struct file *file) { - if (inode_permission(file_inode(file), MAY_READ) < 0) + struct inode *inode = file_inode(file); + if (inode_permission(inode, MAY_READ) < 0) { + struct user_namespace *old, *user_ns; bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; + + /* Ensure mm->user_ns contains the executable */ + user_ns = old = bprm->mm->user_ns; + while ((user_ns != &init_user_ns) && + !privileged_wrt_inode_uidgid(user_ns, inode)) + user_ns = user_ns->parent; + + if (old != user_ns) { + bprm->mm->user_ns = get_user_ns(user_ns); + put_user_ns(old); + } + } } EXPORT_SYMBOL(would_dump); @@ -1306,7 +1329,6 @@ void setup_new_exec(struct linux_binprm * bprm) !gid_eq(bprm->cred->gid, current_egid())) { current->pdeath_signal = 0; } else { - would_dump(bprm, bprm->file); if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) set_dumpable(current->mm, suid_dumpable); } @@ -1315,7 +1337,6 @@ void setup_new_exec(struct linux_binprm * bprm) group */ current->self_exec_id++; flush_signal_handlers(current, 0); - do_close_on_exec(current->files); } EXPORT_SYMBOL(setup_new_exec); @@ -1406,7 +1427,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm) unsigned n_fs; if (p->ptrace) { - if (p->ptrace & PT_PTRACE_CAP) + if (ptracer_capable(p, current_user_ns())) bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP; else bprm->unsafe |= LSM_UNSAFE_PTRACE; @@ -1741,6 +1762,8 @@ static int do_execveat_common(int fd, struct filename *filename, if (retval < 0) goto out; + would_dump(bprm, bprm->file); + retval = exec_binprm(bprm); if (retval < 0) goto out; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index d8072bc074a4..0ac62811b341 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -870,46 +870,31 @@ int exofs_write_begin(struct file *file, struct address_space *mapping, page = *pagep; if (page == NULL) { - ret = simple_write_begin(file, mapping, pos, len, flags, pagep, - fsdata); - if (ret) { - EXOFS_DBGMSG("simple_write_begin failed\n"); - goto out; + page = grab_cache_page_write_begin(mapping, pos >> PAGE_SHIFT, + flags); + if (!page) { + EXOFS_DBGMSG("grab_cache_page_write_begin failed\n"); + return -ENOMEM; } - - page = *pagep; + *pagep = page; } /* read modify write */ if (!PageUptodate(page) && (len != PAGE_SIZE)) { loff_t i_size = i_size_read(mapping->host); pgoff_t end_index = i_size >> PAGE_SHIFT; - size_t rlen; - if (page->index < end_index) - rlen = PAGE_SIZE; - else if (page->index == end_index) - rlen = i_size & ~PAGE_MASK; - else - rlen = 0; - - if (!rlen) { + if (page->index > end_index) { clear_highpage(page); SetPageUptodate(page); - goto out; - } - - ret = _readpage(page, true); - if (ret) { - /*SetPageError was done by _readpage. 
Is it ok?*/ - unlock_page(page); - EXOFS_DBGMSG("__readpage failed\n"); + } else { + ret = _readpage(page, true); + if (ret) { + unlock_page(page); + EXOFS_DBGMSG("__readpage failed\n"); + } } } -out: - if (unlikely(ret)) - _write_failed(mapping->host, pos + len); - return ret; } @@ -929,18 +914,25 @@ static int exofs_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = mapping->host; - /* According to comment in simple_write_end i_mutex is held */ - loff_t i_size = inode->i_size; - int ret; - - ret = simple_write_end(file, mapping,pos, len, copied, page, fsdata); - if (unlikely(ret)) - _write_failed(inode, pos + len); + loff_t last_pos = pos + copied; - /* TODO: once simple_write_end marks inode dirty remove */ - if (i_size != inode->i_size) + if (!PageUptodate(page)) { + if (copied < len) { + _write_failed(inode, pos + len); + copied = 0; + goto out; + } + SetPageUptodate(page); + } + if (last_pos > inode->i_size) { + i_size_write(inode, last_pos); mark_inode_dirty(inode); - return ret; + } + set_page_dirty(page); +out: + unlock_page(page); + put_page(page); + return copied; } static int exofs_releasepage(struct page *page, gfp_t gfp) diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig index 36bea5adcaba..c634874e12d9 100644 --- a/fs/ext2/Kconfig +++ b/fs/ext2/Kconfig @@ -1,6 +1,5 @@ config EXT2_FS tristate "Second extended fs support" - select FS_IOMAP if FS_DAX help Ext2 is a standard Linux file system for hard disks. diff --git a/fs/ext2/file.c b/fs/ext2/file.c index a0e1478dfd04..b0f241528a30 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -38,7 +38,7 @@ static ssize_t ext2_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) return 0; /* skip atime */ inode_lock_shared(inode); - ret = iomap_dax_rw(iocb, to, &ext2_iomap_ops); + ret = dax_iomap_rw(iocb, to, &ext2_iomap_ops); inode_unlock_shared(inode); file_accessed(iocb->ki_filp); @@ -62,7 +62,7 @@ static ssize_t ext2_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret) goto out_unlock; - ret = iomap_dax_rw(iocb, from, &ext2_iomap_ops); + ret = dax_iomap_rw(iocb, from, &ext2_iomap_ops); if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { i_size_write(inode, iocb->ki_pos); mark_inode_dirty(inode); @@ -99,7 +99,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } down_read(&ei->dax_sem); - ret = iomap_dax_fault(vma, vmf, &ext2_iomap_ops); + ret = dax_iomap_fault(vma, vmf, &ext2_iomap_ops); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) @@ -107,27 +107,6 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return ret; } -static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, unsigned int flags) -{ - struct inode *inode = file_inode(vma->vm_file); - struct ext2_inode_info *ei = EXT2_I(inode); - int ret; - - if (flags & FAULT_FLAG_WRITE) { - sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); - } - down_read(&ei->dax_sem); - - ret = dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block); - - up_read(&ei->dax_sem); - if (flags & FAULT_FLAG_WRITE) - sb_end_pagefault(inode->i_sb); - return ret; -} - static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -154,7 +133,11 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, static const struct vm_operations_struct ext2_dax_vm_ops = { .fault = ext2_dax_fault, - .pmd_fault = ext2_dax_pmd_fault, + /* + * .pmd_fault is not supported for DAX because allocation in 
ext2 + * cannot be reliably aligned to huge page sizes and so pmd faults + * will always fail and fail back to regular faults. + */ .page_mkwrite = ext2_dax_fault, .pfn_mkwrite = ext2_dax_pfn_mkwrite, }; @@ -166,7 +149,7 @@ static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); vma->vm_ops = &ext2_dax_vm_ops; - vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + vma->vm_flags |= VM_MIXEDMAP; return 0; } #else diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 41b8b44a391c..f073bfca694b 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -732,16 +732,13 @@ static int ext2_get_blocks(struct inode *inode, } if (IS_DAX(inode)) { - int i; - /* * We must unmap blocks before zeroing so that writeback cannot * overwrite zeros with stale data from block device page cache. */ - for (i = 0; i < count; i++) { - unmap_underlying_metadata(inode->i_sb->s_bdev, - le32_to_cpu(chain[depth-1].key) + i); - } + clean_bdev_aliases(inode->i_sb->s_bdev, + le32_to_cpu(chain[depth-1].key), + count); /* * block must be initialised before we put it in the tree * so that it's not found by another thread before it's @@ -754,9 +751,8 @@ static int ext2_get_blocks(struct inode *inode, mutex_unlock(&ei->truncate_mutex); goto cleanup; } - } else { - *new = true; } + *new = true; ext2_splice_branch(inode, iblock, partial, indirect_blks, count); mutex_unlock(&ei->truncate_mutex); @@ -850,6 +846,9 @@ struct iomap_ops ext2_iomap_ops = { .iomap_begin = ext2_iomap_begin, .iomap_end = ext2_iomap_end, }; +#else +/* Define empty ops for !CONFIG_FS_DAX case to avoid ugly ifdefs */ +struct iomap_ops ext2_iomap_ops; #endif /* CONFIG_FS_DAX */ int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, @@ -1293,9 +1292,11 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) inode_dio_wait(inode); - if (IS_DAX(inode)) - error = dax_truncate_page(inode, newsize, ext2_get_block); - else if (test_opt(inode->i_sb, NOBH)) + if (IS_DAX(inode)) { + error = iomap_zero_range(inode, newsize, + PAGE_ALIGN(newsize) - newsize, NULL, + &ext2_iomap_ops); + } else if (test_opt(inode->i_sb, NOBH)) error = nobh_truncate_page(inode->i_mapping, newsize, ext2_get_block); else @@ -1476,6 +1477,10 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) inode->i_size |= ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; else ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); + if (i_size_read(inode) < 0) { + ret = -EFSCORRUPTED; + goto bad_inode; + } ei->i_dtime = 0; inode->i_generation = le32_to_cpu(raw_inode->i_generation); ei->i_state = 0; diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 9d617423e936..191e02b28ce8 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -14,7 +14,7 @@ #include <linux/compat.h> #include <linux/mount.h> #include <asm/current.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 6cb042b53b5b..9e25a71fe1a2 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -31,7 +31,7 @@ #include <linux/mount.h> #include <linux/log2.h> #include <linux/quotaops.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "ext2.h" #include "xattr.h" #include "acl.h" diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c index 8437b191bf5d..eeffb0138a17 100644 --- a/fs/ext2/symlink.c +++ b/fs/ext2/symlink.c @@ -21,7 +21,6 @@ #include "xattr.h" const struct inode_operations ext2_symlink_inode_operations = { - .readlink = generic_readlink, 
.get_link = page_get_link, .setattr = ext2_setattr, #ifdef CONFIG_EXT2_FS_XATTR @@ -30,7 +29,6 @@ const struct inode_operations ext2_symlink_inode_operations = { }; const struct inode_operations ext2_fast_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = simple_get_link, .setattr = ext2_setattr, #ifdef CONFIG_EXT2_FS_XATTR diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index dfa519979038..fd389935ecd1 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -196,7 +196,7 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, error = posix_acl_update_mode(inode, &inode->i_mode, &acl); if (error) return error; - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); } break; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a8a750f59621..2163c1e69f2a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -397,8 +397,9 @@ struct flex_groups { #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ #define EXT4_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x204380FF /* User modifiable flags */ +#define EXT4_FL_USER_MODIFIABLE 0x204BC0FF /* User modifiable flags */ +/* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */ #define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ EXT4_IMMUTABLE_FL | \ EXT4_APPEND_FL | \ @@ -1533,12 +1534,6 @@ static inline struct ext4_inode_info *EXT4_I(struct inode *inode) return container_of(inode, struct ext4_inode_info, vfs_inode); } -static inline struct timespec ext4_current_time(struct inode *inode) -{ - return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? - current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; -} - static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) { return ino == EXT4_ROOT_INO || @@ -2277,11 +2272,6 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb, struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); -static inline int ext4_sb_has_crypto(struct super_block *sb) -{ - return ext4_has_feature_encrypt(sb); -} - static inline bool ext4_encrypted_inode(struct inode *inode) { return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT); @@ -2339,8 +2329,8 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname) { } #define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page #define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page #define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range -#define fscrypt_process_policy fscrypt_notsupp_process_policy -#define fscrypt_get_policy fscrypt_notsupp_get_policy +#define fscrypt_ioctl_set_policy fscrypt_notsupp_ioctl_set_policy +#define fscrypt_ioctl_get_policy fscrypt_notsupp_ioctl_get_policy #define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context #define fscrypt_inherit_context fscrypt_notsupp_inherit_context #define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info @@ -2458,8 +2448,6 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); -int ext4_dax_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_dio_get_block(struct inode *inode, sector_t iblock, @@ -2492,7 
+2480,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); -extern void ext4_truncate(struct inode *); +extern int ext4_truncate(struct inode *); extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); extern void ext4_set_inode_flags(struct inode *); @@ -3129,7 +3117,7 @@ extern int ext4_ext_writepage_trans_blocks(struct inode *, int); extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); -extern void ext4_ext_truncate(handle_t *, struct inode *); +extern int ext4_ext_truncate(handle_t *, struct inode *); extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end); extern void ext4_ext_init(struct super_block *); @@ -3265,12 +3253,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) } } -static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len) -{ - int blksize = 1 << inode->i_blkbits; - - return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize); -} +extern struct iomap_ops ext4_iomap_ops; #endif /* __KERNEL__ */ diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index b1d52c14098e..f97611171023 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -414,17 +414,19 @@ static inline int ext4_inode_journal_mode(struct inode *inode) return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ /* We do not support data journalling with delayed allocation */ if (!S_ISREG(inode->i_mode) || - test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) - return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ - if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && - !test_opt(inode->i_sb, DELALLOC)) + test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && + !test_opt(inode->i_sb, DELALLOC))) { + /* We do not support data journalling for encrypted data */ + if (S_ISREG(inode->i_mode) && ext4_encrypted_inode(inode)) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + } if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ - else - BUG(); + BUG(); } static inline int ext4_should_journal_data(struct inode *inode) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c930a0110fb4..3e295d3350a9 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -37,7 +37,7 @@ #include <linux/quotaops.h> #include <linux/string.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/fiemap.h> #include <linux/backing-dev.h> #include "ext4_jbd2.h" @@ -3777,14 +3777,6 @@ out: return err; } -static void unmap_underlying_metadata_blocks(struct block_device *bdev, - sector_t block, int count) -{ - int i; - for (i = 0; i < count; i++) - unmap_underlying_metadata(bdev, block + i); -} - /* * Handle EOFBLOCKS_FL flag, clearing it if necessary */ @@ -4121,9 +4113,8 @@ out: * new. 
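The ext4_jbd2.h change above encodes one rule: data journalling and encryption do not mix, so an encrypted regular file that would otherwise qualify for journal-data mode is demoted to ordered mode. Restated as a sketch, with wants_data_journalling() as a hypothetical predicate standing in for the mount-flag and inode-flag tests, and the writeback cases elided:

/* Hypothetical restatement of ext4_inode_journal_mode()'s new branch. */
static int journal_mode_sketch(struct inode *inode)
{
	if (wants_data_journalling(inode)) {
		/* no data journalling for encrypted regular files */
		if (S_ISREG(inode->i_mode) && ext4_encrypted_inode(inode))
			return EXT4_INODE_ORDERED_DATA_MODE;
		return EXT4_INODE_JOURNAL_DATA_MODE;
	}
	return EXT4_INODE_ORDERED_DATA_MODE;
}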
*/ if (allocated > map->m_len) { - unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, - newblock + map->m_len, - allocated - map->m_len); + clean_bdev_aliases(inode->i_sb->s_bdev, newblock + map->m_len, + allocated - map->m_len); allocated = map->m_len; } map->m_len = allocated; @@ -4631,7 +4622,7 @@ out2: return err ? err : allocated; } -void ext4_ext_truncate(handle_t *handle, struct inode *inode) +int ext4_ext_truncate(handle_t *handle, struct inode *inode) { struct super_block *sb = inode->i_sb; ext4_lblk_t last_block; @@ -4645,7 +4636,9 @@ void ext4_ext_truncate(handle_t *handle, struct inode *inode) /* we have to know where to truncate from in crash case */ EXT4_I(inode)->i_disksize = inode->i_size; - ext4_mark_inode_dirty(handle, inode); + err = ext4_mark_inode_dirty(handle, inode); + if (err) + return err; last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); @@ -4657,12 +4650,9 @@ retry: congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } - if (err) { - ext4_std_error(inode->i_sb, err); - return; - } - err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); - ext4_std_error(inode->i_sb, err); + if (err) + return err; + return ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); } static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, @@ -4701,7 +4691,7 @@ retry: /* * Recalculate credits when extent tree depth changes. */ - if (depth >= 0 && depth != ext_depth(inode)) { + if (depth != ext_depth(inode)) { credits = ext4_chunk_trans_blocks(inode, len); depth = ext_depth(inode); } @@ -4725,7 +4715,7 @@ retry: map.m_lblk += ret; map.m_len = len = len - ret; epos = (loff_t)map.m_lblk << inode->i_blkbits; - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); if (new_size) { if (epos > new_size) epos = new_size; @@ -4853,7 +4843,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, } /* Now release the pages and zero block aligned part of pages */ truncate_pagecache_range(inode, start, end - 1); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags, mode); @@ -4878,7 +4868,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, goto out_dio; } - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); if (new_size) { ext4_update_inode_size(inode, new_size); } else { @@ -5568,7 +5558,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); out_stop: @@ -5678,7 +5668,7 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; EXT4_I(inode)->i_disksize += len; - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ret = ext4_mark_inode_dirty(handle, inode); if (ret) goto out_stop; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 2a822d30e73f..d663d3d7c81c 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -31,6 +31,42 @@ #include "xattr.h" #include "acl.h" +#ifdef CONFIG_FS_DAX +static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) 
+{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + inode_lock_shared(inode); + /* + * Recheck under inode lock - at this point we are sure it cannot + * change anymore + */ + if (!IS_DAX(inode)) { + inode_unlock_shared(inode); + /* Fallback to buffered IO in case we cannot support DAX */ + return generic_file_read_iter(iocb, to); + } + ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops); + inode_unlock_shared(inode); + + file_accessed(iocb->ki_filp); + return ret; +} +#endif + +static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + if (!iov_iter_count(to)) + return 0; /* skip atime */ + +#ifdef CONFIG_FS_DAX + if (IS_DAX(file_inode(iocb->ki_filp))) + return ext4_dax_read_iter(iocb, to); +#endif + return generic_file_read_iter(iocb, to); +} + /* * Called when an inode is released. Note that this is different * from ext4_file_open: open gets called at every open, but release @@ -88,6 +124,86 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) return 0; } +/* Is IO overwriting allocated and initialized blocks? */ +static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len) +{ + struct ext4_map_blocks map; + unsigned int blkbits = inode->i_blkbits; + int err, blklen; + + if (pos + len > i_size_read(inode)) + return false; + + map.m_lblk = pos >> blkbits; + map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits); + blklen = map.m_len; + + err = ext4_map_blocks(NULL, inode, &map, 0); + /* + * 'err==len' means that all of the blocks have been preallocated, + * regardless of whether they have been initialized or not. To exclude + * unwritten extents, we need to check m_flags. + */ + return err == blklen && (map.m_flags & EXT4_MAP_MAPPED); +} + +static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + return ret; + /* + * If we have encountered a bitmap-format file, the size limit + * is smaller than s_maxbytes, which is for extent-mapped files. 
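The mapping request in ext4_overwrite_io() converts a byte range into a logical block range before calling ext4_map_blocks(); a standalone worked example of that arithmetic with 4K blocks (ordinary userspace C, not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned blkbits = 12;			/* 4096-byte blocks */
	long long pos = 6144, len = 10240;	/* bytes 6144..16383 */
	long long first = pos >> blkbits;
	long long last = (pos + len - 1) >> blkbits;

	/* prints m_lblk=1 m_len=3: the range spans blocks 1, 2 and 3 */
	printf("m_lblk=%lld m_len=%lld\n", first, last - first + 1);
	return 0;
}

ext4_map_blocks() returning exactly that m_len with EXT4_MAP_MAPPED set is what lets the write path treat the I/O as a pure overwrite.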
+ */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) + return -EFBIG; + iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); + } + return iov_iter_count(from); +} + +#ifdef CONFIG_FS_DAX +static ssize_t +ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + bool overwrite = false; + + inode_lock(inode); + ret = ext4_write_checks(iocb, from); + if (ret <= 0) + goto out; + ret = file_remove_privs(iocb->ki_filp); + if (ret) + goto out; + ret = file_update_time(iocb->ki_filp); + if (ret) + goto out; + + if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) { + overwrite = true; + downgrade_write(&inode->i_rwsem); + } + ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); +out: + if (!overwrite) + inode_unlock(inode); + else + inode_unlock_shared(inode); + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; +} +#endif + static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { @@ -97,8 +213,13 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) int overwrite = 0; ssize_t ret; +#ifdef CONFIG_FS_DAX + if (IS_DAX(inode)) + return ext4_dax_write_iter(iocb, from); +#endif + inode_lock(inode); - ret = generic_write_checks(iocb, from); + ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; @@ -114,53 +235,11 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ext4_unwritten_wait(inode); } - /* - * If we have encountered a bitmap-format file, the size limit - * is smaller than s_maxbytes, which is for extent-mapped files. - */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - - if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) { - ret = -EFBIG; - goto out; - } - iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); - } - iocb->private = &overwrite; - if (o_direct) { - size_t length = iov_iter_count(from); - loff_t pos = iocb->ki_pos; - - /* check whether we do a DIO overwrite or not */ - if (ext4_should_dioread_nolock(inode) && !unaligned_aio && - pos + length <= i_size_read(inode)) { - struct ext4_map_blocks map; - unsigned int blkbits = inode->i_blkbits; - int err, len; - - map.m_lblk = pos >> blkbits; - map.m_len = EXT4_MAX_BLOCKS(length, pos, blkbits); - len = map.m_len; - - err = ext4_map_blocks(NULL, inode, &map, 0); - /* - * 'err==len' means that all of blocks has - * been preallocated no matter they are - * initialized or not. For excluding - * unwritten extents, we need to check - * m_flags. There are two conditions that - * indicate for initialized extents. 1) If we - * hit extent cache, EXT4_MAP_MAPPED flag is - * returned; 2) If we do a real lookup, - * non-flags are returned. So we should check - * these two conditions. 
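ext4_dax_write_iter() above takes the exclusive inode lock for the checks and, once ext4_overwrite_io() proves no allocation can occur, downgrades to the shared lock for the I/O itself; the unlock at the end must then match whichever mode is held. The pattern in isolation (i_rwsem is the lock behind inode_lock(), per the 4.10 API):

#include <linux/rwsem.h>

/* Sketch: exclusive for the checks, shared for a pure overwrite. */
static void overwrite_locking_sketch(struct rw_semaphore *sem, bool overwrite)
{
	down_write(sem);
	/* ... size and permission checks under the exclusive lock ... */
	if (overwrite) {
		downgrade_write(sem);	/* now held shared, never dropped */
		/* ... I/O proceeds concurrently with other readers ... */
		up_read(sem);
	} else {
		/* ... I/O that may allocate stays exclusive ... */
		up_write(sem);
	}
}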
- */ - if (err == len && (map.m_flags & EXT4_MAP_MAPPED)) - overwrite = 1; - } - } + /* Check whether we do a DIO overwrite or not */ + if (o_direct && ext4_should_dioread_nolock(inode) && !unaligned_aio && + ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) + overwrite = 1; ret = __generic_file_write_iter(iocb, from); inode_unlock(inode); @@ -179,7 +258,6 @@ out: static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { int result; - handle_t *handle = NULL; struct inode *inode = file_inode(vma->vm_file); struct super_block *sb = inode->i_sb; bool write = vmf->flags & FAULT_FLAG_WRITE; @@ -187,24 +265,12 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (write) { sb_start_pagefault(sb); file_update_time(vma->vm_file); - down_read(&EXT4_I(inode)->i_mmap_sem); - handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, - EXT4_DATA_TRANS_BLOCKS(sb)); - } else - down_read(&EXT4_I(inode)->i_mmap_sem); - - if (IS_ERR(handle)) - result = VM_FAULT_SIGBUS; - else - result = dax_fault(vma, vmf, ext4_dax_get_block); - - if (write) { - if (!IS_ERR(handle)) - ext4_journal_stop(handle); - up_read(&EXT4_I(inode)->i_mmap_sem); + } + down_read(&EXT4_I(inode)->i_mmap_sem); + result = dax_iomap_fault(vma, vmf, &ext4_iomap_ops); + up_read(&EXT4_I(inode)->i_mmap_sem); + if (write) sb_end_pagefault(sb); - } else - up_read(&EXT4_I(inode)->i_mmap_sem); return result; } @@ -213,7 +279,6 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags) { int result; - handle_t *handle = NULL; struct inode *inode = file_inode(vma->vm_file); struct super_block *sb = inode->i_sb; bool write = flags & FAULT_FLAG_WRITE; @@ -221,26 +286,13 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, if (write) { sb_start_pagefault(sb); file_update_time(vma->vm_file); - down_read(&EXT4_I(inode)->i_mmap_sem); - handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, - ext4_chunk_trans_blocks(inode, - PMD_SIZE / PAGE_SIZE)); - } else - down_read(&EXT4_I(inode)->i_mmap_sem); - - if (IS_ERR(handle)) - result = VM_FAULT_SIGBUS; - else - result = dax_pmd_fault(vma, addr, pmd, flags, - ext4_dax_get_block); - - if (write) { - if (!IS_ERR(handle)) - ext4_journal_stop(handle); - up_read(&EXT4_I(inode)->i_mmap_sem); + } + down_read(&EXT4_I(inode)->i_mmap_sem); + result = dax_iomap_pmd_fault(vma, addr, pmd, flags, + &ext4_iomap_ops); + up_read(&EXT4_I(inode)->i_mmap_sem); + if (write) sb_end_pagefault(sb); - } else - up_read(&EXT4_I(inode)->i_mmap_sem); return result; } @@ -687,7 +739,7 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, - .read_iter = generic_file_read_iter, + .read_iter = ext4_file_read_iter, .write_iter = ext4_file_write_iter, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 170421edfdfe..e57e8d90ea54 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1039,7 +1039,7 @@ got: /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = - ext4_current_time(inode); + current_time(inode); memset(ei->i_data, 0, sizeof(ei->i_data)); ei->i_dir_start_lookup = 0; @@ -1115,8 +1115,7 @@ got: } if (encrypt) { - /* give pointer to avoid set_context with journal ops. 
*/ - err = fscrypt_inherit_context(dir, inode, &encrypt, true); + err = fscrypt_inherit_context(dir, inode, handle, true); if (err) goto fail_free_drop; } diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index f74d5ee2cdec..437df6a1a841 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -299,6 +299,11 @@ static int ext4_create_inline_data(handle_t *handle, EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE; ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA); + /* + * Propagate changes to inode->i_flags as well - e.g. S_DAX may + * get cleared + */ + ext4_set_inode_flags(inode); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); @@ -336,8 +341,10 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode, len -= EXT4_MIN_INLINE_DATA_SIZE; value = kzalloc(len, GFP_NOFS); - if (!value) + if (!value) { + error = -ENOMEM; goto out; + } error = ext4_xattr_ibody_get(inode, i.name_index, i.name, value, len); @@ -442,6 +449,11 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle, } } ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA); + /* + * Propagate changes to inode->i_flags as well - e.g. S_DAX may + * get set. + */ + ext4_set_inode_flags(inode); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); @@ -1028,7 +1040,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, * happen is that the times are slightly out of date * and/or different from the directory change time. */ - dir->i_mtime = dir->i_ctime = ext4_current_time(dir); + dir->i_mtime = dir->i_ctime = current_time(dir); ext4_update_dx_flag(dir); dir->i_version++; ext4_mark_inode_dirty(handle, dir); @@ -1971,7 +1983,7 @@ out: if (inode->i_nlink) ext4_orphan_del(handle, inode); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); if (IS_SYNC(inode)) ext4_handle_sync(handle); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9c064727ed62..88d57af1b516 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -37,6 +37,7 @@ #include <linux/printk.h> #include <linux/slab.h> #include <linux/bitops.h> +#include <linux/iomap.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -71,10 +72,9 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; - csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, - EXT4_INODE_SIZE(inode->i_sb) - - offset); } + csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + EXT4_INODE_SIZE(inode->i_sb) - offset); } return csum; @@ -261,8 +261,15 @@ void ext4_evict_inode(struct inode *inode) "couldn't mark inode dirty (err %d)", err); goto stop_handle; } - if (inode->i_blocks) - ext4_truncate(inode); + if (inode->i_blocks) { + err = ext4_truncate(inode); + if (err) { + ext4_error(inode->i_sb, + "couldn't truncate inode %lu (err %d)", + inode->i_ino, err); + goto stop_handle; + } + } /* * ext4_ext_truncate() doesn't reserve any slop when it @@ -654,12 +661,8 @@ found: if (flags & EXT4_GET_BLOCKS_ZERO && map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { - ext4_lblk_t i; - - for (i = 0; i < map->m_len; i++) { - unmap_underlying_metadata(inode->i_sb->s_bdev, - map->m_pblk + i); - } + clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk, + map->m_len); ret = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, map->m_len); if (ret) { @@ -767,6 +770,9 @@ static int 
_ext4_get_block(struct inode *inode, sector_t iblock, ext4_update_bh_state(bh, map.m_flags); bh->b_size = inode->i_sb->s_blocksize * map.m_len; ret = 0; + } else if (ret == 0) { + /* hole case, need to fill in bh->b_size */ + bh->b_size = inode->i_sb->s_blocksize * map.m_len; } return ret; } @@ -1127,8 +1133,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, if (err) break; if (buffer_new(bh)) { - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); if (PageUptodate(page)) { clear_buffer_new(bh); set_buffer_uptodate(bh); @@ -1166,7 +1171,8 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, if (unlikely(err)) page_zero_new_buffers(page, from, to); else if (decrypt) - err = fscrypt_decrypt_page(page); + err = fscrypt_decrypt_page(page->mapping->host, page, + PAGE_SIZE, 0, page->index); return err; } #endif @@ -2360,11 +2366,8 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) BUG_ON(map->m_len == 0); if (map->m_flags & EXT4_MAP_NEW) { - struct block_device *bdev = inode->i_sb->s_bdev; - int i; - - for (i = 0; i < map->m_len; i++) - unmap_underlying_metadata(bdev, map->m_pblk + i); + clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk, + map->m_len); } return 0; } @@ -2891,7 +2894,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, index = pos >> PAGE_SHIFT; - if (ext4_nonda_switch(inode->i_sb)) { + if (ext4_nonda_switch(inode->i_sb) || + S_ISLNK(inode->i_mode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; return ext4_write_begin(file, mapping, pos, len, flags, pagep, fsdata); @@ -3268,53 +3272,159 @@ static int ext4_releasepage(struct page *page, gfp_t wait) } #ifdef CONFIG_FS_DAX -/* - * Get block function for DAX IO and mmap faults. It takes care of converting - * unwritten extents to written ones and initializes new / converted blocks - * to zeros. - */ -int ext4_dax_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned flags, struct iomap *iomap) { + unsigned int blkbits = inode->i_blkbits; + unsigned long first_block = offset >> blkbits; + unsigned long last_block = (offset + length - 1) >> blkbits; + struct ext4_map_blocks map; int ret; - ext4_debug("inode %lu, create flag %d\n", inode->i_ino, create); - if (!create) - return _ext4_get_block(inode, iblock, bh_result, 0); + if (WARN_ON_ONCE(ext4_has_inline_data(inode))) + return -ERANGE; - ret = ext4_get_block_trans(inode, iblock, bh_result, - EXT4_GET_BLOCKS_PRE_IO | - EXT4_GET_BLOCKS_CREATE_ZERO); - if (ret < 0) - return ret; + map.m_lblk = first_block; + map.m_len = last_block - first_block + 1; - if (buffer_unwritten(bh_result)) { + if (!(flags & IOMAP_WRITE)) { + ret = ext4_map_blocks(NULL, inode, &map, 0); + } else { + int dio_credits; + handle_t *handle; + int retries = 0; + + /* Trim mapping request to maximum we can map at once for DIO */ + if (map.m_len > DIO_MAX_BLOCKS) + map.m_len = DIO_MAX_BLOCKS; + dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); +retry: /* - * We are protected by i_mmap_sem or i_mutex so we know block - * cannot go away from under us even though we dropped - * i_data_sem. Convert extent to written and write zeros there. 
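ext4_iomap_begin() below fills the 4.10-era struct iomap, where the physical block is expressed as a 512-byte sector in iomap->blkno. A minimal sketch of the fields a ->iomap_begin() must populate, reporting the whole requested range as a hole:

#include <linux/iomap.h>

/* Minimal ->iomap_begin(): everything is a hole. */
static int hole_iomap_begin(struct inode *inode, loff_t offset,
			    loff_t length, unsigned flags,
			    struct iomap *iomap)
{
	iomap->type = IOMAP_HOLE;
	iomap->blkno = IOMAP_NULL_BLOCK;	/* no backing sector */
	iomap->offset = offset;
	iomap->length = length;
	iomap->bdev = inode->i_sb->s_bdev;
	iomap->flags = 0;
	return 0;
}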
+ * Either we allocate blocks and then we don't get unwritten + * extent so we have reserved enough credits, or the blocks + * are already allocated and unwritten and in that case + * extent conversion fits in the credits as well. */ - ret = ext4_get_block_trans(inode, iblock, bh_result, - EXT4_GET_BLOCKS_CONVERT | - EXT4_GET_BLOCKS_CREATE_ZERO); - if (ret < 0) + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + dio_credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + ret = ext4_map_blocks(handle, inode, &map, + EXT4_GET_BLOCKS_CREATE_ZERO); + if (ret < 0) { + ext4_journal_stop(handle); + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; return ret; + } + + /* + * If we added blocks beyond i_size, we need to make sure they + * will get truncated if we crash before updating i_size in + * ext4_iomap_end(). For faults we don't need to do that (and + * even cannot because for orphan list operations inode_lock is + * required) - if we happen to instantiate block beyond i_size, + * it is because we race with truncate which has already added + * the inode to the orphan list. + */ + if (!(flags & IOMAP_FAULT) && first_block + map.m_len > + (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) { + int err; + + err = ext4_orphan_add(handle, inode); + if (err < 0) { + ext4_journal_stop(handle); + return err; + } + } + ext4_journal_stop(handle); } - /* - * At least for now we have to clear BH_New so that DAX code - * doesn't attempt to zero blocks again in a racy way. - */ - clear_buffer_new(bh_result); + + iomap->flags = 0; + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = first_block << blkbits; + + if (ret == 0) { + iomap->type = IOMAP_HOLE; + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->length = (u64)map.m_len << blkbits; + } else { + if (map.m_flags & EXT4_MAP_MAPPED) { + iomap->type = IOMAP_MAPPED; + } else if (map.m_flags & EXT4_MAP_UNWRITTEN) { + iomap->type = IOMAP_UNWRITTEN; + } else { + WARN_ON_ONCE(1); + return -EIO; + } + iomap->blkno = (sector_t)map.m_pblk << (blkbits - 9); + iomap->length = (u64)map.m_len << blkbits; + } + + if (map.m_flags & EXT4_MAP_NEW) + iomap->flags |= IOMAP_F_NEW; return 0; } -#else -/* Just define empty function, it will never get called. */ -int ext4_dax_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) + +static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, + ssize_t written, unsigned flags, struct iomap *iomap) { - BUG(); - return 0; + int ret = 0; + handle_t *handle; + int blkbits = inode->i_blkbits; + bool truncate = false; + + if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT)) + return 0; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto orphan_del; + } + if (ext4_update_inode_size(inode, offset + written)) + ext4_mark_inode_dirty(handle, inode); + /* + * We may need to truncate allocated but not written blocks beyond EOF. + */ + if (iomap->offset + iomap->length > + ALIGN(inode->i_size, 1 << blkbits)) { + ext4_lblk_t written_blk, end_blk; + + written_blk = (offset + written) >> blkbits; + end_blk = (offset + length) >> blkbits; + if (written_blk < end_blk && ext4_can_truncate(inode)) + truncate = true; + } + /* + * Remove inode from orphan list if we were extending a inode and + * everything went fine. 
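ext4_iomap_end() decides whether to truncate allocated-but-unwritten blocks by comparing the block the write actually reached against the block the mapping extended to. A standalone worked example (userspace C): the mapping covers 3 blocks past EOF but only 1 block's worth was copied, so truncation is required:

#include <stdio.h>

int main(void)
{
	unsigned blkbits = 12;				/* 4K blocks */
	long long offset = 8192, length = 12288;	/* mapped range */
	long long written = 4096;			/* short write */
	long long written_blk = (offset + written) >> blkbits;	/* 3 */
	long long end_blk = (offset + length) >> blkbits;	/* 5 */

	printf("truncate=%d\n", written_blk < end_blk);	/* prints 1 */
	return 0;
}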
+ */ + if (!truncate && inode->i_nlink && + !list_empty(&EXT4_I(inode)->i_orphan)) + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + if (truncate) { + ext4_truncate_failed_write(inode); +orphan_del: + /* + * If truncate failed early the inode might still be on the + * orphan list; we need to make sure the inode is removed from + * the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + return ret; } + +struct iomap_ops ext4_iomap_ops = { + .iomap_begin = ext4_iomap_begin, + .iomap_end = ext4_iomap_end, +}; + #endif static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, @@ -3436,19 +3546,7 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) iocb->private = NULL; if (overwrite) get_block_func = ext4_dio_get_block_overwrite; - else if (IS_DAX(inode)) { - /* - * We can avoid zeroing for aligned DAX writes beyond EOF. Other - * writes need zeroing either because they can race with page - * faults or because they use partial blocks. - */ - if (round_down(offset, 1<<inode->i_blkbits) >= inode->i_size && - ext4_aligned_io(inode, offset, count)) - get_block_func = ext4_dio_get_block; - else - get_block_func = ext4_dax_get_block; - dio_flags = DIO_LOCKING; - } else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || + else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) { get_block_func = ext4_dio_get_block; dio_flags = DIO_LOCKING | DIO_SKIP_HOLES; @@ -3462,14 +3560,9 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) #ifdef CONFIG_EXT4_FS_ENCRYPTION BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); #endif - if (IS_DAX(inode)) { - ret = dax_do_io(iocb, inode, iter, get_block_func, - ext4_end_io_dio, dio_flags); - } else - ret = __blockdev_direct_IO(iocb, inode, - inode->i_sb->s_bdev, iter, - get_block_func, - ext4_end_io_dio, NULL, dio_flags); + ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, + get_block_func, ext4_end_io_dio, NULL, + dio_flags); if (ret > 0 && !overwrite && ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN)) { @@ -3538,6 +3631,7 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; + size_t count = iov_iter_count(iter); ssize_t ret; /* @@ -3546,19 +3640,12 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter) * we are protected against page writeback as well. 
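With ext4_iomap_ops defined, every DAX entry point funnels through the same two callbacks, and the read/write side becomes a single call into the dax core. A sketch of the wiring as this series leaves it (4.10 signatures; the helper name is illustrative):

#include <linux/dax.h>
#include <linux/iomap.h>

/* All DAX reads and writes now go through the iomap callbacks. */
static ssize_t dax_rw_sketch(struct kiocb *iocb, struct iov_iter *iter)
{
	extern struct iomap_ops ext4_iomap_ops;	/* from fs/ext4/inode.c */

	return dax_iomap_rw(iocb, iter, &ext4_iomap_ops);
}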
*/ inode_lock_shared(inode); - if (IS_DAX(inode)) { - ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block, NULL, 0); - } else { - size_t count = iov_iter_count(iter); - - ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, - iocb->ki_pos + count); - if (ret) - goto out_unlock; - ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, - iter, ext4_dio_get_block, - NULL, NULL, 0); - } + ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, + iocb->ki_pos + count); + if (ret) + goto out_unlock; + ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, + iter, ext4_dio_get_block, NULL, NULL, 0); out_unlock: inode_unlock_shared(inode); return ret; @@ -3587,6 +3674,10 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (ext4_has_inline_data(inode)) return 0; + /* DAX uses iomap path now */ + if (WARN_ON_ONCE(IS_DAX(inode))) + return 0; + trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); if (iov_iter_rw(iter) == READ) ret = ext4_direct_IO_read(iocb, iter); @@ -3615,6 +3706,13 @@ static int ext4_journalled_set_page_dirty(struct page *page) return __set_page_dirty_nobuffers(page); } +static int ext4_set_page_dirty(struct page *page) +{ + WARN_ON_ONCE(!PageLocked(page) && !PageDirty(page)); + WARN_ON_ONCE(!page_has_buffers(page)); + return __set_page_dirty_buffers(page); +} + static const struct address_space_operations ext4_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, @@ -3622,6 +3720,7 @@ static const struct address_space_operations ext4_aops = { .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_write_end, + .set_page_dirty = ext4_set_page_dirty, .bmap = ext4_bmap, .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, @@ -3654,6 +3753,7 @@ static const struct address_space_operations ext4_da_aops = { .writepages = ext4_writepages, .write_begin = ext4_da_write_begin, .write_end = ext4_da_write_end, + .set_page_dirty = ext4_set_page_dirty, .bmap = ext4_bmap, .invalidatepage = ext4_da_invalidatepage, .releasepage = ext4_releasepage, @@ -3743,7 +3843,8 @@ static int __ext4_block_zero_page_range(handle_t *handle, /* We expect the key to be set. */ BUG_ON(!fscrypt_has_encryption_key(inode)); BUG_ON(blocksize != PAGE_SIZE); - WARN_ON_ONCE(fscrypt_decrypt_page(page)); + WARN_ON_ONCE(fscrypt_decrypt_page(page->mapping->host, + page, PAGE_SIZE, 0, page->index)); } } if (ext4_should_journal_data(inode)) { @@ -3792,8 +3893,10 @@ static int ext4_block_zero_page_range(handle_t *handle, if (length > max || length < 0) length = max; - if (IS_DAX(inode)) - return dax_zero_page_range(inode, from, length, ext4_get_block); + if (IS_DAX(inode)) { + return iomap_zero_range(inode, from, length, NULL, + &ext4_iomap_ops); + } return __ext4_block_zero_page_range(handle, mapping, from, length); } @@ -4026,7 +4129,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) if (IS_SYNC(inode)) ext4_handle_sync(handle); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); out_stop: ext4_journal_stop(handle); @@ -4091,10 +4194,11 @@ int ext4_inode_attach_jinode(struct inode *inode) * that's fine - as long as they are linked from the inode, the post-crash * ext4_truncate() run will find them and release them. 
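For DAX inodes, partial-block zeroing on truncate now also goes through iomap: ext4_block_zero_page_range() hands the sub-block tail to iomap_zero_range() instead of dax_zero_page_range(). A sketch of the call, computing the length the way the ext2 hunk earlier in this diff does:

#include <linux/iomap.h>
#include <linux/mm.h>

/* Zero from the new EOF up to the next page boundary. */
static int zero_eof_tail(struct inode *inode, loff_t newsize,
			 struct iomap_ops *ops)
{
	return iomap_zero_range(inode, newsize,
				PAGE_ALIGN(newsize) - newsize, NULL, ops);
}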
*/ -void ext4_truncate(struct inode *inode) +int ext4_truncate(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); unsigned int credits; + int err = 0; handle_t *handle; struct address_space *mapping = inode->i_mapping; @@ -4108,7 +4212,7 @@ void ext4_truncate(struct inode *inode) trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) - return; + return 0; ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); @@ -4120,13 +4224,13 @@ void ext4_truncate(struct inode *inode) ext4_inline_data_truncate(inode, &has_inline); if (has_inline) - return; + return 0; } /* If we zero-out tail of the page, we have to create jinode for jbd2 */ if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { if (ext4_inode_attach_jinode(inode) < 0) - return; + return 0; } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) @@ -4135,10 +4239,8 @@ void ext4_truncate(struct inode *inode) credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); - if (IS_ERR(handle)) { - ext4_std_error(inode->i_sb, PTR_ERR(handle)); - return; - } + if (IS_ERR(handle)) + return PTR_ERR(handle); if (inode->i_size & (inode->i_sb->s_blocksize - 1)) ext4_block_truncate_page(handle, mapping, inode->i_size); @@ -4152,7 +4254,8 @@ void ext4_truncate(struct inode *inode) * Implication: the file must always be in a sane, consistent * truncatable state while each transaction commits. */ - if (ext4_orphan_add(handle, inode)) + err = ext4_orphan_add(handle, inode); + if (err) goto out_stop; down_write(&EXT4_I(inode)->i_data_sem); @@ -4160,11 +4263,13 @@ void ext4_truncate(struct inode *inode) ext4_discard_preallocations(inode); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ext4_ext_truncate(handle, inode); + err = ext4_ext_truncate(handle, inode); else ext4_ind_truncate(handle, inode); up_write(&ei->i_data_sem); + if (err) + goto out_stop; if (IS_SYNC(inode)) ext4_handle_sync(handle); @@ -4180,11 +4285,12 @@ out_stop: if (inode->i_nlink) ext4_orphan_del(handle, inode); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); trace_ext4_truncate_exit(inode); + return err; } /* @@ -4352,7 +4458,9 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; - if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode)) + if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode) && + !ext4_should_journal_data(inode) && !ext4_has_inline_data(inode) && + !ext4_encrypted_inode(inode)) new_fl |= S_DAX; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); @@ -4411,7 +4519,9 @@ static inline void ext4_iget_extra_inode(struct inode *inode, { __le32 *magic = (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; - if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { + if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <= + EXT4_INODE_SIZE(inode->i_sb) && + *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { ext4_set_inode_state(inode, EXT4_STATE_XATTR); ext4_find_inline_data_nolock(inode); } else @@ -4434,6 +4544,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) struct inode *inode; journal_t *journal = EXT4_SB(sb)->s_journal; long ret; + loff_t size; int block; uid_t i_uid; gid_t i_gid; @@ -4456,10 +4567,12 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) 
{ ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > - EXT4_INODE_SIZE(inode->i_sb)) { - EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)", - EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize, - EXT4_INODE_SIZE(inode->i_sb)); + EXT4_INODE_SIZE(inode->i_sb) || + (ei->i_extra_isize & 3)) { + EXT4_ERROR_INODE(inode, + "bad extra_isize %u (inode size %u)", + ei->i_extra_isize, + EXT4_INODE_SIZE(inode->i_sb)); ret = -EFSCORRUPTED; goto bad_inode; } @@ -4534,6 +4647,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(raw_inode); + if ((size = i_size_read(inode)) < 0) { + EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size); + ret = -EFSCORRUPTED; + goto bad_inode; + } ei->i_disksize = inode->i_size; #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; @@ -4577,6 +4695,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { if (ei->i_extra_isize == 0) { /* The extra space is currently unused. Use it. */ + BUILD_BUG_ON(sizeof(struct ext4_inode) & 3); ei->i_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; } else { @@ -5154,7 +5273,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) * update c/mtime in shrink case below */ if (!shrink) { - inode->i_mtime = ext4_current_time(inode); + inode->i_mtime = current_time(inode); inode->i_ctime = inode->i_mtime; } down_write(&EXT4_I(inode)->i_data_sem); @@ -5199,12 +5318,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) * in data=journal mode to make pages freeable. */ truncate_pagecache(inode, inode->i_size); - if (shrink) - ext4_truncate(inode); + if (shrink) { + rc = ext4_truncate(inode); + if (rc) + error = rc; + } up_write(&EXT4_I(inode)->i_mmap_sem); } - if (!rc) { + if (!error) { setattr_copy(inode, attr); mark_inode_dirty(inode); } @@ -5216,7 +5338,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (orphan && inode->i_nlink) ext4_orphan_del(NULL, inode); - if (!rc && (ia_valid & ATTR_MODE)) + if (!error && (ia_valid & ATTR_MODE)) rc = posix_acl_chmod(inode, inode->i_mode); err_out: @@ -5455,18 +5577,20 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) return err; - if (ext4_handle_valid(handle) && - EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && + if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { /* - * We need extra buffer credits since we may write into EA block + * In nojournal mode, we can immediately attempt to expand + * the inode. When journaled, we first need to obtain extra + * buffer credits since we may write into the EA block * with this same handle. If journal_extend fails, then it will * only result in a minor loss of functionality for that inode. * If this is felt to be critical, then e2fsck should be run to * force a large enough s_min_extra_isize. 
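The comment above describes a common jbd2 pattern: optional work (here, widening i_extra_isize) may piggy-back on a caller's running handle only if the transaction can still grow. jbd2_journal_extend() returns 0 when the extra buffer credits were reserved; a nonzero result simply means "skip the optional work", not a hard error. A hedged sketch of the pattern, where maybe_expand() is a hypothetical stand-in for ext4_expand_extra_isize():

	/* Attempt optional metadata work on an existing handle (sketch). */
	static void maybe_piggyback(handle_t *handle, struct inode *inode,
				    int extra_credits)
	{
		/* Invalid handle means nojournal mode: no credits needed. */
		if (!ext4_handle_valid(handle) ||
		    jbd2_journal_extend(handle, extra_credits) == 0)
			maybe_expand(inode, handle);	/* hypothetical helper */
		/* else: transaction is full; silently skip the update */
	}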
*/ - if ((jbd2_journal_extend(handle, - EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { + if (!ext4_handle_valid(handle) || + jbd2_journal_extend(handle, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) == 0) { ret = ext4_expand_extra_isize(inode, sbi->s_want_extra_isize, iloc, handle); @@ -5620,6 +5744,11 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); } ext4_set_aops(inode); + /* + * Update inode->i_flags after EXT4_INODE_JOURNAL_DATA was updated. + * E.g. S_DAX may get cleared / set. + */ + ext4_set_inode_flags(inode); jbd2_journal_unlock_updates(journal); percpu_up_write(&sbi->s_journal_flag_rwsem); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index bf5ae8ebbc97..d534399cf607 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -15,7 +15,7 @@ #include <linux/file.h> #include <linux/quotaops.h> #include <linux/uuid.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "ext4_jbd2.h" #include "ext4.h" @@ -153,7 +153,7 @@ static long swap_inode_boot_loader(struct super_block *sb, swap_inode_data(inode, inode_bl); - inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode); + inode->i_ctime = inode_bl->i_ctime = current_time(inode); spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; @@ -191,6 +191,7 @@ journal_err_out: return err; } +#ifdef CONFIG_EXT4_FS_ENCRYPTION static int uuid_is_zero(__u8 u[16]) { int i; @@ -200,6 +201,7 @@ static int uuid_is_zero(__u8 u[16]) return 0; return 1; } +#endif static int ext4_ioctl_setflags(struct inode *inode, unsigned int flags) @@ -248,8 +250,11 @@ static int ext4_ioctl_setflags(struct inode *inode, err = -EOPNOTSUPP; goto flags_out; } - } else if (oldflags & EXT4_EOFBLOCKS_FL) - ext4_truncate(inode); + } else if (oldflags & EXT4_EOFBLOCKS_FL) { + err = ext4_truncate(inode); + if (err) + goto flags_out; + } handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { @@ -265,6 +270,9 @@ static int ext4_ioctl_setflags(struct inode *inode, for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { if (!(mask & EXT4_FL_USER_MODIFIABLE)) continue; + /* These flags get special treatment later */ + if (mask == EXT4_JOURNAL_DATA_FL || mask == EXT4_EXTENTS_FL) + continue; if (mask & flags) ext4_set_inode_flag(inode, i); else @@ -272,7 +280,7 @@ static int ext4_ioctl_setflags(struct inode *inode, } ext4_set_inode_flags(inode); - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); err = ext4_mark_iloc_dirty(handle, inode, &iloc); flags_err: @@ -368,7 +376,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) } EXT4_I(inode)->i_projid = kprojid; - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); out_dirty: rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) @@ -409,6 +417,10 @@ static inline __u32 ext4_iflags_to_xflags(unsigned long iflags) return xflags; } +#define EXT4_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \ + FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \ + FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT) + /* Transfer xflags flags to internal */ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags) { @@ -453,12 +465,22 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (get_user(flags, (int __user *) arg)) return -EFAULT; + if (flags & ~EXT4_FL_USER_VISIBLE) + return -EOPNOTSUPP; + /* + * chattr(1) grabs flags via GETFLAGS, modifies the result and + * passes that to SETFLAGS. 
So we cannot easily make SETFLAGS + * more restrictive than just silently masking off visible but + * not settable flags as we always did. + */ + flags &= EXT4_FL_USER_MODIFIABLE; + if (ext4_mask_flags(inode->i_mode, flags) != flags) + return -EOPNOTSUPP; + err = mnt_want_write_file(filp); if (err) return err; - flags = ext4_mask_flags(inode->i_mode, flags); - inode_lock(inode); err = ext4_ioctl_setflags(inode, flags); inode_unlock(inode); @@ -500,7 +522,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } err = ext4_reserve_inode_write(handle, inode, &iloc); if (err == 0) { - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); inode->i_generation = generation; err = ext4_mark_iloc_dirty(handle, inode, &iloc); } @@ -765,28 +787,19 @@ resizefs_out: } case EXT4_IOC_PRECACHE_EXTENTS: return ext4_ext_precache(inode); - case EXT4_IOC_SET_ENCRYPTION_POLICY: { -#ifdef CONFIG_EXT4_FS_ENCRYPTION - struct fscrypt_policy policy; + case EXT4_IOC_SET_ENCRYPTION_POLICY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; + return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); - if (copy_from_user(&policy, - (struct fscrypt_policy __user *)arg, - sizeof(policy))) - return -EFAULT; - return fscrypt_process_policy(filp, &policy); -#else - return -EOPNOTSUPP; -#endif - } case EXT4_IOC_GET_ENCRYPTION_PWSALT: { +#ifdef CONFIG_EXT4_FS_ENCRYPTION int err, err2; struct ext4_sb_info *sbi = EXT4_SB(sb); handle_t *handle; - if (!ext4_sb_has_crypto(sb)) + if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) { err = mnt_want_write_file(filp); @@ -816,24 +829,13 @@ resizefs_out: sbi->s_es->s_encrypt_pw_salt, 16)) return -EFAULT; return 0; - } - case EXT4_IOC_GET_ENCRYPTION_POLICY: { -#ifdef CONFIG_EXT4_FS_ENCRYPTION - struct fscrypt_policy policy; - int err = 0; - - if (!ext4_encrypted_inode(inode)) - return -ENOENT; - err = fscrypt_get_policy(inode, &policy); - if (err) - return err; - if (copy_to_user((void __user *)arg, &policy, sizeof(policy))) - return -EFAULT; - return 0; #else return -EOPNOTSUPP; #endif } + case EXT4_IOC_GET_ENCRYPTION_POLICY: + return fscrypt_ioctl_get_policy(filp, (void __user *)arg); + case EXT4_IOC_FSGETXATTR: { struct fsxattr fa; @@ -865,13 +867,17 @@ resizefs_out: if (!inode_owner_or_capable(inode)) return -EACCES; + if (fa.fsx_xflags & ~EXT4_SUPPORTED_FS_XFLAGS) + return -EOPNOTSUPP; + + flags = ext4_xflags_to_iflags(fa.fsx_xflags); + if (ext4_mask_flags(inode->i_mode, flags) != flags) + return -EOPNOTSUPP; + err = mnt_want_write_file(filp); if (err) return err; - flags = ext4_xflags_to_iflags(fa.fsx_xflags); - flags = ext4_mask_flags(inode->i_mode, flags); - inode_lock(inode); flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) | (flags & EXT4_FL_XFLAG_VISIBLE); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f418f55c2bbe..7ae43c59bc79 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -669,7 +669,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, ext4_grpblk_t min; ext4_grpblk_t max; ext4_grpblk_t chunk; - unsigned short border; + unsigned int border; BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); @@ -2287,7 +2287,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) struct ext4_group_info *grinfo; struct sg { struct ext4_group_info info; - ext4_grpblk_t counters[16]; + ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2]; } sg; group--; diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index d89754ef1aab..eb9835638680 100644 --- 
a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -35,7 +35,7 @@ static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) } /* - * Write the MMP block using WRITE_SYNC to try to get the block on-disk + * Write the MMP block using REQ_SYNC to try to get the block on-disk * faster. */ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) @@ -52,7 +52,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) lock_buffer(bh); bh->b_end_io = end_buffer_write_sync; get_bh(bh); - submit_bh(REQ_OP_WRITE, WRITE_SYNC | REQ_META | REQ_PRIO, bh); + submit_bh(REQ_OP_WRITE, REQ_SYNC | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); sb_end_write(sb); if (unlikely(!buffer_uptodate(bh))) @@ -88,7 +88,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, get_bh(*bh); lock_buffer(*bh); (*bh)->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, READ_SYNC | REQ_META | REQ_PRIO, *bh); + submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, *bh); wait_on_buffer(*bh); if (!buffer_uptodate(*bh)) { ret = -EIO; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 104f8bfba718..eadba919f26b 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1941,7 +1941,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, * happen is that the times are slightly out of date * and/or different from the directory change time. */ - dir->i_mtime = dir->i_ctime = ext4_current_time(dir); + dir->i_mtime = dir->i_ctime = current_time(dir); ext4_update_dx_flag(dir); dir->i_version++; ext4_mark_inode_dirty(handle, dir); @@ -2987,7 +2987,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) * recovery. */ inode->i_size = 0; ext4_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); ext4_mark_inode_dirty(handle, inode); ext4_dec_count(handle, dir); ext4_update_dx_flag(dir); @@ -3050,13 +3050,13 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto end_unlink; - dir->i_ctime = dir->i_mtime = ext4_current_time(dir); + dir->i_ctime = dir->i_mtime = current_time(dir); ext4_update_dx_flag(dir); ext4_mark_inode_dirty(handle, dir); drop_nlink(inode); if (!inode->i_nlink) ext4_orphan_add(handle, inode); - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); end_unlink: @@ -3254,7 +3254,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); ext4_inc_count(handle, inode); ihold(inode); @@ -3381,7 +3381,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent, ent->de->file_type = file_type; ent->dir->i_version++; ent->dir->i_ctime = ent->dir->i_mtime = - ext4_current_time(ent->dir); + current_time(ent->dir); ext4_mark_inode_dirty(handle, ent->dir); BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata"); if (!ent->inlined) { @@ -3651,7 +3651,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, * Like most other Unix systems, set the ctime for inodes on a * rename. 
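Throughout this diff, ext4's private ext4_current_time() helper gives way to the VFS-level current_time(), which reads the coarse clock and truncates it to the filesystem's sb->s_time_gran. One practical consequence, visible in the cross-rename hunk below: reading the clock once and storing the result into both inodes guarantees the pair gets identical ctimes even if a tick elapses mid-update. Sketch (4.10-era current_time() returns struct timespec):

	#include <linux/fs.h>

	/* Stamp both inodes of a rename with a single clock reading. */
	static void stamp_pair(struct inode *a, struct inode *b)
	{
		struct timespec now = current_time(a);

		a->i_ctime = now;
		b->i_ctime = now;
	}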
*/ - old.inode->i_ctime = ext4_current_time(old.inode); + old.inode->i_ctime = current_time(old.inode); ext4_mark_inode_dirty(handle, old.inode); if (!whiteout) { @@ -3663,9 +3663,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (new.inode) { ext4_dec_count(handle, new.inode); - new.inode->i_ctime = ext4_current_time(new.inode); + new.inode->i_ctime = current_time(new.inode); } - old.dir->i_ctime = old.dir->i_mtime = ext4_current_time(old.dir); + old.dir->i_ctime = old.dir->i_mtime = current_time(old.dir); ext4_update_dx_flag(old.dir); if (old.dir_bh) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); @@ -3723,6 +3723,7 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, }; u8 new_file_type; int retval; + struct timespec ctime; if ((ext4_encrypted_inode(old_dir) || ext4_encrypted_inode(new_dir)) && @@ -3823,8 +3824,9 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old.inode->i_ctime = ext4_current_time(old.inode); - new.inode->i_ctime = ext4_current_time(new.inode); + ctime = current_time(old.inode); + old.inode->i_ctime = ctime; + new.inode->i_ctime = ctime; ext4_mark_inode_dirty(handle, old.inode); ext4_mark_inode_dirty(handle, new.inode); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 0094923e5ebf..d83b0f3c5fe9 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -340,7 +340,7 @@ void ext4_io_submit(struct ext4_io_submit *io) if (bio) { int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC : 0; + REQ_SYNC : 0; bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags); submit_bio(io->io_bio); } @@ -457,7 +457,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, } if (buffer_new(bh)) { clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + clean_bdev_bh_alias(bh); } set_buffer_async_write(bh); nr_to_submit++; @@ -470,7 +470,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, gfp_t gfp_flags = GFP_NOFS; retry_encrypt: - data_page = fscrypt_encrypt_page(inode, page, gfp_flags); + data_page = fscrypt_encrypt_page(inode, page, PAGE_SIZE, 0, + page->index, gfp_flags); if (IS_ERR(data_page)) { ret = PTR_ERR(data_page); if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 52b0530c5d65..66845a08a87a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -38,7 +38,7 @@ #include <linux/log2.h> #include <linux/crc16.h> #include <linux/cleancache.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/kthread.h> #include <linux/freezer.h> @@ -863,7 +863,6 @@ static void ext4_put_super(struct super_block *sb) percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); percpu_free_rwsem(&sbi->s_journal_flag_rwsem); - brelse(sbi->s_sbh); #ifdef CONFIG_QUOTA for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(sbi->s_qf_names[i]); @@ -895,6 +894,7 @@ static void ext4_put_super(struct super_block *sb) } if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); + brelse(sbi->s_sbh); sb->s_fs_info = NULL; /* * Now that we are completely done shutting down the @@ -1114,37 +1114,55 @@ static int ext4_prepare_context(struct inode *inode) static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { - handle_t *handle; - int res, res2; + handle_t *handle = fs_data; + int res, res2, retries = 0; + + /* + * If a journal handle was specified, 
then the encryption context is + * being set on a new inode via inheritance and is part of a larger + * transaction to create the inode. Otherwise the encryption context is + * being set on an existing inode in its own transaction. Only in the + * latter case should the "retry on ENOSPC" logic be used. + */ - /* fs_data is null when internally used. */ - if (fs_data) { - res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, - len, 0); + if (handle) { + res = ext4_xattr_set_handle(handle, inode, + EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, 0); if (!res) { ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + /* + * Update inode->i_flags - e.g. S_DAX may get disabled + */ + ext4_set_inode_flags(inode); } return res; } +retry: handle = ext4_journal_start(inode, EXT4_HT_MISC, ext4_jbd2_credits_xattr(inode)); if (IS_ERR(handle)) return PTR_ERR(handle); - res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, - len, 0); + res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, 0); if (!res) { ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + /* Update inode->i_flags - e.g. S_DAX may get disabled */ + ext4_set_inode_flags(inode); res = ext4_mark_inode_dirty(handle, inode); if (res) EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); } res2 = ext4_journal_stop(handle); + + if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; if (!res) res = res2; return res; @@ -1187,7 +1205,7 @@ static int ext4_release_dquot(struct dquot *dquot); static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, - struct path *path); + const struct path *path); static int ext4_quota_off(struct super_block *sb, int type); static int ext4_quota_on_mount(struct super_block *sb, int type); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, @@ -1883,12 +1901,6 @@ static int parse_options(char *options, struct super_block *sb, return 0; } } - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && - test_opt(sb, JOURNAL_ASYNC_COMMIT)) { - ext4_msg(sb, KERN_ERR, "can't mount with journal_async_commit " - "in data=ordered mode"); - return 0; - } return 1; } @@ -2330,7 +2342,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) { unsigned int s_flags = sb->s_flags; - int nr_orphans = 0, nr_truncates = 0; + int ret, nr_orphans = 0, nr_truncates = 0; #ifdef CONFIG_QUOTA int i; #endif @@ -2412,7 +2424,9 @@ static void ext4_orphan_cleanup(struct super_block *sb, inode->i_ino, inode->i_size); inode_lock(inode); truncate_inode_pages(inode->i_mapping, inode->i_size); - ext4_truncate(inode); + ret = ext4_truncate(inode); + if (ret) + ext4_std_error(inode->i_sb, ret); inode_unlock(inode); nr_truncates++; } else { @@ -3193,10 +3207,15 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp, ext4_set_bit(s++, buf); count++; } - for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) { - ext4_set_bit(EXT4_B2C(sbi, s++), buf); - count++; + j = ext4_bg_num_gdb(sb, grp); + if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) { + ext4_error(sb, "Invalid number of block group " + "descriptor blocks: %d", j); + j = EXT4_BLOCKS_PER_GROUP(sb) - s; } + count += j; + for (; j > 0; j--) + 
ext4_set_bit(EXT4_B2C(sbi, s++), buf); } if (!count) return 0; @@ -3301,7 +3320,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) char *orig_data = kstrdup(data, GFP_KERNEL); struct buffer_head *bh; struct ext4_super_block *es = NULL; - struct ext4_sb_info *sbi; + struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); ext4_fsblk_t block; ext4_fsblk_t sb_block = get_sb_block(&data); ext4_fsblk_t logical_sb_block; @@ -3320,16 +3339,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; ext4_group_t first_not_zeroed; - sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); - if (!sbi) - goto out_free_orig; + if ((data && !orig_data) || !sbi) + goto out_free_base; sbi->s_blockgroup_lock = kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); - if (!sbi->s_blockgroup_lock) { - kfree(sbi); - goto out_free_orig; - } + if (!sbi->s_blockgroup_lock) + goto out_free_base; + sb->s_fs_info = sbi; sbi->s_sb = sb; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; @@ -3475,11 +3492,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; - if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, - &journal_devnum, &journal_ioprio, 0)) { - ext4_msg(sb, KERN_WARNING, - "failed to parse options in superblock: %s", - sbi->s_es->s_mount_opts); + if (sbi->s_es->s_mount_opts[0]) { + char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts, + sizeof(sbi->s_es->s_mount_opts), + GFP_KERNEL); + if (!s_mount_opts) + goto failed_mount; + if (!parse_options(s_mount_opts, sb, &journal_devnum, + &journal_ioprio, 0)) { + ext4_msg(sb, KERN_WARNING, + "failed to parse options in superblock: %s", + s_mount_opts); + } + kfree(s_mount_opts); } sbi->s_def_mount_opt = sbi->s_mount_opt; if (!parse_options((char *) data, sb, &journal_devnum, @@ -3505,6 +3530,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "both data=journal and dax"); goto failed_mount; } + if (ext4_has_feature_encrypt(sb)) { + ext4_msg(sb, KERN_WARNING, + "encrypted files will use data=ordered " + "instead of data journaling mode"); + } if (test_opt(sb, DELALLOC)) clear_opt(sb, DELALLOC); } else { @@ -3660,12 +3690,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); - if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0) - goto cantfind_ext4; sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); if (sbi->s_inodes_per_block == 0) goto cantfind_ext4; + if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || + sbi->s_inodes_per_group > blocksize * 8) { + ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", + sbi->s_inodes_per_group); + goto failed_mount; + } sbi->s_itb_per_group = sbi->s_inodes_per_group / sbi->s_inodes_per_block; sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb); @@ -3748,13 +3782,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } sbi->s_cluster_ratio = clustersize / blocksize; - if (sbi->s_inodes_per_group > blocksize * 8) { - ext4_msg(sb, KERN_ERR, - "#inodes per group too big: %lu", - sbi->s_inodes_per_group); - goto failed_mount; - } - /* Do we have standard group size of clustersize * 8 blocks ?
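The inodes-per-group bound enforced in the hunk above falls straight out of the on-disk format: each block group carries exactly one inode bitmap block, so a group can track at most blocksize * 8 inodes (one bit each), and the count cannot be smaller than the number of inodes that fit in a single inode-table block. A trivial userspace illustration of the upper bound:

	#include <stdio.h>

	int main(void)
	{
		unsigned int blocksize = 4096;			/* typical ext4 */
		unsigned int max_inodes = blocksize * 8;	/* bits in one bitmap block */

		printf("max inodes per group at %u-byte blocks: %u\n",
		       blocksize, max_inodes);			/* prints 32768 */
		return 0;
	}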
*/ if (sbi->s_blocks_per_group == clustersize << 3) set_opt2(sb, STD_GROUP_SIZE); @@ -3814,6 +3841,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); + if (ext4_has_feature_meta_bg(sb)) { + if (le32_to_cpu(es->s_first_meta_bg) >= db_count) { + ext4_msg(sb, KERN_WARNING, + "first meta block group too large: %u " + "(group descriptor block count %u)", + le32_to_cpu(es->s_first_meta_bg), db_count); + goto failed_mount; + } + } sbi->s_group_desc = ext4_kvmalloc(db_count * sizeof(struct buffer_head *), GFP_KERNEL); @@ -3967,6 +4003,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) default: break; } + + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && + test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "journal_async_commit in data=ordered mode"); + goto failed_mount_wq; + } + set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; @@ -4160,7 +4204,9 @@ no_journal: if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " - "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, + "Opts: %.*s%s%s", descr, + (int) sizeof(sbi->s_es->s_mount_opts), + sbi->s_es->s_mount_opts, *sbi->s_es->s_mount_opts ? "; " : "", orig_data); if (es->s_error_count) @@ -4239,8 +4285,8 @@ failed_mount: out_fail: sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); +out_free_base: kfree(sbi); -out_free_orig: kfree(orig_data); return err ? err : ret; } @@ -4550,7 +4596,8 @@ static int ext4_commit_super(struct super_block *sb, int sync) &EXT4_SB(sb)->s_freeinodes_counter)); BUFFER_TRACE(sbh, "marking dirty"); ext4_superblock_csum_set(sb); - lock_buffer(sbh); + if (sync) + lock_buffer(sbh); if (buffer_write_io_error(sbh)) { /* * Oh, dear. A previous attempt to write the @@ -4566,10 +4613,10 @@ static int ext4_commit_super(struct super_block *sb, int sync) set_buffer_uptodate(sbh); } mark_buffer_dirty(sbh); - unlock_buffer(sbh); if (sync) { + unlock_buffer(sbh); error = __sync_dirty_buffer(sbh, - test_opt(sb, BARRIER) ? WRITE_FUA : WRITE_SYNC); + test_opt(sb, BARRIER) ? 
REQ_FUA : REQ_SYNC); if (error) return error; @@ -4857,6 +4904,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) err = -EINVAL; goto restore_opts; } + } else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) { + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "journal_async_commit in data=ordered mode"); + err = -EINVAL; + goto restore_opts; + } } if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) { @@ -5239,7 +5293,7 @@ static void lockdep_set_quota_inode(struct inode *inode, int subclass) * Standard function to be called on quota_on */ static int ext4_quota_on(struct super_block *sb, int type, int format_id, - struct path *path) + const struct path *path) { int err; @@ -5366,7 +5420,7 @@ static int ext4_quota_off(struct super_block *sb, int type) handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); if (IS_ERR(handle)) goto out; - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 557b3b0d668c..73b184d161fc 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -83,21 +83,18 @@ errout: } const struct inode_operations ext4_encrypted_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = ext4_encrypted_get_link, .setattr = ext4_setattr, .listxattr = ext4_listxattr, }; const struct inode_operations ext4_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = page_get_link, .setattr = ext4_setattr, .listxattr = ext4_listxattr, }; const struct inode_operations ext4_fast_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = simple_get_link, .setattr = ext4_setattr, .listxattr = ext4_listxattr, diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index d77be9e9f535..5a94fa52b74f 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -185,6 +185,7 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end, { struct ext4_xattr_entry *e = entry; + /* Find the end of the names list */ while (!IS_LAST_ENTRY(e)) { struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); if ((void *)next >= end) @@ -192,15 +193,29 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end, e = next; } + /* Check the values */ while (!IS_LAST_ENTRY(entry)) { if (entry->e_value_block != 0) return -EFSCORRUPTED; - if (entry->e_value_size != 0 && - (value_start + le16_to_cpu(entry->e_value_offs) < - (void *)e + sizeof(__u32) || - value_start + le16_to_cpu(entry->e_value_offs) + - le32_to_cpu(entry->e_value_size) > end)) - return -EFSCORRUPTED; + if (entry->e_value_size != 0) { + u16 offs = le16_to_cpu(entry->e_value_offs); + u32 size = le32_to_cpu(entry->e_value_size); + void *value; + + /* + * The value cannot overlap the names, and the value + * with padding cannot extend beyond 'end'. Check both + * the padded and unpadded sizes, since the size may + * overflow to 0 when adding padding. 
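The comment above guards against unsigned wrap-around: EXT4_XATTR_SIZE() rounds a value length up to the next 4-byte multiple, so a corrupted e_value_size close to U32_MAX rounds to 0 and would sail past a check that only inspected the padded size. Testing both the raw and the padded sizes, as the hunk does, closes that hole. A standalone demonstration of the wrap, with PAD4 standing in for EXT4_XATTR_SIZE:

	#include <stdint.h>
	#include <stdio.h>

	#define PAD4(x) (((x) + 3u) & ~3u)	/* round up to a 4-byte multiple */

	int main(void)
	{
		uint32_t evil = UINT32_MAX - 1;	/* e.g. from a corrupted xattr entry */

		/* 0xfffffffe + 3 wraps to 1; masking yields 0, which "fits" anywhere */
		printf("PAD4(%u) = %u\n", (unsigned)evil, (unsigned)PAD4(evil));
		return 0;
	}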
+ */ + if (offs > end - value_start) + return -EFSCORRUPTED; + value = value_start + offs; + if (value < (void *)e + sizeof(u32) || + size > end - value || + EXT4_XATTR_SIZE(size) > end - value) + return -EFSCORRUPTED; + } entry = EXT4_XATTR_NEXT(entry); } @@ -231,13 +246,12 @@ static int __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, void *end, const char *function, unsigned int line) { - struct ext4_xattr_entry *entry = IFIRST(header); int error = -EFSCORRUPTED; - if (((void *) header >= end) || + if (end - (void *)header < sizeof(*header) + sizeof(u32) || (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC))) goto errout; - error = ext4_xattr_check_names(entry, end, entry); + error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header)); errout: if (error) __ext4_error_inode(inode, function, line, 0, @@ -1109,7 +1123,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, return 0; } -static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, +static int ext4_xattr_ibody_set(struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is) { @@ -1216,7 +1230,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, } if (!value) { if (!is.s.not_found) - error = ext4_xattr_ibody_set(handle, inode, &i, &is); + error = ext4_xattr_ibody_set(inode, &i, &is); else if (!bs.s.not_found) error = ext4_xattr_block_set(handle, inode, &i, &bs); } else { @@ -1227,7 +1241,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i)) goto cleanup; - error = ext4_xattr_ibody_set(handle, inode, &i, &is); + error = ext4_xattr_ibody_set(inode, &i, &is); if (!error && !bs.s.not_found) { i.value = NULL; error = ext4_xattr_block_set(handle, inode, &i, &bs); @@ -1242,14 +1256,13 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, goto cleanup; if (!is.s.not_found) { i.value = NULL; - error = ext4_xattr_ibody_set(handle, inode, &i, - &is); + error = ext4_xattr_ibody_set(inode, &i, &is); } } } if (!error) { ext4_xattr_update_super_block(handle, inode->i_sb); - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); if (!value) ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); @@ -1384,7 +1397,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, goto out; /* Remove the chosen entry from the inode */ - error = ext4_xattr_ibody_set(handle, inode, &i, is); + error = ext4_xattr_ibody_set(inode, &i, is); if (error) goto out; diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 6fe23af509e1..8f487692c21f 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -384,7 +384,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, if (error) return error; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (default_acl) { error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 7e9b504bd8b2..f73ee9534d83 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -65,7 +65,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, .sbi = sbi, .type = META, .op = REQ_OP_READ, - .op_flags = READ_SYNC | REQ_META | REQ_PRIO, + .op_flags = REQ_META | REQ_PRIO, .old_blkaddr = index, .new_blkaddr = index, .encrypted_page = NULL, @@ -160,7 +160,7 @@ int 
ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, .sbi = sbi, .type = META, .op = REQ_OP_READ, - .op_flags = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : REQ_RAHEAD, + .op_flags = sync ? (REQ_META | REQ_PRIO) : REQ_RAHEAD, .encrypted_page = NULL, }; struct blk_plug plug; @@ -228,7 +228,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) f2fs_put_page(page, 0); if (readahead) - ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR, true); + ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); } static int f2fs_write_meta_page(struct page *page, @@ -770,7 +770,12 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) /* Sanity checking of checkpoint */ if (sanity_check_ckpt(sbi)) - goto fail_no_cp; + goto free_fail_no_cp; + + if (cur_page == cp1) + sbi->cur_cp_pack = 1; + else + sbi->cur_cp_pack = 2; if (cp_blks <= 1) goto done; @@ -793,6 +798,9 @@ done: f2fs_put_page(cp2, 1); return 0; +free_fail_no_cp: + f2fs_put_page(cp1, 1); + f2fs_put_page(cp2, 1); fail_no_cp: kfree(sbi->ckpt); return -EINVAL; @@ -921,7 +929,11 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[DIRTY_META]); if (inode) { - update_inode_page(inode); + sync_inode_metadata(inode, 0); + + /* it's on eviction */ + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) + update_inode_page(inode); iput(inode); } }; @@ -987,7 +999,7 @@ static void unblock_operations(struct f2fs_sb_info *sbi) { up_write(&sbi->node_write); - build_free_nids(sbi); + build_free_nids(sbi, false); f2fs_unlock_all(sbi); } @@ -998,7 +1010,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) for (;;) { prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); - if (!atomic_read(&sbi->nr_wb_bios)) + if (!get_pages(sbi, F2FS_WB_CP_DATA)) break; io_schedule_timeout(5*HZ); @@ -1123,7 +1135,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) le32_to_cpu(ckpt->checksum_offset))) = cpu_to_le32(crc32); - start_blk = __start_cp_addr(sbi); + start_blk = __start_cp_next_addr(sbi); /* need to wait for end_io results */ wait_on_all_pages_writeback(sbi); @@ -1184,9 +1196,9 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (unlikely(f2fs_cp_error(sbi))) return -EIO; - clear_prefree_segments(sbi, cpc); clear_sbi_flag(sbi, SBI_IS_DIRTY); clear_sbi_flag(sbi, SBI_NEED_CP); + __set_cp_next_pack(sbi); /* * redirty superblock if metadata like node page or inode cache is @@ -1261,8 +1273,12 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* unlock all the fs_lock[] in do_checkpoint() */ err = do_checkpoint(sbi, cpc); - - f2fs_wait_all_discard_bio(sbi); + if (err) { + release_discard_addrs(sbi); + } else { + clear_prefree_segments(sbi, cpc); + f2fs_wait_all_discard_bio(sbi); + } unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9ae194fd2fdb..9ac262564fa6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -29,6 +29,26 @@ #include "trace.h" #include <trace/events/f2fs.h> +static bool __is_cp_guaranteed(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode; + struct f2fs_sb_info *sbi; + + if (!mapping) + return false; + + inode = mapping->host; + sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_META_INO(sbi) || + inode->i_ino == F2FS_NODE_INO(sbi) || + S_ISDIR(inode->i_mode) || + is_cold_data(page)) + return true; + return false; +} + static void f2fs_read_end_io(struct bio *bio) 
{ struct bio_vec *bvec; @@ -71,6 +91,7 @@ static void f2fs_write_end_io(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; + enum count_type type = WB_DATA_TYPE(page); fscrypt_pullback_bio_page(&page, true); @@ -78,9 +99,11 @@ static void f2fs_write_end_io(struct bio *bio) mapping_set_error(page->mapping, -EIO); f2fs_stop_checkpoint(sbi, true); } + dec_page_count(sbi, type); + clear_cold_data(page); end_page_writeback(page); } - if (atomic_dec_and_test(&sbi->nr_wb_bios) && + if (!get_pages(sbi, F2FS_WB_CP_DATA) && wq_has_sleeper(&sbi->cp_wait)) wake_up(&sbi->cp_wait); @@ -88,6 +111,46 @@ static void f2fs_write_end_io(struct bio *bio) } /* + * Return true, if pre_bio's bdev is same as its target device. + */ +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio) +{ + struct block_device *bdev = sbi->sb->s_bdev; + int i; + + for (i = 0; i < sbi->s_ndevs; i++) { + if (FDEV(i).start_blk <= blk_addr && + FDEV(i).end_blk >= blk_addr) { + blk_addr -= FDEV(i).start_blk; + bdev = FDEV(i).bdev; + break; + } + } + if (bio) { + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); + } + return bdev; +} + +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + int i; + + for (i = 0; i < sbi->s_ndevs; i++) + if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr) + return i; + return 0; +} + +static bool __same_bdev(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio) +{ + return f2fs_target_device(sbi, blk_addr, NULL) == bio->bi_bdev; +} + +/* * Low-level block read/write IO operations. */ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, @@ -97,8 +160,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, bio = f2fs_bio_alloc(npages); - bio->bi_bdev = sbi->sb->s_bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); + f2fs_target_device(sbi, blk_addr, bio); bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; bio->bi_private = is_read ? NULL : sbi; @@ -109,8 +171,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type) { if (!is_read_io(bio_op(bio))) { - atomic_inc(&sbi->nr_wb_bios); - if (f2fs_sb_mounted_hmsmr(sbi->sb) && + if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); } @@ -198,11 +259,9 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, if (type >= META_FLUSH) { io->fio.type = META_FLUSH; io->fio.op = REQ_OP_WRITE; - if (test_opt(sbi, NOBARRIER)) - io->fio.op_flags = WRITE_FLUSH | REQ_META | REQ_PRIO; - else - io->fio.op_flags = WRITE_FLUSH_FUA | REQ_META | - REQ_PRIO; + io->fio.op_flags = REQ_PREFLUSH | REQ_META | REQ_PRIO; + if (!test_opt(sbi, NOBARRIER)) + io->fio.op_flags |= REQ_FUA; } __submit_merged_bio(io); out: @@ -270,22 +329,24 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) verify_block_addr(sbi, fio->old_blkaddr); verify_block_addr(sbi, fio->new_blkaddr); + bio_page = fio->encrypted_page ? 
fio->encrypted_page : fio->page; + + if (!is_read) + inc_page_count(sbi, WB_DATA_TYPE(bio_page)); + down_write(&io->io_rwsem); if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || - (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags))) + (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) || + !__same_bdev(sbi, fio->new_blkaddr, io->bio))) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { - int bio_blocks = MAX_BIO_BLOCKS(sbi); - io->bio = __bio_alloc(sbi, fio->new_blkaddr, - bio_blocks, is_read); + BIO_MAX_PAGES, is_read); io->fio = *fio; } - bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; - if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) { __submit_merged_bio(io); @@ -483,7 +544,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) return page; f2fs_put_page(page, 0); - page = get_read_data_page(inode, index, READ_SYNC, false); + page = get_read_data_page(inode, index, 0, false); if (IS_ERR(page)) return page; @@ -509,7 +570,7 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct page *page; repeat: - page = get_read_data_page(inode, index, READ_SYNC, for_write); + page = get_read_data_page(inode, index, 0, for_write); if (IS_ERR(page)) return page; @@ -590,7 +651,6 @@ static int __allocate_data_block(struct dnode_of_data *dn) struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_summary sum; struct node_info ni; - int seg = CURSEG_WARM_DATA; pgoff_t fofs; blkcnt_t count = 1; @@ -608,11 +668,8 @@ alloc: get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - if (dn->ofs_in_node == 0 && dn->inode_page == dn->node_page) - seg = CURSEG_DIRECT_IO; - allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, - &sum, seg); + &sum, CURSEG_WARM_DATA); set_data_blkaddr(dn); /* update i_size */ @@ -624,11 +681,18 @@ alloc: return 0; } -ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) +static inline bool __force_buffered_io(struct inode *inode, int rw) +{ + return ((f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) || + (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || + F2FS_I_SB(inode)->s_ndevs); +} + +int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); struct f2fs_map_blocks map; - ssize_t ret = 0; + int err = 0; map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); @@ -640,19 +704,22 @@ ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_next_pgofs = NULL; if (iocb->ki_flags & IOCB_DIRECT) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + return f2fs_map_blocks(inode, &map, 1, + __force_buffered_io(inode, WRITE) ? 
+ F2FS_GET_BLOCK_PRE_AIO : + F2FS_GET_BLOCK_PRE_DIO); } if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; + err = f2fs_convert_inline_inode(inode); + if (err) + return err; } if (!f2fs_has_inline_data(inode)) return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); - return ret; + return err; } /* @@ -676,7 +743,6 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, unsigned int ofs_in_node, last_ofs_in_node; blkcnt_t prealloc; struct extent_info ei; - bool allocated = false; block_t blkaddr; if (!maxblocks) @@ -716,7 +782,7 @@ next_dnode: } prealloc = 0; - ofs_in_node = dn.ofs_in_node; + last_ofs_in_node = ofs_in_node = dn.ofs_in_node; end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: @@ -735,10 +801,8 @@ next_block: } } else { err = __allocate_data_block(&dn); - if (!err) { + if (!err) set_inode_flag(inode, FI_APPEND_WRITE); - allocated = true; - } } if (err) goto sync_out; @@ -793,7 +857,6 @@ skip: err = reserve_new_blocks(&dn, prealloc); if (err) goto sync_out; - allocated = dn.node_changed; map->m_len += dn.ofs_in_node - ofs_in_node; if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { @@ -812,9 +875,8 @@ skip: if (create) { f2fs_unlock_op(sbi); - f2fs_balance_fs(sbi, allocated); + f2fs_balance_fs(sbi, dn.node_changed); } - allocated = false; goto next_dnode; sync_out: @@ -822,7 +884,7 @@ sync_out: unlock_out: if (create) { f2fs_unlock_op(sbi); - f2fs_balance_fs(sbi, allocated); + f2fs_balance_fs(sbi, dn.node_changed); } out: trace_f2fs_map_blocks(inode, map, err); @@ -834,19 +896,19 @@ static int __get_data_block(struct inode *inode, sector_t iblock, pgoff_t *next_pgofs) { struct f2fs_map_blocks map; - int ret; + int err; map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; map.m_next_pgofs = next_pgofs; - ret = f2fs_map_blocks(inode, &map, create, flag); - if (!ret) { + err = f2fs_map_blocks(inode, &map, create, flag); + if (!err) { map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; bh->b_size = map.m_len << inode->i_blkbits; } - return ret; + return err; } static int get_data_block(struct inode *inode, sector_t iblock, @@ -891,7 +953,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct buffer_head map_bh; sector_t start_blk, last_blk; pgoff_t next_pgofs; - loff_t isize; u64 logical = 0, phys = 0, size = 0; u32 flags = 0; int ret = 0; @@ -908,13 +969,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, inode_lock(inode); - isize = i_size_read(inode); - if (start >= isize) - goto out; - - if (start + len > isize) - len = isize - start; - if (logical_to_blk(inode, len) == 0) len = blk_to_logical(inode, 1); @@ -933,13 +987,11 @@ next: /* HOLE */ if (!buffer_mapped(&map_bh)) { start_blk = next_pgofs; - /* Go through holes util pass the EOF */ - if (blk_to_logical(inode, start_blk) < isize) + + if (blk_to_logical(inode, start_blk) < blk_to_logical(inode, + F2FS_I_SB(inode)->max_file_blocks)) goto prep_next; - /* Found a hole beyond isize means no more extents. - * Note that the premise is that filesystems don't - * punch holes beyond isize and keep size unchanged. 
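With the i_size clamp removed, f2fs_fiemap can now report mappings past EOF (for instance, blocks preallocated with the keep-size variant of fallocate), walking holes up to max_file_blocks instead of stopping at i_size, and FIEMAP_EXTENT_LAST is keyed off the maximum possible file block. Queried from userspace, the same range request looks like this (sketch; allocation of the extent array and error handling elided):

	#include <sys/ioctl.h>
	#include <linux/fs.h>
	#include <linux/fiemap.h>

	/* Request every extent in the file, including any past EOF. */
	static int query_extents(int fd, struct fiemap *fm, unsigned int nr)
	{
		fm->fm_start = 0;
		fm->fm_length = FIEMAP_MAX_OFFSET;	/* don't stop at i_size */
		fm->fm_flags = 0;
		fm->fm_extent_count = nr;	/* caller sized fm for nr extents */
		return ioctl(fd, FS_IOC_FIEMAP, fm);
	}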
- */ + flags |= FIEMAP_EXTENT_LAST; } @@ -982,7 +1034,6 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct fscrypt_ctx *ctx = NULL; - struct block_device *bdev = sbi->sb->s_bdev; struct bio *bio; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { @@ -1000,8 +1051,7 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, fscrypt_release_ctx(ctx); return ERR_PTR(-ENOMEM); } - bio->bi_bdev = bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blkaddr); + f2fs_target_device(sbi, blkaddr, bio); bio->bi_end_io = f2fs_read_end_io; bio->bi_private = ctx; @@ -1096,7 +1146,8 @@ got_it: * This page will go to BIO. Do we need to send this * BIO off first? */ - if (bio && (last_block_in_bio != block_nr - 1)) { + if (bio && (last_block_in_bio != block_nr - 1 || + !__same_bdev(F2FS_I_SB(inode), block_nr, bio))) { submit_and_realloc: __submit_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; @@ -1195,7 +1246,9 @@ int do_write_data_page(struct f2fs_io_info *fio) fio->old_blkaddr); retry_encrypt: fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, - gfp_flags); + PAGE_SIZE, 0, + fio->page->index, + gfp_flags); if (IS_ERR(fio->encrypted_page)) { err = PTR_ERR(fio->encrypted_page); if (err == -ENOMEM) { @@ -1251,7 +1304,7 @@ static int f2fs_write_data_page(struct page *page, .sbi = sbi, .type = DATA, .op = REQ_OP_WRITE, - .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0, + .op_flags = wbc_to_write_flags(wbc), .page = page, .encrypted_page = NULL, }; @@ -1311,7 +1364,6 @@ done: if (err && err != -ENOENT) goto redirty_out; - clear_cold_data(page); out: inode_dec_dirty_pages(inode); if (err) @@ -1332,6 +1384,8 @@ out: redirty_out: redirty_page_for_writepage(wbc, page); + if (!err) + return AOP_WRITEPAGE_ACTIVATE; unlock_page(page); return err; } @@ -1427,6 +1481,15 @@ continue_unlock: ret = mapping->a_ops->writepage(page, wbc); if (unlikely(ret)) { + /* + * keep nr_to_write, since vfs uses this to + * get # of written pages. 
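Context for the hunk in progress: when ->writepage returns AOP_WRITEPAGE_ACTIVATE it hands the still-locked page back to the caller, signalling "keep this page dirty and activate it" rather than reporting an error. Reclaim (kswapd) already handles that return code, but a filesystem's own write_cache_pages-style loop must translate it itself, which is what this change adds. A condensed sketch of the handling:

	#include <linux/fs.h>
	#include <linux/pagemap.h>
	#include <linux/writeback.h>

	/* Sketch: invoked on each locked, dirty page inside a writeback loop. */
	static int writeback_one(struct address_space *mapping, struct page *page,
				 struct writeback_control *wbc)
	{
		int ret = mapping->a_ops->writepage(page, wbc);

		if (ret == AOP_WRITEPAGE_ACTIVATE) {
			unlock_page(page);	/* ->writepage left the page locked */
			ret = 0;		/* skipped, not failed; page stays dirty */
		}
		return ret;
	}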
+ */ + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + continue; + } done_index = page->index + 1; done = 1; break; @@ -1663,7 +1726,7 @@ repeat: err = PTR_ERR(bio); goto fail; } - bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC); + bio->bi_opf = REQ_OP_READ; if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); err = -EFAULT; @@ -1714,7 +1777,6 @@ static int f2fs_write_end(struct file *file, goto unlock_out; set_page_dirty(page); - clear_cold_data(page); if (pos + copied > i_size_read(inode)) f2fs_i_size_write(inode, pos + copied); @@ -1751,9 +1813,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (err) return err; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - return 0; - if (test_opt(F2FS_I_SB(inode), LFS)) + if (__force_buffered_io(inode, rw)) return 0; trace_f2fs_direct_IO_enter(inode, offset, count, rw); @@ -1785,12 +1845,14 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, return; if (PageDirty(page)) { - if (inode->i_ino == F2FS_META_INO(sbi)) + if (inode->i_ino == F2FS_META_INO(sbi)) { dec_page_count(sbi, F2FS_DIRTY_META); - else if (inode->i_ino == F2FS_NODE_INO(sbi)) + } else if (inode->i_ino == F2FS_NODE_INO(sbi)) { dec_page_count(sbi, F2FS_DIRTY_NODES); - else + } else { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } } /* This is atomic written page, keep Private */ diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index fb245bd302e4..fbd5184140d0 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -50,7 +50,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); - si->wb_bios = atomic_read(&sbi->nr_wb_bios); + si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); + si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -74,7 +75,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; - si->fnids = NM_I(sbi)->fcnt; + si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST]; + si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]; si->bg_gc = sbi->bg_gc; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) @@ -194,7 +196,9 @@ get_cache: si->cache_mem += sizeof(struct flush_cmd_control); /* free nids */ - si->cache_mem += NM_I(sbi)->fcnt * sizeof(struct free_nid); + si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] + + NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]) * + sizeof(struct free_nid); si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry); si->cache_mem += NM_I(sbi)->dirty_nat_cnt * sizeof(struct nat_entry_set); @@ -310,22 +314,22 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4lld, wb_bios: %4d\n", - si->inmem_pages, si->wb_bios); - seq_printf(s, " - nodes: %4lld in %4d\n", + seq_printf(s, " - inmem: %4d, wb_cp_data: %4d, wb_data: %4d\n", + si->inmem_pages, si->nr_wb_cp_data, si->nr_wb_data); + seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); - 
seq_printf(s, " - dents: %4lld in dirs:%4d (%4d)\n", + seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", si->ndirty_dent, si->ndirty_dirs, si->ndirty_all); - seq_printf(s, " - datas: %4lld in files:%4d\n", + seq_printf(s, " - datas: %4d in files:%4d\n", si->ndirty_data, si->ndirty_files); - seq_printf(s, " - meta: %4lld in %4d\n", + seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); - seq_printf(s, " - imeta: %4lld\n", + seq_printf(s, " - imeta: %4d\n", si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); - seq_printf(s, " - free_nids: %9d\n", - si->fnids); + seq_printf(s, " - free_nids: %9d, alloc_nids: %9d\n", + si->free_nids, si->alloc_nids); seq_puts(s, "\nDistribution of User Blocks:"); seq_puts(s, " [ valid | invalid | free ]\n"); seq_puts(s, " ["); @@ -373,6 +377,7 @@ static int stat_open(struct inode *inode, struct file *file) } static const struct file_operations stat_fops = { + .owner = THIS_MODULE, .open = stat_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 369f4513be37..827c5daef4fc 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -136,7 +136,7 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, /* show encrypted name */ if (fname->hash) { - if (de->hash_code == fname->hash) + if (de->hash_code == cpu_to_le32(fname->hash)) goto found; } else if (de_name.len == name->len && de->hash_code == namehash && @@ -313,7 +313,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, set_page_dirty(page); dir->i_mtime = dir->i_ctime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); f2fs_put_page(page, 1); } @@ -466,7 +466,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode, clear_inode_flag(inode, FI_NEW_INODE); } dir->i_mtime = dir->i_ctime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (F2FS_I(dir)->i_current_depth != current_depth) f2fs_i_depth_write(dir, current_depth); @@ -731,7 +731,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, set_page_dirty(page); dir->i_ctime = dir->i_mtime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (inode) f2fs_drop_nlink(dir, inode); @@ -742,6 +742,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, ClearPagePrivate(page); ClearPageUptodate(page); inode_dec_dirty_pages(dir); + remove_dirty_inode(dir); } f2fs_put_page(page, 1); } @@ -784,7 +785,7 @@ bool f2fs_empty_dir(struct inode *dir) return true; } -bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, +int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, unsigned int start_pos, struct fscrypt_str *fstr) { unsigned char d_type = DT_UNKNOWN; @@ -819,7 +820,7 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, (u32)de->hash_code, 0, &de_name, fstr); if (err) - return true; + return err; de_name = *fstr; fstr->len = save_len; @@ -827,12 +828,12 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (!dir_emit(ctx, de_name.name, de_name.len, le32_to_cpu(de->ino), d_type)) - return true; + return 1; bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); ctx->pos = start_pos + bit_pos; } - return false; + return 0; } static int f2fs_readdir(struct file *file, struct dir_context *ctx) @@ -871,17 +872,21 @@ 
static int f2fs_readdir(struct file *file, struct dir_context *ctx) dentry_page = get_lock_data_page(inode, n, false); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); - if (err == -ENOENT) + if (err == -ENOENT) { + err = 0; continue; - else + } else { goto out; + } } dentry_blk = kmap(dentry_page); make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); - if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) { + err = f2fs_fill_dentries(ctx, &d, + n * NR_DENTRY_IN_BLOCK, &fstr); + if (err) { kunmap(dentry_page); f2fs_put_page(dentry_page, 1); break; @@ -891,10 +896,9 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } - err = 0; out: fscrypt_fname_free_buffer(&fstr); - return err; + return err < 0 ? err : 0; } static int f2fs_dir_open(struct inode *inode, struct file *filp) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 2b06d4fcd954..4db44da7ef69 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -172,7 +172,7 @@ static void __drop_largest_extent(struct inode *inode, if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) { largest->len = 0; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9e8de18a168a..2da8c3aa0ce5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -103,7 +103,7 @@ struct f2fs_mount_info { }; #define F2FS_FEATURE_ENCRYPT 0x0001 -#define F2FS_FEATURE_HMSMR 0x0002 +#define F2FS_FEATURE_BLKZONED 0x0002 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -401,6 +401,7 @@ struct f2fs_map_blocks { #define FADVISE_LOST_PINO_BIT 0x02 #define FADVISE_ENCRYPT_BIT 0x04 #define FADVISE_ENC_NAME_BIT 0x08 +#define FADVISE_KEEP_SIZE_BIT 0x10 #define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) #define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) @@ -413,6 +414,8 @@ struct f2fs_map_blocks { #define file_clear_encrypt(inode) clear_file(inode, FADVISE_ENCRYPT_BIT) #define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) +#define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT) +#define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT) #define DEF_DIR_LEVEL 0 @@ -428,7 +431,7 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ struct rw_semaphore i_sem; /* protect fi info */ - struct percpu_counter dirty_pages; /* # of dirty pages */ + atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ @@ -493,20 +496,26 @@ static inline bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front); } -extern void f2fs_mark_inode_dirty_sync(struct inode *); +extern void f2fs_mark_inode_dirty_sync(struct inode *, bool); static inline void __try_update_largest_extent(struct inode *inode, struct extent_tree *et, struct extent_node *en) { if (en->ei.len > et->largest.len) { et->largest = en->ei; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } +enum nid_list { + FREE_NID_LIST, + ALLOC_NID_LIST, + MAX_NID_LIST, +}; + struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node 
ids */
- nid_t available_nids; /* maximum available node ids */
+ nid_t available_nids; /* # of available node ids */
nid_t next_scan_nid; /* the next nid to be scanned */
unsigned int ram_thresh; /* control the memory footprint */
unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */
@@ -522,9 +531,9 @@ struct f2fs_nm_info {
/* free node ids management */
struct radix_tree_root free_nid_root;/* root of the free_nid cache */
- struct list_head free_nid_list; /* a list for free nids */
- spinlock_t free_nid_list_lock; /* protect free nid list */
- unsigned int fcnt; /* the number of free node id */
+ struct list_head nid_list[MAX_NID_LIST];/* lists for free nids */
+ unsigned int nid_cnt[MAX_NID_LIST]; /* the number of free node ids */
+ spinlock_t nid_list_lock; /* protect nid list ops */
struct mutex build_lock; /* lock for build free nids */
/* for checkpoint */
@@ -585,7 +594,6 @@ enum {
CURSEG_WARM_NODE, /* direct node blocks of normal files */
CURSEG_COLD_NODE, /* indirect node blocks */
NO_CHECK_TYPE,
- CURSEG_DIRECT_IO, /* to use for the direct IO path */
};
struct flush_cmd {
@@ -649,6 +657,7 @@ struct f2fs_sm_info {
* f2fs monitors the number of several block types such as on-writeback,
* dirty dentry blocks, dirty node blocks, and dirty meta blocks.
*/
+#define WB_DATA_TYPE(p) (__is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA)
enum count_type {
F2FS_DIRTY_DENTS,
F2FS_DIRTY_DATA,
@@ -656,6 +665,8 @@ enum count_type {
F2FS_DIRTY_META,
F2FS_INMEM_PAGES,
F2FS_DIRTY_IMETA,
+ F2FS_WB_CP_DATA,
+ F2FS_WB_DATA,
NR_COUNT_TYPE,
};
@@ -688,7 +699,7 @@ struct f2fs_io_info {
struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */
enum page_type type; /* contains DATA/NODE/META/META_FLUSH */
int op; /* contains REQ_OP_ */
- int op_flags; /* rq_flag_bits */
+ int op_flags; /* req_flag_bits */
block_t new_blkaddr; /* new block address to be written */
block_t old_blkaddr; /* old block address before Cow */
struct page *page; /* page to be written */
@@ -704,6 +715,20 @@ struct f2fs_bio_info {
struct rw_semaphore io_rwsem; /* blocking op for bio */
};
+#define FDEV(i) (sbi->devs[i])
+#define RDEV(i) (raw_super->devs[i])
+struct f2fs_dev_info {
+ struct block_device *bdev;
+ char path[MAX_PATH_LEN];
+ unsigned int total_segments;
+ block_t start_blk;
+ block_t end_blk;
+#ifdef CONFIG_BLK_DEV_ZONED
+ unsigned int nr_blkz; /* Total number of zones */
+ u8 *blkz_type; /* Array of zones type */
+#endif
+};
+
enum inode_type {
DIR_INODE, /* for dirty dir inode */
FILE_INODE, /* for dirty regular/symlink inode */
@@ -750,6 +775,12 @@ struct f2fs_sb_info {
u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE];
u8 key_prefix_size;
#endif
+
+#ifdef CONFIG_BLK_DEV_ZONED
+ unsigned int blocks_per_blkz; /* F2FS blocks per zone */
+ unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */
+#endif
+
/* for node-related operations */
struct f2fs_nm_info *nm_info; /* node manager */
struct inode *node_inode; /* cache node blocks */
@@ -764,6 +795,7 @@ struct f2fs_sb_info {
/* for checkpoint */
struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */
+ int cur_cp_pack; /* current cp pack */
spinlock_t cp_lock; /* for flag in ckpt */
struct inode *meta_inode; /* cache meta blocks */
struct mutex cp_mutex; /* checkpoint procedure lock */
@@ -815,10 +847,9 @@ struct f2fs_sb_info {
block_t discard_blks; /* discard command candidates */
block_t last_valid_block_count; /* for recovery */
u32 s_next_generation; /* for NFS support */
- atomic_t nr_wb_bios; /* # of writeback bios */
/* # of pages, see
count_type */ - struct percpu_counter nr_pages[NR_COUNT_TYPE]; + atomic_t nr_pages[NR_COUNT_TYPE]; /* # of allocated blocks */ struct percpu_counter alloc_valid_block_count; @@ -863,6 +894,8 @@ struct f2fs_sb_info { /* For shrinker support */ struct list_head s_list; + int s_ndevs; /* number of devices */ + struct f2fs_dev_info *devs; /* for device list */ struct mutex umount_mutex; unsigned int shrinker_run_no; @@ -1105,13 +1138,6 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) spin_unlock(&sbi->cp_lock); } -static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) -{ - struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); - - return blk_queue_discard(q); -} - static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { down_read(&sbi->cp_rwsem); @@ -1232,9 +1258,10 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { - percpu_counter_inc(&sbi->nr_pages[count_type]); + atomic_inc(&sbi->nr_pages[count_type]); - if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES) + if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES || + count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA) return; set_sbi_flag(sbi, SBI_IS_DIRTY); @@ -1242,14 +1269,14 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_inc_dirty_pages(struct inode *inode) { - percpu_counter_inc(&F2FS_I(inode)->dirty_pages); + atomic_inc(&F2FS_I(inode)->dirty_pages); inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) { - percpu_counter_dec(&sbi->nr_pages[count_type]); + atomic_dec(&sbi->nr_pages[count_type]); } static inline void inode_dec_dirty_pages(struct inode *inode) @@ -1258,19 +1285,19 @@ static inline void inode_dec_dirty_pages(struct inode *inode) !S_ISLNK(inode->i_mode)) return; - percpu_counter_dec(&F2FS_I(inode)->dirty_pages); + atomic_dec(&F2FS_I(inode)->dirty_pages); dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? 
F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) { - return percpu_counter_sum_positive(&sbi->nr_pages[count_type]); + return atomic_read(&sbi->nr_pages[count_type]); } -static inline s64 get_dirty_pages(struct inode *inode) +static inline int get_dirty_pages(struct inode *inode) { - return percpu_counter_sum_positive(&F2FS_I(inode)->dirty_pages); + return atomic_read(&F2FS_I(inode)->dirty_pages); } static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) @@ -1329,22 +1356,27 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi) { - block_t start_addr; - struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - unsigned long long ckpt_version = cur_cp_version(ckpt); - - start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); + block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); - /* - * odd numbered checkpoint should at cp segment 0 - * and even segment must be at cp segment 1 - */ - if (!(ckpt_version & 1)) + if (sbi->cur_cp_pack == 2) start_addr += sbi->blocks_per_seg; + return start_addr; +} + +static inline block_t __start_cp_next_addr(struct f2fs_sb_info *sbi) +{ + block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); + if (sbi->cur_cp_pack == 1) + start_addr += sbi->blocks_per_seg; return start_addr; } +static inline void __set_cp_next_pack(struct f2fs_sb_info *sbi) +{ + sbi->cur_cp_pack = (sbi->cur_cp_pack == 1) ? 2 : 1; +} + static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) { return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); @@ -1621,7 +1653,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, return; case FI_DATA_EXIST: case FI_INLINE_DOTS: - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } @@ -1648,7 +1680,7 @@ static inline void set_acl_inode(struct inode *inode, umode_t mode) { F2FS_I(inode)->i_acl_mode = mode; set_inode_flag(inode, FI_ACL_MODE); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); } static inline void f2fs_i_links_write(struct inode *inode, bool inc) @@ -1657,7 +1689,7 @@ static inline void f2fs_i_links_write(struct inode *inode, bool inc) inc_nlink(inode); else drop_nlink(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_blocks_write(struct inode *inode, @@ -1668,7 +1700,7 @@ static inline void f2fs_i_blocks_write(struct inode *inode, inode->i_blocks = add ? 
inode->i_blocks + diff : inode->i_blocks - diff; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); } @@ -1682,34 +1714,27 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) return; i_size_write(inode, i_size); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); } -static inline bool f2fs_skip_inode_update(struct inode *inode) -{ - if (!is_inode_flag_set(inode, FI_AUTO_RECOVER)) - return false; - return F2FS_I(inode)->last_disk_size == i_size_read(inode); -} - static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) { F2FS_I(inode)->i_current_depth = depth; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid) { F2FS_I(inode)->i_xattr_nid = xnid; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino) { F2FS_I(inode)->i_pino = pino; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) @@ -1837,13 +1862,31 @@ static inline int is_file(struct inode *inode, int type) static inline void set_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise |= type; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void clear_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise &= ~type; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); +} + +static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) +{ + if (dsync) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + bool ret; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + ret = list_empty(&F2FS_I(inode)->gdirty_list); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return ret; + } + if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || + file_keep_isize(inode) || + i_size_read(inode) & PAGE_MASK) + return false; + return F2FS_I(inode)->last_disk_size == i_size_read(inode); } static inline int f2fs_readonly(struct super_block *sb) @@ -1955,7 +1998,7 @@ void set_de_type(struct f2fs_dir_entry *, umode_t); unsigned char get_de_type(struct f2fs_dir_entry *); struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *, f2fs_hash_t, int *, struct f2fs_dentry_ptr *); -bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, +int f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, unsigned int, struct fscrypt_str *); void do_make_empty_dir(struct inode *, struct inode *, struct f2fs_dentry_ptr *); @@ -1995,7 +2038,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) /* * super.c */ -int f2fs_inode_dirtied(struct inode *); +int f2fs_inode_dirtied(struct inode *, bool); void f2fs_inode_synced(struct inode *); int f2fs_commit_super(struct f2fs_sb_info *, bool); int f2fs_sync_fs(struct super_block *, int); @@ -2034,7 +2077,7 @@ void move_node_page(struct page *, int); int fsync_node_pages(struct f2fs_sb_info *, struct inode *, struct writeback_control *, bool); int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *); -void build_free_nids(struct f2fs_sb_info *); +void build_free_nids(struct f2fs_sb_info *, bool); bool alloc_nid(struct f2fs_sb_info *, nid_t *); void 
alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); @@ -2060,7 +2103,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *, bool); void f2fs_balance_fs_bg(struct f2fs_sb_info *); int f2fs_issue_flush(struct f2fs_sb_info *); int create_flush_cmd_control(struct f2fs_sb_info *); -void destroy_flush_cmd_control(struct f2fs_sb_info *); +void destroy_flush_cmd_control(struct f2fs_sb_info *, bool); void invalidate_blocks(struct f2fs_sb_info *, block_t); bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); @@ -2132,12 +2175,15 @@ void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *, void f2fs_flush_merged_bios(struct f2fs_sb_info *); int f2fs_submit_page_bio(struct f2fs_io_info *); void f2fs_submit_page_mbio(struct f2fs_io_info *); +struct block_device *f2fs_target_device(struct f2fs_sb_info *, + block_t, struct bio *); +int f2fs_target_device_index(struct f2fs_sb_info *, block_t); void set_data_blkaddr(struct dnode_of_data *); void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); int reserve_new_block(struct dnode_of_data *); int f2fs_get_block(struct dnode_of_data *, pgoff_t); -ssize_t f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); +int f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); struct page *get_read_data_page(struct inode *, pgoff_t, int, bool); struct page *find_data_page(struct inode *, pgoff_t); @@ -2160,7 +2206,7 @@ int f2fs_migrate_page(struct address_space *, struct page *, struct page *, int start_gc_thread(struct f2fs_sb_info *); void stop_gc_thread(struct f2fs_sb_info *); block_t start_bidx_of_node(unsigned int, struct inode *); -int f2fs_gc(struct f2fs_sb_info *, bool); +int f2fs_gc(struct f2fs_sb_info *, bool, bool); void build_gc_manager(struct f2fs_sb_info *); /* @@ -2181,12 +2227,12 @@ struct f2fs_stat_info { unsigned long long hit_largest, hit_cached, hit_rbtree; unsigned long long hit_total, total_ext; int ext_tree, zombie_tree, ext_node; - s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; - s64 inmem_pages; + int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; + int inmem_pages; unsigned int ndirty_dirs, ndirty_files, ndirty_all; - int nats, dirty_nats, sits, dirty_sits, fnids; + int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; int total_count, utilization; - int bg_gc, wb_bios; + int bg_gc, nr_wb_cp_data, nr_wb_data; int inline_xattr, inline_inode, inline_dir, orphans; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; @@ -2412,9 +2458,30 @@ static inline int f2fs_sb_has_crypto(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT); } -static inline int f2fs_sb_mounted_hmsmr(struct super_block *sb) +static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); +} + +#ifdef CONFIG_BLK_DEV_ZONED +static inline int get_blkz_type(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkaddr) +{ + unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz; + int i; + + for (i = 0; i < sbi->s_ndevs; i++) + if (FDEV(i).bdev == bdev) + return FDEV(i).blkz_type[zno]; + return -EINVAL; +} +#endif + +static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) { - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_HMSMR); 
+ struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev);
+
+ return blk_queue_discard(q) || f2fs_sb_mounted_blkzoned(sbi->sb);
}
static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt)
@@ -2453,8 +2520,8 @@ static inline bool f2fs_may_encrypt(struct inode *inode)
#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page
#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page
#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range
-#define fscrypt_process_policy fscrypt_notsupp_process_policy
-#define fscrypt_get_policy fscrypt_notsupp_get_policy
+#define fscrypt_ioctl_set_policy fscrypt_notsupp_ioctl_set_policy
+#define fscrypt_ioctl_get_policy fscrypt_notsupp_ioctl_get_policy
#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context
#define fscrypt_inherit_context fscrypt_notsupp_inherit_context
#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index c7865073cd26..49f10dce817d 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -94,8 +94,6 @@ mapped:
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr);
- /* if gced page is attached, don't write to cold segment */
- clear_cold_data(page);
out:
sb_end_pagefault(inode->i_sb);
f2fs_update_time(sbi, REQ_TIME);
@@ -210,7 +208,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
}
/* if the inode is dirty, let's recover all the time */
- if (!datasync && !f2fs_skip_inode_update(inode)) {
+ if (!f2fs_skip_inode_update(inode, datasync)) {
f2fs_write_inode(inode, NULL);
goto go_write;
}
@@ -264,7 +262,7 @@ sync_nodes:
}
if (need_inode_block_update(sbi, ino)) {
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
f2fs_write_inode(inode, NULL);
goto sync_nodes;
}
@@ -632,7 +630,7 @@ int f2fs_truncate(struct inode *inode)
return err;
inode->i_mtime = inode->i_ctime = current_time(inode);
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, false);
return 0;
}
@@ -679,6 +677,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
int err;
+ bool size_changed = false;
err = setattr_prepare(dentry, attr);
if (err)
@@ -694,7 +693,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
err = f2fs_truncate(inode);
if (err)
return err;
- f2fs_balance_fs(F2FS_I_SB(inode), true);
} else {
/*
* do not trim all blocks after i_size if target size is
@@ -710,6 +708,8 @@
}
inode->i_mtime = inode->i_ctime = current_time(inode);
}
+
+ size_changed = true;
}
__setattr_copy(inode, attr);
@@ -722,7 +722,12 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
}
}
- f2fs_mark_inode_dirty_sync(inode);
+ /* file size may have changed here */
+ f2fs_mark_inode_dirty_sync(inode, size_changed);
+
+ /* inode change will produce dirty node pages flushed by checkpoint */
+ f2fs_balance_fs(F2FS_I_SB(inode), true);
+
return err;
}
@@ -967,7 +972,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
new_size = (dst + i) << PAGE_SHIFT;
if (dst_inode->i_size < new_size)
f2fs_i_size_write(dst_inode, new_size);
- } while ((do_replace[i] || blkaddr[i] == NULL_ADDR) && --ilen);
+ } while (--ilen && (do_replace[i] || blkaddr[i] == NULL_ADDR));
f2fs_put_dnode(&dn);
} else {
@@ -1218,6 +1223,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t
len, ret = f2fs_do_zero_range(&dn, index, end); f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + + f2fs_balance_fs(sbi, dn.node_changed); + if (ret) goto out; @@ -1313,15 +1321,15 @@ static int expand_inode_data(struct inode *inode, loff_t offset, pgoff_t pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; - int ret; + int err; - ret = inode_newsize_ok(inode, (len + offset)); - if (ret) - return ret; + err = inode_newsize_ok(inode, (len + offset)); + if (err) + return err; - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; + err = f2fs_convert_inline_inode(inode); + if (err) + return err; f2fs_balance_fs(sbi, true); @@ -1333,12 +1341,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (off_end) map.m_len++; - ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); - if (ret) { + err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + if (err) { pgoff_t last_off; if (!map.m_len) - return ret; + return err; last_off = map.m_lblk + map.m_len - 1; @@ -1352,7 +1360,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) f2fs_i_size_write(inode, new_size); - return ret; + return err; } static long f2fs_fallocate(struct file *file, int mode, @@ -1393,7 +1401,9 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = current_time(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); + if (mode & FALLOC_FL_KEEP_SIZE) + file_set_keep_isize(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } @@ -1526,7 +1536,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, - "Unexpected flush for atomic writes: ino=%lu, npages=%lld", + "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) @@ -1752,31 +1762,16 @@ static bool uuid_is_nonzero(__u8 u[16]) static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { - struct fscrypt_policy policy; struct inode *inode = file_inode(filp); - if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, - sizeof(policy))) - return -EFAULT; - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - return fscrypt_process_policy(filp, &policy); + return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); } static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) { - struct fscrypt_policy policy; - struct inode *inode = file_inode(filp); - int err; - - err = fscrypt_get_policy(inode, &policy); - if (err) - return err; - - if (copy_to_user((struct fscrypt_policy __user *)arg, &policy, sizeof(policy))) - return -EFAULT; - return 0; + return fscrypt_ioctl_get_policy(filp, (void __user *)arg); } static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) @@ -1842,7 +1837,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) mutex_lock(&sbi->gc_mutex); } - ret = f2fs_gc(sbi, sync); + ret = f2fs_gc(sbi, sync, true); out: mnt_drop_write_file(filp); return ret; @@ -2256,12 +2251,15 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) { - ret = f2fs_preallocate_blocks(iocb, from); - if (!ret) { - blk_start_plug(&plug); - ret = __generic_file_write_iter(iocb, from); - blk_finish_plug(&plug); + int err = 
f2fs_preallocate_blocks(iocb, from); + + if (err) { + inode_unlock(inode); + return err; } + blk_start_plug(&plug); + ret = __generic_file_write_iter(iocb, from); + blk_finish_plug(&plug); } inode_unlock(inode); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6f14ee923acd..88bfc3dff496 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -82,7 +82,7 @@ static int gc_thread_func(void *data) stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC))) + if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true)) wait_ms = gc_th->no_gc_sleep_time; trace_f2fs_background_gc(sbi->sb, wait_ms, @@ -544,13 +544,14 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return true; } -static void move_encrypted_block(struct inode *inode, block_t bidx) +static void move_encrypted_block(struct inode *inode, block_t bidx, + unsigned int segno, int off) { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, .op = REQ_OP_READ, - .op_flags = READ_SYNC, + .op_flags = 0, .encrypted_page = NULL, }; struct dnode_of_data dn; @@ -565,6 +566,9 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) if (!page) return; + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) + goto out; + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE); if (err) @@ -625,7 +629,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) f2fs_wait_on_page_writeback(dn.node_page, NODE, true); fio.op = REQ_OP_WRITE; - fio.op_flags = WRITE_SYNC; + fio.op_flags = REQ_SYNC; fio.new_blkaddr = newaddr; f2fs_submit_page_mbio(&fio); @@ -645,7 +649,8 @@ out: f2fs_put_page(page, 1); } -static void move_data_page(struct inode *inode, block_t bidx, int gc_type) +static void move_data_page(struct inode *inode, block_t bidx, int gc_type, + unsigned int segno, int off) { struct page *page; @@ -653,6 +658,9 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) if (IS_ERR(page)) return; + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) + goto out; + if (gc_type == BG_GC) { if (PageWriteback(page)) goto out; @@ -663,7 +671,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) .sbi = F2FS_I_SB(inode), .type = DATA, .op = REQ_OP_WRITE, - .op_flags = WRITE_SYNC, + .op_flags = REQ_SYNC, .page = page, .encrypted_page = NULL, }; @@ -673,8 +681,10 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) retry: set_page_dirty(page); f2fs_wait_on_page_writeback(page, DATA, true); - if (clear_page_dirty_for_io(page)) + if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } set_cold_data(page); @@ -683,8 +693,6 @@ retry: congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } - - clear_cold_data(page); } out: f2fs_put_page(page, 1); @@ -794,9 +802,9 @@ next_step: start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - move_encrypted_block(inode, start_bidx); + move_encrypted_block(inode, start_bidx, segno, off); else - move_data_page(inode, start_bidx, gc_type); + move_data_page(inode, start_bidx, gc_type, segno, off); if (locked) { up_write(&fi->dio_rwsem[WRITE]); @@ -899,7 +907,7 @@ next: return sec_freed; } -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync) +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) { unsigned int segno; int gc_type = sync ? 
FG_GC : BG_GC; @@ -940,6 +948,9 @@ gc_more: if (ret) goto stop; } + } else if (gc_type == BG_GC && !background) { + /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ + goto stop; } if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 5f1a67f756af..e32a9e527968 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -111,7 +111,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) .sbi = F2FS_I_SB(dn->inode), .type = DATA, .op = REQ_OP_WRITE, - .op_flags = WRITE_SYNC | REQ_PRIO, + .op_flags = REQ_SYNC | REQ_PRIO, .page = page, .encrypted_page = NULL, }; @@ -137,8 +137,10 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) fio.old_blkaddr = dn->data_blkaddr; write_data_page(dn, &fio); f2fs_wait_on_page_writeback(page, DATA, true); - if (dirty) + if (dirty) { inode_dec_dirty_pages(dn->inode); + remove_dirty_inode(dn->inode); + } /* this converted inline_data should be recovered. */ set_inode_flag(dn->inode, FI_APPEND_WRITE); @@ -419,7 +421,7 @@ static int f2fs_add_inline_entries(struct inode *dir, } new_name.name = d.filename[bit_pos]; - new_name.len = de->name_len; + new_name.len = le16_to_cpu(de->name_len); ino = le32_to_cpu(de->ino); fake_mode = get_de_type(de) << S_SHIFT; @@ -573,7 +575,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_put_page(page, 1); dir->i_ctime = dir->i_mtime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (inode) f2fs_drop_nlink(dir, inode); @@ -610,6 +612,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct f2fs_inline_dentry *inline_dentry = NULL; struct page *ipage = NULL; struct f2fs_dentry_ptr d; + int err; if (ctx->pos == NR_INLINE_DENTRY) return 0; @@ -622,11 +625,12 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, make_dentry_ptr(inode, &d, (void *)inline_dentry, 2); - if (!f2fs_fill_dentries(ctx, &d, 0, fstr)) + err = f2fs_fill_dentries(ctx, &d, 0, fstr); + if (!err) ctx->pos = NR_INLINE_DENTRY; f2fs_put_page(ipage, 1); - return 0; + return err < 0 ? 
err : 0;
}
int f2fs_inline_data_fiemap(struct inode *inode,
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index d7369895a78a..af06bda51a54 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -19,10 +19,11 @@
#include <trace/events/f2fs.h>
-void f2fs_mark_inode_dirty_sync(struct inode *inode)
+void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync)
{
- if (f2fs_inode_dirtied(inode))
+ if (f2fs_inode_dirtied(inode, sync))
return;
+
mark_inode_dirty_sync(inode);
}
@@ -43,7 +44,7 @@ void f2fs_set_inode_flags(struct inode *inode)
new_fl |= S_DIRSYNC;
inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
- f2fs_mark_inode_dirty_sync(inode);
+ f2fs_mark_inode_dirty_sync(inode, false);
}
static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
@@ -252,6 +253,7 @@ retry:
int update_inode(struct inode *inode, struct page *node_page)
{
struct f2fs_inode *ri;
+ struct extent_tree *et = F2FS_I(inode)->extent_tree;
f2fs_inode_synced(inode);
@@ -267,11 +269,13 @@ int update_inode(struct inode *inode, struct page *node_page)
ri->i_size = cpu_to_le64(i_size_read(inode));
ri->i_blocks = cpu_to_le64(inode->i_blocks);
- if (F2FS_I(inode)->extent_tree)
- set_raw_extent(&F2FS_I(inode)->extent_tree->largest,
- &ri->i_ext);
- else
+ if (et) {
+ read_lock(&et->lock);
+ set_raw_extent(&et->largest, &ri->i_ext);
+ read_unlock(&et->lock);
+ } else {
memset(&ri->i_ext, 0, sizeof(ri->i_ext));
+ }
set_raw_inline(inode, ri);
ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec);
@@ -335,7 +339,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
* We need to balance fs here to prevent from producing dirty node pages
* during the urgent cleaning time when running out of free sections.
*/
- if (update_inode_page(inode))
+ if (update_inode_page(inode) && wbc && wbc->nr_to_write)
f2fs_balance_fs(sbi, true);
return 0;
}
@@ -373,6 +377,9 @@ void f2fs_evict_inode(struct inode *inode)
goto no_delete;
#endif
+ remove_ino_entry(sbi, inode->i_ino, APPEND_INO);
+ remove_ino_entry(sbi, inode->i_ino, UPDATE_INO);
+
sb_start_intwrite(inode->i_sb);
set_inode_flag(inode, FI_NO_ALLOC);
i_size_write(inode, 0);
@@ -384,6 +391,8 @@ retry:
f2fs_lock_op(sbi);
err = remove_inode_page(inode);
f2fs_unlock_op(sbi);
+ if (err == -ENOENT)
+ err = 0;
}
/* give more chances, if ENOMEM case */
@@ -403,10 +412,12 @@ no_delete:
invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino);
if (xnid)
invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid);
- if (is_inode_flag_set(inode, FI_APPEND_WRITE))
- add_ino_entry(sbi, inode->i_ino, APPEND_INO);
- if (is_inode_flag_set(inode, FI_UPDATE_WRITE))
- add_ino_entry(sbi, inode->i_ino, UPDATE_INO);
+ if (inode->i_nlink) {
+ if (is_inode_flag_set(inode, FI_APPEND_WRITE))
+ add_ino_entry(sbi, inode->i_ino, APPEND_INO);
+ if (is_inode_flag_set(inode, FI_UPDATE_WRITE))
+ add_ino_entry(sbi, inode->i_ino, UPDATE_INO);
+ }
if (is_inode_flag_set(inode, FI_FREE_NID)) {
alloc_nid_failed(sbi, inode->i_ino);
clear_inode_flag(inode, FI_FREE_NID);
@@ -424,6 +435,18 @@ void handle_failed_inode(struct inode *inode)
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct node_info ni;
+ /*
+ * clear nlink of inode in order to release resource of inode
+ * immediately.
+ */
+ clear_nlink(inode);
+
+ /*
+ * we must call this to avoid the inode remaining dirty, resulting
+ * in a panic when flushing dirty inodes in gdirty_list.
+ */
+ update_inode_page(inode);
+
/* don't make bad inode, since it becomes a regular file.
*/ unlock_new_inode(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 489fa0d5f914..56c19b0610a8 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -778,7 +778,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = current_time(old_inode); - f2fs_mark_inode_dirty_sync(old_inode); + f2fs_mark_inode_dirty_sync(old_inode, false); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); @@ -938,7 +938,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_links_write(old_dir, old_nlink > 0); up_write(&F2FS_I(old_dir)->i_sem); } - f2fs_mark_inode_dirty_sync(old_dir); + f2fs_mark_inode_dirty_sync(old_dir, false); /* update directory entry info of new dir inode */ f2fs_set_link(new_dir, new_entry, new_page, old_inode); @@ -953,7 +953,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_links_write(new_dir, new_nlink > 0); up_write(&F2FS_I(new_dir)->i_sem); } - f2fs_mark_inode_dirty_sync(new_dir); + f2fs_mark_inode_dirty_sync(new_dir, false); f2fs_unlock_op(sbi); @@ -1075,7 +1075,6 @@ errout: } const struct inode_operations f2fs_encrypted_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = f2fs_encrypted_get_link, .getattr = f2fs_getattr, .setattr = f2fs_setattr, @@ -1105,7 +1104,6 @@ const struct inode_operations f2fs_dir_inode_operations = { }; const struct inode_operations f2fs_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = f2fs_get_link, .getattr = f2fs_getattr, .setattr = f2fs_setattr, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 01177ecdeab8..b9078fdb3743 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -45,8 +45,8 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) * give 25%, 25%, 50%, 50%, 50% memory for each components respectively */ if (type == FREE_NIDS) { - mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >> - PAGE_SHIFT; + mem_size = (nm_i->nid_cnt[FREE_NID_LIST] * + sizeof(struct free_nid)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == NAT_ENTRIES) { mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> @@ -270,8 +270,9 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, e = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&e->ni, ne); } else { - f2fs_bug_on(sbi, nat_get_ino(e) != ne->ino || - nat_get_blkaddr(e) != ne->block_addr || + f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) || + nat_get_blkaddr(e) != + le32_to_cpu(ne->block_addr) || nat_get_version(e) != ne->version); } } @@ -1134,7 +1135,7 @@ repeat: if (!page) return ERR_PTR(-ENOMEM); - err = read_node_page(page, READ_SYNC); + err = read_node_page(page, 0); if (err < 0) { f2fs_put_page(page, 1); return ERR_PTR(err); @@ -1204,6 +1205,7 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) ret = f2fs_write_inline_data(inode, page); inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); if (ret) set_page_dirty(page); page_out: @@ -1338,7 +1340,8 @@ retry: if (unlikely(f2fs_cp_error(sbi))) { f2fs_put_page(last_page, 0); pagevec_release(&pvec); - return -EIO; + ret = -EIO; + goto out; } if (!IS_DNODE(page) || !is_cold_node(page)) @@ -1407,11 +1410,12 @@ continue_unlock: "Retry to write fsync mark: ino=%u, idx=%lx", ino, last_page->index); lock_page(last_page); + f2fs_wait_on_page_writeback(last_page, NODE, true); set_page_dirty(last_page); unlock_page(last_page); goto retry; } - +out: if 
(nwritten) f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE); return ret ? -EIO: 0; @@ -1570,7 +1574,7 @@ static int f2fs_write_node_page(struct page *page, .sbi = sbi, .type = NODE, .op = REQ_OP_WRITE, - .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0, + .op_flags = wbc_to_write_flags(wbc), .page = page, .encrypted_page = NULL, }; @@ -1692,11 +1696,35 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, return radix_tree_lookup(&nm_i->free_nid_root, n); } -static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i, - struct free_nid *i) +static int __insert_nid_to_list(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_list list, bool new) { + struct f2fs_nm_info *nm_i = NM_I(sbi); + + if (new) { + int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i); + if (err) + return err; + } + + f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : + i->state != NID_ALLOC); + nm_i->nid_cnt[list]++; + list_add_tail(&i->list, &nm_i->nid_list[list]); + return 0; +} + +static void __remove_nid_from_list(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_list list, bool reuse) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : + i->state != NID_ALLOC); + nm_i->nid_cnt[list]--; list_del(&i->list); - radix_tree_delete(&nm_i->free_nid_root, i->nid); + if (!reuse) + radix_tree_delete(&nm_i->free_nid_root, i->nid); } static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) @@ -1704,9 +1732,7 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; struct nat_entry *ne; - - if (!available_free_memory(sbi, FREE_NIDS)) - return -1; + int err; /* 0 nid should not be used */ if (unlikely(nid == 0)) @@ -1729,33 +1755,30 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) return 0; } - spin_lock(&nm_i->free_nid_list_lock); - if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) { - spin_unlock(&nm_i->free_nid_list_lock); - radix_tree_preload_end(); + spin_lock(&nm_i->nid_list_lock); + err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true); + spin_unlock(&nm_i->nid_list_lock); + radix_tree_preload_end(); + if (err) { kmem_cache_free(free_nid_slab, i); return 0; } - list_add_tail(&i->list, &nm_i->free_nid_list); - nm_i->fcnt++; - spin_unlock(&nm_i->free_nid_list_lock); - radix_tree_preload_end(); return 1; } -static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) +static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) { + struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; bool need_free = false; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); if (i && i->state == NID_NEW) { - __del_from_free_nid_list(nm_i, i); - nm_i->fcnt--; + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); need_free = true; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); if (need_free) kmem_cache_free(free_nid_slab, i); @@ -1778,14 +1801,12 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); f2fs_bug_on(sbi, blk_addr == NEW_ADDR); - if (blk_addr == NULL_ADDR) { - if (add_free_nid(sbi, start_nid, true) < 0) - break; - } + if (blk_addr == NULL_ADDR) + add_free_nid(sbi, start_nid, true); } } -void build_free_nids(struct f2fs_sb_info *sbi) +static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) 
{ struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1794,7 +1815,10 @@ void build_free_nids(struct f2fs_sb_info *sbi) nid_t nid = nm_i->next_scan_nid; /* Enough entries */ - if (nm_i->fcnt >= NAT_ENTRY_PER_BLOCK) + if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) + return; + + if (!sync && !available_free_memory(sbi, FREE_NIDS)) return; /* readahead nat pages to be scanned */ @@ -1830,7 +1854,7 @@ void build_free_nids(struct f2fs_sb_info *sbi) if (addr == NULL_ADDR) add_free_nid(sbi, nid, true); else - remove_free_nid(nm_i, nid); + remove_free_nid(sbi, nid); } up_read(&curseg->journal_rwsem); up_read(&nm_i->nat_tree_lock); @@ -1839,6 +1863,13 @@ void build_free_nids(struct f2fs_sb_info *sbi) nm_i->ra_nid_pages, META_NAT, false); } +void build_free_nids(struct f2fs_sb_info *sbi, bool sync) +{ + mutex_lock(&NM_I(sbi)->build_lock); + __build_free_nids(sbi, sync); + mutex_unlock(&NM_I(sbi)->build_lock); +} + /* * If this function returns success, caller can obtain a new nid * from second parameter of this function. @@ -1853,31 +1884,31 @@ retry: if (time_to_inject(sbi, FAULT_ALLOC_NID)) return false; #endif - if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) - return false; + spin_lock(&nm_i->nid_list_lock); - spin_lock(&nm_i->free_nid_list_lock); + if (unlikely(nm_i->available_nids == 0)) { + spin_unlock(&nm_i->nid_list_lock); + return false; + } /* We should not use stale free nids created by build_free_nids */ - if (nm_i->fcnt && !on_build_free_nids(nm_i)) { - f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); - list_for_each_entry(i, &nm_i->free_nid_list, list) - if (i->state == NID_NEW) - break; - - f2fs_bug_on(sbi, i->state != NID_NEW); + if (nm_i->nid_cnt[FREE_NID_LIST] && !on_build_free_nids(nm_i)) { + f2fs_bug_on(sbi, list_empty(&nm_i->nid_list[FREE_NID_LIST])); + i = list_first_entry(&nm_i->nid_list[FREE_NID_LIST], + struct free_nid, list); *nid = i->nid; + + __remove_nid_from_list(sbi, i, FREE_NID_LIST, true); i->state = NID_ALLOC; - nm_i->fcnt--; - spin_unlock(&nm_i->free_nid_list_lock); + __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); + nm_i->available_nids--; + spin_unlock(&nm_i->nid_list_lock); return true; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - mutex_lock(&nm_i->build_lock); - build_free_nids(sbi); - mutex_unlock(&nm_i->build_lock); + build_free_nids(sbi, true); goto retry; } @@ -1889,11 +1920,11 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - f2fs_bug_on(sbi, !i || i->state != NID_ALLOC); - __del_from_free_nid_list(nm_i, i); - spin_unlock(&nm_i->free_nid_list_lock); + f2fs_bug_on(sbi, !i); + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); + spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); } @@ -1910,17 +1941,22 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) if (!nid) return; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - f2fs_bug_on(sbi, !i || i->state != NID_ALLOC); + f2fs_bug_on(sbi, !i); + if (!available_free_memory(sbi, FREE_NIDS)) { - __del_from_free_nid_list(nm_i, i); + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); need_free = true; } else { + __remove_nid_from_list(sbi, i, 
ALLOC_NID_LIST, true); i->state = NID_NEW; - nm_i->fcnt++; + __insert_nid_to_list(sbi, i, FREE_NID_LIST, false); } - spin_unlock(&nm_i->free_nid_list_lock); + + nm_i->available_nids++; + + spin_unlock(&nm_i->nid_list_lock); if (need_free) kmem_cache_free(free_nid_slab, i); @@ -1932,24 +1968,24 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) struct free_nid *i, *next; int nr = nr_shrink; - if (nm_i->fcnt <= MAX_FREE_NIDS) + if (nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) return 0; if (!mutex_trylock(&nm_i->build_lock)) return 0; - spin_lock(&nm_i->free_nid_list_lock); - list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { - if (nr_shrink <= 0 || nm_i->fcnt <= MAX_FREE_NIDS) + spin_lock(&nm_i->nid_list_lock); + list_for_each_entry_safe(i, next, &nm_i->nid_list[FREE_NID_LIST], + list) { + if (nr_shrink <= 0 || + nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) break; - if (i->state == NID_ALLOC) - continue; - __del_from_free_nid_list(nm_i, i); + + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); kmem_cache_free(free_nid_slab, i); - nm_i->fcnt--; nr_shrink--; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); mutex_unlock(&nm_i->build_lock); return nr - nr_shrink; @@ -2005,7 +2041,7 @@ recover_xnid: if (unlikely(!inc_valid_node_count(sbi, inode))) f2fs_bug_on(sbi, 1); - remove_free_nid(NM_I(sbi), new_xnid); + remove_free_nid(sbi, new_xnid); get_node_info(sbi, new_xnid, &ni); ni.ino = inode->i_ino; set_node_addr(sbi, &ni, NEW_ADDR, false); @@ -2035,7 +2071,7 @@ retry: } /* Should not use this inode from free nid list */ - remove_free_nid(NM_I(sbi), ino); + remove_free_nid(sbi, ino); if (!PageUptodate(ipage)) SetPageUptodate(ipage); @@ -2069,7 +2105,6 @@ int restore_node_summary(struct f2fs_sb_info *sbi, struct f2fs_node *rn; struct f2fs_summary *sum_entry; block_t addr; - int bio_blocks = MAX_BIO_BLOCKS(sbi); int i, idx, last_offset, nrpages; /* scan the node segment */ @@ -2078,7 +2113,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi, sum_entry = &sum->entries[0]; for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { - nrpages = min(last_offset - i, bio_blocks); + nrpages = min(last_offset - i, BIO_MAX_PAGES); /* readahead node pages */ ra_meta_pages(sbi, addr, nrpages, META_POR, true); @@ -2120,6 +2155,19 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) ne = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&ne->ni, &raw_ne); } + + /* + * if a free nat in journal has not been used after last + * checkpoint, we should remove it from available nids, + * since later we will add it again. 
+ */ + if (!get_nat_flag(ne, IS_DIRTY) && + le32_to_cpu(raw_ne.block_addr) == NULL_ADDR) { + spin_lock(&nm_i->nid_list_lock); + nm_i->available_nids--; + spin_unlock(&nm_i->nid_list_lock); + } + __set_nat_cache_dirty(nm_i, ne); } update_nats_in_cursum(journal, -i); @@ -2192,8 +2240,12 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, raw_nat_from_node_info(raw_ne, &ne->ni); nat_reset_flag(ne); __clear_nat_cache_dirty(NM_I(sbi), ne); - if (nat_get_blkaddr(ne) == NULL_ADDR) + if (nat_get_blkaddr(ne) == NULL_ADDR) { add_free_nid(sbi, nid, false); + spin_lock(&NM_I(sbi)->nid_list_lock); + NM_I(sbi)->available_nids++; + spin_unlock(&NM_I(sbi)->nid_list_lock); + } } if (to_journal) @@ -2268,21 +2320,24 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; /* not used nids: 0, node, meta, (and root counted as valid node) */ - nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM; - nm_i->fcnt = 0; + nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - + F2FS_RESERVED_NODE_NUM; + nm_i->nid_cnt[FREE_NID_LIST] = 0; + nm_i->nid_cnt[ALLOC_NID_LIST] = 0; nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; nm_i->ra_nid_pages = DEF_RA_NID_PAGES; nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); - INIT_LIST_HEAD(&nm_i->free_nid_list); + INIT_LIST_HEAD(&nm_i->nid_list[FREE_NID_LIST]); + INIT_LIST_HEAD(&nm_i->nid_list[ALLOC_NID_LIST]); INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO); INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO); INIT_LIST_HEAD(&nm_i->nat_entries); mutex_init(&nm_i->build_lock); - spin_lock_init(&nm_i->free_nid_list_lock); + spin_lock_init(&nm_i->nid_list_lock); init_rwsem(&nm_i->nat_tree_lock); nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); @@ -2310,7 +2365,7 @@ int build_node_manager(struct f2fs_sb_info *sbi) if (err) return err; - build_free_nids(sbi); + build_free_nids(sbi, true); return 0; } @@ -2327,17 +2382,18 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) return; /* destroy free nid list */ - spin_lock(&nm_i->free_nid_list_lock); - list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { - f2fs_bug_on(sbi, i->state == NID_ALLOC); - __del_from_free_nid_list(nm_i, i); - nm_i->fcnt--; - spin_unlock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); + list_for_each_entry_safe(i, next_i, &nm_i->nid_list[FREE_NID_LIST], + list) { + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); + spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); } - f2fs_bug_on(sbi, nm_i->fcnt); - spin_unlock(&nm_i->free_nid_list_lock); + f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID_LIST]); + f2fs_bug_on(sbi, nm_i->nid_cnt[ALLOC_NID_LIST]); + f2fs_bug_on(sbi, !list_empty(&nm_i->nid_list[ALLOC_NID_LIST])); + spin_unlock(&nm_i->nid_list_lock); /* destroy nat cache */ down_write(&nm_i->nat_tree_lock); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 868bec65e51c..e7997e240366 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -169,14 +169,15 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *fnid; - spin_lock(&nm_i->free_nid_list_lock); - if (nm_i->fcnt <= 0) { - spin_unlock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); + if (nm_i->nid_cnt[FREE_NID_LIST] <= 0) { + spin_unlock(&nm_i->nid_list_lock); return; } - fnid = 
list_entry(nm_i->free_nid_list.next, struct free_nid, list); + fnid = list_entry(nm_i->nid_list[FREE_NID_LIST].next, + struct free_nid, list); *nid = fnid->nid; - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); } /* @@ -313,7 +314,7 @@ static inline bool is_recoverable_dnode(struct page *page) ((unsigned char *)ckpt + crc_offset))); cp_ver |= (crc << 32); } - return cpu_to_le64(cp_ver) == cpver_of_node(page); + return cp_ver == cpver_of_node(page); } /* diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 2fc84a991325..981a9584b62f 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -180,13 +180,15 @@ static void recover_inode(struct inode *inode, struct page *page) inode->i_mode = le16_to_cpu(raw->i_mode); f2fs_i_size_write(inode, le64_to_cpu(raw->i_size)); - inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime); + inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime); inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime); inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime); - inode->i_atime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + inode->i_atime.tv_nsec = le32_to_cpu(raw->i_atime_nsec); inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + F2FS_I(inode)->i_advise = raw->i_advise; + if (file_enc_name(inode)) name = "<encrypted>"; else @@ -196,32 +198,6 @@ static void recover_inode(struct inode *inode, struct page *page) ino_of_node(page), name); } -static bool is_same_inode(struct inode *inode, struct page *ipage) -{ - struct f2fs_inode *ri = F2FS_INODE(ipage); - struct timespec disk; - - if (!IS_INODE(ipage)) - return true; - - disk.tv_sec = le64_to_cpu(ri->i_ctime); - disk.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); - if (timespec_compare(&inode->i_ctime, &disk) > 0) - return false; - - disk.tv_sec = le64_to_cpu(ri->i_atime); - disk.tv_nsec = le32_to_cpu(ri->i_atime_nsec); - if (timespec_compare(&inode->i_atime, &disk) > 0) - return false; - - disk.tv_sec = le64_to_cpu(ri->i_mtime); - disk.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); - if (timespec_compare(&inode->i_mtime, &disk) > 0) - return false; - - return true; -} - static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { struct curseg_info *curseg; @@ -248,10 +224,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) goto next; entry = get_fsync_inode(head, ino_of_node(page)); - if (entry) { - if (!is_same_inode(entry->inode, page)) - goto next; - } else { + if (!entry) { if (IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) @@ -454,7 +427,8 @@ retry_dn: continue; } - if ((start + 1) << PAGE_SHIFT > i_size_read(inode)) + if (!file_keep_isize(inode) && + (i_size_read(inode) <= (start << PAGE_SHIFT))) f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); /* @@ -507,8 +481,10 @@ err: f2fs_put_dnode(&dn); out: f2fs_msg(sbi->sb, KERN_NOTICE, - "recover_data: ino = %lx, recovered = %d blocks, err = %d", - inode->i_ino, recovered, err); + "recover_data: ino = %lx (i_size: %s) recovered = %d, err = %d", + inode->i_ino, + file_keep_isize(inode) ? 
"keep" : "recover", + recovered, err); return err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fc886f008449..0d8802453758 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -259,7 +259,7 @@ static int __commit_inmem_pages(struct inode *inode, .sbi = sbi, .type = DATA, .op = REQ_OP_WRITE, - .op_flags = WRITE_SYNC | REQ_PRIO, + .op_flags = REQ_SYNC | REQ_PRIO, .encrypted_page = NULL, }; bool submit_bio = false; @@ -274,8 +274,10 @@ static int __commit_inmem_pages(struct inode *inode, set_page_dirty(page); f2fs_wait_on_page_writeback(page, DATA, true); - if (clear_page_dirty_for_io(page)) + if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } fio.page = page; err = do_write_data_page(&fio); @@ -287,7 +289,6 @@ static int __commit_inmem_pages(struct inode *inode, /* record old blkaddr for revoking */ cur->old_addr = fio.old_blkaddr; - clear_cold_data(page); submit_bio = true; } unlock_page(page); @@ -363,7 +364,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) */ if (has_not_enough_free_secs(sbi, 0, 0)) { mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi, false); + f2fs_gc(sbi, false, false); } } @@ -380,14 +381,17 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) if (!available_free_memory(sbi, FREE_NIDS)) try_to_free_nids(sbi, MAX_FREE_NIDS); else - build_free_nids(sbi); + build_free_nids(sbi, false); + + if (!is_idle(sbi)) + return; /* checkpoint is the only way to shrink partial cached entries */ if (!available_free_memory(sbi, NAT_ENTRIES) || !available_free_memory(sbi, INO_ENTRIES) || excess_prefree_segs(sbi) || excess_dirty_nats(sbi) || - (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) { + f2fs_time_over(sbi, CP_TIME)) { if (test_opt(sbi, DATA_FLUSH)) { struct blk_plug plug; @@ -400,6 +404,33 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) } } +static int __submit_flush_wait(struct block_device *bdev) +{ + struct bio *bio = f2fs_bio_alloc(0); + int ret; + + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + bio->bi_bdev = bdev; + ret = submit_bio_wait(bio); + bio_put(bio); + return ret; +} + +static int submit_flush_wait(struct f2fs_sb_info *sbi) +{ + int ret = __submit_flush_wait(sbi->sb->s_bdev); + int i; + + if (sbi->s_ndevs && !ret) { + for (i = 1; i < sbi->s_ndevs; i++) { + ret = __submit_flush_wait(FDEV(i).bdev); + if (ret) + break; + } + } + return ret; +} + static int issue_flush_thread(void *data) { struct f2fs_sb_info *sbi = data; @@ -410,25 +441,18 @@ repeat: return 0; if (!llist_empty(&fcc->issue_list)) { - struct bio *bio; struct flush_cmd *cmd, *next; int ret; - bio = f2fs_bio_alloc(0); - fcc->dispatch_list = llist_del_all(&fcc->issue_list); fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); - bio->bi_bdev = sbi->sb->s_bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); - ret = submit_bio_wait(bio); - + ret = submit_flush_wait(sbi); llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) { cmd->ret = ret; complete(&cmd->wait); } - bio_put(bio); fcc->dispatch_list = NULL; } @@ -449,15 +473,11 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) return 0; if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) { - struct bio *bio = f2fs_bio_alloc(0); int ret; atomic_inc(&fcc->submit_flush); - bio->bi_bdev = sbi->sb->s_bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); - ret = submit_bio_wait(bio); + ret = submit_flush_wait(sbi); atomic_dec(&fcc->submit_flush); - bio_put(bio); return ret; } @@ -469,8 +489,13 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) 
if (!fcc->dispatch_list) wake_up(&fcc->flush_wait_queue); - wait_for_completion(&cmd.wait); - atomic_dec(&fcc->submit_flush); + if (fcc->f2fs_issue_flush) { + wait_for_completion(&cmd.wait); + atomic_dec(&fcc->submit_flush); + } else { + llist_del_all(&fcc->issue_list); + atomic_set(&fcc->submit_flush, 0); + } return cmd.ret; } @@ -481,6 +506,11 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) struct flush_cmd_control *fcc; int err = 0; + if (SM_I(sbi)->cmd_control_info) { + fcc = SM_I(sbi)->cmd_control_info; + goto init_thread; + } + fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; @@ -488,6 +518,7 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->cmd_control_info = fcc; +init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { @@ -500,14 +531,20 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) return err; } -void destroy_flush_cmd_control(struct f2fs_sb_info *sbi) +void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) { struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; - if (fcc && fcc->f2fs_issue_flush) - kthread_stop(fcc->f2fs_issue_flush); - kfree(fcc); - SM_I(sbi)->cmd_control_info = NULL; + if (fcc && fcc->f2fs_issue_flush) { + struct task_struct *flush_thread = fcc->f2fs_issue_flush; + + fcc->f2fs_issue_flush = NULL; + kthread_stop(flush_thread); + } + if (free) { + kfree(fcc); + SM_I(sbi)->cmd_control_info = NULL; + } } static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, @@ -633,15 +670,23 @@ static void f2fs_submit_bio_wait_endio(struct bio *bio) } /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ -int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) +static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) { - struct block_device *bdev = sbi->sb->s_bdev; struct bio *bio = NULL; int err; - err = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, flags, - &bio); + trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + + if (sbi->s_ndevs) { + int devi = f2fs_target_device_index(sbi, blkstart); + + blkstart -= FDEV(devi).start_blk; + } + err = __blkdev_issue_discard(bdev, + SECTOR_FROM_BLOCK(blkstart), + SECTOR_FROM_BLOCK(blklen), + GFP_NOFS, 0, &bio); if (!err && bio) { struct bio_entry *be = __add_bio_entry(sbi, bio); @@ -654,24 +699,101 @@ int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, sector_t sector, return err; } +#ifdef CONFIG_BLK_DEV_ZONED +static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) +{ + sector_t nr_sects = SECTOR_FROM_BLOCK(blklen); + sector_t sector; + int devi = 0; + + if (sbi->s_ndevs) { + devi = f2fs_target_device_index(sbi, blkstart); + blkstart -= FDEV(devi).start_blk; + } + sector = SECTOR_FROM_BLOCK(blkstart); + + if (sector & (bdev_zone_sectors(bdev) - 1) || + nr_sects != bdev_zone_sectors(bdev)) { + f2fs_msg(sbi->sb, KERN_INFO, + "(%d) %s: Unaligned discard attempted (block %x + %x)", + devi, sbi->s_ndevs ? FDEV(devi).path: "", + blkstart, blklen); + return -EIO; + } + + /* + * We need to know the type of the zone: for conventional zones, + * use regular discard if the drive supports it. 
For sequential + * zones, reset the zone write pointer. + */ + switch (get_blkz_type(sbi, bdev, blkstart)) { + + case BLK_ZONE_TYPE_CONVENTIONAL: + if (!blk_queue_discard(bdev_get_queue(bdev))) + return 0; + return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); + case BLK_ZONE_TYPE_SEQWRITE_REQ: + case BLK_ZONE_TYPE_SEQWRITE_PREF: + trace_f2fs_issue_reset_zone(sbi->sb, blkstart); + return blkdev_reset_zones(bdev, sector, + nr_sects, GFP_NOFS); + default: + /* Unknown zone type: broken device ? */ + return -EIO; + } +} +#endif + +static int __issue_discard_async(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) +{ +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_mounted_blkzoned(sbi->sb) && + bdev_zoned_model(bdev) != BLK_ZONED_NONE) + return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); +#endif + return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); +} + static int f2fs_issue_discard(struct f2fs_sb_info *sbi, block_t blkstart, block_t blklen) { - sector_t start = SECTOR_FROM_BLOCK(blkstart); - sector_t len = SECTOR_FROM_BLOCK(blklen); + sector_t start = blkstart, len = 0; + struct block_device *bdev; struct seg_entry *se; unsigned int offset; block_t i; + int err = 0; + + bdev = f2fs_target_device(sbi, blkstart, NULL); + + for (i = blkstart; i < blkstart + blklen; i++, len++) { + if (i != start) { + struct block_device *bdev2 = + f2fs_target_device(sbi, i, NULL); + + if (bdev2 != bdev) { + err = __issue_discard_async(sbi, bdev, + start, len); + if (err) + return err; + bdev = bdev2; + start = i; + len = 0; + } + } - for (i = blkstart; i < blkstart + blklen; i++) { se = get_seg_entry(sbi, GET_SEGNO(sbi, i)); offset = GET_BLKOFF_FROM_SEG0(sbi, i); if (!f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } - trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); - return __f2fs_issue_discard_async(sbi, start, len, GFP_NOFS, 0); + + if (len) + err = __issue_discard_async(sbi, bdev, start, len); + return err; } static void __add_discard_entry(struct f2fs_sb_info *sbi, @@ -1296,25 +1418,21 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, stat_inc_seg_type(sbi, curseg); } -static void __allocate_new_segments(struct f2fs_sb_info *sbi, int type) -{ - struct curseg_info *curseg = CURSEG_I(sbi, type); - unsigned int old_segno; - - old_segno = curseg->segno; - SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); - locate_dirty_segment(sbi, old_segno); -} - void allocate_new_segments(struct f2fs_sb_info *sbi) { + struct curseg_info *curseg; + unsigned int old_segno; int i; if (test_opt(sbi, LFS)) return; - for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) - __allocate_new_segments(sbi, i); + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + curseg = CURSEG_I(sbi, i); + old_segno = curseg->segno; + SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true); + locate_dirty_segment(sbi, old_segno); + } } static const struct segment_allocation default_salloc_ops = { @@ -1448,21 +1566,11 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_summary *sum, int type) { struct sit_info *sit_i = SIT_I(sbi); - struct curseg_info *curseg; - bool direct_io = (type == CURSEG_DIRECT_IO); - - type = direct_io ? 
CURSEG_WARM_DATA : type; - - curseg = CURSEG_I(sbi, type); + struct curseg_info *curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); mutex_lock(&sit_i->sentry_lock); - /* direct_io'ed data is aligned to the segment for better performance */ - if (direct_io && curseg->next_blkoff && - !has_not_enough_free_secs(sbi, 0, 0)) - __allocate_new_segments(sbi, type); - *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); /* @@ -1515,7 +1623,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) .sbi = sbi, .type = META, .op = REQ_OP_WRITE, - .op_flags = WRITE_SYNC | REQ_META | REQ_PRIO, + .op_flags = REQ_SYNC | REQ_META | REQ_PRIO, .old_blkaddr = page->index, .new_blkaddr = page->index, .page = page, @@ -2166,7 +2274,6 @@ out: static int build_sit_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); - struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct sit_info *sit_i; unsigned int sit_segs, start; char *src_bitmap, *dst_bitmap; @@ -2233,7 +2340,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; - sit_i->written_valid_blocks = le64_to_cpu(ckpt->valid_block_count); + sit_i->written_valid_blocks = 0; sit_i->sit_bitmap = dst_bitmap; sit_i->bitmap_size = bitmap_size; sit_i->dirty_sentries = 0; @@ -2315,10 +2422,10 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) int sit_blk_cnt = SIT_BLK_CNT(sbi); unsigned int i, start, end; unsigned int readed, start_blk = 0; - int nrpages = MAX_BIO_BLOCKS(sbi) * 8; do { - readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true); + readed = ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES, + META_SIT, true); start = start_blk * sit_i->sents_per_block; end = (start_blk + readed) * sit_i->sents_per_block; @@ -2387,6 +2494,9 @@ static void init_free_segmap(struct f2fs_sb_info *sbi) struct seg_entry *sentry = get_seg_entry(sbi, start); if (!sentry->valid_blocks) __set_free(sbi, start); + else + SIT_I(sbi)->written_valid_blocks += + sentry->valid_blocks; } /* set use the current segments */ @@ -2645,7 +2755,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) if (!sm_info) return; - destroy_flush_cmd_control(sbi); + destroy_flush_cmd_control(sbi, true); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index fecb856ad874..9d44ce83acb2 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -18,6 +18,8 @@ #define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ #define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */ +#define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ + /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) #define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) @@ -102,8 +104,6 @@ (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) #define SECTOR_TO_BLOCK(sectors) \ (sectors >> F2FS_LOG_SECTORS_PER_BLOCK) -#define MAX_BIO_BLOCKS(sbi) \ - ((int)min((int)max_hw_blocks(sbi), BIO_MAX_PAGES)) /* * indicate a block allocation direction: RIGHT and LEFT. 
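The discard and zone-reset paths earlier in this series funnel block addresses through the SECTOR_FROM_BLOCK()/SECTOR_TO_BLOCK() helpers shown in the segment.h hunk above, which are plain shifts by F2FS_LOG_SECTORS_PER_BLOCK (3 for the usual 4 KiB f2fs block on a 512-byte-sector device). What follows is a minimal userspace sketch of that arithmetic, not part of the patch; the geometry and sample numbers are illustrative assumptions.

#include <stdio.h>

/* assumed geometry: 4 KiB blocks, 512-byte sectors => 8 sectors per block */
#define F2FS_LOG_SECTORS_PER_BLOCK	3
#define SECTOR_FROM_BLOCK(blk_addr) \
	((unsigned long long)(blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK)
#define SECTOR_TO_BLOCK(sectors) \
	((sectors) >> F2FS_LOG_SECTORS_PER_BLOCK)

int main(void)
{
	/* discarding 16 blocks starting at block 1024 covers sectors 8192..8319 */
	printf("start sector %llu, nr sectors %llu\n",
	       SECTOR_FROM_BLOCK(1024), SECTOR_FROM_BLOCK(16));
	/* shifting down again recovers the block count */
	printf("round trip: %llu blocks\n",
	       SECTOR_TO_BLOCK(SECTOR_FROM_BLOCK(16)));
	return 0;
}

This same conversion is why __f2fs_issue_discard_zone() above can insist that a discard be exactly one zone long: once translated with SECTOR_FROM_BLOCK(), the request must line up with bdev_zone_sectors() or the zone reset is rejected with -EIO.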
@@ -471,11 +471,12 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); if (test_opt(sbi, LFS)) return false; - return free_sections(sbi) <= (node_secs + 2 * dent_secs + + return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + reserved_sections(sbi) + 1); } @@ -484,14 +485,14 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - - node_secs += get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; return (free_sections(sbi) + freed) <= - (node_secs + 2 * dent_secs + reserved_sections(sbi) + needed); + (node_secs + 2 * dent_secs + imeta_secs + + reserved_sections(sbi) + needed); } static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) @@ -695,13 +696,6 @@ static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) return false; } -static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) -{ - struct block_device *bdev = sbi->sb->s_bdev; - struct request_queue *q = bdev_get_queue(bdev); - return SECTOR_TO_BLOCK(queue_max_sectors(q)); -} - /* * It is very important to gather dirty pages and write at once, so that we can * submit a big bio without interfering other data writes. @@ -719,7 +713,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) else if (type == NODE) return 8 * sbi->blocks_per_seg; else if (type == META) - return 8 * MAX_BIO_BLOCKS(sbi); + return 8 * BIO_MAX_PAGES; else return 0; } @@ -736,11 +730,9 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, return 0; nr_to_write = wbc->nr_to_write; - + desired = BIO_MAX_PAGES; if (type == NODE) - desired = 2 * max_hw_blocks(sbi); - else - desired = MAX_BIO_BLOCKS(sbi); + desired <<= 1; wbc->nr_to_write = desired; return desired - nr_to_write; diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 46c915425923..5c60fc28ec75 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -21,14 +21,16 @@ static unsigned int shrinker_run_no; static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) { - return NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; + long count = NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; + + return count > 0 ? count : 0; } static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) { - if (NM_I(sbi)->fcnt > MAX_FREE_NIDS) - return NM_I(sbi)->fcnt - MAX_FREE_NIDS; - return 0; + long count = NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS; + + return count > 0 ? 
count : 0; } static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6132b4ce4e4c..46fd30d8af77 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -412,14 +412,20 @@ static int parse_options(struct super_block *sb, char *options) q = bdev_get_queue(sb->s_bdev); if (blk_queue_discard(q)) { set_opt(sbi, DISCARD); - } else { + } else if (!f2fs_sb_mounted_blkzoned(sb)) { f2fs_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but " "the device does not support discard"); } break; case Opt_nodiscard: + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_WARNING, + "discard is required for zoned block devices"); + return -EINVAL; + } clear_opt(sbi, DISCARD); + break; case Opt_noheap: set_opt(sbi, NOHEAP); break; @@ -512,6 +518,13 @@ static int parse_options(struct super_block *sb, char *options) return -ENOMEM; if (strlen(name) == 8 && !strncmp(name, "adaptive", 8)) { + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_WARNING, + "adaptive mode is not allowed with " + "zoned block device feature"); + kfree(name); + return -EINVAL; + } set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); } else if (strlen(name) == 3 && !strncmp(name, "lfs", 3)) { @@ -558,13 +571,9 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_once((void *) fi); - if (percpu_counter_init(&fi->dirty_pages, 0, GFP_NOFS)) { - kmem_cache_free(f2fs_inode_cachep, fi); - return NULL; - } - /* Initialize f2fs-specific inode info */ fi->vfs_inode.i_version = 1; + atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; init_rwsem(&fi->i_sem); @@ -620,24 +629,25 @@ static int f2fs_drop_inode(struct inode *inode) return generic_drop_inode(inode); } -int f2fs_inode_dirtied(struct inode *inode) +int f2fs_inode_dirtied(struct inode *inode, bool sync) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int ret = 0; spin_lock(&sbi->inode_lock[DIRTY_META]); if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { - spin_unlock(&sbi->inode_lock[DIRTY_META]); - return 1; + ret = 1; + } else { + set_inode_flag(inode, FI_DIRTY_INODE); + stat_inc_dirty_inode(sbi, DIRTY_META); } - - set_inode_flag(inode, FI_DIRTY_INODE); - list_add_tail(&F2FS_I(inode)->gdirty_list, + if (sync && list_empty(&F2FS_I(inode)->gdirty_list)) { + list_add_tail(&F2FS_I(inode)->gdirty_list, &sbi->inode_list[DIRTY_META]); - inc_page_count(sbi, F2FS_DIRTY_IMETA); - stat_inc_dirty_inode(sbi, DIRTY_META); + inc_page_count(sbi, F2FS_DIRTY_IMETA); + } spin_unlock(&sbi->inode_lock[DIRTY_META]); - - return 0; + return ret; } void f2fs_inode_synced(struct inode *inode) @@ -649,10 +659,12 @@ void f2fs_inode_synced(struct inode *inode) spin_unlock(&sbi->inode_lock[DIRTY_META]); return; } - list_del_init(&F2FS_I(inode)->gdirty_list); + if (!list_empty(&F2FS_I(inode)->gdirty_list)) { + list_del_init(&F2FS_I(inode)->gdirty_list); + dec_page_count(sbi, F2FS_DIRTY_IMETA); + } clear_inode_flag(inode, FI_DIRTY_INODE); clear_inode_flag(inode, FI_AUTO_RECOVER); - dec_page_count(sbi, F2FS_DIRTY_IMETA); stat_dec_dirty_inode(F2FS_I_SB(inode), DIRTY_META); spin_unlock(&sbi->inode_lock[DIRTY_META]); } @@ -676,7 +688,7 @@ static void f2fs_dirty_inode(struct inode *inode, int flags) if (is_inode_flag_set(inode, FI_AUTO_RECOVER)) clear_inode_flag(inode, FI_AUTO_RECOVER); - f2fs_inode_dirtied(inode); + f2fs_inode_dirtied(inode, false); } static void f2fs_i_callback(struct rcu_head *head) @@ -687,20 +699,28 @@ static void f2fs_i_callback(struct rcu_head *head) static void f2fs_destroy_inode(struct inode 
*inode) { - percpu_counter_destroy(&F2FS_I(inode)->dirty_pages); call_rcu(&inode->i_rcu, f2fs_i_callback); } static void destroy_percpu_info(struct f2fs_sb_info *sbi) { - int i; - - for (i = 0; i < NR_COUNT_TYPE; i++) - percpu_counter_destroy(&sbi->nr_pages[i]); percpu_counter_destroy(&sbi->alloc_valid_block_count); percpu_counter_destroy(&sbi->total_valid_inode_count); } +static void destroy_device_list(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < sbi->s_ndevs; i++) { + blkdev_put(FDEV(i).bdev, FMODE_EXCL); +#ifdef CONFIG_BLK_DEV_ZONED + kfree(FDEV(i).blkz_type); +#endif + } + kfree(sbi->devs); +} + static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -738,7 +758,6 @@ static void f2fs_put_super(struct super_block *sb) * In addition, EIO will skip do checkpoint, we need this as well. */ release_ino_entry(sbi, true); - release_discard_addrs(sbi); f2fs_leave_shrinker(sbi); mutex_unlock(&sbi->umount_mutex); @@ -762,6 +781,8 @@ static void f2fs_put_super(struct super_block *sb) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->raw_super); + destroy_device_list(sbi); + destroy_percpu_info(sbi); kfree(sbi); } @@ -789,13 +810,17 @@ int f2fs_sync_fs(struct super_block *sb, int sync) static int f2fs_freeze(struct super_block *sb) { - int err; - if (f2fs_readonly(sb)) return 0; - err = f2fs_sync_fs(sb, 1); - return err; + /* IO error happened before */ + if (unlikely(f2fs_cp_error(F2FS_SB(sb)))) + return -EIO; + + /* must be clean, since sync_filesystem() was already called */ + if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) + return -EINVAL; + return 0; } static int f2fs_unfreeze(struct super_block *sb) @@ -822,7 +847,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = user_block_count - valid_user_blocks(sbi); buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; - buf->f_ffree = buf->f_files - valid_inode_count(sbi); + buf->f_ffree = min(buf->f_files - valid_node_count(sbi), + buf->f_bavail); buf->f_namelen = F2FS_NAME_LEN; buf->f_fsid.val[0] = (u32)id; @@ -974,7 +1000,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, EXTENT_CACHE); sbi->sb->s_flags |= MS_LAZYTIME; set_opt(sbi, FLUSH_MERGE); - if (f2fs_sb_mounted_hmsmr(sbi->sb)) { + if (f2fs_sb_mounted_blkzoned(sbi->sb)) { set_opt_mode(sbi, F2FS_MOUNT_LFS); set_opt(sbi, DISCARD); } else { @@ -1076,8 +1102,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * or if flush_merge is not passed in mount option. 
*/ if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { - destroy_flush_cmd_control(sbi); - } else if (!SM_I(sbi)->cmd_control_info) { + clear_opt(sbi, FLUSH_MERGE); + destroy_flush_cmd_control(sbi, false); + } else { err = create_flush_cmd_control(sbi); if (err) goto restore_gc; @@ -1238,7 +1265,7 @@ static int __f2fs_commit_super(struct buffer_head *bh, unlock_buffer(bh); /* it's rare case, we can do fua all the time */ - return __sync_dirty_buffer(bh, WRITE_FLUSH_FUA); + return __sync_dirty_buffer(bh, REQ_PREFLUSH | REQ_FUA); } static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, @@ -1426,6 +1453,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) unsigned int total, fsmeta; struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned int ovp_segments, reserved_segments; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -1437,6 +1465,16 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) if (unlikely(fsmeta >= total)) return 1; + ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); + reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); + + if (unlikely(fsmeta < F2FS_MIN_SEGMENTS || + ovp_segments == 0 || reserved_segments == 0)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong layout: check mkfs.f2fs version"); + return 1; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; @@ -1447,6 +1485,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) static void init_sb_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = sbi->raw_super; + int i; sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); @@ -1471,6 +1510,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; clear_sbi_flag(sbi, SBI_NEED_FSCK); + for (i = 0; i < NR_COUNT_TYPE; i++) + atomic_set(&sbi->nr_pages[i], 0); + INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); mutex_init(&sbi->wio_mutex[NODE]); @@ -1486,13 +1528,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) static int init_percpu_info(struct f2fs_sb_info *sbi) { - int i, err; - - for (i = 0; i < NR_COUNT_TYPE; i++) { - err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL); - if (err) - return err; - } + int err; err = percpu_counter_init(&sbi->alloc_valid_block_count, 0, GFP_KERNEL); if (err) @@ -1502,6 +1538,71 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) GFP_KERNEL); } +#ifdef CONFIG_BLK_DEV_ZONED +static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) +{ + struct block_device *bdev = FDEV(devi).bdev; + sector_t nr_sectors = bdev->bd_part->nr_sects; + sector_t sector = 0; + struct blk_zone *zones; + unsigned int i, nr_zones; + unsigned int n = 0; + int err = -EIO; + + if (!f2fs_sb_mounted_blkzoned(sbi->sb)) + return 0; + + if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != + SECTOR_TO_BLOCK(bdev_zone_sectors(bdev))) + return -EINVAL; + sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_sectors(bdev)); + if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != + __ilog2_u32(sbi->blocks_per_blkz)) + return -EINVAL; + sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); + FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> + sbi->log_blocks_per_blkz; + if (nr_sectors & (bdev_zone_sectors(bdev) - 1)) + FDEV(devi).nr_blkz++; + + FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL); + if (!FDEV(devi).blkz_type) + return -ENOMEM; + 
+#define F2FS_REPORT_NR_ZONES 4096 + + zones = kcalloc(F2FS_REPORT_NR_ZONES, sizeof(struct blk_zone), + GFP_KERNEL); + if (!zones) + return -ENOMEM; + + /* Get block zones type */ + while (zones && sector < nr_sectors) { + + nr_zones = F2FS_REPORT_NR_ZONES; + err = blkdev_report_zones(bdev, sector, + zones, &nr_zones, + GFP_KERNEL); + if (err) + break; + if (!nr_zones) { + err = -EIO; + break; + } + + for (i = 0; i < nr_zones; i++) { + FDEV(devi).blkz_type[n] = zones[i].type; + sector += zones[i].len; + n++; + } + } + + kfree(zones); + + return err; +} +#endif + /* * Read f2fs raw super block. * Because we have two copies of super block, so read both of them @@ -1594,6 +1695,77 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) return err; } +static int f2fs_scan_devices(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + int i; + + for (i = 0; i < MAX_DEVICES; i++) { + if (!RDEV(i).path[0]) + return 0; + + if (i == 0) { + sbi->devs = kzalloc(sizeof(struct f2fs_dev_info) * + MAX_DEVICES, GFP_KERNEL); + if (!sbi->devs) + return -ENOMEM; + } + + memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); + FDEV(i).total_segments = le32_to_cpu(RDEV(i).total_segments); + if (i == 0) { + FDEV(i).start_blk = 0; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1 + + le32_to_cpu(raw_super->segment0_blkaddr); + } else { + FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1; + } + + FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, + sbi->sb->s_mode, sbi->sb->s_type); + if (IS_ERR(FDEV(i).bdev)) + return PTR_ERR(FDEV(i).bdev); + + /* to release errored devices */ + sbi->s_ndevs = i + 1; + +#ifdef CONFIG_BLK_DEV_ZONED + if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM && + !f2fs_sb_mounted_blkzoned(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Zoned block device feature not enabled\n"); + return -EINVAL; + } + if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) { + if (init_blkz_info(sbi, i)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Failed to initialize F2FS blkzone information"); + return -EINVAL; + } + f2fs_msg(sbi->sb, KERN_INFO, + "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)", + i, FDEV(i).path, + FDEV(i).total_segments, + FDEV(i).start_blk, FDEV(i).end_blk, + bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ? + "Host-aware" : "Host-managed"); + continue; + } +#endif + f2fs_msg(sbi->sb, KERN_INFO, + "Mount Device [%2d]: %20s, %8u, %8x - %8x", + i, FDEV(i).path, + FDEV(i).total_segments, + FDEV(i).start_blk, FDEV(i).end_blk); + } + return 0; +} + static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; @@ -1641,6 +1813,18 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; + /* + * The BLKZONED feature indicates that the drive was formatted with + * zone alignment optimization. This is optional for host-aware + * devices, but mandatory for host-managed zoned block devices. 
+ */ +#ifndef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_ERR, + "Zoned block device support is not enabled\n"); + goto free_sb_buf; + } +#endif default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); @@ -1710,6 +1894,13 @@ try_onemore: goto free_meta_inode; } + /* Initialize device list */ + err = f2fs_scan_devices(sbi); + if (err) { + f2fs_msg(sb, KERN_ERR, "Failed to find devices"); + goto free_devices; + } + sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); percpu_counter_set(&sbi->total_valid_inode_count, @@ -1893,12 +2084,21 @@ free_node_inode: mutex_lock(&sbi->umount_mutex); release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); + /* + * Some dirty meta pages can be produced by recover_orphan_inodes() + * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() + * followed by write_checkpoint() through f2fs_write_node_pages(), which + * falls into an infinite loop in sync_meta_pages(). + */ + truncate_inode_pages_final(META_MAPPING(sbi)); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); free_nm: destroy_node_manager(sbi); free_sm: destroy_segment_manager(sbi); +free_devices: + destroy_device_list(sbi); kfree(sbi->ckpt); free_meta_inode: make_bad_inode(sbi->meta_inode); @@ -2044,3 +2244,4 @@ module_exit(exit_f2fs_fs) MODULE_AUTHOR("Samsung Electronics's Praesto Team"); MODULE_DESCRIPTION("Flash Friendly File System"); MODULE_LICENSE("GPL"); + diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 3e1c0280f866..c47ce2f330a1 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -106,7 +106,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, return -EINVAL; F2FS_I(inode)->i_advise |= *(char *)value; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); return 0; } @@ -554,7 +554,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (index == F2FS_XATTR_INDEX_ENCRYPTION && !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) f2fs_set_encrypted_inode(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (!error && S_ISDIR(inode->i_mode)) set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); exit: diff --git a/fs/fcntl.c b/fs/fcntl.c index 350a2c8cfd28..e1c54f20325c 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -25,7 +25,7 @@ #include <asm/poll.h> #include <asm/siginfo.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) @@ -52,7 +52,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg) arg |= O_NONBLOCK; /* Pipe packetized mode is controlled by O_DIRECT flag */ - if (!S_ISFIFO(filp->f_inode->i_mode) && (arg & O_DIRECT)) { + if (!S_ISFIFO(inode->i_mode) && (arg & O_DIRECT)) { if (!filp->f_mapping || !filp->f_mapping->a_ops || !filp->f_mapping->a_ops->direct_IO) return -EINVAL; diff --git a/fs/fhandle.c b/fs/fhandle.c index ca3c3dd01789..5559168d5637 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -8,7 +8,7 @@ #include <linux/fs_struct.h> #include <linux/fsnotify.h> #include <linux/personality.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" #include "mount.h" diff --git a/fs/file_table.c b/fs/file_table.c index ad17e05ebf95..6d982b57de92 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -155,7 +155,7 @@ over: * @mode: the mode with which the new file will be opened * @fop: the 'struct file_operations' for the new file */ -struct file *alloc_file(struct path 
*path, fmode_t mode, +struct file *alloc_file(const struct path *path, fmode_t mode, const struct file_operations *fop) { struct file *file; diff --git a/fs/filesystems.c b/fs/filesystems.c index c5618db110be..cac75547d35c 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -14,7 +14,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* * Handling of filesystem drivers list. diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 05713a5da083..ef600591d96f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1769,15 +1769,13 @@ static long wb_writeback(struct bdi_writeback *wb, * become available for writeback. Otherwise * we'll just busyloop. */ - if (!list_empty(&wb->b_more_io)) { - trace_writeback_wait(wb, work); - inode = wb_inode(wb->b_more_io.prev); - spin_lock(&inode->i_lock); - spin_unlock(&wb->list_lock); - /* This function drops i_lock... */ - inode_sleep_on_writeback(inode); - spin_lock(&wb->list_lock); - } + trace_writeback_wait(wb, work); + inode = wb_inode(wb->b_more_io.prev); + spin_lock(&inode->i_lock); + spin_unlock(&wb->list_lock); + /* This function drops i_lock... */ + inode_sleep_on_writeback(inode); + spin_lock(&wb->list_lock); } spin_unlock(&wb->list_lock); blk_finish_plug(&plug); diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 4304072161aa..40d61077bead 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -542,6 +542,7 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) hlist_for_each_entry(object, &cookie->backing_objects, cookie_link) { if (invalidate) set_bit(FSCACHE_OBJECT_RETIRED, &object->flags); + clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); fscache_raise_event(object, FSCACHE_OBJECT_EV_KILL); } } else { @@ -560,6 +561,10 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) wait_on_atomic_t(&cookie->n_active, fscache_wait_atomic_t, TASK_UNINTERRUPTIBLE); + /* Make sure any pending writes are cancelled. 
*/ + if (cookie->def->type != FSCACHE_COOKIE_TYPE_INDEX) + fscache_invalidate_writes(cookie); + /* Reset the cookie state if it wasn't relinquished */ if (!test_bit(FSCACHE_COOKIE_RELINQUISHED, &cookie->flags)) { atomic_inc(&cookie->n_active); diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c index 9b28649df3a1..a8aa00be4444 100644 --- a/fs/fscache/netfs.c +++ b/fs/fscache/netfs.c @@ -48,6 +48,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs) cookie->flags = 1 << FSCACHE_COOKIE_ENABLED; spin_lock_init(&cookie->lock); + spin_lock_init(&cookie->stores_lock); INIT_HLIST_HEAD(&cookie->backing_objects); /* check the netfs type is not already present */ diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 9e792e30f4db..7a182c87f378 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -30,6 +30,7 @@ static const struct fscache_state *fscache_look_up_object(struct fscache_object static const struct fscache_state *fscache_object_available(struct fscache_object *, int); static const struct fscache_state *fscache_parent_ready(struct fscache_object *, int); static const struct fscache_state *fscache_update_object(struct fscache_object *, int); +static const struct fscache_state *fscache_object_dead(struct fscache_object *, int); #define __STATE_NAME(n) fscache_osm_##n #define STATE(n) (&__STATE_NAME(n)) @@ -91,7 +92,7 @@ static WORK_STATE(LOOKUP_FAILURE, "LCFL", fscache_lookup_failure); static WORK_STATE(KILL_OBJECT, "KILL", fscache_kill_object); static WORK_STATE(KILL_DEPENDENTS, "KDEP", fscache_kill_dependents); static WORK_STATE(DROP_OBJECT, "DROP", fscache_drop_object); -static WORK_STATE(OBJECT_DEAD, "DEAD", (void*)2UL); +static WORK_STATE(OBJECT_DEAD, "DEAD", fscache_object_dead); static WAIT_STATE(WAIT_FOR_INIT, "?INI", TRANSIT_TO(INIT_OBJECT, 1 << FSCACHE_OBJECT_EV_NEW_CHILD)); @@ -229,6 +230,10 @@ execute_work_state: event = -1; if (new_state == NO_TRANSIT) { _debug("{OBJ%x} %s notrans", object->debug_id, state->name); + if (unlikely(state == STATE(OBJECT_DEAD))) { + _leave(" [dead]"); + return; + } fscache_enqueue_object(object); event_mask = object->oob_event_mask; goto unmask_events; @@ -239,7 +244,7 @@ execute_work_state: object->state = state = new_state; if (state->work) { - if (unlikely(state->work == ((void *)2UL))) { + if (unlikely(state == STATE(OBJECT_DEAD))) { _leave(" [dead]"); return; } @@ -645,6 +650,12 @@ static const struct fscache_state *fscache_kill_object(struct fscache_object *ob fscache_mark_object_dead(object); object->oob_event_mask = 0; + if (test_bit(FSCACHE_OBJECT_RETIRED, &object->flags)) { + /* Reject any new read/write ops and abort any that are pending. */ + clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); + fscache_cancel_all_ops(object); + } + if (list_empty(&object->dependents) && object->n_ops == 0 && object->n_children == 0) @@ -1077,3 +1088,20 @@ void fscache_object_mark_killed(struct fscache_object *object, } } EXPORT_SYMBOL(fscache_object_mark_killed); + +/* + * The object is dead. We can get here if an object gets queued by an event + * that would lead to its death (such as EV_KILL) when the dispatcher is + * already running (and so can be requeued) but hasn't yet cleared the event + * mask. 
+ */ +static const struct fscache_state *fscache_object_dead(struct fscache_object *object, + int event) +{ + if (!test_and_set_bit(FSCACHE_OBJECT_RUN_AFTER_DEAD, + &object->flags)) + return NO_TRANSIT; + + WARN(true, "FS-Cache object redispatched after death"); + return NO_TRANSIT; +} diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 70ea57c7b6bb..f11792672977 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -399,6 +399,10 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) { spin_lock(&fiq->waitq.lock); + if (test_bit(FR_FINISHED, &req->flags)) { + spin_unlock(&fiq->waitq.lock); + return; + } if (list_empty(&req->intr_entry)) { list_add_tail(&req->intr_entry, &fiq->interrupts); wake_up_locked(&fiq->waitq); @@ -1372,6 +1376,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, * code can Oops if the buffer persists after module unload. */ bufs[page_nr].ops = &nosteal_pipe_buf_ops; + bufs[page_nr].flags = 0; ret = add_to_pipe(pipe, &bufs[page_nr++]); if (unlikely(ret < 0)) break; @@ -2025,7 +2030,6 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head) struct fuse_req *req; req = list_entry(head->next, struct fuse_req, list); req->out.h.error = -ECONNABORTED; - clear_bit(FR_PENDING, &req->flags); clear_bit(FR_SENT, &req->flags); list_del_init(&req->list); request_end(fc, req); @@ -2103,6 +2107,8 @@ void fuse_abort_conn(struct fuse_conn *fc) spin_lock(&fiq->waitq.lock); fiq->connected = 0; list_splice_init(&fiq->pending, &to_end2); + list_for_each_entry(req, &to_end2, list) + clear_bit(FR_PENDING, &req->flags); while (forget_pending(fiq)) kfree(dequeue_forget(fiq, 1, NULL)); wake_up_all_locked(&fiq->waitq); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 096f79997f75..811fd8929a18 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -68,7 +68,7 @@ static u64 time_to_jiffies(u64 sec, u32 nsec) if (sec || nsec) { struct timespec64 ts = { sec, - max_t(u32, nsec, NSEC_PER_SEC - 1) + min_t(u32, nsec, NSEC_PER_SEC - 1) }; return get_jiffies_64() + timespec64_to_jiffies(&ts); @@ -1831,7 +1831,6 @@ static const struct inode_operations fuse_common_inode_operations = { static const struct inode_operations fuse_symlink_inode_operations = { .setattr = fuse_setattr, .get_link = fuse_get_link, - .readlink = generic_readlink, .getattr = fuse_getattr, .listxattr = fuse_listxattr, }; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 5a6f52ea2722..6b039d7ce160 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -839,12 +839,10 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode))); kaddr = kmap_atomic(page); memcpy(buf + pos, kaddr + pos, copied); - memset(kaddr + pos + copied, 0, len - copied); flush_dcache_page(page); kunmap_atomic(kaddr); - if (!PageUptodate(page)) - SetPageUptodate(page); + WARN_ON(!PageUptodate(page)); unlock_page(page); put_page(page); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 3cdde5f5d399..79113219be5f 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -62,6 +62,7 @@ #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> #include <linux/vmalloc.h> +#include <linux/bio.h> #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index e23ff70b3435..016c11eaca7c 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -22,7 +22,7 @@ #include <linux/swap.h> #include <linux/crc32.h> #include <linux/writeback.h> -#include <asm/uaccess.h> 
+#include <linux/uaccess.h> #include <linux/dlm.h> #include <linux/dlm_plock.h> #include <linux/delay.h> diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 14cbf60167a7..94f50cac91c6 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -21,7 +21,7 @@ #include <linux/list.h> #include <linux/wait.h> #include <linux/module.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/seq_file.h> #include <linux/debugfs.h> #include <linux/kthread.h> @@ -695,7 +695,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, gl->gl_target = LM_ST_UNLOCKED; gl->gl_demote_state = LM_ST_EXCLUSIVE; gl->gl_ops = glops; - gl->gl_dstamp = ktime_set(0, 0); + gl->gl_dstamp = 0; preempt_disable(); /* We use the global stats to estimate the initial per-glock stats */ gl->gl_stats = this_cpu_ptr(sdp->sd_lkstats)->lkstats[glops->go_type]; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index fe3f84995c48..eb7724b8578a 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -19,7 +19,7 @@ #include <linux/crc32.h> #include <linux/fiemap.h> #include <linux/security.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "gfs2.h" #include "incore.h" @@ -2067,7 +2067,6 @@ const struct inode_operations gfs2_dir_iops = { }; const struct inode_operations gfs2_symlink_iops = { - .readlink = generic_readlink, .get_link = gfs2_get_link, .permission = gfs2_permission, .setattr = gfs2_setattr, diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index e58ccef09c91..27c00a16def0 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -657,7 +657,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) struct gfs2_log_header *lh; unsigned int tail; u32 hash; - int op_flags = WRITE_FLUSH_FUA | REQ_META; + int op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META; struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO); enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); lh = page_address(page); @@ -682,7 +682,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) { gfs2_ordered_wait(sdp); log_flush_wait(sdp); - op_flags = WRITE_SYNC | REQ_META | REQ_PRIO; + op_flags = REQ_SYNC | REQ_META | REQ_PRIO; } sdp->sd_log_idle = (tail == sdp->sd_log_flush_head); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 49d5a1b61b06..b1f9144b42c7 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -231,7 +231,7 @@ static void gfs2_end_log_write(struct bio *bio) * gfs2_log_flush_bio - Submit any pending log bio * @sdp: The superblock * @op: REQ_OP - * @op_flags: rq_flag_bits + * @op_flags: req_flag_bits * * Submit any pending part-built or full bio to the block device. If * there is no pending bio, then this is a no-op. diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 373639a59782..49db8ef13fdf 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -37,8 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb { struct buffer_head *bh, *head; int nr_underway = 0; - int write_flags = REQ_META | REQ_PRIO | - (wbc->sync_mode == WB_SYNC_ALL ? 
WRITE_SYNC : 0); + int write_flags = REQ_META | REQ_PRIO | wbc_to_write_flags(wbc); BUG_ON(!PageLocked(page)); BUG_ON(!page_has_buffers(page)); @@ -285,7 +284,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, } } - gfs2_submit_bhs(REQ_OP_READ, READ_SYNC | REQ_META | REQ_PRIO, bhs, num); + gfs2_submit_bhs(REQ_OP_READ, REQ_META | REQ_PRIO, bhs, num); if (!(flags & DIO_WAIT)) return 0; @@ -453,7 +452,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (buffer_uptodate(first_bh)) goto out; if (!buffer_locked(first_bh)) - ll_rw_block(REQ_OP_READ, READ_SYNC | REQ_META, 1, &first_bh); + ll_rw_block(REQ_OP_READ, REQ_META, 1, &first_bh); dblock++; extlen--; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index ff72ac6439c8..a34308df927f 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -246,7 +246,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) bio->bi_end_io = end_bio_io_page; bio->bi_private = page; - bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC | REQ_META); + bio_set_op_attrs(bio, REQ_OP_READ, REQ_META); submit_bio(bio); wait_on_page_locked(page); bio_put(bio); diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index c9ff1cf7d4f3..f8d30e41d1d3 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -15,7 +15,7 @@ #include <linux/buffer_head.h> #include <linux/module.h> #include <linux/kobject.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/gfs2_ondisk.h> #include <linux/genhd.h> diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index aee4485ad8a9..763d659db91b 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -14,7 +14,7 @@ #include <linux/buffer_head.h> #include <linux/crc32.h> #include <linux/gfs2_ondisk.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index a4a577088d19..d87721aeb575 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -14,7 +14,7 @@ #include <linux/xattr.h> #include <linux/gfs2_ondisk.h> #include <linux/posix_acl_xattr.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "gfs2.h" #include "incore.h" diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 4cdec5a19347..6d0783e2e276 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -23,7 +23,7 @@ #include <linux/workqueue.h> #include <asm/byteorder.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "hfs.h" diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index 99627f8a0a18..0a156d84e67d 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -16,7 +16,7 @@ #include <linux/fs.h> #include <linux/mount.h> #include <linux/sched.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "hfsplus_fs.h" /* diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 11854dd84572..67aedf4c2e7c 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -221,7 +221,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) error2 = hfsplus_submit_bio(sb, sbi->part_start + HFSPLUS_VOLHEAD_SECTOR, sbi->s_vhdr_buf, NULL, REQ_OP_WRITE, - WRITE_SYNC); + REQ_SYNC); if (!error) error = error2; if (!write_backup) @@ -230,7 +230,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) error2 = hfsplus_submit_bio(sb, sbi->part_start + sbi->sect_count - 2, sbi->s_backup_vhdr_buf, NULL, REQ_OP_WRITE, - WRITE_SYNC); + REQ_SYNC); if (!error) error2 = error; out: diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 23e15ea53e45..e61261a7417e 100644 --- 
a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -920,7 +920,6 @@ static const char *hostfs_get_link(struct dentry *dentry, } static const struct inode_operations hostfs_link_iops = { - .readlink = generic_readlink, .get_link = hostfs_get_link, }; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 4fb7b10f3a05..54de77e78775 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -37,7 +37,7 @@ #include <linux/migrate.h> #include <linux/uio.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> static const struct super_operations hugetlbfs_ops; static const struct address_space_operations hugetlbfs_aops; diff --git a/fs/internal.h b/fs/internal.h index f4da3341b4a3..b63cf3af2dc2 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -62,7 +62,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *, extern void *copy_mount_options(const void __user *); extern char *copy_mount_string(const void __user *); -extern struct vfsmount *lookup_mnt(struct path *); +extern struct vfsmount *lookup_mnt(const struct path *); extern int finish_automount(struct vfsmount *, struct path *); extern int sb_prepare_remount_readonly(struct super_block *); @@ -184,3 +184,6 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, struct iomap_ops *ops, void *data, iomap_actor_t actor); + +/* direct-io.c: */ +int sb_init_dio_done_wq(struct super_block *sb); diff --git a/fs/ioctl.c b/fs/ioctl.c index c415668c86d4..cb9b02940805 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -223,7 +223,11 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, if (!src_file.file) return -EBADF; - ret = vfs_clone_file_range(src_file.file, off, dst_file, destoff, olen); + ret = -EXDEV; + if (src_file.file->f_path.mnt != dst_file->f_path.mnt) + goto fdput; + ret = do_clone_file_range(src_file.file, off, dst_file, destoff, olen); +fdput: fdput(src_file); return ret; } diff --git a/fs/iomap.c b/fs/iomap.c index a8ee8c33ca78..a51cb4c07d4d 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -24,6 +24,7 @@ #include <linux/uio.h> #include <linux/backing-dev.h> #include <linux/buffer_head.h> +#include <linux/task_io_accounting_ops.h> #include <linux/dax.h> #include "internal.h" @@ -113,6 +114,9 @@ iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags, BUG_ON(pos + len > iomap->offset + iomap->length); + if (fatal_signal_pending(current)) + return -EINTR; + page = grab_cache_page_write_begin(inode->i_mapping, index, flags); if (!page) return -ENOMEM; @@ -467,8 +471,9 @@ int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, offset = page_offset(page); while (length > 0) { - ret = iomap_apply(inode, offset, length, IOMAP_WRITE, - ops, page, iomap_page_mkwrite_actor); + ret = iomap_apply(inode, offset, length, + IOMAP_WRITE | IOMAP_FAULT, ops, page, + iomap_page_mkwrite_actor); if (unlikely(ret <= 0)) goto out_unlock; offset += ret; @@ -583,3 +588,375 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, return 0; } EXPORT_SYMBOL_GPL(iomap_fiemap); + +/* + * Private flags for iomap_dio, must not overlap with the public ones in + * iomap.h: + */ +#define IOMAP_DIO_WRITE (1 << 30) +#define IOMAP_DIO_DIRTY (1 << 31) + +struct iomap_dio { + struct kiocb *iocb; + iomap_dio_end_io_t *end_io; + loff_t i_size; + loff_t size; + atomic_t ref; + unsigned flags; + int error; + + union { + /* used during submission and for synchronous completion: 
*/ + struct { + struct iov_iter *iter; + struct task_struct *waiter; + struct request_queue *last_queue; + blk_qc_t cookie; + } submit; + + /* used for aio completion: */ + struct { + struct work_struct work; + } aio; + }; +}; + +static ssize_t iomap_dio_complete(struct iomap_dio *dio) +{ + struct kiocb *iocb = dio->iocb; + ssize_t ret; + + if (dio->end_io) { + ret = dio->end_io(iocb, + dio->error ? dio->error : dio->size, + dio->flags); + } else { + ret = dio->error; + } + + if (likely(!ret)) { + ret = dio->size; + /* check for short read */ + if (iocb->ki_pos + ret > dio->i_size && + !(dio->flags & IOMAP_DIO_WRITE)) + ret = dio->i_size - iocb->ki_pos; + iocb->ki_pos += ret; + } + + inode_dio_end(file_inode(iocb->ki_filp)); + kfree(dio); + + return ret; +} + +static void iomap_dio_complete_work(struct work_struct *work) +{ + struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); + struct kiocb *iocb = dio->iocb; + bool is_write = (dio->flags & IOMAP_DIO_WRITE); + ssize_t ret; + + ret = iomap_dio_complete(dio); + if (is_write && ret > 0) + ret = generic_write_sync(iocb, ret); + iocb->ki_complete(iocb, ret, 0); +} + +/* + * Set an error in the dio if none is set yet. We have to use cmpxchg + * as the submission context and the completion context(s) can race to + * update the error. + */ +static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret) +{ + cmpxchg(&dio->error, 0, ret); +} + +static void iomap_dio_bio_end_io(struct bio *bio) +{ + struct iomap_dio *dio = bio->bi_private; + bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); + + if (bio->bi_error) + iomap_dio_set_error(dio, bio->bi_error); + + if (atomic_dec_and_test(&dio->ref)) { + if (is_sync_kiocb(dio->iocb)) { + struct task_struct *waiter = dio->submit.waiter; + + WRITE_ONCE(dio->submit.waiter, NULL); + wake_up_process(waiter); + } else if (dio->flags & IOMAP_DIO_WRITE) { + struct inode *inode = file_inode(dio->iocb->ki_filp); + + INIT_WORK(&dio->aio.work, iomap_dio_complete_work); + queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); + } else { + iomap_dio_complete_work(&dio->aio.work); + } + } + + if (should_dirty) { + bio_check_pages_dirty(bio); + } else { + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) + put_page(bvec->bv_page); + bio_put(bio); + } +} + +static blk_qc_t +iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, + unsigned len) +{ + struct page *page = ZERO_PAGE(0); + struct bio *bio; + + bio = bio_alloc(GFP_KERNEL, 1); + bio->bi_bdev = iomap->bdev; + bio->bi_iter.bi_sector = + iomap->blkno + ((pos - iomap->offset) >> 9); + bio->bi_private = dio; + bio->bi_end_io = iomap_dio_bio_end_io; + + get_page(page); + if (bio_add_page(bio, page, len, 0) != len) + BUG(); + bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE); + + atomic_inc(&dio->ref); + return submit_bio(bio); +} + +static loff_t +iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, + void *data, struct iomap *iomap) +{ + struct iomap_dio *dio = data; + unsigned blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); + unsigned fs_block_size = (1 << inode->i_blkbits), pad; + unsigned align = iov_iter_alignment(dio->submit.iter); + struct iov_iter iter; + struct bio *bio; + bool need_zeroout = false; + int nr_pages, ret; + + if ((pos | length | align) & ((1 << blkbits) - 1)) + return -EINVAL; + + switch (iomap->type) { + case IOMAP_HOLE: + if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE)) + return -EIO; + /*FALLTHRU*/ + case IOMAP_UNWRITTEN: + if (!(dio->flags & 
IOMAP_DIO_WRITE)) { + iov_iter_zero(length, dio->submit.iter); + dio->size += length; + return length; + } + dio->flags |= IOMAP_DIO_UNWRITTEN; + need_zeroout = true; + break; + case IOMAP_MAPPED: + if (iomap->flags & IOMAP_F_SHARED) + dio->flags |= IOMAP_DIO_COW; + if (iomap->flags & IOMAP_F_NEW) + need_zeroout = true; + break; + default: + WARN_ON_ONCE(1); + return -EIO; + } + + /* + * Operate on a partial iter trimmed to the extent we were called for. + * We'll update the iter in the dio once we're done with this extent. + */ + iter = *dio->submit.iter; + iov_iter_truncate(&iter, length); + + nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES); + if (nr_pages <= 0) + return nr_pages; + + if (need_zeroout) { + /* zero out from the start of the block to the write offset */ + pad = pos & (fs_block_size - 1); + if (pad) + iomap_dio_zero(dio, iomap, pos - pad, pad); + } + + do { + if (dio->error) + return 0; + + bio = bio_alloc(GFP_KERNEL, nr_pages); + bio->bi_bdev = iomap->bdev; + bio->bi_iter.bi_sector = + iomap->blkno + ((pos - iomap->offset) >> 9); + bio->bi_private = dio; + bio->bi_end_io = iomap_dio_bio_end_io; + + ret = bio_iov_iter_get_pages(bio, &iter); + if (unlikely(ret)) { + bio_put(bio); + return ret; + } + + if (dio->flags & IOMAP_DIO_WRITE) { + bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE); + task_io_account_write(bio->bi_iter.bi_size); + } else { + bio_set_op_attrs(bio, REQ_OP_READ, 0); + if (dio->flags & IOMAP_DIO_DIRTY) + bio_set_pages_dirty(bio); + } + + dio->size += bio->bi_iter.bi_size; + pos += bio->bi_iter.bi_size; + + nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES); + + atomic_inc(&dio->ref); + + dio->submit.last_queue = bdev_get_queue(iomap->bdev); + dio->submit.cookie = submit_bio(bio); + } while (nr_pages); + + if (need_zeroout) { + /* zero out from the end of the write to the end of the block */ + pad = pos & (fs_block_size - 1); + if (pad) + iomap_dio_zero(dio, iomap, pos, fs_block_size - pad); + } + + iov_iter_advance(dio->submit.iter, length); + return length; +} + +ssize_t +iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops, + iomap_dio_end_io_t end_io) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = file_inode(iocb->ki_filp); + size_t count = iov_iter_count(iter); + loff_t pos = iocb->ki_pos, end = iocb->ki_pos + count - 1, ret = 0; + unsigned int flags = IOMAP_DIRECT; + struct blk_plug plug; + struct iomap_dio *dio; + + lockdep_assert_held(&inode->i_rwsem); + + if (!count) + return 0; + + dio = kmalloc(sizeof(*dio), GFP_KERNEL); + if (!dio) + return -ENOMEM; + + dio->iocb = iocb; + atomic_set(&dio->ref, 1); + dio->size = 0; + dio->i_size = i_size_read(inode); + dio->end_io = end_io; + dio->error = 0; + dio->flags = 0; + + dio->submit.iter = iter; + if (is_sync_kiocb(iocb)) { + dio->submit.waiter = current; + dio->submit.cookie = BLK_QC_T_NONE; + dio->submit.last_queue = NULL; + } + + if (iov_iter_rw(iter) == READ) { + if (pos >= dio->i_size) + goto out_free_dio; + + if (iter->type == ITER_IOVEC) + dio->flags |= IOMAP_DIO_DIRTY; + } else { + dio->flags |= IOMAP_DIO_WRITE; + flags |= IOMAP_WRITE; + } + + if (mapping->nrpages) { + ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end); + if (ret) + goto out_free_dio; + + ret = invalidate_inode_pages2_range(mapping, + iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); + WARN_ON_ONCE(ret); + ret = 0; + } + + inode_dio_begin(inode); + + blk_start_plug(&plug); + do { + ret = iomap_apply(inode, pos, count, flags, ops, dio, + 
iomap_dio_actor); + if (ret <= 0) { + /* magic error code to fall back to buffered I/O */ + if (ret == -ENOTBLK) + ret = 0; + break; + } + pos += ret; + } while ((count = iov_iter_count(iter)) > 0); + blk_finish_plug(&plug); + + if (ret < 0) + iomap_dio_set_error(dio, ret); + + if (ret >= 0 && iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) && + !inode->i_sb->s_dio_done_wq) { + ret = sb_init_dio_done_wq(inode->i_sb); + if (ret < 0) + iomap_dio_set_error(dio, ret); + } + + if (!atomic_dec_and_test(&dio->ref)) { + if (!is_sync_kiocb(iocb)) + return -EIOCBQUEUED; + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(dio->submit.waiter)) + break; + + if (!(iocb->ki_flags & IOCB_HIPRI) || + !dio->submit.last_queue || + !blk_mq_poll(dio->submit.last_queue, + dio->submit.cookie)) + io_schedule(); + } + __set_current_state(TASK_RUNNING); + } + + /* + * Try again to invalidate clean pages which might have been cached by + * non-direct readahead, or faulted in by get_user_pages() if the source + * of the write was an mmap'ed region of the file we're writing. Either + * one is a pretty crazy thing to do, so we don't support it 100%. If + * this invalidation fails, tough, the write still worked... + */ + if (iov_iter_rw(iter) == WRITE && mapping->nrpages) { + ret = invalidate_inode_pages2_range(mapping, + iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); + WARN_ON_ONCE(ret); + } + + return iomap_dio_complete(dio); + +out_free_dio: + kfree(dio); + return ret; +} +EXPORT_SYMBOL_GPL(iomap_dio_rw); diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index 44af14b2e916..9bb2fe35799d 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -18,6 +18,7 @@ #include <linux/module.h> #include <linux/init.h> +#include <linux/bio.h> #include <linux/vmalloc.h> #include <linux/zlib.h> diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 684996c8a3a4..4055f51617ef 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -186,7 +186,7 @@ __flush_batch(journal_t *journal, int *batch_count) blk_start_plug(&plug); for (i = 0; i < *batch_count; i++) - write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE_SYNC); + write_dirty_buffer(journal->j_chkpt_bhs[i], REQ_SYNC); blk_finish_plug(&plug); for (i = 0; i < *batch_count; i++) { diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 31f8ca046639..8c514367ba5a 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -155,9 +155,10 @@ static int journal_submit_commit_record(journal_t *journal, if (journal->j_flags & JBD2_BARRIER && !jbd2_has_feature_async_commit(journal)) - ret = submit_bh(REQ_OP_WRITE, WRITE_SYNC | WRITE_FLUSH_FUA, bh); + ret = submit_bh(REQ_OP_WRITE, + REQ_SYNC | REQ_PREFLUSH | REQ_FUA, bh); else - ret = submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh); + ret = submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); *cbh = bh; return ret; @@ -402,7 +403,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd2_journal_update_sb_log_tail(journal, journal->j_tail_sequence, journal->j_tail, - WRITE_SYNC); + REQ_SYNC); mutex_unlock(&journal->j_checkpoint_mutex); } else { jbd_debug(3, "superblock not updated\n"); @@ -717,7 +718,7 @@ start_journal_io: clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh); + submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); } cond_resched(); stats.run.rs_blocks_logged += bufs; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 927da4956a89..a097048ed1a3 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -47,7 +47,7 @@ 
#define CREATE_TRACE_POINTS #include <trace/events/jbd2.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/page.h> #ifdef CONFIG_JBD2_DEBUG @@ -913,7 +913,7 @@ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) * space and if we lose sb update during power failure we'd replay * old transaction with possibly newly overwritten data. */ - ret = jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA); + ret = jbd2_journal_update_sb_log_tail(journal, tid, block, REQ_FUA); if (ret) goto out; @@ -1306,7 +1306,7 @@ static int journal_reset(journal_t *journal) /* Lock here to make assertions happy... */ mutex_lock(&journal->j_checkpoint_mutex); /* - * Update log tail information. We use WRITE_FUA since new + * Update log tail information. We use REQ_FUA since new * transaction will start reusing journal space and so we * must make sure information about current log tail is on * disk before that. @@ -1314,7 +1314,7 @@ static int journal_reset(journal_t *journal) jbd2_journal_update_sb_log_tail(journal, journal->j_tail_sequence, journal->j_tail, - WRITE_FUA); + REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } return jbd2_journal_start_thread(journal); @@ -1454,7 +1454,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal) sb->s_errno = cpu_to_be32(journal->j_errno); read_unlock(&journal->j_state_lock); - jbd2_write_superblock(journal, WRITE_FUA); + jbd2_write_superblock(journal, REQ_FUA); } EXPORT_SYMBOL(jbd2_journal_update_sb_errno); @@ -1720,7 +1720,8 @@ int jbd2_journal_destroy(journal_t *journal) ++journal->j_transaction_sequence; write_unlock(&journal->j_state_lock); - jbd2_mark_journal_empty(journal, WRITE_FLUSH_FUA); + jbd2_mark_journal_empty(journal, + REQ_PREFLUSH | REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } else err = -EIO; @@ -1979,7 +1980,7 @@ int jbd2_journal_flush(journal_t *journal) * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value. */ - jbd2_mark_journal_empty(journal, WRITE_FUA); + jbd2_mark_journal_empty(journal, REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); write_lock(&journal->j_state_lock); J_ASSERT(!journal->j_running_transaction); @@ -2025,7 +2026,7 @@ int jbd2_journal_wipe(journal_t *journal, int write) if (write) { /* Lock to make assertions happy... 
*/ mutex_lock(&journal->j_checkpoint_mutex); - jbd2_mark_journal_empty(journal, WRITE_FUA); + jbd2_mark_journal_empty(journal, REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 91171dc352cb..cfc38b552118 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -648,7 +648,7 @@ static void flush_descriptor(journal_t *journal, set_buffer_jwrite(descriptor); BUFFER_TRACE(descriptor, "write"); set_buffer_dirty(descriptor); - write_dirty_buffer(descriptor, WRITE_SYNC); + write_dirty_buffer(descriptor, REQ_SYNC); } #endif diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c index 8f3f0855fcd2..d2fa138a868c 100644 --- a/fs/jffs2/symlink.c +++ b/fs/jffs2/symlink.c @@ -13,7 +13,6 @@ const struct inode_operations jffs2_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = simple_get_link, .setattr = jffs2_setattr, .listxattr = jffs2_listxattr, diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index 8653cac7e12e..fc89f9436784 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -13,7 +13,7 @@ #include <linux/sched.h> #include <linux/blkdev.h> #include <asm/current.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "jfs_filsys.h" #include "jfs_debug.h" @@ -121,7 +121,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) jfs_set_inode_flags(inode); inode_unlock(inode); - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); mark_inode_dirty(inode); setflags_out: mnt_drop_write_file(filp); diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c index a37eb5f8cbc0..a70907606025 100644 --- a/fs/jfs/jfs_debug.c +++ b/fs/jfs/jfs_debug.c @@ -22,7 +22,7 @@ #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "jfs_incore.h" #include "jfs_filsys.h" #include "jfs_debug.h" diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index a21ea8b3e5fa..bb1da1feafeb 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2002,7 +2002,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio->bi_end_io = lbmIODone; bio->bi_private = bp; - bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC); + bio->bi_opf = REQ_OP_READ; /*check if journaling to disk has been disabled*/ if (log->no_integrity) { bio->bi_iter.bi_size = 0; @@ -2146,7 +2146,7 @@ static void lbmStartIO(struct lbuf * bp) bio->bi_end_io = lbmIODone; bio->bi_private = bp; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC); + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; /* check if journaling to disk has been disabled */ if (log->no_integrity) { diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 85671f7f8518..2be7c9ce6663 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -31,7 +31,7 @@ #include <linux/exportfs.h> #include <linux/crc32.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/seq_file.h> #include <linux/blkdev.h> diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c index c82404fee6cd..38320607993e 100644 --- a/fs/jfs/symlink.c +++ b/fs/jfs/symlink.c @@ -22,14 +22,12 @@ #include "jfs_xattr.h" const struct inode_operations jfs_fast_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = simple_get_link, .setattr = jfs_setattr, .listxattr = jfs_listxattr, }; const struct inode_operations jfs_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = page_get_link, .setattr = jfs_setattr, .listxattr = jfs_listxattr, diff --git a/fs/kernfs/inode.c 
b/fs/kernfs/inode.c index a1982118f92f..ac9e108ce1ea 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -335,7 +335,7 @@ static int kernfs_xattr_set(const struct xattr_handler *handler, return simple_xattr_set(&attrs->xattrs, name, value, size, flags); } -const struct xattr_handler kernfs_trusted_xattr_handler = { +static const struct xattr_handler kernfs_trusted_xattr_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = kernfs_xattr_get, .set = kernfs_xattr_set, @@ -372,7 +372,7 @@ static int kernfs_security_xattr_set(const struct xattr_handler *handler, return error; } -const struct xattr_handler kernfs_security_xattr_handler = { +static const struct xattr_handler kernfs_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = kernfs_xattr_get, .set = kernfs_security_xattr_set, diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 9b43ca02b7ab..1684af4a8b9b 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -135,7 +135,6 @@ static const char *kernfs_iop_get_link(struct dentry *dentry, const struct inode_operations kernfs_symlink_iops = { .listxattr = kernfs_iop_listxattr, - .readlink = generic_readlink, .get_link = kernfs_iop_get_link, .setattr = kernfs_iop_setattr, .getattr = kernfs_iop_getattr, diff --git a/fs/libfs.c b/fs/libfs.c index 48826d4da189..28d6f35feed6 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -16,7 +16,7 @@ #include <linux/writeback.h> #include <linux/buffer_head.h> /* sync_mapping_buffers */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" @@ -245,7 +245,8 @@ struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name, struct inode *root; struct qstr d_name = QSTR_INIT(name, strlen(name)); - s = sget(fs_type, NULL, set_anon_super, MS_NOUSER, NULL); + s = sget_userns(fs_type, NULL, set_anon_super, MS_KERNMOUNT|MS_NOUSER, + &init_user_ns, NULL); if (IS_ERR(s)) return ERR_CAST(s); @@ -465,6 +466,8 @@ EXPORT_SYMBOL(simple_write_begin); * is not called, so a filesystem that actually does store data in .write_inode * should extend on what's done here with a call to mark_inode_dirty() in the * case that i_size has changed. + * + * Use *ONLY* with simple_readpage() */ int simple_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, @@ -474,14 +477,14 @@ int simple_write_end(struct file *file, struct address_space *mapping, loff_t last_pos = pos + copied; /* zero the stale part of the page if we did a short copy */ - if (copied < len) { - unsigned from = pos & (PAGE_SIZE - 1); - - zero_user(page, from + copied, len - copied); - } + if (!PageUptodate(page)) { + if (copied < len) { + unsigned from = pos & (PAGE_SIZE - 1); - if (!PageUptodate(page)) + zero_user(page, from + copied, len - copied); + } SetPageUptodate(page); + } /* * No need to use i_size_read() here, the i_size * cannot change under us because we hold the i_mutex. 
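The simple_write_end() hunk just above reorders the short-copy handling: the stale tail of the page is zeroed only when the page is not already uptodate, because an uptodate page still holds valid data beyond the copied range and must not be overwritten with zeroes. A minimal sketch of that rule follows; example_write_end() and its reduced signature are illustrative assumptions for this page, not code from the diff.

#include <linux/highmem.h>
#include <linux/pagemap.h>

/*
 * Illustrative sketch of the short-copy rule in the hunk above.  For a
 * RAM-backed page cache there is no older on-disk data to lose, so a
 * short copy into a page that was never uptodate can be patched up by
 * zeroing the uncopied tail and then publishing the page as uptodate.
 */
static int example_write_end(struct page *page, loff_t pos,
			     unsigned len, unsigned copied)
{
	if (!PageUptodate(page)) {
		if (copied < len) {
			unsigned from = pos & (PAGE_SIZE - 1);

			/* zero only the bytes the copy did not reach */
			zero_user(page, from + copied, len - copied);
		}
		SetPageUptodate(page);
	}
	return copied;
}

Filesystems whose pages are backed by real storage cannot take this shortcut: zeroing would publish stale data, so they instead report 0 bytes copied and let generic_perform_write() fault the source buffer in and retry the segment.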
@@ -1129,7 +1132,6 @@ EXPORT_SYMBOL(simple_get_link); const struct inode_operations simple_symlink_inode_operations = { .get_link = simple_get_link, - .readlink = generic_readlink }; EXPORT_SYMBOL(simple_symlink_inode_operations); diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h index 5426189406c1..fb8cac88251a 100644 --- a/fs/lockd/netns.h +++ b/fs/lockd/netns.h @@ -15,6 +15,6 @@ struct lockd_net { struct list_head nsm_handles; }; -extern int lockd_net_id; +extern unsigned int lockd_net_id; #endif diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index fc4084ef4736..1c13dd80744f 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -57,7 +57,7 @@ static struct task_struct *nlmsvc_task; static struct svc_rqst *nlmsvc_rqst; unsigned long nlmsvc_timeout; -int lockd_net_id; +unsigned int lockd_net_id; /* * These can be set at insmod time (useful for NFS as root filesystem), diff --git a/fs/locks.c b/fs/locks.c index 22c5b4aa4961..26811321d39b 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -131,7 +131,7 @@ #define CREATE_TRACE_POINTS #include <trace/events/filelock.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig deleted file mode 100644 index 2b4503163930..000000000000 --- a/fs/logfs/Kconfig +++ /dev/null @@ -1,17 +0,0 @@ -config LOGFS - tristate "LogFS file system" - depends on MTD || (!MTD && BLOCK) - select ZLIB_INFLATE - select ZLIB_DEFLATE - select CRC32 - select BTREE - help - Flash filesystem aimed to scale efficiently to large devices. - In comparison to JFFS2 it offers significantly faster mount - times and potentially less RAM usage, although the latter has - not been measured yet. - - In its current state it is still very experimental and should - not be used for other than testing purposes. - - If unsure, say N. 
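The lockd hunks above (fs/lockd/netns.h and fs/lockd/svc.c) are part of a tree-wide 4.10 change that made per-network-namespace IDs, and struct pernet_operations::id itself, unsigned int rather than int. A minimal sketch of the registration pattern the type change runs through; every example_net* name here is an assumption for illustration, not code from this merge.

#include <linux/list.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

/* Per-net state for a hypothetical subsystem. */
struct example_net {
	struct list_head grace_list;
};

/* IDs handed out via register_pernet_subsys() are unsigned as of 4.10. */
static unsigned int example_net_id;

static int __net_init example_net_init(struct net *net)
{
	struct example_net *en = net_generic(net, example_net_id);

	INIT_LIST_HEAD(&en->grace_list);
	return 0;
}

static void __net_exit example_net_exit(struct net *net)
{
	/* per-net teardown would go here */
}

static struct pernet_operations example_net_ops = {
	.init = example_net_init,
	.exit = example_net_exit,
	.id   = &example_net_id,	/* 'unsigned int *' after this release */
	.size = sizeof(struct example_net),
};

Registering with register_pernet_subsys(&example_net_ops) fills in example_net_id, which is why lockd_net_id and its extern declaration change type in lockstep above.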
diff --git a/fs/logfs/Makefile b/fs/logfs/Makefile deleted file mode 100644 index 4820027787ee..000000000000 --- a/fs/logfs/Makefile +++ /dev/null @@ -1,13 +0,0 @@ -obj-$(CONFIG_LOGFS) += logfs.o - -logfs-y += compr.o -logfs-y += dir.o -logfs-y += file.o -logfs-y += gc.o -logfs-y += inode.o -logfs-y += journal.o -logfs-y += readwrite.o -logfs-y += segment.o -logfs-y += super.o -logfs-$(CONFIG_BLOCK) += dev_bdev.o -logfs-$(CONFIG_MTD) += dev_mtd.o diff --git a/fs/logfs/compr.c b/fs/logfs/compr.c deleted file mode 100644 index 961f02b86d97..000000000000 --- a/fs/logfs/compr.c +++ /dev/null @@ -1,95 +0,0 @@ -/* - * fs/logfs/compr.c - compression routines - * - * As should be obvious for Linux kernel code, license is GPLv2 - * - * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> - */ -#include "logfs.h" -#include <linux/vmalloc.h> -#include <linux/zlib.h> - -#define COMPR_LEVEL 3 - -static DEFINE_MUTEX(compr_mutex); -static struct z_stream_s stream; - -int logfs_compress(void *in, void *out, size_t inlen, size_t outlen) -{ - int err, ret; - - ret = -EIO; - mutex_lock(&compr_mutex); - err = zlib_deflateInit(&stream, COMPR_LEVEL); - if (err != Z_OK) - goto error; - - stream.next_in = in; - stream.avail_in = inlen; - stream.total_in = 0; - stream.next_out = out; - stream.avail_out = outlen; - stream.total_out = 0; - - err = zlib_deflate(&stream, Z_FINISH); - if (err != Z_STREAM_END) - goto error; - - err = zlib_deflateEnd(&stream); - if (err != Z_OK) - goto error; - - if (stream.total_out >= stream.total_in) - goto error; - - ret = stream.total_out; -error: - mutex_unlock(&compr_mutex); - return ret; -} - -int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen) -{ - int err, ret; - - ret = -EIO; - mutex_lock(&compr_mutex); - err = zlib_inflateInit(&stream); - if (err != Z_OK) - goto error; - - stream.next_in = in; - stream.avail_in = inlen; - stream.total_in = 0; - stream.next_out = out; - stream.avail_out = outlen; - stream.total_out = 0; - - err = zlib_inflate(&stream, Z_FINISH); - if (err != Z_STREAM_END) - goto error; - - err = zlib_inflateEnd(&stream); - if (err != Z_OK) - goto error; - - ret = 0; -error: - mutex_unlock(&compr_mutex); - return ret; -} - -int __init logfs_compr_init(void) -{ - size_t size = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), - zlib_inflate_workspacesize()); - stream.workspace = vmalloc(size); - if (!stream.workspace) - return -ENOMEM; - return 0; -} - -void logfs_compr_exit(void) -{ - vfree(stream.workspace); -} diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c deleted file mode 100644 index a8329cc47dec..000000000000 --- a/fs/logfs/dev_bdev.c +++ /dev/null @@ -1,322 +0,0 @@ -/* - * fs/logfs/dev_bdev.c - Device access methods for block devices - * - * As should be obvious for Linux kernel code, license is GPLv2 - * - * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> - */ -#include "logfs.h" -#include <linux/bio.h> -#include <linux/blkdev.h> -#include <linux/buffer_head.h> -#include <linux/gfp.h> -#include <linux/prefetch.h> - -#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) - -static int sync_request(struct page *page, struct block_device *bdev, int op) -{ - struct bio bio; - struct bio_vec bio_vec; - - bio_init(&bio); - bio.bi_max_vecs = 1; - bio.bi_io_vec = &bio_vec; - bio_vec.bv_page = page; - bio_vec.bv_len = PAGE_SIZE; - bio_vec.bv_offset = 0; - bio.bi_vcnt = 1; - bio.bi_bdev = bdev; - bio.bi_iter.bi_sector = page->index * (PAGE_SIZE >> 9); - bio.bi_iter.bi_size = PAGE_SIZE; - bio_set_op_attrs(&bio, op, 0); - - return 
submit_bio_wait(&bio); -} - -static int bdev_readpage(void *_sb, struct page *page) -{ - struct super_block *sb = _sb; - struct block_device *bdev = logfs_super(sb)->s_bdev; - int err; - - err = sync_request(page, bdev, READ); - if (err) { - ClearPageUptodate(page); - SetPageError(page); - } else { - SetPageUptodate(page); - ClearPageError(page); - } - unlock_page(page); - return err; -} - -static DECLARE_WAIT_QUEUE_HEAD(wq); - -static void writeseg_end_io(struct bio *bio) -{ - struct bio_vec *bvec; - int i; - struct super_block *sb = bio->bi_private; - struct logfs_super *super = logfs_super(sb); - - BUG_ON(bio->bi_error); /* FIXME: Retry io or write elsewhere */ - - bio_for_each_segment_all(bvec, bio, i) { - end_page_writeback(bvec->bv_page); - put_page(bvec->bv_page); - } - bio_put(bio); - if (atomic_dec_and_test(&super->s_pending_writes)) - wake_up(&wq); -} - -static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, - size_t nr_pages) -{ - struct logfs_super *super = logfs_super(sb); - struct address_space *mapping = super->s_mapping_inode->i_mapping; - struct bio *bio; - struct page *page; - unsigned int max_pages; - int i; - - max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES); - - bio = bio_alloc(GFP_NOFS, max_pages); - BUG_ON(!bio); - - for (i = 0; i < nr_pages; i++) { - if (i >= max_pages) { - /* Block layer cannot split bios :( */ - bio->bi_vcnt = i; - bio->bi_iter.bi_size = i * PAGE_SIZE; - bio->bi_bdev = super->s_bdev; - bio->bi_iter.bi_sector = ofs >> 9; - bio->bi_private = sb; - bio->bi_end_io = writeseg_end_io; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - atomic_inc(&super->s_pending_writes); - submit_bio(bio); - - ofs += i * PAGE_SIZE; - index += i; - nr_pages -= i; - i = 0; - - bio = bio_alloc(GFP_NOFS, max_pages); - BUG_ON(!bio); - } - page = find_lock_page(mapping, index + i); - BUG_ON(!page); - bio->bi_io_vec[i].bv_page = page; - bio->bi_io_vec[i].bv_len = PAGE_SIZE; - bio->bi_io_vec[i].bv_offset = 0; - - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - unlock_page(page); - } - bio->bi_vcnt = nr_pages; - bio->bi_iter.bi_size = nr_pages * PAGE_SIZE; - bio->bi_bdev = super->s_bdev; - bio->bi_iter.bi_sector = ofs >> 9; - bio->bi_private = sb; - bio->bi_end_io = writeseg_end_io; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - atomic_inc(&super->s_pending_writes); - submit_bio(bio); - return 0; -} - -static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len) -{ - struct logfs_super *super = logfs_super(sb); - int head; - - BUG_ON(super->s_flags & LOGFS_SB_FLAG_RO); - - if (len == 0) { - /* This can happen when the object fit perfectly into a - * segment, the segment gets written per sync and subsequently - * closed. 
- */ - return; - } - head = ofs & (PAGE_SIZE - 1); - if (head) { - ofs -= head; - len += head; - } - len = PAGE_ALIGN(len); - __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); -} - - -static void erase_end_io(struct bio *bio) -{ - struct super_block *sb = bio->bi_private; - struct logfs_super *super = logfs_super(sb); - - BUG_ON(bio->bi_error); /* FIXME: Retry io or write elsewhere */ - BUG_ON(bio->bi_vcnt == 0); - bio_put(bio); - if (atomic_dec_and_test(&super->s_pending_writes)) - wake_up(&wq); -} - -static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, - size_t nr_pages) -{ - struct logfs_super *super = logfs_super(sb); - struct bio *bio; - unsigned int max_pages; - int i; - - max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES); - - bio = bio_alloc(GFP_NOFS, max_pages); - BUG_ON(!bio); - - for (i = 0; i < nr_pages; i++) { - if (i >= max_pages) { - /* Block layer cannot split bios :( */ - bio->bi_vcnt = i; - bio->bi_iter.bi_size = i * PAGE_SIZE; - bio->bi_bdev = super->s_bdev; - bio->bi_iter.bi_sector = ofs >> 9; - bio->bi_private = sb; - bio->bi_end_io = erase_end_io; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - atomic_inc(&super->s_pending_writes); - submit_bio(bio); - - ofs += i * PAGE_SIZE; - index += i; - nr_pages -= i; - i = 0; - - bio = bio_alloc(GFP_NOFS, max_pages); - BUG_ON(!bio); - } - bio->bi_io_vec[i].bv_page = super->s_erase_page; - bio->bi_io_vec[i].bv_len = PAGE_SIZE; - bio->bi_io_vec[i].bv_offset = 0; - } - bio->bi_vcnt = nr_pages; - bio->bi_iter.bi_size = nr_pages * PAGE_SIZE; - bio->bi_bdev = super->s_bdev; - bio->bi_iter.bi_sector = ofs >> 9; - bio->bi_private = sb; - bio->bi_end_io = erase_end_io; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - atomic_inc(&super->s_pending_writes); - submit_bio(bio); - return 0; -} - -static int bdev_erase(struct super_block *sb, loff_t to, size_t len, - int ensure_write) -{ - struct logfs_super *super = logfs_super(sb); - - BUG_ON(to & (PAGE_SIZE - 1)); - BUG_ON(len & (PAGE_SIZE - 1)); - - if (super->s_flags & LOGFS_SB_FLAG_RO) - return -EROFS; - - if (ensure_write) { - /* - * Object store doesn't care whether erases happen or not. - * But for the journal they are required. Otherwise a scan - * can find an old commit entry and assume it is the current - * one, travelling back in time. - */ - do_erase(sb, to, to >> PAGE_SHIFT, len >> PAGE_SHIFT); - } - - return 0; -} - -static void bdev_sync(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - - wait_event(wq, atomic_read(&super->s_pending_writes) == 0); -} - -static struct page *bdev_find_first_sb(struct super_block *sb, u64 *ofs) -{ - struct logfs_super *super = logfs_super(sb); - struct address_space *mapping = super->s_mapping_inode->i_mapping; - filler_t *filler = bdev_readpage; - - *ofs = 0; - return read_cache_page(mapping, 0, filler, sb); -} - -static struct page *bdev_find_last_sb(struct super_block *sb, u64 *ofs) -{ - struct logfs_super *super = logfs_super(sb); - struct address_space *mapping = super->s_mapping_inode->i_mapping; - filler_t *filler = bdev_readpage; - u64 pos = (super->s_bdev->bd_inode->i_size & ~0xfffULL) - 0x1000; - pgoff_t index = pos >> PAGE_SHIFT; - - *ofs = pos; - return read_cache_page(mapping, index, filler, sb); -} - -static int bdev_write_sb(struct super_block *sb, struct page *page) -{ - struct block_device *bdev = logfs_super(sb)->s_bdev; - - /* Nothing special to do for block devices. 
*/ - return sync_request(page, bdev, WRITE); -} - -static void bdev_put_device(struct logfs_super *s) -{ - blkdev_put(s->s_bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); -} - -static int bdev_can_write_buf(struct super_block *sb, u64 ofs) -{ - return 0; -} - -static const struct logfs_device_ops bd_devops = { - .find_first_sb = bdev_find_first_sb, - .find_last_sb = bdev_find_last_sb, - .write_sb = bdev_write_sb, - .readpage = bdev_readpage, - .writeseg = bdev_writeseg, - .erase = bdev_erase, - .can_write_buf = bdev_can_write_buf, - .sync = bdev_sync, - .put_device = bdev_put_device, -}; - -int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type, - const char *devname) -{ - struct block_device *bdev; - - bdev = blkdev_get_by_path(devname, FMODE_READ|FMODE_WRITE|FMODE_EXCL, - type); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); - - if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) { - int mtdnr = MINOR(bdev->bd_dev); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); - return logfs_get_sb_mtd(p, mtdnr); - } - - p->s_bdev = bdev; - p->s_mtd = NULL; - p->s_devops = &bd_devops; - return 0; -} diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c deleted file mode 100644 index b76a62b1978f..000000000000 --- a/fs/logfs/dev_mtd.c +++ /dev/null @@ -1,274 +0,0 @@ -/* - * fs/logfs/dev_mtd.c - Device access methods for MTD - * - * As should be obvious for Linux kernel code, license is GPLv2 - * - * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> - */ -#include "logfs.h" -#include <linux/completion.h> -#include <linux/mount.h> -#include <linux/sched.h> -#include <linux/slab.h> - -#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) - -static int logfs_mtd_read(struct super_block *sb, loff_t ofs, size_t len, - void *buf) -{ - struct mtd_info *mtd = logfs_super(sb)->s_mtd; - size_t retlen; - int ret; - - ret = mtd_read(mtd, ofs, len, &retlen, buf); - BUG_ON(ret == -EINVAL); - if (ret) - return ret; - - /* Not sure if we should loop instead. */ - if (retlen != len) - return -EIO; - - return 0; -} - -static int loffs_mtd_write(struct super_block *sb, loff_t ofs, size_t len, - void *buf) -{ - struct logfs_super *super = logfs_super(sb); - struct mtd_info *mtd = super->s_mtd; - size_t retlen; - loff_t page_start, page_end; - int ret; - - if (super->s_flags & LOGFS_SB_FLAG_RO) - return -EROFS; - - BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs)); - BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift); - BUG_ON(len > PAGE_SIZE); - page_start = ofs & PAGE_MASK; - page_end = PAGE_ALIGN(ofs + len) - 1; - ret = mtd_write(mtd, ofs, len, &retlen, buf); - if (ret || (retlen != len)) - return -EIO; - - return 0; -} - -/* - * For as long as I can remember (since about 2001) mtd->erase has been an - * asynchronous interface lacking the first driver to actually use the - * asynchronous properties. So just to prevent the first implementor of such - * a thing from breaking logfs in 2350, we do the usual pointless dance to - * declare a completion variable and wait for completion before returning - * from logfs_mtd_erase(). What an exercise in futility! 
- */ -static void logfs_erase_callback(struct erase_info *ei) -{ - complete((struct completion *)ei->priv); -} - -static int logfs_mtd_erase_mapping(struct super_block *sb, loff_t ofs, - size_t len) -{ - struct logfs_super *super = logfs_super(sb); - struct address_space *mapping = super->s_mapping_inode->i_mapping; - struct page *page; - pgoff_t index = ofs >> PAGE_SHIFT; - - for (index = ofs >> PAGE_SHIFT; index < (ofs + len) >> PAGE_SHIFT; index++) { - page = find_get_page(mapping, index); - if (!page) - continue; - memset(page_address(page), 0xFF, PAGE_SIZE); - put_page(page); - } - return 0; -} - -static int logfs_mtd_erase(struct super_block *sb, loff_t ofs, size_t len, - int ensure_write) -{ - struct mtd_info *mtd = logfs_super(sb)->s_mtd; - struct erase_info ei; - DECLARE_COMPLETION_ONSTACK(complete); - int ret; - - BUG_ON(len % mtd->erasesize); - if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO) - return -EROFS; - - memset(&ei, 0, sizeof(ei)); - ei.mtd = mtd; - ei.addr = ofs; - ei.len = len; - ei.callback = logfs_erase_callback; - ei.priv = (long)&complete; - ret = mtd_erase(mtd, &ei); - if (ret) - return -EIO; - - wait_for_completion(&complete); - if (ei.state != MTD_ERASE_DONE) - return -EIO; - return logfs_mtd_erase_mapping(sb, ofs, len); -} - -static void logfs_mtd_sync(struct super_block *sb) -{ - struct mtd_info *mtd = logfs_super(sb)->s_mtd; - - mtd_sync(mtd); -} - -static int logfs_mtd_readpage(void *_sb, struct page *page) -{ - struct super_block *sb = _sb; - int err; - - err = logfs_mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE, - page_address(page)); - if (err == -EUCLEAN || err == -EBADMSG) { - /* -EBADMSG happens regularly on power failures */ - err = 0; - /* FIXME: force GC this segment */ - } - if (err) { - ClearPageUptodate(page); - SetPageError(page); - } else { - SetPageUptodate(page); - ClearPageError(page); - } - unlock_page(page); - return err; -} - -static struct page *logfs_mtd_find_first_sb(struct super_block *sb, u64 *ofs) -{ - struct logfs_super *super = logfs_super(sb); - struct address_space *mapping = super->s_mapping_inode->i_mapping; - filler_t *filler = logfs_mtd_readpage; - struct mtd_info *mtd = super->s_mtd; - - *ofs = 0; - while (mtd_block_isbad(mtd, *ofs)) { - *ofs += mtd->erasesize; - if (*ofs >= mtd->size) - return NULL; - } - BUG_ON(*ofs & ~PAGE_MASK); - return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); -} - -static struct page *logfs_mtd_find_last_sb(struct super_block *sb, u64 *ofs) -{ - struct logfs_super *super = logfs_super(sb); - struct address_space *mapping = super->s_mapping_inode->i_mapping; - filler_t *filler = logfs_mtd_readpage; - struct mtd_info *mtd = super->s_mtd; - - *ofs = mtd->size - mtd->erasesize; - while (mtd_block_isbad(mtd, *ofs)) { - *ofs -= mtd->erasesize; - if (*ofs <= 0) - return NULL; - } - *ofs = *ofs + mtd->erasesize - 0x1000; - BUG_ON(*ofs & ~PAGE_MASK); - return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); -} - -static int __logfs_mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, - size_t nr_pages) -{ - struct logfs_super *super = logfs_super(sb); - struct address_space *mapping = super->s_mapping_inode->i_mapping; - struct page *page; - int i, err; - - for (i = 0; i < nr_pages; i++) { - page = find_lock_page(mapping, index + i); - BUG_ON(!page); - - err = loffs_mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE, - page_address(page)); - unlock_page(page); - put_page(page); - if (err) - return err; - } - return 0; -} - -static void logfs_mtd_writeseg(struct 
super_block *sb, u64 ofs, size_t len) -{ - struct logfs_super *super = logfs_super(sb); - int head; - - if (super->s_flags & LOGFS_SB_FLAG_RO) - return; - - if (len == 0) { - /* This can happen when the object fit perfectly into a - * segment, the segment gets written per sync and subsequently - * closed. - */ - return; - } - head = ofs & (PAGE_SIZE - 1); - if (head) { - ofs -= head; - len += head; - } - len = PAGE_ALIGN(len); - __logfs_mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); -} - -static void logfs_mtd_put_device(struct logfs_super *s) -{ - put_mtd_device(s->s_mtd); -} - -static int logfs_mtd_can_write_buf(struct super_block *sb, u64 ofs) -{ - struct logfs_super *super = logfs_super(sb); - void *buf; - int err; - - buf = kmalloc(super->s_writesize, GFP_KERNEL); - if (!buf) - return -ENOMEM; - err = logfs_mtd_read(sb, ofs, super->s_writesize, buf); - if (err) - goto out; - if (memchr_inv(buf, 0xff, super->s_writesize)) - err = -EIO; - kfree(buf); -out: - return err; -} - -static const struct logfs_device_ops mtd_devops = { - .find_first_sb = logfs_mtd_find_first_sb, - .find_last_sb = logfs_mtd_find_last_sb, - .readpage = logfs_mtd_readpage, - .writeseg = logfs_mtd_writeseg, - .erase = logfs_mtd_erase, - .can_write_buf = logfs_mtd_can_write_buf, - .sync = logfs_mtd_sync, - .put_device = logfs_mtd_put_device, -}; - -int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) -{ - struct mtd_info *mtd = get_mtd_device(NULL, mtdnr); - if (IS_ERR(mtd)) - return PTR_ERR(mtd); - - s->s_bdev = NULL; - s->s_mtd = mtd; - s->s_devops = &mtd_devops; - return 0; -} diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c deleted file mode 100644 index c87ea52de3d9..000000000000 --- a/fs/logfs/dir.c +++ /dev/null @@ -1,801 +0,0 @@ -/* - * fs/logfs/dir.c - directory-related code - * - * As should be obvious for Linux kernel code, license is GPLv2 - * - * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> - */ -#include "logfs.h" -#include <linux/slab.h> - -/* - * Atomic dir operations - * - * Directory operations are by default not atomic. Dentries and Inodes are - * created/removed/altered in separate operations. Therefore we need to do - * a small amount of journaling. - * - * Create, link, mkdir, mknod and symlink all share the same function to do - * the work: __logfs_create. This function works in two atomic steps: - * 1. allocate inode (remember in journal) - * 2. allocate dentry (clear journal) - * - * As we can only get interrupted between the two, the inode we just - * created is simply stored in the anchor. On next mount, if we were - * interrupted, we delete the inode. From a user's point of view the - * operation never happened. - * - * Unlink and rmdir also share the same function: unlink. Again, this - * function works in two atomic steps - * 1. remove dentry (remember inode in journal) - * 2. unlink inode (clear journal) - * - * And again, on the next mount, if we were interrupted, we delete the inode. - * From a user's point of view the operation succeeded. - * - * Rename is the real pain to deal with, harder than all the other methods - * combined. Depending on the circumstances we can run into three cases. - * A "target rename" where the target dentry already existed, a "local - * rename" where both parent directories are identical or a "cross-directory - * rename" in the remaining case. - * - * Local rename is atomic, as the old dentry is simply rewritten with a new - * name. - * - * Cross-directory rename works in two steps, similar to __logfs_create and - * logfs_unlink: - * 1. Write new dentry (remember old dentry in journal) - * 2. Remove old dentry (clear journal) - * - * Here we remember a dentry instead of an inode. On next mount, if we were - * interrupted, we delete the dentry. From a user's point of view, the - * operation succeeded. - * - * Target rename works in three atomic steps: - * 1. Attach old inode to new dentry (remember old dentry and new inode) - * 2. Remove old dentry (still remember the new inode) - * 3. Remove victim inode - * - * Here we remember both an inode and a dentry. If we get interrupted - * between steps 1 and 2, we delete both the dentry and the inode. If - * we get interrupted between steps 2 and 3, we delete just the inode. - * In either case, the remaining objects are deleted on next mount. From - * a user's point of view, the operation succeeded. - */ - -static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd, - loff_t pos) -{ - return logfs_inode_write(dir, dd, sizeof(*dd), pos, WF_LOCK, NULL); -} - -static int write_inode(struct inode *inode) -{ - return __logfs_write_inode(inode, NULL, WF_LOCK); -} - -static s64 dir_seek_data(struct inode *inode, s64 pos) -{ - s64 new_pos = logfs_seek_data(inode, pos); - - return max(pos, new_pos - 1); -} - -static int beyond_eof(struct inode *inode, loff_t bix) -{ - loff_t pos = bix << inode->i_sb->s_blocksize_bits; - return pos >= i_size_read(inode); -} - -/* - * Prime value was chosen to be roughly 256 + 26. r5 hash uses 11, - * so short names (len <= 9) don't even occupy the complete 32bit name - * space. A prime >256 ensures short names quickly spread the 32bit - * name space. Add about 26 for the estimated amount of information - * of each character and pick a prime nearby, preferably a bit-sparse - * one. - */ -static u32 logfs_hash_32(const char *s, int len, u32 seed) -{ - u32 hash = seed; - int i; - - for (i = 0; i < len; i++) - hash = hash * 293 + s[i]; - return hash; -} - -/* - * We have to satisfy several conflicting requirements here. Small - * directories should stay fairly compact and not require too many - * indirect blocks. The number of possible locations for a given hash - * should be small to make lookup() fast. And we should try hard not - * to overflow the 32bit name space or nfs and 32bit host systems will - * be unhappy. - * - * So we use the following scheme. First we reduce the hash to 0..15 - * and try a direct block. If that is occupied we reduce the hash to - * 16..255 and try an indirect block. Same for 2x and 3x indirect - * blocks. Lastly we reduce the hash to 0x800_0000 .. 0xffff_ffff, - * but use buckets containing eight entries instead of a single one. - * - * Using 16 entries should allow for a reasonable amount of hash - * collisions, so the 32bit name space can be packed fairly tight - * before overflowing. Oh and currently we don't overflow but return - * an error. - * - * How likely are collisions? Doing the appropriate math is beyond me - * and the Bronstein textbook. But running a test program to brute - * force collisions for a couple of days showed that on average the - * first collision occurs after 598M entries, with 290M being the - * smallest result. Obviously 21 entries could already cause a - * collision if all entries are carefully chosen.
- */ -static pgoff_t hash_index(u32 hash, int round) -{ - u32 i0_blocks = I0_BLOCKS; - u32 i1_blocks = I1_BLOCKS; - u32 i2_blocks = I2_BLOCKS; - u32 i3_blocks = I3_BLOCKS; - - switch (round) { - case 0: - return hash % i0_blocks; - case 1: - return i0_blocks + hash % (i1_blocks - i0_blocks); - case 2: - return i1_blocks + hash % (i2_blocks - i1_blocks); - case 3: - return i2_blocks + hash % (i3_blocks - i2_blocks); - case 4 ... 19: - return i3_blocks + 16 * (hash % (((1<<31) - i3_blocks) / 16)) - + round - 4; - } - BUG(); -} - -static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry) -{ - const struct qstr *name = &dentry->d_name; - struct page *page; - struct logfs_disk_dentry *dd; - u32 hash = logfs_hash_32(name->name, name->len, 0); - pgoff_t index; - int round; - - if (name->len > LOGFS_MAX_NAMELEN) - return ERR_PTR(-ENAMETOOLONG); - - for (round = 0; round < 20; round++) { - index = hash_index(hash, round); - - if (beyond_eof(dir, index)) - return NULL; - if (!logfs_exist_block(dir, index)) - continue; - page = read_cache_page(dir->i_mapping, index, - (filler_t *)logfs_readpage, NULL); - if (IS_ERR(page)) - return page; - dd = kmap_atomic(page); - BUG_ON(dd->namelen == 0); - - if (name->len != be16_to_cpu(dd->namelen) || - memcmp(name->name, dd->name, name->len)) { - kunmap_atomic(dd); - put_page(page); - continue; - } - - kunmap_atomic(dd); - return page; - } - return NULL; -} - -static int logfs_remove_inode(struct inode *inode) -{ - int ret; - - drop_nlink(inode); - ret = write_inode(inode); - LOGFS_BUG_ON(ret, inode->i_sb); - return ret; -} - -static void abort_transaction(struct inode *inode, struct logfs_transaction *ta) -{ - if (logfs_inode(inode)->li_block) - logfs_inode(inode)->li_block->ta = NULL; - kfree(ta); -} - -static int logfs_unlink(struct inode *dir, struct dentry *dentry) -{ - struct logfs_super *super = logfs_super(dir->i_sb); - struct inode *inode = d_inode(dentry); - struct logfs_transaction *ta; - struct page *page; - pgoff_t index; - int ret; - - ta = kzalloc(sizeof(*ta), GFP_KERNEL); - if (!ta) - return -ENOMEM; - - ta->state = UNLINK_1; - ta->ino = inode->i_ino; - - inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); - - page = logfs_get_dd_page(dir, dentry); - if (!page) { - kfree(ta); - return -ENOENT; - } - if (IS_ERR(page)) { - kfree(ta); - return PTR_ERR(page); - } - index = page->index; - put_page(page); - - mutex_lock(&super->s_dirop_mutex); - logfs_add_transaction(dir, ta); - - ret = logfs_delete(dir, index, NULL); - if (!ret) - ret = write_inode(dir); - - if (ret) { - abort_transaction(dir, ta); - printk(KERN_ERR"LOGFS: unable to delete inode\n"); - goto out; - } - - ta->state = UNLINK_2; - logfs_add_transaction(inode, ta); - ret = logfs_remove_inode(inode); -out: - mutex_unlock(&super->s_dirop_mutex); - return ret; -} - -static inline int logfs_empty_dir(struct inode *dir) -{ - u64 data; - - data = logfs_seek_data(dir, 0) << dir->i_sb->s_blocksize_bits; - return data >= i_size_read(dir); -} - -static int logfs_rmdir(struct inode *dir, struct dentry *dentry) -{ - struct inode *inode = d_inode(dentry); - - if (!logfs_empty_dir(inode)) - return -ENOTEMPTY; - - return logfs_unlink(dir, dentry); -} - -/* FIXME: readdir currently has its own dir_walk code. I don't see a good - * way to combine the two copies */ -static int logfs_readdir(struct file *file, struct dir_context *ctx) -{ - struct inode *dir = file_inode(file); - loff_t pos; - struct page *page; - struct logfs_disk_dentry *dd; - - if (ctx->pos < 0) - return -EINVAL; - - if (!dir_emit_dots(file, ctx)) - return 0; - - pos = ctx->pos - 2; - BUG_ON(pos < 0); - for (;; pos++, ctx->pos++) { - bool full; - if (beyond_eof(dir, pos)) - break; - if (!logfs_exist_block(dir, pos)) { - /* deleted dentry */ - pos = dir_seek_data(dir, pos); - continue; - } - page = read_cache_page(dir->i_mapping, pos, - (filler_t *)logfs_readpage, NULL); - if (IS_ERR(page)) - return PTR_ERR(page); - dd = kmap(page); - BUG_ON(dd->namelen == 0); - - full = !dir_emit(ctx, (char *)dd->name, - be16_to_cpu(dd->namelen), - be64_to_cpu(dd->ino), dd->type); - kunmap(page); - put_page(page); - if (full) - break; - } - return 0; -} - -static void logfs_set_name(struct logfs_disk_dentry *dd, const struct qstr *name) -{ - dd->namelen = cpu_to_be16(name->len); - memcpy(dd->name, name->name, name->len); -} - -static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags) -{ - struct page *page; - struct logfs_disk_dentry *dd; - pgoff_t index; - u64 ino = 0; - struct inode *inode; - - page = logfs_get_dd_page(dir, dentry); - if (IS_ERR(page)) - return ERR_CAST(page); - if (!page) { - d_add(dentry, NULL); - return NULL; - } - index = page->index; - dd = kmap_atomic(page); - ino = be64_to_cpu(dd->ino); - kunmap_atomic(dd); - put_page(page); - - inode = logfs_iget(dir->i_sb, ino); - if (IS_ERR(inode)) - printk(KERN_ERR"LogFS: Cannot read inode #%llx for dentry (%lx, %lx)\n", - ino, dir->i_ino, index); - return d_splice_alias(inode, dentry); -} - -static void grow_dir(struct inode *dir, loff_t index) -{ - index = (index + 1) << dir->i_sb->s_blocksize_bits; - if (i_size_read(dir) < index) - i_size_write(dir, index); -} - -static int logfs_write_dir(struct inode *dir, struct dentry *dentry, - struct inode *inode) -{ - struct page *page; - struct logfs_disk_dentry *dd; - u32 hash = logfs_hash_32(dentry->d_name.name, dentry->d_name.len, 0); - pgoff_t index; - int round, err; - - for (round = 0; round < 20; round++) { - index = hash_index(hash, round); - - if (logfs_exist_block(dir, index)) - continue; - page = find_or_create_page(dir->i_mapping, index, GFP_KERNEL); - if (!page) - return -ENOMEM; - - dd = kmap_atomic(page); - memset(dd, 0, sizeof(*dd)); - dd->ino = cpu_to_be64(inode->i_ino); - dd->type = logfs_type(inode); - logfs_set_name(dd, &dentry->d_name); - kunmap_atomic(dd); - - err = logfs_write_buf(dir, page, WF_LOCK); - unlock_page(page); - put_page(page); - if (!err) - grow_dir(dir, index); - return err; - } - /* FIXME: Is there a better return value? In most cases neither - * the filesystem nor the directory are full. But we have had - * too many collisions for this particular hash and no fallback.
- */ - return -ENOSPC; -} - -static int __logfs_create(struct inode *dir, struct dentry *dentry, - struct inode *inode, const char *dest, long destlen) -{ - struct logfs_super *super = logfs_super(dir->i_sb); - struct logfs_inode *li = logfs_inode(inode); - struct logfs_transaction *ta; - int ret; - - ta = kzalloc(sizeof(*ta), GFP_KERNEL); - if (!ta) { - drop_nlink(inode); - iput(inode); - return -ENOMEM; - } - - ta->state = CREATE_1; - ta->ino = inode->i_ino; - mutex_lock(&super->s_dirop_mutex); - logfs_add_transaction(inode, ta); - - if (dest) { - /* symlink */ - ret = logfs_inode_write(inode, dest, destlen, 0, WF_LOCK, NULL); - if (!ret) - ret = write_inode(inode); - } else { - /* creat/mkdir/mknod */ - ret = write_inode(inode); - } - if (ret) { - abort_transaction(inode, ta); - li->li_flags |= LOGFS_IF_STILLBORN; - /* FIXME: truncate symlink */ - drop_nlink(inode); - iput(inode); - goto out; - } - - ta->state = CREATE_2; - logfs_add_transaction(dir, ta); - ret = logfs_write_dir(dir, dentry, inode); - /* sync directory */ - if (!ret) - ret = write_inode(dir); - - if (ret) { - logfs_del_transaction(dir, ta); - ta->state = CREATE_2; - logfs_add_transaction(inode, ta); - logfs_remove_inode(inode); - iput(inode); - goto out; - } - d_instantiate(dentry, inode); -out: - mutex_unlock(&super->s_dirop_mutex); - return ret; -} - -static int logfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) -{ - struct inode *inode; - - /* - * FIXME: why do we have to fill in S_IFDIR, while the mode is - * correct for mknod, creat, etc.? Smells like the vfs *should* - * do it for us but for some reason fails to do so. - */ - inode = logfs_new_inode(dir, S_IFDIR | mode); - if (IS_ERR(inode)) - return PTR_ERR(inode); - - inode->i_op = &logfs_dir_iops; - inode->i_fop = &logfs_dir_fops; - - return __logfs_create(dir, dentry, inode, NULL, 0); -} - -static int logfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) -{ - struct inode *inode; - - inode = logfs_new_inode(dir, mode); - if (IS_ERR(inode)) - return PTR_ERR(inode); - - inode->i_op = &logfs_reg_iops; - inode->i_fop = &logfs_reg_fops; - inode->i_mapping->a_ops = &logfs_reg_aops; - - return __logfs_create(dir, dentry, inode, NULL, 0); -} - -static int logfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t rdev) -{ - struct inode *inode; - - if (dentry->d_name.len > LOGFS_MAX_NAMELEN) - return -ENAMETOOLONG; - - inode = logfs_new_inode(dir, mode); - if (IS_ERR(inode)) - return PTR_ERR(inode); - - init_special_inode(inode, mode, rdev); - - return __logfs_create(dir, dentry, inode, NULL, 0); -} - -static int logfs_symlink(struct inode *dir, struct dentry *dentry, - const char *target) -{ - struct inode *inode; - size_t destlen = strlen(target) + 1; - - if (destlen > dir->i_sb->s_blocksize) - return -ENAMETOOLONG; - - inode = logfs_new_inode(dir, S_IFLNK | 0777); - if (IS_ERR(inode)) - return PTR_ERR(inode); - - inode->i_op = &page_symlink_inode_operations; - inode_nohighmem(inode); - inode->i_mapping->a_ops = &logfs_reg_aops; - - return __logfs_create(dir, dentry, inode, target, destlen); -} - -static int logfs_link(struct dentry *old_dentry, struct inode *dir, - struct dentry *dentry) -{ - struct inode *inode = d_inode(old_dentry); - - inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); - ihold(inode); - inc_nlink(inode); - mark_inode_dirty_sync(inode); - - return __logfs_create(dir, dentry, inode, NULL, 0); -} - -static int logfs_get_dd(struct inode *dir, struct dentry *dentry, - 
struct logfs_disk_dentry *dd, loff_t *pos) -{ - struct page *page; - void *map; - - page = logfs_get_dd_page(dir, dentry); - if (IS_ERR(page)) - return PTR_ERR(page); - *pos = page->index; - map = kmap_atomic(page); - memcpy(dd, map, sizeof(*dd)); - kunmap_atomic(map); - put_page(page); - return 0; -} - -static int logfs_delete_dd(struct inode *dir, loff_t pos) -{ - /* - * Getting called with pos somewhere beyond eof is either a goofup - * within this file or means someone maliciously edited the - * (crc-protected) journal. - */ - BUG_ON(beyond_eof(dir, pos)); - dir->i_ctime = dir->i_mtime = current_time(dir); - log_dir(" Delete dentry (%lx, %llx)\n", dir->i_ino, pos); - return logfs_delete(dir, pos, NULL); -} - -/* - * Cross-directory rename, target does not exist. Just a little nasty. - * Create a new dentry in the target dir, then remove the old dentry, - * all the while taking care to remember our operation in the journal. - */ -static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - struct logfs_super *super = logfs_super(old_dir->i_sb); - struct logfs_disk_dentry dd; - struct logfs_transaction *ta; - loff_t pos; - int err; - - /* 1. locate source dd */ - err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); - if (err) - return err; - - ta = kzalloc(sizeof(*ta), GFP_KERNEL); - if (!ta) - return -ENOMEM; - - ta->state = CROSS_RENAME_1; - ta->dir = old_dir->i_ino; - ta->pos = pos; - - /* 2. write target dd */ - mutex_lock(&super->s_dirop_mutex); - logfs_add_transaction(new_dir, ta); - err = logfs_write_dir(new_dir, new_dentry, d_inode(old_dentry)); - if (!err) - err = write_inode(new_dir); - - if (err) { - super->s_rename_dir = 0; - super->s_rename_pos = 0; - abort_transaction(new_dir, ta); - goto out; - } - - /* 3. remove source dd */ - ta->state = CROSS_RENAME_2; - logfs_add_transaction(old_dir, ta); - err = logfs_delete_dd(old_dir, pos); - if (!err) - err = write_inode(old_dir); - LOGFS_BUG_ON(err, old_dir->i_sb); -out: - mutex_unlock(&super->s_dirop_mutex); - return err; -} - -static int logfs_replace_inode(struct inode *dir, struct dentry *dentry, - struct logfs_disk_dentry *dd, struct inode *inode) -{ - loff_t pos; - int err; - - err = logfs_get_dd(dir, dentry, dd, &pos); - if (err) - return err; - dd->ino = cpu_to_be64(inode->i_ino); - dd->type = logfs_type(inode); - - err = write_dir(dir, dd, pos); - if (err) - return err; - log_dir("Replace dentry (%lx, %llx) %s -> %llx\n", dir->i_ino, pos, - dd->name, be64_to_cpu(dd->ino)); - return write_inode(dir); -} - -/* Target dentry exists - the worst case. We need to attach the source - * inode to the target dentry, then remove the orphaned target inode and - * source dentry. - */ -static int logfs_rename_target(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - struct logfs_super *super = logfs_super(old_dir->i_sb); - struct inode *old_inode = d_inode(old_dentry); - struct inode *new_inode = d_inode(new_dentry); - int isdir = S_ISDIR(old_inode->i_mode); - struct logfs_disk_dentry dd; - struct logfs_transaction *ta; - loff_t pos; - int err; - - BUG_ON(isdir != S_ISDIR(new_inode->i_mode)); - if (isdir) { - if (!logfs_empty_dir(new_inode)) - return -ENOTEMPTY; - } - - /* 1. 
locate source dd */ - err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); - if (err) - return err; - - ta = kzalloc(sizeof(*ta), GFP_KERNEL); - if (!ta) - return -ENOMEM; - - ta->state = TARGET_RENAME_1; - ta->dir = old_dir->i_ino; - ta->pos = pos; - ta->ino = new_inode->i_ino; - - /* 2. attach source inode to target dd */ - mutex_lock(&super->s_dirop_mutex); - logfs_add_transaction(new_dir, ta); - err = logfs_replace_inode(new_dir, new_dentry, &dd, old_inode); - if (err) { - super->s_rename_dir = 0; - super->s_rename_pos = 0; - super->s_victim_ino = 0; - abort_transaction(new_dir, ta); - goto out; - } - - /* 3. remove source dd */ - ta->state = TARGET_RENAME_2; - logfs_add_transaction(old_dir, ta); - err = logfs_delete_dd(old_dir, pos); - if (!err) - err = write_inode(old_dir); - LOGFS_BUG_ON(err, old_dir->i_sb); - - /* 4. remove target inode */ - ta->state = TARGET_RENAME_3; - logfs_add_transaction(new_inode, ta); - err = logfs_remove_inode(new_inode); - -out: - mutex_unlock(&super->s_dirop_mutex); - return err; -} - -static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) -{ - if (flags & ~RENAME_NOREPLACE) - return -EINVAL; - - if (d_really_is_positive(new_dentry)) - return logfs_rename_target(old_dir, old_dentry, - new_dir, new_dentry); - return logfs_rename_cross(old_dir, old_dentry, new_dir, new_dentry); -} - -/* No locking done here, as this is called before .get_sb() returns. */ -int logfs_replay_journal(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct inode *inode; - u64 ino, pos; - int err; - - if (super->s_victim_ino) { - /* delete victim inode */ - ino = super->s_victim_ino; - printk(KERN_INFO"LogFS: delete unmapped inode #%llx\n", ino); - inode = logfs_iget(sb, ino); - if (IS_ERR(inode)) - goto fail; - - LOGFS_BUG_ON(i_size_read(inode) > 0, sb); - super->s_victim_ino = 0; - err = logfs_remove_inode(inode); - iput(inode); - if (err) { - super->s_victim_ino = ino; - goto fail; - } - } - if (super->s_rename_dir) { - /* delete old dd from rename */ - ino = super->s_rename_dir; - pos = super->s_rename_pos; - printk(KERN_INFO"LogFS: delete unbacked dentry (%llx, %llx)\n", - ino, pos); - inode = logfs_iget(sb, ino); - if (IS_ERR(inode)) - goto fail; - - super->s_rename_dir = 0; - super->s_rename_pos = 0; - err = logfs_delete_dd(inode, pos); - iput(inode); - if (err) { - super->s_rename_dir = ino; - super->s_rename_pos = pos; - goto fail; - } - } - return 0; -fail: - LOGFS_BUG(sb); - return -EIO; -} - -const struct inode_operations logfs_dir_iops = { - .create = logfs_create, - .link = logfs_link, - .lookup = logfs_lookup, - .mkdir = logfs_mkdir, - .mknod = logfs_mknod, - .rename = logfs_rename, - .rmdir = logfs_rmdir, - .symlink = logfs_symlink, - .unlink = logfs_unlink, -}; -const struct file_operations logfs_dir_fops = { - .fsync = logfs_fsync, - .unlocked_ioctl = logfs_ioctl, - .iterate_shared = logfs_readdir, - .read = generic_read_dir, - .llseek = generic_file_llseek, -}; diff --git a/fs/logfs/file.c b/fs/logfs/file.c deleted file mode 100644 index 1db04930ad57..000000000000 --- a/fs/logfs/file.c +++ /dev/null @@ -1,285 +0,0 @@ -/* - * fs/logfs/file.c - prepare_write, commit_write and friends - * - * As should be obvious for Linux kernel code, license is GPLv2 - * - * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> - */ -#include "logfs.h" -#include <linux/sched.h> -#include <linux/writeback.h> - -static int logfs_write_begin(struct file *file, 
struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - struct inode *inode = mapping->host; - struct page *page; - pgoff_t index = pos >> PAGE_SHIFT; - - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) - return -ENOMEM; - *pagep = page; - - if ((len == PAGE_SIZE) || PageUptodate(page)) - return 0; - if ((pos & PAGE_MASK) >= i_size_read(inode)) { - unsigned start = pos & (PAGE_SIZE - 1); - unsigned end = start + len; - - /* Reading beyond i_size is simple: memset to zero */ - zero_user_segments(page, 0, start, end, PAGE_SIZE); - return 0; - } - return logfs_readpage_nolock(page); -} - -static int logfs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, struct page *page, - void *fsdata) -{ - struct inode *inode = mapping->host; - pgoff_t index = page->index; - unsigned start = pos & (PAGE_SIZE - 1); - unsigned end = start + copied; - int ret = 0; - - BUG_ON(PAGE_SIZE != inode->i_sb->s_blocksize); - BUG_ON(page->index > I3_BLOCKS); - - if (copied < len) { - /* - * Short write of a non-initialized page. Just tell userspace - * to retry the entire page. - */ - if (!PageUptodate(page)) { - copied = 0; - goto out; - } - } - if (copied == 0) - goto out; /* FIXME: do we need to update inode? */ - - if (i_size_read(inode) < (index << PAGE_SHIFT) + end) { - i_size_write(inode, (index << PAGE_SHIFT) + end); - mark_inode_dirty_sync(inode); - } - - SetPageUptodate(page); - if (!PageDirty(page)) { - if (!get_page_reserve(inode, page)) - __set_page_dirty_nobuffers(page); - else - ret = logfs_write_buf(inode, page, WF_LOCK); - } -out: - unlock_page(page); - put_page(page); - return ret ? ret : copied; -} - -int logfs_readpage(struct file *file, struct page *page) -{ - int ret; - - ret = logfs_readpage_nolock(page); - unlock_page(page); - return ret; -} - -/* Clear the page's dirty flag in the radix tree. */ -/* TODO: mucking with PageWriteback is silly. Add a generic function to clear - * the dirty bit from the radix tree for filesystems that don't have to wait - * for page writeback to finish (i.e. any compressing filesystem). - */ -static void clear_radix_tree_dirty(struct page *page) -{ - BUG_ON(PagePrivate(page) || page->private); - set_page_writeback(page); - end_page_writeback(page); -} - -static int __logfs_writepage(struct page *page) -{ - struct inode *inode = page->mapping->host; - int err; - - err = logfs_write_buf(inode, page, WF_LOCK); - if (err) - set_page_dirty(page); - else - clear_radix_tree_dirty(page); - unlock_page(page); - return err; -} - -static int logfs_writepage(struct page *page, struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_SHIFT; - unsigned offset; - u64 bix; - level_t level; - - log_file("logfs_writepage(%lx, %lx, %p)\n", inode->i_ino, page->index, - page); - - logfs_unpack_index(page->index, &bix, &level); - - /* Indirect blocks are never truncated */ - if (level != 0) - return __logfs_writepage(page); - - /* - * TODO: everything below is a near-verbatim copy of nobh_writepage(). - * The relevant bits should be factored out after logfs is merged. - */ - - /* Is the page fully inside i_size? */ - if (bix < end_index) - return __logfs_writepage(page); - - /* Is the page fully outside i_size? (truncate in progress) */ - offset = i_size & (PAGE_SIZE-1); - if (bix > end_index || offset == 0) { - unlock_page(page); - return 0; /* don't care */ - } - - /* - * The page straddles i_size. It must be zeroed out on each and every - * writepage invocation because it may be mmapped. "A file is mapped - * in multiples of the page size. For a file that is not a multiple of - * the page size, the remaining memory is zeroed when mapped, and - * writes to that region are not written out to the file." - */ - zero_user_segment(page, offset, PAGE_SIZE); - return __logfs_writepage(page); -} - -static void logfs_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) -{ - struct logfs_block *block = logfs_block(page); - - if (block->reserved_bytes) { - struct super_block *sb = page->mapping->host->i_sb; - struct logfs_super *super = logfs_super(sb); - - super->s_dirty_pages -= block->reserved_bytes; - block->ops->free_block(sb, block); - BUG_ON(bitmap_weight(block->alias_map, LOGFS_BLOCK_FACTOR)); - } else - move_page_to_btree(page); - BUG_ON(PagePrivate(page) || page->private); -} - -static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this) -{ - return 0; /* None of these are easy to release */ -} - - -long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - struct inode *inode = file_inode(file); - struct logfs_inode *li = logfs_inode(inode); - unsigned int oldflags, flags; - int err; - - switch (cmd) { - case FS_IOC_GETFLAGS: - flags = li->li_flags & LOGFS_FL_USER_VISIBLE; - return put_user(flags, (int __user *)arg); - case FS_IOC_SETFLAGS: - if (IS_RDONLY(inode)) - return -EROFS; - - if (!inode_owner_or_capable(inode)) - return -EACCES; - - err = get_user(flags, (int __user *)arg); - if (err) - return err; - - inode_lock(inode); - oldflags = li->li_flags; - flags &= LOGFS_FL_USER_MODIFIABLE; - flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE; - li->li_flags = flags; - inode_unlock(inode); - - inode->i_ctime = current_time(inode); - mark_inode_dirty_sync(inode); - return 0; - - default: - return -ENOTTY; - } -} - -int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) -{ - struct super_block *sb = file->f_mapping->host->i_sb; - struct inode *inode = file->f_mapping->host; - int ret; - - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) - return ret; - - inode_lock(inode); - logfs_get_wblocks(sb, NULL, WF_LOCK); - logfs_write_anchor(sb); - logfs_put_wblocks(sb, NULL, WF_LOCK); - inode_unlock(inode); - - return 0; -} - -static int logfs_setattr(struct dentry *dentry, struct iattr *attr) -{ - struct inode *inode = d_inode(dentry); - int err = 0; - - err = setattr_prepare(dentry, attr); - if (err) - return err; - - if (attr->ia_valid & ATTR_SIZE) { - err = logfs_truncate(inode, attr->ia_size); - if (err) - return err; - } - - setattr_copy(inode, attr); - mark_inode_dirty(inode); - return 0; -} - -const struct inode_operations logfs_reg_iops = { - .setattr = logfs_setattr, -}; - -const struct file_operations logfs_reg_fops = { - .read_iter = generic_file_read_iter, - .write_iter = generic_file_write_iter, - .fsync = logfs_fsync, - .unlocked_ioctl = logfs_ioctl, - .llseek = generic_file_llseek, - .mmap = generic_file_readonly_mmap, - .open = generic_file_open, -}; - -const struct address_space_operations logfs_reg_aops = { - .invalidatepage = logfs_invalidatepage, - .readpage = logfs_readpage, - .releasepage = logfs_releasepage, - .set_page_dirty = __set_page_dirty_nobuffers, - .writepage =
logfs_writepage, - .writepages = generic_writepages, - .write_begin = logfs_write_begin, - .write_end = logfs_write_end, -}; diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c deleted file mode 100644 index d4efb061bdc5..000000000000 --- a/fs/logfs/gc.c +++ /dev/null @@ -1,732 +0,0 @@ -/* - * fs/logfs/gc.c - garbage collection code - * - * As should be obvious for Linux kernel code, license is GPLv2 - * - * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> - */ -#include "logfs.h" -#include <linux/sched.h> -#include <linux/slab.h> - -/* - * Wear leveling needs to kick in when the difference between low erase - * counts and high erase counts gets too big. A good value for "too big" - * may be somewhat below 10% of maximum erase count for the device. - * Why not 397, to pick a nice round number with no specific meaning? :) - * - * WL_RATELIMIT is the minimum time between two wear level events. A huge - * number of segments may fulfil the requirements for wear leveling at the - * same time. If that happens we don't want to cause a latency from hell, - * but just gently pick one segment every so often and minimize overhead. - */ -#define WL_DELTA 397 -#define WL_RATELIMIT 100 -#define MAX_OBJ_ALIASES 2600 -#define SCAN_RATIO 512 /* number of scanned segments per gc'd segment */ -#define LIST_SIZE 64 /* base size of candidate lists */ -#define SCAN_ROUNDS 128 /* maximum number of complete medium scans */ -#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */ - -static int no_free_segments(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - - return super->s_free_list.count; -} - -/* journal has distance -1, top-most ifile layer distance 0 */ -static u8 root_distance(struct super_block *sb, gc_level_t __gc_level) -{ - struct logfs_super *super = logfs_super(sb); - u8 gc_level = (__force u8)__gc_level; - - switch (gc_level) { - case 0: /* fall through */ - case 1: /* fall through */ - case 2: /* fall through */ - case 3: - /* file data or indirect blocks */ - return super->s_ifile_levels + super->s_iblock_levels - gc_level; - case 6: /* fall through */ - case 7: /* fall through */ - case 8: /* fall through */ - case 9: - /* inode file data or indirect blocks */ - return super->s_ifile_levels - (gc_level - 6); - default: - printk(KERN_ERR"LOGFS: segment of unknown level %x found\n", - gc_level); - WARN_ON(1); - return super->s_ifile_levels + super->s_iblock_levels; - } -} - -static int segment_is_reserved(struct super_block *sb, u32 segno) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_area *area; - void *reserved; - int i; - - /* Some segments are reserved. Just pretend they were all valid */ - reserved = btree_lookup32(&super->s_reserved_segments, segno); - if (reserved) - return 1; - - /* Currently open segments */ - for_each_area(i) { - area = super->s_area[i]; - if (area->a_is_open && area->a_segno == segno) - return 1; - } - - return 0; -} - -static void logfs_mark_segment_bad(struct super_block *sb, u32 segno) -{ - BUG(); -} - -/* - * Returns the bytes consumed by valid objects in this segment. Object headers - * are counted, the segment header is not. 
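 - * - * As a worked illustration (sizes invented for this note, not taken from - * the original file): a segment holding two 4096-byte data objects would - * count 2 * (4096 + sizeof(struct logfs_object_header)) valid bytes; the - * LOGFS_SEGMENT_HEADERSIZE at the front of the segment is never counted.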
- */ -static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec, - gc_level_t *gc_level) -{ - struct logfs_segment_entry se; - u32 ec_level; - - logfs_get_segment_entry(sb, segno, &se); - if (se.ec_level == cpu_to_be32(BADSEG) || - se.valid == cpu_to_be32(RESERVED)) - return RESERVED; - - ec_level = be32_to_cpu(se.ec_level); - *ec = ec_level >> 4; - *gc_level = GC_LEVEL(ec_level & 0xf); - return be32_to_cpu(se.valid); -} - -static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino, - u64 bix, gc_level_t gc_level) -{ - struct inode *inode; - int err, cookie; - - inode = logfs_safe_iget(sb, ino, &cookie); - err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0); - BUG_ON(err); - logfs_safe_iput(inode, cookie); -} - -static u32 logfs_gc_segment(struct super_block *sb, u32 segno) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_segment_header sh; - struct logfs_object_header oh; - u64 ofs, ino, bix; - u32 seg_ofs, logical_segno, cleaned = 0; - int err, len, valid; - gc_level_t gc_level; - - LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb); - - btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS); - err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh); - BUG_ON(err); - gc_level = GC_LEVEL(sh.level); - logical_segno = be32_to_cpu(sh.segno); - if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) { - logfs_mark_segment_bad(sb, segno); - cleaned = -1; - goto out; - } - - for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE; - seg_ofs + sizeof(oh) < super->s_segsize; ) { - ofs = dev_ofs(sb, logical_segno, seg_ofs); - err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh), - &oh); - BUG_ON(err); - - if (!memchr_inv(&oh, 0xff, sizeof(oh))) - break; - - if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) { - logfs_mark_segment_bad(sb, segno); - cleaned = super->s_segsize - 1; - goto out; - } - - ino = be64_to_cpu(oh.ino); - bix = be64_to_cpu(oh.bix); - len = sizeof(oh) + be16_to_cpu(oh.len); - valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level); - if (valid == 1) { - logfs_cleanse_block(sb, ofs, ino, bix, gc_level); - cleaned += len; - } else if (valid == 2) { - /* Will be invalid upon journal commit */ - cleaned += len; - } - seg_ofs += len; - } -out: - btree_remove32(&super->s_reserved_segments, segno); - return cleaned; -} - -static struct gc_candidate *add_list(struct gc_candidate *cand, - struct candidate_list *list) -{ - struct rb_node **p = &list->rb_tree.rb_node; - struct rb_node *parent = NULL; - struct gc_candidate *cur; - int comp; - - cand->list = list; - while (*p) { - parent = *p; - cur = rb_entry(parent, struct gc_candidate, rb_node); - - if (list->sort_by_ec) - comp = cand->erase_count < cur->erase_count; - else - comp = cand->valid < cur->valid; - - if (comp) - p = &parent->rb_left; - else - p = &parent->rb_right; - } - rb_link_node(&cand->rb_node, parent, p); - rb_insert_color(&cand->rb_node, &list->rb_tree); - - if (list->count <= list->maxcount) { - list->count++; - return NULL; - } - cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node); - rb_erase(&cand->rb_node, &list->rb_tree); - cand->list = NULL; - return cand; -} - -static void remove_from_list(struct gc_candidate *cand) -{ - struct candidate_list *list = cand->list; - - rb_erase(&cand->rb_node, &list->rb_tree); - list->count--; -} - -static void free_candidate(struct super_block *sb, struct gc_candidate *cand) -{ - struct logfs_super *super = logfs_super(sb); - - btree_remove32(&super->s_cand_tree, cand->segno); - kfree(cand); -} - -u32 
get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec) -{ - struct gc_candidate *cand; - u32 segno; - - BUG_ON(list->count == 0); - - cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node); - remove_from_list(cand); - segno = cand->segno; - if (ec) - *ec = cand->erase_count; - free_candidate(sb, cand); - return segno; -} - -/* - * We have several lists to manage segments with. The reserve_list is used to - * deal with bad blocks. We try to keep the best (lowest ec) segments on this - * list. - * The free_list contains free segments for normal usage. It usually gets the - * second pick after the reserve_list. But when the free_list is running short - * it is more important to keep the free_list full than to keep a reserve. - * - * Segments that are not free are put onto a per-level low_list. If we have - * to run garbage collection, we pick a candidate from there. All segments on - * those lists should have at least some free space so GC will make progress. - * - * And last we have the ec_list, which is used to pick segments for wear - * leveling. - * - * If all appropriate lists are full, we simply free the candidate and forget - * about that segment for a while. We have better candidates for each purpose. - */ -static void __add_candidate(struct super_block *sb, struct gc_candidate *cand) -{ - struct logfs_super *super = logfs_super(sb); - u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE; - - if (cand->valid == 0) { - /* 100% free segments */ - log_gc_noisy("add reserve segment %x (ec %x) at %llx\n", - cand->segno, cand->erase_count, - dev_ofs(sb, cand->segno, 0)); - cand = add_list(cand, &super->s_reserve_list); - if (cand) { - log_gc_noisy("add free segment %x (ec %x) at %llx\n", - cand->segno, cand->erase_count, - dev_ofs(sb, cand->segno, 0)); - cand = add_list(cand, &super->s_free_list); - } - } else { - /* good candidates for Garbage Collection */ - if (cand->valid < full) - cand = add_list(cand, &super->s_low_list[cand->dist]); - /* good candidates for wear leveling, - * segments that were recently written get ignored */ - if (cand) - cand = add_list(cand, &super->s_ec_list); - } - if (cand) - free_candidate(sb, cand); -} - -static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec, - u8 dist) -{ - struct logfs_super *super = logfs_super(sb); - struct gc_candidate *cand; - - cand = kmalloc(sizeof(*cand), GFP_NOFS); - if (!cand) - return -ENOMEM; - - cand->segno = segno; - cand->valid = valid; - cand->erase_count = ec; - cand->dist = dist; - - btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS); - __add_candidate(sb, cand); - return 0; -} - -static void remove_segment_from_lists(struct super_block *sb, u32 segno) -{ - struct logfs_super *super = logfs_super(sb); - struct gc_candidate *cand; - - cand = btree_lookup32(&super->s_cand_tree, segno); - if (cand) { - remove_from_list(cand); - free_candidate(sb, cand); - } -} - -static void scan_segment(struct super_block *sb, u32 segno) -{ - u32 valid, ec = 0; - gc_level_t gc_level = 0; - u8 dist; - - if (segment_is_reserved(sb, segno)) - return; - - remove_segment_from_lists(sb, segno); - valid = logfs_valid_bytes(sb, segno, &ec, &gc_level); - if (valid == RESERVED) - return; - - dist = root_distance(sb, gc_level); - add_candidate(sb, segno, valid, ec, dist); -} - -static struct gc_candidate *first_in_list(struct candidate_list *list) -{ - if (list->count == 0) - return NULL; - return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node); -} - -/* - * Find the 
best segment for garbage collection. Main criterion is - * the segment requiring the least effort to clean. Secondary - * criterion is to GC on the lowest level available. - * - * So we search the least effort segment on the lowest level first, - * then move up and pick another segment iff it requires significantly - * less effort. Hence the LOGFS_MAX_OBJECTSIZE in the comparison. - */ -static struct gc_candidate *get_candidate(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - int i, max_dist; - struct gc_candidate *cand = NULL, *this; - - max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS - 1); - - for (i = max_dist; i >= 0; i--) { - this = first_in_list(&super->s_low_list[i]); - if (!this) - continue; - if (!cand) - cand = this; - if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid) - cand = this; - } - return cand; -} - -static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand) -{ - struct logfs_super *super = logfs_super(sb); - gc_level_t gc_level; - u32 cleaned, valid, segno, ec; - u8 dist; - - if (!cand) { - log_gc("GC attempted, but no candidate found\n"); - return 0; - } - - segno = cand->segno; - dist = cand->dist; - valid = logfs_valid_bytes(sb, segno, &ec, &gc_level); - free_candidate(sb, cand); - log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n", - segno, (u64)segno << super->s_segshift, - dist, no_free_segments(sb), valid, - super->s_free_bytes); - cleaned = logfs_gc_segment(sb, segno); - log_gc("GC segment #%02x complete - now %x valid\n", segno, - valid - cleaned); - BUG_ON(cleaned != valid); - return 1; -} - -static int logfs_gc_once(struct super_block *sb) -{ - struct gc_candidate *cand; - - cand = get_candidate(sb); - if (cand) - remove_from_list(cand); - return __logfs_gc_once(sb, cand); -} - -/* returns 1 if a wrap occurs, 0 otherwise */ -static int logfs_scan_some(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - u32 segno; - int i, ret = 0; - - segno = super->s_sweeper; - for (i = SCAN_RATIO; i > 0; i--) { - segno++; - if (segno >= super->s_no_segs) { - segno = 0; - ret = 1; - /* Break out of the loop. We want to read a single - * block from the segment size on next invocation if - * SCAN_RATIO is set to match block size - */ - break; - } - - scan_segment(sb, segno); - } - super->s_sweeper = segno; - return ret; -} - -/* - * In principle, this function should loop forever, looking for GC candidates - * and moving data. LogFS is designed in such a way that this loop is - * guaranteed to terminate. - * - * Limiting the loop to some iterations serves purely to catch cases when - * these guarantees have failed. An actual endless loop is an obvious bug - * and should be reported as such. - */ -static void __logfs_gc_pass(struct super_block *sb, int target) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_block *block; - int round, progress, last_progress = 0; - - /* - * Doing too many changes to the segfile at once would result - * in a large number of aliases. Write the journal before - * things get out of hand.
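 - * - * Condensed to its two invariants, the loop that follows amounts to this - * sketch (names from this file, not a verbatim quote): - * - *	for (round = 0; round < SCAN_ROUNDS; ) { - *		if (no_free_segments(sb) >= target) - *			goto write_alias; - *		round += logfs_scan_some(sb); - *		logfs_gc_once(sb); - *	write_alias: - *		if (super->s_no_object_aliases >= MAX_OBJ_ALIASES) - *			write back one alias block and reset round; - *	}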
- */ - if (super->s_shadow_tree.no_shadowed_segments >= MAX_OBJ_ALIASES) - logfs_write_anchor(sb); - - if (no_free_segments(sb) >= target && - super->s_no_object_aliases < MAX_OBJ_ALIASES) - return; - - log_gc("__logfs_gc_pass(%x)\n", target); - for (round = 0; round < SCAN_ROUNDS; ) { - if (no_free_segments(sb) >= target) - goto write_alias; - - /* Sync in-memory state with on-medium state in case they - * diverged */ - logfs_write_anchor(sb); - round += logfs_scan_some(sb); - if (no_free_segments(sb) >= target) - goto write_alias; - progress = logfs_gc_once(sb); - if (progress) - last_progress = round; - else if (round - last_progress > 2) - break; - continue; - - /* - * The goto logic is nasty, I just don't know a better way to - * code it. GC is supposed to ensure two things: - * 1. Enough free segments are available. - * 2. The number of aliases is bounded. - * When 1. is achieved, we take a look at 2. and write back - * some alias-containing blocks, if necessary. However, after - * each such write we need to go back to 1., as writes can - * consume free segments. - */ -write_alias: - if (super->s_no_object_aliases < MAX_OBJ_ALIASES) - return; - if (list_empty(&super->s_object_alias)) { - /* All aliases are still in btree */ - return; - } - log_gc("Write back one alias\n"); - block = list_entry(super->s_object_alias.next, - struct logfs_block, alias_list); - block->ops->write_block(block); - /* - * To round off the nasty goto logic, we reset round here. It - * is a safety-net for GC not making any progress and limited - * to something reasonably small. If it were incremented for - * every single alias, the loop could terminate rather quickly. - */ - round = 0; - } - LOGFS_BUG(sb); -} - -static int wl_ratelimit(struct super_block *sb, u64 *next_event) -{ - struct logfs_super *super = logfs_super(sb); - - if (*next_event < super->s_gec) { - *next_event = super->s_gec + WL_RATELIMIT; - return 0; - } - return 1; -} - -static void logfs_wl_pass(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct gc_candidate *wl_cand, *free_cand; - - if (wl_ratelimit(sb, &super->s_wl_gec_ostore)) - return; - - wl_cand = first_in_list(&super->s_ec_list); - if (!wl_cand) - return; - free_cand = first_in_list(&super->s_free_list); - if (!free_cand) - return; - - if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) { - remove_from_list(wl_cand); - __logfs_gc_once(sb, wl_cand); - } -} - -/* - * The journal needs wear leveling as well. But moving the journal is an - * expensive operation so we try to avoid it as much as possible. And if we - * have to do it, we move the whole journal, not individual segments. - * - * Ratelimiting is not strictly necessary here, it mainly serves to avoid the - * calculations. First we check whether moving the journal would be a - * significant improvement. That means that a) the current journal segments - * have more wear than the future journal segments and b) the current journal - * segments have more wear than normal ostore segments. - * Rationale for b) is that we don't have to move the journal if it is aging - * less than the ostore, even if the reserve segments age even less (they are - * excluded from wear leveling, after all). - * Next we check that the superblocks have less wear than the journal. Since - * moving the journal requires writing the superblocks, we have to protect the - * superblocks even more than the journal. - * - * Also we double the acceptable wear difference, compared to ostore wear - * leveling.
Journal data is read and rewritten rapidly, comparatively. So - * soft errors have much less time to accumulate and we allow the journal to - * be a bit worse than the ostore. - */ -static void logfs_journal_wl_pass(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct gc_candidate *cand; - u32 min_journal_ec = -1, max_reserve_ec = 0; - int i; - - if (wl_ratelimit(sb, &super->s_wl_gec_journal)) - return; - - if (super->s_reserve_list.count < super->s_no_journal_segs) { - /* Reserve is not full enough to move complete journal */ - return; - } - - journal_for_each(i) - if (super->s_journal_seg[i]) - min_journal_ec = min(min_journal_ec, - super->s_journal_ec[i]); - cand = rb_entry(rb_first(&super->s_free_list.rb_tree), - struct gc_candidate, rb_node); - max_reserve_ec = cand->erase_count; - for (i = 0; i < 2; i++) { - struct logfs_segment_entry se; - u32 segno = seg_no(sb, super->s_sb_ofs[i]); - u32 ec; - - logfs_get_segment_entry(sb, segno, &se); - ec = be32_to_cpu(se.ec_level) >> 4; - max_reserve_ec = max(max_reserve_ec, ec); - } - - if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) { - do_logfs_journal_wl_pass(sb); - } -} - -void logfs_gc_pass(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - - //BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex)); - /* Write journal before free space is getting saturated with dirty - * objects. - */ - if (super->s_dirty_used_bytes + super->s_dirty_free_bytes - + LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes) - logfs_write_anchor(sb); - __logfs_gc_pass(sb, super->s_total_levels); - logfs_wl_pass(sb); - logfs_journal_wl_pass(sb); -} - -static int check_area(struct super_block *sb, int i) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_area *area = super->s_area[i]; - gc_level_t gc_level; - u32 cleaned, valid, ec; - u32 segno = area->a_segno; - u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); - - if (!area->a_is_open) - return 0; - - if (super->s_devops->can_write_buf(sb, ofs) == 0) - return 0; - - printk(KERN_INFO"LogFS: Possibly incomplete write at %llx\n", ofs); - /* - * The device cannot write back the write buffer. Most likely the - * wbuf was already written out and the system crashed at some point - * before the journal commit happened. In that case we wouldn't have - * to do anything. But if the crash happened before the wbuf was - * written out correctly, we must GC this segment. So assume the - * worst and always do the GC run. 
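 - * In code terms the recovery below is deliberately small: close the area, - * collect the whole segment, and treat any difference between cleaned and - * valid bytes as medium corruption (-EIO).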
- */ - area->a_is_open = 0; - valid = logfs_valid_bytes(sb, segno, &ec, &gc_level); - cleaned = logfs_gc_segment(sb, segno); - if (cleaned != valid) - return -EIO; - return 0; -} - -int logfs_check_areas(struct super_block *sb) -{ - int i, err; - - for_each_area(i) { - err = check_area(sb, i); - if (err) - return err; - } - return 0; -} - -static void logfs_init_candlist(struct candidate_list *list, int maxcount, - int sort_by_ec) -{ - list->count = 0; - list->maxcount = maxcount; - list->sort_by_ec = sort_by_ec; - list->rb_tree = RB_ROOT; -} - -int logfs_init_gc(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - int i; - - btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool); - logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1); - logfs_init_candlist(&super->s_reserve_list, - super->s_bad_seg_reserve, 1); - for_each_area(i) - logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0); - logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1); - return 0; -} - -static void logfs_cleanup_list(struct super_block *sb, - struct candidate_list *list) -{ - struct gc_candidate *cand; - - while (list->count) { - cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate, - rb_node); - remove_from_list(cand); - free_candidate(sb, cand); - } - BUG_ON(list->rb_tree.rb_node); -} - -void logfs_cleanup_gc(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - int i; - - if (!super->s_free_list.count) - return; - - /* - * FIXME: The btree may still contain a single empty node. So we - * call the grim visitor to clean up that mess. Btree code should - * do it for us, really. - */ - btree_grim_visitor32(&super->s_cand_tree, 0, NULL); - logfs_cleanup_list(sb, &super->s_free_list); - logfs_cleanup_list(sb, &super->s_reserve_list); - for_each_area(i) - logfs_cleanup_list(sb, &super->s_low_list[i]); - logfs_cleanup_list(sb, &super->s_ec_list); -} diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c deleted file mode 100644 index f440a1525da8..000000000000 --- a/fs/logfs/inode.c +++ /dev/null @@ -1,428 +0,0 @@ -/* - * fs/logfs/inode.c - inode handling code - * - * As should be obvious for Linux kernel code, license is GPLv2 - * - * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> - */ -#include "logfs.h" -#include <linux/slab.h> -#include <linux/writeback.h> -#include <linux/backing-dev.h> - -/* - * How soon to reuse old inode numbers? LogFS doesn't store deleted inodes - * on the medium. It therefore also lacks a method to store the previous - * generation number for deleted inodes. Instead a single generation number - * is stored which will be used for new inodes. Being just a 32bit counter, - * this can obviously wrap relatively quickly. So we only reuse inodes if we - * know that a fair number of inodes can be created before we have to increment - * the generation again - effectively adding some bits to the counter. - * But being too aggressive here means we keep a very large and very sparse - * inode file, wasting space on indirect blocks. - * So what is a good value? Beats me. 64k seems moderately bad on both - * fronts, so let's use that for now... - * - * NFS sucks, as everyone already knows. - */ -#define INOS_PER_WRAP (0x10000) - -/* - * Logfs' requirement to read inodes for garbage collection makes life a bit - * harder. GC may have to read inodes that are in I_FREEING state, when they - * are being written out - and waiting for GC to make progress, naturally.
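 - * - * A typical GC-side use of the pair defined below, mirroring - * logfs_cleanse_block() in gc.c (a sketch, not a new API): - * - *	inode = logfs_safe_iget(sb, ino, &cookie); - *	err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0); - *	logfs_safe_iput(inode, cookie);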
- * - * So we cannot just call iget() or some variant of it, but first have to check - * whether the inode in question might be in I_FREEING state. Therefore we - * maintain our own per-sb list of "almost deleted" inodes and check against - * that list first. Normally this should be at most 1-2 entries long. - * - * Also, inodes have logfs-specific reference counting on top of what the vfs - * does. When .destroy_inode is called, normally the reference count will drop - * to zero and the inode gets deleted. But if GC accessed the inode, its - * refcount will remain nonzero and final deletion will have to wait. - * - * As a result we have two sets of functions to get/put inodes: - * logfs_safe_iget/logfs_safe_iput - safe to call from GC context - * logfs_iget/iput - normal version - */ -static struct kmem_cache *logfs_inode_cache; - -static DEFINE_SPINLOCK(logfs_inode_lock); - -static void logfs_inode_setops(struct inode *inode) -{ - switch (inode->i_mode & S_IFMT) { - case S_IFDIR: - inode->i_op = &logfs_dir_iops; - inode->i_fop = &logfs_dir_fops; - inode->i_mapping->a_ops = &logfs_reg_aops; - break; - case S_IFREG: - inode->i_op = &logfs_reg_iops; - inode->i_fop = &logfs_reg_fops; - inode->i_mapping->a_ops = &logfs_reg_aops; - break; - case S_IFLNK: - inode->i_op = &page_symlink_inode_operations; - inode_nohighmem(inode); - inode->i_mapping->a_ops = &logfs_reg_aops; - break; - case S_IFSOCK: /* fall through */ - case S_IFBLK: /* fall through */ - case S_IFCHR: /* fall through */ - case S_IFIFO: - init_special_inode(inode, inode->i_mode, inode->i_rdev); - break; - default: - BUG(); - } -} - -static struct inode *__logfs_iget(struct super_block *sb, ino_t ino) -{ - struct inode *inode = iget_locked(sb, ino); - int err; - - if (!inode) - return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) - return inode; - - err = logfs_read_inode(inode); - if (err || inode->i_nlink == 0) { - /* inode->i_nlink == 0 can be true when called from - * block validator */ - /* set i_nlink to 0 to prevent caching */ - clear_nlink(inode); - logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE; - iget_failed(inode); - if (!err) - err = -ENOENT; - return ERR_PTR(err); - } - - logfs_inode_setops(inode); - unlock_new_inode(inode); - return inode; -} - -struct inode *logfs_iget(struct super_block *sb, ino_t ino) -{ - BUG_ON(ino == LOGFS_INO_MASTER); - BUG_ON(ino == LOGFS_INO_SEGFILE); - return __logfs_iget(sb, ino); -} - -/* - * is_cached is set to 1 if we hand out a cached inode, 0 otherwise. 
this allows logfs_iput to do the right thing later - */ -struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_inode *li; - - if (ino == LOGFS_INO_MASTER) - return super->s_master_inode; - if (ino == LOGFS_INO_SEGFILE) - return super->s_segfile_inode; - - spin_lock(&logfs_inode_lock); - list_for_each_entry(li, &super->s_freeing_list, li_freeing_list) - if (li->vfs_inode.i_ino == ino) { - li->li_refcount++; - spin_unlock(&logfs_inode_lock); - *is_cached = 1; - return &li->vfs_inode; - } - spin_unlock(&logfs_inode_lock); - - *is_cached = 0; - return __logfs_iget(sb, ino); -} - -static void logfs_i_callback(struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - kmem_cache_free(logfs_inode_cache, logfs_inode(inode)); -} - -static void __logfs_destroy_inode(struct inode *inode) -{ - struct logfs_inode *li = logfs_inode(inode); - - BUG_ON(li->li_block); - list_del(&li->li_freeing_list); - call_rcu(&inode->i_rcu, logfs_i_callback); -} - -static void __logfs_destroy_meta_inode(struct inode *inode) -{ - struct logfs_inode *li = logfs_inode(inode); - BUG_ON(li->li_block); - call_rcu(&inode->i_rcu, logfs_i_callback); -} - -static void logfs_destroy_inode(struct inode *inode) -{ - struct logfs_inode *li = logfs_inode(inode); - - if (inode->i_ino < LOGFS_RESERVED_INOS) { - /* - * The reserved inodes are never destroyed unless we are in - * the unmount path. - */ - __logfs_destroy_meta_inode(inode); - return; - } - - BUG_ON(list_empty(&li->li_freeing_list)); - spin_lock(&logfs_inode_lock); - li->li_refcount--; - if (li->li_refcount == 0) - __logfs_destroy_inode(inode); - spin_unlock(&logfs_inode_lock); -} - -void logfs_safe_iput(struct inode *inode, int is_cached) -{ - if (inode->i_ino == LOGFS_INO_MASTER) - return; - if (inode->i_ino == LOGFS_INO_SEGFILE) - return; - - if (is_cached) { - logfs_destroy_inode(inode); - return; - } - - iput(inode); -} - -static void logfs_init_inode(struct super_block *sb, struct inode *inode) -{ - struct logfs_inode *li = logfs_inode(inode); - int i; - - li->li_flags = 0; - li->li_height = 0; - li->li_used_bytes = 0; - li->li_block = NULL; - i_uid_write(inode, 0); - i_gid_write(inode, 0); - inode->i_size = 0; - inode->i_blocks = 0; - inode->i_ctime = current_time(inode); - inode->i_mtime = current_time(inode); - li->li_refcount = 1; - INIT_LIST_HEAD(&li->li_freeing_list); - - for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) - li->li_data[i] = 0; - - return; -} - -static struct inode *logfs_alloc_inode(struct super_block *sb) -{ - struct logfs_inode *li; - - li = kmem_cache_alloc(logfs_inode_cache, GFP_NOFS); - if (!li) - return NULL; - logfs_init_inode(sb, &li->vfs_inode); - return &li->vfs_inode; -} - -/* - * In logfs inodes are written to an inode file. The inode file, like any - * other file, is managed with an inode. The inode file's inode, aka master - * inode, requires special handling in several respects. First, it cannot be - * written to the inode file, so it is stored in the journal instead. - * - * Secondly, this inode cannot be written back and destroyed before all other - * inodes have been written. The ordering is important. Linux' VFS is happily - * unaware of the ordering constraint and would ordinarily destroy the master - * inode at umount time while other inodes are still in use and dirty. Not - * good. - * - * So logfs makes sure the master inode is not written until all other inodes - * have been destroyed.
Sadly, this method has another side-effect. The VFS - * will notice one remaining inode and print a frightening warning message. - * Worse, it is impossible to judge whether such a warning was caused by the - * master inode or whether other inodes have leaked as well. - * - * Our attempt at solving this is with logfs_new_meta_inode() below. Its - * purpose is to create a new inode that will not trigger the warning if such - * an inode is still in use. An ugly hack, no doubt. Suggestions for - * improvement are welcome. - * - * AV: that's what ->put_super() is for... - */ -struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino) -{ - struct inode *inode; - - inode = new_inode(sb); - if (!inode) - return ERR_PTR(-ENOMEM); - - inode->i_mode = S_IFREG; - inode->i_ino = ino; - inode->i_data.a_ops = &logfs_reg_aops; - mapping_set_gfp_mask(&inode->i_data, GFP_NOFS); - - return inode; -} - -struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino) -{ - struct inode *inode; - int err; - - inode = logfs_new_meta_inode(sb, ino); - if (IS_ERR(inode)) - return inode; - - err = logfs_read_inode(inode); - if (err) { - iput(inode); - return ERR_PTR(err); - } - logfs_inode_setops(inode); - return inode; -} - -static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc) -{ - int ret; - long flags = WF_LOCK; - - /* Can only happen if creat() failed. Safe to skip. */ - if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN) - return 0; - - ret = __logfs_write_inode(inode, NULL, flags); - LOGFS_BUG_ON(ret, inode->i_sb); - return ret; -} - -/* called with inode->i_lock held */ -static int logfs_drop_inode(struct inode *inode) -{ - struct logfs_super *super = logfs_super(inode->i_sb); - struct logfs_inode *li = logfs_inode(inode); - - spin_lock(&logfs_inode_lock); - list_move(&li->li_freeing_list, &super->s_freeing_list); - spin_unlock(&logfs_inode_lock); - return generic_drop_inode(inode); -} - -static void logfs_set_ino_generation(struct super_block *sb, - struct inode *inode) -{ - struct logfs_super *super = logfs_super(sb); - u64 ino; - - mutex_lock(&super->s_journal_mutex); - ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino + 1); - super->s_last_ino = ino; - super->s_inos_till_wrap--; - if (super->s_inos_till_wrap < 0) { - super->s_last_ino = LOGFS_RESERVED_INOS; - super->s_generation++; - super->s_inos_till_wrap = INOS_PER_WRAP; - } - inode->i_ino = ino; - inode->i_generation = super->s_generation; - mutex_unlock(&super->s_journal_mutex); -} - -struct inode *logfs_new_inode(struct inode *dir, umode_t mode) -{ - struct super_block *sb = dir->i_sb; - struct inode *inode; - - inode = new_inode(sb); - if (!inode) - return ERR_PTR(-ENOMEM); - - logfs_init_inode(sb, inode); - - /* inherit parent flags */ - logfs_inode(inode)->li_flags |= - logfs_inode(dir)->li_flags & LOGFS_FL_INHERITED; - - inode->i_mode = mode; - logfs_set_ino_generation(sb, inode); - - inode_init_owner(inode, dir, mode); - logfs_inode_setops(inode); - insert_inode_hash(inode); - - return inode; -} - -static void logfs_init_once(void *_li) -{ - struct logfs_inode *li = _li; - int i; - - li->li_flags = 0; - li->li_used_bytes = 0; - li->li_refcount = 1; - for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) - li->li_data[i] = 0; - inode_init_once(&li->vfs_inode); -} - -static int logfs_sync_fs(struct super_block *sb, int wait) -{ - logfs_get_wblocks(sb, NULL, WF_LOCK); - logfs_write_anchor(sb); - logfs_put_wblocks(sb, NULL, WF_LOCK); - return 0; -} - -static void logfs_put_super(struct super_block *sb)
-{ - struct logfs_super *super = logfs_super(sb); - /* kill the meta-inodes */ - iput(super->s_segfile_inode); - iput(super->s_master_inode); - iput(super->s_mapping_inode); -} - -const struct super_operations logfs_super_operations = { - .alloc_inode = logfs_alloc_inode, - .destroy_inode = logfs_destroy_inode, - .evict_inode = logfs_evict_inode, - .drop_inode = logfs_drop_inode, - .put_super = logfs_put_super, - .write_inode = logfs_write_inode, - .statfs = logfs_statfs, - .sync_fs = logfs_sync_fs, -}; - -int logfs_init_inode_cache(void) -{ - logfs_inode_cache = kmem_cache_create("logfs_inode_cache", - sizeof(struct logfs_inode), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, - logfs_init_once); - if (!logfs_inode_cache) - return -ENOMEM; - return 0; -} - -void logfs_destroy_inode_cache(void) -{ - /* - * Make sure all delayed rcu free inodes are flushed before we - * destroy cache. - */ - rcu_barrier(); - kmem_cache_destroy(logfs_inode_cache); -} diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c deleted file mode 100644 index 2a09b8d73989..000000000000 --- a/fs/logfs/journal.c +++ /dev/null @@ -1,894 +0,0 @@ -/* - * fs/logfs/journal.c - journal handling code - * - * As should be obvious for Linux kernel code, license is GPLv2 - * - * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> - */ -#include "logfs.h" -#include <linux/slab.h> - -static void logfs_calc_free(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - u64 reserve, no_segs = super->s_no_segs; - s64 free; - int i; - - /* superblock segments */ - no_segs -= 2; - super->s_no_journal_segs = 0; - /* journal */ - journal_for_each(i) - if (super->s_journal_seg[i]) { - no_segs--; - super->s_no_journal_segs++; - } - - /* open segments plus one extra per level for GC */ - no_segs -= 2 * super->s_total_levels; - - free = no_segs * (super->s_segsize - LOGFS_SEGMENT_RESERVE); - free -= super->s_used_bytes; - /* just a bit extra */ - free -= super->s_total_levels * 4096; - - /* Bad blocks are 'paid' for with speed reserve - the filesystem - * simply gets slower as bad blocks accumulate. Until the bad blocks - * exceed the speed reserve - then the filesystem gets smaller. 
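 - * - * Put together, the calculation boils down to this sketch (not a literal - * quote; usable_segs excludes the superblock, journal and open/GC - * segments): - * - *	free = usable_segs * (segsize - LOGFS_SEGMENT_RESERVE) - *		- used_bytes - slack - *		- max(bad_segment_reserve, speed_reserve);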
- */ - reserve = super->s_bad_segments + super->s_bad_seg_reserve; - reserve *= super->s_segsize - LOGFS_SEGMENT_RESERVE; - reserve = max(reserve, super->s_speed_reserve); - free -= reserve; - if (free < 0) - free = 0; - - super->s_free_bytes = free; -} - -static void reserve_sb_and_journal(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct btree_head32 *head = &super->s_reserved_segments; - int i, err; - - err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[0]), (void *)1, - GFP_KERNEL); - BUG_ON(err); - - err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[1]), (void *)1, - GFP_KERNEL); - BUG_ON(err); - - journal_for_each(i) { - if (!super->s_journal_seg[i]) - continue; - err = btree_insert32(head, super->s_journal_seg[i], (void *)1, - GFP_KERNEL); - BUG_ON(err); - } -} - -static void read_dynsb(struct super_block *sb, - struct logfs_je_dynsb *dynsb) -{ - struct logfs_super *super = logfs_super(sb); - - super->s_gec = be64_to_cpu(dynsb->ds_gec); - super->s_sweeper = be64_to_cpu(dynsb->ds_sweeper); - super->s_victim_ino = be64_to_cpu(dynsb->ds_victim_ino); - super->s_rename_dir = be64_to_cpu(dynsb->ds_rename_dir); - super->s_rename_pos = be64_to_cpu(dynsb->ds_rename_pos); - super->s_used_bytes = be64_to_cpu(dynsb->ds_used_bytes); - super->s_generation = be32_to_cpu(dynsb->ds_generation); -} - -static void read_anchor(struct super_block *sb, - struct logfs_je_anchor *da) -{ - struct logfs_super *super = logfs_super(sb); - struct inode *inode = super->s_master_inode; - struct logfs_inode *li = logfs_inode(inode); - int i; - - super->s_last_ino = be64_to_cpu(da->da_last_ino); - li->li_flags = 0; - li->li_height = da->da_height; - i_size_write(inode, be64_to_cpu(da->da_size)); - li->li_used_bytes = be64_to_cpu(da->da_used_bytes); - - for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) - li->li_data[i] = be64_to_cpu(da->da_data[i]); -} - -static void read_erasecount(struct super_block *sb, - struct logfs_je_journal_ec *ec) -{ - struct logfs_super *super = logfs_super(sb); - int i; - - journal_for_each(i) - super->s_journal_ec[i] = be32_to_cpu(ec->ec[i]); -} - -static int read_area(struct super_block *sb, struct logfs_je_area *a) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_area *area = super->s_area[a->gc_level]; - u64 ofs; - u32 writemask = ~(super->s_writesize - 1); - - if (a->gc_level >= LOGFS_NO_AREAS) - return -EIO; - if (a->vim != VIM_DEFAULT) - return -EIO; /* TODO: close area and continue */ - - area->a_used_bytes = be32_to_cpu(a->used_bytes); - area->a_written_bytes = area->a_used_bytes & writemask; - area->a_segno = be32_to_cpu(a->segno); - if (area->a_segno) - area->a_is_open = 1; - - ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); - if (super->s_writesize > 1) - return logfs_buf_recover(area, ofs, a + 1, super->s_writesize); - else - return logfs_buf_recover(area, ofs, NULL, 0); -} - -static void *unpack(void *from, void *to) -{ - struct logfs_journal_header *jh = from; - void *data = from + sizeof(struct logfs_journal_header); - int err; - size_t inlen, outlen; - - inlen = be16_to_cpu(jh->h_len); - outlen = be16_to_cpu(jh->h_datalen); - - if (jh->h_compr == COMPR_NONE) - memcpy(to, data, inlen); - else { - err = logfs_uncompress(data, to, inlen, outlen); - BUG_ON(err); - } - return to; -} - -static int __read_je_header(struct super_block *sb, u64 ofs, - struct logfs_journal_header *jh) -{ - struct logfs_super *super = logfs_super(sb); - size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize) - + 
MAX_JOURNAL_HEADER; - u16 type, len, datalen; - int err; - - /* read header only */ - err = wbuf_read(sb, ofs, sizeof(*jh), jh); - if (err) - return err; - type = be16_to_cpu(jh->h_type); - len = be16_to_cpu(jh->h_len); - datalen = be16_to_cpu(jh->h_datalen); - if (len > sb->s_blocksize) - return -EIO; - if ((type < JE_FIRST) || (type > JE_LAST)) - return -EIO; - if (datalen > bufsize) - return -EIO; - return 0; -} - -static int __read_je_payload(struct super_block *sb, u64 ofs, - struct logfs_journal_header *jh) -{ - u16 len; - int err; - - len = be16_to_cpu(jh->h_len); - err = wbuf_read(sb, ofs + sizeof(*jh), len, jh + 1); - if (err) - return err; - if (jh->h_crc != logfs_crc32(jh, len + sizeof(*jh), 4)) { - /* Old code was confused. It forgot about the header length - * and stopped calculating the crc 16 bytes before the end - * of data - ick! - * FIXME: Remove this hack once the old code is fixed. - */ - if (jh->h_crc == logfs_crc32(jh, len, 4)) - WARN_ON_ONCE(1); - else - return -EIO; - } - return 0; -} - -/* - * jh needs to be large enough to hold the complete entry, not just the header - */ -static int __read_je(struct super_block *sb, u64 ofs, - struct logfs_journal_header *jh) -{ - int err; - - err = __read_je_header(sb, ofs, jh); - if (err) - return err; - return __read_je_payload(sb, ofs, jh); -} - -static int read_je(struct super_block *sb, u64 ofs) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_journal_header *jh = super->s_compressed_je; - void *scratch = super->s_je; - u16 type, datalen; - int err; - - err = __read_je(sb, ofs, jh); - if (err) - return err; - type = be16_to_cpu(jh->h_type); - datalen = be16_to_cpu(jh->h_datalen); - - switch (type) { - case JE_DYNSB: - read_dynsb(sb, unpack(jh, scratch)); - break; - case JE_ANCHOR: - read_anchor(sb, unpack(jh, scratch)); - break; - case JE_ERASECOUNT: - read_erasecount(sb, unpack(jh, scratch)); - break; - case JE_AREA: - err = read_area(sb, unpack(jh, scratch)); - break; - case JE_OBJ_ALIAS: - err = logfs_load_object_aliases(sb, unpack(jh, scratch), - datalen); - break; - default: - WARN_ON_ONCE(1); - return -EIO; - } - return err; -} - -static int logfs_read_segment(struct super_block *sb, u32 segno) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_journal_header *jh = super->s_compressed_je; - u64 ofs, seg_ofs = dev_ofs(sb, segno, 0); - u32 h_ofs, last_ofs = 0; - u16 len, datalen, last_len = 0; - int i, err; - - /* search for most recent commit */ - for (h_ofs = 0; h_ofs < super->s_segsize; h_ofs += sizeof(*jh)) { - ofs = seg_ofs + h_ofs; - err = __read_je_header(sb, ofs, jh); - if (err) - continue; - if (jh->h_type != cpu_to_be16(JE_COMMIT)) - continue; - err = __read_je_payload(sb, ofs, jh); - if (err) - continue; - len = be16_to_cpu(jh->h_len); - datalen = be16_to_cpu(jh->h_datalen); - if ((datalen > sizeof(super->s_je_array)) || - (datalen % sizeof(__be64))) - continue; - last_ofs = h_ofs; - last_len = datalen; - h_ofs += ALIGN(len, sizeof(*jh)) - sizeof(*jh); - } - /* read commit */ - if (last_ofs == 0) - return -ENOENT; - ofs = seg_ofs + last_ofs; - log_journal("Read commit from %llx\n", ofs); - err = __read_je(sb, ofs, jh); - BUG_ON(err); /* We should have caught it in the scan loop already */ - if (err) - return err; - /* uncompress */ - unpack(jh, super->s_je_array); - super->s_no_je = last_len / sizeof(__be64); - /* iterate over array */ - for (i = 0; i < super->s_no_je; i++) { - err = read_je(sb, be64_to_cpu(super->s_je_array[i])); - if (err) - return err; - } - 
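- /* All entries referenced by the newest commit have been replayed; - * record which segment holds the live journal so that further - * journal writes append to it. */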
super->s_journal_area->a_segno = segno; - return 0; -} - -static u64 read_gec(struct super_block *sb, u32 segno) -{ - struct logfs_segment_header sh; - __be32 crc; - int err; - - if (!segno) - return 0; - err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh); - if (err) - return 0; - crc = logfs_crc32(&sh, sizeof(sh), 4); - if (crc != sh.crc) { - WARN_ON(sh.gec != cpu_to_be64(0xffffffffffffffffull)); - /* Most likely it was just erased */ - return 0; - } - return be64_to_cpu(sh.gec); -} - -static int logfs_read_journal(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - u64 gec[LOGFS_JOURNAL_SEGS], max; - u32 segno; - int i, max_i; - - max = 0; - max_i = -1; - journal_for_each(i) { - segno = super->s_journal_seg[i]; - gec[i] = read_gec(sb, super->s_journal_seg[i]); - if (gec[i] > max) { - max = gec[i]; - max_i = i; - } - } - if (max_i == -1) - return -EIO; - /* FIXME: Try older segments in case of error */ - return logfs_read_segment(sb, super->s_journal_seg[max_i]); -} - -/* - * First search the current segment (outer loop), then pick the next segment - * in the array, skipping any zero entries (inner loop). - */ -static void journal_get_free_segment(struct logfs_area *area) -{ - struct logfs_super *super = logfs_super(area->a_sb); - int i; - - journal_for_each(i) { - if (area->a_segno != super->s_journal_seg[i]) - continue; - - do { - i++; - if (i == LOGFS_JOURNAL_SEGS) - i = 0; - } while (!super->s_journal_seg[i]); - - area->a_segno = super->s_journal_seg[i]; - area->a_erase_count = ++(super->s_journal_ec[i]); - log_journal("Journal now at %x (ec %x)\n", area->a_segno, - area->a_erase_count); - return; - } - BUG(); -} - -static void journal_get_erase_count(struct logfs_area *area) -{ - /* erase count is stored globally and incremented in - * journal_get_free_segment() - nothing to do here */ -} - -static int journal_erase_segment(struct logfs_area *area) -{ - struct super_block *sb = area->a_sb; - union { - struct logfs_segment_header sh; - unsigned char c[ALIGN(sizeof(struct logfs_segment_header), 16)]; - } u; - u64 ofs; - int err; - - err = logfs_erase_segment(sb, area->a_segno, 1); - if (err) - return err; - - memset(&u, 0, sizeof(u)); - u.sh.pad = 0; - u.sh.type = SEG_JOURNAL; - u.sh.level = 0; - u.sh.segno = cpu_to_be32(area->a_segno); - u.sh.ec = cpu_to_be32(area->a_erase_count); - u.sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); - u.sh.crc = logfs_crc32(&u.sh, sizeof(u.sh), 4); - - /* This causes a bug in segment.c. Not yet. 
*/ - //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0); - - ofs = dev_ofs(sb, area->a_segno, 0); - area->a_used_bytes = sizeof(u); - logfs_buf_write(area, ofs, &u, sizeof(u)); - return 0; -} - -static size_t __logfs_write_header(struct logfs_super *super, - struct logfs_journal_header *jh, size_t len, size_t datalen, - u16 type, u8 compr) -{ - jh->h_len = cpu_to_be16(len); - jh->h_type = cpu_to_be16(type); - jh->h_datalen = cpu_to_be16(datalen); - jh->h_compr = compr; - jh->h_pad[0] = 'H'; - jh->h_pad[1] = 'E'; - jh->h_pad[2] = 'A'; - jh->h_pad[3] = 'D'; - jh->h_pad[4] = 'R'; - jh->h_crc = logfs_crc32(jh, len + sizeof(*jh), 4); - return ALIGN(len, 16) + sizeof(*jh); -} - -static size_t logfs_write_header(struct logfs_super *super, - struct logfs_journal_header *jh, size_t datalen, u16 type) -{ - size_t len = datalen; - - return __logfs_write_header(super, jh, len, datalen, type, COMPR_NONE); -} - -static inline size_t logfs_journal_erasecount_size(struct logfs_super *super) -{ - return LOGFS_JOURNAL_SEGS * sizeof(__be32); -} - -static void *logfs_write_erasecount(struct super_block *sb, void *_ec, - u16 *type, size_t *len) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_je_journal_ec *ec = _ec; - int i; - - journal_for_each(i) - ec->ec[i] = cpu_to_be32(super->s_journal_ec[i]); - *type = JE_ERASECOUNT; - *len = logfs_journal_erasecount_size(super); - return ec; -} - -static void account_shadow(void *_shadow, unsigned long _sb, u64 ignore, - size_t ignore2) -{ - struct logfs_shadow *shadow = _shadow; - struct super_block *sb = (void *)_sb; - struct logfs_super *super = logfs_super(sb); - - /* consume new space */ - super->s_free_bytes -= shadow->new_len; - super->s_used_bytes += shadow->new_len; - super->s_dirty_used_bytes -= shadow->new_len; - - /* free up old space */ - super->s_free_bytes += shadow->old_len; - super->s_used_bytes -= shadow->old_len; - super->s_dirty_free_bytes -= shadow->old_len; - - logfs_set_segment_used(sb, shadow->old_ofs, -shadow->old_len); - logfs_set_segment_used(sb, shadow->new_ofs, shadow->new_len); - - log_journal("account_shadow(%llx, %llx, %x) %llx->%llx %x->%x\n", - shadow->ino, shadow->bix, shadow->gc_level, - shadow->old_ofs, shadow->new_ofs, - shadow->old_len, shadow->new_len); - mempool_free(shadow, super->s_shadow_pool); -} - -static void account_shadows(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct inode *inode = super->s_master_inode; - struct logfs_inode *li = logfs_inode(inode); - struct shadow_tree *tree = &super->s_shadow_tree; - - btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow); - btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow); - btree_grim_visitor32(&tree->segment_map, 0, NULL); - tree->no_shadowed_segments = 0; - - if (li->li_block) { - /* - * We never actually use the structure, when attached to the - * master inode. But it is easier to always free it here than - * to have checks in several places elsewhere when allocating - * it. 
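 - * - * For the accounting done by account_shadow() above, a worked example - * (invented numbers): rewriting one 4096-byte block with old_len == - * new_len leaves s_used_bytes and s_free_bytes unchanged overall, while - * s_dirty_used_bytes and s_dirty_free_bytes each drop by 4096.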
- */ - li->li_block->ops->free_block(sb, li->li_block); - } - BUG_ON((s64)li->li_used_bytes < 0); -} - -static void *__logfs_write_anchor(struct super_block *sb, void *_da, - u16 *type, size_t *len) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_je_anchor *da = _da; - struct inode *inode = super->s_master_inode; - struct logfs_inode *li = logfs_inode(inode); - int i; - - da->da_height = li->li_height; - da->da_last_ino = cpu_to_be64(super->s_last_ino); - da->da_size = cpu_to_be64(i_size_read(inode)); - da->da_used_bytes = cpu_to_be64(li->li_used_bytes); - for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) - da->da_data[i] = cpu_to_be64(li->li_data[i]); - *type = JE_ANCHOR; - *len = sizeof(*da); - return da; -} - -static void *logfs_write_dynsb(struct super_block *sb, void *_dynsb, - u16 *type, size_t *len) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_je_dynsb *dynsb = _dynsb; - - dynsb->ds_gec = cpu_to_be64(super->s_gec); - dynsb->ds_sweeper = cpu_to_be64(super->s_sweeper); - dynsb->ds_victim_ino = cpu_to_be64(super->s_victim_ino); - dynsb->ds_rename_dir = cpu_to_be64(super->s_rename_dir); - dynsb->ds_rename_pos = cpu_to_be64(super->s_rename_pos); - dynsb->ds_used_bytes = cpu_to_be64(super->s_used_bytes); - dynsb->ds_generation = cpu_to_be32(super->s_generation); - *type = JE_DYNSB; - *len = sizeof(*dynsb); - return dynsb; -} - -static void write_wbuf(struct super_block *sb, struct logfs_area *area, - void *wbuf) -{ - struct logfs_super *super = logfs_super(sb); - struct address_space *mapping = super->s_mapping_inode->i_mapping; - u64 ofs; - pgoff_t index; - int page_ofs; - struct page *page; - - ofs = dev_ofs(sb, area->a_segno, - area->a_used_bytes & ~(super->s_writesize - 1)); - index = ofs >> PAGE_SHIFT; - page_ofs = ofs & (PAGE_SIZE - 1); - - page = find_or_create_page(mapping, index, GFP_NOFS); - BUG_ON(!page); - memcpy(wbuf, page_address(page) + page_ofs, super->s_writesize); - unlock_page(page); -} - -static void *logfs_write_area(struct super_block *sb, void *_a, - u16 *type, size_t *len) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_area *area = super->s_area[super->s_sum_index]; - struct logfs_je_area *a = _a; - - a->vim = VIM_DEFAULT; - a->gc_level = super->s_sum_index; - a->used_bytes = cpu_to_be32(area->a_used_bytes); - a->segno = cpu_to_be32(area->a_segno); - if (super->s_writesize > 1) - write_wbuf(sb, area, a + 1); - - *type = JE_AREA; - *len = sizeof(*a) + super->s_writesize; - return a; -} - -static void *logfs_write_commit(struct super_block *sb, void *h, - u16 *type, size_t *len) -{ - struct logfs_super *super = logfs_super(sb); - - *type = JE_COMMIT; - *len = super->s_no_je * sizeof(__be64); - return super->s_je_array; -} - -static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type, - size_t len) -{ - struct logfs_super *super = logfs_super(sb); - void *header = super->s_compressed_je; - void *data = header + sizeof(struct logfs_journal_header); - ssize_t compr_len, pad_len; - u8 compr = COMPR_ZLIB; - - if (len == 0) - return logfs_write_header(super, header, 0, type); - - compr_len = logfs_compress(buf, data, len, sb->s_blocksize); - if (compr_len < 0 || type == JE_ANCHOR) { - memcpy(data, buf, len); - compr_len = len; - compr = COMPR_NONE; - } - - pad_len = ALIGN(compr_len, 16); - memset(data + compr_len, 0, pad_len - compr_len); - - return __logfs_write_header(super, header, compr_len, len, type, compr); -} - -static s64 logfs_get_free_bytes(struct logfs_area *area, size_t *bytes, - int must_pad) -{ 
- u32 writesize = logfs_super(area->a_sb)->s_writesize; - s32 ofs; - int ret; - - ret = logfs_open_area(area, *bytes); - if (ret) - return -EAGAIN; - - ofs = area->a_used_bytes; - area->a_used_bytes += *bytes; - - if (must_pad) { - area->a_used_bytes = ALIGN(area->a_used_bytes, writesize); - *bytes = area->a_used_bytes - ofs; - } - - return dev_ofs(area->a_sb, area->a_segno, ofs); -} - -static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type, - size_t buf_len) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_area *area = super->s_journal_area; - struct logfs_journal_header *jh = super->s_compressed_je; - size_t len; - int must_pad = 0; - s64 ofs; - - len = __logfs_write_je(sb, buf, type, buf_len); - if (jh->h_type == cpu_to_be16(JE_COMMIT)) - must_pad = 1; - - ofs = logfs_get_free_bytes(area, &len, must_pad); - if (ofs < 0) - return ofs; - logfs_buf_write(area, ofs, super->s_compressed_je, len); - BUG_ON(super->s_no_je >= MAX_JOURNAL_ENTRIES); - super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs); - return 0; -} - -static int logfs_write_je(struct super_block *sb, - void* (*write)(struct super_block *sb, void *scratch, - u16 *type, size_t *len)) -{ - void *buf; - size_t len; - u16 type; - - buf = write(sb, logfs_super(sb)->s_je, &type, &len); - return logfs_write_je_buf(sb, buf, type, len); -} - -int write_alias_journal(struct super_block *sb, u64 ino, u64 bix, - level_t level, int child_no, __be64 val) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_obj_alias *oa = super->s_je; - int err = 0, fill = super->s_je_fill; - - log_aliases("logfs_write_obj_aliases #%x(%llx, %llx, %x, %x) %llx\n", - fill, ino, bix, level, child_no, be64_to_cpu(val)); - oa[fill].ino = cpu_to_be64(ino); - oa[fill].bix = cpu_to_be64(bix); - oa[fill].val = val; - oa[fill].level = (__force u8)level; - oa[fill].child_no = cpu_to_be16(child_no); - fill++; - if (fill >= sb->s_blocksize / sizeof(*oa)) { - err = logfs_write_je_buf(sb, oa, JE_OBJ_ALIAS, sb->s_blocksize); - fill = 0; - } - - super->s_je_fill = fill; - return err; -} - -static int logfs_write_obj_aliases(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - int err; - - log_journal("logfs_write_obj_aliases: %d aliases to write\n", - super->s_no_object_aliases); - super->s_je_fill = 0; - err = logfs_write_obj_aliases_pagecache(sb); - if (err) - return err; - - if (super->s_je_fill) - err = logfs_write_je_buf(sb, super->s_je, JE_OBJ_ALIAS, - super->s_je_fill - * sizeof(struct logfs_obj_alias)); - return err; -} - -/* - * Write all journal entries. The goto logic ensures that all journal entries - * are written whenever a new segment is used. It is ugly and potentially a - * bit wasteful, but robustness is more important. With this we can *always* - * erase all journal segments except the one containing the most recent commit. 
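 - * - * In outline the commit sequence below is (a sketch using names from this - * file): - * - *	logfs_sync_segments(sb); - *	account_shadows(sb); - *	write JE_AREA for each open area; - *	write JE_OBJ_ALIAS, JE_ERASECOUNT, JE_ANCHOR, JE_DYNSB; - *	super->s_devops->sync(sb); - *	write JE_COMMIT; - *	logfs_sync_area(area); - *	super->s_devops->sync(sb);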
- */ -void logfs_write_anchor(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_area *area = super->s_journal_area; - int i, err; - - if (!(super->s_flags & LOGFS_SB_FLAG_DIRTY)) - return; - super->s_flags &= ~LOGFS_SB_FLAG_DIRTY; - - BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN); - mutex_lock(&super->s_journal_mutex); - - /* Do this first or suffer corruption */ - logfs_sync_segments(sb); - account_shadows(sb); - -again: - super->s_no_je = 0; - for_each_area(i) { - if (!super->s_area[i]->a_is_open) - continue; - super->s_sum_index = i; - err = logfs_write_je(sb, logfs_write_area); - if (err) - goto again; - } - err = logfs_write_obj_aliases(sb); - if (err) - goto again; - err = logfs_write_je(sb, logfs_write_erasecount); - if (err) - goto again; - err = logfs_write_je(sb, __logfs_write_anchor); - if (err) - goto again; - err = logfs_write_je(sb, logfs_write_dynsb); - if (err) - goto again; - /* - * Order is imperative. First we sync all writes, including the - * non-committed journal writes. Then we write the final commit and - * sync the current journal segment. - * There is a theoretical bug here. Syncing the journal segment will - * write a number of journal entries and the final commit. All these - * are written in a single operation. If the device layer writes the - * data back-to-front, the commit will precede the other journal - * entries, leaving a race window. - * Two fixes are possible. Preferred is to fix the device layer to - * ensure writes happen front-to-back. Alternatively we can insert - * another logfs_sync_area() super->s_devops->sync() combo before - * writing the commit. - */ - /* - * On another subject, super->s_devops->sync is usually not necessary. - * Unless called from sys_sync or friends, a barrier would suffice. 
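 - * (Condensed view of the code below: sync(); write commit; - * logfs_sync_area(); sync(); - the first sync persists the entries before - * the commit is written, the final sync persists the commit itself.)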
- */ - super->s_devops->sync(sb); - err = logfs_write_je(sb, logfs_write_commit); - if (err) - goto again; - log_journal("Write commit to %llx\n", - be64_to_cpu(super->s_je_array[super->s_no_je - 1])); - logfs_sync_area(area); - BUG_ON(area->a_used_bytes != area->a_written_bytes); - super->s_devops->sync(sb); - - mutex_unlock(&super->s_journal_mutex); - return; -} - -void do_logfs_journal_wl_pass(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_area *area = super->s_journal_area; - struct btree_head32 *head = &super->s_reserved_segments; - u32 segno, ec; - int i, err; - - log_journal("Journal requires wear-leveling.\n"); - /* Drop old segments */ - journal_for_each(i) - if (super->s_journal_seg[i]) { - btree_remove32(head, super->s_journal_seg[i]); - logfs_set_segment_unreserved(sb, - super->s_journal_seg[i], - super->s_journal_ec[i]); - super->s_journal_seg[i] = 0; - super->s_journal_ec[i] = 0; - } - /* Get new segments */ - for (i = 0; i < super->s_no_journal_segs; i++) { - segno = get_best_cand(sb, &super->s_reserve_list, &ec); - super->s_journal_seg[i] = segno; - super->s_journal_ec[i] = ec; - logfs_set_segment_reserved(sb, segno); - err = btree_insert32(head, segno, (void *)1, GFP_NOFS); - BUG_ON(err); /* mempool should prevent this */ - err = logfs_erase_segment(sb, segno, 1); - BUG_ON(err); /* FIXME: remount-ro would be nicer */ - } - /* Manually move journal_area */ - freeseg(sb, area->a_segno); - area->a_segno = super->s_journal_seg[0]; - area->a_is_open = 0; - area->a_used_bytes = 0; - /* Write journal */ - logfs_write_anchor(sb); - /* Write superblocks */ - err = logfs_write_sb(sb); - BUG_ON(err); -} - -static const struct logfs_area_ops journal_area_ops = { - .get_free_segment = journal_get_free_segment, - .get_erase_count = journal_get_erase_count, - .erase_segment = journal_erase_segment, -}; - -int logfs_init_journal(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize) - + MAX_JOURNAL_HEADER; - int ret = -ENOMEM; - - mutex_init(&super->s_journal_mutex); - btree_init_mempool32(&super->s_reserved_segments, super->s_btree_pool); - - super->s_je = kzalloc(bufsize, GFP_KERNEL); - if (!super->s_je) - return ret; - - super->s_compressed_je = kzalloc(bufsize, GFP_KERNEL); - if (!super->s_compressed_je) - return ret; - - super->s_master_inode = logfs_new_meta_inode(sb, LOGFS_INO_MASTER); - if (IS_ERR(super->s_master_inode)) - return PTR_ERR(super->s_master_inode); - - ret = logfs_read_journal(sb); - if (ret) - return -EIO; - - reserve_sb_and_journal(sb); - logfs_calc_free(sb); - - super->s_journal_area->a_ops = &journal_area_ops; - return 0; -} - -void logfs_cleanup_journal(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - - btree_grim_visitor32(&super->s_reserved_segments, 0, NULL); - - kfree(super->s_compressed_je); - kfree(super->s_je); -} diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h deleted file mode 100644 index 27d040e35faa..000000000000 --- a/fs/logfs/logfs.h +++ /dev/null @@ -1,735 +0,0 @@ -/* - * fs/logfs/logfs.h - * - * As should be obvious for Linux kernel code, license is GPLv2 - * - * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> - * - * Private header for logfs. 
- */ -#ifndef FS_LOGFS_LOGFS_H -#define FS_LOGFS_LOGFS_H - -#undef __CHECK_ENDIAN__ -#define __CHECK_ENDIAN__ - -#include <linux/btree.h> -#include <linux/crc32.h> -#include <linux/fs.h> -#include <linux/kernel.h> -#include <linux/mempool.h> -#include <linux/pagemap.h> -#include <linux/mtd/mtd.h> -#include "logfs_abi.h" - -#define LOGFS_DEBUG_SUPER (0x0001) -#define LOGFS_DEBUG_SEGMENT (0x0002) -#define LOGFS_DEBUG_JOURNAL (0x0004) -#define LOGFS_DEBUG_DIR (0x0008) -#define LOGFS_DEBUG_FILE (0x0010) -#define LOGFS_DEBUG_INODE (0x0020) -#define LOGFS_DEBUG_READWRITE (0x0040) -#define LOGFS_DEBUG_GC (0x0080) -#define LOGFS_DEBUG_GC_NOISY (0x0100) -#define LOGFS_DEBUG_ALIASES (0x0200) -#define LOGFS_DEBUG_BLOCKMOVE (0x0400) -#define LOGFS_DEBUG_ALL (0xffffffff) - -#define LOGFS_DEBUG (0x01) -/* - * To enable specific log messages, simply define LOGFS_DEBUG to match any - * or all of the above. - */ -#ifndef LOGFS_DEBUG -#define LOGFS_DEBUG (0) -#endif - -#define log_cond(cond, fmt, arg...) do { \ - if (cond) \ - printk(KERN_DEBUG fmt, ##arg); \ -} while (0) - -#define log_super(fmt, arg...) \ - log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SUPER, fmt, ##arg) -#define log_segment(fmt, arg...) \ - log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SEGMENT, fmt, ##arg) -#define log_journal(fmt, arg...) \ - log_cond(LOGFS_DEBUG & LOGFS_DEBUG_JOURNAL, fmt, ##arg) -#define log_dir(fmt, arg...) \ - log_cond(LOGFS_DEBUG & LOGFS_DEBUG_DIR, fmt, ##arg) -#define log_file(fmt, arg...) \ - log_cond(LOGFS_DEBUG & LOGFS_DEBUG_FILE, fmt, ##arg) -#define log_inode(fmt, arg...) \ - log_cond(LOGFS_DEBUG & LOGFS_DEBUG_INODE, fmt, ##arg) -#define log_readwrite(fmt, arg...) \ - log_cond(LOGFS_DEBUG & LOGFS_DEBUG_READWRITE, fmt, ##arg) -#define log_gc(fmt, arg...) \ - log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC, fmt, ##arg) -#define log_gc_noisy(fmt, arg...) \ - log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC_NOISY, fmt, ##arg) -#define log_aliases(fmt, arg...) \ - log_cond(LOGFS_DEBUG & LOGFS_DEBUG_ALIASES, fmt, ##arg) -#define log_blockmove(fmt, arg...) \ - log_cond(LOGFS_DEBUG & LOGFS_DEBUG_BLOCKMOVE, fmt, ##arg) - -#define PG_pre_locked PG_owner_priv_1 -#define PagePreLocked(page) test_bit(PG_pre_locked, &(page)->flags) -#define SetPagePreLocked(page) set_bit(PG_pre_locked, &(page)->flags) -#define ClearPagePreLocked(page) clear_bit(PG_pre_locked, &(page)->flags) - -/* FIXME: This should really be somewhere in the 64bit area. 
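The log_cond()/log_* macros above vanish at compile time whenever the matching bit is absent from LOGFS_DEBUG. A minimal userspace sketch of the same mask-gated pattern, with fprintf standing in for printk and illustrative names:

#include <stdio.h>

#define DBG_SUPER 0x0001
#define DBG_JOURNAL 0x0004
#define DBG_MASK (DBG_SUPER | DBG_JOURNAL)      /* compile-time selection */

#define log_cond(cond, fmt, ...) do { \
        if (cond) \
                fprintf(stderr, fmt, ##__VA_ARGS__); \
} while (0)

#define log_super(fmt, ...) log_cond(DBG_MASK & DBG_SUPER, fmt, ##__VA_ARGS__)
#define log_journal(fmt, ...) log_cond(DBG_MASK & DBG_JOURNAL, fmt, ##__VA_ARGS__)

int main(void)
{
        log_super("super: flags written\n");    /* compiled in */
        log_journal("journal: commit\n");       /* compiled in */
        return 0;
}

A bit missing from the mask turns the call into an if (0) branch the compiler removes, so disabled log sites cost nothing at runtime.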
 */
-#define LOGFS_LINK_MAX (1<<30)
-
-/* Read-only filesystem */
-#define LOGFS_SB_FLAG_RO 0x0001
-#define LOGFS_SB_FLAG_DIRTY 0x0002
-#define LOGFS_SB_FLAG_OBJ_ALIAS 0x0004
-#define LOGFS_SB_FLAG_SHUTDOWN 0x0008
-
-/* Write Control Flags */
-#define WF_LOCK 0x01 /* take write lock */
-#define WF_WRITE 0x02 /* write block */
-#define WF_DELETE 0x04 /* delete old block */
-
-typedef u8 __bitwise level_t;
-typedef u8 __bitwise gc_level_t;
-
-#define LEVEL(level) ((__force level_t)(level))
-#define GC_LEVEL(gc_level) ((__force gc_level_t)(gc_level))
-
-#define SUBLEVEL(level) ( (void)((level) == LEVEL(1)), \
-                (__force level_t)((__force u8)(level) - 1) )
-
-/**
- * struct logfs_area - area management information
- *
- * @a_sb: the superblock this area belongs to
- * @a_is_open: 1 if the area is currently open, else 0
- * @a_segno: segment number of area
- * @a_written_bytes: number of bytes already written back
- * @a_used_bytes: number of used bytes
- * @a_ops: area operations (either journal or ostore)
- * @a_erase_count: erase count
- * @a_level: GC level
- */
-struct logfs_area { /* a segment open for writing */
-        struct super_block *a_sb;
-        int a_is_open;
-        u32 a_segno;
-        u32 a_written_bytes;
-        u32 a_used_bytes;
-        const struct logfs_area_ops *a_ops;
-        u32 a_erase_count;
-        gc_level_t a_level;
-};
-
-/**
- * struct logfs_area_ops - area operations
- *
- * @get_free_segment: fill area->ofs with the offset of a free segment
- * @get_erase_count: fill area->erase_count (needs area->ofs)
- * @erase_segment: erase and setup segment
- */
-struct logfs_area_ops {
-        void (*get_free_segment)(struct logfs_area *area);
-        void (*get_erase_count)(struct logfs_area *area);
-        int (*erase_segment)(struct logfs_area *area);
-};
-
-struct logfs_super; /* forward */
-/**
- * struct logfs_device_ops - device access operations
- *
- * @readpage: read one page (mm page)
- * @writeseg: write one segment. may be a partial segment
- * @erase: erase part of the device
- * @can_write_buf: decide whether wbuf can be written to ofs
- */
-struct logfs_device_ops {
-        struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs);
-        struct page *(*find_last_sb)(struct super_block *sb, u64 *ofs);
-        int (*write_sb)(struct super_block *sb, struct page *page);
-        int (*readpage)(void *_sb, struct page *page);
-        void (*writeseg)(struct super_block *sb, u64 ofs, size_t len);
-        int (*erase)(struct super_block *sb, loff_t ofs, size_t len,
-                        int ensure_write);
-        int (*can_write_buf)(struct super_block *sb, u64 ofs);
-        void (*sync)(struct super_block *sb);
-        void (*put_device)(struct logfs_super *s);
-};
-
-/**
- * struct candidate_list - list of similar candidates
- */
-struct candidate_list {
-        struct rb_root rb_tree;
-        int count;
-        int maxcount;
-        int sort_by_ec;
-};
-
-/**
- * struct gc_candidate - "candidate" segment to be garbage collected next
- *
- * @list: list (either free or low)
- * @segno: segment number
- * @valid: number of valid bytes
- * @erase_count: erase count of segment
- * @dist: distance from tree root
- *
- * Candidates can be on two lists. The free list contains electees rather
- * than candidates - segments that no longer contain any valid data. The
- * low list contains candidates to be picked for GC. It should be kept
- * short. It is not required to always pick a perfect candidate. In the
- * worst case GC will have to move more data than absolutely necessary.
- */
-struct gc_candidate {
-        struct rb_node rb_node;
-        struct candidate_list *list;
-        u32 segno;
-        u32 valid;
-        u32 erase_count;
-        u8 dist;
-};
-
-/**
- * struct logfs_journal_entry - temporary structure used during journal scan
- *
- * @used: nonzero while the entry is in use
- * @version: normalized version
- * @len: length
- * @datalen: length of uncompressed data
- * @offset: offset
- */
-struct logfs_journal_entry {
-        int used;
-        s16 version;
-        u16 len;
-        u16 datalen;
-        u64 offset;
-};
-
-enum transaction_state {
-        CREATE_1 = 1,
-        CREATE_2,
-        UNLINK_1,
-        UNLINK_2,
-        CROSS_RENAME_1,
-        CROSS_RENAME_2,
-        TARGET_RENAME_1,
-        TARGET_RENAME_2,
-        TARGET_RENAME_3
-};
-
-/**
- * struct logfs_transaction - essential fields to support atomic dirops
- *
- * @ino: target inode
- * @dir: inode of directory containing dentry
- * @pos: pos of dentry in directory
- */
-struct logfs_transaction {
-        enum transaction_state state;
-        u64 ino;
-        u64 dir;
-        u64 pos;
-};
-
-/**
- * struct logfs_shadow - old block in the shadow of a not-yet-committed new one
- * @old_ofs: offset of old block on medium
- * @new_ofs: offset of new block on medium
- * @ino: inode number
- * @bix: block index
- * @old_len: size of old block, including header
- * @new_len: size of new block, including header
- * @gc_level: GC level of the block
- */
-struct logfs_shadow {
-        u64 old_ofs;
-        u64 new_ofs;
-        u64 ino;
-        u64 bix;
-        int old_len;
-        int new_len;
-        gc_level_t gc_level;
-};
-
-/**
- * struct shadow_tree
- * @new: shadows where old_ofs==0, indexed by new_ofs
- * @old: shadows where old_ofs!=0, indexed by old_ofs
- * @segment_map: bitfield of segments containing shadows
- * @no_shadowed_segments: number of segments containing shadows
- */
-struct shadow_tree {
-        struct btree_head64 new;
-        struct btree_head64 old;
-        struct btree_head32 segment_map;
-        int no_shadowed_segments;
-};
-
-struct object_alias_item {
-        struct list_head list;
-        __be64 val;
-        int child_no;
-};
-
-/**
- * struct logfs_block - contains any block state
- * @type: indirect block or inode
- * @full: number of fully populated children
- * @partial: number of partially populated children
- *
- * Most blocks are directly represented by page cache pages. But when a block
- * becomes dirty, is part of a transaction, contains aliases or is otherwise
- * special, a struct logfs_block is allocated to track the additional state.
- * Inodes are very similar to indirect blocks, so they can also get one of
- * these structures added when appropriate.
- */
-#define BLOCK_INDIRECT 1 /* Indirect block */
-#define BLOCK_INODE 2 /* Inode */
-struct logfs_block_ops;
-struct logfs_block {
-        struct list_head alias_list;
-        struct list_head item_list;
-        struct super_block *sb;
-        u64 ino;
-        u64 bix;
-        level_t level;
-        struct page *page;
-        struct inode *inode;
-        struct logfs_transaction *ta;
-        unsigned long alias_map[LOGFS_BLOCK_FACTOR / BITS_PER_LONG];
-        const struct logfs_block_ops *ops;
-        int full;
-        int partial;
-        int reserved_bytes;
-};
-
-typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix,
-                level_t level, int child_no, __be64 val);
-struct logfs_block_ops {
-        void (*write_block)(struct logfs_block *block);
-        void (*free_block)(struct super_block *sb, struct logfs_block *block);
-        int (*write_alias)(struct super_block *sb,
-                        struct logfs_block *block,
-                        write_alias_t *write_one_alias);
-};
-
-#define MAX_JOURNAL_ENTRIES 256
-
-struct logfs_super {
-        struct mtd_info *s_mtd; /* underlying device */
-        struct block_device *s_bdev; /* underlying device */
-        const struct logfs_device_ops *s_devops; /* device access */
-        struct inode *s_master_inode; /* inode file */
-        struct inode *s_segfile_inode; /* segment file */
-        struct inode *s_mapping_inode; /* device mapping */
-        atomic_t s_pending_writes; /* outstanding bios */
-        long s_flags;
-        mempool_t *s_btree_pool; /* for btree nodes */
-        mempool_t *s_alias_pool; /* aliases in segment.c */
-        u64 s_feature_incompat;
-        u64 s_feature_ro_compat;
-        u64 s_feature_compat;
-        u64 s_feature_flags;
-        u64 s_sb_ofs[2];
-        struct page *s_erase_page; /* for dev_bdev.c */
-        /* alias.c fields */
-        struct btree_head32 s_segment_alias; /* remapped segments */
-        int s_no_object_aliases;
-        struct list_head s_object_alias; /* remapped objects */
-        struct btree_head128 s_object_alias_tree; /* remapped objects */
-        struct mutex s_object_alias_mutex;
-        /* dir.c fields */
-        struct mutex s_dirop_mutex; /* for creat/unlink/rename */
-        u64 s_victim_ino; /* used for atomic dir-ops */
-        u64 s_rename_dir; /* source directory ino */
-        u64 s_rename_pos; /* position of source dd */
-        /* gc.c fields */
-        long s_segsize; /* size of a segment */
-        int s_segshift; /* log2 of segment size */
-        long s_segmask; /* (1 << s_segshift) - 1 */
-        long s_no_segs; /* segments on device */
-        long s_no_journal_segs; /* segments used for journal */
-        long s_no_blocks; /* blocks per segment */
-        long s_writesize; /* minimum write size */
-        int s_writeshift; /* log2 of write size */
-        u64 s_size; /* filesystem size */
-        struct logfs_area *s_area[LOGFS_NO_AREAS]; /* open segment array */
-        u64 s_gec; /* global erase count */
-        u64 s_wl_gec_ostore; /* time of last wl event */
-        u64 s_wl_gec_journal; /* time of last wl event */
-        u64 s_sweeper; /* current sweeper pos */
-        u8 s_ifile_levels; /* max level of ifile */
-        u8 s_iblock_levels; /* max level of regular files */
-        u8 s_data_levels; /* # of segments to leaf block */
-        u8 s_total_levels; /* sum of above three */
-        struct btree_head32 s_cand_tree; /* all candidates */
-        struct candidate_list s_free_list; /* 100% free segments */
-        struct candidate_list s_reserve_list; /* Bad segment reserve */
-        struct candidate_list s_low_list[LOGFS_NO_AREAS]; /* good candidates */
-        struct candidate_list s_ec_list; /* wear level candidates */
-        struct btree_head32 s_reserved_segments; /* sb, journal, bad, etc.
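The segment geometry fields just above (s_segshift, s_segmask) feed the seg_no()/seg_ofs()/dev_ofs() helpers declared later in this header. A standalone sketch of the arithmetic, assuming 128KiB segments (s_segshift = 17 is an assumption, not a logfs constant):

#include <assert.h>
#include <stdint.h>

#define SEGSHIFT 17                     /* assumed: 128KiB segments */
#define SEGMASK ((1ULL << SEGSHIFT) - 1)

static uint32_t seg_no(uint64_t ofs) { return (uint32_t)(ofs >> SEGSHIFT); }
static uint32_t seg_ofs(uint64_t ofs) { return (uint32_t)(ofs & SEGMASK); }
static uint64_t dev_ofs(uint32_t segno, uint32_t ofs)
{
        return ((uint64_t)segno << SEGSHIFT) + ofs;
}

int main(void)
{
        uint64_t ofs = dev_ofs(42, 0x1234);

        /* splitting and recombining a device offset round-trips */
        assert(seg_no(ofs) == 42 && seg_ofs(ofs) == 0x1234);
        return 0;
}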
*/ - /* inode.c fields */ - u64 s_last_ino; /* highest ino used */ - long s_inos_till_wrap; - u32 s_generation; /* i_generation for new files */ - struct list_head s_freeing_list; /* inodes being freed */ - /* journal.c fields */ - struct mutex s_journal_mutex; - void *s_je; /* journal entry to compress */ - void *s_compressed_je; /* block to write to journal */ - u32 s_journal_seg[LOGFS_JOURNAL_SEGS]; /* journal segments */ - u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */ - u64 s_last_version; - struct logfs_area *s_journal_area; /* open journal segment */ - __be64 s_je_array[MAX_JOURNAL_ENTRIES]; - int s_no_je; - - int s_sum_index; /* for the 12 summaries */ - struct shadow_tree s_shadow_tree; - int s_je_fill; /* index of current je */ - /* readwrite.c fields */ - struct mutex s_write_mutex; - int s_lock_count; - mempool_t *s_block_pool; /* struct logfs_block pool */ - mempool_t *s_shadow_pool; /* struct logfs_shadow pool */ - struct list_head s_writeback_list; /* writeback pages */ - /* - * Space accounting: - * - s_used_bytes specifies space used to store valid data objects. - * - s_dirty_used_bytes is space used to store non-committed data - * objects. Those objects have already been written themselves, - * but they don't become valid until all indirect blocks up to the - * journal have been written as well. - * - s_dirty_free_bytes is space used to store the old copy of a - * replaced object, as long as the replacement is non-committed. - * In other words, it is the amount of space freed when all dirty - * blocks are written back. - * - s_free_bytes is the amount of free space available for any - * purpose. - * - s_root_reserve is the amount of free space available only to - * the root user. Non-privileged users can no longer write once - * this watermark has been reached. - * - s_speed_reserve is space which remains unused to speed up - * garbage collection performance. - * - s_dirty_pages is the space reserved for currently dirty pages. - * It is a pessimistic estimate, so some/most will get freed on - * page writeback. - * - * s_used_bytes + s_free_bytes + s_speed_reserve = total usable size - */ - u64 s_free_bytes; - u64 s_used_bytes; - u64 s_dirty_free_bytes; - u64 s_dirty_used_bytes; - u64 s_root_reserve; - u64 s_speed_reserve; - u64 s_dirty_pages; - /* Bad block handling: - * - s_bad_seg_reserve is a number of segments usually kept - * free. When encountering bad blocks, the affected segment's data - * is _temporarily_ moved to a reserved segment. - * - s_bad_segments is the number of known bad segments. 
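The accounting identity stated at the end of the space-accounting comment above is easy to keep honest with a helper. A sketch with uint64_t in place of u64 and shortened, illustrative field names:

#include <assert.h>
#include <stdint.h>

struct space_acct {
        uint64_t used;          /* valid data objects */
        uint64_t free;          /* available for any purpose */
        uint64_t speed_reserve; /* kept idle to help GC */
};

/* s_used_bytes + s_free_bytes + s_speed_reserve == total usable size */
static int space_consistent(const struct space_acct *a, uint64_t total)
{
        return a->used + a->free + a->speed_reserve == total;
}

int main(void)
{
        struct space_acct a = { 1 << 20, 3 << 20, 1 << 20 };

        assert(space_consistent(&a, 5 << 20));
        return 0;
}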
- */ - u32 s_bad_seg_reserve; - u32 s_bad_segments; -}; - -/** - * struct logfs_inode - in-memory inode - * - * @vfs_inode: struct inode - * @li_data: data pointers - * @li_used_bytes: number of used bytes - * @li_freeing_list: used to track inodes currently being freed - * @li_flags: inode flags - * @li_refcount: number of internal (GC-induced) references - */ -struct logfs_inode { - struct inode vfs_inode; - u64 li_data[LOGFS_EMBEDDED_FIELDS]; - u64 li_used_bytes; - struct list_head li_freeing_list; - struct logfs_block *li_block; - u32 li_flags; - u8 li_height; - int li_refcount; -}; - -#define journal_for_each(__i) for (__i = 0; __i < LOGFS_JOURNAL_SEGS; __i++) -#define for_each_area(__i) for (__i = 0; __i < LOGFS_NO_AREAS; __i++) -#define for_each_area_down(__i) for (__i = LOGFS_NO_AREAS - 1; __i >= 0; __i--) - -/* compr.c */ -int logfs_compress(void *in, void *out, size_t inlen, size_t outlen); -int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen); -int __init logfs_compr_init(void); -void logfs_compr_exit(void); - -/* dev_bdev.c */ -#ifdef CONFIG_BLOCK -int logfs_get_sb_bdev(struct logfs_super *s, - struct file_system_type *type, - const char *devname); -#else -static inline int logfs_get_sb_bdev(struct logfs_super *s, - struct file_system_type *type, - const char *devname) -{ - return -ENODEV; -} -#endif - -/* dev_mtd.c */ -#if IS_ENABLED(CONFIG_MTD) -int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr); -#else -static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) -{ - return -ENODEV; -} -#endif - -/* dir.c */ -extern const struct inode_operations logfs_dir_iops; -extern const struct file_operations logfs_dir_fops; -int logfs_replay_journal(struct super_block *sb); - -/* file.c */ -extern const struct inode_operations logfs_reg_iops; -extern const struct file_operations logfs_reg_fops; -extern const struct address_space_operations logfs_reg_aops; -int logfs_readpage(struct file *file, struct page *page); -long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync); - -/* gc.c */ -u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec); -void logfs_gc_pass(struct super_block *sb); -int logfs_check_areas(struct super_block *sb); -int logfs_init_gc(struct super_block *sb); -void logfs_cleanup_gc(struct super_block *sb); - -/* inode.c */ -extern const struct super_operations logfs_super_operations; -struct inode *logfs_iget(struct super_block *sb, ino_t ino); -struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *cookie); -void logfs_safe_iput(struct inode *inode, int cookie); -struct inode *logfs_new_inode(struct inode *dir, umode_t mode); -struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino); -struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino); -int logfs_init_inode_cache(void); -void logfs_destroy_inode_cache(void); -void logfs_set_blocks(struct inode *inode, u64 no); -/* these logically belong into inode.c but actually reside in readwrite.c */ -int logfs_read_inode(struct inode *inode); -int __logfs_write_inode(struct inode *inode, struct page *, long flags); -void logfs_evict_inode(struct inode *inode); - -/* journal.c */ -void logfs_write_anchor(struct super_block *sb); -int logfs_init_journal(struct super_block *sb); -void logfs_cleanup_journal(struct super_block *sb); -int write_alias_journal(struct super_block *sb, u64 ino, u64 bix, - level_t level, int child_no, __be64 val); -void 
do_logfs_journal_wl_pass(struct super_block *sb); - -/* readwrite.c */ -pgoff_t logfs_pack_index(u64 bix, level_t level); -void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level); -int logfs_inode_write(struct inode *inode, const void *buf, size_t count, - loff_t bix, long flags, struct shadow_tree *shadow_tree); -int logfs_readpage_nolock(struct page *page); -int logfs_write_buf(struct inode *inode, struct page *page, long flags); -int logfs_delete(struct inode *inode, pgoff_t index, - struct shadow_tree *shadow_tree); -int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs, - gc_level_t gc_level, long flags); -int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix, - gc_level_t gc_level); -int logfs_truncate(struct inode *inode, u64 size); -u64 logfs_seek_hole(struct inode *inode, u64 bix); -u64 logfs_seek_data(struct inode *inode, u64 bix); -int logfs_open_segfile(struct super_block *sb); -int logfs_init_rw(struct super_block *sb); -void logfs_cleanup_rw(struct super_block *sb); -void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta); -void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta); -void logfs_write_block(struct logfs_block *block, long flags); -int logfs_write_obj_aliases_pagecache(struct super_block *sb); -void logfs_get_segment_entry(struct super_block *sb, u32 segno, - struct logfs_segment_entry *se); -void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment); -void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec, - gc_level_t gc_level); -void logfs_set_segment_reserved(struct super_block *sb, u32 segno); -void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec); -struct logfs_block *__alloc_block(struct super_block *sb, - u64 ino, u64 bix, level_t level); -void __free_block(struct super_block *sb, struct logfs_block *block); -void btree_write_block(struct logfs_block *block); -void initialize_block_counters(struct page *page, struct logfs_block *block, - __be64 *array, int page_is_empty); -int logfs_exist_block(struct inode *inode, u64 bix); -int get_page_reserve(struct inode *inode, struct page *page); -void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock); -void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock); -extern const struct logfs_block_ops indirect_block_ops; - -/* segment.c */ -int logfs_erase_segment(struct super_block *sb, u32 ofs, int ensure_erase); -int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf); -int logfs_segment_read(struct inode *inode, struct page *page, u64 ofs, u64 bix, - level_t level); -int logfs_segment_write(struct inode *inode, struct page *page, - struct logfs_shadow *shadow); -int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow); -int logfs_load_object_aliases(struct super_block *sb, - struct logfs_obj_alias *oa, int count); -void move_page_to_btree(struct page *page); -int logfs_init_mapping(struct super_block *sb); -void logfs_sync_area(struct logfs_area *area); -void logfs_sync_segments(struct super_block *sb); -void freeseg(struct super_block *sb, u32 segno); -void free_areas(struct super_block *sb); - -/* area handling */ -int logfs_init_areas(struct super_block *sb); -void logfs_cleanup_areas(struct super_block *sb); -int logfs_open_area(struct logfs_area *area, size_t bytes); -int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len, - int use_filler); - -static inline int 
logfs_buf_write(struct logfs_area *area, u64 ofs, - void *buf, size_t len) -{ - return __logfs_buf_write(area, ofs, buf, len, 0); -} - -static inline int logfs_buf_recover(struct logfs_area *area, u64 ofs, - void *buf, size_t len) -{ - return __logfs_buf_write(area, ofs, buf, len, 1); -} - -/* super.c */ -struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index); -void emergency_read_end(struct page *page); -void logfs_crash_dump(struct super_block *sb); -int logfs_statfs(struct dentry *dentry, struct kstatfs *stats); -int logfs_check_ds(struct logfs_disk_super *ds); -int logfs_write_sb(struct super_block *sb); - -static inline struct logfs_super *logfs_super(struct super_block *sb) -{ - return sb->s_fs_info; -} - -static inline struct logfs_inode *logfs_inode(struct inode *inode) -{ - return container_of(inode, struct logfs_inode, vfs_inode); -} - -static inline void logfs_set_ro(struct super_block *sb) -{ - logfs_super(sb)->s_flags |= LOGFS_SB_FLAG_RO; -} - -#define LOGFS_BUG(sb) do { \ - struct super_block *__sb = sb; \ - logfs_crash_dump(__sb); \ - logfs_super(__sb)->s_flags |= LOGFS_SB_FLAG_RO; \ - BUG(); \ -} while (0) - -#define LOGFS_BUG_ON(condition, sb) \ - do { if (unlikely(condition)) LOGFS_BUG((sb)); } while (0) - -static inline __be32 logfs_crc32(void *data, size_t len, size_t skip) -{ - return cpu_to_be32(crc32(~0, data+skip, len-skip)); -} - -static inline u8 logfs_type(struct inode *inode) -{ - return (inode->i_mode >> 12) & 15; -} - -static inline pgoff_t logfs_index(struct super_block *sb, u64 pos) -{ - return pos >> sb->s_blocksize_bits; -} - -static inline u64 dev_ofs(struct super_block *sb, u32 segno, u32 ofs) -{ - return ((u64)segno << logfs_super(sb)->s_segshift) + ofs; -} - -static inline u32 seg_no(struct super_block *sb, u64 ofs) -{ - return ofs >> logfs_super(sb)->s_segshift; -} - -static inline u32 seg_ofs(struct super_block *sb, u64 ofs) -{ - return ofs & logfs_super(sb)->s_segmask; -} - -static inline u64 seg_align(struct super_block *sb, u64 ofs) -{ - return ofs & ~logfs_super(sb)->s_segmask; -} - -static inline struct logfs_block *logfs_block(struct page *page) -{ - return (void *)page->private; -} - -static inline level_t shrink_level(gc_level_t __level) -{ - u8 level = (__force u8)__level; - - if (level >= LOGFS_MAX_LEVELS) - level -= LOGFS_MAX_LEVELS; - return (__force level_t)level; -} - -static inline gc_level_t expand_level(u64 ino, level_t __level) -{ - u8 level = (__force u8)__level; - - if (ino == LOGFS_INO_MASTER) { - /* ifile has separate areas */ - level += LOGFS_MAX_LEVELS; - } - return (__force gc_level_t)level; -} - -static inline int logfs_block_shift(struct super_block *sb, level_t level) -{ - level = shrink_level((__force gc_level_t)level); - return (__force int)level * (sb->s_blocksize_bits - 3); -} - -static inline u64 logfs_block_mask(struct super_block *sb, level_t level) -{ - return ~0ull << logfs_block_shift(sb, level); -} - -static inline struct logfs_area *get_area(struct super_block *sb, - gc_level_t gc_level) -{ - return logfs_super(sb)->s_area[(__force u8)gc_level]; -} - -static inline void logfs_mempool_destroy(mempool_t *pool) -{ - if (pool) - mempool_destroy(pool); -} - -#endif diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h deleted file mode 100644 index ae960519c54a..000000000000 --- a/fs/logfs/logfs_abi.h +++ /dev/null @@ -1,629 +0,0 @@ -/* - * fs/logfs/logfs_abi.h - * - * As should be obvious for Linux kernel code, license is GPLv2 - * - * Copyright (c) 2005-2008 Joern Engel 
<joern@logfs.org>
- *
- * Public header for logfs.
- */
-#ifndef FS_LOGFS_LOGFS_ABI_H
-#define FS_LOGFS_LOGFS_ABI_H
-
-/* For out-of-kernel compiles */
-#ifndef BUILD_BUG_ON
-#define BUILD_BUG_ON(condition) /**/
-#endif
-
-#define SIZE_CHECK(type, size) \
-static inline void check_##type(void) \
-{ \
-        BUILD_BUG_ON(sizeof(struct type) != (size)); \
-}
-
-/*
- * Throughout the logfs code, we're constantly dealing with blocks at
- * various positions or offsets. To remove confusion, we strictly
- * distinguish between a "position" - the logical position within a
- * file and an "offset" - the physical location within the device.
- *
- * Any usage of the term offset for a logical location or position for
- * a physical one is a bug and should get fixed.
- */
-
-/*
- * Blocks are allocated in one of several segments depending on their
- * level. The following levels are used:
- * 0 - regular data block
- * 1 - i1 indirect blocks
- * 2 - i2 indirect blocks
- * 3 - i3 indirect blocks
- * 4 - i4 indirect blocks
- * 5 - i5 indirect blocks
- * 6 - ifile data blocks
- * 7 - ifile i1 indirect blocks
- * 8 - ifile i2 indirect blocks
- * 9 - ifile i3 indirect blocks
- * 10 - ifile i4 indirect blocks
- * 11 - ifile i5 indirect blocks
- * Potential levels to be used in the future:
- * 12 - gc recycled blocks, long-lived data
- * 13 - replacement blocks, short-lived data
- *
- * Levels 1-11 are necessary for robust gc operations and help separate
- * short-lived metadata from longer-lived file data. In the future,
- * file data should get separated into several segments based on simple
- * heuristics. Old data recycled during gc operation is expected to be
- * long-lived. New data is of uncertain life expectancy. New data
- * used to replace older blocks in existing files is expected to be
- * short-lived.
- */
-
-
-/* Magic numbers. 64bit for superblock, 32bit for statfs f_type */
-#define LOGFS_MAGIC 0x7a3a8e5cb9d5bf67ull
-#define LOGFS_MAGIC_U32 0xc97e8168u
-
-/*
- * Various blocksize related macros. Blocksize is currently fixed at 4KiB.
- * Sooner or later that should become configurable and the macros replaced
- * by something superblock-dependent. Pointers in indirect blocks are and
- * will remain 64bit.
- *
- * LOGFS_BLOCKSIZE - self-explaining
- * LOGFS_BLOCK_FACTOR - number of pointers per indirect block
- * LOGFS_BLOCK_BITS - log2 of LOGFS_BLOCK_FACTOR, used for shifts
- */
-#define LOGFS_BLOCKSIZE (4096ull)
-#define LOGFS_BLOCK_FACTOR (LOGFS_BLOCKSIZE / sizeof(u64))
-#define LOGFS_BLOCK_BITS (9)
-
-/*
- * Number of blocks at various levels of indirection. There are 16 direct
- * block pointers plus a single indirect pointer.
- */
-#define I0_BLOCKS (16)
-#define I1_BLOCKS LOGFS_BLOCK_FACTOR
-#define I2_BLOCKS (LOGFS_BLOCK_FACTOR * I1_BLOCKS)
-#define I3_BLOCKS (LOGFS_BLOCK_FACTOR * I2_BLOCKS)
-#define I4_BLOCKS (LOGFS_BLOCK_FACTOR * I3_BLOCKS)
-#define I5_BLOCKS (LOGFS_BLOCK_FACTOR * I4_BLOCKS)
-
-#define INDIRECT_INDEX I0_BLOCKS
-#define LOGFS_EMBEDDED_FIELDS (I0_BLOCKS + 1)
-
-/*
- * Sizes at which files require another level of indirection. Files smaller
- * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself,
- * similar to ext2 fast symlinks.
- *
- * Data at a position smaller than LOGFS_I0_SIZE is accessed through the
- * direct pointers, else through the 1x indirect pointer and so forth.
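To make the size thresholds concrete, a small sketch that derives them from the constants above (userspace C, uint64_t in place of u64):

#include <stdio.h>
#include <stdint.h>

#define BLOCKSIZE 4096ULL
#define FACTOR (BLOCKSIZE / sizeof(uint64_t))   /* 512 pointers per block */
#define I0_BLOCKS 16ULL

int main(void)
{
        uint64_t blocks = I0_BLOCKS;
        int level;

        /* prints 64KiB, 2MiB, 1GiB, 512GiB, ... */
        for (level = 0; level <= 5; level++) {
                printf("LOGFS_I%d_SIZE = %llu bytes\n", level,
                       (unsigned long long)(blocks * BLOCKSIZE));
                blocks = level ? blocks * FACTOR : FACTOR;
        }
        return 0;
}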
- */
-#define LOGFS_EMBEDDED_SIZE (LOGFS_EMBEDDED_FIELDS * sizeof(u64))
-#define LOGFS_I0_SIZE (I0_BLOCKS * LOGFS_BLOCKSIZE)
-#define LOGFS_I1_SIZE (I1_BLOCKS * LOGFS_BLOCKSIZE)
-#define LOGFS_I2_SIZE (I2_BLOCKS * LOGFS_BLOCKSIZE)
-#define LOGFS_I3_SIZE (I3_BLOCKS * LOGFS_BLOCKSIZE)
-#define LOGFS_I4_SIZE (I4_BLOCKS * LOGFS_BLOCKSIZE)
-#define LOGFS_I5_SIZE (I5_BLOCKS * LOGFS_BLOCKSIZE)
-
-/*
- * Each indirect block pointer must have this flag set, if all block pointers
- * behind it are set, i.e. there is no hole hidden in the shadow of this
- * indirect block pointer.
- */
-#define LOGFS_FULLY_POPULATED (1ULL << 63)
-#define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED)
-
-/*
- * LogFS needs to separate data into levels. Each level is defined as the
- * maximal possible distance from the master inode (inode of the inode file).
- * Data blocks reside on level 0, 1x indirect block on level 1, etc.
- * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11.
- * This effort is necessary to guarantee that garbage collection always makes
- * progress.
- *
- * LOGFS_MAX_INDIRECT is the maximal indirection through indirect blocks,
- * LOGFS_MAX_LEVELS is one more for the actual data level of a file. It is
- * the maximal number of levels for one file.
- * LOGFS_NO_AREAS is twice that, as the inode file and regular files are
- * effectively stacked on top of each other.
- */
-#define LOGFS_MAX_INDIRECT (5)
-#define LOGFS_MAX_LEVELS (LOGFS_MAX_INDIRECT + 1)
-#define LOGFS_NO_AREAS (2 * LOGFS_MAX_LEVELS)
-
-/* Maximum size of filenames */
-#define LOGFS_MAX_NAMELEN (255)
-
-/* Number of segments in the primary journal. */
-#define LOGFS_JOURNAL_SEGS (16)
-
-/* Maximum number of free/erased/etc. segments in journal entries */
-#define MAX_CACHED_SEGS (64)
-
-
-/*
- * LOGFS_OBJECT_HEADERSIZE is the size of a single header in the object store,
- * LOGFS_MAX_OBJECTSIZE the size of the largest possible object, including
- * its header,
- * LOGFS_SEGMENT_RESERVE is the amount of space reserved for each segment for
- * its segment header and the padded space at the end when no further objects
- * fit.
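LOGFS_FULLY_POPULATED above is a tag stored in the high bit of an on-medium block pointer, and pure_ofs() strips it. A tiny self-check of that convention (the sample offset is arbitrary):

#include <assert.h>
#include <stdint.h>

#define FULLY_POPULATED (1ULL << 63)
#define pure_ofs(ofs) ((ofs) & ~FULLY_POPULATED)

int main(void)
{
        uint64_t ofs = 0x123456ULL | FULLY_POPULATED;   /* tagged pointer */

        assert(ofs & FULLY_POPULATED);          /* tag is visible */
        assert(pure_ofs(ofs) == 0x123456ULL);   /* payload survives */
        return 0;
}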
- */
-#define LOGFS_OBJECT_HEADERSIZE (0x1c)
-#define LOGFS_SEGMENT_HEADERSIZE (0x18)
-#define LOGFS_MAX_OBJECTSIZE (LOGFS_OBJECT_HEADERSIZE + LOGFS_BLOCKSIZE)
-#define LOGFS_SEGMENT_RESERVE \
-        (LOGFS_SEGMENT_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1)
-
-/*
- * Segment types:
- * SEG_SUPER - superblock segment
- * SEG_JOURNAL - journal segment
- * SEG_OSTORE - object store segment
- */
-enum {
-        SEG_SUPER = 0x01,
-        SEG_JOURNAL = 0x02,
-        SEG_OSTORE = 0x03,
-};
-
-/**
- * struct logfs_segment_header - per-segment header in the ostore
- *
- * @crc: crc32 of header (there is no data)
- * @pad: unused, must be 0
- * @type: segment type, see above
- * @level: GC level for all objects in this segment
- * @segno: segment number
- * @ec: erase count for this segment
- * @gec: global erase count at time of writing
- */
-struct logfs_segment_header {
-        __be32 crc;
-        __be16 pad;
-        __u8 type;
-        __u8 level;
-        __be32 segno;
-        __be32 ec;
-        __be64 gec;
-};
-
-SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE);
-
-#define LOGFS_FEATURES_INCOMPAT (0ull)
-#define LOGFS_FEATURES_RO_COMPAT (0ull)
-#define LOGFS_FEATURES_COMPAT (0ull)
-
-/**
- * struct logfs_disk_super - on-medium superblock
- *
- * @ds_magic: magic number, must equal LOGFS_MAGIC
- * @ds_crc: crc32 of structure starting with the next field
- * @ds_ifile_levels: maximum number of levels for ifile
- * @ds_iblock_levels: maximum number of levels for regular files
- * @ds_data_levels: number of separate levels for data
- * @ds_segment_shift: log2 of segment size
- * @ds_block_shift: log2 of block size
- * @ds_write_shift: log2 of write size
- * @pad0: reserved, must be 0
- * @ds_filesystem_size: size of the filesystem in bytes
- * @ds_segment_size: segment size in bytes
- * @ds_bad_seg_reserve: number of segments reserved to handle bad blocks
- * @ds_feature_incompat: incompatible filesystem features
- * @ds_feature_ro_compat: read-only compatible filesystem features
- * @ds_feature_compat: compatible filesystem features
- * @ds_feature_flags: filesystem feature flags
- * @ds_root_reserve: bytes reserved for the superuser
- * @ds_speed_reserve: bytes reserved to speed up GC
- * @ds_journal_seg: segments used by primary journal
- * @ds_super_ofs: device offsets of the two superblocks
- * @pad3: reserved, must be 0
- *
- * Contains only read-only fields. Read-write fields like the amount of used
- * space are tracked in the dynamic superblock, which is stored in the journal.
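SIZE_CHECK() (defined earlier in this header) turns layout drift into a build failure via BUILD_BUG_ON. Outside the kernel, C11 gives the same guarantee directly; a sketch mirroring logfs_segment_header with fixed-width stand-ins for the __beN types:

#include <stdint.h>

struct seg_hdr {                /* mirrors struct logfs_segment_header */
        uint32_t crc;
        uint16_t pad;
        uint8_t type;
        uint8_t level;
        uint32_t segno;
        uint32_t ec;
        uint64_t gec;
};

/* the SIZE_CHECK() guarantee, 0x18 == 24 bytes, no hidden padding */
_Static_assert(sizeof(struct seg_hdr) == 24, "on-disk layout drifted");

int main(void) { return 0; }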
- */ -struct logfs_disk_super { - struct logfs_segment_header ds_sh; - __be64 ds_magic; - - __be32 ds_crc; - __u8 ds_ifile_levels; - __u8 ds_iblock_levels; - __u8 ds_data_levels; - __u8 ds_segment_shift; - __u8 ds_block_shift; - __u8 ds_write_shift; - __u8 pad0[6]; - - __be64 ds_filesystem_size; - __be32 ds_segment_size; - __be32 ds_bad_seg_reserve; - - __be64 ds_feature_incompat; - __be64 ds_feature_ro_compat; - - __be64 ds_feature_compat; - __be64 ds_feature_flags; - - __be64 ds_root_reserve; - __be64 ds_speed_reserve; - - __be32 ds_journal_seg[LOGFS_JOURNAL_SEGS]; - - __be64 ds_super_ofs[2]; - __be64 pad3[8]; -}; - -SIZE_CHECK(logfs_disk_super, 256); - -/* - * Object types: - * OBJ_BLOCK - Data or indirect block - * OBJ_INODE - Inode - * OBJ_DENTRY - Dentry - */ -enum { - OBJ_BLOCK = 0x04, - OBJ_INODE = 0x05, - OBJ_DENTRY = 0x06, -}; - -/** - * struct logfs_object_header - per-object header in the ostore - * - * @crc: crc32 of header, excluding data_crc - * @len: length of data - * @type: object type, see above - * @compr: compression type - * @ino: inode number - * @bix: block index - * @data_crc: crc32 of payload - */ -struct logfs_object_header { - __be32 crc; - __be16 len; - __u8 type; - __u8 compr; - __be64 ino; - __be64 bix; - __be32 data_crc; -} __attribute__((packed)); - -SIZE_CHECK(logfs_object_header, LOGFS_OBJECT_HEADERSIZE); - -/* - * Reserved inode numbers: - * LOGFS_INO_MASTER - master inode (for inode file) - * LOGFS_INO_ROOT - root directory - * LOGFS_INO_SEGFILE - per-segment used bytes and erase count - */ -enum { - LOGFS_INO_MAPPING = 0x00, - LOGFS_INO_MASTER = 0x01, - LOGFS_INO_ROOT = 0x02, - LOGFS_INO_SEGFILE = 0x03, - LOGFS_RESERVED_INOS = 0x10, -}; - -/* - * Inode flags. High bits should never be written to the medium. They are - * reserved for in-memory usage. - * Low bits should either remain in sync with the corresponding FS_*_FL or - * reuse slots that obviously don't make sense for logfs. 
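Both ds_crc and the object header crc documented above are computed over the structure minus the bytes that hold the checksum itself, which is what the logfs_crc32(data, len, skip) helper in logfs.h expresses. A userspace sketch using zlib's crc32() (link with -lz; the ~0 seed copies the kernel helper, and the buffer contents are arbitrary):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <zlib.h>

/* checksum everything after the first `skip` bytes */
static uint32_t crc_skip(const void *data, size_t len, size_t skip)
{
        return (uint32_t)crc32(~0u, (const unsigned char *)data + skip,
                               (unsigned int)(len - skip));
}

int main(void)
{
        unsigned char hdr[24];

        memset(hdr, 0xab, sizeof(hdr));
        /* skip the leading 4-byte crc field, as logfs does */
        printf("crc = %08x\n", crc_skip(hdr, sizeof(hdr), 4));
        return 0;
}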
- *
- * LOGFS_IF_DIRTY Inode must be written back
- * LOGFS_IF_ZOMBIE Inode has been deleted
- * LOGFS_IF_STILLBORN -ENOSPC happened when creating inode
- */
-#define LOGFS_IF_COMPRESSED 0x00000004 /* == FS_COMPR_FL */
-#define LOGFS_IF_DIRTY 0x20000000
-#define LOGFS_IF_ZOMBIE 0x40000000
-#define LOGFS_IF_STILLBORN 0x80000000
-
-/* Flags available to chattr */
-#define LOGFS_FL_USER_VISIBLE (LOGFS_IF_COMPRESSED)
-#define LOGFS_FL_USER_MODIFIABLE (LOGFS_IF_COMPRESSED)
-/* Flags inherited from parent directory on file/directory creation */
-#define LOGFS_FL_INHERITED (LOGFS_IF_COMPRESSED)
-
-/**
- * struct logfs_disk_inode - on-medium inode
- *
- * @di_mode: file mode
- * @di_height: height of the inode's block tree
- * @di_pad: reserved, must be 0
- * @di_flags: inode flags, see above
- * @di_uid: user id
- * @di_gid: group id
- * @di_ctime: change time
- * @di_mtime: modify time
- * @di_atime: access time
- * @di_refcount: reference count (aka nlink or link count)
- * @di_generation: inode generation, for nfs
- * @di_used_bytes: number of bytes used
- * @di_size: file size
- * @di_data: data pointers
- */
-struct logfs_disk_inode {
-        __be16 di_mode;
-        __u8 di_height;
-        __u8 di_pad;
-        __be32 di_flags;
-        __be32 di_uid;
-        __be32 di_gid;
-
-        __be64 di_ctime;
-        __be64 di_mtime;
-
-        __be64 di_atime;
-        __be32 di_refcount;
-        __be32 di_generation;
-
-        __be64 di_used_bytes;
-        __be64 di_size;
-
-        __be64 di_data[LOGFS_EMBEDDED_FIELDS];
-};
-
-SIZE_CHECK(logfs_disk_inode, 200);
-
-#define INODE_POINTER_OFS \
-        (offsetof(struct logfs_disk_inode, di_data) / sizeof(__be64))
-#define INODE_USED_OFS \
-        (offsetof(struct logfs_disk_inode, di_used_bytes) / sizeof(__be64))
-#define INODE_SIZE_OFS \
-        (offsetof(struct logfs_disk_inode, di_size) / sizeof(__be64))
-#define INODE_HEIGHT_OFS (0)
-
-/**
- * struct logfs_disk_dentry - on-medium dentry structure
- *
- * @ino: inode number
- * @namelen: length of file name
- * @type: file type, identical to bits 12..15 of mode
- * @name: file name
- */
-/* FIXME: add 6 bytes of padding to remove the __packed */
-struct logfs_disk_dentry {
-        __be64 ino;
-        __be16 namelen;
-        __u8 type;
-        __u8 name[LOGFS_MAX_NAMELEN];
-} __attribute__((packed));
-
-SIZE_CHECK(logfs_disk_dentry, 266);
-
-#define RESERVED 0xffffffff
-#define BADSEG 0xffffffff
-/**
- * struct logfs_segment_entry - segment file entry
- *
- * @ec_level: erase count and level
- * @valid: number of valid bytes
- *
- * Segment file contains one entry for every segment. ec_level contains the
- * erasecount in the upper 28 bits and the level in the lower 4 bits. An
- * ec_level of BADSEG (-1) identifies bad segments. valid contains the number
- * of valid bytes or RESERVED (-1 again) if the segment is used for either the
- * superblock or the journal, or when the segment is bad.
- */
-struct logfs_segment_entry {
-        __be32 ec_level;
-        __be32 valid;
-};
-
-SIZE_CHECK(logfs_segment_entry, 8);
-
-/**
- * struct logfs_journal_header - header for journal entries (JEs)
- *
- * @h_crc: crc32 of journal entry
- * @h_len: length of compressed journal entry,
- * not including header
- * @h_datalen: length of uncompressed data
- * @h_type: JE type
- * @h_compr: compression type
- * @h_pad: reserved
- */
-struct logfs_journal_header {
-        __be32 h_crc;
-        __be16 h_len;
-        __be16 h_datalen;
-        __be16 h_type;
-        __u8 h_compr;
-        __u8 h_pad[5];
-};
-
-SIZE_CHECK(logfs_journal_header, 16);
-
-/*
- * Life expectancy of data.
- * VIM_DEFAULT - default vim
- * VIM_SEGFILE - for segment file only - very short-living
- * VIM_GC - GC'd data - likely long-living
- */
-enum logfs_vim {
-        VIM_DEFAULT = 0,
-        VIM_SEGFILE = 1,
-};
-
-/**
- * struct logfs_je_area - wbuf header
- *
- * @segno: segment number of area
- * @used_bytes: number of bytes already used
- * @gc_level: GC level
- * @vim: life expectancy of data
- *
- * "Areas" are segments currently being used for writing. There is at least
- * one area per GC level. Several may be used to separate long-living from
- * short-living data. If an area with unknown vim is encountered, it can
- * simply be closed.
- * The write buffer immediately follows this header.
- */
-struct logfs_je_area {
-        __be32 segno;
-        __be32 used_bytes;
-        __u8 gc_level;
-        __u8 vim;
-} __attribute__((packed));
-
-SIZE_CHECK(logfs_je_area, 10);
-
-#define MAX_JOURNAL_HEADER \
-        (sizeof(struct logfs_journal_header) + sizeof(struct logfs_je_area))
-
-/**
- * struct logfs_je_dynsb - dynamic superblock
- *
- * @ds_gec: global erase count
- * @ds_sweeper: current position of GC "sweeper"
- * @ds_rename_dir: source directory ino (see dir.c documentation)
- * @ds_rename_pos: position of source dd (see dir.c documentation)
- * @ds_victim_ino: victims of incomplete dir operation (see dir.c)
- * @ds_victim_parent: parent inode of victim (see dir.c)
- * @ds_used_bytes: number of used bytes
- */
-struct logfs_je_dynsb {
-        __be64 ds_gec;
-        __be64 ds_sweeper;
-
-        __be64 ds_rename_dir;
-        __be64 ds_rename_pos;
-
-        __be64 ds_victim_ino;
-        __be64 ds_victim_parent; /* XXX */
-
-        __be64 ds_used_bytes;
-        __be32 ds_generation;
-        __be32 pad;
-};
-
-SIZE_CHECK(logfs_je_dynsb, 64);
-
-/**
- * struct logfs_je_anchor - anchor of filesystem tree, aka master inode
- *
- * @da_size: size of inode file
- * @da_last_ino: last created inode
- * @da_used_bytes: number of bytes used
- * @da_data: data pointers
- */
-struct logfs_je_anchor {
-        __be64 da_size;
-        __be64 da_last_ino;
-
-        __be64 da_used_bytes;
-        u8 da_height;
-        u8 pad[7];
-
-        __be64 da_data[LOGFS_EMBEDDED_FIELDS];
-};
-
-SIZE_CHECK(logfs_je_anchor, 168);
-
-/**
- * struct logfs_je_spillout - spillout entry (from 1st to 2nd journal)
- *
- * @so_segment: segments used for 2nd journal
- *
- * Length of the array is given by h_len field in the header.
- */
-struct logfs_je_spillout {
-        __be64 so_segment[0];
-};
-
-SIZE_CHECK(logfs_je_spillout, 0);
-
-/**
- * struct logfs_je_journal_ec - erase counts for all journal segments
- *
- * @ec: erase count
- *
- * Length of the array is given by h_len field in the header.
- */
-struct logfs_je_journal_ec {
-        __be32 ec[0];
-};
-
-SIZE_CHECK(logfs_je_journal_ec, 0);
-
-/**
- * struct logfs_je_free_segments - list of free segments with erase count
- */
-struct logfs_je_free_segments {
-        __be32 segno;
-        __be32 ec;
-};
-
-SIZE_CHECK(logfs_je_free_segments, 8);
-
-/**
- * struct logfs_seg_alias - list of segment aliases
- */
-struct logfs_seg_alias {
-        __be32 old_segno;
-        __be32 new_segno;
-};
-
-SIZE_CHECK(logfs_seg_alias, 8);
-
-/**
- * struct logfs_obj_alias - list of object aliases
- */
-struct logfs_obj_alias {
-        __be64 ino;
-        __be64 bix;
-        __be64 val;
-        u8 level;
-        u8 pad[5];
-        __be16 child_no;
-};
-
-SIZE_CHECK(logfs_obj_alias, 32);
-
-/**
- * Compression types.
- *
- * COMPR_NONE - uncompressed
- * COMPR_ZLIB - compressed with zlib
- */
-enum {
-        COMPR_NONE = 0,
-        COMPR_ZLIB = 1,
-};
-
-/*
- * Journal entries come in groups of 16. First group contains unique
- * entries, next groups contain one entry per level
- *
- * JE_FIRST - smallest possible journal entry number
- *
- * JEG_BASE - base group, containing unique entries
- * JE_COMMIT - commit entry, validates all previous entries
- * JE_DYNSB - dynamic superblock, anything that ought to be in the
- * superblock but cannot because it is read-write data
- * JE_ANCHOR - anchor aka master inode aka inode file's inode
- * JE_ERASECOUNT - erasecounts for all journal segments
- * JE_SPILLOUT - unused
- * JE_OBJ_ALIAS - object aliases
- * JE_AREA - area description
- *
- * JE_LAST - largest possible journal entry number
- */
-enum {
-        JE_FIRST = 0x01,
-
-        JEG_BASE = 0x00,
-        JE_COMMIT = 0x02,
-        JE_DYNSB = 0x03,
-        JE_ANCHOR = 0x04,
-        JE_ERASECOUNT = 0x05,
-        JE_SPILLOUT = 0x06,
-        JE_OBJ_ALIAS = 0x0d,
-        JE_AREA = 0x0e,
-
-        JE_LAST = 0x0e,
-};
-
-#endif
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
deleted file mode 100644
index bf19bf4a243f..000000000000
--- a/fs/logfs/readwrite.c
+++ /dev/null
@@ -1,2298 +0,0 @@
-/*
- * fs/logfs/readwrite.c
- *
- * As should be obvious for Linux kernel code, license is GPLv2
- *
- * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
- *
- *
- * Actually contains eight sets of very similar functions:
- * read read blocks from a file
- * seek_hole find next hole
- * seek_data find next data block
- * valid check whether a block still belongs to a file
- * write write blocks to a file
- * delete delete a block (for directories and ifile)
- * rewrite move existing blocks of a file to a new location (gc helper)
- * truncate truncate a file
- */
-#include "logfs.h"
-#include <linux/sched.h>
-#include <linux/slab.h>
-
-static u64 adjust_bix(u64 bix, level_t level)
-{
-        switch (level) {
-        case 0:
-                return bix;
-        case LEVEL(1):
-                return max_t(u64, bix, I0_BLOCKS);
-        case LEVEL(2):
-                return max_t(u64, bix, I1_BLOCKS);
-        case LEVEL(3):
-                return max_t(u64, bix, I2_BLOCKS);
-        case LEVEL(4):
-                return max_t(u64, bix, I3_BLOCKS);
-        case LEVEL(5):
-                return max_t(u64, bix, I4_BLOCKS);
-        default:
-                WARN_ON(1);
-                return bix;
-        }
-}
-
-static inline u64 maxbix(u8 height)
-{
-        return 1ULL << (LOGFS_BLOCK_BITS * height);
-}
-
-/**
- * The inode address space is cut in two halves. Lower half belongs to data
- * pages, upper half to indirect blocks. If the high bit (INDIRECT_BIT) is
- * set, the actual block index (bix) and level can be derived from the page
- * index.
- *
- * The lowest three bits of the block index are set to 0 after packing and
- * unpacking. Since the lowest n bits (9 for 4KiB blocksize) are ignored
- * anyway, this is harmless.
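A standalone model of the page-index packing just described, assuming 64-bit page offsets (so ARCH_SHIFT is 32); the constants mirror the #defines that follow:

#include <assert.h>
#include <stdint.h>

#define BLOCK_BITS 9
#define ARCH_SHIFT 32                           /* 64-bit pgoff_t assumed */
#define INDIRECT_BIT (0x80000000ULL << ARCH_SHIFT)
#define LEVEL_SHIFT (28 + ARCH_SHIFT)

static uint64_t pack_index(uint64_t bix, unsigned int level)
{
        if (level == 0)
                return bix;
        return INDIRECT_BIT | ((uint64_t)level << LEVEL_SHIFT)
                            | (bix >> (level * BLOCK_BITS));
}

int main(void)
{
        uint64_t index = pack_index(0x12345678, 2);

        assert(index & INDIRECT_BIT);                           /* upper half */
        assert(((index & ~INDIRECT_BIT) >> LEVEL_SHIFT) == 2);  /* level */
        assert(pack_index(42, 0) == 42);        /* data pages map 1:1 */
        return 0;
}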
- */ -#define ARCH_SHIFT (BITS_PER_LONG - 32) -#define INDIRECT_BIT (0x80000000UL << ARCH_SHIFT) -#define LEVEL_SHIFT (28 + ARCH_SHIFT) -static inline pgoff_t first_indirect_block(void) -{ - return INDIRECT_BIT | (1ULL << LEVEL_SHIFT); -} - -pgoff_t logfs_pack_index(u64 bix, level_t level) -{ - pgoff_t index; - - BUG_ON(bix >= INDIRECT_BIT); - if (level == 0) - return bix; - - index = INDIRECT_BIT; - index |= (__force long)level << LEVEL_SHIFT; - index |= bix >> ((__force u8)level * LOGFS_BLOCK_BITS); - return index; -} - -void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level) -{ - u8 __level; - - if (!(index & INDIRECT_BIT)) { - *bix = index; - *level = 0; - return; - } - - __level = (index & ~INDIRECT_BIT) >> LEVEL_SHIFT; - *level = LEVEL(__level); - *bix = (index << (__level * LOGFS_BLOCK_BITS)) & ~INDIRECT_BIT; - *bix = adjust_bix(*bix, *level); - return; -} -#undef ARCH_SHIFT -#undef INDIRECT_BIT -#undef LEVEL_SHIFT - -/* - * Time is stored as nanoseconds since the epoch. - */ -static struct timespec be64_to_timespec(__be64 betime) -{ - return ns_to_timespec(be64_to_cpu(betime)); -} - -static __be64 timespec_to_be64(struct timespec tsp) -{ - return cpu_to_be64((u64)tsp.tv_sec * NSEC_PER_SEC + tsp.tv_nsec); -} - -static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode*inode) -{ - struct logfs_inode *li = logfs_inode(inode); - int i; - - inode->i_mode = be16_to_cpu(di->di_mode); - li->li_height = di->di_height; - li->li_flags = be32_to_cpu(di->di_flags); - i_uid_write(inode, be32_to_cpu(di->di_uid)); - i_gid_write(inode, be32_to_cpu(di->di_gid)); - inode->i_size = be64_to_cpu(di->di_size); - logfs_set_blocks(inode, be64_to_cpu(di->di_used_bytes)); - inode->i_atime = be64_to_timespec(di->di_atime); - inode->i_ctime = be64_to_timespec(di->di_ctime); - inode->i_mtime = be64_to_timespec(di->di_mtime); - set_nlink(inode, be32_to_cpu(di->di_refcount)); - inode->i_generation = be32_to_cpu(di->di_generation); - - switch (inode->i_mode & S_IFMT) { - case S_IFSOCK: /* fall through */ - case S_IFBLK: /* fall through */ - case S_IFCHR: /* fall through */ - case S_IFIFO: - inode->i_rdev = be64_to_cpu(di->di_data[0]); - break; - case S_IFDIR: /* fall through */ - case S_IFREG: /* fall through */ - case S_IFLNK: - for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) - li->li_data[i] = be64_to_cpu(di->di_data[i]); - break; - default: - BUG(); - } -} - -static void logfs_inode_to_disk(struct inode *inode, struct logfs_disk_inode*di) -{ - struct logfs_inode *li = logfs_inode(inode); - int i; - - di->di_mode = cpu_to_be16(inode->i_mode); - di->di_height = li->li_height; - di->di_pad = 0; - di->di_flags = cpu_to_be32(li->li_flags); - di->di_uid = cpu_to_be32(i_uid_read(inode)); - di->di_gid = cpu_to_be32(i_gid_read(inode)); - di->di_size = cpu_to_be64(i_size_read(inode)); - di->di_used_bytes = cpu_to_be64(li->li_used_bytes); - di->di_atime = timespec_to_be64(inode->i_atime); - di->di_ctime = timespec_to_be64(inode->i_ctime); - di->di_mtime = timespec_to_be64(inode->i_mtime); - di->di_refcount = cpu_to_be32(inode->i_nlink); - di->di_generation = cpu_to_be32(inode->i_generation); - - switch (inode->i_mode & S_IFMT) { - case S_IFSOCK: /* fall through */ - case S_IFBLK: /* fall through */ - case S_IFCHR: /* fall through */ - case S_IFIFO: - di->di_data[0] = cpu_to_be64(inode->i_rdev); - break; - case S_IFDIR: /* fall through */ - case S_IFREG: /* fall through */ - case S_IFLNK: - for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) - di->di_data[i] = cpu_to_be64(li->li_data[i]); - break; - default: - 
BUG(); - } -} - -static void __logfs_set_blocks(struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - struct logfs_inode *li = logfs_inode(inode); - - inode->i_blocks = ULONG_MAX; - if (li->li_used_bytes >> sb->s_blocksize_bits < ULONG_MAX) - inode->i_blocks = ALIGN(li->li_used_bytes, 512) >> 9; -} - -void logfs_set_blocks(struct inode *inode, u64 bytes) -{ - struct logfs_inode *li = logfs_inode(inode); - - li->li_used_bytes = bytes; - __logfs_set_blocks(inode); -} - -static void prelock_page(struct super_block *sb, struct page *page, int lock) -{ - struct logfs_super *super = logfs_super(sb); - - BUG_ON(!PageLocked(page)); - if (lock) { - BUG_ON(PagePreLocked(page)); - SetPagePreLocked(page); - } else { - /* We are in GC path. */ - if (PagePreLocked(page)) - super->s_lock_count++; - else - SetPagePreLocked(page); - } -} - -static void preunlock_page(struct super_block *sb, struct page *page, int lock) -{ - struct logfs_super *super = logfs_super(sb); - - BUG_ON(!PageLocked(page)); - if (lock) - ClearPagePreLocked(page); - else { - /* We are in GC path. */ - BUG_ON(!PagePreLocked(page)); - if (super->s_lock_count) - super->s_lock_count--; - else - ClearPagePreLocked(page); - } -} - -/* - * Logfs is prone to an AB-BA deadlock where one task tries to acquire - * s_write_mutex with a locked page and GC tries to get that page while holding - * s_write_mutex. - * To solve this issue logfs will ignore the page lock iff the page in question - * is waiting for s_write_mutex. We annotate this fact by setting PG_pre_locked - * in addition to PG_locked. - */ -void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock) -{ - struct logfs_super *super = logfs_super(sb); - - if (page) - prelock_page(sb, page, lock); - - if (lock) { - mutex_lock(&super->s_write_mutex); - logfs_gc_pass(sb); - /* FIXME: We also have to check for shadowed space - * and mempool fill grade */ - } -} - -void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock) -{ - struct logfs_super *super = logfs_super(sb); - - if (page) - preunlock_page(sb, page, lock); - /* Order matters - we must clear PG_pre_locked before releasing - * s_write_mutex or we could race against another task. */ - if (lock) - mutex_unlock(&super->s_write_mutex); -} - -static struct page *logfs_get_read_page(struct inode *inode, u64 bix, - level_t level) -{ - return find_or_create_page(inode->i_mapping, - logfs_pack_index(bix, level), GFP_NOFS); -} - -static void logfs_put_read_page(struct page *page) -{ - unlock_page(page); - put_page(page); -} - -static void logfs_lock_write_page(struct page *page) -{ - int loop = 0; - - while (unlikely(!trylock_page(page))) { - if (loop++ > 0x1000) { - /* Has been observed once so far... */ - printk(KERN_ERR "stack at %p\n", &loop); - BUG(); - } - if (PagePreLocked(page)) { - /* Holder of page lock is waiting for us, it - * is safe to use this page. */ - break; - } - /* Some other process has this page locked and has - * nothing to do with us. Wait for it to finish. 
-                 */
-                schedule();
-        }
-        BUG_ON(!PageLocked(page));
-}
-
-static struct page *logfs_get_write_page(struct inode *inode, u64 bix,
-                level_t level)
-{
-        struct address_space *mapping = inode->i_mapping;
-        pgoff_t index = logfs_pack_index(bix, level);
-        struct page *page;
-        int err;
-
-repeat:
-        page = find_get_page(mapping, index);
-        if (!page) {
-                page = __page_cache_alloc(GFP_NOFS);
-                if (!page)
-                        return NULL;
-                err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS);
-                if (unlikely(err)) {
-                        put_page(page);
-                        if (err == -EEXIST)
-                                goto repeat;
-                        return NULL;
-                }
-        } else
-                logfs_lock_write_page(page);
-        BUG_ON(!PageLocked(page));
-        return page;
-}
-
-static void logfs_unlock_write_page(struct page *page)
-{
-        if (!PagePreLocked(page))
-                unlock_page(page);
-}
-
-static void logfs_put_write_page(struct page *page)
-{
-        logfs_unlock_write_page(page);
-        put_page(page);
-}
-
-static struct page *logfs_get_page(struct inode *inode, u64 bix, level_t level,
-                int rw)
-{
-        if (rw == READ)
-                return logfs_get_read_page(inode, bix, level);
-        else
-                return logfs_get_write_page(inode, bix, level);
-}
-
-static void logfs_put_page(struct page *page, int rw)
-{
-        if (rw == READ)
-                logfs_put_read_page(page);
-        else
-                logfs_put_write_page(page);
-}
-
-static unsigned long __get_bits(u64 val, int skip, int no)
-{
-        u64 ret = val;
-
-        ret >>= skip * no;
-        ret <<= 64 - no;
-        ret >>= 64 - no;
-        return ret;
-}
-
-static unsigned long get_bits(u64 val, level_t skip)
-{
-        return __get_bits(val, (__force int)skip, LOGFS_BLOCK_BITS);
-}
-
-static inline void init_shadow_tree(struct super_block *sb,
-                struct shadow_tree *tree)
-{
-        struct logfs_super *super = logfs_super(sb);
-
-        btree_init_mempool64(&tree->new, super->s_btree_pool);
-        btree_init_mempool64(&tree->old, super->s_btree_pool);
-}
-
-static void indirect_write_block(struct logfs_block *block)
-{
-        struct page *page;
-        struct inode *inode;
-        int ret;
-
-        page = block->page;
-        inode = page->mapping->host;
-        logfs_lock_write_page(page);
-        ret = logfs_write_buf(inode, page, 0);
-        logfs_unlock_write_page(page);
-        /*
-         * This needs some rework. Unless you want your filesystem to run
-         * completely synchronously (you don't), the filesystem will always
-         * report writes as 'successful' before the actual work has been
-         * done. The actual work gets done here and this is where any errors
-         * will show up. And there isn't much we can do about it, really.
-         *
-         * Some attempts to fix the errors (move from bad blocks, retry io,...)
-         * have already been done, so anything left should be either a broken
-         * device or a bug somewhere in logfs itself. Since logfs is still
-         * relatively new, the odds currently favor a bug, so for now the
-         * line below isn't entirely tasteless.
-         */
-        BUG_ON(ret);
-}
-
-static void inode_write_block(struct logfs_block *block)
-{
-        struct inode *inode;
-        int ret;
-
-        inode = block->inode;
-        if (inode->i_ino == LOGFS_INO_MASTER)
-                logfs_write_anchor(inode->i_sb);
-        else {
-                ret = __logfs_write_inode(inode, NULL, 0);
-                /* see indirect_write_block comment */
-                BUG_ON(ret);
-        }
-}
-
-/*
- * This silences a false, yet annoying gcc warning. I hate it when my editor
- * jumps into bitops.h each time I recompile this file.
- * TODO: Complain to gcc folks about this and upgrade compiler.
- */ -static unsigned long fnb(const unsigned long *addr, - unsigned long size, unsigned long offset) -{ - return find_next_bit(addr, size, offset); -} - -static __be64 inode_val0(struct inode *inode) -{ - struct logfs_inode *li = logfs_inode(inode); - u64 val; - - /* - * Explicit shifting generates good code, but must match the format - * of the structure. Add some paranoia just in case. - */ - BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_mode) != 0); - BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_height) != 2); - BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_flags) != 4); - - val = (u64)inode->i_mode << 48 | - (u64)li->li_height << 40 | - (u64)li->li_flags; - return cpu_to_be64(val); -} - -static int inode_write_alias(struct super_block *sb, - struct logfs_block *block, write_alias_t *write_one_alias) -{ - struct inode *inode = block->inode; - struct logfs_inode *li = logfs_inode(inode); - unsigned long pos; - u64 ino , bix; - __be64 val; - level_t level; - int err; - - for (pos = 0; ; pos++) { - pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos); - if (pos >= LOGFS_EMBEDDED_FIELDS + INODE_POINTER_OFS) - return 0; - - switch (pos) { - case INODE_HEIGHT_OFS: - val = inode_val0(inode); - break; - case INODE_USED_OFS: - val = cpu_to_be64(li->li_used_bytes); - break; - case INODE_SIZE_OFS: - val = cpu_to_be64(i_size_read(inode)); - break; - case INODE_POINTER_OFS ... INODE_POINTER_OFS + LOGFS_EMBEDDED_FIELDS - 1: - val = cpu_to_be64(li->li_data[pos - INODE_POINTER_OFS]); - break; - default: - BUG(); - } - - ino = LOGFS_INO_MASTER; - bix = inode->i_ino; - level = LEVEL(0); - err = write_one_alias(sb, ino, bix, level, pos, val); - if (err) - return err; - } -} - -static int indirect_write_alias(struct super_block *sb, - struct logfs_block *block, write_alias_t *write_one_alias) -{ - unsigned long pos; - struct page *page = block->page; - u64 ino , bix; - __be64 *child, val; - level_t level; - int err; - - for (pos = 0; ; pos++) { - pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos); - if (pos >= LOGFS_BLOCK_FACTOR) - return 0; - - ino = page->mapping->host->i_ino; - logfs_unpack_index(page->index, &bix, &level); - child = kmap_atomic(page); - val = child[pos]; - kunmap_atomic(child); - err = write_one_alias(sb, ino, bix, level, pos, val); - if (err) - return err; - } -} - -int logfs_write_obj_aliases_pagecache(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_block *block; - int err; - - list_for_each_entry(block, &super->s_object_alias, alias_list) { - err = block->ops->write_alias(sb, block, write_alias_journal); - if (err) - return err; - } - return 0; -} - -void __free_block(struct super_block *sb, struct logfs_block *block) -{ - BUG_ON(!list_empty(&block->item_list)); - list_del(&block->alias_list); - mempool_free(block, logfs_super(sb)->s_block_pool); -} - -static void inode_free_block(struct super_block *sb, struct logfs_block *block) -{ - struct inode *inode = block->inode; - - logfs_inode(inode)->li_block = NULL; - __free_block(sb, block); -} - -static void indirect_free_block(struct super_block *sb, - struct logfs_block *block) -{ - struct page *page = block->page; - - if (PagePrivate(page)) { - ClearPagePrivate(page); - put_page(page); - set_page_private(page, 0); - } - __free_block(sb, block); -} - - -static const struct logfs_block_ops inode_block_ops = { - .write_block = inode_write_block, - .free_block = inode_free_block, - .write_alias = inode_write_alias, -}; - -const struct logfs_block_ops indirect_block_ops = { - 
.write_block = indirect_write_block, - .free_block = indirect_free_block, - .write_alias = indirect_write_alias, -}; - -struct logfs_block *__alloc_block(struct super_block *sb, - u64 ino, u64 bix, level_t level) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_block *block; - - block = mempool_alloc(super->s_block_pool, GFP_NOFS); - memset(block, 0, sizeof(*block)); - INIT_LIST_HEAD(&block->alias_list); - INIT_LIST_HEAD(&block->item_list); - block->sb = sb; - block->ino = ino; - block->bix = bix; - block->level = level; - return block; -} - -static void alloc_inode_block(struct inode *inode) -{ - struct logfs_inode *li = logfs_inode(inode); - struct logfs_block *block; - - if (li->li_block) - return; - - block = __alloc_block(inode->i_sb, LOGFS_INO_MASTER, inode->i_ino, 0); - block->inode = inode; - li->li_block = block; - block->ops = &inode_block_ops; -} - -void initialize_block_counters(struct page *page, struct logfs_block *block, - __be64 *array, int page_is_empty) -{ - u64 ptr; - int i, start; - - block->partial = 0; - block->full = 0; - start = 0; - if (page->index < first_indirect_block()) { - /* Counters are pointless on level 0 */ - return; - } - if (page->index == first_indirect_block()) { - /* Skip unused pointers */ - start = I0_BLOCKS; - block->full = I0_BLOCKS; - } - if (!page_is_empty) { - for (i = start; i < LOGFS_BLOCK_FACTOR; i++) { - ptr = be64_to_cpu(array[i]); - if (ptr) - block->partial++; - if (ptr & LOGFS_FULLY_POPULATED) - block->full++; - } - } -} - -static void alloc_data_block(struct inode *inode, struct page *page) -{ - struct logfs_block *block; - u64 bix; - level_t level; - - if (PagePrivate(page)) - return; - - logfs_unpack_index(page->index, &bix, &level); - block = __alloc_block(inode->i_sb, inode->i_ino, bix, level); - block->page = page; - - SetPagePrivate(page); - get_page(page); - set_page_private(page, (unsigned long) block); - - block->ops = &indirect_block_ops; -} - -static void alloc_indirect_block(struct inode *inode, struct page *page, - int page_is_empty) -{ - struct logfs_block *block; - __be64 *array; - - if (PagePrivate(page)) - return; - - alloc_data_block(inode, page); - - block = logfs_block(page); - array = kmap_atomic(page); - initialize_block_counters(page, block, array, page_is_empty); - kunmap_atomic(array); -} - -static void block_set_pointer(struct page *page, int index, u64 ptr) -{ - struct logfs_block *block = logfs_block(page); - __be64 *array; - u64 oldptr; - - BUG_ON(!block); - array = kmap_atomic(page); - oldptr = be64_to_cpu(array[index]); - array[index] = cpu_to_be64(ptr); - kunmap_atomic(array); - SetPageUptodate(page); - - block->full += !!(ptr & LOGFS_FULLY_POPULATED) - - !!(oldptr & LOGFS_FULLY_POPULATED); - block->partial += !!ptr - !!oldptr; -} - -static u64 block_get_pointer(struct page *page, int index) -{ - __be64 *block; - u64 ptr; - - block = kmap_atomic(page); - ptr = be64_to_cpu(block[index]); - kunmap_atomic(block); - return ptr; -} - -static int logfs_read_empty(struct page *page) -{ - zero_user_segment(page, 0, PAGE_SIZE); - return 0; -} - -static int logfs_read_direct(struct inode *inode, struct page *page) -{ - struct logfs_inode *li = logfs_inode(inode); - pgoff_t index = page->index; - u64 block; - - block = li->li_data[index]; - if (!block) - return logfs_read_empty(page); - - return logfs_segment_read(inode, page, block, index, 0); -} - -static int logfs_read_loop(struct inode *inode, struct page *page, - int rw_context) -{ - struct logfs_inode *li = logfs_inode(inode); - u64 bix, bofs 
= li->li_data[INDIRECT_INDEX]; - level_t level, target_level; - int ret; - struct page *ipage; - - logfs_unpack_index(page->index, &bix, &target_level); - if (!bofs) - return logfs_read_empty(page); - - if (bix >= maxbix(li->li_height)) - return logfs_read_empty(page); - - for (level = LEVEL(li->li_height); - (__force u8)level > (__force u8)target_level; - level = SUBLEVEL(level)){ - ipage = logfs_get_page(inode, bix, level, rw_context); - if (!ipage) - return -ENOMEM; - - ret = logfs_segment_read(inode, ipage, bofs, bix, level); - if (ret) { - logfs_put_read_page(ipage); - return ret; - } - - bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level))); - logfs_put_page(ipage, rw_context); - if (!bofs) - return logfs_read_empty(page); - } - - return logfs_segment_read(inode, page, bofs, bix, 0); -} - -static int logfs_read_block(struct inode *inode, struct page *page, - int rw_context) -{ - pgoff_t index = page->index; - - if (index < I0_BLOCKS) - return logfs_read_direct(inode, page); - return logfs_read_loop(inode, page, rw_context); -} - -static int logfs_exist_loop(struct inode *inode, u64 bix) -{ - struct logfs_inode *li = logfs_inode(inode); - u64 bofs = li->li_data[INDIRECT_INDEX]; - level_t level; - int ret; - struct page *ipage; - - if (!bofs) - return 0; - if (bix >= maxbix(li->li_height)) - return 0; - - for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) { - ipage = logfs_get_read_page(inode, bix, level); - if (!ipage) - return -ENOMEM; - - ret = logfs_segment_read(inode, ipage, bofs, bix, level); - if (ret) { - logfs_put_read_page(ipage); - return ret; - } - - bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level))); - logfs_put_read_page(ipage); - if (!bofs) - return 0; - } - - return 1; -} - -int logfs_exist_block(struct inode *inode, u64 bix) -{ - struct logfs_inode *li = logfs_inode(inode); - - if (bix < I0_BLOCKS) - return !!li->li_data[bix]; - return logfs_exist_loop(inode, bix); -} - -static u64 seek_holedata_direct(struct inode *inode, u64 bix, int data) -{ - struct logfs_inode *li = logfs_inode(inode); - - for (; bix < I0_BLOCKS; bix++) - if (data ^ (li->li_data[bix] == 0)) - return bix; - return I0_BLOCKS; -} - -static u64 seek_holedata_loop(struct inode *inode, u64 bix, int data) -{ - struct logfs_inode *li = logfs_inode(inode); - __be64 *rblock; - u64 increment, bofs = li->li_data[INDIRECT_INDEX]; - level_t level; - int ret, slot; - struct page *page; - - BUG_ON(!bofs); - - for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) { - increment = 1 << (LOGFS_BLOCK_BITS * ((__force u8)level-1)); - page = logfs_get_read_page(inode, bix, level); - if (!page) - return bix; - - ret = logfs_segment_read(inode, page, bofs, bix, level); - if (ret) { - logfs_put_read_page(page); - return bix; - } - - slot = get_bits(bix, SUBLEVEL(level)); - rblock = kmap_atomic(page); - while (slot < LOGFS_BLOCK_FACTOR) { - if (data && (rblock[slot] != 0)) - break; - if (!data && !(be64_to_cpu(rblock[slot]) & LOGFS_FULLY_POPULATED)) - break; - slot++; - bix += increment; - bix &= ~(increment - 1); - } - if (slot >= LOGFS_BLOCK_FACTOR) { - kunmap_atomic(rblock); - logfs_put_read_page(page); - return bix; - } - bofs = be64_to_cpu(rblock[slot]); - kunmap_atomic(rblock); - logfs_put_read_page(page); - if (!bofs) { - BUG_ON(data); - return bix; - } - } - return bix; -} - -/** - * logfs_seek_hole - find next hole starting at a given block index - * @inode: inode to search in - * @bix: block index to start searching - * - * Returns next hole. 
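(The next hole is the first block index at
- * or after @bix that has no data block written for it. For a file with
- * data in blocks 0-2 and 5, say, logfs_seek_hole(inode, 0) returns 3
- * and logfs_seek_hole(inode, 4) returns 4.)
- *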
If the file doesn't contain any further holes, the - * block address next to eof is returned instead. - */ -u64 logfs_seek_hole(struct inode *inode, u64 bix) -{ - struct logfs_inode *li = logfs_inode(inode); - - if (bix < I0_BLOCKS) { - bix = seek_holedata_direct(inode, bix, 0); - if (bix < I0_BLOCKS) - return bix; - } - - if (!li->li_data[INDIRECT_INDEX]) - return bix; - else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED) - bix = maxbix(li->li_height); - else if (bix >= maxbix(li->li_height)) - return bix; - else { - bix = seek_holedata_loop(inode, bix, 0); - if (bix < maxbix(li->li_height)) - return bix; - /* Should not happen anymore. But if some port writes semi- - * corrupt images (as this one used to) we might run into it. - */ - WARN_ON_ONCE(bix == maxbix(li->li_height)); - } - - return bix; -} - -static u64 __logfs_seek_data(struct inode *inode, u64 bix) -{ - struct logfs_inode *li = logfs_inode(inode); - - if (bix < I0_BLOCKS) { - bix = seek_holedata_direct(inode, bix, 1); - if (bix < I0_BLOCKS) - return bix; - } - - if (bix < maxbix(li->li_height)) { - if (!li->li_data[INDIRECT_INDEX]) - bix = maxbix(li->li_height); - else - return seek_holedata_loop(inode, bix, 1); - } - - return bix; -} - -/** - * logfs_seek_data - find next data block after a given block index - * @inode: inode to search in - * @bix: block index to start searching - * - * Returns next data block. If the file doesn't contain any further data - * blocks, the last block in the file is returned instead. - */ -u64 logfs_seek_data(struct inode *inode, u64 bix) -{ - struct super_block *sb = inode->i_sb; - u64 ret, end; - - ret = __logfs_seek_data(inode, bix); - end = i_size_read(inode) >> sb->s_blocksize_bits; - if (ret >= end) - ret = max(bix, end); - return ret; -} - -static int logfs_is_valid_direct(struct logfs_inode *li, u64 bix, u64 ofs) -{ - return pure_ofs(li->li_data[bix]) == ofs; -} - -static int __logfs_is_valid_loop(struct inode *inode, u64 bix, - u64 ofs, u64 bofs) -{ - struct logfs_inode *li = logfs_inode(inode); - level_t level; - int ret; - struct page *page; - - for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)){ - page = logfs_get_write_page(inode, bix, level); - BUG_ON(!page); - - ret = logfs_segment_read(inode, page, bofs, bix, level); - if (ret) { - logfs_put_write_page(page); - return 0; - } - - bofs = block_get_pointer(page, get_bits(bix, SUBLEVEL(level))); - logfs_put_write_page(page); - if (!bofs) - return 0; - - if (pure_ofs(bofs) == ofs) - return 1; - } - return 0; -} - -static int logfs_is_valid_loop(struct inode *inode, u64 bix, u64 ofs) -{ - struct logfs_inode *li = logfs_inode(inode); - u64 bofs = li->li_data[INDIRECT_INDEX]; - - if (!bofs) - return 0; - - if (bix >= maxbix(li->li_height)) - return 0; - - if (pure_ofs(bofs) == ofs) - return 1; - - return __logfs_is_valid_loop(inode, bix, ofs, bofs); -} - -static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs) -{ - struct logfs_inode *li = logfs_inode(inode); - - if ((inode->i_nlink == 0) && atomic_read(&inode->i_count) == 1) - return 0; - - if (bix < I0_BLOCKS) - return logfs_is_valid_direct(li, bix, ofs); - return logfs_is_valid_loop(inode, bix, ofs); -} - -/** - * logfs_is_valid_block - check whether this block is still valid - * - * @sb: superblock - * @ofs: block physical offset - * @ino: block inode number - * @bix: block index - * @gc_level: block level - * - * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will - * become invalid once the journal is written. 
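- *
- * (The cleaner is the main consumer: 0 means the object can simply be
- * dropped with its segment, 1 means it still has to be moved, and 2
- * presumably tells the caller to hold off until the pending journal
- * commit no longer references the old offset.)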
- */ -int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix, - gc_level_t gc_level) -{ - struct logfs_super *super = logfs_super(sb); - struct inode *inode; - int ret, cookie; - - /* Umount closes a segment with free blocks remaining. Those - * blocks are by definition invalid. */ - if (ino == -1) - return 0; - - LOGFS_BUG_ON((u64)(u_long)ino != ino, sb); - - inode = logfs_safe_iget(sb, ino, &cookie); - if (IS_ERR(inode)) - goto invalid; - - ret = __logfs_is_valid_block(inode, bix, ofs); - logfs_safe_iput(inode, cookie); - if (ret) - return ret; - -invalid: - /* Block is nominally invalid, but may still sit in the shadow tree, - * waiting for a journal commit. - */ - if (btree_lookup64(&super->s_shadow_tree.old, ofs)) - return 2; - return 0; -} - -int logfs_readpage_nolock(struct page *page) -{ - struct inode *inode = page->mapping->host; - int ret = -EIO; - - ret = logfs_read_block(inode, page, READ); - - if (ret) { - ClearPageUptodate(page); - SetPageError(page); - } else { - SetPageUptodate(page); - ClearPageError(page); - } - flush_dcache_page(page); - - return ret; -} - -static int logfs_reserve_bytes(struct inode *inode, int bytes) -{ - struct logfs_super *super = logfs_super(inode->i_sb); - u64 available = super->s_free_bytes + super->s_dirty_free_bytes - - super->s_dirty_used_bytes - super->s_dirty_pages; - - if (!bytes) - return 0; - - if (available < bytes) - return -ENOSPC; - - if (available < bytes + super->s_root_reserve && - !capable(CAP_SYS_RESOURCE)) - return -ENOSPC; - - return 0; -} - -int get_page_reserve(struct inode *inode, struct page *page) -{ - struct logfs_super *super = logfs_super(inode->i_sb); - struct logfs_block *block = logfs_block(page); - int ret; - - if (block && block->reserved_bytes) - return 0; - - logfs_get_wblocks(inode->i_sb, page, WF_LOCK); - while ((ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE)) && - !list_empty(&super->s_writeback_list)) { - block = list_entry(super->s_writeback_list.next, - struct logfs_block, alias_list); - block->ops->write_block(block); - } - if (!ret) { - alloc_data_block(inode, page); - block = logfs_block(page); - block->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE; - super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE; - list_move_tail(&block->alias_list, &super->s_writeback_list); - } - logfs_put_wblocks(inode->i_sb, page, WF_LOCK); - return ret; -} - -/* - * We are protected by write lock. Push victims up to superblock level - * and release transaction when appropriate. - */ -/* FIXME: This is currently called from the wrong spots. 
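The protocol itself is two-phase:
- * each *_1 state records the victim inode or the directory position in
- * the super before the change goes out, and the matching *_2/*_3 state
- * clears it again once every block involved has reached the medium, so
- * a crash in between can presumably be finished or undone at mount time.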
*/ -static void logfs_handle_transaction(struct inode *inode, - struct logfs_transaction *ta) -{ - struct logfs_super *super = logfs_super(inode->i_sb); - - if (!ta) - return; - logfs_inode(inode)->li_block->ta = NULL; - - if (inode->i_ino != LOGFS_INO_MASTER) { - BUG(); /* FIXME: Yes, this needs more thought */ - /* just remember the transaction until inode is written */ - //BUG_ON(logfs_inode(inode)->li_transaction); - //logfs_inode(inode)->li_transaction = ta; - return; - } - - switch (ta->state) { - case CREATE_1: /* fall through */ - case UNLINK_1: - BUG_ON(super->s_victim_ino); - super->s_victim_ino = ta->ino; - break; - case CREATE_2: /* fall through */ - case UNLINK_2: - BUG_ON(super->s_victim_ino != ta->ino); - super->s_victim_ino = 0; - /* transaction ends here - free it */ - kfree(ta); - break; - case CROSS_RENAME_1: - BUG_ON(super->s_rename_dir); - BUG_ON(super->s_rename_pos); - super->s_rename_dir = ta->dir; - super->s_rename_pos = ta->pos; - break; - case CROSS_RENAME_2: - BUG_ON(super->s_rename_dir != ta->dir); - BUG_ON(super->s_rename_pos != ta->pos); - super->s_rename_dir = 0; - super->s_rename_pos = 0; - kfree(ta); - break; - case TARGET_RENAME_1: - BUG_ON(super->s_rename_dir); - BUG_ON(super->s_rename_pos); - BUG_ON(super->s_victim_ino); - super->s_rename_dir = ta->dir; - super->s_rename_pos = ta->pos; - super->s_victim_ino = ta->ino; - break; - case TARGET_RENAME_2: - BUG_ON(super->s_rename_dir != ta->dir); - BUG_ON(super->s_rename_pos != ta->pos); - BUG_ON(super->s_victim_ino != ta->ino); - super->s_rename_dir = 0; - super->s_rename_pos = 0; - break; - case TARGET_RENAME_3: - BUG_ON(super->s_rename_dir); - BUG_ON(super->s_rename_pos); - BUG_ON(super->s_victim_ino != ta->ino); - super->s_victim_ino = 0; - kfree(ta); - break; - default: - BUG(); - } -} - -/* - * Not strictly a reservation, but rather a check that we still have enough - * space to satisfy the write. - */ -static int logfs_reserve_blocks(struct inode *inode, int blocks) -{ - return logfs_reserve_bytes(inode, blocks * LOGFS_MAX_OBJECTSIZE); -} - -struct write_control { - u64 ofs; - long flags; -}; - -static struct logfs_shadow *alloc_shadow(struct inode *inode, u64 bix, - level_t level, u64 old_ofs) -{ - struct logfs_super *super = logfs_super(inode->i_sb); - struct logfs_shadow *shadow; - - shadow = mempool_alloc(super->s_shadow_pool, GFP_NOFS); - memset(shadow, 0, sizeof(*shadow)); - shadow->ino = inode->i_ino; - shadow->bix = bix; - shadow->gc_level = expand_level(inode->i_ino, level); - shadow->old_ofs = old_ofs & ~LOGFS_FULLY_POPULATED; - return shadow; -} - -static void free_shadow(struct inode *inode, struct logfs_shadow *shadow) -{ - struct logfs_super *super = logfs_super(inode->i_sb); - - mempool_free(shadow, super->s_shadow_pool); -} - -static void mark_segment(struct shadow_tree *tree, u32 segno) -{ - int err; - - if (!btree_lookup32(&tree->segment_map, segno)) { - err = btree_insert32(&tree->segment_map, segno, (void *)1, - GFP_NOFS); - BUG_ON(err); - tree->no_shadowed_segments++; - } -} - -/** - * fill_shadow_tree - Propagate shadow tree changes due to a write - * @inode: Inode owning the page - * @page: Struct page that was written - * @shadow: Shadow for the current write - * - * Writes in logfs can result in two semi-valid objects. The old object - * is still valid as long as it can be reached by following pointers on - * the medium. Only when writes propagate all the way up to the journal - * has the new object safely replaced the old one. 
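- * (Until that commit, both copies are accounted for: fill_shadow_tree()
- * below adds new_len to s_dirty_used_bytes and old_len to
- * s_dirty_free_bytes, and marks both segments as shadowed.)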
- *
- * To handle this problem, a struct logfs_shadow is used to represent
- * every single write. It is attached to the indirect block, which is
- * marked dirty. When the indirect block is written, its shadows are
- * handed up to the next indirect block (or inode). Ultimately they
- * will reach the master inode and be freed upon journal commit.
- *
- * This function handles a single step in the propagation. It adds the
- * shadow for the current write to the tree, along with any shadows in
- * the page's tree, in case it was an indirect block. If a page is
- * written, the inode parameter is left NULL; if an inode is written,
- * the page parameter is left NULL.
- */
-static void fill_shadow_tree(struct inode *inode, struct page *page,
-		struct logfs_shadow *shadow)
-{
-	struct logfs_super *super = logfs_super(inode->i_sb);
-	struct logfs_block *block = logfs_block(page);
-	struct shadow_tree *tree = &super->s_shadow_tree;
-
-	if (PagePrivate(page)) {
-		if (block->alias_map)
-			super->s_no_object_aliases -= bitmap_weight(
-					block->alias_map, LOGFS_BLOCK_FACTOR);
-		logfs_handle_transaction(inode, block->ta);
-		block->ops->free_block(inode->i_sb, block);
-	}
-	if (shadow) {
-		if (shadow->old_ofs)
-			btree_insert64(&tree->old, shadow->old_ofs, shadow,
-					GFP_NOFS);
-		else
-			btree_insert64(&tree->new, shadow->new_ofs, shadow,
-					GFP_NOFS);
-
-		super->s_dirty_used_bytes += shadow->new_len;
-		super->s_dirty_free_bytes += shadow->old_len;
-		mark_segment(tree, shadow->old_ofs >> super->s_segshift);
-		mark_segment(tree, shadow->new_ofs >> super->s_segshift);
-	}
-}
-
-static void logfs_set_alias(struct super_block *sb, struct logfs_block *block,
-		long child_no)
-{
-	struct logfs_super *super = logfs_super(sb);
-
-	if (block->inode && block->inode->i_ino == LOGFS_INO_MASTER) {
-		/* Aliases in the master inode are pointless. */
-		return;
-	}
-
-	if (!test_bit(child_no, block->alias_map)) {
-		set_bit(child_no, block->alias_map);
-		super->s_no_object_aliases++;
-	}
-	list_move_tail(&block->alias_list, &super->s_object_alias);
-}
-
-/*
- * Object aliases can and often do change the size and occupied space of a
- * file. So not only do we have to change the pointers, we also have to
- * change inode->i_size and li->li_used_bytes. This is done by setting
- * another two object aliases for the inode itself.
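- *
- * Example: when set_iused() below adjusts li_used_bytes, it flags both
- * INODE_USED_OFS and INODE_SIZE_OFS, so the journal carries the new
- * accounting until the inode block itself is next written.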
- */ -static void set_iused(struct inode *inode, struct logfs_shadow *shadow) -{ - struct logfs_inode *li = logfs_inode(inode); - - if (shadow->new_len == shadow->old_len) - return; - - alloc_inode_block(inode); - li->li_used_bytes += shadow->new_len - shadow->old_len; - __logfs_set_blocks(inode); - logfs_set_alias(inode->i_sb, li->li_block, INODE_USED_OFS); - logfs_set_alias(inode->i_sb, li->li_block, INODE_SIZE_OFS); -} - -static int logfs_write_i0(struct inode *inode, struct page *page, - struct write_control *wc) -{ - struct logfs_shadow *shadow; - u64 bix; - level_t level; - int full, err = 0; - - logfs_unpack_index(page->index, &bix, &level); - if (wc->ofs == 0) - if (logfs_reserve_blocks(inode, 1)) - return -ENOSPC; - - shadow = alloc_shadow(inode, bix, level, wc->ofs); - if (wc->flags & WF_WRITE) - err = logfs_segment_write(inode, page, shadow); - if (wc->flags & WF_DELETE) - logfs_segment_delete(inode, shadow); - if (err) { - free_shadow(inode, shadow); - return err; - } - - set_iused(inode, shadow); - full = 1; - if (level != 0) { - alloc_indirect_block(inode, page, 0); - full = logfs_block(page)->full == LOGFS_BLOCK_FACTOR; - } - fill_shadow_tree(inode, page, shadow); - wc->ofs = shadow->new_ofs; - if (wc->ofs && full) - wc->ofs |= LOGFS_FULLY_POPULATED; - return 0; -} - -static int logfs_write_direct(struct inode *inode, struct page *page, - long flags) -{ - struct logfs_inode *li = logfs_inode(inode); - struct write_control wc = { - .ofs = li->li_data[page->index], - .flags = flags, - }; - int err; - - alloc_inode_block(inode); - - err = logfs_write_i0(inode, page, &wc); - if (err) - return err; - - li->li_data[page->index] = wc.ofs; - logfs_set_alias(inode->i_sb, li->li_block, - page->index + INODE_POINTER_OFS); - return 0; -} - -static int ptr_change(u64 ofs, struct page *page) -{ - struct logfs_block *block = logfs_block(page); - int empty0, empty1, full0, full1; - - empty0 = ofs == 0; - empty1 = block->partial == 0; - if (empty0 != empty1) - return 1; - - /* The !! 
is necessary to shrink result to int */ - full0 = !!(ofs & LOGFS_FULLY_POPULATED); - full1 = block->full == LOGFS_BLOCK_FACTOR; - if (full0 != full1) - return 1; - return 0; -} - -static int __logfs_write_rec(struct inode *inode, struct page *page, - struct write_control *this_wc, - pgoff_t bix, level_t target_level, level_t level) -{ - int ret, page_empty = 0; - int child_no = get_bits(bix, SUBLEVEL(level)); - struct page *ipage; - struct write_control child_wc = { - .flags = this_wc->flags, - }; - - ipage = logfs_get_write_page(inode, bix, level); - if (!ipage) - return -ENOMEM; - - if (this_wc->ofs) { - ret = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level); - if (ret) - goto out; - } else if (!PageUptodate(ipage)) { - page_empty = 1; - logfs_read_empty(ipage); - } - - child_wc.ofs = block_get_pointer(ipage, child_no); - - if ((__force u8)level-1 > (__force u8)target_level) - ret = __logfs_write_rec(inode, page, &child_wc, bix, - target_level, SUBLEVEL(level)); - else - ret = logfs_write_i0(inode, page, &child_wc); - - if (ret) - goto out; - - alloc_indirect_block(inode, ipage, page_empty); - block_set_pointer(ipage, child_no, child_wc.ofs); - /* FIXME: first condition seems superfluous */ - if (child_wc.ofs || logfs_block(ipage)->partial) - this_wc->flags |= WF_WRITE; - /* the condition on this_wc->ofs ensures that we won't consume extra - * space for indirect blocks in the future, which we cannot reserve */ - if (!this_wc->ofs || ptr_change(this_wc->ofs, ipage)) - ret = logfs_write_i0(inode, ipage, this_wc); - else - logfs_set_alias(inode->i_sb, logfs_block(ipage), child_no); -out: - logfs_put_write_page(ipage); - return ret; -} - -static int logfs_write_rec(struct inode *inode, struct page *page, - pgoff_t bix, level_t target_level, long flags) -{ - struct logfs_inode *li = logfs_inode(inode); - struct write_control wc = { - .ofs = li->li_data[INDIRECT_INDEX], - .flags = flags, - }; - int ret; - - alloc_inode_block(inode); - - if (li->li_height > (__force u8)target_level) - ret = __logfs_write_rec(inode, page, &wc, bix, target_level, - LEVEL(li->li_height)); - else - ret = logfs_write_i0(inode, page, &wc); - if (ret) - return ret; - - if (li->li_data[INDIRECT_INDEX] != wc.ofs) { - li->li_data[INDIRECT_INDEX] = wc.ofs; - logfs_set_alias(inode->i_sb, li->li_block, - INDIRECT_INDEX + INODE_POINTER_OFS); - } - return ret; -} - -void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta) -{ - alloc_inode_block(inode); - logfs_inode(inode)->li_block->ta = ta; -} - -void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta) -{ - struct logfs_block *block = logfs_inode(inode)->li_block; - - if (block && block->ta) - block->ta = NULL; -} - -static int grow_inode(struct inode *inode, u64 bix, level_t level) -{ - struct logfs_inode *li = logfs_inode(inode); - u8 height = (__force u8)level; - struct page *page; - struct write_control wc = { - .flags = WF_WRITE, - }; - int err; - - BUG_ON(height > 5 || li->li_height > 5); - while (height > li->li_height || bix >= maxbix(li->li_height)) { - page = logfs_get_write_page(inode, I0_BLOCKS + 1, - LEVEL(li->li_height + 1)); - if (!page) - return -ENOMEM; - logfs_read_empty(page); - alloc_indirect_block(inode, page, 1); - block_set_pointer(page, 0, li->li_data[INDIRECT_INDEX]); - err = logfs_write_i0(inode, page, &wc); - logfs_put_write_page(page); - if (err) - return err; - li->li_data[INDIRECT_INDEX] = wc.ofs; - wc.ofs = 0; - li->li_height++; - logfs_set_alias(inode->i_sb, li->li_block, INODE_HEIGHT_OFS); 
- } - return 0; -} - -static int __logfs_write_buf(struct inode *inode, struct page *page, long flags) -{ - struct logfs_super *super = logfs_super(inode->i_sb); - pgoff_t index = page->index; - u64 bix; - level_t level; - int err; - - flags |= WF_WRITE | WF_DELETE; - inode->i_ctime = inode->i_mtime = current_time(inode); - - logfs_unpack_index(index, &bix, &level); - if (logfs_block(page) && logfs_block(page)->reserved_bytes) - super->s_dirty_pages -= logfs_block(page)->reserved_bytes; - - if (index < I0_BLOCKS) - return logfs_write_direct(inode, page, flags); - - bix = adjust_bix(bix, level); - err = grow_inode(inode, bix, level); - if (err) - return err; - return logfs_write_rec(inode, page, bix, level, flags); -} - -int logfs_write_buf(struct inode *inode, struct page *page, long flags) -{ - struct super_block *sb = inode->i_sb; - int ret; - - logfs_get_wblocks(sb, page, flags & WF_LOCK); - ret = __logfs_write_buf(inode, page, flags); - logfs_put_wblocks(sb, page, flags & WF_LOCK); - return ret; -} - -static int __logfs_delete(struct inode *inode, struct page *page) -{ - long flags = WF_DELETE; - int err; - - inode->i_ctime = inode->i_mtime = current_time(inode); - - if (page->index < I0_BLOCKS) - return logfs_write_direct(inode, page, flags); - err = grow_inode(inode, page->index, 0); - if (err) - return err; - return logfs_write_rec(inode, page, page->index, 0, flags); -} - -int logfs_delete(struct inode *inode, pgoff_t index, - struct shadow_tree *shadow_tree) -{ - struct super_block *sb = inode->i_sb; - struct page *page; - int ret; - - page = logfs_get_read_page(inode, index, 0); - if (!page) - return -ENOMEM; - - logfs_get_wblocks(sb, page, 1); - ret = __logfs_delete(inode, page); - logfs_put_wblocks(sb, page, 1); - - logfs_put_read_page(page); - - return ret; -} - -int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs, - gc_level_t gc_level, long flags) -{ - level_t level = shrink_level(gc_level); - struct page *page; - int err; - - page = logfs_get_write_page(inode, bix, level); - if (!page) - return -ENOMEM; - - err = logfs_segment_read(inode, page, ofs, bix, level); - if (!err) { - if (level != 0) - alloc_indirect_block(inode, page, 0); - err = logfs_write_buf(inode, page, flags); - if (!err && shrink_level(gc_level) == 0) { - /* Rewrite cannot mark the inode dirty but has to - * write it immediately. - * Q: Can't we just create an alias for the inode - * instead? And if not, why not? - */ - if (inode->i_ino == LOGFS_INO_MASTER) - logfs_write_anchor(inode->i_sb); - else { - err = __logfs_write_inode(inode, page, flags); - } - } - } - logfs_put_write_page(page); - return err; -} - -static int truncate_data_block(struct inode *inode, struct page *page, - u64 ofs, struct logfs_shadow *shadow, u64 size) -{ - loff_t pageofs = page->index << inode->i_sb->s_blocksize_bits; - u64 bix; - level_t level; - int err; - - /* Does truncation happen within this page? 
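With 4k blocks, truncating to size
- * 0x1234, for instance, leaves page 1 (pageofs 0x1000) as the only
- * partial page: bytes 0x234 and up are zeroed below and the shortened
- * block is rewritten, while blocks wholly past the new size are simply
- * deleted.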
*/ - if (size <= pageofs || size - pageofs >= PAGE_SIZE) - return 0; - - logfs_unpack_index(page->index, &bix, &level); - BUG_ON(level != 0); - - err = logfs_segment_read(inode, page, ofs, bix, level); - if (err) - return err; - - zero_user_segment(page, size - pageofs, PAGE_SIZE); - return logfs_segment_write(inode, page, shadow); -} - -static int logfs_truncate_i0(struct inode *inode, struct page *page, - struct write_control *wc, u64 size) -{ - struct logfs_shadow *shadow; - u64 bix; - level_t level; - int err = 0; - - logfs_unpack_index(page->index, &bix, &level); - BUG_ON(level != 0); - shadow = alloc_shadow(inode, bix, level, wc->ofs); - - err = truncate_data_block(inode, page, wc->ofs, shadow, size); - if (err) { - free_shadow(inode, shadow); - return err; - } - - logfs_segment_delete(inode, shadow); - set_iused(inode, shadow); - fill_shadow_tree(inode, page, shadow); - wc->ofs = shadow->new_ofs; - return 0; -} - -static int logfs_truncate_direct(struct inode *inode, u64 size) -{ - struct logfs_inode *li = logfs_inode(inode); - struct write_control wc; - struct page *page; - int e; - int err; - - alloc_inode_block(inode); - - for (e = I0_BLOCKS - 1; e >= 0; e--) { - if (size > (e+1) * LOGFS_BLOCKSIZE) - break; - - wc.ofs = li->li_data[e]; - if (!wc.ofs) - continue; - - page = logfs_get_write_page(inode, e, 0); - if (!page) - return -ENOMEM; - err = logfs_segment_read(inode, page, wc.ofs, e, 0); - if (err) { - logfs_put_write_page(page); - return err; - } - err = logfs_truncate_i0(inode, page, &wc, size); - logfs_put_write_page(page); - if (err) - return err; - - li->li_data[e] = wc.ofs; - } - return 0; -} - -/* FIXME: these need to become per-sb once we support different blocksizes */ -static u64 __logfs_step[] = { - 1, - I1_BLOCKS, - I2_BLOCKS, - I3_BLOCKS, -}; - -static u64 __logfs_start_index[] = { - I0_BLOCKS, - I1_BLOCKS, - I2_BLOCKS, - I3_BLOCKS -}; - -static inline u64 logfs_step(level_t level) -{ - return __logfs_step[(__force u8)level]; -} - -static inline u64 logfs_factor(u8 level) -{ - return __logfs_step[level] * LOGFS_BLOCKSIZE; -} - -static inline u64 logfs_start_index(level_t level) -{ - return __logfs_start_index[(__force u8)level]; -} - -static void logfs_unpack_raw_index(pgoff_t index, u64 *bix, level_t *level) -{ - logfs_unpack_index(index, bix, level); - if (*bix <= logfs_start_index(SUBLEVEL(*level))) - *bix = 0; -} - -static int __logfs_truncate_rec(struct inode *inode, struct page *ipage, - struct write_control *this_wc, u64 size) -{ - int truncate_happened = 0; - int e, err = 0; - u64 bix, child_bix, next_bix; - level_t level; - struct page *page; - struct write_control child_wc = { /* FIXME: flags */ }; - - logfs_unpack_raw_index(ipage->index, &bix, &level); - err = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level); - if (err) - return err; - - for (e = LOGFS_BLOCK_FACTOR - 1; e >= 0; e--) { - child_bix = bix + e * logfs_step(SUBLEVEL(level)); - next_bix = child_bix + logfs_step(SUBLEVEL(level)); - if (size > next_bix * LOGFS_BLOCKSIZE) - break; - - child_wc.ofs = pure_ofs(block_get_pointer(ipage, e)); - if (!child_wc.ofs) - continue; - - page = logfs_get_write_page(inode, child_bix, SUBLEVEL(level)); - if (!page) - return -ENOMEM; - - if ((__force u8)level > 1) - err = __logfs_truncate_rec(inode, page, &child_wc, size); - else - err = logfs_truncate_i0(inode, page, &child_wc, size); - logfs_put_write_page(page); - if (err) - return err; - - truncate_happened = 1; - alloc_indirect_block(inode, ipage, 0); - block_set_pointer(ipage, e, child_wc.ofs); 
-	}
-
-	if (!truncate_happened) {
-		printk(KERN_ERR "ineffectual truncate (%lx, %lx, %llx)\n", inode->i_ino, ipage->index, size);
-		return 0;
-	}
-
-	this_wc->flags = WF_DELETE;
-	if (logfs_block(ipage)->partial)
-		this_wc->flags |= WF_WRITE;
-
-	return logfs_write_i0(inode, ipage, this_wc);
-}
-
-static int logfs_truncate_rec(struct inode *inode, u64 size)
-{
-	struct logfs_inode *li = logfs_inode(inode);
-	struct write_control wc = {
-		.ofs = li->li_data[INDIRECT_INDEX],
-	};
-	struct page *page;
-	int err;
-
-	alloc_inode_block(inode);
-
-	if (!wc.ofs)
-		return 0;
-
-	page = logfs_get_write_page(inode, 0, LEVEL(li->li_height));
-	if (!page)
-		return -ENOMEM;
-
-	err = __logfs_truncate_rec(inode, page, &wc, size);
-	logfs_put_write_page(page);
-	if (err)
-		return err;
-
-	if (li->li_data[INDIRECT_INDEX] != wc.ofs)
-		li->li_data[INDIRECT_INDEX] = wc.ofs;
-	return 0;
-}
-
-static int __logfs_truncate(struct inode *inode, u64 size)
-{
-	int ret;
-
-	if (size >= logfs_factor(logfs_inode(inode)->li_height))
-		return 0;
-
-	ret = logfs_truncate_rec(inode, size);
-	if (ret)
-		return ret;
-
-	return logfs_truncate_direct(inode, size);
-}
-
-/*
- * Truncate, by changing the segment file, can consume a fair amount
- * of resources. So back off from time to time and do some GC.
- * 8 MiB, i.e. 2048 blocks, should be well within safety limits even if
- * every single block resided in a different segment.
- */
-#define TRUNCATE_STEP	(8 * 1024 * 1024)
-int logfs_truncate(struct inode *inode, u64 target)
-{
-	struct super_block *sb = inode->i_sb;
-	u64 size = i_size_read(inode);
-	int err = 0;
-
-	size = ALIGN(size, TRUNCATE_STEP);
-	while (size > target) {
-		if (size > TRUNCATE_STEP)
-			size -= TRUNCATE_STEP;
-		else
-			size = 0;
-		if (size < target)
-			size = target;
-
-		logfs_get_wblocks(sb, NULL, 1);
-		err = __logfs_truncate(inode, size);
-		if (!err)
-			err = __logfs_write_inode(inode, NULL, 0);
-		logfs_put_wblocks(sb, NULL, 1);
-	}
-
-	if (!err) {
-		err = inode_newsize_ok(inode, target);
-		if (err)
-			goto out;
-
-		truncate_setsize(inode, target);
-	}
-
- out:
-	/* I don't trust error recovery yet.
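If one of the steps failed,
-	 * blocks may already be gone from the medium while i_size was never
-	 * brought down; the WARN_ON below at least makes such a half-done
-	 * truncate visible.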
*/ - WARN_ON(err); - return err; -} - -static void move_page_to_inode(struct inode *inode, struct page *page) -{ - struct logfs_inode *li = logfs_inode(inode); - struct logfs_block *block = logfs_block(page); - - if (!block) - return; - - log_blockmove("move_page_to_inode(%llx, %llx, %x)\n", - block->ino, block->bix, block->level); - BUG_ON(li->li_block); - block->ops = &inode_block_ops; - block->inode = inode; - li->li_block = block; - - block->page = NULL; - if (PagePrivate(page)) { - ClearPagePrivate(page); - put_page(page); - set_page_private(page, 0); - } -} - -static void move_inode_to_page(struct page *page, struct inode *inode) -{ - struct logfs_inode *li = logfs_inode(inode); - struct logfs_block *block = li->li_block; - - if (!block) - return; - - log_blockmove("move_inode_to_page(%llx, %llx, %x)\n", - block->ino, block->bix, block->level); - BUG_ON(PagePrivate(page)); - block->ops = &indirect_block_ops; - block->page = page; - - if (!PagePrivate(page)) { - SetPagePrivate(page); - get_page(page); - set_page_private(page, (unsigned long) block); - } - - block->inode = NULL; - li->li_block = NULL; -} - -int logfs_read_inode(struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - struct logfs_super *super = logfs_super(sb); - struct inode *master_inode = super->s_master_inode; - struct page *page; - struct logfs_disk_inode *di; - u64 ino = inode->i_ino; - - if (ino << sb->s_blocksize_bits > i_size_read(master_inode)) - return -ENODATA; - if (!logfs_exist_block(master_inode, ino)) - return -ENODATA; - - page = read_cache_page(master_inode->i_mapping, ino, - (filler_t *)logfs_readpage, NULL); - if (IS_ERR(page)) - return PTR_ERR(page); - - di = kmap_atomic(page); - logfs_disk_to_inode(di, inode); - kunmap_atomic(di); - move_page_to_inode(inode, page); - put_page(page); - return 0; -} - -/* Caller must logfs_put_write_page(page); */ -static struct page *inode_to_page(struct inode *inode) -{ - struct inode *master_inode = logfs_super(inode->i_sb)->s_master_inode; - struct logfs_disk_inode *di; - struct page *page; - - BUG_ON(inode->i_ino == LOGFS_INO_MASTER); - - page = logfs_get_write_page(master_inode, inode->i_ino, 0); - if (!page) - return NULL; - - di = kmap_atomic(page); - logfs_inode_to_disk(inode, di); - kunmap_atomic(di); - move_inode_to_page(page, inode); - return page; -} - -static int do_write_inode(struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - struct inode *master_inode = logfs_super(sb)->s_master_inode; - loff_t size = (inode->i_ino + 1) << inode->i_sb->s_blocksize_bits; - struct page *page; - int err; - - BUG_ON(inode->i_ino == LOGFS_INO_MASTER); - /* FIXME: lock inode */ - - if (i_size_read(master_inode) < size) - i_size_write(master_inode, size); - - /* TODO: Tell vfs this inode is clean now */ - - page = inode_to_page(inode); - if (!page) - return -ENOMEM; - - /* FIXME: transaction is part of logfs_block now. Is that enough? 
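At least the
-	 * write below hands an attached transaction on via fill_shadow_tree()
-	 * and logfs_handle_transaction(), so that state does reach the journal.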
*/ - err = logfs_write_buf(master_inode, page, 0); - if (err) - move_page_to_inode(inode, page); - - logfs_put_write_page(page); - return err; -} - -static void logfs_mod_segment_entry(struct super_block *sb, u32 segno, - int write, - void (*change_se)(struct logfs_segment_entry *, long), - long arg) -{ - struct logfs_super *super = logfs_super(sb); - struct inode *inode; - struct page *page; - struct logfs_segment_entry *se; - pgoff_t page_no; - int child_no; - - page_no = segno >> (sb->s_blocksize_bits - 3); - child_no = segno & ((sb->s_blocksize >> 3) - 1); - - inode = super->s_segfile_inode; - page = logfs_get_write_page(inode, page_no, 0); - BUG_ON(!page); /* FIXME: We need some reserve page for this case */ - if (!PageUptodate(page)) - logfs_read_block(inode, page, WRITE); - - if (write) - alloc_indirect_block(inode, page, 0); - se = kmap_atomic(page); - change_se(se + child_no, arg); - if (write) { - logfs_set_alias(sb, logfs_block(page), child_no); - BUG_ON((int)be32_to_cpu(se[child_no].valid) > super->s_segsize); - } - kunmap_atomic(se); - - logfs_put_write_page(page); -} - -static void __get_segment_entry(struct logfs_segment_entry *se, long _target) -{ - struct logfs_segment_entry *target = (void *)_target; - - *target = *se; -} - -void logfs_get_segment_entry(struct super_block *sb, u32 segno, - struct logfs_segment_entry *se) -{ - logfs_mod_segment_entry(sb, segno, 0, __get_segment_entry, (long)se); -} - -static void __set_segment_used(struct logfs_segment_entry *se, long increment) -{ - u32 valid; - - valid = be32_to_cpu(se->valid); - valid += increment; - se->valid = cpu_to_be32(valid); -} - -void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment) -{ - struct logfs_super *super = logfs_super(sb); - u32 segno = ofs >> super->s_segshift; - - if (!increment) - return; - - logfs_mod_segment_entry(sb, segno, 1, __set_segment_used, increment); -} - -static void __set_segment_erased(struct logfs_segment_entry *se, long ec_level) -{ - se->ec_level = cpu_to_be32(ec_level); -} - -void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec, - gc_level_t gc_level) -{ - u32 ec_level = ec << 4 | (__force u8)gc_level; - - logfs_mod_segment_entry(sb, segno, 1, __set_segment_erased, ec_level); -} - -static void __set_segment_reserved(struct logfs_segment_entry *se, long ignore) -{ - se->valid = cpu_to_be32(RESERVED); -} - -void logfs_set_segment_reserved(struct super_block *sb, u32 segno) -{ - logfs_mod_segment_entry(sb, segno, 1, __set_segment_reserved, 0); -} - -static void __set_segment_unreserved(struct logfs_segment_entry *se, - long ec_level) -{ - se->valid = 0; - se->ec_level = cpu_to_be32(ec_level); -} - -void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec) -{ - u32 ec_level = ec << 4; - - logfs_mod_segment_entry(sb, segno, 1, __set_segment_unreserved, - ec_level); -} - -int __logfs_write_inode(struct inode *inode, struct page *page, long flags) -{ - struct super_block *sb = inode->i_sb; - int ret; - - logfs_get_wblocks(sb, page, flags & WF_LOCK); - ret = do_write_inode(inode); - logfs_put_wblocks(sb, page, flags & WF_LOCK); - return ret; -} - -static int do_delete_inode(struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - struct inode *master_inode = logfs_super(sb)->s_master_inode; - struct page *page; - int ret; - - page = logfs_get_write_page(master_inode, inode->i_ino, 0); - if (!page) - return -ENOMEM; - - move_inode_to_page(page, inode); - - logfs_get_wblocks(sb, page, 1); - ret = __logfs_delete(master_inode, 
page); - logfs_put_wblocks(sb, page, 1); - - logfs_put_write_page(page); - return ret; -} - -/* - * ZOMBIE inodes have already been deleted before and should remain dead, - * if it weren't for valid checking. No need to kill them again here. - */ -void logfs_evict_inode(struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - struct logfs_inode *li = logfs_inode(inode); - struct logfs_block *block = li->li_block; - struct page *page; - - if (!inode->i_nlink) { - if (!(li->li_flags & LOGFS_IF_ZOMBIE)) { - li->li_flags |= LOGFS_IF_ZOMBIE; - if (i_size_read(inode) > 0) - logfs_truncate(inode, 0); - do_delete_inode(inode); - } - } - truncate_inode_pages_final(&inode->i_data); - clear_inode(inode); - - /* Cheaper version of write_inode. All changes are concealed in - * aliases, which are moved back. No write to the medium happens. - */ - /* Only deleted files may be dirty at this point */ - BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink); - if (!block) - return; - if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) { - block->ops->free_block(inode->i_sb, block); - return; - } - - page = inode_to_page(inode); - BUG_ON(!page); /* FIXME: Use emergency page */ - logfs_put_write_page(page); -} - -void btree_write_block(struct logfs_block *block) -{ - struct inode *inode; - struct page *page; - int err, cookie; - - inode = logfs_safe_iget(block->sb, block->ino, &cookie); - page = logfs_get_write_page(inode, block->bix, block->level); - - err = logfs_readpage_nolock(page); - BUG_ON(err); - BUG_ON(!PagePrivate(page)); - BUG_ON(logfs_block(page) != block); - err = __logfs_write_buf(inode, page, 0); - BUG_ON(err); - BUG_ON(PagePrivate(page) || page->private); - - logfs_put_write_page(page); - logfs_safe_iput(inode, cookie); -} - -/** - * logfs_inode_write - write inode or dentry objects - * - * @inode: parent inode (ifile or directory) - * @buf: object to write (inode or dentry) - * @count: object size - * @bix: block index - * @flags: write flags - * @shadow_tree: shadow below this inode - * - * FIXME: All caller of this put a 200-300 byte variable on the stack, - * only to call here and do a memcpy from that stack variable. A good - * example of wasted performance and stack space. 
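- *
- * A possible cleanup (only a sketch): have callers kmap the target page
- * and build the object in place, the way inode_to_page() above already
- * does it for inodes, instead of filling a stack buffer and having this
- * function memcpy() it into the page.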
- */ -int logfs_inode_write(struct inode *inode, const void *buf, size_t count, - loff_t bix, long flags, struct shadow_tree *shadow_tree) -{ - loff_t pos = bix << inode->i_sb->s_blocksize_bits; - int err; - struct page *page; - void *pagebuf; - - BUG_ON(pos & (LOGFS_BLOCKSIZE-1)); - BUG_ON(count > LOGFS_BLOCKSIZE); - page = logfs_get_write_page(inode, bix, 0); - if (!page) - return -ENOMEM; - - pagebuf = kmap_atomic(page); - memcpy(pagebuf, buf, count); - flush_dcache_page(page); - kunmap_atomic(pagebuf); - - if (i_size_read(inode) < pos + LOGFS_BLOCKSIZE) - i_size_write(inode, pos + LOGFS_BLOCKSIZE); - - err = logfs_write_buf(inode, page, flags); - logfs_put_write_page(page); - return err; -} - -int logfs_open_segfile(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct inode *inode; - - inode = logfs_read_meta_inode(sb, LOGFS_INO_SEGFILE); - if (IS_ERR(inode)) - return PTR_ERR(inode); - super->s_segfile_inode = inode; - return 0; -} - -int logfs_init_rw(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - int min_fill = 3 * super->s_no_blocks; - - INIT_LIST_HEAD(&super->s_object_alias); - INIT_LIST_HEAD(&super->s_writeback_list); - mutex_init(&super->s_write_mutex); - super->s_block_pool = mempool_create_kmalloc_pool(min_fill, - sizeof(struct logfs_block)); - super->s_shadow_pool = mempool_create_kmalloc_pool(min_fill, - sizeof(struct logfs_shadow)); - return 0; -} - -void logfs_cleanup_rw(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - - logfs_mempool_destroy(super->s_block_pool); - logfs_mempool_destroy(super->s_shadow_pool); -} diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c deleted file mode 100644 index 1efd6055f4b0..000000000000 --- a/fs/logfs/segment.c +++ /dev/null @@ -1,961 +0,0 @@ -/* - * fs/logfs/segment.c - Handling the Object Store - * - * As should be obvious for Linux kernel code, license is GPLv2 - * - * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> - * - * Object store or ostore makes up the complete device with exception of - * the superblock and journal areas. Apart from its own metadata it stores - * three kinds of objects: inodes, dentries and blocks, both data and indirect. 
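- *
- * Every object on the medium is a struct logfs_object_header (length,
- * type, compression flag, owning ino and bix, plus separate header and
- * data crcs) followed by the, possibly compressed, payload; see
- * __logfs_segment_write() below, which fills in exactly these fields.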
- */ -#include "logfs.h" -#include <linux/slab.h> - -static int logfs_mark_segment_bad(struct super_block *sb, u32 segno) -{ - struct logfs_super *super = logfs_super(sb); - struct btree_head32 *head = &super->s_reserved_segments; - int err; - - err = btree_insert32(head, segno, (void *)1, GFP_NOFS); - if (err) - return err; - logfs_super(sb)->s_bad_segments++; - /* FIXME: write to journal */ - return 0; -} - -int logfs_erase_segment(struct super_block *sb, u32 segno, int ensure_erase) -{ - struct logfs_super *super = logfs_super(sb); - - super->s_gec++; - - return super->s_devops->erase(sb, (u64)segno << super->s_segshift, - super->s_segsize, ensure_erase); -} - -static s64 logfs_get_free_bytes(struct logfs_area *area, size_t bytes) -{ - s32 ofs; - - logfs_open_area(area, bytes); - - ofs = area->a_used_bytes; - area->a_used_bytes += bytes; - BUG_ON(area->a_used_bytes >= logfs_super(area->a_sb)->s_segsize); - - return dev_ofs(area->a_sb, area->a_segno, ofs); -} - -static struct page *get_mapping_page(struct super_block *sb, pgoff_t index, - int use_filler) -{ - struct logfs_super *super = logfs_super(sb); - struct address_space *mapping = super->s_mapping_inode->i_mapping; - filler_t *filler = super->s_devops->readpage; - struct page *page; - - BUG_ON(mapping_gfp_constraint(mapping, __GFP_FS)); - if (use_filler) - page = read_cache_page(mapping, index, filler, sb); - else { - page = find_or_create_page(mapping, index, GFP_NOFS); - if (page) - unlock_page(page); - } - return page; -} - -int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len, - int use_filler) -{ - pgoff_t index = ofs >> PAGE_SHIFT; - struct page *page; - long offset = ofs & (PAGE_SIZE-1); - long copylen; - - /* Only logfs_wbuf_recover may use len==0 */ - BUG_ON(!len && !use_filler); - do { - copylen = min((ulong)len, PAGE_SIZE - offset); - - page = get_mapping_page(area->a_sb, index, use_filler); - if (IS_ERR(page)) - return PTR_ERR(page); - BUG_ON(!page); /* FIXME: reserve a pool */ - SetPageUptodate(page); - memcpy(page_address(page) + offset, buf, copylen); - - if (!PagePrivate(page)) { - SetPagePrivate(page); - get_page(page); - } - put_page(page); - - buf += copylen; - len -= copylen; - offset = 0; - index++; - } while (len); - return 0; -} - -static void pad_partial_page(struct logfs_area *area) -{ - struct super_block *sb = area->a_sb; - struct page *page; - u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes); - pgoff_t index = ofs >> PAGE_SHIFT; - long offset = ofs & (PAGE_SIZE-1); - u32 len = PAGE_SIZE - offset; - - if (len % PAGE_SIZE) { - page = get_mapping_page(sb, index, 0); - BUG_ON(!page); /* FIXME: reserve a pool */ - memset(page_address(page) + offset, 0xff, len); - if (!PagePrivate(page)) { - SetPagePrivate(page); - get_page(page); - } - put_page(page); - } -} - -static void pad_full_pages(struct logfs_area *area) -{ - struct super_block *sb = area->a_sb; - struct logfs_super *super = logfs_super(sb); - u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes); - u32 len = super->s_segsize - area->a_used_bytes; - pgoff_t index = PAGE_ALIGN(ofs) >> PAGE_SHIFT; - pgoff_t no_indizes = len >> PAGE_SHIFT; - struct page *page; - - while (no_indizes) { - page = get_mapping_page(sb, index, 0); - BUG_ON(!page); /* FIXME: reserve a pool */ - SetPageUptodate(page); - memset(page_address(page), 0xff, PAGE_SIZE); - if (!PagePrivate(page)) { - SetPagePrivate(page); - get_page(page); - } - put_page(page); - index++; - no_indizes--; - } -} - -/* - * bdev_writeseg will write full pages. 
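Segments go out to the device in
- * page-sized chunks, so the unused tail of a partial page would
- * otherwise carry whatever the mapping page held before.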
Memset the tail to prevent data leaks. - * Also make sure we allocate (and memset) all pages for final writeout. - */ -static void pad_wbuf(struct logfs_area *area, int final) -{ - pad_partial_page(area); - if (final) - pad_full_pages(area); -} - -/* - * We have to be careful with the alias tree. Since lookup is done by bix, - * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with - * indirect blocks. So always use it through accessor functions. - */ -static void *alias_tree_lookup(struct super_block *sb, u64 ino, u64 bix, - level_t level) -{ - struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree; - pgoff_t index = logfs_pack_index(bix, level); - - return btree_lookup128(head, ino, index); -} - -static int alias_tree_insert(struct super_block *sb, u64 ino, u64 bix, - level_t level, void *val) -{ - struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree; - pgoff_t index = logfs_pack_index(bix, level); - - return btree_insert128(head, ino, index, val, GFP_NOFS); -} - -static int btree_write_alias(struct super_block *sb, struct logfs_block *block, - write_alias_t *write_one_alias) -{ - struct object_alias_item *item; - int err; - - list_for_each_entry(item, &block->item_list, list) { - err = write_alias_journal(sb, block->ino, block->bix, - block->level, item->child_no, item->val); - if (err) - return err; - } - return 0; -} - -static const struct logfs_block_ops btree_block_ops = { - .write_block = btree_write_block, - .free_block = __free_block, - .write_alias = btree_write_alias, -}; - -int logfs_load_object_aliases(struct super_block *sb, - struct logfs_obj_alias *oa, int count) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_block *block; - struct object_alias_item *item; - u64 ino, bix; - level_t level; - int i, err; - - super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS; - count /= sizeof(*oa); - for (i = 0; i < count; i++) { - item = mempool_alloc(super->s_alias_pool, GFP_NOFS); - if (!item) - return -ENOMEM; - memset(item, 0, sizeof(*item)); - - super->s_no_object_aliases++; - item->val = oa[i].val; - item->child_no = be16_to_cpu(oa[i].child_no); - - ino = be64_to_cpu(oa[i].ino); - bix = be64_to_cpu(oa[i].bix); - level = LEVEL(oa[i].level); - - log_aliases("logfs_load_object_aliases(%llx, %llx, %x, %x) %llx\n", - ino, bix, level, item->child_no, - be64_to_cpu(item->val)); - block = alias_tree_lookup(sb, ino, bix, level); - if (!block) { - block = __alloc_block(sb, ino, bix, level); - block->ops = &btree_block_ops; - err = alias_tree_insert(sb, ino, bix, level, block); - BUG_ON(err); /* mempool empty */ - } - if (test_and_set_bit(item->child_no, block->alias_map)) { - printk(KERN_ERR"LogFS: Alias collision detected\n"); - return -EIO; - } - list_move_tail(&block->alias_list, &super->s_object_alias); - list_add(&item->list, &block->item_list); - } - return 0; -} - -static void kill_alias(void *_block, unsigned long ignore0, - u64 ignore1, u64 ignore2, size_t ignore3) -{ - struct logfs_block *block = _block; - struct super_block *sb = block->sb; - struct logfs_super *super = logfs_super(sb); - struct object_alias_item *item; - - while (!list_empty(&block->item_list)) { - item = list_entry(block->item_list.next, typeof(*item), list); - list_del(&item->list); - mempool_free(item, super->s_alias_pool); - } - block->ops->free_block(sb, block); -} - -static int obj_type(struct inode *inode, level_t level) -{ - if (level == 0) { - if (S_ISDIR(inode->i_mode)) - return OBJ_DENTRY; - if (inode->i_ino == LOGFS_INO_MASTER) - return OBJ_INODE; 
- } - return OBJ_BLOCK; -} - -static int obj_len(struct super_block *sb, int obj_type) -{ - switch (obj_type) { - case OBJ_DENTRY: - return sizeof(struct logfs_disk_dentry); - case OBJ_INODE: - return sizeof(struct logfs_disk_inode); - case OBJ_BLOCK: - return sb->s_blocksize; - default: - BUG(); - } -} - -static int __logfs_segment_write(struct inode *inode, void *buf, - struct logfs_shadow *shadow, int type, int len, int compr) -{ - struct logfs_area *area; - struct super_block *sb = inode->i_sb; - s64 ofs; - struct logfs_object_header h; - int acc_len; - - if (shadow->gc_level == 0) - acc_len = len; - else - acc_len = obj_len(sb, type); - - area = get_area(sb, shadow->gc_level); - ofs = logfs_get_free_bytes(area, len + LOGFS_OBJECT_HEADERSIZE); - LOGFS_BUG_ON(ofs <= 0, sb); - /* - * Order is important. logfs_get_free_bytes(), by modifying the - * segment file, may modify the content of the very page we're about - * to write now. Which is fine, as long as the calculated crc and - * written data still match. So do the modifications _before_ - * calculating the crc. - */ - - h.len = cpu_to_be16(len); - h.type = type; - h.compr = compr; - h.ino = cpu_to_be64(inode->i_ino); - h.bix = cpu_to_be64(shadow->bix); - h.crc = logfs_crc32(&h, sizeof(h) - 4, 4); - h.data_crc = logfs_crc32(buf, len, 0); - - logfs_buf_write(area, ofs, &h, sizeof(h)); - logfs_buf_write(area, ofs + LOGFS_OBJECT_HEADERSIZE, buf, len); - - shadow->new_ofs = ofs; - shadow->new_len = acc_len + LOGFS_OBJECT_HEADERSIZE; - - return 0; -} - -static s64 logfs_segment_write_compress(struct inode *inode, void *buf, - struct logfs_shadow *shadow, int type, int len) -{ - struct super_block *sb = inode->i_sb; - void *compressor_buf = logfs_super(sb)->s_compressed_je; - ssize_t compr_len; - int ret; - - mutex_lock(&logfs_super(sb)->s_journal_mutex); - compr_len = logfs_compress(buf, compressor_buf, len, len); - - if (compr_len >= 0) { - ret = __logfs_segment_write(inode, compressor_buf, shadow, - type, compr_len, COMPR_ZLIB); - } else { - ret = __logfs_segment_write(inode, buf, shadow, type, len, - COMPR_NONE); - } - mutex_unlock(&logfs_super(sb)->s_journal_mutex); - return ret; -} - -/** - * logfs_segment_write - write data block to object store - * @inode: inode containing data - * - * Returns an errno or zero. - */ -int logfs_segment_write(struct inode *inode, struct page *page, - struct logfs_shadow *shadow) -{ - struct super_block *sb = inode->i_sb; - struct logfs_super *super = logfs_super(sb); - int do_compress, type, len; - int ret; - void *buf; - - super->s_flags |= LOGFS_SB_FLAG_DIRTY; - BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN); - do_compress = logfs_inode(inode)->li_flags & LOGFS_IF_COMPRESSED; - if (shadow->gc_level != 0) { - /* temporarily disable compression for indirect blocks */ - do_compress = 0; - } - - type = obj_type(inode, shrink_level(shadow->gc_level)); - len = obj_len(sb, type); - buf = kmap(page); - if (do_compress) - ret = logfs_segment_write_compress(inode, buf, shadow, type, - len); - else - ret = __logfs_segment_write(inode, buf, shadow, type, len, - COMPR_NONE); - kunmap(page); - - log_segment("logfs_segment_write(%llx, %llx, %x) %llx->%llx %x->%x\n", - shadow->ino, shadow->bix, shadow->gc_level, - shadow->old_ofs, shadow->new_ofs, - shadow->old_len, shadow->new_len); - /* this BUG_ON did catch a locking bug. 
useful */ - BUG_ON(!(shadow->new_ofs & (super->s_segsize - 1))); - return ret; -} - -int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf) -{ - pgoff_t index = ofs >> PAGE_SHIFT; - struct page *page; - long offset = ofs & (PAGE_SIZE-1); - long copylen; - - while (len) { - copylen = min((ulong)len, PAGE_SIZE - offset); - - page = get_mapping_page(sb, index, 1); - if (IS_ERR(page)) - return PTR_ERR(page); - memcpy(buf, page_address(page) + offset, copylen); - put_page(page); - - buf += copylen; - len -= copylen; - offset = 0; - index++; - } - return 0; -} - -/* - * The "position" of indirect blocks is ambiguous. It can be the position - * of any data block somewhere behind this indirect block. So we need to - * normalize the positions through logfs_block_mask() before comparing. - */ -static int check_pos(struct super_block *sb, u64 pos1, u64 pos2, level_t level) -{ - return (pos1 & logfs_block_mask(sb, level)) != - (pos2 & logfs_block_mask(sb, level)); -} - -#if 0 -static int read_seg_header(struct super_block *sb, u64 ofs, - struct logfs_segment_header *sh) -{ - __be32 crc; - int err; - - err = wbuf_read(sb, ofs, sizeof(*sh), sh); - if (err) - return err; - crc = logfs_crc32(sh, sizeof(*sh), 4); - if (crc != sh->crc) { - printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, " - "got %x\n", ofs, be32_to_cpu(sh->crc), - be32_to_cpu(crc)); - return -EIO; - } - return 0; -} -#endif - -static int read_obj_header(struct super_block *sb, u64 ofs, - struct logfs_object_header *oh) -{ - __be32 crc; - int err; - - err = wbuf_read(sb, ofs, sizeof(*oh), oh); - if (err) - return err; - crc = logfs_crc32(oh, sizeof(*oh) - 4, 4); - if (crc != oh->crc) { - printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, " - "got %x\n", ofs, be32_to_cpu(oh->crc), - be32_to_cpu(crc)); - return -EIO; - } - return 0; -} - -static void move_btree_to_page(struct inode *inode, struct page *page, - __be64 *data) -{ - struct super_block *sb = inode->i_sb; - struct logfs_super *super = logfs_super(sb); - struct btree_head128 *head = &super->s_object_alias_tree; - struct logfs_block *block; - struct object_alias_item *item, *next; - - if (!(super->s_flags & LOGFS_SB_FLAG_OBJ_ALIAS)) - return; - - block = btree_remove128(head, inode->i_ino, page->index); - if (!block) - return; - - log_blockmove("move_btree_to_page(%llx, %llx, %x)\n", - block->ino, block->bix, block->level); - list_for_each_entry_safe(item, next, &block->item_list, list) { - data[item->child_no] = item->val; - list_del(&item->list); - mempool_free(item, super->s_alias_pool); - } - block->page = page; - - if (!PagePrivate(page)) { - SetPagePrivate(page); - get_page(page); - set_page_private(page, (unsigned long) block); - } - block->ops = &indirect_block_ops; - initialize_block_counters(page, block, data, 0); -} - -/* - * This silences a false, yet annoying gcc warning. I hate it when my editor - * jumps into bitops.h each time I recompile this file. - * TODO: Complain to gcc folks about this and upgrade compiler. 
- */ -static unsigned long fnb(const unsigned long *addr, - unsigned long size, unsigned long offset) -{ - return find_next_bit(addr, size, offset); -} - -void move_page_to_btree(struct page *page) -{ - struct logfs_block *block = logfs_block(page); - struct super_block *sb = block->sb; - struct logfs_super *super = logfs_super(sb); - struct object_alias_item *item; - unsigned long pos; - __be64 *child; - int err; - - if (super->s_flags & LOGFS_SB_FLAG_SHUTDOWN) { - block->ops->free_block(sb, block); - return; - } - log_blockmove("move_page_to_btree(%llx, %llx, %x)\n", - block->ino, block->bix, block->level); - super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS; - - for (pos = 0; ; pos++) { - pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos); - if (pos >= LOGFS_BLOCK_FACTOR) - break; - - item = mempool_alloc(super->s_alias_pool, GFP_NOFS); - BUG_ON(!item); /* mempool empty */ - memset(item, 0, sizeof(*item)); - - child = kmap_atomic(page); - item->val = child[pos]; - kunmap_atomic(child); - item->child_no = pos; - list_add(&item->list, &block->item_list); - } - block->page = NULL; - - if (PagePrivate(page)) { - ClearPagePrivate(page); - put_page(page); - set_page_private(page, 0); - } - block->ops = &btree_block_ops; - err = alias_tree_insert(block->sb, block->ino, block->bix, block->level, - block); - BUG_ON(err); /* mempool empty */ - ClearPageUptodate(page); -} - -static int __logfs_segment_read(struct inode *inode, void *buf, - u64 ofs, u64 bix, level_t level) -{ - struct super_block *sb = inode->i_sb; - void *compressor_buf = logfs_super(sb)->s_compressed_je; - struct logfs_object_header oh; - __be32 crc; - u16 len; - int err, block_len; - - block_len = obj_len(sb, obj_type(inode, level)); - err = read_obj_header(sb, ofs, &oh); - if (err) - goto out_err; - - err = -EIO; - if (be64_to_cpu(oh.ino) != inode->i_ino - || check_pos(sb, be64_to_cpu(oh.bix), bix, level)) { - printk(KERN_ERR"LOGFS: (ino, bix) don't match at %llx: " - "expected (%lx, %llx), got (%llx, %llx)\n", - ofs, inode->i_ino, bix, - be64_to_cpu(oh.ino), be64_to_cpu(oh.bix)); - goto out_err; - } - - len = be16_to_cpu(oh.len); - - switch (oh.compr) { - case COMPR_NONE: - err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, buf); - if (err) - goto out_err; - crc = logfs_crc32(buf, len, 0); - if (crc != oh.data_crc) { - printk(KERN_ERR"LOGFS: uncompressed data crc error at " - "%llx: expected %x, got %x\n", ofs, - be32_to_cpu(oh.data_crc), - be32_to_cpu(crc)); - goto out_err; - } - break; - case COMPR_ZLIB: - mutex_lock(&logfs_super(sb)->s_journal_mutex); - err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, - compressor_buf); - if (err) { - mutex_unlock(&logfs_super(sb)->s_journal_mutex); - goto out_err; - } - crc = logfs_crc32(compressor_buf, len, 0); - if (crc != oh.data_crc) { - printk(KERN_ERR"LOGFS: compressed data crc error at " - "%llx: expected %x, got %x\n", ofs, - be32_to_cpu(oh.data_crc), - be32_to_cpu(crc)); - mutex_unlock(&logfs_super(sb)->s_journal_mutex); - goto out_err; - } - err = logfs_uncompress(compressor_buf, buf, len, block_len); - mutex_unlock(&logfs_super(sb)->s_journal_mutex); - if (err) { - printk(KERN_ERR"LOGFS: uncompress error at %llx\n", ofs); - goto out_err; - } - break; - default: - LOGFS_BUG(sb); - err = -EIO; - goto out_err; - } - return 0; - -out_err: - logfs_set_ro(sb); - printk(KERN_ERR"LOGFS: device is read-only now\n"); - LOGFS_BUG(sb); - return err; -} - -/** - * logfs_segment_read - read data block from object store - * @inode: inode containing data - * @buf: data buffer - * @ofs: 
physical data offset - * @bix: block index - * @level: block level - * - * Returns 0 on success or a negative errno. - */ -int logfs_segment_read(struct inode *inode, struct page *page, - u64 ofs, u64 bix, level_t level) -{ - int err; - void *buf; - - if (PageUptodate(page)) - return 0; - - ofs &= ~LOGFS_FULLY_POPULATED; - - buf = kmap(page); - err = __logfs_segment_read(inode, buf, ofs, bix, level); - if (!err) { - move_btree_to_page(inode, page, buf); - SetPageUptodate(page); - } - kunmap(page); - log_segment("logfs_segment_read(%lx, %llx, %x) %llx (%d)\n", - inode->i_ino, bix, level, ofs, err); - return err; -} - -int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow) -{ - struct super_block *sb = inode->i_sb; - struct logfs_super *super = logfs_super(sb); - struct logfs_object_header h; - u16 len; - int err; - - super->s_flags |= LOGFS_SB_FLAG_DIRTY; - BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN); - BUG_ON(shadow->old_ofs & LOGFS_FULLY_POPULATED); - if (!shadow->old_ofs) - return 0; - - log_segment("logfs_segment_delete(%llx, %llx, %x) %llx->%llx %x->%x\n", - shadow->ino, shadow->bix, shadow->gc_level, - shadow->old_ofs, shadow->new_ofs, - shadow->old_len, shadow->new_len); - err = read_obj_header(sb, shadow->old_ofs, &h); - LOGFS_BUG_ON(err, sb); - LOGFS_BUG_ON(be64_to_cpu(h.ino) != inode->i_ino, sb); - LOGFS_BUG_ON(check_pos(sb, shadow->bix, be64_to_cpu(h.bix), - shrink_level(shadow->gc_level)), sb); - - if (shadow->gc_level == 0) - len = be16_to_cpu(h.len); - else - len = obj_len(sb, h.type); - shadow->old_len = len + sizeof(h); - return 0; -} - -void freeseg(struct super_block *sb, u32 segno) -{ - struct logfs_super *super = logfs_super(sb); - struct address_space *mapping = super->s_mapping_inode->i_mapping; - struct page *page; - u64 ofs, start, end; - - start = dev_ofs(sb, segno, 0); - end = dev_ofs(sb, segno + 1, 0); - for (ofs = start; ofs < end; ofs += PAGE_SIZE) { - page = find_get_page(mapping, ofs >> PAGE_SHIFT); - if (!page) - continue; - if (PagePrivate(page)) { - ClearPagePrivate(page); - put_page(page); - } - put_page(page); - } -} - -int logfs_open_area(struct logfs_area *area, size_t bytes) -{ - struct super_block *sb = area->a_sb; - struct logfs_super *super = logfs_super(sb); - int err, closed = 0; - - if (area->a_is_open && area->a_used_bytes + bytes <= super->s_segsize) - return 0; - - if (area->a_is_open) { - u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); - u32 len = super->s_segsize - area->a_written_bytes; - - log_gc("logfs_close_area(%x)\n", area->a_segno); - pad_wbuf(area, 1); - super->s_devops->writeseg(area->a_sb, ofs, len); - freeseg(sb, area->a_segno); - closed = 1; - } - - area->a_used_bytes = 0; - area->a_written_bytes = 0; -again: - area->a_ops->get_free_segment(area); - area->a_ops->get_erase_count(area); - - log_gc("logfs_open_area(%x, %x)\n", area->a_segno, area->a_level); - err = area->a_ops->erase_segment(area); - if (err) { - printk(KERN_WARNING "LogFS: Error erasing segment %x\n", - area->a_segno); - logfs_mark_segment_bad(sb, area->a_segno); - goto again; - } - area->a_is_open = 1; - return closed; -} - -void logfs_sync_area(struct logfs_area *area) -{ - struct super_block *sb = area->a_sb; - struct logfs_super *super = logfs_super(sb); - u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); - u32 len = (area->a_used_bytes - area->a_written_bytes); - - if (super->s_writesize) - len &= ~(super->s_writesize - 1); - if (len == 0) - return; - pad_wbuf(area, 0); - super->s_devops->writeseg(sb, ofs, len); 
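
The masking a few lines above means logfs_sync_area only flushes whole device write units: the dirty length is rounded down with len &= ~(s_writesize - 1), which is only valid when s_writesize is a power of two (presumably because MTD devices can only program whole write units). A minimal standalone sketch of that round-down trick, with invented names, assuming a power-of-two write size:

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Round len down to a multiple of writesize (power of two). */
	static uint32_t round_down_to_writesize(uint32_t len, uint32_t writesize)
	{
		assert((writesize & (writesize - 1)) == 0); /* power of two */
		return len & ~(writesize - 1);
	}

	int main(void)
	{
		/* 5000 dirty bytes on a device with 4096-byte write units */
		printf("%u\n", (unsigned)round_down_to_writesize(5000, 4096));
		return 0; /* prints 4096; the 904-byte tail waits for more data */
	}

Anything shaved off here stays in a_used_bytes and is picked up by a later sync or by pad_wbuf when the segment is closed, so no dirty bytes are lost by the round-down.
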
- area->a_written_bytes += len; -} - -void logfs_sync_segments(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - int i; - - for_each_area(i) - logfs_sync_area(super->s_area[i]); -} - -/* - * Pick a free segment to be used for this area. Effectively takes a - * candidate from the free list (not really a candidate anymore). - */ -static void ostore_get_free_segment(struct logfs_area *area) -{ - struct super_block *sb = area->a_sb; - struct logfs_super *super = logfs_super(sb); - - if (super->s_free_list.count == 0) { - printk(KERN_ERR"LOGFS: ran out of free segments\n"); - LOGFS_BUG(sb); - } - - area->a_segno = get_best_cand(sb, &super->s_free_list, NULL); -} - -static void ostore_get_erase_count(struct logfs_area *area) -{ - struct logfs_segment_entry se; - u32 ec_level; - - logfs_get_segment_entry(area->a_sb, area->a_segno, &se); - BUG_ON(se.ec_level == cpu_to_be32(BADSEG) || - se.valid == cpu_to_be32(RESERVED)); - - ec_level = be32_to_cpu(se.ec_level); - area->a_erase_count = (ec_level >> 4) + 1; -} - -static int ostore_erase_segment(struct logfs_area *area) -{ - struct super_block *sb = area->a_sb; - struct logfs_segment_header sh; - u64 ofs; - int err; - - err = logfs_erase_segment(sb, area->a_segno, 0); - if (err) - return err; - - sh.pad = 0; - sh.type = SEG_OSTORE; - sh.level = (__force u8)area->a_level; - sh.segno = cpu_to_be32(area->a_segno); - sh.ec = cpu_to_be32(area->a_erase_count); - sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); - sh.crc = logfs_crc32(&sh, sizeof(sh), 4); - - logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, - area->a_level); - - ofs = dev_ofs(sb, area->a_segno, 0); - area->a_used_bytes = sizeof(sh); - logfs_buf_write(area, ofs, &sh, sizeof(sh)); - return 0; -} - -static const struct logfs_area_ops ostore_area_ops = { - .get_free_segment = ostore_get_free_segment, - .get_erase_count = ostore_get_erase_count, - .erase_segment = ostore_erase_segment, -}; - -static void free_area(struct logfs_area *area) -{ - if (area) - freeseg(area->a_sb, area->a_segno); - kfree(area); -} - -void free_areas(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - int i; - - for_each_area(i) - free_area(super->s_area[i]); - free_area(super->s_journal_area); -} - -static struct logfs_area *alloc_area(struct super_block *sb) -{ - struct logfs_area *area; - - area = kzalloc(sizeof(*area), GFP_KERNEL); - if (!area) - return NULL; - - area->a_sb = sb; - return area; -} - -static void map_invalidatepage(struct page *page, unsigned int o, - unsigned int l) -{ - return; -} - -static int map_releasepage(struct page *page, gfp_t g) -{ - /* Don't release these pages */ - return 0; -} - -static const struct address_space_operations mapping_aops = { - .invalidatepage = map_invalidatepage, - .releasepage = map_releasepage, - .set_page_dirty = __set_page_dirty_nobuffers, -}; - -int logfs_init_mapping(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct address_space *mapping; - struct inode *inode; - - inode = logfs_new_meta_inode(sb, LOGFS_INO_MAPPING); - if (IS_ERR(inode)) - return PTR_ERR(inode); - super->s_mapping_inode = inode; - mapping = inode->i_mapping; - mapping->a_ops = &mapping_aops; - /* Would it be possible to use __GFP_HIGHMEM as well? 
*/ - mapping_set_gfp_mask(mapping, GFP_NOFS); - return 0; -} - -int logfs_init_areas(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - int i = -1; - - super->s_alias_pool = mempool_create_kmalloc_pool(600, - sizeof(struct object_alias_item)); - if (!super->s_alias_pool) - return -ENOMEM; - - super->s_journal_area = alloc_area(sb); - if (!super->s_journal_area) - goto err; - - for_each_area(i) { - super->s_area[i] = alloc_area(sb); - if (!super->s_area[i]) - goto err; - super->s_area[i]->a_level = GC_LEVEL(i); - super->s_area[i]->a_ops = &ostore_area_ops; - } - btree_init_mempool128(&super->s_object_alias_tree, - super->s_btree_pool); - return 0; - -err: - for (i--; i >= 0; i--) - free_area(super->s_area[i]); - free_area(super->s_journal_area); - logfs_mempool_destroy(super->s_alias_pool); - return -ENOMEM; -} - -void logfs_cleanup_areas(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - - btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias); -} diff --git a/fs/logfs/super.c b/fs/logfs/super.c deleted file mode 100644 index 5751082dba52..000000000000 --- a/fs/logfs/super.c +++ /dev/null @@ -1,653 +0,0 @@ -/* - * fs/logfs/super.c - * - * As should be obvious for Linux kernel code, license is GPLv2 - * - * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> - * - * Generally contains mount/umount code and also serves as a dump area for - * any functions that don't fit elsewhere and neither justify a file of their - * own. - */ -#include "logfs.h" -#include <linux/bio.h> -#include <linux/slab.h> -#include <linux/blkdev.h> -#include <linux/module.h> -#include <linux/mtd/mtd.h> -#include <linux/statfs.h> -#include <linux/buffer_head.h> - -static DEFINE_MUTEX(emergency_mutex); -static struct page *emergency_page; - -struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index) -{ - filler_t *filler = (filler_t *)mapping->a_ops->readpage; - struct page *page; - int err; - - page = read_cache_page(mapping, index, filler, NULL); - if (page) - return page; - - /* No more pages available, switch to emergency page */ - printk(KERN_INFO"Logfs: Using emergency page\n"); - mutex_lock(&emergency_mutex); - err = filler(NULL, emergency_page); - if (err) { - mutex_unlock(&emergency_mutex); - printk(KERN_EMERG"Logfs: Error reading emergency page\n"); - return ERR_PTR(err); - } - return emergency_page; -} - -void emergency_read_end(struct page *page) -{ - if (page == emergency_page) - mutex_unlock(&emergency_mutex); - else - put_page(page); -} - -static void dump_segfile(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_segment_entry se; - u32 segno; - - for (segno = 0; segno < super->s_no_segs; segno++) { - logfs_get_segment_entry(sb, segno, &se); - printk("%3x: %6x %8x", segno, be32_to_cpu(se.ec_level), - be32_to_cpu(se.valid)); - if (++segno < super->s_no_segs) { - logfs_get_segment_entry(sb, segno, &se); - printk(" %6x %8x", be32_to_cpu(se.ec_level), - be32_to_cpu(se.valid)); - } - if (++segno < super->s_no_segs) { - logfs_get_segment_entry(sb, segno, &se); - printk(" %6x %8x", be32_to_cpu(se.ec_level), - be32_to_cpu(se.valid)); - } - if (++segno < super->s_no_segs) { - logfs_get_segment_entry(sb, segno, &se); - printk(" %6x %8x", be32_to_cpu(se.ec_level), - be32_to_cpu(se.valid)); - } - printk("\n"); - } -} - -/* - * logfs_crash_dump - dump debug information to device - * - * The LogFS superblock only occupies part of a segment. 
This function will - * write as much debug information as it can gather into the spare space. - */ -void logfs_crash_dump(struct super_block *sb) -{ - dump_segfile(sb); -} - -/* - * FIXME: There should be a reserve for root, similar to ext2. - */ -int logfs_statfs(struct dentry *dentry, struct kstatfs *stats) -{ - struct super_block *sb = dentry->d_sb; - struct logfs_super *super = logfs_super(sb); - - stats->f_type = LOGFS_MAGIC_U32; - stats->f_bsize = sb->s_blocksize; - stats->f_blocks = super->s_size >> LOGFS_BLOCK_BITS >> 3; - stats->f_bfree = super->s_free_bytes >> sb->s_blocksize_bits; - stats->f_bavail = super->s_free_bytes >> sb->s_blocksize_bits; - stats->f_files = 0; - stats->f_ffree = 0; - stats->f_namelen = LOGFS_MAX_NAMELEN; - return 0; -} - -static int logfs_sb_set(struct super_block *sb, void *_super) -{ - struct logfs_super *super = _super; - - sb->s_fs_info = super; - sb->s_mtd = super->s_mtd; - sb->s_bdev = super->s_bdev; -#ifdef CONFIG_BLOCK - if (sb->s_bdev) - sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info; -#endif -#ifdef CONFIG_MTD - if (sb->s_mtd) - sb->s_bdi = sb->s_mtd->backing_dev_info; -#endif - return 0; -} - -static int logfs_sb_test(struct super_block *sb, void *_super) -{ - struct logfs_super *super = _super; - struct mtd_info *mtd = super->s_mtd; - - if (mtd && sb->s_mtd == mtd) - return 1; - if (super->s_bdev && sb->s_bdev == super->s_bdev) - return 1; - return 0; -} - -static void set_segment_header(struct logfs_segment_header *sh, u8 type, - u8 level, u32 segno, u32 ec) -{ - sh->pad = 0; - sh->type = type; - sh->level = level; - sh->segno = cpu_to_be32(segno); - sh->ec = cpu_to_be32(ec); - sh->gec = cpu_to_be64(segno); - sh->crc = logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4); -} - -static void logfs_write_ds(struct super_block *sb, struct logfs_disk_super *ds, - u32 segno, u32 ec) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_segment_header *sh = &ds->ds_sh; - int i; - - memset(ds, 0, sizeof(*ds)); - set_segment_header(sh, SEG_SUPER, 0, segno, ec); - - ds->ds_ifile_levels = super->s_ifile_levels; - ds->ds_iblock_levels = super->s_iblock_levels; - ds->ds_data_levels = super->s_data_levels; /* XXX: Remove */ - ds->ds_segment_shift = super->s_segshift; - ds->ds_block_shift = sb->s_blocksize_bits; - ds->ds_write_shift = super->s_writeshift; - ds->ds_filesystem_size = cpu_to_be64(super->s_size); - ds->ds_segment_size = cpu_to_be32(super->s_segsize); - ds->ds_bad_seg_reserve = cpu_to_be32(super->s_bad_seg_reserve); - ds->ds_feature_incompat = cpu_to_be64(super->s_feature_incompat); - ds->ds_feature_ro_compat= cpu_to_be64(super->s_feature_ro_compat); - ds->ds_feature_compat = cpu_to_be64(super->s_feature_compat); - ds->ds_feature_flags = cpu_to_be64(super->s_feature_flags); - ds->ds_root_reserve = cpu_to_be64(super->s_root_reserve); - ds->ds_speed_reserve = cpu_to_be64(super->s_speed_reserve); - journal_for_each(i) - ds->ds_journal_seg[i] = cpu_to_be32(super->s_journal_seg[i]); - ds->ds_magic = cpu_to_be64(LOGFS_MAGIC); - ds->ds_crc = logfs_crc32(ds, sizeof(*ds), - LOGFS_SEGMENT_HEADERSIZE + 12); -} - -static int write_one_sb(struct super_block *sb, - struct page *(*find_sb)(struct super_block *sb, u64 *ofs)) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_disk_super *ds; - struct logfs_segment_entry se; - struct page *page; - u64 ofs; - u32 ec, segno; - int err; - - page = find_sb(sb, &ofs); - if (!page) - return -EIO; - ds = page_address(page); - segno = seg_no(sb, ofs); - logfs_get_segment_entry(sb, segno, &se); 
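
The >> 4 in the next lines, like the matching shift in ostore_get_erase_count earlier, suggests that se.ec_level packs the segment's erase count into the upper 28 bits and its GC level into the low 4 bits of one 32-bit big-endian field. A small sketch of that packing under exactly that assumption; the helper names are invented for illustration and are not kernel code:

	#include <stdint.h>
	#include <stdio.h>

	/* Pack erase count (upper 28 bits) and level (low 4 bits). */
	static uint32_t pack_ec_level(uint32_t ec, uint32_t level)
	{
		return (ec << 4) | (level & 0xf);
	}

	static uint32_t unpack_ec(uint32_t ec_level)    { return ec_level >> 4; }
	static uint32_t unpack_level(uint32_t ec_level) { return ec_level & 0xf; }

	int main(void)
	{
		uint32_t v = pack_ec_level(41, 3);
		/* prints "ec=41 level=3" */
		printf("ec=%u level=%u\n",
		       (unsigned)unpack_ec(v), (unsigned)unpack_level(v));
		return 0;
	}

That layout would explain why write_one_sb can bump the erase count by shifting first and incrementing after: the level bits are simply dropped before the superblock segment is re-erased at level 0.
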
- ec = be32_to_cpu(se.ec_level) >> 4; - ec++; - logfs_set_segment_erased(sb, segno, ec, 0); - logfs_write_ds(sb, ds, segno, ec); - err = super->s_devops->write_sb(sb, page); - put_page(page); - return err; -} - -int logfs_write_sb(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - int err; - - /* First superblock */ - err = write_one_sb(sb, super->s_devops->find_first_sb); - if (err) - return err; - - /* Last superblock */ - err = write_one_sb(sb, super->s_devops->find_last_sb); - if (err) - return err; - return 0; -} - -static int ds_cmp(const void *ds0, const void *ds1) -{ - size_t len = sizeof(struct logfs_disk_super); - - /* We know the segment headers differ, so ignore them */ - len -= LOGFS_SEGMENT_HEADERSIZE; - ds0 += LOGFS_SEGMENT_HEADERSIZE; - ds1 += LOGFS_SEGMENT_HEADERSIZE; - return memcmp(ds0, ds1, len); -} - -static int logfs_recover_sb(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_disk_super _ds0, *ds0 = &_ds0; - struct logfs_disk_super _ds1, *ds1 = &_ds1; - int err, valid0, valid1; - - /* read first superblock */ - err = wbuf_read(sb, super->s_sb_ofs[0], sizeof(*ds0), ds0); - if (err) - return err; - /* read last superblock */ - err = wbuf_read(sb, super->s_sb_ofs[1], sizeof(*ds1), ds1); - if (err) - return err; - valid0 = logfs_check_ds(ds0) == 0; - valid1 = logfs_check_ds(ds1) == 0; - - if (!valid0 && valid1) { - printk(KERN_INFO"First superblock is invalid - fixing.\n"); - return write_one_sb(sb, super->s_devops->find_first_sb); - } - if (valid0 && !valid1) { - printk(KERN_INFO"Last superblock is invalid - fixing.\n"); - return write_one_sb(sb, super->s_devops->find_last_sb); - } - if (valid0 && valid1 && ds_cmp(ds0, ds1)) { - printk(KERN_INFO"Superblocks don't match - fixing.\n"); - return logfs_write_sb(sb); - } - /* If neither is valid now, something's wrong. Didn't we properly - * check them before?!? 
*/ - BUG_ON(!valid0 && !valid1); - return 0; -} - -static int logfs_make_writeable(struct super_block *sb) -{ - int err; - - err = logfs_open_segfile(sb); - if (err) - return err; - - /* Repair any broken superblock copies */ - err = logfs_recover_sb(sb); - if (err) - return err; - - /* Check areas for trailing unaccounted data */ - err = logfs_check_areas(sb); - if (err) - return err; - - /* Do one GC pass before any data gets dirtied */ - logfs_gc_pass(sb); - - /* after all initializations are done, replay the journal - * for rw-mounts, if necessary */ - err = logfs_replay_journal(sb); - if (err) - return err; - - return 0; -} - -static int logfs_get_sb_final(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct inode *rootdir; - int err; - - /* root dir */ - rootdir = logfs_iget(sb, LOGFS_INO_ROOT); - if (IS_ERR(rootdir)) - goto fail; - - sb->s_root = d_make_root(rootdir); - if (!sb->s_root) - goto fail; - - /* at that point we know that ->put_super() will be called */ - super->s_erase_page = alloc_pages(GFP_KERNEL, 0); - if (!super->s_erase_page) - return -ENOMEM; - memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE); - - /* FIXME: check for read-only mounts */ - err = logfs_make_writeable(sb); - if (err) { - __free_page(super->s_erase_page); - return err; - } - - log_super("LogFS: Finished mounting\n"); - return 0; - -fail: - iput(super->s_master_inode); - iput(super->s_segfile_inode); - iput(super->s_mapping_inode); - return -EIO; -} - -int logfs_check_ds(struct logfs_disk_super *ds) -{ - struct logfs_segment_header *sh = &ds->ds_sh; - - if (ds->ds_magic != cpu_to_be64(LOGFS_MAGIC)) - return -EINVAL; - if (sh->crc != logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4)) - return -EINVAL; - if (ds->ds_crc != logfs_crc32(ds, sizeof(*ds), - LOGFS_SEGMENT_HEADERSIZE + 12)) - return -EINVAL; - return 0; -} - -static struct page *find_super_block(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct page *first, *last; - - first = super->s_devops->find_first_sb(sb, &super->s_sb_ofs[0]); - if (!first || IS_ERR(first)) - return NULL; - last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]); - if (!last || IS_ERR(last)) { - put_page(first); - return NULL; - } - - if (!logfs_check_ds(page_address(first))) { - put_page(last); - return first; - } - - /* First one didn't work, try the second superblock */ - if (!logfs_check_ds(page_address(last))) { - put_page(first); - return last; - } - - /* Neither worked, sorry folks */ - put_page(first); - put_page(last); - return NULL; -} - -static int __logfs_read_sb(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct page *page; - struct logfs_disk_super *ds; - int i; - - page = find_super_block(sb); - if (!page) - return -EINVAL; - - ds = page_address(page); - super->s_size = be64_to_cpu(ds->ds_filesystem_size); - super->s_root_reserve = be64_to_cpu(ds->ds_root_reserve); - super->s_speed_reserve = be64_to_cpu(ds->ds_speed_reserve); - super->s_bad_seg_reserve = be32_to_cpu(ds->ds_bad_seg_reserve); - super->s_segsize = 1 << ds->ds_segment_shift; - super->s_segmask = (1 << ds->ds_segment_shift) - 1; - super->s_segshift = ds->ds_segment_shift; - sb->s_blocksize = 1 << ds->ds_block_shift; - sb->s_blocksize_bits = ds->ds_block_shift; - super->s_writesize = 1 << ds->ds_write_shift; - super->s_writeshift = ds->ds_write_shift; - super->s_no_segs = super->s_size >> super->s_segshift; - super->s_no_blocks = super->s_segsize >> sb->s_blocksize_bits; - 
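
Everything geometric in __logfs_read_sb is derived from the shift fields of the on-disk super: a shift yields the size as 1 << shift, the mask as that size minus one, and the segment and block counts fall out as right shifts of the larger totals. A self-contained sketch of those power-of-two relationships, using illustrative values rather than anything read from a real device:

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		/* Illustrative geometry: 128 KiB segments on a 16 MiB device */
		uint8_t  segshift = 17;
		uint64_t fs_size  = 16ull << 20;

		uint64_t segsize = 1ull << segshift;   /* 131072 */
		uint64_t segmask = segsize - 1;
		uint64_t no_segs = fs_size >> segshift; /* 128 */

		/* mask and modulo agree for power-of-two segment sizes */
		assert((200000 & segmask) == 200000 % segsize);
		printf("segsize=%llu segs=%llu\n",
		       (unsigned long long)segsize,
		       (unsigned long long)no_segs);
		return 0;
	}

Storing only the shifts on disk keeps the superblock compact and guarantees the sizes are powers of two, so offset arithmetic everywhere else can use masks and shifts instead of division.
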
super->s_feature_incompat = be64_to_cpu(ds->ds_feature_incompat); - super->s_feature_ro_compat = be64_to_cpu(ds->ds_feature_ro_compat); - super->s_feature_compat = be64_to_cpu(ds->ds_feature_compat); - super->s_feature_flags = be64_to_cpu(ds->ds_feature_flags); - - journal_for_each(i) - super->s_journal_seg[i] = be32_to_cpu(ds->ds_journal_seg[i]); - - super->s_ifile_levels = ds->ds_ifile_levels; - super->s_iblock_levels = ds->ds_iblock_levels; - super->s_data_levels = ds->ds_data_levels; - super->s_total_levels = super->s_ifile_levels + super->s_iblock_levels - + super->s_data_levels; - put_page(page); - return 0; -} - -static int logfs_read_sb(struct super_block *sb, int read_only) -{ - struct logfs_super *super = logfs_super(sb); - int ret; - - super->s_btree_pool = mempool_create(32, btree_alloc, btree_free, NULL); - if (!super->s_btree_pool) - return -ENOMEM; - - btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool); - btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool); - btree_init_mempool32(&super->s_shadow_tree.segment_map, - super->s_btree_pool); - - ret = logfs_init_mapping(sb); - if (ret) - return ret; - - ret = __logfs_read_sb(sb); - if (ret) - return ret; - - if (super->s_feature_incompat & ~LOGFS_FEATURES_INCOMPAT) - return -EIO; - if ((super->s_feature_ro_compat & ~LOGFS_FEATURES_RO_COMPAT) && - !read_only) - return -EIO; - - ret = logfs_init_rw(sb); - if (ret) - return ret; - - ret = logfs_init_areas(sb); - if (ret) - return ret; - - ret = logfs_init_gc(sb); - if (ret) - return ret; - - ret = logfs_init_journal(sb); - if (ret) - return ret; - - return 0; -} - -static void logfs_kill_sb(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - - log_super("LogFS: Start unmounting\n"); - /* Alias entries slow down mount, so evict as many as possible */ - sync_filesystem(sb); - logfs_write_anchor(sb); - free_areas(sb); - - /* - * From this point on alias entries are simply dropped - and any - * writes to the object store are considered bugs. - */ - log_super("LogFS: Now in shutdown\n"); - generic_shutdown_super(sb); - super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN; - - BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes); - - logfs_cleanup_gc(sb); - logfs_cleanup_journal(sb); - logfs_cleanup_areas(sb); - logfs_cleanup_rw(sb); - if (super->s_erase_page) - __free_page(super->s_erase_page); - super->s_devops->put_device(super); - logfs_mempool_destroy(super->s_btree_pool); - logfs_mempool_destroy(super->s_alias_pool); - kfree(super); - log_super("LogFS: Finished unmounting\n"); -} - -static struct dentry *logfs_get_sb_device(struct logfs_super *super, - struct file_system_type *type, int flags) -{ - struct super_block *sb; - int err = -ENOMEM; - static int mount_count; - - log_super("LogFS: Start mount %x\n", mount_count++); - - err = -EINVAL; - sb = sget(type, logfs_sb_test, logfs_sb_set, flags | MS_NOATIME, super); - if (IS_ERR(sb)) { - super->s_devops->put_device(super); - kfree(super); - return ERR_CAST(sb); - } - - if (sb->s_root) { - /* Device is already in use */ - super->s_devops->put_device(super); - kfree(super); - return dget(sb->s_root); - } - - /* - * sb->s_maxbytes is limited to 8TB. On 32bit systems, the page cache - * only covers 16TB and the upper 8TB are used for indirect blocks. - * On 64bit system we could bump up the limit, but that would make - * the filesystem incompatible with 32bit systems. 
- */ - sb->s_maxbytes = (1ull << 43) - 1; - sb->s_max_links = LOGFS_LINK_MAX; - sb->s_op = &logfs_super_operations; - - err = logfs_read_sb(sb, sb->s_flags & MS_RDONLY); - if (err) - goto err1; - - sb->s_flags |= MS_ACTIVE; - err = logfs_get_sb_final(sb); - if (err) { - deactivate_locked_super(sb); - return ERR_PTR(err); - } - return dget(sb->s_root); - -err1: - /* no ->s_root, no ->put_super() */ - iput(super->s_master_inode); - iput(super->s_segfile_inode); - iput(super->s_mapping_inode); - deactivate_locked_super(sb); - return ERR_PTR(err); -} - -static struct dentry *logfs_mount(struct file_system_type *type, int flags, - const char *devname, void *data) -{ - ulong mtdnr; - struct logfs_super *super; - int err; - - super = kzalloc(sizeof(*super), GFP_KERNEL); - if (!super) - return ERR_PTR(-ENOMEM); - - mutex_init(&super->s_dirop_mutex); - mutex_init(&super->s_object_alias_mutex); - INIT_LIST_HEAD(&super->s_freeing_list); - - if (!devname) - err = logfs_get_sb_bdev(super, type, devname); - else if (strncmp(devname, "mtd", 3)) - err = logfs_get_sb_bdev(super, type, devname); - else { - char *garbage; - mtdnr = simple_strtoul(devname+3, &garbage, 0); - if (*garbage) - err = -EINVAL; - else - err = logfs_get_sb_mtd(super, mtdnr); - } - - if (err) { - kfree(super); - return ERR_PTR(err); - } - - return logfs_get_sb_device(super, type, flags); -} - -static struct file_system_type logfs_fs_type = { - .owner = THIS_MODULE, - .name = "logfs", - .mount = logfs_mount, - .kill_sb = logfs_kill_sb, - .fs_flags = FS_REQUIRES_DEV, - -}; -MODULE_ALIAS_FS("logfs"); - -static int __init logfs_init(void) -{ - int ret; - - emergency_page = alloc_pages(GFP_KERNEL, 0); - if (!emergency_page) - return -ENOMEM; - - ret = logfs_compr_init(); - if (ret) - goto out1; - - ret = logfs_init_inode_cache(); - if (ret) - goto out2; - - ret = register_filesystem(&logfs_fs_type); - if (!ret) - return 0; - logfs_destroy_inode_cache(); -out2: - logfs_compr_exit(); -out1: - __free_pages(emergency_page, 0); - return ret; -} - -static void __exit logfs_exit(void) -{ - unregister_filesystem(&logfs_fs_type); - logfs_destroy_inode_cache(); - logfs_compr_exit(); - __free_pages(emergency_page, 0); -} - -module_init(logfs_init); -module_exit(logfs_exit); - -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Joern Engel <joern@logfs.org>"); -MODULE_DESCRIPTION("scalable flash filesystem"); diff --git a/fs/mbcache.c b/fs/mbcache.c index c5bd19ffa326..b19be429d655 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -29,7 +29,7 @@ struct mb_cache { /* log2 of hash table size */ int c_bucket_bits; /* Maximum entries in cache to avoid degrading hash too much */ - int c_max_entries; + unsigned long c_max_entries; /* Protects c_list, c_entry_count */ spinlock_t c_list_lock; struct list_head c_list; @@ -43,7 +43,7 @@ struct mb_cache { static struct kmem_cache *mb_entry_cache; static unsigned long mb_cache_shrink(struct mb_cache *cache, - unsigned int nr_to_scan); + unsigned long nr_to_scan); static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache, u32 key) @@ -155,12 +155,12 @@ out: } /* - * mb_cache_entry_find_first - find the first entry in cache with given key + * mb_cache_entry_find_first - find the first reusable entry with the given key * @cache: cache where we should search * @key: key to look for * - * Search in @cache for entry with key @key. Grabs reference to the first - * entry found and returns the entry. + * Search in @cache for a reusable entry with key @key. 
Grabs reference to the + * first reusable entry found and returns the entry. */ struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, u32 key) @@ -170,14 +170,14 @@ struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, EXPORT_SYMBOL(mb_cache_entry_find_first); /* - * mb_cache_entry_find_next - find next entry in cache with the same + * mb_cache_entry_find_next - find next reusable entry with the same key * @cache: cache where we should search * @entry: entry to start search from * - * Finds next entry in the hash chain which has the same key as @entry. - * If @entry is unhashed (which can happen when deletion of entry races - * with the search), finds the first entry in the hash chain. The function - * drops reference to @entry and returns with a reference to the found entry. + * Finds next reusable entry in the hash chain which has the same key as @entry. + * If @entry is unhashed (which can happen when deletion of entry races with the + * search), finds the first reusable entry in the hash chain. The function drops + * reference to @entry and returns with a reference to the found entry. */ struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache, struct mb_cache_entry *entry) @@ -274,11 +274,11 @@ static unsigned long mb_cache_count(struct shrinker *shrink, /* Shrink number of entries in cache */ static unsigned long mb_cache_shrink(struct mb_cache *cache, - unsigned int nr_to_scan) + unsigned long nr_to_scan) { struct mb_cache_entry *entry; struct hlist_bl_head *head; - unsigned int shrunk = 0; + unsigned long shrunk = 0; spin_lock(&cache->c_list_lock); while (nr_to_scan-- && !list_empty(&cache->c_list)) { @@ -286,7 +286,7 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache, struct mb_cache_entry, e_list); if (entry->e_referenced) { entry->e_referenced = 0; - list_move_tail(&cache->c_list, &entry->e_list); + list_move_tail(&entry->e_list, &cache->c_list); continue; } list_del_init(&entry->e_list); @@ -316,10 +316,9 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache, static unsigned long mb_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { - int nr_to_scan = sc->nr_to_scan; struct mb_cache *cache = container_of(shrink, struct mb_cache, c_shrink); - return mb_cache_shrink(cache, nr_to_scan); + return mb_cache_shrink(cache, sc->nr_to_scan); } /* We shrink 1/X of the cache when we have too many entries in it */ @@ -341,11 +340,8 @@ static void mb_cache_shrink_worker(struct work_struct *work) struct mb_cache *mb_cache_create(int bucket_bits) { struct mb_cache *cache; - int bucket_count = 1 << bucket_bits; - int i; - - if (!try_module_get(THIS_MODULE)) - return NULL; + unsigned long bucket_count = 1UL << bucket_bits; + unsigned long i; cache = kzalloc(sizeof(struct mb_cache), GFP_KERNEL); if (!cache) @@ -377,7 +373,6 @@ struct mb_cache *mb_cache_create(int bucket_bits) return cache; err_out: - module_put(THIS_MODULE); return NULL; } EXPORT_SYMBOL(mb_cache_create); @@ -411,7 +406,6 @@ void mb_cache_destroy(struct mb_cache *cache) } kfree(cache->c_hash); kfree(cache); - module_put(THIS_MODULE); } EXPORT_SYMBOL(mb_cache_destroy); @@ -420,7 +414,8 @@ static int __init mbcache_init(void) mb_entry_cache = kmem_cache_create("mbcache", sizeof(struct mb_cache_entry), 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); - BUG_ON(!mb_entry_cache); + if (!mb_entry_cache) + return -ENOMEM; return 0; } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index f975d667c539..e7d9bf86d975 100644 --- a/fs/minix/inode.c +++ 
b/fs/minix/inode.c @@ -434,7 +434,6 @@ static const struct address_space_operations minix_aops = { }; static const struct inode_operations minix_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = page_get_link, .getattr = minix_getattr, }; diff --git a/fs/mount.h b/fs/mount.h index d2e25d7b64b3..2c856fc47ae3 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -94,6 +94,12 @@ extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *); extern int __legitimize_mnt(struct vfsmount *, unsigned); extern bool legitimize_mnt(struct vfsmount *, unsigned); +static inline bool __path_is_mountpoint(const struct path *path) +{ + struct mount *m = __lookup_mnt(path->mnt, path->dentry); + return m && likely(!(m->mnt.mnt_flags & MNT_SYNC_UMOUNT)); +} + extern void __detach_mounts(struct dentry *dentry); static inline void detach_mounts(struct dentry *dentry) diff --git a/fs/mpage.c b/fs/mpage.c index d2413af0823a..28af984a3d96 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -489,7 +489,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, struct buffer_head map_bh; loff_t i_size = i_size_read(inode); int ret = 0; - int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0); + int op_flags = wbc_to_write_flags(wbc); if (page_has_buffers(page)) { struct buffer_head *head = page_buffers(page); @@ -555,8 +555,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, if (mpd->get_block(inode, block_in_file, &map_bh, 1)) goto confused; if (buffer_new(&map_bh)) - unmap_underlying_metadata(map_bh.b_bdev, - map_bh.b_blocknr); + clean_bdev_bh_alias(&map_bh); if (buffer_boundary(&map_bh)) { boundary_block = map_bh.b_blocknr; boundary_bdev = map_bh.b_bdev; @@ -705,7 +704,7 @@ mpage_writepages(struct address_space *mapping, ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); if (mpd.bio) { int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC : 0); + REQ_SYNC : 0); mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio); } } @@ -726,7 +725,7 @@ int mpage_writepage(struct page *page, get_block_t get_block, int ret = __mpage_writepage(page, wbc, &mpd); if (mpd.bio) { int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC : 0); + REQ_SYNC : 0); mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio); } return ret; diff --git a/fs/namei.c b/fs/namei.c index 5b4eed221530..ad74877e1442 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -37,7 +37,7 @@ #include <linux/hash.h> #include <linux/bitops.h> #include <linux/init_task.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" #include "mount.h" @@ -1200,7 +1200,7 @@ static int follow_managed(struct path *path, struct nameidata *nd) if (managed & DCACHE_MANAGE_TRANSIT) { BUG_ON(!path->dentry->d_op); BUG_ON(!path->dentry->d_op->d_manage); - ret = path->dentry->d_op->d_manage(path->dentry, false); + ret = path->dentry->d_op->d_manage(path, false); if (ret < 0) break; } @@ -1263,10 +1263,10 @@ int follow_down_one(struct path *path) } EXPORT_SYMBOL(follow_down_one); -static inline int managed_dentry_rcu(struct dentry *dentry) +static inline int managed_dentry_rcu(const struct path *path) { - return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ? - dentry->d_op->d_manage(dentry, true) : 0; + return (path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) ? 
+ path->dentry->d_op->d_manage(path, true) : 0; } /* @@ -1282,7 +1282,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, * Don't forget we might have a non-mountpoint managed dentry * that wants to block transit. */ - switch (managed_dentry_rcu(path->dentry)) { + switch (managed_dentry_rcu(path)) { case -ECHILD: default: return false; @@ -1392,8 +1392,7 @@ int follow_down(struct path *path) if (managed & DCACHE_MANAGE_TRANSIT) { BUG_ON(!path->dentry->d_op); BUG_ON(!path->dentry->d_op->d_manage); - ret = path->dentry->d_op->d_manage( - path->dentry, false); + ret = path->dentry->d_op->d_manage(path, false); if (ret < 0) return ret == -EISDIR ? 0 : ret; } @@ -1725,30 +1724,35 @@ static int pick_link(struct nameidata *nd, struct path *link, return 1; } +enum {WALK_FOLLOW = 1, WALK_MORE = 2}; + /* * Do we need to follow links? We _really_ want to be able * to do this check without having to look at inode->i_op, * so we keep a cache of "no, this doesn't need follow_link" * for the common case. */ -static inline int should_follow_link(struct nameidata *nd, struct path *link, - int follow, - struct inode *inode, unsigned seq) +static inline int step_into(struct nameidata *nd, struct path *path, + int flags, struct inode *inode, unsigned seq) { - if (likely(!d_is_symlink(link->dentry))) - return 0; - if (!follow) + if (!(flags & WALK_MORE) && nd->depth) + put_link(nd); + if (likely(!d_is_symlink(path->dentry)) || + !(flags & WALK_FOLLOW || nd->flags & LOOKUP_FOLLOW)) { + /* not a symlink or should not follow */ + path_to_nameidata(path, nd); + nd->inode = inode; + nd->seq = seq; return 0; + } /* make sure that d_is_symlink above matches inode */ if (nd->flags & LOOKUP_RCU) { - if (read_seqcount_retry(&link->dentry->d_seq, seq)) + if (read_seqcount_retry(&path->dentry->d_seq, seq)) return -ECHILD; } - return pick_link(nd, link, inode, seq); + return pick_link(nd, path, inode, seq); } -enum {WALK_GET = 1, WALK_PUT = 2}; - static int walk_component(struct nameidata *nd, int flags) { struct path path; @@ -1762,7 +1766,7 @@ static int walk_component(struct nameidata *nd, int flags) */ if (unlikely(nd->last_type != LAST_NORM)) { err = handle_dots(nd, nd->last_type); - if (flags & WALK_PUT) + if (!(flags & WALK_MORE) && nd->depth) put_link(nd); return err; } @@ -1789,15 +1793,7 @@ static int walk_component(struct nameidata *nd, int flags) inode = d_backing_inode(path.dentry); } - if (flags & WALK_PUT) - put_link(nd); - err = should_follow_link(nd, &path, flags & WALK_GET, inode, seq); - if (unlikely(err)) - return err; - path_to_nameidata(&path, nd); - nd->inode = inode; - nd->seq = seq; - return 0; + return step_into(nd, &path, flags, inode, seq); } /* @@ -2104,9 +2100,10 @@ OK: if (!name) return 0; /* last component of nested symlink */ - err = walk_component(nd, WALK_GET | WALK_PUT); + err = walk_component(nd, WALK_FOLLOW); } else { - err = walk_component(nd, WALK_GET); + /* not the last component */ + err = walk_component(nd, WALK_FOLLOW | WALK_MORE); } if (err < 0) return err; @@ -2248,12 +2245,7 @@ static inline int lookup_last(struct nameidata *nd) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; nd->flags &= ~LOOKUP_PARENT; - return walk_component(nd, - nd->flags & LOOKUP_FOLLOW - ? nd->depth - ? WALK_PUT | WALK_GET - : WALK_GET - : 0); + return walk_component(nd, 0); } /* Returns 0 and nd will be valid on success; Retuns error, otherwise. 
*/ @@ -2558,28 +2550,9 @@ int user_path_at_empty(int dfd, const char __user *name, unsigned flags, } EXPORT_SYMBOL(user_path_at_empty); -/* - * NB: most callers don't do anything directly with the reference to the - * to struct filename, but the nd->last pointer points into the name string - * allocated by getname. So we must hold the reference to it until all - * path-walking is complete. - */ -static inline struct filename * -user_path_parent(int dfd, const char __user *path, - struct path *parent, - struct qstr *last, - int *type, - unsigned int flags) -{ - /* only LOOKUP_REVAL is allowed in extra flags */ - return filename_parentat(dfd, getname(path), flags & LOOKUP_REVAL, - parent, last, type); -} - /** * mountpoint_last - look up last component for umount * @nd: pathwalk nameidata - currently pointing at parent directory of "last" - * @path: pointer to container for result * * This is a special lookup_last function just for umount. In this case, we * need to resolve the path without doing any revalidation. @@ -2592,23 +2565,20 @@ user_path_parent(int dfd, const char __user *path, * * Returns: * -error: if there was an error during lookup. This includes -ENOENT if the - * lookup found a negative dentry. The nd->path reference will also be - * put in this case. + * lookup found a negative dentry. * - * 0: if we successfully resolved nd->path and found it to not to be a - * symlink that needs to be followed. "path" will also be populated. - * The nd->path reference will also be put. + * 0: if we successfully resolved nd->last and found it to not to be a + * symlink that needs to be followed. * * 1: if we successfully resolved nd->last and found it to be a symlink - * that needs to be followed. "path" will be populated with the path - * to the link, and nd->path will *not* be put. + * that needs to be followed. */ static int -mountpoint_last(struct nameidata *nd, struct path *path) +mountpoint_last(struct nameidata *nd) { int error = 0; - struct dentry *dentry; struct dentry *dir = nd->path.dentry; + struct path path; /* If we're in rcuwalk, drop out of it to handle last component */ if (nd->flags & LOOKUP_RCU) { @@ -2622,37 +2592,28 @@ mountpoint_last(struct nameidata *nd, struct path *path) error = handle_dots(nd, nd->last_type); if (error) return error; - dentry = dget(nd->path.dentry); + path.dentry = dget(nd->path.dentry); } else { - dentry = d_lookup(dir, &nd->last); - if (!dentry) { + path.dentry = d_lookup(dir, &nd->last); + if (!path.dentry) { /* * No cached dentry. Mounted dentries are pinned in the * cache, so that means that this dentry is probably * a symlink or the path doesn't actually point * to a mounted dentry. 
*/ - dentry = lookup_slow(&nd->last, dir, + path.dentry = lookup_slow(&nd->last, dir, nd->flags | LOOKUP_NO_REVAL); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); + if (IS_ERR(path.dentry)) + return PTR_ERR(path.dentry); } } - if (d_is_negative(dentry)) { - dput(dentry); + if (d_is_negative(path.dentry)) { + dput(path.dentry); return -ENOENT; } - if (nd->depth) - put_link(nd); - path->dentry = dentry; - path->mnt = nd->path.mnt; - error = should_follow_link(nd, path, nd->flags & LOOKUP_FOLLOW, - d_backing_inode(dentry), 0); - if (unlikely(error)) - return error; - mntget(path->mnt); - follow_mount(path); - return 0; + path.mnt = nd->path.mnt; + return step_into(nd, &path, 0, d_backing_inode(path.dentry), 0); } /** @@ -2672,13 +2633,19 @@ path_mountpoint(struct nameidata *nd, unsigned flags, struct path *path) if (IS_ERR(s)) return PTR_ERR(s); while (!(err = link_path_walk(s, nd)) && - (err = mountpoint_last(nd, path)) > 0) { + (err = mountpoint_last(nd)) > 0) { s = trailing_symlink(nd); if (IS_ERR(s)) { err = PTR_ERR(s); break; } } + if (!err) { + *path = nd->path; + nd->path.mnt = NULL; + nd->path.dentry = NULL; + follow_mount(path); + } terminate_walk(nd); return err; } @@ -2895,7 +2862,7 @@ bool may_open_dev(const struct path *path) !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV); } -static int may_open(struct path *path, int acc_mode, int flag) +static int may_open(const struct path *path, int acc_mode, int flag) { struct dentry *dentry = path->dentry; struct inode *inode = dentry->d_inode; @@ -2945,7 +2912,7 @@ static int may_open(struct path *path, int acc_mode, int flag) static int handle_truncate(struct file *filp) { - struct path *path = &filp->f_path; + const struct path *path = &filp->f_path; struct inode *inode = path->dentry->d_inode; int error = get_write_access(inode); if (error) @@ -3335,18 +3302,11 @@ static int do_last(struct nameidata *nd, seq = 0; /* out of RCU mode, so the value doesn't matter */ inode = d_backing_inode(path.dentry); finish_lookup: - if (nd->depth) - put_link(nd); - error = should_follow_link(nd, &path, nd->flags & LOOKUP_FOLLOW, - inode, seq); + error = step_into(nd, &path, 0, inode, seq); if (unlikely(error)) return error; - - path_to_nameidata(&path, nd); - nd->inode = inode; - nd->seq = seq; - /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ finish_open: + /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ error = complete_walk(nd); if (error) return error; @@ -3861,8 +3821,8 @@ static long do_rmdir(int dfd, const char __user *pathname) int type; unsigned int lookup_flags = 0; retry: - name = user_path_parent(dfd, pathname, - &path, &last, &type, lookup_flags); + name = filename_parentat(dfd, getname(pathname), lookup_flags, + &path, &last, &type); if (IS_ERR(name)) return PTR_ERR(name); @@ -3991,8 +3951,8 @@ static long do_unlinkat(int dfd, const char __user *pathname) struct inode *delegated_inode = NULL; unsigned int lookup_flags = 0; retry: - name = user_path_parent(dfd, pathname, - &path, &last, &type, lookup_flags); + name = filename_parentat(dfd, getname(pathname), lookup_flags, + &path, &last, &type); if (IS_ERR(name)) return PTR_ERR(name); @@ -4345,11 +4305,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, bool new_is_dir = false; unsigned max_links = new_dir->i_sb->s_max_links; - /* - * Check source == target. - * On overlayfs need to look at underlying inodes. 
- */ - if (d_real_inode(old_dentry) == d_real_inode(new_dentry)) + if (source == target) return 0; error = may_delete(old_dir, old_dentry, is_dir); @@ -4491,15 +4447,15 @@ SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, target_flags = 0; retry: - from = user_path_parent(olddfd, oldname, - &old_path, &old_last, &old_type, lookup_flags); + from = filename_parentat(olddfd, getname(oldname), lookup_flags, + &old_path, &old_last, &old_type); if (IS_ERR(from)) { error = PTR_ERR(from); goto exit; } - to = user_path_parent(newdfd, newname, - &new_path, &new_last, &new_type, lookup_flags); + to = filename_parentat(newdfd, getname(newname), lookup_flags, + &new_path, &new_last, &new_type); if (IS_ERR(to)) { error = PTR_ERR(to); goto exit1; @@ -4650,7 +4606,8 @@ out: * have ->get_link() not calling nd_jump_link(). Using (or not using) it * for any given inode is up to filesystem. */ -int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) +static int generic_readlink(struct dentry *dentry, char __user *buffer, + int buflen) { DEFINE_DELAYED_CALL(done); struct inode *inode = d_inode(dentry); @@ -4666,7 +4623,36 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) do_delayed_call(&done); return res; } -EXPORT_SYMBOL(generic_readlink); + +/** + * vfs_readlink - copy symlink body into userspace buffer + * @dentry: dentry on which to get symbolic link + * @buffer: user memory pointer + * @buflen: size of buffer + * + * Does not touch atime. That's up to the caller if necessary + * + * Does not call security hook. + */ +int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) +{ + struct inode *inode = d_inode(dentry); + + if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) { + if (unlikely(inode->i_op->readlink)) + return inode->i_op->readlink(dentry, buffer, buflen); + + if (!d_is_symlink(dentry)) + return -EINVAL; + + spin_lock(&inode->i_lock); + inode->i_opflags |= IOP_DEFAULT_READLINK; + spin_unlock(&inode->i_lock); + } + + return generic_readlink(dentry, buffer, buflen); +} +EXPORT_SYMBOL(vfs_readlink); /** * vfs_get_link - get symlink body @@ -4783,7 +4769,6 @@ int page_symlink(struct inode *inode, const char *symname, int len) EXPORT_SYMBOL(page_symlink); const struct inode_operations page_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = page_get_link, }; EXPORT_SYMBOL(page_symlink_inode_operations); diff --git a/fs/namespace.c b/fs/namespace.c index e6c234b1a645..487ba30bb5c6 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -96,10 +96,6 @@ static inline struct hlist_head *mp_hash(struct dentry *dentry) return &mountpoint_hashtable[tmp & mp_hash_mask]; } -/* - * allocation is serialized by namespace_sem, but we need the spinlock to - * serialize with freeing. - */ static int mnt_alloc_id(struct mount *mnt) { int res; @@ -678,7 +674,7 @@ out: * * lookup_mnt takes a reference to the found vfsmount. 
*/ -struct vfsmount *lookup_mnt(struct path *path) +struct vfsmount *lookup_mnt(const struct path *path) { struct mount *child_mnt; struct vfsmount *m; @@ -746,26 +742,50 @@ static struct mountpoint *lookup_mountpoint(struct dentry *dentry) return NULL; } -static struct mountpoint *new_mountpoint(struct dentry *dentry) +static struct mountpoint *get_mountpoint(struct dentry *dentry) { - struct hlist_head *chain = mp_hash(dentry); - struct mountpoint *mp; + struct mountpoint *mp, *new = NULL; int ret; - mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); - if (!mp) + if (d_mountpoint(dentry)) { +mountpoint: + read_seqlock_excl(&mount_lock); + mp = lookup_mountpoint(dentry); + read_sequnlock_excl(&mount_lock); + if (mp) + goto done; + } + + if (!new) + new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); + if (!new) return ERR_PTR(-ENOMEM); + + /* Exactly one processes may set d_mounted */ ret = d_set_mounted(dentry); - if (ret) { - kfree(mp); - return ERR_PTR(ret); - } - mp->m_dentry = dentry; - mp->m_count = 1; - hlist_add_head(&mp->m_hash, chain); - INIT_HLIST_HEAD(&mp->m_list); + /* Someone else set d_mounted? */ + if (ret == -EBUSY) + goto mountpoint; + + /* The dentry is not available as a mountpoint? */ + mp = ERR_PTR(ret); + if (ret) + goto done; + + /* Add the new mountpoint to the hash table */ + read_seqlock_excl(&mount_lock); + new->m_dentry = dentry; + new->m_count = 1; + hlist_add_head(&new->m_hash, mp_hash(dentry)); + INIT_HLIST_HEAD(&new->m_list); + read_sequnlock_excl(&mount_lock); + + mp = new; + new = NULL; +done: + kfree(new); return mp; } @@ -1034,6 +1054,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, if (IS_MNT_SLAVE(old)) list_add(&mnt->mnt_slave, &old->mnt_slave); mnt->mnt_master = old->mnt_master; + } else { + CLEAR_MNT_SHARED(mnt); } if (flag & CL_MAKE_SHARED) set_mnt_shared(mnt); @@ -1159,7 +1181,36 @@ struct vfsmount *mntget(struct vfsmount *mnt) } EXPORT_SYMBOL(mntget); -struct vfsmount *mnt_clone_internal(struct path *path) +/* path_is_mountpoint() - Check if path is a mount in the current + * namespace. + * + * d_mountpoint() can only be used reliably to establish if a dentry is + * not mounted in any namespace and that common case is handled inline. + * d_mountpoint() isn't aware of the possibility there may be multiple + * mounts using a given dentry in a different namespace. This function + * checks if the passed in path is a mountpoint rather than the dentry + * alone. 
+ */ +bool path_is_mountpoint(const struct path *path) +{ + unsigned seq; + bool res; + + if (!d_mountpoint(path->dentry)) + return false; + + rcu_read_lock(); + do { + seq = read_seqbegin(&mount_lock); + res = __path_is_mountpoint(path); + } while (read_seqretry(&mount_lock, seq)); + rcu_read_unlock(); + + return res; +} +EXPORT_SYMBOL(path_is_mountpoint); + +struct vfsmount *mnt_clone_internal(const struct path *path) { struct mount *p; p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE); @@ -1568,11 +1619,11 @@ void __detach_mounts(struct dentry *dentry) struct mount *mnt; namespace_lock(); + lock_mount_hash(); mp = lookup_mountpoint(dentry); if (IS_ERR_OR_NULL(mp)) goto out_unlock; - lock_mount_hash(); event++; while (!hlist_empty(&mp->m_list)) { mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); @@ -1582,9 +1633,9 @@ void __detach_mounts(struct dentry *dentry) } else umount_tree(mnt, UMOUNT_CONNECTED); } - unlock_mount_hash(); put_mountpoint(mp); out_unlock: + unlock_mount_hash(); namespace_unlock(); } @@ -1758,7 +1809,7 @@ out: /* Caller should check returned pointer for errors */ -struct vfsmount *collect_mounts(struct path *path) +struct vfsmount *collect_mounts(const struct path *path) { struct mount *tree; namespace_lock(); @@ -1791,7 +1842,7 @@ void drop_collected_mounts(struct vfsmount *mnt) * * Release with mntput(). */ -struct vfsmount *clone_private_mount(struct path *path) +struct vfsmount *clone_private_mount(const struct path *path) { struct mount *old_mnt = real_mount(path->mnt); struct mount *new_mnt; @@ -1799,9 +1850,7 @@ struct vfsmount *clone_private_mount(struct path *path) if (IS_MNT_UNBINDABLE(old_mnt)) return ERR_PTR(-EINVAL); - down_read(&namespace_sem); new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); - up_read(&namespace_sem); if (IS_ERR(new_mnt)) return ERR_CAST(new_mnt); @@ -2013,9 +2062,7 @@ retry: namespace_lock(); mnt = lookup_mnt(path); if (likely(!mnt)) { - struct mountpoint *mp = lookup_mountpoint(dentry); - if (!mp) - mp = new_mountpoint(dentry); + struct mountpoint *mp = get_mountpoint(dentry); if (IS_ERR(mp)) { namespace_unlock(); inode_unlock(dentry->d_inode); @@ -2034,7 +2081,11 @@ retry: static void unlock_mount(struct mountpoint *where) { struct dentry *dentry = where->m_dentry; + + read_seqlock_excl(&mount_lock); put_mountpoint(where); + read_sequnlock_excl(&mount_lock); + namespace_unlock(); inode_unlock(dentry->d_inode); } @@ -2997,7 +3048,7 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry, return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry); } -bool path_is_under(struct path *path1, struct path *path2) +bool path_is_under(const struct path *path1, const struct path *path2) { bool res; read_seqlock_excl(&mount_lock); @@ -3110,9 +3161,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, touch_mnt_namespace(current->nsproxy->mnt_ns); /* A moved mount should not expire automatically */ list_del_init(&new_mnt->mnt_expire); + put_mountpoint(root_mp); unlock_mount_hash(); chroot_fs_refs(&root, &new); - put_mountpoint(root_mp); error = 0; out4: unlock_mount(old_mp); diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 6df2a3827574..088f52484d6e 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -18,7 +18,7 @@ #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/namei.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/byteorder.h> #include "ncp_fs.h" diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index dd38ca1f2ecb..76965e772264 100644 --- 
a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -8,7 +8,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/time.h> #include <linux/kernel.h> @@ -203,7 +203,7 @@ ncp_file_write_iter(struct kiocb *iocb, struct iov_iter *from) bufsize - (pos % bufsize), iov_iter_count(from)); - if (copy_from_iter(bouncebuffer, to_write, from) != to_write) { + if (!copy_from_iter_full(bouncebuffer, to_write, from)) { errno = -EFAULT; break; } diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index f6cf4c7e92b1..7eb89c23c847 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -13,7 +13,7 @@ #include <linux/module.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/byteorder.h> #include <linux/time.h> @@ -243,7 +243,6 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo) #if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) static const struct inode_operations ncp_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = page_get_link, .setattr = ncp_notify_change, }; diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index 0a3f9b594602..4434e4977cf3 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -20,7 +20,7 @@ #include <linux/vmalloc.h> #include <linux/sched.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "ncp_fs.h" diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index 33b873b259a8..39f57bef8531 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -18,7 +18,7 @@ #include <linux/fcntl.h> #include <linux/memcontrol.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "ncp_fs.h" diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h index 17cfb743b5bf..b4c87cfcee95 100644 --- a/fs/ncpfs/ncplib_kernel.h +++ b/fs/ncpfs/ncplib_kernel.h @@ -21,7 +21,7 @@ #include <linux/fcntl.h> #include <linux/pagemap.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/byteorder.h> #include <asm/unaligned.h> #include <asm/string.h> diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c index 471bc3d1139e..f32f272ee501 100644 --- a/fs/ncpfs/sock.c +++ b/fs/ncpfs/sock.c @@ -16,7 +16,7 @@ #include <linux/fcntl.h> #include <linux/stat.h> #include <linux/string.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/in.h> #include <linux/net.h> #include <linux/mm.h> diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c index 421b6f91e8ec..a6d26b46fc05 100644 --- a/fs/ncpfs/symlink.c +++ b/fs/ncpfs/symlink.c @@ -21,7 +21,7 @@ */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/errno.h> #include <linux/fs.h> diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index e9aa235e9d10..f073a6d2c6a5 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -110,20 +110,52 @@ out: #if defined(CONFIG_NFS_V4_1) /* - * Lookup a layout by filehandle. + * Lookup a layout inode by stateid * - * Note: gets a refcount on the layout hdr and on its respective inode. - * Caller must put the layout hdr and the inode. 
+ * Note: returns a refcount on the inode and superblock + */ +static struct inode *nfs_layout_find_inode_by_stateid(struct nfs_client *clp, + const nfs4_stateid *stateid) +{ + struct nfs_server *server; + struct inode *inode; + struct pnfs_layout_hdr *lo; + +restart: + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry(lo, &server->layouts, plh_layouts) { + if (stateid != NULL && + !nfs4_stateid_match_other(stateid, &lo->plh_stateid)) + continue; + inode = igrab(lo->plh_inode); + if (!inode) + continue; + if (!nfs_sb_active(inode->i_sb)) { + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + iput(inode); + spin_lock(&clp->cl_lock); + rcu_read_lock(); + goto restart; + } + return inode; + } + } + + return NULL; +} + +/* + * Lookup a layout inode by filehandle. + * + * Note: returns a refcount on the inode and superblock * - * TODO: keep track of all layouts (and delegations) in a hash table - * hashed by filehandle. */ -static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, - struct nfs_fh *fh) +static struct inode *nfs_layout_find_inode_by_fh(struct nfs_client *clp, + const struct nfs_fh *fh) { struct nfs_server *server; struct nfs_inode *nfsi; - struct inode *ino; + struct inode *inode; struct pnfs_layout_hdr *lo; restart: @@ -134,37 +166,38 @@ restart: continue; if (nfsi->layout != lo) continue; - ino = igrab(lo->plh_inode); - if (!ino) - break; - spin_lock(&ino->i_lock); - /* Is this layout in the process of being freed? */ - if (nfsi->layout != lo) { - spin_unlock(&ino->i_lock); - iput(ino); + inode = igrab(lo->plh_inode); + if (!inode) + continue; + if (!nfs_sb_active(inode->i_sb)) { + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + iput(inode); + spin_lock(&clp->cl_lock); + rcu_read_lock(); goto restart; } - pnfs_get_layout_hdr(lo); - spin_unlock(&ino->i_lock); - return lo; + return inode; } } return NULL; } -static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, - struct nfs_fh *fh) +static struct inode *nfs_layout_find_inode(struct nfs_client *clp, + const struct nfs_fh *fh, + const nfs4_stateid *stateid) { - struct pnfs_layout_hdr *lo; + struct inode *inode; spin_lock(&clp->cl_lock); rcu_read_lock(); - lo = get_layout_by_fh_locked(clp, fh); + inode = nfs_layout_find_inode_by_stateid(clp, stateid); + if (!inode) + inode = nfs_layout_find_inode_by_fh(clp, fh); rcu_read_unlock(); spin_unlock(&clp->cl_lock); - return lo; + return inode; } /* @@ -213,18 +246,20 @@ static u32 initiate_file_draining(struct nfs_client *clp, u32 rv = NFS4ERR_NOMATCHING_LAYOUT; LIST_HEAD(free_me_list); - lo = get_layout_by_fh(clp, &args->cbl_fh); - if (!lo) { - trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL, - &args->cbl_stateid, -rv); + ino = nfs_layout_find_inode(clp, &args->cbl_fh, &args->cbl_stateid); + if (!ino) goto out; - } - ino = lo->plh_inode; pnfs_layoutcommit_inode(ino, false); spin_lock(&ino->i_lock); + lo = NFS_I(ino)->layout; + if (!lo) { + spin_unlock(&ino->i_lock); + goto out; + } + pnfs_get_layout_hdr(lo); rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid); if (rv != NFS_OK) goto unlock; @@ -258,10 +293,10 @@ unlock: /* Free all lsegs that are attached to commit buckets */ nfs_commit_inode(ino, 0); pnfs_put_layout_hdr(lo); +out: trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, ino, &args->cbl_stateid, -rv); - iput(ino); -out: + nfs_iput_and_deactive(ino); return rv; } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index ebecfb8fba06..91a8d610ba0f 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -369,9 +369,7 @@
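Both lookup helpers above use the same drop-and-restart idiom: pin the inode with igrab() while clp->cl_lock and the RCU read lock are held, release both around the steps that may sleep (nfs_sb_active() and, on failure, iput()), then rescan from the top because the lists may have changed in the window. A userspace analogue with invented names, using a pthread mutex in place of the spinlock:

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct node {
    struct node *next;
    int refs;        /* protected by list_lock, like the igrab() refcount */
    bool wanted;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;   /* population of the list elided in this sketch */

static bool slow_activate(struct node *n) { (void)n; return true; } /* stub: may sleep */
static void node_put(struct node *n)      { (void)n; }              /* stub: drop ref  */

static struct node *find_and_activate(void)
{
    struct node *n;

restart:
    pthread_mutex_lock(&list_lock);
    for (n = head; n; n = n->next) {
        if (!n->wanted)
            continue;
        n->refs++;                          /* cheap pin under the lock (igrab) */
        pthread_mutex_unlock(&list_lock);   /* slow_activate() may sleep        */
        if (slow_activate(n))               /* nfs_sb_active() analogue         */
            return n;                       /* returned pinned, like the kernel */
        node_put(n);
        goto restart;                       /* list may have changed meanwhile  */
    }
    pthread_mutex_unlock(&list_lock);
    return NULL;
}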
nfs_found_client(const struct nfs_client_initdata *cl_init, * Look up a client by IP address and protocol version * - creates a new record if one doesn't yet exist */ -struct nfs_client * -nfs_get_client(const struct nfs_client_initdata *cl_init, - rpc_authflavor_t authflavour) +struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) { struct nfs_client *clp, *new = NULL; struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id); @@ -655,7 +653,7 @@ static int nfs_init_server(struct nfs_server *server, set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init, RPC_AUTH_UNIX); + clp = nfs_get_client(&cl_init); if (IS_ERR(clp)) { dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); return PTR_ERR(clp); } diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index dff600ae0d74..d7df5e67b0c1 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -391,10 +391,6 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct rcu_assign_pointer(nfsi->delegation, delegation); delegation = NULL; - /* Ensure we revalidate the attributes and page cache! */ - spin_lock(&inode->i_lock); - nfsi->cache_validity |= NFS_INO_REVAL_FORCED; - spin_unlock(&inode->i_lock); trace_nfs4_set_delegation(inode, res->delegation_type); out: diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 5f1af4cd1a33..fad81041f5ab 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -455,14 +455,17 @@ bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx) } /* - * This function is called by the lookup code to request the use of - * readdirplus to accelerate any future lookups in the same + * This function is called by the lookup and getattr code to request the + * use of readdirplus to accelerate any future lookups in the same * directory. */ -static void nfs_advise_use_readdirplus(struct inode *dir) +void nfs_advise_use_readdirplus(struct inode *dir) { - set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags); + struct nfs_inode *nfsi = NFS_I(dir); + + if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) && + !list_empty(&nfsi->open_files)) + set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags); } /* @@ -475,9 +478,12 @@ void nfs_advise_use_readdirplus(struct inode *dir) */ void nfs_force_use_readdirplus(struct inode *dir) { - if (!list_empty(&NFS_I(dir)->open_files)) { - nfs_advise_use_readdirplus(dir); - nfs_zap_mapping(dir, dir->i_mapping); + struct nfs_inode *nfsi = NFS_I(dir); + + if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) && + !list_empty(&nfsi->open_files)) { + set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags); + invalidate_mapping_pages(dir->i_mapping, 0, -1); } } @@ -886,17 +892,6 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc) goto out; } -static bool nfs_dir_mapping_need_revalidate(struct inode *dir) -{ - struct nfs_inode *nfsi = NFS_I(dir); - - if (nfs_attribute_cache_expired(dir)) - return true; - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) - return true; - return false; -} - /* The file offset position represents the dirent entry number. A last cookie cache takes care of the common case of reading the whole directory. @@ -928,7 +923,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) desc->decode = NFS_PROTO(inode)->decode_dirent; desc->plus = nfs_use_readdirplus(inode, ctx) ?
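The dir.c hunks above split the readdirplus hint in two: nfs_advise_use_readdirplus() merely sets the hint bit, while nfs_force_use_readdirplus() also drops the directory's cached pages, and both are now gated on server capability and on the directory having open files. A compact sketch of that split, with invented names and fields:

#include <stdbool.h>

enum { HINT_RDPLUS = 1u << 0 };

struct dir_state {
    unsigned int flags;
    bool server_has_rdplus;   /* nfs_server_capable(..., NFS_CAP_READDIRPLUS) */
    int open_files;           /* !list_empty(&nfsi->open_files)               */
    bool cache_valid;
};

/* Soft hint: use readdirplus next time, keep cached pages. */
static void advise_rdplus(struct dir_state *d)
{
    if (d->server_has_rdplus && d->open_files > 0)
        d->flags |= HINT_RDPLUS;
}

/* Hard hint: a lookup missed, so cached entries are suspect too. */
static void force_rdplus(struct dir_state *d)
{
    if (d->server_has_rdplus && d->open_files > 0) {
        d->flags |= HINT_RDPLUS;
        d->cache_valid = false;   /* like invalidate_mapping_pages() */
    }
}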
1 : 0; - if (ctx->pos == 0 || nfs_dir_mapping_need_revalidate(inode)) + if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) res = nfs_revalidate_mapping(inode, file->f_mapping); if (res < 0) goto out; @@ -1035,8 +1030,6 @@ EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate); static int nfs_check_verifier(struct inode *dir, struct dentry *dentry, int rcu_walk) { - int ret; - if (IS_ROOT(dentry)) return 1; if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) @@ -1044,12 +1037,12 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry, if (!nfs_verify_change_attribute(dir, dentry->d_time)) return 0; /* Revalidate nfsi->cache_change_attribute before we declare a match */ - if (rcu_walk) - ret = nfs_revalidate_inode_rcu(NFS_SERVER(dir), dir); - else - ret = nfs_revalidate_inode(NFS_SERVER(dir), dir); - if (ret < 0) - return 0; + if (nfs_mapping_need_revalidate_inode(dir)) { + if (rcu_walk) + return 0; + if (__nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0) + return 0; + } if (!nfs_verify_change_attribute(dir, dentry->d_time)) return 0; return 1; @@ -1161,7 +1154,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; goto out_bad; } - goto out_valid_noent; + goto out_valid; } if (is_bad_inode(inode)) { @@ -1184,6 +1177,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; goto out_zap_parent; } + nfs_advise_use_readdirplus(dir); goto out_valid; } @@ -1219,12 +1213,12 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) nfs_free_fhandle(fhandle); nfs4_label_free(label); + /* set a readdirplus hint that we had a cache miss */ + nfs_force_use_readdirplus(dir); + out_set_verifier: nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out_valid: - /* Success: notify readdir to use READDIRPLUS */ - nfs_advise_use_readdirplus(dir); - out_valid_noent: if (flags & LOOKUP_RCU) { if (parent != ACCESS_ONCE(dentry->d_parent)) return -ECHILD; @@ -1279,8 +1273,8 @@ out_error: */ static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags) { - int error; struct inode *inode = d_inode(dentry); + int error = 0; /* * I believe we can only get a negative dentry here in the case of a @@ -1299,7 +1293,8 @@ static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags) return 0; } - error = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (nfs_mapping_need_revalidate_inode(inode)) + error = __nfs_revalidate_inode(NFS_SERVER(inode), inode); dfprintk(LOOKUPCACHE, "NFS: %s: inode %lu is %s\n", __func__, inode->i_ino, error ? 
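The nfs_check_verifier() rework above encodes the usual RCU-walk contract: in RCU mode the code may not sleep, so a needed revalidation makes it return 0 and the VFS retries the lookup in ref-walk mode, where the blocking revalidate is allowed. A sketch of the convention, with invented names:

#include <stdbool.h>

struct dir_state2 { bool needs_reval; bool verifier_ok; };

static int revalidate_blocking(struct dir_state2 *d)   /* sleeping RPC */
{
    d->needs_reval = false;
    return 0;
}

/* Returns 1 if the cached dentry may be used, 0 to force a fresh lookup. */
static int check_verifier(struct dir_state2 *d, bool rcu_walk)
{
    if (d->needs_reval) {
        if (rcu_walk)
            return 0;              /* may not sleep: retry in ref-walk */
        if (revalidate_blocking(d) < 0)
            return 0;
    }
    return d->verifier_ok ? 1 : 0;
}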
"invalid" : "valid"); return !error; @@ -1424,8 +1419,8 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in if (IS_ERR(res)) goto out_label; - /* Success: notify readdir to use READDIRPLUS */ - nfs_advise_use_readdirplus(dir); + /* Notify readdir to use READDIRPLUS */ + nfs_force_use_readdirplus(dir); no_entry: res = d_splice_alias(inode, dentry); @@ -1467,9 +1462,9 @@ static fmode_t flags_to_mode(int flags) return res; } -static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags) +static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags, struct file *filp) { - return alloc_nfs_open_context(dentry, flags_to_mode(open_flags)); + return alloc_nfs_open_context(dentry, flags_to_mode(open_flags), filp); } static int do_open(struct inode *inode, struct file *filp) @@ -1535,8 +1530,13 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, return -ENAMETOOLONG; if (open_flags & O_CREAT) { + struct nfs_server *server = NFS_SERVER(dir); + + if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) + mode &= ~current_umask(); + attr.ia_valid |= ATTR_MODE; - attr.ia_mode = mode & ~current_umask(); + attr.ia_mode = mode; } if (open_flags & O_TRUNC) { attr.ia_valid |= ATTR_SIZE; @@ -1554,7 +1554,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, return finish_no_open(file, dentry); } - ctx = create_nfs_open_context(dentry, open_flags); + ctx = create_nfs_open_context(dentry, open_flags, file); err = PTR_ERR(ctx); if (IS_ERR(ctx)) goto out; @@ -2286,8 +2286,7 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str if (cache == NULL) goto out; /* Found an entry, is our attribute cache valid? */ - if (!nfs_attribute_cache_expired(inode) && - !(nfsi->cache_validity & NFS_INO_INVALID_ATTR)) + if (!nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS)) break; err = -ECHILD; if (!may_block) @@ -2335,12 +2334,12 @@ static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred, cache = NULL; if (cache == NULL) goto out; - err = nfs_revalidate_inode_rcu(NFS_SERVER(inode), inode); - if (err) + if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS)) goto out; res->jiffies = cache->jiffies; res->cred = cache->cred; res->mask = cache->mask; + err = 0; out: rcu_read_unlock(); return err; @@ -2492,12 +2491,13 @@ EXPORT_SYMBOL_GPL(nfs_may_open); static int nfs_execute_ok(struct inode *inode, int mask) { struct nfs_server *server = NFS_SERVER(inode); - int ret; + int ret = 0; - if (mask & MAY_NOT_BLOCK) - ret = nfs_revalidate_inode_rcu(server, inode); - else - ret = nfs_revalidate_inode(server, inode); + if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS)) { + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + ret = __nfs_revalidate_inode(server, inode); + } if (ret == 0 && !execute_ok(inode)) ret = -EACCES; return ret; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index bd81bcf3ffcf..aab32fc3d6a8 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -52,7 +52,7 @@ #include <linux/nfs_page.h> #include <linux/sunrpc/clnt.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/atomic.h> #include "internal.h" @@ -105,7 +105,7 @@ struct nfs_direct_req { static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops; static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops; -static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode); +static void 
nfs_direct_write_complete(struct nfs_direct_req *dreq); static void nfs_direct_write_schedule_work(struct work_struct *work); static inline void get_dreq(struct nfs_direct_req *dreq) @@ -684,7 +684,7 @@ out_failed: } if (put_dreq(dreq)) - nfs_direct_write_complete(dreq, dreq->inode); + nfs_direct_write_complete(dreq); } static void nfs_direct_commit_complete(struct nfs_commit_data *data) @@ -717,7 +717,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) } if (atomic_dec_and_test(&cinfo.mds->rpcs_out)) - nfs_direct_write_complete(dreq, data->inode); + nfs_direct_write_complete(dreq); } static void nfs_direct_resched_write(struct nfs_commit_info *cinfo, @@ -768,7 +768,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work) } } -static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) +static void nfs_direct_write_complete(struct nfs_direct_req *dreq) { schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */ } @@ -824,7 +824,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) out_put: if (put_dreq(dreq)) - nfs_direct_write_complete(dreq, hdr->inode); + nfs_direct_write_complete(dreq); hdr->release(hdr); } @@ -953,7 +953,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, } if (put_dreq(dreq)) - nfs_direct_write_complete(dreq, dreq->inode); + nfs_direct_write_complete(dreq); return 0; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 9ea85ae23c32..26dbe8b0c10d 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -29,7 +29,7 @@ #include <linux/gfp.h> #include <linux/swap.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "delegation.h" #include "internal.h" @@ -101,18 +101,11 @@ EXPORT_SYMBOL_GPL(nfs_file_release); static int nfs_revalidate_file_size(struct inode *inode, struct file *filp) { struct nfs_server *server = NFS_SERVER(inode); - struct nfs_inode *nfsi = NFS_I(inode); - - if (nfs_have_delegated_attributes(inode)) - goto out_noreval; if (filp->f_flags & O_DIRECT) goto force_reval; - if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) - goto force_reval; - if (nfs_attribute_timeout(inode)) + if (nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE)) goto force_reval; -out_noreval: return 0; force_reval: return __nfs_revalidate_inode(server, inode); @@ -374,7 +367,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, */ if (!PageUptodate(page)) { unsigned pglen = nfs_page_length(page); - unsigned end = offset + len; + unsigned end = offset + copied; if (pglen == 0) { zero_user_segments(page, 0, offset, diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 4946ef40ba87..f956ca20a8a3 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -279,11 +279,11 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo, dataserver_retrans, 4, - s->nfs_client->cl_minorversion, - s->nfs_client->cl_rpcclient->cl_auth->au_flavor); + s->nfs_client->cl_minorversion); out_test_devid: - if (filelayout_test_devid_unavailable(devid)) + if (ret->ds_clp == NULL || + filelayout_test_devid_unavailable(devid)) ret = NULL; out: return ret; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 98ace127bf86..0ca4af8cca5d 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -25,9 +25,20 @@ #define NFSDBG_FACILITY 
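Note also the nfs_write_end() fix above, which changes the zeroing bound from offset + len to offset + copied: on a short copy only the bytes that actually landed are valid, so a not-yet-uptodate page must be zeroed around 'copied', never around the requested length. A sketch of the corrected bracketing (assumes offset + copied <= page size):

#include <stddef.h>
#include <string.h>

#define PAGE_SZ 4096

/* Page not yet uptodate: zero everything we did not just write,
 * bracketing 'copied' (what really landed), not the requested length. */
static void fixup_partial_write(char page[PAGE_SZ], size_t offset, size_t copied)
{
    memset(page, 0, offset);                               /* head */
    memset(page + offset + copied, 0,
           PAGE_SZ - offset - copied);                     /* tail */
}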
NFSDBG_PNFS_LD #define FF_LAYOUT_POLL_RETRY_MAX (15*HZ) +#define FF_LAYOUTRETURN_MAXERR 20 + static struct group_info *ff_zero_group; +static void ff_layout_read_record_layoutstats_done(struct rpc_task *task, + struct nfs_pgio_header *hdr); +static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, + struct nfs42_layoutstat_devinfo *devinfo, + int dev_limit); +static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr, + const struct nfs42_layoutstat_devinfo *devinfo, + struct nfs4_ff_layout_mirror *mirror); + static struct pnfs_layout_hdr * ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) { @@ -172,7 +183,7 @@ ff_layout_add_mirror(struct pnfs_layout_hdr *lo, spin_lock(&inode->i_lock); list_for_each_entry(pos, &ff_layout->mirrors, mirrors) { - if (mirror->mirror_ds != pos->mirror_ds) + if (memcmp(&mirror->devid, &pos->devid, sizeof(pos->devid)) != 0) continue; if (!ff_mirror_match_fh(mirror, pos)) continue; @@ -349,19 +360,6 @@ static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls) } } -static void ff_layout_mark_devices_valid(struct nfs4_ff_layout_segment *fls) -{ - struct nfs4_deviceid_node *node; - int i; - - if (!(fls->flags & FF_FLAGS_NO_IO_THRU_MDS)) - return; - for (i = 0; i < fls->mirror_array_cnt; i++) { - node = &fls->mirror_array[i]->mirror_ds->id_node; - clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags); - } -} - static struct pnfs_layout_segment * ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, struct nfs4_layoutget_res *lgr, @@ -415,8 +413,6 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, for (i = 0; i < fls->mirror_array_cnt; i++) { struct nfs4_ff_layout_mirror *mirror; - struct nfs4_deviceid devid; - struct nfs4_deviceid_node *idnode; struct auth_cred acred = { .group_info = ff_zero_group }; struct rpc_cred __rcu *cred; u32 ds_count, fh_count, id; @@ -441,24 +437,10 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, fls->mirror_array[i]->ds_count = ds_count; /* deviceid */ - rc = decode_deviceid(&stream, &devid); + rc = decode_deviceid(&stream, &fls->mirror_array[i]->devid); if (rc) goto out_err_free; - idnode = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode), - &devid, lh->plh_lc_cred, - gfp_flags); - /* - * upon success, mirror_ds is allocated by previous - * getdeviceinfo, or newly by .alloc_deviceid_node - * nfs4_find_get_deviceid failure is indeed getdeviceinfo falure - */ - if (idnode) - fls->mirror_array[i]->mirror_ds = - FF_LAYOUT_MIRROR_DS(idnode); - else - goto out_err_free; - /* efficiency */ rc = -EIO; p = xdr_inline_decode(&stream, 4); @@ -556,8 +538,6 @@ out_sort_mirrors: rc = ff_layout_check_layout(lgr); if (rc) goto out_err_free; - ff_layout_mark_devices_valid(fls); - ret = &fls->generic_hdr; dprintk("<-- %s (success)\n", __func__); out_free_page: @@ -639,12 +619,11 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, struct nfs4_ff_layoutstat *layoutstat, ktime_t now) { - static const ktime_t notime = {0}; s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL; struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(mirror->layout); nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now); - if (ktime_equal(mirror->start_time, notime)) + if (!mirror->start_time) mirror->start_time = now; if (mirror->report_interval != 0) report_interval = (s64)mirror->report_interval * 1000LL; @@ -702,6 +681,7 @@ nfs4_ff_layout_stat_io_start_read(struct inode *inode, spin_lock(&mirror->lock); report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now); 
nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested); + set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); if (report) @@ -718,6 +698,7 @@ nfs4_ff_layout_stat_io_end_read(struct rpc_task *task, nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat, requested, completed, ktime_get(), task->tk_start); + set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); } @@ -731,6 +712,7 @@ nfs4_ff_layout_stat_io_start_write(struct inode *inode, spin_lock(&mirror->lock); report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat, now); nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested); + set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); if (report) @@ -750,6 +732,7 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task, spin_lock(&mirror->lock); nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat, requested, completed, ktime_get(), task->tk_start); + set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); } @@ -1142,7 +1125,8 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, case -EPIPE: dprintk("%s DS connection error %d\n", __func__, task->tk_status); - nfs4_mark_deviceid_unavailable(devid); + nfs4_delete_deviceid(devid->ld, devid->nfs_client, + &devid->deviceid); rpc_wake_up(&tbl->slot_tbl_waitq); /* fall through */ default: @@ -1191,7 +1175,8 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, default: dprintk("%s DS connection error %d\n", __func__, task->tk_status); - nfs4_mark_deviceid_unavailable(devid); + nfs4_delete_deviceid(devid->ld, devid->nfs_client, + &devid->deviceid); } /* FIXME: Need to prevent infinite looping here. */ return -NFS4ERR_RESET_TO_PNFS; @@ -1293,6 +1278,7 @@ static int ff_layout_read_done_cb(struct rpc_task *task, hdr->pgio_mirror_idx + 1, &hdr->pgio_mirror_idx)) goto out_eagain; + ff_layout_read_record_layoutstats_done(task, hdr); pnfs_read_resend_pnfs(hdr); return task->tk_status; case -NFS4ERR_RESET_TO_MDS: @@ -1961,38 +1947,88 @@ ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d) id_node)); } -static int ff_layout_encode_ioerr(struct nfs4_flexfile_layout *flo, - struct xdr_stream *xdr, - const struct nfs4_layoutreturn_args *args) +static int ff_layout_encode_ioerr(struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args, + const struct nfs4_flexfile_layoutreturn_args *ff_args) { - struct pnfs_layout_hdr *hdr = &flo->generic_hdr; __be32 *start; - int count = 0, ret = 0; start = xdr_reserve_space(xdr, 4); if (unlikely(!start)) return -E2BIG; + *start = cpu_to_be32(ff_args->num_errors); /* This assume we always return _ALL_ layouts */ - spin_lock(&hdr->plh_inode->i_lock); - ret = ff_layout_encode_ds_ioerr(flo, xdr, &count, &args->range); - spin_unlock(&hdr->plh_inode->i_lock); + return ff_layout_encode_ds_ioerr(xdr, &ff_args->errors); +} - *start = cpu_to_be32(count); +static void +encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len) +{ + __be32 *p; - return ret; + p = xdr_reserve_space(xdr, len); + xdr_encode_opaque_fixed(p, buf, len); +} + +static void +ff_layout_encode_ff_iostat_head(struct xdr_stream *xdr, + const nfs4_stateid *stateid, + const struct nfs42_layoutstat_devinfo *devinfo) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 8 + 8); + p = xdr_encode_hyper(p, devinfo->offset); + p = xdr_encode_hyper(p, devinfo->length); + encode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE); + p = xdr_reserve_space(xdr, 4*8); + p 
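The repeated set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags) additions above mark a mirror as having fresh I/O statistics; the layoutstats harvesting code later in this diff only reports mirrors whose bit it can test_and_clear, so idle mirrors are skipped. A userspace sketch of the produce/consume flag with C11 atomics (invented names):

#include <stdatomic.h>
#include <stdbool.h>

#define STAT_AVAIL 1UL

struct mirror_sketch {
    _Atomic unsigned long flags;
    unsigned long long bytes;   /* stands in for the io_stat counters */
};

static void record_io(struct mirror_sketch *m, unsigned long long n)
{
    m->bytes += n;                              /* under mirror->lock in the kernel */
    atomic_fetch_or(&m->flags, STAT_AVAIL);     /* set_bit()                        */
}

/* test_and_clear_bit(): report only mirrors that saw I/O since last time. */
static bool harvest(struct mirror_sketch *m)
{
    return atomic_fetch_and(&m->flags, ~STAT_AVAIL) & STAT_AVAIL;
}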
= xdr_encode_hyper(p, devinfo->read_count); + p = xdr_encode_hyper(p, devinfo->read_bytes); + p = xdr_encode_hyper(p, devinfo->write_count); + p = xdr_encode_hyper(p, devinfo->write_bytes); + encode_opaque_fixed(xdr, devinfo->dev_id.data, NFS4_DEVICEID4_SIZE); +} + +static void +ff_layout_encode_ff_iostat(struct xdr_stream *xdr, + const nfs4_stateid *stateid, + const struct nfs42_layoutstat_devinfo *devinfo) +{ + ff_layout_encode_ff_iostat_head(xdr, stateid, devinfo); + ff_layout_encode_ff_layoutupdate(xdr, devinfo, + devinfo->ld_private.data); } /* report nothing for now */ -static void ff_layout_encode_iostats(struct nfs4_flexfile_layout *flo, - struct xdr_stream *xdr, - const struct nfs4_layoutreturn_args *args) +static void ff_layout_encode_iostats_array(struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args, + struct nfs4_flexfile_layoutreturn_args *ff_args) { __be32 *p; + int i; p = xdr_reserve_space(xdr, 4); - if (likely(p)) - *p = cpu_to_be32(0); + *p = cpu_to_be32(ff_args->num_dev); + for (i = 0; i < ff_args->num_dev; i++) + ff_layout_encode_ff_iostat(xdr, + &args->layout->plh_stateid, + &ff_args->devinfo[i]); +} + +static void +ff_layout_free_iostats_array(struct nfs42_layoutstat_devinfo *devinfo, + unsigned int num_entries) +{ + unsigned int i; + + for (i = 0; i < num_entries; i++) { + if (!devinfo[i].ld_private.ops) + continue; + if (!devinfo[i].ld_private.ops->free) + continue; + devinfo[i].ld_private.ops->free(&devinfo[i].ld_private); + } } static struct nfs4_deviceid_node * @@ -2008,24 +2044,91 @@ ff_layout_alloc_deviceid_node(struct nfs_server *server, } static void -ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo, - struct xdr_stream *xdr, - const struct nfs4_layoutreturn_args *args) -{ - struct nfs4_flexfile_layout *flo = FF_LAYOUT_FROM_HDR(lo); +ff_layout_encode_layoutreturn(struct xdr_stream *xdr, + const void *voidargs, + const struct nfs4_xdr_opaque_data *ff_opaque) +{ + const struct nfs4_layoutreturn_args *args = voidargs; + struct nfs4_flexfile_layoutreturn_args *ff_args = ff_opaque->data; + struct xdr_buf tmp_buf = { + .head = { + [0] = { + .iov_base = page_address(ff_args->pages[0]), + }, + }, + .buflen = PAGE_SIZE, + }; + struct xdr_stream tmp_xdr; __be32 *start; dprintk("%s: Begin\n", __func__); - start = xdr_reserve_space(xdr, 4); - BUG_ON(!start); - ff_layout_encode_ioerr(flo, xdr, args); - ff_layout_encode_iostats(flo, xdr, args); + xdr_init_encode(&tmp_xdr, &tmp_buf, NULL); + + ff_layout_encode_ioerr(&tmp_xdr, args, ff_args); + ff_layout_encode_iostats_array(&tmp_xdr, args, ff_args); + + start = xdr_reserve_space(xdr, 4); + *start = cpu_to_be32(tmp_buf.len); + xdr_write_pages(xdr, ff_args->pages, 0, tmp_buf.len); - *start = cpu_to_be32((xdr->p - start - 1) * 4); dprintk("%s: Return\n", __func__); } +static void +ff_layout_free_layoutreturn(struct nfs4_xdr_opaque_data *args) +{ + struct nfs4_flexfile_layoutreturn_args *ff_args; + + if (!args->data) + return; + ff_args = args->data; + args->data = NULL; + + ff_layout_free_ds_ioerr(&ff_args->errors); + ff_layout_free_iostats_array(ff_args->devinfo, ff_args->num_dev); + + put_page(ff_args->pages[0]); + kfree(ff_args); +} + +const struct nfs4_xdr_opaque_ops layoutreturn_ops = { + .encode = ff_layout_encode_layoutreturn, + .free = ff_layout_free_layoutreturn, +}; + +static int +ff_layout_prepare_layoutreturn(struct nfs4_layoutreturn_args *args) +{ + struct nfs4_flexfile_layoutreturn_args *ff_args; + struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(args->layout); + + ff_args = 
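The rewritten ff_layout_encode_layoutreturn() above encodes the opaque body into a scratch page through a private xdr_stream first, then reserves four bytes in the real stream for the length and appends the page with xdr_write_pages(); that replaces the old back-patching arithmetic on xdr->p. A minimal userspace sketch of the resulting wire shape, a length-prefixed opaque with an XDR-style big-endian length (caller must provide room for 4 + body_len bytes):

#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>   /* htonl() */

/* The body has already been built in a scratch buffer; now emit
 * <length><body>, which is what reserve-4-then-write-pages achieves. */
static size_t emit_length_prefixed(uint8_t *out,
                                   const uint8_t *body, size_t body_len)
{
    uint32_t be_len = htonl((uint32_t)body_len);

    memcpy(out, &be_len, 4);          /* the 4 bytes "reserved" up front */
    memcpy(out + 4, body, body_len);  /* xdr_write_pages() analogue      */
    return 4 + body_len;
}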
kmalloc(sizeof(*ff_args), GFP_KERNEL); + if (!ff_args) + goto out_nomem; + ff_args->pages[0] = alloc_page(GFP_KERNEL); + if (!ff_args->pages[0]) + goto out_nomem_free; + + INIT_LIST_HEAD(&ff_args->errors); + ff_args->num_errors = ff_layout_fetch_ds_ioerr(args->layout, + &args->range, &ff_args->errors, + FF_LAYOUTRETURN_MAXERR); + + spin_lock(&args->inode->i_lock); + ff_args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr, + &ff_args->devinfo[0], ARRAY_SIZE(ff_args->devinfo)); + spin_unlock(&args->inode->i_lock); + + args->ld_private->ops = &layoutreturn_ops; + args->ld_private->data = ff_args; + return 0; +out_nomem_free: + kfree(ff_args); +out_nomem: + return -ENOMEM; +} + static int ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen) { @@ -2146,21 +2249,18 @@ ff_layout_encode_io_latency(struct xdr_stream *xdr, } static void -ff_layout_encode_layoutstats(struct xdr_stream *xdr, - struct nfs42_layoutstat_args *args, - struct nfs42_layoutstat_devinfo *devinfo) +ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr, + const struct nfs42_layoutstat_devinfo *devinfo, + struct nfs4_ff_layout_mirror *mirror) { - struct nfs4_ff_layout_mirror *mirror = devinfo->layout_private; struct nfs4_pnfs_ds_addr *da; struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds; struct nfs_fh *fh = &mirror->fh_versions[0]; - __be32 *p, *start; + __be32 *p; da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node); dprintk("%s: DS %s: encoding address %s\n", __func__, ds->ds_remotestr, da->da_remotestr); - /* layoutupdate length */ - start = xdr_reserve_space(xdr, 4); /* netaddr4 */ ff_layout_encode_netaddr(xdr, da); /* nfs_fh4 */ @@ -2177,42 +2277,71 @@ ff_layout_encode_layoutstats(struct xdr_stream *xdr, /* bool */ p = xdr_reserve_space(xdr, 4); *p = cpu_to_be32(false); +} + +static void +ff_layout_encode_layoutstats(struct xdr_stream *xdr, const void *args, + const struct nfs4_xdr_opaque_data *opaque) +{ + struct nfs42_layoutstat_devinfo *devinfo = container_of(opaque, + struct nfs42_layoutstat_devinfo, ld_private); + __be32 *start; + + /* layoutupdate length */ + start = xdr_reserve_space(xdr, 4); + ff_layout_encode_ff_layoutupdate(xdr, devinfo, opaque->data); *start = cpu_to_be32((xdr->p - start - 1) * 4); } +static void +ff_layout_free_layoutstats(struct nfs4_xdr_opaque_data *opaque) +{ + struct nfs4_ff_layout_mirror *mirror = opaque->data; + + ff_layout_put_mirror(mirror); +} + +static const struct nfs4_xdr_opaque_ops layoutstat_ops = { + .encode = ff_layout_encode_layoutstats, + .free = ff_layout_free_layoutstats, +}; + static int -ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args, - struct pnfs_layout_hdr *lo, +ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, + struct nfs42_layoutstat_devinfo *devinfo, int dev_limit) { struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo); struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid_node *dev; - struct nfs42_layoutstat_devinfo *devinfo; int i = 0; list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) { if (i >= dev_limit) break; - if (!mirror->mirror_ds) + if (IS_ERR_OR_NULL(mirror->mirror_ds)) + continue; + if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags)) continue; /* mirror refcount put in cleanup_layoutstats */ if (!atomic_inc_not_zero(&mirror->ref)) continue; dev = &mirror->mirror_ds->id_node; - devinfo = &args->devinfo[i]; memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE); devinfo->offset = 0; devinfo->length = NFS4_MAX_UINT64; + 
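ff_layout_prepare_layoutreturn() above uses the kernel's standard backwards-unwinding error labels: each later allocation failure jumps to a label that frees exactly what was set up before it. The shape, reduced to a userspace sketch:

#include <stdlib.h>

struct two_allocs { void *a; void *b; };

static int prepare(struct two_allocs *p)
{
    p->a = malloc(64);
    if (!p->a)
        goto out_nomem;
    p->b = malloc(4096);
    if (!p->b)
        goto out_free_a;        /* unwind in reverse order of setup */
    return 0;                   /* caller owns and frees a and b    */

out_free_a:
    free(p->a);
out_nomem:
    return -1;                  /* -ENOMEM in the kernel */
}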
spin_lock(&mirror->lock); devinfo->read_count = mirror->read_stat.io_stat.ops_completed; devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed; devinfo->write_count = mirror->write_stat.io_stat.ops_completed; devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed; + spin_unlock(&mirror->lock); devinfo->layout_type = LAYOUT_FLEX_FILES; - devinfo->layoutstats_encode = ff_layout_encode_layoutstats; - devinfo->layout_private = mirror; + devinfo->ld_private.ops = &layoutstat_ops; + devinfo->ld_private.data = mirror; + devinfo++; i++; } return i; @@ -2222,47 +2351,27 @@ static int ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) { struct nfs4_flexfile_layout *ff_layout; - struct nfs4_ff_layout_mirror *mirror; - int dev_count = 0; + const int dev_count = PNFS_LAYOUTSTATS_MAXDEV; - spin_lock(&args->inode->i_lock); - ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout); - list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) { - if (atomic_read(&mirror->ref) != 0) - dev_count ++; - } - spin_unlock(&args->inode->i_lock); /* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */ - if (dev_count > PNFS_LAYOUTSTATS_MAXDEV) { - dprintk("%s: truncating devinfo to limit (%d:%d)\n", - __func__, dev_count, PNFS_LAYOUTSTATS_MAXDEV); - dev_count = PNFS_LAYOUTSTATS_MAXDEV; - } args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo), GFP_NOIO); if (!args->devinfo) return -ENOMEM; spin_lock(&args->inode->i_lock); - args->num_dev = ff_layout_mirror_prepare_stats(args, - &ff_layout->generic_hdr, dev_count); + ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout); + args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr, + &args->devinfo[0], dev_count); spin_unlock(&args->inode->i_lock); + if (!args->num_dev) { + kfree(args->devinfo); + args->devinfo = NULL; + return -ENOENT; + } return 0; } -static void -ff_layout_cleanup_layoutstats(struct nfs42_layoutstat_data *data) -{ - struct nfs4_ff_layout_mirror *mirror; - int i; - - for (i = 0; i < data->args.num_dev; i++) { - mirror = data->args.devinfo[i].layout_private; - data->args.devinfo[i].layout_private = NULL; - ff_layout_put_mirror(mirror); - } -} - static struct pnfs_layoutdriver_type flexfilelayout_type = { .id = LAYOUT_FLEX_FILES, .name = "LAYOUT_FLEX_FILES", @@ -2284,10 +2393,9 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .read_pagelist = ff_layout_read_pagelist, .write_pagelist = ff_layout_write_pagelist, .alloc_deviceid_node = ff_layout_alloc_deviceid_node, - .encode_layoutreturn = ff_layout_encode_layoutreturn, + .prepare_layoutreturn = ff_layout_prepare_layoutreturn, .sync = pnfs_nfs_generic_sync, .prepare_layoutstats = ff_layout_prepare_layoutstats, - .cleanup_layoutstats = ff_layout_cleanup_layoutstats, }; static int __init nfs4flexfilelayout_init(void) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 3ee0c9fcea76..f4f39b0ab09b 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -21,6 +21,7 @@ /* LAYOUTSTATS report interval in ms */ #define FF_LAYOUTSTATS_REPORT_INTERVAL (60000L) +#define FF_LAYOUTSTATS_MAXDEV 4 struct nfs4_ff_ds_version { u32 version; @@ -73,6 +74,7 @@ struct nfs4_ff_layout_mirror { struct list_head mirrors; u32 ds_count; u32 efficiency; + struct nfs4_deviceid devid; struct nfs4_ff_layout_ds *mirror_ds; u32 fh_versions_cnt; struct nfs_fh *fh_versions; @@ -81,12 +83,15 @@ struct nfs4_ff_layout_mirror { struct rpc_cred __rcu *rw_cred; 
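Note the spin_lock(&mirror->lock)/spin_unlock() pair added above around the four counter copies: the snapshot into devinfo is taken atomically with respect to the I/O paths that update read_stat and write_stat under the same lock, so a layoutstats RPC never reports a torn mixture of old and new counters. Sketch:

#include <pthread.h>

struct mirror_stats { unsigned long long reads, writes; };
struct mirror_obj   { pthread_mutex_t lock; struct mirror_stats stat; };
struct devinfo_snap { struct mirror_stats snap; };

static void snapshot(struct devinfo_snap *d, struct mirror_obj *m)
{
    pthread_mutex_lock(&m->lock);   /* spin_lock(&mirror->lock)    */
    d->snap = m->stat;              /* consistent pair of counters */
    pthread_mutex_unlock(&m->lock);
}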
atomic_t ref; spinlock_t lock; + unsigned long flags; struct nfs4_ff_layoutstat read_stat; struct nfs4_ff_layoutstat write_stat; ktime_t start_time; u32 report_interval; }; +#define NFS4_FF_MIRROR_STAT_AVAIL (0) + struct nfs4_ff_layout_segment { struct pnfs_layout_segment generic_hdr; u64 stripe_unit; @@ -103,6 +108,14 @@ struct nfs4_flexfile_layout { ktime_t last_report_time; /* Layoutstat report times */ }; +struct nfs4_flexfile_layoutreturn_args { + struct list_head errors; + struct nfs42_layoutstat_devinfo devinfo[FF_LAYOUTSTATS_MAXDEV]; + unsigned int num_errors; + unsigned int num_dev; + struct page *pages[1]; +}; + static inline struct nfs4_flexfile_layout * FF_LAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo) { @@ -180,9 +193,12 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, struct nfs4_ff_layout_mirror *mirror, u64 offset, u64 length, int status, enum nfs_opnum4 opnum, gfp_t gfp_flags); -int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo, - struct xdr_stream *xdr, int *count, - const struct pnfs_layout_range *range); +int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head); +void ff_layout_free_ds_ioerr(struct list_head *head); +unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo, + const struct pnfs_layout_range *range, + struct list_head *head, + unsigned int maxnum); struct nfs_fh * nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx); @@ -197,7 +213,6 @@ nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, struct inode *inode); struct rpc_cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx, struct rpc_cred *mdscred); -bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg); bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg); bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg); diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index f7a3f6b05369..e5a6f248697b 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -20,9 +20,11 @@ static unsigned int dataserver_timeo = NFS_DEF_TCP_RETRANS; static unsigned int dataserver_retrans; +static bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg); + void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds) { - if (mirror_ds) + if (!IS_ERR_OR_NULL(mirror_ds)) nfs4_put_deviceid_node(&mirror_ds->id_node); } @@ -175,19 +177,36 @@ out_err: static void ff_layout_mark_devid_invalid(struct pnfs_layout_segment *lseg, struct nfs4_deviceid_node *devid) { - nfs4_mark_deviceid_unavailable(devid); + nfs4_delete_deviceid(devid->ld, devid->nfs_client, &devid->deviceid); if (!ff_layout_has_available_ds(lseg)) pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg); } static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg, - struct nfs4_ff_layout_mirror *mirror) + struct nfs4_ff_layout_mirror *mirror, + bool create) { - if (mirror == NULL || mirror->mirror_ds == NULL) { - pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, - lseg); - return false; + if (mirror == NULL || IS_ERR(mirror->mirror_ds)) + goto outerr; + if (mirror->mirror_ds == NULL) { + if (create) { + struct nfs4_deviceid_node *node; + struct pnfs_layout_hdr *lh = lseg->pls_layout; + struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV); + + node = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode), + &mirror->devid, lh->plh_lc_cred, + GFP_KERNEL); + if (node) + 
mirror_ds = FF_LAYOUT_MIRROR_DS(node); + + /* check for race with another call to this function */ + if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) && + mirror_ds != ERR_PTR(-ENODEV)) + nfs4_put_deviceid_node(node); + } else + goto outerr; } if (mirror->mirror_ds->ds == NULL) { struct nfs4_deviceid_node *devid; @@ -196,15 +215,9 @@ static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg, return false; } return true; -} - -static u64 -end_offset(u64 start, u64 len) -{ - u64 end; - - end = start + len; - return end >= start ? end : NFS4_MAX_UINT64; +outerr: + pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg); + return false; } static void extend_ds_error(struct nfs4_ff_layout_ds_err *err, @@ -212,8 +225,8 @@ static void extend_ds_error(struct nfs4_ff_layout_ds_err *err, { u64 end; - end = max_t(u64, end_offset(err->offset, err->length), - end_offset(offset, length)); + end = max_t(u64, pnfs_end_offset(err->offset, err->length), + pnfs_end_offset(offset, length)); err->offset = min_t(u64, err->offset, offset); err->length = end - err->offset; } @@ -235,9 +248,9 @@ ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1, ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid)); if (ret != 0) return ret; - if (end_offset(e1->offset, e1->length) < e2->offset) + if (pnfs_end_offset(e1->offset, e1->length) < e2->offset) return -1; - if (e1->offset > end_offset(e2->offset, e2->length)) + if (e1->offset > pnfs_end_offset(e2->offset, e2->length)) return 1; /* If ranges overlap or are contiguous, they are the same */ return 0; @@ -263,8 +276,9 @@ ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo, } /* Entries match, so merge "err" into "dserr" */ extend_ds_error(dserr, err->offset, err->length); - list_del(&err->list); + list_replace(&err->list, &dserr->list); kfree(err); + return; } list_add_tail(&dserr->list, head); @@ -331,7 +345,7 @@ nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx) struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx); struct nfs_fh *fh = NULL; - if (!ff_layout_mirror_valid(lseg, mirror)) { + if (!ff_layout_mirror_valid(lseg, mirror, false)) { pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n", __func__, mirror_idx); goto out; @@ -371,7 +385,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, struct nfs_server *s = NFS_SERVER(ino); unsigned int max_payload; - if (!ff_layout_mirror_valid(lseg, mirror)) { + if (!ff_layout_mirror_valid(lseg, mirror, true)) { pr_err_ratelimited("NFS: %s: No data server for offset index %d\n", __func__, ds_idx); goto out; @@ -393,8 +407,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo, dataserver_retrans, mirror->mirror_ds->ds_versions[0].version, - mirror->mirror_ds->ds_versions[0].minor_version, - RPC_AUTH_UNIX); + mirror->mirror_ds->ds_versions[0].minor_version); /* connect success, check rsize/wsize limit */ if (ds->ds_clp) { @@ -457,28 +470,26 @@ nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, u32 ds_idx, } } -static bool is_range_intersecting(u64 offset1, u64 length1, - u64 offset2, u64 length2) +void ff_layout_free_ds_ioerr(struct list_head *head) { - u64 end1 = end_offset(offset1, length1); - u64 end2 = end_offset(offset2, length2); + struct nfs4_ff_layout_ds_err *err; - return (end1 == NFS4_MAX_UINT64 || end1 > offset2) && - (end2 == NFS4_MAX_UINT64 || end2 > offset1); + while (!list_empty(head)) { 
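The cmpxchg() above is a lazy, lock-free one-time initialisation: mirror->mirror_ds starts out NULL, the first user resolves the device ID and tries to install the result, and a racing loser drops its extra reference and uses the winner's pointer (the kernel version additionally parks ERR_PTR(-ENODEV) in the slot when the lookup fails). A C11 analogue with invented names:

#include <stdatomic.h>
#include <stdlib.h>

struct ds { int dummy; };

static struct ds *lookup_ds(void) { return malloc(sizeof(struct ds)); } /* stub: may sleep */
static void put_ds(struct ds *d)  { free(d); }                          /* stub: drop ref  */

static struct ds *get_mirror_ds(_Atomic(struct ds *) *slot)
{
    struct ds *cur = atomic_load(slot);

    if (cur)
        return cur;                  /* already initialised */
    cur = lookup_ds();
    struct ds *expected = NULL;
    if (!atomic_compare_exchange_strong(slot, &expected, cur)) {
        put_ds(cur);                 /* lost the race: keep the winner */
        cur = expected;              /* CAS wrote the winner back      */
    }
    return cur;
}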
+ err = list_first_entry(head, + struct nfs4_ff_layout_ds_err, + list); + list_del(&err->list); + kfree(err); + } } /* called with inode i_lock held */ -int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo, - struct xdr_stream *xdr, int *count, - const struct pnfs_layout_range *range) +int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head) { - struct nfs4_ff_layout_ds_err *err, *n; + struct nfs4_ff_layout_ds_err *err; __be32 *p; - list_for_each_entry_safe(err, n, &flo->error_list, list) { - if (!is_range_intersecting(err->offset, err->length, - range->offset, range->length)) - continue; + list_for_each_entry(err, head, list) { /* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE) * + array length + deviceid(NFS4_DEVICEID4_SIZE) * + status(4) + opnum(4) @@ -497,17 +508,59 @@ int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo, NFS4_DEVICEID4_SIZE); *p++ = cpu_to_be32(err->status); *p++ = cpu_to_be32(err->opnum); - *count += 1; - list_del(&err->list); - dprintk("%s: offset %llu length %llu status %d op %d count %d\n", + dprintk("%s: offset %llu length %llu status %d op %d\n", __func__, err->offset, err->length, err->status, - err->opnum, *count); - kfree(err); + err->opnum); } return 0; } +static +unsigned int do_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo, + const struct pnfs_layout_range *range, + struct list_head *head, + unsigned int maxnum) +{ + struct nfs4_flexfile_layout *flo = FF_LAYOUT_FROM_HDR(lo); + struct inode *inode = lo->plh_inode; + struct nfs4_ff_layout_ds_err *err, *n; + unsigned int ret = 0; + + spin_lock(&inode->i_lock); + list_for_each_entry_safe(err, n, &flo->error_list, list) { + if (!pnfs_is_range_intersecting(err->offset, + pnfs_end_offset(err->offset, err->length), + range->offset, + pnfs_end_offset(range->offset, range->length))) + continue; + if (!maxnum) + break; + list_move(&err->list, head); + maxnum--; + ret++; + } + spin_unlock(&inode->i_lock); + return ret; +} + +unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo, + const struct pnfs_layout_range *range, + struct list_head *head, + unsigned int maxnum) +{ + unsigned int ret; + + ret = do_layout_fetch_ds_ioerr(lo, range, head, maxnum); + /* If we're over the max, discard all remaining entries */ + if (ret == maxnum) { + LIST_HEAD(discard); + do_layout_fetch_ds_ioerr(lo, range, &discard, -1); + ff_layout_free_ds_ioerr(&discard); + } + return ret; +} + static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg) { struct nfs4_ff_layout_mirror *mirror; @@ -516,7 +569,11 @@ static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg) for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { mirror = FF_LAYOUT_COMP(lseg, idx); - if (mirror && mirror->mirror_ds) { + if (mirror) { + if (!mirror->mirror_ds) + return true; + if (IS_ERR(mirror->mirror_ds)) + continue; devid = &mirror->mirror_ds->id_node; if (!ff_layout_test_devid_unavailable(devid)) return true; @@ -534,8 +591,10 @@ static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg) for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { mirror = FF_LAYOUT_COMP(lseg, idx); - if (!mirror || !mirror->mirror_ds) + if (!mirror || IS_ERR(mirror->mirror_ds)) return false; + if (!mirror->mirror_ds) + continue; devid = &mirror->mirror_ds->id_node; if (ff_layout_test_devid_unavailable(devid)) return false; @@ -544,7 +603,7 @@ static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg) return FF_LAYOUT_MIRROR_COUNT(lseg) != 0; } 
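ff_layout_fetch_ds_ioerr() above harvests at most maxnum error records off the layout's list with list_move() under i_lock, and when the cap is hit a second pass discards the remainder so stale errors are not re-sent later. A userspace sketch of the bounded harvest, using a singly linked list and invented types:

struct err_rec { struct err_rec *next; };

/* Detach up to 'max' entries from *src onto *dst; returns how many moved.
 * When the return value equals 'max', the caller discards what is left. */
static unsigned int fetch_errors(struct err_rec **src, struct err_rec **dst,
                                 unsigned int max)
{
    unsigned int n = 0;

    while (*src && n < max) {
        struct err_rec *e = *src;
        *src = e->next;        /* unlink from the layout's list */
        e->next = *dst;        /* push onto the caller's list   */
        *dst = e;
        n++;
    }
    return n;
}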
-bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg) +static bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg) { if (lseg->pls_range.iomode == IOMODE_READ) return ff_read_layout_has_available_ds(lseg); diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index a608ffd28acc..391dafaf9182 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -30,7 +30,7 @@ #include <linux/namei.h> #include <linux/security.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index bf4ec5ecc97e..5ca4d96b1942 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -39,7 +39,7 @@ #include <linux/compat.h> #include <linux/freezer.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "nfs4_fs.h" #include "callback.h" @@ -160,6 +160,43 @@ int nfs_sync_mapping(struct address_space *mapping) return ret; } +static int nfs_attribute_timeout(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); +} + +static bool nfs_check_cache_invalid_delegated(struct inode *inode, unsigned long flags) +{ + unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); + + /* Special case for the pagecache or access cache */ + if (flags == NFS_INO_REVAL_PAGECACHE && + !(cache_validity & NFS_INO_REVAL_FORCED)) + return false; + return (cache_validity & flags) != 0; +} + +static bool nfs_check_cache_invalid_not_delegated(struct inode *inode, unsigned long flags) +{ + unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); + + if ((cache_validity & flags) != 0) + return true; + if (nfs_attribute_timeout(inode)) + return true; + return false; +} + +bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags) +{ + if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) + return nfs_check_cache_invalid_delegated(inode, flags); + + return nfs_check_cache_invalid_not_delegated(inode, flags); +} + static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) { struct nfs_inode *nfsi = NFS_I(inode); @@ -634,15 +671,28 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, } EXPORT_SYMBOL_GPL(nfs_setattr_update_inode); -static void nfs_request_parent_use_readdirplus(struct dentry *dentry) +static void nfs_readdirplus_parent_cache_miss(struct dentry *dentry) { struct dentry *parent; + if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS)) + return; parent = dget_parent(dentry); nfs_force_use_readdirplus(d_inode(parent)); dput(parent); } +static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry) +{ + struct dentry *parent; + + if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS)) + return; + parent = dget_parent(dentry); + nfs_advise_use_readdirplus(d_inode(parent)); + dput(parent); +} + static bool nfs_need_revalidate_inode(struct inode *inode) { if (NFS_I(inode)->cache_validity & @@ -683,10 +733,10 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) if (need_atime || nfs_need_revalidate_inode(inode)) { struct nfs_server *server = NFS_SERVER(inode); - if (server->caps & NFS_CAP_READDIRPLUS) - nfs_request_parent_use_readdirplus(dentry); + nfs_readdirplus_parent_cache_miss(dentry); err = __nfs_revalidate_inode(server, inode); - } + } else + nfs_readdirplus_parent_cache_hit(dentry); if (!err) { generic_fillattr(inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); @@ -702,8 +752,7 @@ 
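The new nfs_check_cache_invalid() helpers above snapshot cache_validity once with READ_ONCE() and then branch on delegation state: with a read delegation only explicitly set flags count (and NFS_INO_REVAL_PAGECACHE only together with NFS_INO_REVAL_FORCED), while without a delegation an attribute-cache timeout also invalidates. A simplified sketch that keeps just the delegation split (invented names; the pagecache special case is omitted):

#include <stdatomic.h>
#include <stdbool.h>

struct inode_validity {
    _Atomic unsigned long cache_validity;
    bool delegated;        /* holds an NFSv4 read delegation */
    bool attr_timed_out;   /* nfs_attribute_timeout()        */
};

static bool cache_invalid(struct inode_validity *i, unsigned long flags)
{
    /* single snapshot, like READ_ONCE(nfsi->cache_validity) */
    unsigned long v = atomic_load(&i->cache_validity);

    if (i->delegated)
        return (v & flags) != 0;              /* timeouts are ignored */
    return (v & flags) != 0 || i->attr_timed_out;
}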
EXPORT_SYMBOL_GPL(nfs_getattr); static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) { atomic_set(&l_ctx->count, 1); - l_ctx->lockowner.l_owner = current->files; - l_ctx->lockowner.l_pid = current->tgid; + l_ctx->lockowner = current->files; INIT_LIST_HEAD(&l_ctx->list); atomic_set(&l_ctx->io_count, 0); } @@ -714,9 +763,7 @@ static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context struct nfs_lock_context *pos = head; do { - if (pos->lockowner.l_owner != current->files) - continue; - if (pos->lockowner.l_pid != current->tgid) + if (pos->lockowner != current->files) continue; atomic_inc(&pos->count); return pos; @@ -785,6 +832,8 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync) if (!is_sync) return; inode = d_inode(ctx->dentry); + if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) + return; nfsi = NFS_I(inode); if (inode->i_mapping->nrpages == 0) return; @@ -799,7 +848,9 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync) } EXPORT_SYMBOL_GPL(nfs_close_context); -struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode) +struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, + fmode_t f_mode, + struct file *filp) { struct nfs_open_context *ctx; struct rpc_cred *cred = rpc_lookup_cred(); @@ -818,6 +869,7 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f ctx->mode = f_mode; ctx->flags = 0; ctx->error = 0; + ctx->flock_owner = (fl_owner_t)filp; nfs_init_lock_context(&ctx->lock_context); ctx->lock_context.open_context = ctx; INIT_LIST_HEAD(&ctx->list); @@ -942,7 +994,7 @@ int nfs_open(struct inode *inode, struct file *filp) { struct nfs_open_context *ctx; - ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode); + ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode, filp); if (IS_ERR(ctx)) return PTR_ERR(ctx); nfs_file_set_open_context(filp, ctx); @@ -1031,13 +1083,6 @@ out: return status; } -int nfs_attribute_timeout(struct inode *inode) -{ - struct nfs_inode *nfsi = NFS_I(inode); - - return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); -} - int nfs_attribute_cache_expired(struct inode *inode) { if (nfs_have_delegated_attributes(inode)) @@ -1060,15 +1105,6 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) } EXPORT_SYMBOL_GPL(nfs_revalidate_inode); -int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *inode) -{ - if (!(NFS_I(inode)->cache_validity & - (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) - && !nfs_attribute_cache_expired(inode)) - return NFS_STALE(inode) ? 
-ESTALE : 0; - return -ECHILD; -} - static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) { struct nfs_inode *nfsi = NFS_I(inode); @@ -1099,13 +1135,10 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map return 0; } -static bool nfs_mapping_need_revalidate_inode(struct inode *inode) +bool nfs_mapping_need_revalidate_inode(struct inode *inode) { - if (nfs_have_delegated_attributes(inode)) - return false; - return (NFS_I(inode)->cache_validity & NFS_INO_REVAL_PAGECACHE) - || nfs_attribute_timeout(inode) - || NFS_STALE(inode); + return nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE) || + NFS_STALE(inode); } int nfs_revalidate_mapping_rcu(struct inode *inode) @@ -1317,7 +1350,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat invalid |= NFS_INO_INVALID_ATIME; if (invalid != 0) - nfs_set_cache_invalid(inode, invalid); + nfs_set_cache_invalid(inode, invalid | NFS_INO_REVAL_FORCED); nfsi->read_cache_jiffies = fattr->time_start; return 0; @@ -1517,13 +1550,6 @@ static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr { unsigned long invalid = NFS_INO_INVALID_ATTR; - /* - * Don't revalidate the pagecache if we hold a delegation, but do - * force an attribute update - */ - if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) - invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_FORCED; - if (S_ISDIR(inode->i_mode)) invalid |= NFS_INO_INVALID_DATA; nfs_set_cache_invalid(inode, invalid); @@ -2015,7 +2041,7 @@ static void nfsiod_stop(void) destroy_workqueue(wq); } -int nfs_net_id; +unsigned int nfs_net_id; EXPORT_SYMBOL_GPL(nfs_net_id); static int nfs_net_init(struct net *net) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 80bcc0befb07..09ca5095c04e 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -154,8 +154,7 @@ extern const struct rpc_program nfs_program; extern void nfs_clients_init(struct net *net); extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *); int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t); -struct nfs_client *nfs_get_client(const struct nfs_client_initdata *, - rpc_authflavor_t); +struct nfs_client *nfs_get_client(const struct nfs_client_initdata *); int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *); void nfs_server_insert_lists(struct nfs_server *); void nfs_server_remove_lists(struct nfs_server *); @@ -194,14 +193,13 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, - u32 minor_version, - rpc_authflavor_t au_flavor); + u32 minor_version); extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, struct inode *); extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, - unsigned int ds_retrans, rpc_authflavor_t au_flavor); + unsigned int ds_retrans); #ifdef CONFIG_PROC_FS extern int __init nfs_fs_proc_init(void); extern void nfs_fs_proc_exit(void); @@ -346,6 +344,7 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp, const struct nfs_client_initdata *); /* dir.c */ +extern void nfs_advise_use_readdirplus(struct inode *dir); extern void nfs_force_use_readdirplus(struct inode *dir); extern unsigned long nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc); @@ -382,6 +381,7 
@@ extern int nfs_drop_inode(struct inode *); extern void nfs_clear_inode(struct inode *); extern void nfs_evict_inode(struct inode *); void nfs_zap_acl_cache(struct inode *inode); +extern bool nfs_check_cache_invalid(struct inode *, unsigned long); extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); extern int nfs_wait_atomic_killable(atomic_t *p); diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h index fbce0d885d4c..5fbd2bde91ba 100644 --- a/fs/nfs/netns.h +++ b/fs/nfs/netns.h @@ -35,6 +35,6 @@ struct nfs_net { #endif }; -extern int nfs_net_id; +extern unsigned int nfs_net_id; #endif diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index ee753547fb0a..7879f2a0fcfd 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -78,8 +78,7 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source, */ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, - int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, - rpc_authflavor_t au_flavor) + int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans) { struct rpc_timeout ds_timeout; struct nfs_client *mds_clp = mds_srv->nfs_client; @@ -106,7 +105,7 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, /* Use the MDS nfs_client cl_ipaddr. */ nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); - clp = nfs_get_client(&cl_init, au_flavor); + clp = nfs_get_client(&cl_init); return clp; } diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 608501971fe0..d12ff9385f49 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -397,10 +397,13 @@ static void nfs42_layoutstat_release(void *calldata) { struct nfs42_layoutstat_data *data = calldata; - struct nfs_server *nfss = NFS_SERVER(data->args.inode); + struct nfs42_layoutstat_devinfo *devinfo = data->args.devinfo; + int i; - if (nfss->pnfs_curr_ld->cleanup_layoutstats) - nfss->pnfs_curr_ld->cleanup_layoutstats(data); + for (i = 0; i < data->args.num_dev; i++) { + if (devinfo[i].ld_private.ops && devinfo[i].ld_private.ops->free) + devinfo[i].ld_private.ops->free(&devinfo[i].ld_private); + } pnfs_put_layout_hdr(NFS_I(data->args.inode)->layout); smp_mb__before_atomic(); diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 8b2605882a20..6c7296454bbc 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -181,8 +181,9 @@ static void encode_layoutstats(struct xdr_stream *xdr, NFS4_DEVICEID4_SIZE); /* Encode layoutupdate4 */ *p++ = cpu_to_be32(devinfo->layout_type); - if (devinfo->layoutstats_encode != NULL) - devinfo->layoutstats_encode(xdr, args, devinfo); + if (devinfo->ld_private.ops) + devinfo->ld_private.ops->encode(xdr, args, + &devinfo->ld_private); else encode_uint32(xdr, 0); } diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 1452177c822d..665165833660 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -457,7 +457,7 @@ extern void nfs41_handle_server_scope(struct nfs_client *, extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t, - const struct nfs_lockowner *, nfs4_stateid *, + const struct nfs_lock_context *, nfs4_stateid *, struct rpc_cred **); extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 074ac7131459..5ae9d64ea08b 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -464,6 
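With the ->cleanup_layoutstats callback gone, nfs42_layoutstat_release() above walks each devinfo's ld_private and calls its optional ->free hook; the flexfiles layoutstat_ops shown earlier plugs into exactly this. A generic sketch of iterating an optional per-entry vtable:

#include <stddef.h>

struct opaque_ops  { void (*free)(void *data); };
struct opaque_data { const struct opaque_ops *ops; void *data; };

static void release_all(struct opaque_data *priv, size_t n)
{
    for (size_t i = 0; i < n; i++) {
        if (priv[i].ops && priv[i].ops->free)   /* hook is optional */
            priv[i].ops->free(priv[i].data);
    }
}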
+464,11 @@ static bool nfs4_match_client_owner_id(const struct nfs_client *clp1, return strcmp(clp1->cl_owner_id, clp2->cl_owner_id) == 0; } +static bool nfs4_same_verifier(nfs4_verifier *v1, nfs4_verifier *v2) +{ + return memcmp(v1->data, v2->data, sizeof(v1->data)) == 0; +} + /** * nfs40_walk_client_list - Find server that recognizes a client ID * @@ -521,7 +526,21 @@ int nfs40_walk_client_list(struct nfs_client *new, if (!nfs4_match_client_owner_id(pos, new)) continue; - + /* + * We just sent a new SETCLIENTID, which should have + * caused the server to return a new cl_confirm. So if + * cl_confirm is the same, then this is a different + * server that just returned the same cl_confirm by + * coincidence: + */ + if ((new != pos) && nfs4_same_verifier(&pos->cl_confirm, + &new->cl_confirm)) + continue; + /* + * But if the cl_confirm's are different, then the only + * way that a SETCLIENTID_CONFIRM to pos can succeed is + * if new and pos point to the same server: + */ atomic_inc(&pos->cl_count); spin_unlock(&nn->nfs_client_lock); @@ -534,6 +553,7 @@ int nfs40_walk_client_list(struct nfs_client *new, break; case 0: nfs4_swap_callback_idents(pos, new); + pos->cl_confirm = new->cl_confirm; prev = NULL; *result = pos; @@ -881,7 +901,6 @@ static int nfs4_set_client(struct nfs_server *server, const struct sockaddr *addr, const size_t addrlen, const char *ip_addr, - rpc_authflavor_t authflavour, int proto, const struct rpc_timeout *timeparms, u32 minorversion, struct net *net) { @@ -907,7 +926,7 @@ static int nfs4_set_client(struct nfs_server *server, set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init, authflavour); + clp = nfs_get_client(&cl_init); if (IS_ERR(clp)) { error = PTR_ERR(clp); goto error; @@ -948,7 +967,7 @@ error: struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, - u32 minor_version, rpc_authflavor_t au_flavor) + u32 minor_version) { struct rpc_timeout ds_timeout; struct nfs_client *mds_clp = mds_srv->nfs_client; @@ -979,7 +998,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, * (section 13.1 RFC 5661). 
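
For readers tracking the trunking logic: the skip added to nfs40_walk_client_list() above keys off SETCLIENTID confirm verifiers. A minimal stand-alone sketch of that comparison, using a stand-in nfs4_verifier type (the 8-byte size is an assumption carried over from the RFC's opaque verifier):

#include <stdbool.h>
#include <string.h>

#define NFS4_VERIFIER_SIZE 8            /* assumption: 8-byte opaque verifier */

typedef struct {
        unsigned char data[NFS4_VERIFIER_SIZE];
} nfs4_verifier;

/*
 * A client that just sent a fresh SETCLIENTID expects a fresh cl_confirm
 * back; if a cached nfs_client still holds the *same* verifier, it must
 * belong to a different server that collided by coincidence, so the
 * list walk skips it.
 */
static bool nfs4_same_verifier(const nfs4_verifier *v1, const nfs4_verifier *v2)
{
        return memcmp(v1->data, v2->data, sizeof(v1->data)) == 0;
}
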
*/ nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); - clp = nfs_get_client(&cl_init, au_flavor); + clp = nfs_get_client(&cl_init); dprintk("<-- %s %p\n", __func__, clp); return clp; @@ -1103,7 +1122,6 @@ static int nfs4_init_server(struct nfs_server *server, (const struct sockaddr *)&data->nfs_server.address, data->nfs_server.addrlen, data->client_address, - data->selected_flavor, data->nfs_server.protocol, &timeparms, data->minorversion, @@ -1200,7 +1218,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, data->addr, data->addrlen, parent_client->cl_ipaddr, - data->authflavor, rpc_protocol(parent_server->client), parent_server->client->cl_timeout, parent_client->cl_mvops->minor_version, @@ -1311,7 +1328,6 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, nfs_server_remove_lists(server); error = nfs4_set_client(server, hostname, sap, salen, buf, - clp->cl_rpcclient->cl_auth->au_flavor, clp->cl_proto, clnt->cl_timeout, clp->cl_minorversion, net); nfs_put_client(clp); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 89a77950e0b0..0efba77789b9 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -57,7 +57,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) parent = dget_parent(dentry); dir = d_inode(parent); - ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode); + ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode, filp); err = PTR_ERR(ctx); if (IS_ERR(ctx)) goto out; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 241da19b7da4..0a0eaecf9676 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -38,7 +38,6 @@ #include <linux/mm.h> #include <linux/delay.h> #include <linux/errno.h> -#include <linux/file.h> #include <linux/string.h> #include <linux/ratelimit.h> #include <linux/printk.h> @@ -94,7 +93,7 @@ static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fa static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label); static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, - struct nfs4_state *state, struct nfs4_label *ilabel, + struct nfs_open_context *ctx, struct nfs4_label *ilabel, struct nfs4_label *olabel); #ifdef CONFIG_NFS_V4_1 static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *, @@ -226,7 +225,6 @@ static const u32 nfs4_pnfs_open_bitmap[3] = { static const u32 nfs4_open_noattr_bitmap[3] = { FATTR4_WORD0_TYPE - | FATTR4_WORD0_CHANGE | FATTR4_WORD0_FILEID, }; @@ -817,6 +815,10 @@ static int nfs41_sequence_process(struct rpc_task *task, case -NFS4ERR_SEQ_FALSE_RETRY: ++slot->seq_nr; goto retry_nowait; + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_BADSESSION: + nfs4_schedule_session_recovery(session, res->sr_status); + goto retry_nowait; default: /* Just update the slot sequence no. 
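
The new -NFS4ERR_DEADSESSION/-NFS4ERR_BADSESSION arm in nfs41_sequence_process() above turns a hard failure into schedule-recovery-and-retry. A condensed sketch of that decision, with stand-in error values and a stub in place of nfs4_schedule_session_recovery():

#include <stdbool.h>

/* Stand-in error values; the real ones live in include/linux/nfs4.h. */
enum { NFS4ERR_SEQ_FALSE_RETRY = 1, NFS4ERR_BADSESSION, NFS4ERR_DEADSESSION };

struct nfs4_session;                    /* opaque stand-in */

static void schedule_session_recovery(struct nfs4_session *s, int err)
{
        (void)s;                        /* stub: the kernel wakes the */
        (void)err;                      /* state manager thread here  */
}

/*
 * Condensed form of the new branches: a dead or bad session no longer
 * marks the slot done; recovery is scheduled and the RPC retried
 * without waiting.
 */
static bool sequence_should_retry(struct nfs4_session *s, int status, int *seq_nr)
{
        switch (status) {
        case -NFS4ERR_SEQ_FALSE_RETRY:
                ++*seq_nr;              /* bump and retry immediately */
                return true;
        case -NFS4ERR_DEADSESSION:
        case -NFS4ERR_BADSESSION:
                schedule_session_recovery(s, status);
                return true;            /* retry once recovery has run */
        default:
                return false;           /* slot sequence is done */
        }
}
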
*/ slot->seq_done = 1; @@ -1080,15 +1082,24 @@ int nfs4_call_sync(struct rpc_clnt *clnt, return nfs4_call_sync_sequence(clnt, server, msg, args, res); } -static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) +static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo, + unsigned long timestamp) { struct nfs_inode *nfsi = NFS_I(dir); spin_lock(&dir->i_lock); nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; - if (!cinfo->atomic || cinfo->before != dir->i_version) + if (cinfo->atomic && cinfo->before == dir->i_version) { + nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE; + nfsi->attrtimeo_timestamp = jiffies; + } else { nfs_force_lookup_revalidate(dir); + if (cinfo->before != dir->i_version) + nfsi->cache_validity |= NFS_INO_INVALID_ACCESS | + NFS_INO_INVALID_ACL; + } dir->i_version = cinfo->after; + nfsi->read_cache_jiffies = timestamp; nfsi->attr_gencount = nfs_inc_attr_generation_counter(); nfs_fscache_invalidate(dir); spin_unlock(&dir->i_lock); @@ -1221,6 +1232,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, atomic_inc(&sp->so_count); p->o_arg.open_flags = flags; p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE); + p->o_arg.umask = current_umask(); + p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim); p->o_arg.share_access = nfs4_map_atomic_open_share(server, fmode, flags); /* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS @@ -1228,8 +1241,16 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, if (!(flags & O_EXCL)) { /* ask server to check for all possible rights as results * are cached */ - p->o_arg.access = NFS4_ACCESS_READ | NFS4_ACCESS_MODIFY | - NFS4_ACCESS_EXTEND | NFS4_ACCESS_EXECUTE; + switch (p->o_arg.claim) { + default: + break; + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_FH: + p->o_arg.access = NFS4_ACCESS_READ | + NFS4_ACCESS_MODIFY | + NFS4_ACCESS_EXTEND | + NFS4_ACCESS_EXECUTE; + } } p->o_arg.clientid = server->nfs_client->cl_clientid; p->o_arg.id.create_time = ktime_to_ns(sp->so_seqid.create_time); @@ -1239,7 +1260,6 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.bitmask = nfs4_bitmask(server, label); p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0]; p->o_arg.label = nfs4_label_copy(p->a_label, label); - p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim); switch (p->o_arg.claim) { case NFS4_OPEN_CLAIM_NULL: case NFS4_OPEN_CLAIM_DELEGATE_CUR: @@ -2372,11 +2392,13 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) nfs_fattr_map_and_free_names(server, &data->f_attr); if (o_arg->open_flags & O_CREAT) { - update_changeattr(dir, &o_res->cinfo); if (o_arg->open_flags & O_EXCL) data->file_created = 1; else if (o_res->cinfo.before != o_res->cinfo.after) data->file_created = 1; + if (data->file_created || dir->i_version != o_res->cinfo.after) + update_changeattr(dir, &o_res->cinfo, + o_res->f_attr->time_start); } if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0) server->caps &= ~NFS_CAP_POSIX_LOCK; @@ -2678,7 +2700,8 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, sattr->ia_valid |= ATTR_MTIME; /* Except MODE, it seems harmless of setting twice. 
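
The reworked update_changeattr() above is the heart of this group of hunks: an atomic change whose "before" value matches the cached change attribute leaves the pagecache trusted, anything else forces revalidation. A self-contained sketch of that policy, with simplified stand-ins for the kernel structures:

#include <stdbool.h>

/* Simplified stand-ins for struct nfs4_change_info and the inode state. */
struct change_info { bool atomic; unsigned long long before, after; };
struct dir_state {
        unsigned long long i_version;   /* cached change attribute          */
        bool pagecache_valid;           /* ~NFS_INO_REVAL_PAGECACHE         */
        bool access_acl_valid;          /* ~NFS_INO_INVALID_ACCESS/ACL      */
};

static void update_changeattr(struct dir_state *dir, const struct change_info *c)
{
        if (c->atomic && c->before == dir->i_version) {
                dir->pagecache_valid = true;    /* no revalidation needed   */
        } else {
                dir->pagecache_valid = false;   /* force lookup revalidate  */
                if (c->before != dir->i_version)
                        dir->access_acl_valid = false;  /* we missed changes */
        }
        dir->i_version = c->after;              /* adopt the new value      */
}
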
*/ - if ((attrset[1] & FATTR4_WORD1_MODE)) + if (opendata->o_arg.createmode != NFS4_CREATE_EXCLUSIVE && + attrset[1] & FATTR4_WORD1_MODE) sattr->ia_valid &= ~ATTR_MODE; if (attrset[2] & FATTR4_WORD2_SECURITY_LABEL) @@ -2819,7 +2842,7 @@ static int _nfs4_do_open(struct inode *dir, nfs_fattr_init(opendata->o_res.f_attr); status = nfs4_do_setattr(state->inode, cred, opendata->o_res.f_attr, sattr, - state, label, olabel); + ctx, label, olabel); if (status == 0) { nfs_setattr_update_inode(state->inode, sattr, opendata->o_res.f_attr); @@ -2914,7 +2937,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_setattrargs *arg, struct nfs_setattrres *res, struct rpc_cred *cred, - struct nfs4_state *state) + struct nfs_open_context *ctx) { struct nfs_server *server = NFS_SERVER(inode); struct rpc_message msg = { @@ -2937,15 +2960,17 @@ static int _nfs4_do_setattr(struct inode *inode, if (nfs4_copy_delegation_stateid(inode, fmode, &arg->stateid, &delegation_cred)) { /* Use that stateid */ - } else if (truncate && state != NULL) { - struct nfs_lockowner lockowner = { - .l_owner = current->files, - .l_pid = current->tgid, - }; - if (!nfs4_valid_open_stateid(state)) + } else if (truncate && ctx != NULL) { + struct nfs_lock_context *l_ctx; + if (!nfs4_valid_open_stateid(ctx->state)) return -EBADF; - if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner, - &arg->stateid, &delegation_cred) == -EIO) + l_ctx = nfs_get_lock_context(ctx); + if (IS_ERR(l_ctx)) + return PTR_ERR(l_ctx); + status = nfs4_select_rw_stateid(ctx->state, FMODE_WRITE, l_ctx, + &arg->stateid, &delegation_cred); + nfs_put_lock_context(l_ctx); + if (status == -EIO) return -EBADF; } else nfs4_stateid_copy(&arg->stateid, &zero_stateid); @@ -2955,7 +2980,7 @@ static int _nfs4_do_setattr(struct inode *inode, status = nfs4_call_sync(server->client, server, &msg, &arg->seq_args, &res->seq_res, 1); put_rpccred(delegation_cred); - if (status == 0 && state != NULL) + if (status == 0 && ctx != NULL) renew_lease(server, timestamp); trace_nfs4_setattr(inode, &arg->stateid, status); return status; @@ -2963,10 +2988,11 @@ static int _nfs4_do_setattr(struct inode *inode, static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, - struct nfs4_state *state, struct nfs4_label *ilabel, + struct nfs_open_context *ctx, struct nfs4_label *ilabel, struct nfs4_label *olabel) { struct nfs_server *server = NFS_SERVER(inode); + struct nfs4_state *state = ctx ? 
ctx->state : NULL; struct nfs_setattrargs arg = { .fh = NFS_FH(inode), .iap = sattr, @@ -2991,7 +3017,7 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, arg.bitmask = nfs4_bitmask(server, olabel); do { - err = _nfs4_do_setattr(inode, &arg, &res, cred, state); + err = _nfs4_do_setattr(inode, &arg, &res, cred, ctx); switch (err) { case -NFS4ERR_OPENMODE: if (!(sattr->ia_valid & ATTR_SIZE)) { @@ -3028,10 +3054,15 @@ struct nfs4_closedata { struct nfs4_state *state; struct nfs_closeargs arg; struct nfs_closeres res; + struct { + struct nfs4_layoutreturn_args arg; + struct nfs4_layoutreturn_res res; + struct nfs4_xdr_opaque_data ld_private; + u32 roc_barrier; + bool roc; + } lr; struct nfs_fattr fattr; unsigned long timestamp; - bool roc; - u32 roc_barrier; }; static void nfs4_free_closedata(void *data) @@ -3040,8 +3071,9 @@ static void nfs4_free_closedata(void *data) struct nfs4_state_owner *sp = calldata->state->owner; struct super_block *sb = calldata->state->inode->i_sb; - if (calldata->roc) - pnfs_roc_release(calldata->state->inode); + if (calldata->lr.roc) + pnfs_roc_release(&calldata->lr.arg, &calldata->lr.res, + calldata->res.lr_ret); nfs4_put_open_state(calldata->state); nfs_free_seqid(calldata->arg.seqid); nfs4_put_state_owner(sp); @@ -3060,17 +3092,50 @@ static void nfs4_close_done(struct rpc_task *task, void *data) if (!nfs4_sequence_done(task, &calldata->res.seq_res)) return; trace_nfs4_close(state, &calldata->arg, &calldata->res, task->tk_status); + + /* Handle Layoutreturn errors */ + if (calldata->arg.lr_args && task->tk_status != 0) { + switch (calldata->res.lr_ret) { + default: + calldata->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; + break; + case 0: + calldata->arg.lr_args = NULL; + calldata->res.lr_res = NULL; + break; + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_EXPIRED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_UNKNOWN_LAYOUTTYPE: + case -NFS4ERR_WRONG_CRED: + calldata->arg.lr_args = NULL; + calldata->res.lr_res = NULL; + calldata->res.lr_ret = 0; + rpc_restart_call_prepare(task); + return; + } + } + /* hmm. we are done with the inode, and in the process of freeing * the state_owner. 
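
The _nfs4_do_setattr() rework above swaps the hand-rolled nfs_lockowner for a real lock context when picking a stateid. Reduced to its data flow, and with a 16-byte stand-in stateid type, the preference order looks like this:

#include <string.h>

typedef struct { unsigned char data[16]; } stateid_t;   /* stand-in */

/*
 * Preference order in _nfs4_do_setattr(): a write delegation wins; a
 * truncate through an open file falls back to the open/lock stateid
 * picked via nfs4_select_rw_stateid(); everything else goes out with
 * the anonymous (all-zero) stateid.
 */
static void choose_setattr_stateid(stateid_t *dst, const stateid_t *deleg,
                                   const stateid_t *open_or_lock, int truncating)
{
        if (deleg)
                *dst = *deleg;                  /* delegation stateid */
        else if (truncating && open_or_lock)
                *dst = *open_or_lock;           /* open/lock stateid  */
        else
                memset(dst, 0, sizeof(*dst));   /* anonymous stateid  */
}
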
we keep this around to process errors */ switch (task->tk_status) { case 0: res_stateid = &calldata->res.stateid; - if (calldata->roc) - pnfs_roc_set_barrier(state->inode, - calldata->roc_barrier); renew_lease(server, calldata->timestamp); break; + case -NFS4ERR_ACCESS: + if (calldata->arg.bitmask != NULL) { + calldata->arg.bitmask = NULL; + calldata->res.fattr = NULL; + task->tk_status = 0; + rpc_restart_call_prepare(task); + goto out_release; + + } + break; case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: @@ -3096,7 +3161,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) res_stateid, calldata->arg.fmode); out_release: nfs_release_seqid(calldata->arg.seqid); - nfs_refresh_inode(calldata->inode, calldata->res.fattr); + nfs_refresh_inode(calldata->inode, &calldata->fattr); dprintk("%s: done, ret = %d!\n", __func__, task->tk_status); } @@ -3144,21 +3209,30 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) goto out_no_action; } - if (nfs4_wait_on_layoutreturn(inode, task)) { + if (!calldata->lr.roc && nfs4_wait_on_layoutreturn(inode, task)) { nfs_release_seqid(calldata->arg.seqid); goto out_wait; } if (calldata->arg.fmode == 0) task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; - if (calldata->roc) - pnfs_roc_get_barrier(inode, &calldata->roc_barrier); + + if (calldata->arg.fmode == 0 || calldata->arg.fmode == FMODE_READ) { + /* Close-to-open cache consistency revalidation */ + if (!nfs4_have_delegation(inode, FMODE_READ)) + calldata->arg.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; + else + calldata->arg.bitmask = NULL; + } calldata->arg.share_access = nfs4_map_atomic_open_share(NFS_SERVER(inode), calldata->arg.fmode, 0); - nfs_fattr_init(calldata->res.fattr); + if (calldata->res.fattr == NULL) + calldata->arg.bitmask = NULL; + else if (calldata->arg.bitmask == NULL) + calldata->res.fattr = NULL; calldata->timestamp = jiffies; if (nfs4_setup_sequence(NFS_SERVER(inode), &calldata->arg.seq_args, @@ -3179,13 +3253,6 @@ static const struct rpc_call_ops nfs4_close_ops = { .rpc_release = nfs4_free_closedata, }; -static bool nfs4_roc(struct inode *inode) -{ - if (!nfs_have_layout(inode)) - return false; - return pnfs_roc(inode); -} - /* * It is possible for data to be read/written from a mem-mapped file * after the sys_close call (which hits the vfs layer as a flush). 
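
The -NFS4ERR_ACCESS arm added to nfs4_close_done() above retries the compound without its optional GETATTR instead of failing the close. A sketch of just that branch; close_call is a simplified stand-in for nfs4_closedata, and the NFS4ERR_ACCESS value matches the RFC's:

#include <stdbool.h>
#include <stddef.h>

enum { NFS4ERR_ACCESS = 13 };

struct close_call {
        const unsigned int *bitmask;    /* mask for the piggybacked GETATTR */
        void *fattr;                    /* buffer for its results           */
        int status;                     /* task->tk_status analogue         */
};

/*
 * If the compound failed because of the attribute fetch bolted onto
 * CLOSE, drop that op and restart, rather than letting the close fail.
 */
static bool close_should_restart(struct close_call *c)
{
        if (c->status == -NFS4ERR_ACCESS && c->bitmask != NULL) {
                c->bitmask = NULL;      /* resend without GETATTR */
                c->fattr = NULL;
                c->status = 0;
                return true;            /* i.e. rpc_restart_call_prepare() */
        }
        return false;
}
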
@@ -3232,12 +3299,19 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) calldata->arg.seqid = alloc_seqid(&state->owner->so_seqid, gfp_mask); if (IS_ERR(calldata->arg.seqid)) goto out_free_calldata; + nfs_fattr_init(&calldata->fattr); calldata->arg.fmode = 0; - calldata->arg.bitmask = server->cache_consistency_bitmask; + calldata->lr.arg.ld_private = &calldata->lr.ld_private; calldata->res.fattr = &calldata->fattr; calldata->res.seqid = calldata->arg.seqid; calldata->res.server = server; - calldata->roc = nfs4_roc(state->inode); + calldata->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; + calldata->lr.roc = pnfs_roc(state->inode, + &calldata->lr.arg, &calldata->lr.res, msg.rpc_cred); + if (calldata->lr.roc) { + calldata->arg.lr_args = &calldata->lr.arg; + calldata->res.lr_res = &calldata->lr.res; + } nfs_sb_active(calldata->inode->i_sb); msg.rpc_argp = &calldata->arg; @@ -3290,7 +3364,7 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) #define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL) #define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL) -#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_SECURITY_LABEL - 1UL) +#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_MODE_UMASK - 1UL) static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) { @@ -3687,7 +3761,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, { struct inode *inode = d_inode(dentry); struct rpc_cred *cred = NULL; - struct nfs4_state *state = NULL; + struct nfs_open_context *ctx = NULL; struct nfs4_label *label = NULL; int status; @@ -3708,20 +3782,17 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, /* Search for an existing open(O_WRITE) file */ if (sattr->ia_valid & ATTR_FILE) { - struct nfs_open_context *ctx; ctx = nfs_file_open_context(sattr->ia_file); - if (ctx) { + if (ctx) cred = ctx->cred; - state = ctx->state; - } } label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL); if (IS_ERR(label)) return PTR_ERR(label); - status = nfs4_do_setattr(inode, cred, fattr, sattr, state, NULL, label); + status = nfs4_do_setattr(inode, cred, fattr, sattr, ctx, NULL, label); if (status == 0) { nfs_setattr_update_inode(inode, sattr, fattr); nfs_setsecurity(inode, fattr, label); @@ -3966,18 +4037,20 @@ static int nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, int flags) { + struct nfs_server *server = NFS_SERVER(dir); struct nfs4_label l, *ilabel = NULL; struct nfs_open_context *ctx; struct nfs4_state *state; int status = 0; - ctx = alloc_nfs_open_context(dentry, FMODE_READ); + ctx = alloc_nfs_open_context(dentry, FMODE_READ, NULL); if (IS_ERR(ctx)) return PTR_ERR(ctx); ilabel = nfs4_label_init_security(dir, dentry, sattr, &l); - sattr->ia_mode &= ~current_umask(); + if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) + sattr->ia_mode &= ~current_umask(); state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, NULL); if (IS_ERR(state)) { status = PTR_ERR(state); @@ -4004,11 +4077,12 @@ static int _nfs4_proc_remove(struct inode *dir, const struct qstr *name) .rpc_argp = &args, .rpc_resp = &res, }; + unsigned long timestamp = jiffies; int status; status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); if (status == 0) - update_changeattr(dir, &res.cinfo); + update_changeattr(dir, &res.cinfo, timestamp); return status; } @@ -4056,7 +4130,8 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) if 
(nfs4_async_handle_error(task, res->server, NULL, &data->timeout) == -EAGAIN) return 0; - update_changeattr(dir, &res->cinfo); + if (task->tk_status == 0) + update_changeattr(dir, &res->cinfo, res->dir_attr->time_start); return 1; } @@ -4090,8 +4165,11 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, if (nfs4_async_handle_error(task, res->server, NULL, &data->timeout) == -EAGAIN) return 0; - update_changeattr(old_dir, &res->old_cinfo); - update_changeattr(new_dir, &res->new_cinfo); + if (task->tk_status == 0) { + update_changeattr(old_dir, &res->old_cinfo, res->old_fattr->time_start); + if (new_dir != old_dir) + update_changeattr(new_dir, &res->new_cinfo, res->new_fattr->time_start); + } return 1; } @@ -4128,7 +4206,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (!status) { - update_changeattr(dir, &res.cinfo); + update_changeattr(dir, &res.cinfo, res.fattr->time_start); status = nfs_post_op_update_inode(inode, res.fattr); if (!status) nfs_setsecurity(inode, res.fattr, res.label); @@ -4185,6 +4263,7 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, data->arg.attrs = sattr; data->arg.ftype = ftype; data->arg.bitmask = nfs4_bitmask(server, data->label); + data->arg.umask = current_umask(); data->res.server = server; data->res.fh = &data->fh; data->res.fattr = &data->fattr; @@ -4202,7 +4281,8 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_ int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg, &data->arg.seq_args, &data->res.seq_res, 1); if (status == 0) { - update_changeattr(dir, &data->res.dir_cinfo); + update_changeattr(dir, &data->res.dir_cinfo, + data->res.fattr->time_start); status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, data->res.label); } return status; @@ -4282,13 +4362,15 @@ out: static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { + struct nfs_server *server = NFS_SERVER(dir); struct nfs4_exception exception = { }; struct nfs4_label l, *label = NULL; int err; label = nfs4_label_init_security(dir, dentry, sattr, &l); - sattr->ia_mode &= ~current_umask(); + if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) + sattr->ia_mode &= ~current_umask(); do { err = _nfs4_proc_mkdir(dir, dentry, sattr, label); trace_nfs4_mkdir(dir, &dentry->d_name, err); @@ -4391,13 +4473,15 @@ out: static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dev_t rdev) { + struct nfs_server *server = NFS_SERVER(dir); struct nfs4_exception exception = { }; struct nfs4_label l, *label = NULL; int err; label = nfs4_label_init_security(dir, dentry, sattr, &l); - sattr->ia_mode &= ~current_umask(); + if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) + sattr->ia_mode &= ~current_umask(); do { err = _nfs4_proc_mknod(dir, dentry, sattr, label, rdev); trace_nfs4_mknod(dir, &dentry->d_name, err); @@ -4541,11 +4625,7 @@ int nfs4_set_rw_stateid(nfs4_stateid *stateid, const struct nfs_lock_context *l_ctx, fmode_t fmode) { - const struct nfs_lockowner *lockowner = NULL; - - if (l_ctx != NULL) - lockowner = &l_ctx->lockowner; - return nfs4_select_rw_stateid(ctx->state, fmode, lockowner, stateid, NULL); + return nfs4_select_rw_stateid(ctx->state, fmode, l_ctx, stateid, NULL); } EXPORT_SYMBOL_GPL(nfs4_set_rw_stateid); @@ -5564,11 +5644,16 @@ struct nfs4_delegreturndata { struct 
nfs_fh fh; nfs4_stateid stateid; unsigned long timestamp; + struct { + struct nfs4_layoutreturn_args arg; + struct nfs4_layoutreturn_res res; + struct nfs4_xdr_opaque_data ld_private; + u32 roc_barrier; + bool roc; + } lr; struct nfs_fattr fattr; int rpc_status; struct inode *inode; - bool roc; - u32 roc_barrier; }; static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) @@ -5579,6 +5664,32 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) return; trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status); + + /* Handle Layoutreturn errors */ + if (data->args.lr_args && task->tk_status != 0) { + switch(data->res.lr_ret) { + default: + data->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; + break; + case 0: + data->args.lr_args = NULL; + data->res.lr_res = NULL; + break; + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_EXPIRED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_UNKNOWN_LAYOUTTYPE: + case -NFS4ERR_WRONG_CRED: + data->args.lr_args = NULL; + data->res.lr_res = NULL; + data->res.lr_ret = 0; + rpc_restart_call_prepare(task); + return; + } + } + switch (task->tk_status) { case 0: renew_lease(data->res.server, data->timestamp); @@ -5594,6 +5705,14 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) case -NFS4ERR_STALE_STATEID: task->tk_status = 0; break; + case -NFS4ERR_ACCESS: + if (data->args.bitmask) { + data->args.bitmask = NULL; + data->res.fattr = NULL; + task->tk_status = 0; + rpc_restart_call_prepare(task); + return; + } default: if (nfs4_async_handle_error(task, data->res.server, NULL, NULL) == -EAGAIN) { @@ -5602,8 +5721,6 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) } } data->rpc_status = task->tk_status; - if (data->roc && data->rpc_status == 0) - pnfs_roc_set_barrier(data->inode, data->roc_barrier); } static void nfs4_delegreturn_release(void *calldata) @@ -5612,8 +5729,10 @@ static void nfs4_delegreturn_release(void *calldata) struct inode *inode = data->inode; if (inode) { - if (data->roc) - pnfs_roc_release(inode); + if (data->lr.roc) + pnfs_roc_release(&data->lr.arg, &data->lr.res, + data->res.lr_ret); + nfs_post_op_update_inode_force_wcc(inode, &data->fattr); nfs_iput_and_deactive(inode); } kfree(calldata); @@ -5625,12 +5744,9 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) d_data = (struct nfs4_delegreturndata *)data; - if (nfs4_wait_on_layoutreturn(d_data->inode, task)) + if (!d_data->lr.roc && nfs4_wait_on_layoutreturn(d_data->inode, task)) return; - if (d_data->roc) - pnfs_roc_get_barrier(d_data->inode, &d_data->roc_barrier); - nfs4_setup_sequence(d_data->res.server, &d_data->args.seq_args, &d_data->res.seq_res, @@ -5676,12 +5792,22 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co nfs4_stateid_copy(&data->stateid, stateid); data->res.fattr = &data->fattr; data->res.server = server; + data->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; + data->lr.arg.ld_private = &data->lr.ld_private; nfs_fattr_init(data->res.fattr); data->timestamp = jiffies; data->rpc_status = 0; + data->lr.roc = pnfs_roc(inode, &data->lr.arg, &data->lr.res, cred); data->inode = nfs_igrab_and_active(inode); - if (data->inode) - data->roc = nfs4_roc(inode); + if (data->inode) { + if (data->lr.roc) { + data->args.lr_args = &data->lr.arg; + data->res.lr_res = &data->lr.res; + } + } else if (data->lr.roc) { + pnfs_roc_release(&data->lr.arg, &data->lr.res, 0); + data->lr.roc = false; + } 
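
The tail of _nfs4_proc_delegreturn() above wires a return-on-close layout into the DELEGRETURN compound only while the inode is still held. A stand-alone sketch of that wiring, with stub lr types and a function pointer standing in for pnfs_roc_release():

#include <stdbool.h>
#include <stddef.h>

struct lr_args { int unused; };         /* stand-ins for the kernel's   */
struct lr_res  { int unused; };         /* nfs4_layoutreturn_{args,res} */

struct delegreturn_data {
        struct lr_args *lr_args;        /* non-NULL: compound carries a */
        struct lr_res  *lr_res;         /*           LAYOUTRETURN op    */
        struct { struct lr_args arg; struct lr_res res; bool roc; } lr;
};

static void wire_layoutreturn(struct delegreturn_data *d, bool roc,
                              bool have_inode, void (*release_roc)(void))
{
        d->lr.roc = roc;
        if (have_inode && d->lr.roc) {
                d->lr_args = &d->lr.arg;        /* encode in compound     */
                d->lr_res  = &d->lr.res;
        } else if (d->lr.roc) {
                release_roc();          /* give back the roc reservation  */
                d->lr.roc = false;
        }
}
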
task_setup_data.callback_data = data; msg.rpc_argp = &data->args; @@ -5695,10 +5821,6 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co if (status != 0) goto out; status = data->rpc_status; - if (status == 0) - nfs_post_op_update_inode_force_wcc(inode, &data->fattr); - else - nfs_refresh_inode(inode, &data->fattr); out: rpc_put_task(task); return status; @@ -6015,7 +6137,6 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, p->server = server; atomic_inc(&lsp->ls_count); p->ctx = get_nfs_open_context(ctx); - get_file(fl->fl_file); memcpy(&p->fl, fl, sizeof(p->fl)); return p; out_free_seqid: @@ -6128,7 +6249,6 @@ static void nfs4_lock_release(void *calldata) nfs_free_seqid(data->arg.lock_seqid); nfs4_put_lock_state(data->lsp); put_nfs_open_context(data->ctx); - fput(data->fl.fl_file); kfree(data); dprintk("%s: done!\n", __func__); } @@ -8371,6 +8491,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, goto out; } + nfs4_sequence_free_slot(&lgp->res.seq_res); err = nfs4_handle_exception(server, nfs4err, exception); if (!status) { if (exception->retry) @@ -8559,21 +8680,13 @@ static void nfs4_layoutreturn_release(void *calldata) { struct nfs4_layoutreturn *lrp = calldata; struct pnfs_layout_hdr *lo = lrp->args.layout; - LIST_HEAD(freeme); dprintk("--> %s\n", __func__); - spin_lock(&lo->plh_inode->i_lock); - if (lrp->res.lrs_present) { - pnfs_mark_matching_lsegs_invalid(lo, &freeme, - &lrp->args.range, - be32_to_cpu(lrp->args.stateid.seqid)); - pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); - } else - pnfs_mark_layout_stateid_invalid(lo, &freeme); - pnfs_clear_layoutreturn_waitbit(lo); - spin_unlock(&lo->plh_inode->i_lock); + pnfs_layoutreturn_free_lsegs(lo, &lrp->args.stateid, &lrp->args.range, + lrp->res.lrs_present ? &lrp->res.stateid : NULL); nfs4_sequence_free_slot(&lrp->res.seq_res); - pnfs_free_lseg_list(&freeme); + if (lrp->ld_private.ops && lrp->ld_private.ops->free) + lrp->ld_private.ops->free(&lrp->ld_private); pnfs_put_layout_hdr(lrp->args.layout); nfs_iput_and_deactive(lrp->inode); kfree(calldata); diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c index a61350f75c74..769b85655c4b 100644 --- a/fs/nfs/nfs4session.c +++ b/fs/nfs/nfs4session.c @@ -169,7 +169,7 @@ bool nfs4_try_to_lock_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot) struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid) { if (slotid <= tbl->max_slotid) - return nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT); + return nfs4_find_or_create_slot(tbl, slotid, 0, GFP_NOWAIT); return ERR_PTR(-E2BIG); } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 0959c9661662..daeb94e3acd4 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -494,21 +494,18 @@ nfs4_alloc_state_owner(struct nfs_server *server, } static void -nfs4_drop_state_owner(struct nfs4_state_owner *sp) -{ - struct rb_node *rb_node = &sp->so_server_node; - - if (!RB_EMPTY_NODE(rb_node)) { - struct nfs_server *server = sp->so_server; - struct nfs_client *clp = server->nfs_client; - - spin_lock(&clp->cl_lock); - if (!RB_EMPTY_NODE(rb_node)) { - rb_erase(rb_node, &server->state_owners); - RB_CLEAR_NODE(rb_node); - } - spin_unlock(&clp->cl_lock); - } +nfs4_reset_state_owner(struct nfs4_state_owner *sp) +{ + /* This state_owner is no longer usable, but must + * remain in place so that state recovery can find it + * and the opens associated with it. + * It may also be used for new 'open' request to + * return a delegation to the server. 
+ * So update the 'create_time' so that it looks like + * a new state_owner. This will cause the server to + * request an OPEN_CONFIRM to start a new sequence. + */ + sp->so_seqid.create_time = ktime_get(); } static void nfs4_free_state_owner(struct nfs4_state_owner *sp) @@ -797,19 +794,33 @@ void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode) /* * Search the state->lock_states for an existing lock_owner - * that is compatible with current->files + * that is compatible with either of the given owners. + * If the second is non-zero, then the first refers to a Posix-lock + * owner (current->files) and the second refers to a flock/OFD + * owner (struct file*). In that case, prefer a match for the first + * owner. + * If both sorts of locks are held on the one file we cannot know + * which stateid was intended to be used, so a "correct" choice cannot + * be made. Failing that, a "consistent" choice is preferable. The + * consistent choice we make is to prefer the first owner, that of a + * Posix lock. */ static struct nfs4_lock_state * -__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) +__nfs4_find_lock_state(struct nfs4_state *state, + fl_owner_t fl_owner, fl_owner_t fl_owner2) { - struct nfs4_lock_state *pos; + struct nfs4_lock_state *pos, *ret = NULL; list_for_each_entry(pos, &state->lock_states, ls_locks) { - if (pos->ls_owner != fl_owner) - continue; - atomic_inc(&pos->ls_count); - return pos; + if (pos->ls_owner == fl_owner) { + ret = pos; + break; + } + if (pos->ls_owner == fl_owner2) + ret = pos; } - return NULL; + if (ret) + atomic_inc(&ret->ls_count); + return ret; } /* @@ -857,7 +868,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_ for(;;) { spin_lock(&state->state_lock); - lsp = __nfs4_find_lock_state(state, owner); + lsp = __nfs4_find_lock_state(state, owner, 0); if (lsp != NULL) break; if (new != NULL) { @@ -939,22 +950,23 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl) static int nfs4_copy_lock_stateid(nfs4_stateid *dst, struct nfs4_state *state, - const struct nfs_lockowner *lockowner) + const struct nfs_lock_context *l_ctx) { struct nfs4_lock_state *lsp; - fl_owner_t fl_owner; + fl_owner_t fl_owner, fl_flock_owner; int ret = -ENOENT; - - if (lockowner == NULL) + if (l_ctx == NULL) goto out; if (test_bit(LK_STATE_IN_USE, &state->flags) == 0) goto out; - fl_owner = lockowner->l_owner; + fl_owner = l_ctx->lockowner; + fl_flock_owner = l_ctx->open_context->flock_owner; + spin_lock(&state->state_lock); - lsp = __nfs4_find_lock_state(state, fl_owner); + lsp = __nfs4_find_lock_state(state, fl_owner, fl_flock_owner); if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags)) ret = -EIO; else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) { @@ -986,7 +998,7 @@ static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) * requests. 
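
The rewritten __nfs4_find_lock_state() above implements the preference the long comment describes. A one-pass equivalent with stand-in types; returning early on the Posix owner gives the same answer as the kernel's ret/break arrangement:

#include <stddef.h>

typedef void *fl_owner_t;

struct lock_state {
        fl_owner_t owner;
        struct lock_state *next;
};

/*
 * An exact match on the Posix owner wins outright; a flock/OFD owner
 * match is only remembered as a fallback, per the "consistent choice"
 * comment above.
 */
static struct lock_state *
find_lock_state(struct lock_state *head, fl_owner_t posix_owner,
                fl_owner_t flock_owner)
{
        struct lock_state *pos, *ret = NULL;

        for (pos = head; pos; pos = pos->next) {
                if (pos->owner == posix_owner)
                        return pos;     /* preferred owner */
                if (flock_owner && pos->owner == flock_owner)
                        ret = pos;      /* fallback owner  */
        }
        return ret;
}
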
*/ int nfs4_select_rw_stateid(struct nfs4_state *state, - fmode_t fmode, const struct nfs_lockowner *lockowner, + fmode_t fmode, const struct nfs_lock_context *l_ctx, nfs4_stateid *dst, struct rpc_cred **cred) { int ret; @@ -995,7 +1007,7 @@ int nfs4_select_rw_stateid(struct nfs4_state *state, return -EIO; if (cred != NULL) *cred = NULL; - ret = nfs4_copy_lock_stateid(dst, state, lockowner); + ret = nfs4_copy_lock_stateid(dst, state, l_ctx); if (ret == -EIO) /* A lost lock - don't even consider delegations */ goto out; @@ -1079,6 +1091,7 @@ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) case -NFS4ERR_BADXDR: case -NFS4ERR_RESOURCE: case -NFS4ERR_NOFILEHANDLE: + case -NFS4ERR_MOVED: /* Non-seqid mutating errors */ return; }; @@ -1098,7 +1111,7 @@ void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid) sp = container_of(seqid->sequence, struct nfs4_state_owner, so_seqid); if (status == -NFS4ERR_BAD_SEQID) - nfs4_drop_state_owner(sp); + nfs4_reset_state_owner(sp); if (!nfs4_has_session(sp->so_server->nfs_client)) nfs_increment_seqid(status, seqid); } @@ -1717,7 +1730,6 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) break; case -NFS4ERR_STALE_CLIENTID: set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - nfs4_state_clear_reclaim_reboot(clp); nfs4_state_start_reclaim_reboot(clp); break; case -NFS4ERR_EXPIRED: @@ -2190,7 +2202,7 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); } - nfs4_schedule_lease_recovery(clp); + nfs4_schedule_state_manager(clp); } EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index fc89e5ed07ee..e9255cb453e6 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -52,6 +52,7 @@ #include <linux/nfs.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> +#include <linux/fs_struct.h> #include "nfs4_fs.h" #include "internal.h" @@ -415,6 +416,8 @@ static int nfs4_stat_to_errno(int); #else /* CONFIG_NFS_V4_1 */ #define encode_sequence_maxsz 0 #define decode_sequence_maxsz 0 +#define encode_layoutreturn_maxsz 0 +#define decode_layoutreturn_maxsz 0 #endif /* CONFIG_NFS_V4_1 */ #define NFS4_enc_compound_sz (1024) /* XXX: large enough? 
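
Two seqid rules change above: NFS4ERR_MOVED joins the errors that must not mutate a seqid, and NFS4ERR_BAD_SEQID now resets rather than drops the state owner. A sketch of the first rule, with stand-in error values:

#include <stdbool.h>

enum {                                  /* stand-in values only */
        NFS4ERR_BADXDR = 1, NFS4ERR_RESOURCE, NFS4ERR_NOFILEHANDLE,
        NFS4ERR_MOVED,
};

/*
 * Errors the server raises before it ever inspects the seqid must leave
 * it untouched; NFS4ERR_MOVED now belongs to that set. BAD_SEQID is
 * handled separately by refreshing the owner's create_time (see
 * nfs4_reset_state_owner above).
 */
static bool error_mutates_seqid(int status)
{
        switch (-status) {
        case NFS4ERR_BADXDR:
        case NFS4ERR_RESOURCE:
        case NFS4ERR_NOFILEHANDLE:
        case NFS4ERR_MOVED:
                return false;           /* non-seqid-mutating */
        default:
                return true;
        }
}
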
*/ @@ -499,22 +502,24 @@ static int nfs4_stat_to_errno(int); (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ - encode_open_downgrade_maxsz + \ - encode_getattr_maxsz) + encode_layoutreturn_maxsz + \ + encode_open_downgrade_maxsz) #define NFS4_dec_open_downgrade_sz \ (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ - decode_open_downgrade_maxsz + \ - decode_getattr_maxsz) + decode_layoutreturn_maxsz + \ + decode_open_downgrade_maxsz) #define NFS4_enc_close_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ + encode_layoutreturn_maxsz + \ encode_close_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_close_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ + decode_layoutreturn_maxsz + \ decode_close_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \ @@ -708,10 +713,13 @@ static int nfs4_stat_to_errno(int); #define NFS4_enc_delegreturn_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ + encode_layoutreturn_maxsz + \ encode_delegreturn_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_layoutreturn_maxsz + \ decode_delegreturn_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_getacl_sz (compound_encode_hdr_maxsz + \ @@ -1003,7 +1011,7 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs4_label *label, const struct nfs_server *server, - bool excl_check) + bool excl_check, const umode_t *umask) { char owner_name[IDMAP_NAMESZ]; char owner_group[IDMAP_NAMESZ]; @@ -1017,18 +1025,21 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, /* * We reserve enough space to write the entire attribute buffer at once. - * In the worst-case, this would be - * 16(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime) - * = 40 bytes, plus any contribution from variable-length fields - * such as owner/group. 
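
encode_attrs() grows a umask parameter above; the following hunk shows the fork it feeds. A condensed sketch of the two mode encodings, folding the caller-side masking (done in nfs4_proc_create() and friends in the kernel) into one place for illustration:

#include <stdint.h>
#include <stdbool.h>

#define MODE_BITS 07777u                /* S_IALLUGO */

struct encoded_mode {
        uint32_t word[2];
        unsigned int len;               /* bytes this attribute adds */
};

static struct encoded_mode encode_mode(uint32_t mode, uint32_t umask,
                                       bool server_has_mode_umask)
{
        struct encoded_mode e;

        if (server_has_mode_umask) {
                e.word[0] = mode & MODE_BITS;   /* raw mode...          */
                e.word[1] = umask;              /* ...plus our umask    */
                e.len = 8;                      /* server applies mask  */
        } else {
                e.word[0] = (mode & ~umask) & MODE_BITS; /* pre-masked  */
                e.word[1] = 0;
                e.len = 4;                      /* classic behaviour    */
        }
        return e;
}

Sending the raw mode and umask separately lets the server resolve them against inherited ACLs, which a pre-masked mode cannot express.
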
*/ if (iap->ia_valid & ATTR_SIZE) { bmval[0] |= FATTR4_WORD0_SIZE; len += 8; } + if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) + umask = NULL; if (iap->ia_valid & ATTR_MODE) { - bmval[1] |= FATTR4_WORD1_MODE; - len += 4; + if (umask) { + bmval[2] |= FATTR4_WORD2_MODE_UMASK; + len += 8; + } else { + bmval[1] |= FATTR4_WORD1_MODE; + len += 4; + } } if (iap->ia_valid & ATTR_UID) { owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ); @@ -1129,6 +1140,10 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, *p++ = cpu_to_be32(label->len); p = xdr_encode_opaque_fixed(p, label->label, label->len); } + if (bmval[2] & FATTR4_WORD2_MODE_UMASK) { + *p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO); + *p++ = cpu_to_be32(*umask); + } /* out: */ } @@ -1183,7 +1198,8 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * } encode_string(xdr, create->name->len, create->name->name); - encode_attrs(xdr, create->attrs, create->label, create->server, false); + encode_attrs(xdr, create->attrs, create->label, create->server, false, + &create->umask); } static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr) @@ -1403,11 +1419,13 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op switch(arg->createmode) { case NFS4_CREATE_UNCHECKED: *p = cpu_to_be32(NFS4_CREATE_UNCHECKED); - encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false); + encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false, + &arg->umask); break; case NFS4_CREATE_GUARDED: *p = cpu_to_be32(NFS4_CREATE_GUARDED); - encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false); + encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false, + &arg->umask); break; case NFS4_CREATE_EXCLUSIVE: *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); @@ -1416,7 +1434,8 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op case NFS4_CREATE_EXCLUSIVE4_1: *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1); encode_nfs4_verifier(xdr, &arg->u.verifier); - encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, true); + encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, true, + &arg->umask); } } @@ -1672,7 +1691,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs { encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr); encode_nfs4_stateid(xdr, &arg->stateid); - encode_attrs(xdr, arg->iap, arg->label, server, false); + encode_attrs(xdr, arg->iap, arg->label, server, false, NULL); } static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr) @@ -2015,6 +2034,7 @@ encode_layoutreturn(struct xdr_stream *xdr, const struct nfs4_layoutreturn_args *args, struct compound_hdr *hdr) { + const struct pnfs_layoutdriver_type *lr_ops = NFS_SERVER(args->inode)->pnfs_curr_ld; __be32 *p; encode_op_hdr(xdr, OP_LAYOUTRETURN, decode_layoutreturn_maxsz, hdr); @@ -2029,10 +2049,11 @@ encode_layoutreturn(struct xdr_stream *xdr, spin_lock(&args->inode->i_lock); encode_nfs4_stateid(xdr, &args->stateid); spin_unlock(&args->inode->i_lock); - if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) { - NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn( - NFS_I(args->inode)->layout, xdr, args); - } else + if (args->ld_private->ops && args->ld_private->ops->encode) + args->ld_private->ops->encode(xdr, args, args->ld_private); + else if (lr_ops->encode_layoutreturn) + 
lr_ops->encode_layoutreturn(xdr, args); + else encode_uint32(xdr, 0); } @@ -2062,6 +2083,13 @@ static void encode_free_stateid(struct xdr_stream *xdr, encode_op_hdr(xdr, OP_FREE_STATEID, decode_free_stateid_maxsz, hdr); encode_nfs4_stateid(xdr, &args->stateid); } +#else +static inline void +encode_layoutreturn(struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args, + struct compound_hdr *hdr) +{ +} #endif /* CONFIG_NFS_V4_1 */ /* @@ -2249,8 +2277,11 @@ static void nfs4_xdr_enc_close(struct rpc_rqst *req, struct xdr_stream *xdr, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); + if (args->lr_args) + encode_layoutreturn(xdr, args->lr_args, &hdr); + if (args->bitmask != NULL) + encode_getfattr(xdr, args->bitmask, &hdr); encode_close(xdr, args, &hdr); - encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); } @@ -2327,8 +2358,9 @@ static void nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); + if (args->lr_args) + encode_layoutreturn(xdr, args->lr_args, &hdr); encode_open_downgrade(xdr, args, &hdr); - encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); } @@ -2671,7 +2703,10 @@ static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fhandle, &hdr); - encode_getfattr(xdr, args->bitmask, &hdr); + if (args->lr_args) + encode_layoutreturn(xdr, args->lr_args, &hdr); + if (args->bitmask) + encode_getfattr(xdr, args->bitmask, &hdr); encode_delegreturn(xdr, args->stateid, &hdr); encode_nops(&hdr); } @@ -6089,6 +6124,13 @@ static int decode_free_stateid(struct xdr_stream *xdr, res->status = decode_op_hdr(xdr, OP_FREE_STATEID); return res->status; } +#else +static inline +int decode_layoutreturn(struct xdr_stream *xdr, + struct nfs4_layoutreturn_res *res) +{ + return 0; +} #endif /* CONFIG_NFS_V4_1 */ /* @@ -6114,10 +6156,13 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, status = decode_putfh(xdr); if (status) goto out; + if (res->lr_res) { + status = decode_layoutreturn(xdr, res->lr_res); + res->lr_ret = status; + if (status) + goto out; + } status = decode_open_downgrade(xdr, res); - if (status != 0) - goto out; - decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6444,16 +6489,18 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_putfh(xdr); if (status) goto out; + if (res->lr_res) { + status = decode_layoutreturn(xdr, res->lr_res); + res->lr_ret = status; + if (status) + goto out; + } + if (res->fattr != NULL) { + status = decode_getfattr(xdr, res->fattr, res->server); + if (status != 0) + goto out; + } status = decode_close(xdr, res); - if (status != 0) - goto out; - /* - * Note: Server may do delete on close for this file - * in which case the getattr call will fail with - * an ESTALE error. Shouldn't be a problem, - * though, since fattr->valid will remain unset. 
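
The encode/decode reshuffling above settles a fixed op order for the reworked compounds. A trivial, runnable illustration of the CLOSE layout with its two optional ops:

#include <stdio.h>
#include <stdbool.h>

/* Spell out the op order of the reworked CLOSE compound. */
static void format_close_compound(char *buf, size_t n,
                                  bool have_lr, bool have_bitmask)
{
        snprintf(buf, n, "SEQUENCE PUTFH%s%s CLOSE",
                 have_lr ? " LAYOUTRETURN" : "",
                 have_bitmask ? " GETATTR" : "");
}

With both flags true this yields SEQUENCE PUTFH LAYOUTRETURN GETATTR CLOSE; DELEGRETURN follows the same pattern, while OPEN_DOWNGRADE drops its GETATTR altogether. Putting GETATTR ahead of CLOSE retires the delete-on-close/ESTALE caveat quoted in the removed comment.
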
- */ - decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6920,9 +6967,17 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, status = decode_putfh(xdr); if (status != 0) goto out; - status = decode_getfattr(xdr, res->fattr, res->server); - if (status != 0) - goto out; + if (res->lr_res) { + status = decode_layoutreturn(xdr, res->lr_res); + res->lr_ret = status; + if (status) + goto out; + } + if (res->fattr) { + status = decode_getfattr(xdr, res->fattr, res->server); + if (status != 0) + goto out; + } status = decode_delegreturn(xdr); out: return status; diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 919efd4a1a23..2a4cdce939a0 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -504,10 +504,10 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p) } void -objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, - struct xdr_stream *xdr, +objlayout_encode_layoutreturn(struct xdr_stream *xdr, const struct nfs4_layoutreturn_args *args) { + struct pnfs_layout_hdr *pnfslay = args->layout; struct objlayout *objlay = OBJLAYOUT(pnfslay); struct objlayout_io_res *oir, *tmp; __be32 *start; diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index 2641dbad345c..fc94a5872ed4 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -175,7 +175,6 @@ extern void objlayout_encode_layoutcommit( const struct nfs4_layoutcommit_args *); extern void objlayout_encode_layoutreturn( - struct pnfs_layout_hdr *, struct xdr_stream *, const struct nfs4_layoutreturn_args *); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 965db474f4b0..6e629b856a00 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -867,8 +867,7 @@ static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio) static bool nfs_match_lock_context(const struct nfs_lock_context *l1, const struct nfs_lock_context *l2) { - return l1->lockowner.l_owner == l2->lockowner.l_owner - && l1->lockowner.l_pid == l2->lockowner.l_pid; + return l1->lockowner == l2->lockowner; } /** diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 259ef85f435a..dd042498ce7c 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -54,6 +54,12 @@ static DEFINE_SPINLOCK(pnfs_spinlock); static LIST_HEAD(pnfs_modules_tbl); static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo); +static void pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo, + struct list_head *free_me, + const struct pnfs_layout_range *range, + u32 seq); +static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, + struct list_head *tmp_list); /* Return the registered pnfs layout driver module matching given id */ static struct pnfs_layoutdriver_type * @@ -299,6 +305,49 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) } } +static void +pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, + u32 seq) +{ + if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode) + iomode = IOMODE_ANY; + lo->plh_return_iomode = iomode; + set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); + if (seq != 0) { + WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq); + lo->plh_return_seq = seq; + } +} + +static void +pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo) +{ + lo->plh_return_iomode = 0; + lo->plh_return_seq = 0; + clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); +} + +static void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) +{ + 
clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags); + clear_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags); + smp_mb__after_atomic(); + wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN); + rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); +} + +static void +pnfs_clear_lseg_state(struct pnfs_layout_segment *lseg, + struct list_head *free_me) +{ + clear_bit(NFS_LSEG_ROC, &lseg->pls_flags); + clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); + if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) + pnfs_lseg_dec_and_remove_zero(lseg, free_me); + if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) + pnfs_lseg_dec_and_remove_zero(lseg, free_me); +} + /* * Mark a pnfs_layout_hdr and all associated layout segments as invalid * @@ -315,9 +364,17 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, .offset = 0, .length = NFS4_MAX_UINT64, }; + struct pnfs_layout_segment *lseg, *next; set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range, 0); + pnfs_clear_layoutreturn_info(lo); + list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) + pnfs_clear_lseg_state(lseg, lseg_list); + pnfs_free_returned_lsegs(lo, lseg_list, &range, 0); + if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) && + !test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) + pnfs_clear_layoutreturn_waitbit(lo); + return !list_empty(&lo->plh_segs); } static int @@ -396,27 +453,42 @@ pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg, static void pnfs_free_lseg(struct pnfs_layout_segment *lseg) { - struct inode *ino = lseg->pls_layout->plh_inode; - - NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); + if (lseg != NULL) { + struct inode *inode = lseg->pls_layout->plh_inode; + NFS_SERVER(inode)->pnfs_curr_ld->free_lseg(lseg); + } } static void pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) { - struct inode *inode = lo->plh_inode; - WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); list_del_init(&lseg->pls_list); /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */ atomic_dec(&lo->plh_refcount); - if (list_empty(&lo->plh_segs)) { + if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) + return; + if (list_empty(&lo->plh_segs) && + !test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) && + !test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) { if (atomic_read(&lo->plh_outstanding) == 0) set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); } - rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); +} + +static bool +pnfs_cache_lseg_for_layoutreturn(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg) +{ + if (test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) && + pnfs_layout_is_valid(lo)) { + pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0); + list_move_tail(&lseg->pls_list, &lo->plh_return_segs); + return true; + } + return false; } void @@ -442,6 +514,8 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg) } pnfs_get_layout_hdr(lo); pnfs_layout_remove_lseg(lo, lseg); + if (pnfs_cache_lseg_for_layoutreturn(lo, lseg)) + lseg = NULL; spin_unlock(&inode->i_lock); pnfs_free_lseg(lseg); pnfs_put_layout_hdr(lo); @@ -482,22 +556,15 @@ pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg) struct pnfs_layout_hdr *lo = lseg->pls_layout; if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) return; - pnfs_get_layout_hdr(lo); pnfs_layout_remove_lseg(lo, lseg); - pnfs_free_lseg_async(lseg); + if 
(!pnfs_cache_lseg_for_layoutreturn(lo, lseg)) { + pnfs_get_layout_hdr(lo); + pnfs_free_lseg_async(lseg); + } } } EXPORT_SYMBOL_GPL(pnfs_put_lseg_locked); -static u64 -end_offset(u64 start, u64 len) -{ - u64 end; - - end = start + len; - return end >= start ? end : NFS4_MAX_UINT64; -} - /* * is l2 fully contained in l1? * start1 end1 @@ -510,33 +577,13 @@ pnfs_lseg_range_contained(const struct pnfs_layout_range *l1, const struct pnfs_layout_range *l2) { u64 start1 = l1->offset; - u64 end1 = end_offset(start1, l1->length); + u64 end1 = pnfs_end_offset(start1, l1->length); u64 start2 = l2->offset; - u64 end2 = end_offset(start2, l2->length); + u64 end2 = pnfs_end_offset(start2, l2->length); return (start1 <= start2) && (end1 >= end2); } -/* - * is l1 and l2 intersecting? - * start1 end1 - * [----------------------------------) - * start2 end2 - * [----------------) - */ -static bool -pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1, - const struct pnfs_layout_range *l2) -{ - u64 start1 = l1->offset; - u64 end1 = end_offset(start1, l1->length); - u64 start2 = l2->offset; - u64 end2 = end_offset(start2, l2->length); - - return (end1 == NFS4_MAX_UINT64 || end1 > start2) && - (end2 == NFS4_MAX_UINT64 || end2 > start1); -} - static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, struct list_head *tmp_list) { @@ -637,6 +684,20 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, return remaining; } +static void +pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo, + struct list_head *free_me, + const struct pnfs_layout_range *range, + u32 seq) +{ + struct pnfs_layout_segment *lseg, *next; + + list_for_each_entry_safe(lseg, next, &lo->plh_return_segs, pls_list) { + if (pnfs_match_lseg_recall(lseg, range, seq)) + list_move_tail(&lseg->pls_list, free_me); + } +} + /* note free_me must contain lsegs from a single layout_hdr */ void pnfs_free_lseg_list(struct list_head *free_me) @@ -701,6 +762,8 @@ pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, struct inode *inode; list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) { + if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) + continue; inode = igrab(lo->plh_inode); if (inode == NULL) continue; @@ -816,14 +879,6 @@ pnfs_destroy_all_layouts(struct nfs_client *clp) pnfs_destroy_layouts_byclid(clp, false); } -static void -pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo) -{ - lo->plh_return_iomode = 0; - lo->plh_return_seq = 0; - clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); -} - /* update lo->plh_stateid with new if is more recent */ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, @@ -941,12 +996,31 @@ static void pnfs_clear_layoutcommit(struct inode *inode, } } -void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) +void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, + const nfs4_stateid *arg_stateid, + const struct pnfs_layout_range *range, + const nfs4_stateid *stateid) { - clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags); - smp_mb__after_atomic(); - wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN); - rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); + struct inode *inode = lo->plh_inode; + LIST_HEAD(freeme); + + spin_lock(&inode->i_lock); + if (!pnfs_layout_is_valid(lo) || !arg_stateid || + !nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) + goto out_unlock; + if (stateid) { + u32 seq = be32_to_cpu(arg_stateid->seqid); + + pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq); + 
pnfs_free_returned_lsegs(lo, &freeme, range, seq); + pnfs_set_layout_stateid(lo, stateid, true); + } else + pnfs_mark_layout_stateid_invalid(lo, &freeme); +out_unlock: + pnfs_clear_layoutreturn_waitbit(lo); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&freeme); + } static bool @@ -957,8 +1031,9 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, /* Serialise LAYOUTGET/LAYOUTRETURN */ if (atomic_read(&lo->plh_outstanding) != 0) return false; - if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) + if (test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) return false; + set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); pnfs_get_layout_hdr(lo); if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) { if (stateid != NULL) { @@ -978,11 +1053,29 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, return true; } +static void +pnfs_init_layoutreturn_args(struct nfs4_layoutreturn_args *args, + struct pnfs_layout_hdr *lo, + const nfs4_stateid *stateid, + enum pnfs_iomode iomode) +{ + struct inode *inode = lo->plh_inode; + + args->layout_type = NFS_SERVER(inode)->pnfs_curr_ld->id; + args->inode = inode; + args->range.iomode = iomode; + args->range.offset = 0; + args->range.length = NFS4_MAX_UINT64; + args->layout = lo; + nfs4_stateid_copy(&args->stateid, stateid); +} + static int pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid, enum pnfs_iomode iomode, bool sync) { struct inode *ino = lo->plh_inode; + struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; struct nfs4_layoutreturn *lrp; int status = 0; @@ -996,15 +1089,12 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid, goto out; } - nfs4_stateid_copy(&lrp->args.stateid, stateid); - lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id; - lrp->args.inode = ino; - lrp->args.range.iomode = iomode; - lrp->args.range.offset = 0; - lrp->args.range.length = NFS4_MAX_UINT64; - lrp->args.layout = lo; + pnfs_init_layoutreturn_args(&lrp->args, lo, stateid, iomode); + lrp->args.ld_private = &lrp->ld_private; lrp->clp = NFS_SERVER(ino)->nfs_client; lrp->cred = lo->plh_lc_cred; + if (ld->prepare_layoutreturn) + ld->prepare_layoutreturn(&lrp->args); status = nfs4_proc_layoutreturn(lrp, sync); out: @@ -1067,7 +1157,7 @@ _pnfs_return_layout(struct inode *ino) struct nfs_inode *nfsi = NFS_I(ino); LIST_HEAD(tmp_list); nfs4_stateid stateid; - int status = 0, empty; + int status = 0; bool send; dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino); @@ -1081,7 +1171,14 @@ _pnfs_return_layout(struct inode *ino) } /* Reference matched in nfs4_layoutreturn_release */ pnfs_get_layout_hdr(lo); - empty = list_empty(&lo->plh_segs); + /* Is there an outstanding layoutreturn ? 
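
The NFS_LAYOUT_RETURN_LOCK bit introduced above serialises competing layoutreturns. A sketch of the scheme, assuming the caller holds the inode lock as the kernel code does (the real flags are atomic bitops):

#include <stdbool.h>

enum {
        LAYOUT_RETURN      = 1u << 0,   /* a LAYOUTRETURN is in flight */
        LAYOUT_RETURN_LOCK = 1u << 1,   /* gate for would-be returners */
};

/*
 * RETURN_LOCK is the test-and-set gate; RETURN is what
 * _pnfs_return_layout() and pnfs_roc() sleep on. Both clear together
 * when the return completes.
 */
static bool prepare_layoutreturn(unsigned int *plh_flags)
{
        if (*plh_flags & LAYOUT_RETURN_LOCK)
                return false;           /* another return in progress */
        *plh_flags |= LAYOUT_RETURN_LOCK | LAYOUT_RETURN;
        return true;
}

static void clear_layoutreturn_waitbit(unsigned int *plh_flags)
{
        *plh_flags &= ~(LAYOUT_RETURN | LAYOUT_RETURN_LOCK);
        /* kernel adds: wake_up_bit() plus rpc_wake_up() of roc waiters */
}
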
*/ + if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) { + spin_unlock(&ino->i_lock); + if (wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, + TASK_UNINTERRUPTIBLE)) + goto out_put_layout_hdr; + spin_lock(&ino->i_lock); + } pnfs_clear_layoutcommit(ino, &tmp_list); pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL, 0); @@ -1095,7 +1192,7 @@ _pnfs_return_layout(struct inode *ino) } /* Don't send a LAYOUTRETURN if list was initially empty */ - if (empty) { + if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) { spin_unlock(&ino->i_lock); dprintk("NFS: %s no layout segments to return\n", __func__); goto out_put_layout_hdr; @@ -1103,10 +1200,10 @@ _pnfs_return_layout(struct inode *ino) send = pnfs_prepare_layoutreturn(lo, &stateid, NULL); spin_unlock(&ino->i_lock); - pnfs_free_lseg_list(&tmp_list); if (send) status = pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true); out_put_layout_hdr: + pnfs_free_lseg_list(&tmp_list); pnfs_put_layout_hdr(lo); out: dprintk("<-- %s status: %d\n", __func__, status); @@ -1141,105 +1238,125 @@ pnfs_commit_and_return_layout(struct inode *inode) return ret; } -bool pnfs_roc(struct inode *ino) +bool pnfs_roc(struct inode *ino, + struct nfs4_layoutreturn_args *args, + struct nfs4_layoutreturn_res *res, + const struct rpc_cred *cred) { struct nfs_inode *nfsi = NFS_I(ino); struct nfs_open_context *ctx; struct nfs4_state *state; struct pnfs_layout_hdr *lo; - struct pnfs_layout_segment *lseg, *tmp; + struct pnfs_layout_segment *lseg, *next; nfs4_stateid stateid; - LIST_HEAD(tmp_list); - bool found = false, layoutreturn = false, roc = false; + enum pnfs_iomode iomode = 0; + bool layoutreturn = false, roc = false; + bool skip_read = false; + if (!nfs_have_layout(ino)) + return false; +retry: spin_lock(&ino->i_lock); lo = nfsi->layout; - if (!lo || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) + if (!lo || !pnfs_layout_is_valid(lo) || + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) goto out_noroc; + if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) { + pnfs_get_layout_hdr(lo); + spin_unlock(&ino->i_lock); + wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, + TASK_UNINTERRUPTIBLE); + pnfs_put_layout_hdr(lo); + goto retry; + } /* no roc if we hold a delegation */ - if (nfs4_check_delegation(ino, FMODE_READ)) - goto out_noroc; + if (nfs4_check_delegation(ino, FMODE_READ)) { + if (nfs4_check_delegation(ino, FMODE_WRITE)) + goto out_noroc; + skip_read = true; + } list_for_each_entry(ctx, &nfsi->open_files, list) { state = ctx->state; + if (state == NULL) + continue; /* Don't return layout if there is open file state */ - if (state != NULL && state->state != 0) + if (state->state & FMODE_WRITE) goto out_noroc; + if (state->state & FMODE_READ) + skip_read = true; } - /* always send layoutreturn if being marked so */ - if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) - layoutreturn = pnfs_prepare_layoutreturn(lo, - &stateid, NULL); - list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) + list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) { + if (skip_read && lseg->pls_range.iomode == IOMODE_READ) + continue; /* If we are sending layoutreturn, invalidate all valid lsegs */ - if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { - mark_lseg_invalid(lseg, &tmp_list); - found = true; - } + if (!test_and_clear_bit(NFS_LSEG_ROC, &lseg->pls_flags)) + continue; + /* + * Note: mark lseg for return so pnfs_layout_remove_lseg + * doesn't invalidate the layout for us. 
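 * [ed: the segment is parked on lo->plh_return_segs via the
 * mark_lseg_invalid() call below rather than being freed immediately]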
+ */ + set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); + if (!mark_lseg_invalid(lseg, &lo->plh_return_segs)) + continue; + pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0); + } + + if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) + goto out_noroc; + /* ROC in two conditions: * 1. there are ROC lsegs * 2. we don't send layoutreturn */ - if (found && !layoutreturn) { - /* lo ref dropped in pnfs_roc_release() */ - pnfs_get_layout_hdr(lo); - roc = true; - } + /* lo ref dropped in pnfs_roc_release() */ + layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &iomode); + /* If the creds don't match, we can't compound the layoutreturn */ + if (!layoutreturn || cred != lo->plh_lc_cred) + goto out_noroc; + + roc = layoutreturn; + pnfs_init_layoutreturn_args(args, lo, &stateid, iomode); + res->lrs_present = 0; + layoutreturn = false; out_noroc: spin_unlock(&ino->i_lock); - pnfs_free_lseg_list(&tmp_list); pnfs_layoutcommit_inode(ino, true); + if (roc) { + struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; + if (ld->prepare_layoutreturn) + ld->prepare_layoutreturn(args); + return true; + } if (layoutreturn) - pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true); - return roc; -} - -void pnfs_roc_release(struct inode *ino) -{ - struct pnfs_layout_hdr *lo; - - spin_lock(&ino->i_lock); - lo = NFS_I(ino)->layout; - pnfs_clear_layoutreturn_waitbit(lo); - if (atomic_dec_and_test(&lo->plh_refcount)) { - pnfs_detach_layout_hdr(lo); - spin_unlock(&ino->i_lock); - pnfs_free_layout_hdr(lo); - } else - spin_unlock(&ino->i_lock); -} - -void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) -{ - struct pnfs_layout_hdr *lo; - - spin_lock(&ino->i_lock); - lo = NFS_I(ino)->layout; - if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) - lo->plh_barrier = barrier; - spin_unlock(&ino->i_lock); - trace_nfs4_layoutreturn_on_close(ino, 0); + pnfs_send_layoutreturn(lo, &stateid, iomode, true); + return false; } -void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier) +void pnfs_roc_release(struct nfs4_layoutreturn_args *args, + struct nfs4_layoutreturn_res *res, + int ret) { - struct nfs_inode *nfsi = NFS_I(ino); - struct pnfs_layout_hdr *lo; - u32 current_seqid; - - spin_lock(&ino->i_lock); - lo = nfsi->layout; - current_seqid = be32_to_cpu(lo->plh_stateid.seqid); + struct pnfs_layout_hdr *lo = args->layout; + const nfs4_stateid *arg_stateid = NULL; + const nfs4_stateid *res_stateid = NULL; + struct nfs4_xdr_opaque_data *ld_private = args->ld_private; - /* Since close does not return a layout stateid for use as - * a barrier, we choose the worst-case barrier. 
- */ - *barrier = current_seqid + atomic_read(&lo->plh_outstanding); - spin_unlock(&ino->i_lock); + if (ret == 0) { + arg_stateid = &args->stateid; + if (res->lrs_present) + res_stateid = &res->stateid; + } + pnfs_layoutreturn_free_lsegs(lo, arg_stateid, &args->range, + res_stateid); + if (ld_private && ld_private->ops && ld_private->ops->free) + ld_private->ops->free(ld_private); + pnfs_put_layout_hdr(lo); + trace_nfs4_layoutreturn_on_close(args->inode, 0); } bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task) @@ -1252,13 +1369,11 @@ bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task) * i_lock */ spin_lock(&ino->i_lock); lo = nfsi->layout; - if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) + if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) { + rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); sleep = true; + } spin_unlock(&ino->i_lock); - - if (sleep) - rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); - return sleep; } @@ -1375,6 +1490,7 @@ alloc_init_layout_hdr(struct inode *ino, atomic_set(&lo->plh_refcount, 1); INIT_LIST_HEAD(&lo->plh_layouts); INIT_LIST_HEAD(&lo->plh_segs); + INIT_LIST_HEAD(&lo->plh_return_segs); INIT_LIST_HEAD(&lo->plh_bulk_destroy); lo->plh_inode = ino; lo->plh_lc_cred = get_rpccred(ctx->cred); @@ -1841,7 +1957,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) goto out_forget; } - if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { + if (!pnfs_layout_is_valid(lo)) { + /* We have a completely new layout */ + pnfs_set_layout_stateid(lo, &res->stateid, true); + } else if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { /* existing state ID, make sure the sequence number matches. */ if (pnfs_layout_stateid_blocked(lo, &res->stateid)) { dprintk("%s forget reply due to sequence\n", __func__); @@ -1851,12 +1970,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) } else { /* * We got an entirely new state ID. Mark all segments for the - * inode invalid, and don't bother validating the stateid - * sequence number. + * inode invalid, and retry the layoutget */ pnfs_mark_layout_stateid_invalid(lo, &free_me); - - pnfs_set_layout_stateid(lo, &res->stateid, true); + goto out_forget; } pnfs_get_lseg(lseg); @@ -1877,20 +1994,6 @@ out_forget: return ERR_PTR(-EAGAIN); } -static void -pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, - u32 seq) -{ - if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode) - iomode = IOMODE_ANY; - lo->plh_return_iomode = iomode; - set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); - if (seq != 0) { - WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq); - lo->plh_return_seq = seq; - } -} - /** * pnfs_mark_matching_lsegs_return - Free or return matching layout segments * @lo: pointer to layout header @@ -1945,17 +2048,18 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, .offset = 0, .length = NFS4_MAX_UINT64, }; - LIST_HEAD(free_me); bool return_now = false; spin_lock(&inode->i_lock); pnfs_set_plh_return_info(lo, range.iomode, 0); + /* Block LAYOUTGET */ + set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); /* * mark all matching lsegs so that we are sure to have no live * segments at hand when sending layoutreturn. See pnfs_put_lseg() * for how it works. 
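 * [ed: with this change the matching segments land on lo->plh_return_segs
 * instead of a local free list; pnfs_layoutreturn_free_lsegs() reaps them
 * once the server has processed the LAYOUTRETURN]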
*/ - if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0)) { + if (!pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, &range, 0)) { nfs4_stateid stateid; enum pnfs_iomode iomode; @@ -1967,7 +2071,6 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, spin_unlock(&inode->i_lock); nfs_commit_inode(inode, 0); } - pnfs_free_lseg_list(&free_me); } EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return); @@ -2063,7 +2166,7 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, * */ if (pgio->pg_lseg) { - seg_end = end_offset(pgio->pg_lseg->pls_range.offset, + seg_end = pnfs_end_offset(pgio->pg_lseg->pls_range.offset, pgio->pg_lseg->pls_range.length); req_start = req_offset(req); WARN_ON_ONCE(req_start >= seg_end); @@ -2286,6 +2389,10 @@ void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr) struct nfs_pageio_descriptor pgio; if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + /* Prevent deadlocks with layoutreturn! */ + pnfs_put_lseg(hdr->lseg); + hdr->lseg = NULL; + nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops); hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 5c295512c967..63f77b49a586 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -96,6 +96,7 @@ enum { NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ NFS_LAYOUT_RETURN, /* layoutreturn in progress */ + NFS_LAYOUT_RETURN_LOCK, /* Serialise layoutreturn */ NFS_LAYOUT_RETURN_REQUESTED, /* Return this layout ASAP */ NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */ NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */ @@ -171,8 +172,8 @@ struct pnfs_layoutdriver_type { (struct nfs_server *server, struct pnfs_device *pdev, gfp_t gfp_flags); - void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid, - struct xdr_stream *xdr, + int (*prepare_layoutreturn) (struct nfs4_layoutreturn_args *); + void (*encode_layoutreturn) (struct xdr_stream *xdr, const struct nfs4_layoutreturn_args *args); void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data); @@ -181,7 +182,6 @@ struct pnfs_layoutdriver_type { struct xdr_stream *xdr, const struct nfs4_layoutcommit_args *args); int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args); - void (*cleanup_layoutstats) (struct nfs42_layoutstat_data *data); }; struct pnfs_layout_hdr { @@ -190,6 +190,7 @@ struct pnfs_layout_hdr { struct list_head plh_layouts; /* other client layouts */ struct list_head plh_bulk_destroy; struct list_head plh_segs; /* layout segments list */ + struct list_head plh_return_segs; /* invalid layout segments */ unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */ unsigned long plh_retry_timestamp; unsigned long plh_flags; @@ -270,10 +271,13 @@ int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, u32 seq); int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, struct list_head *lseg_list); -bool pnfs_roc(struct inode *ino); -void pnfs_roc_release(struct inode *ino); -void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); -void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier); +bool pnfs_roc(struct inode *ino, + struct nfs4_layoutreturn_args *args, + struct nfs4_layoutreturn_res *res, + const struct rpc_cred *cred); +void pnfs_roc_release(struct nfs4_layoutreturn_args *args, + struct nfs4_layoutreturn_res *res, + int ret); bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task); void pnfs_set_layoutcommit(struct inode *, struct 
pnfs_layout_segment *, loff_t); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); @@ -292,7 +296,10 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, enum pnfs_iomode iomode, bool strict_iomode, gfp_t gfp_flags); -void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo); +void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, + const nfs4_stateid *arg_stateid, + const struct pnfs_layout_range *range, + const nfs4_stateid *stateid); void pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg, @@ -362,8 +369,7 @@ struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs, void nfs4_pnfs_v3_ds_connect_unload(void); void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, struct nfs4_deviceid_node *devid, unsigned int timeo, - unsigned int retrans, u32 version, u32 minor_version, - rpc_authflavor_t au_flavor); + unsigned int retrans, u32 version, u32 minor_version); struct nfs4_pnfs_ds_addr *nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags); @@ -559,6 +565,38 @@ pnfs_copy_range(struct pnfs_layout_range *dst, memcpy(dst, src, sizeof(*dst)); } +static inline u64 +pnfs_end_offset(u64 start, u64 len) +{ + if (NFS4_MAX_UINT64 - start <= len) + return NFS4_MAX_UINT64; + return start + len; +} + +/* + * Are 2 ranges intersecting? + * start1 end1 + * [----------------------------------) + * start2 end2 + * [----------------) + */ +static inline bool +pnfs_is_range_intersecting(u64 start1, u64 end1, u64 start2, u64 end2) +{ + return (end1 == NFS4_MAX_UINT64 || start2 < end1) && + (end2 == NFS4_MAX_UINT64 || start1 < end2); +} + +static inline bool +pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1, + const struct pnfs_layout_range *l2) +{ + u64 end1 = pnfs_end_offset(l1->offset, l1->length); + u64 end2 = pnfs_end_offset(l2->offset, l2->length); + + return pnfs_is_range_intersecting(l1->offset, end1, l2->offset, end2); +} + extern unsigned int layoutstats_timer; #ifdef NFS_DEBUG @@ -630,23 +668,18 @@ pnfs_layoutcommit_outstanding(struct inode *inode) static inline bool -pnfs_roc(struct inode *ino) +pnfs_roc(struct inode *ino, + struct nfs4_layoutreturn_args *args, + struct nfs4_layoutreturn_res *res, + const struct rpc_cred *cred) { return false; } static inline void -pnfs_roc_release(struct inode *ino) -{ -} - -static inline void -pnfs_roc_set_barrier(struct inode *ino, u32 barrier) -{ -} - -static inline void -pnfs_roc_get_barrier(struct inode *ino, u32 *barrier) +pnfs_roc_release(struct nfs4_layoutreturn_args *args, + struct nfs4_layoutreturn_res *res, + int ret) { } diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 53b4705abcc7..9414b492439f 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -600,8 +600,7 @@ static struct nfs_client *(*get_v3_ds_connect)( int ds_addrlen, int ds_proto, unsigned int ds_timeo, - unsigned int ds_retrans, - rpc_authflavor_t au_flavor); + unsigned int ds_retrans); static bool load_v3_ds_connect(void) { @@ -625,15 +624,13 @@ EXPORT_SYMBOL_GPL(nfs4_pnfs_v3_ds_connect_unload); static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, unsigned int timeo, - unsigned int retrans, - rpc_authflavor_t au_flavor) + unsigned int retrans) { struct nfs_client *clp = ERR_PTR(-EIO); struct nfs4_pnfs_ds_addr *da; int status = 0; - dprintk("--> %s DS %s au_flavor %d\n", __func__, - ds->ds_remotestr, au_flavor); + dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr); if 
(!load_v3_ds_connect()) goto out; @@ -657,7 +654,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, clp = get_v3_ds_connect(mds_srv, (struct sockaddr *)&da->da_addr, da->da_addrlen, IPPROTO_TCP, - timeo, retrans, au_flavor); + timeo, retrans); } if (IS_ERR(clp)) { @@ -676,15 +673,13 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, unsigned int timeo, unsigned int retrans, - u32 minor_version, - rpc_authflavor_t au_flavor) + u32 minor_version) { struct nfs_client *clp = ERR_PTR(-EIO); struct nfs4_pnfs_ds_addr *da; int status = 0; - dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr, - au_flavor); + dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr); list_for_each_entry(da, &ds->ds_addrs, da_node) { dprintk("%s: DS %s: trying address %s\n", @@ -720,8 +715,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, clp = nfs4_set_ds_client(mds_srv, (struct sockaddr *)&da->da_addr, da->da_addrlen, IPPROTO_TCP, - timeo, retrans, minor_version, - au_flavor); + timeo, retrans, minor_version); if (IS_ERR(clp)) continue; @@ -755,19 +749,17 @@ out: */ void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, struct nfs4_deviceid_node *devid, unsigned int timeo, - unsigned int retrans, u32 version, - u32 minor_version, rpc_authflavor_t au_flavor) + unsigned int retrans, u32 version, u32 minor_version) { if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) { int err = 0; if (version == 3) { err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo, - retrans, au_flavor); + retrans); } else if (version == 4) { err = _nfs4_pnfs_v4_ds_connect(mds_srv, ds, timeo, - retrans, minor_version, - au_flavor); + retrans, minor_version); } else { dprintk("%s: unsupported DS version %d\n", __func__, version); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 001796bcd6c8..6bca17883b93 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -55,7 +55,7 @@ #include <linux/nsproxy.h> #include <linux/rcupdate.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "nfs4_fs.h" #include "callback.h" @@ -2904,7 +2904,7 @@ module_param(max_session_slots, ushort, 0644); MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 " "requests the client will negotiate"); module_param(max_session_cb_slots, ushort, 0644); -MODULE_PARM_DESC(max_session_slots, "Maximum number of parallel NFSv4.1 " +MODULE_PARM_DESC(max_session_cb_slots, "Maximum number of parallel NFSv4.1 " "callbacks the client will process for a given server"); module_param(send_implementation_id, ushort, 0644); MODULE_PARM_DESC(send_implementation_id, diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index 4fe3eead3868..5a1d0ded8979 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -77,7 +77,6 @@ static const char *nfs_get_link(struct dentry *dentry, * symlinks can't do much... 
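 * [ed: the .readlink = generic_readlink initialisers removed here and in
 * the other symlink inode_operations in this merge became redundant in
 * 4.10, when generic_readlink was made the VFS default for symlinks]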
*/ const struct inode_operations nfs_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = nfs_get_link, .getattr = nfs_getattr, .setattr = nfs_setattr, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 53211838f72a..b00d53d13d47 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -24,7 +24,7 @@ #include <linux/freezer.h> #include <linux/wait.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "delegation.h" #include "internal.h" @@ -1151,8 +1151,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page) if (l_ctx && flctx && !(list_empty_careful(&flctx->flc_posix) && list_empty_careful(&flctx->flc_flock))) { - do_flush |= l_ctx->lockowner.l_owner != current->files - || l_ctx->lockowner.l_pid != current->tgid; + do_flush |= l_ctx->lockowner != current->files; } nfs_release_request(req); if (!do_flush) diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index fd8c9a5bcac4..420d3a0ab258 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -9,7 +9,7 @@ #include <net/netns/generic.h> #include <linux/fs.h> -static int grace_net_id; +static unsigned int grace_net_id; static DEFINE_SPINLOCK(grace_lock); /** diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c index c16bf5af6831..34c1c449fddf 100644 --- a/fs/nfsd/fault_inject.c +++ b/fs/nfsd/fault_inject.c @@ -10,7 +10,7 @@ #include <linux/module.h> #include <linux/nsproxy.h> #include <linux/sunrpc/addr.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "state.h" #include "netns.h" diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index ee36efd5aece..3714231a9d0f 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -124,5 +124,5 @@ struct nfsd_net { /* Simple check to find out if a given net was properly initialized */ #define nfsd_netns_ready(nn) ((nn)->sessionid_hashtbl) -extern int nfsd_net_id; +extern unsigned int nfsd_net_id; #endif /* __NFSD_NETNS_H__ */ diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 211dc2aed8e1..eb78109d666c 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1061,7 +1061,7 @@ static const struct rpc_call_ops nfsd4_cb_ops = { int nfsd4_create_callback_queue(void) { - callback_wq = create_singlethread_workqueue("nfsd4_callbacks"); + callback_wq = alloc_ordered_workqueue("nfsd4_callbacks", 0); if (!callback_wq) return -ENOMEM; return 0; diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 42aace4fc4c8..1fc07a9c70e9 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -223,10 +223,11 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, struct nfs4_layout_stateid *ls; struct nfs4_stid *stp; - stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache); + stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache, + nfsd4_free_layout_stateid); if (!stp) return NULL; - stp->sc_free = nfsd4_free_layout_stateid; + get_nfs4_file(fp); stp->sc_file = fp; @@ -686,10 +687,6 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) return 0; } /* Fallthrough */ - case -NFS4ERR_NOMATCHING_LAYOUT: - trace_layout_recall_done(&ls->ls_stid.sc_stateid); - task->tk_status = 0; - return 1; default: /* * Unknown error or non-responding client, we'll need to fence. 
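 * [ed: note the reordering in the next hunk: the -NFS4ERR_NOMATCHING_LAYOUT
 * case moves below the default branch, so the fallthrough a few lines up
 * now reaches the fencing path instead of returning with the layout
 * treated as already returned]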
@@ -702,6 +699,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) else nfsd4_cb_layout_fail(ls); return -1; + case -NFS4ERR_NOMATCHING_LAYOUT: + trace_layout_recall_done(&ls->ls_stid.sc_stateid); + task->tk_status = 0; + return 1; } } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index abb09b580389..74a6e573e061 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -96,33 +96,15 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct dentry *dentry = cstate->current_fh.fh_dentry; - /* - * Check about attributes are supported by the NFSv4 server or not. - * According to spec, unsupported attributes return ERR_ATTRNOTSUPP. - */ - if ((bmval[0] & ~nfsd_suppattrs0(cstate->minorversion)) || - (bmval[1] & ~nfsd_suppattrs1(cstate->minorversion)) || - (bmval[2] & ~nfsd_suppattrs2(cstate->minorversion))) + if (!nfsd_attrs_supported(cstate->minorversion, bmval)) return nfserr_attrnotsupp; - - /* - * Check FATTR4_WORD0_ACL can be supported - * in current environment or not. - */ - if (bmval[0] & FATTR4_WORD0_ACL) { - if (!IS_POSIXACL(d_inode(dentry))) - return nfserr_attrnotsupp; - } - - /* - * According to spec, read-only attributes return ERR_INVAL. - */ - if (writable) { - if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) || - (bmval[2] & ~writable[2])) - return nfserr_inval; - } - + if ((bmval[0] & FATTR4_WORD0_ACL) && !IS_POSIXACL(d_inode(dentry))) + return nfserr_attrnotsupp; + if (writable && !bmval_is_subset(bmval, writable)) + return nfserr_inval; + if (writable && (bmval[2] & FATTR4_WORD2_MODE_UMASK) && + (bmval[1] & FATTR4_WORD1_MODE)) + return nfserr_inval; return nfs_ok; } @@ -695,9 +677,9 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) return nfserr_inval; - getattr->ga_bmval[0] &= nfsd_suppattrs0(cstate->minorversion); - getattr->ga_bmval[1] &= nfsd_suppattrs1(cstate->minorversion); - getattr->ga_bmval[2] &= nfsd_suppattrs2(cstate->minorversion); + getattr->ga_bmval[0] &= nfsd_suppattrs[cstate->minorversion][0]; + getattr->ga_bmval[1] &= nfsd_suppattrs[cstate->minorversion][1]; + getattr->ga_bmval[2] &= nfsd_suppattrs[cstate->minorversion][2]; getattr->ga_fhp = &cstate->current_fh; return nfs_ok; @@ -799,9 +781,9 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) return nfserr_inval; - readdir->rd_bmval[0] &= nfsd_suppattrs0(cstate->minorversion); - readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion); - readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion); + readdir->rd_bmval[0] &= nfsd_suppattrs[cstate->minorversion][0]; + readdir->rd_bmval[1] &= nfsd_suppattrs[cstate->minorversion][1]; + readdir->rd_bmval[2] &= nfsd_suppattrs[cstate->minorversion][2]; if ((cookie == 1) || (cookie == 2) || (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE))) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 4b4beaaa4eaa..a0dee8ae9f97 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -633,8 +633,8 @@ out: return co; } -struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, - struct kmem_cache *slab) +struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab, + void (*sc_free)(struct nfs4_stid *)) { struct nfs4_stid *stid; int new_id; @@ -650,6 +650,8 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, idr_preload_end(); if (new_id < 0) goto out_free; 
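/*
 * [ed: aside -- not part of the patch. nfs4_alloc_stid() now receives the
 * destructor as a parameter and assigns it immediately below, so every
 * stid carries a valid sc_free from the moment it is allocated instead of
 * relying on each caller to fill it in afterwards. The shape of that
 * pattern as a standalone sketch, all names hypothetical:]
 */
#include <stdlib.h>

struct obj {
	void (*free_fn)(struct obj *);	/* plays the role of sc_free */
	int payload;
};

static struct obj *obj_alloc(void (*free_fn)(struct obj *))
{
	struct obj *o = calloc(1, sizeof(*o));

	if (!o)
		return NULL;
	o->free_fn = free_fn;	/* destructor injected at allocation time */
	return o;
}

static void obj_put(struct obj *o)
{
	if (o)
		o->free_fn(o);	/* always set; no window where it is NULL */
}

static void obj_free(struct obj *o)
{
	free(o);
}

int main(void)
{
	obj_put(obj_alloc(obj_free));
	return 0;
}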
+ + stid->sc_free = sc_free; stid->sc_client = cl; stid->sc_stateid.si_opaque.so_id = new_id; stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid; @@ -675,15 +677,12 @@ out_free: static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp) { struct nfs4_stid *stid; - struct nfs4_ol_stateid *stp; - stid = nfs4_alloc_stid(clp, stateid_slab); + stid = nfs4_alloc_stid(clp, stateid_slab, nfs4_free_ol_stateid); if (!stid) return NULL; - stp = openlockstateid(stid); - stp->st_stid.sc_free = nfs4_free_ol_stateid; - return stp; + return openlockstateid(stid); } static void nfs4_free_deleg(struct nfs4_stid *stid) @@ -781,11 +780,10 @@ alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh, goto out_dec; if (delegation_blocked(¤t_fh->fh_handle)) goto out_dec; - dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab)); + dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab, nfs4_free_deleg)); if (dp == NULL) goto out_dec; - dp->dl_stid.sc_free = nfs4_free_deleg; /* * delegation seqid's are never incremented. The 4.1 special * meaning of seqid 0 isn't meaningful, really, but let's avoid @@ -5580,7 +5578,6 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo, stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner); get_nfs4_file(fp); stp->st_stid.sc_file = fp; - stp->st_stid.sc_free = nfs4_free_lock_stateid; stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; @@ -5623,7 +5620,7 @@ find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi, lst = find_lock_stateid(lo, fi); if (lst == NULL) { spin_unlock(&clp->cl_lock); - ns = nfs4_alloc_stid(clp, stateid_slab); + ns = nfs4_alloc_stid(clp, stateid_slab, nfs4_free_lock_stateid); if (ns == NULL) return NULL; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index c2d2895a1ec1..8fae53ce21d1 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -33,6 +33,7 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include <linux/fs_struct.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/namei.h> @@ -57,6 +58,20 @@ #define NFSDDBG_FACILITY NFSDDBG_XDR +u32 nfsd_suppattrs[3][3] = { + {NFSD4_SUPPORTED_ATTRS_WORD0, + NFSD4_SUPPORTED_ATTRS_WORD1, + NFSD4_SUPPORTED_ATTRS_WORD2}, + + {NFSD4_1_SUPPORTED_ATTRS_WORD0, + NFSD4_1_SUPPORTED_ATTRS_WORD1, + NFSD4_1_SUPPORTED_ATTRS_WORD2}, + + {NFSD4_1_SUPPORTED_ATTRS_WORD0, + NFSD4_1_SUPPORTED_ATTRS_WORD1, + NFSD4_2_SUPPORTED_ATTRS_WORD2}, +}; + /* * As per referral draft, the fsid for a referral MUST be different from the fsid of the containing * directory in order to indicate to the client that a filesystem boundary is present @@ -285,7 +300,7 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) static __be32 nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr, struct nfs4_acl **acl, - struct xdr_netobj *label) + struct xdr_netobj *label, int *umask) { int expected_len, len = 0; u32 dummy32; @@ -296,6 +311,14 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, if ((status = nfsd4_decode_bitmap(argp, bmval))) return status; + if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0 + || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1 + || bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2) { + if (nfsd_attrs_supported(argp->minorversion, bmval)) + return nfserr_inval; + return nfserr_attrnotsupp; + } + READ_BUF(4); expected_len = be32_to_cpup(p++); @@ -435,12 +458,18 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, return nfserr_jukebox; } #endif - - if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0 - || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1 - || bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2) - READ_BUF(expected_len - len); - else if (len != expected_len) + if (bmval[2] & FATTR4_WORD2_MODE_UMASK) { + if (!umask) + goto xdr_error; + READ_BUF(8); + len += 8; + dummy32 = be32_to_cpup(p++); + iattr->ia_mode = dummy32 & (S_IFMT | S_IALLUGO); + dummy32 = be32_to_cpup(p++); + *umask = dummy32 & S_IRWXUGO; + iattr->ia_valid |= ATTR_MODE; + } + if (len != expected_len) goto xdr_error; DECODE_TAIL; @@ -634,7 +663,8 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create return status; status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, - &create->cr_acl, &create->cr_label); + &create->cr_acl, &create->cr_label, + ¤t->fs->umask); if (status) goto out; @@ -879,13 +909,15 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) case NFS4_OPEN_NOCREATE: break; case NFS4_OPEN_CREATE: + current->fs->umask = 0; READ_BUF(4); open->op_createmode = be32_to_cpup(p++); switch (open->op_createmode) { case NFS4_CREATE_UNCHECKED: case NFS4_CREATE_GUARDED: status = nfsd4_decode_fattr(argp, open->op_bmval, - &open->op_iattr, &open->op_acl, &open->op_label); + &open->op_iattr, &open->op_acl, &open->op_label, + ¤t->fs->umask); if (status) goto out; break; @@ -899,7 +931,8 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) READ_BUF(NFS4_VERIFIER_SIZE); COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE); status = nfsd4_decode_fattr(argp, open->op_bmval, - &open->op_iattr, &open->op_acl, &open->op_label); + &open->op_iattr, &open->op_acl, &open->op_label, + ¤t->fs->umask); if (status) goto out; break; @@ -1136,7 +1169,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta if (status) return status; return nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, - &setattr->sa_acl, &setattr->sa_label); + 
&setattr->sa_acl, &setattr->sa_label, NULL); } static __be32 @@ -2340,9 +2373,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1); - BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion)); - BUG_ON(bmval1 & ~nfsd_suppattrs1(minorversion)); - BUG_ON(bmval2 & ~nfsd_suppattrs2(minorversion)); + BUG_ON(!nfsd_attrs_supported(minorversion, bmval)); if (exp->ex_fslocs.migrated) { status = fattr_handle_absent_fs(&bmval0, &bmval1, &bmval2, &rdattr_err); @@ -2409,29 +2440,29 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, p++; /* to be backfilled later */ if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { - u32 word0 = nfsd_suppattrs0(minorversion); - u32 word1 = nfsd_suppattrs1(minorversion); - u32 word2 = nfsd_suppattrs2(minorversion); + u32 supp[3]; + + memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp)); if (!IS_POSIXACL(dentry->d_inode)) - word0 &= ~FATTR4_WORD0_ACL; + supp[0] &= ~FATTR4_WORD0_ACL; if (!contextsupport) - word2 &= ~FATTR4_WORD2_SECURITY_LABEL; - if (!word2) { + supp[2] &= ~FATTR4_WORD2_SECURITY_LABEL; + if (!supp[2]) { p = xdr_reserve_space(xdr, 12); if (!p) goto out_resource; *p++ = cpu_to_be32(2); - *p++ = cpu_to_be32(word0); - *p++ = cpu_to_be32(word1); + *p++ = cpu_to_be32(supp[0]); + *p++ = cpu_to_be32(supp[1]); } else { p = xdr_reserve_space(xdr, 16); if (!p) goto out_resource; *p++ = cpu_to_be32(3); - *p++ = cpu_to_be32(word0); - *p++ = cpu_to_be32(word1); - *p++ = cpu_to_be32(word2); + *p++ = cpu_to_be32(supp[0]); + *p++ = cpu_to_be32(supp[1]); + *p++ = cpu_to_be32(supp[2]); } } if (bmval0 & FATTR4_WORD0_TYPE) { @@ -3576,10 +3607,10 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd if (!p) return nfserr_resource; /* - * XXX: By default, the ->readlink() VFS op will truncate symlinks - * if they would overflow the buffer. Is this kosher in NFSv4? If - * not, one easy fix is: if ->readlink() precisely fills the buffer, - * assume that truncation occurred, and return NFS4ERR_RESOURCE. + * XXX: By default, vfs_readlink() will truncate symlinks if they + * would overflow the buffer. Is this kosher in NFSv4? If not, one + * easy fix is: if vfs_readlink() precisely fills the buffer, assume + * that truncation occurred, and return NFS4ERR_RESOURCE. 
*/ nfserr = nfsd_readlink(readlink->rl_rqstp, readlink->rl_fhp, (char *)p, &maxcount); diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 54cde9a5864e..d6b97b424ad1 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -9,6 +9,7 @@ */ #include <linux/slab.h> +#include <linux/vmalloc.h> #include <linux/sunrpc/addr.h> #include <linux/highmem.h> #include <linux/log2.h> @@ -174,8 +175,12 @@ int nfsd_reply_cache_init(void) goto out_nomem; drc_hashtbl = kcalloc(hashsize, sizeof(*drc_hashtbl), GFP_KERNEL); - if (!drc_hashtbl) - goto out_nomem; + if (!drc_hashtbl) { + drc_hashtbl = vzalloc(hashsize * sizeof(*drc_hashtbl)); + if (!drc_hashtbl) + goto out_nomem; + } + for (i = 0; i < hashsize; i++) { INIT_LIST_HEAD(&drc_hashtbl[i].lru_head); spin_lock_init(&drc_hashtbl[i].cache_lock); @@ -204,7 +209,7 @@ void nfsd_reply_cache_shutdown(void) } } - kfree (drc_hashtbl); + kvfree(drc_hashtbl); drc_hashtbl = NULL; drc_hashsize = 0; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 36b2af931e06..f3b2f34b10a3 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -217,7 +217,7 @@ static const struct file_operations pool_stats_operations = { .release = nfsd_pool_stats_release, }; -static struct file_operations reply_cache_stats_operations = { +static const struct file_operations reply_cache_stats_operations = { .open = nfsd_reply_cache_stats_open, .read = seq_read, .llseek = seq_lseek, @@ -1201,7 +1201,7 @@ static int create_proc_exports_entry(void) } #endif -int nfsd_net_id; +unsigned int nfsd_net_id; static __net_init int nfsd_init_net(struct net *net) { diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 9446849888d5..d74c8c44dc35 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -359,44 +359,46 @@ void nfsd_lockd_shutdown(void); #define NFSD4_2_SUPPORTED_ATTRS_WORD2 \ (NFSD4_1_SUPPORTED_ATTRS_WORD2 | \ + FATTR4_WORD2_MODE_UMASK | \ NFSD4_2_SECURITY_ATTRS) -static inline u32 nfsd_suppattrs0(u32 minorversion) -{ - return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0 - : NFSD4_SUPPORTED_ATTRS_WORD0; -} +extern u32 nfsd_suppattrs[3][3]; -static inline u32 nfsd_suppattrs1(u32 minorversion) +static inline bool bmval_is_subset(u32 *bm1, u32 *bm2) { - return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD1 - : NFSD4_SUPPORTED_ATTRS_WORD1; + return !((bm1[0] & ~bm2[0]) || + (bm1[1] & ~bm2[1]) || + (bm1[2] & ~bm2[2])); } -static inline u32 nfsd_suppattrs2(u32 minorversion) +static inline bool nfsd_attrs_supported(u32 minorversion, u32 *bmval) { - switch (minorversion) { - default: return NFSD4_2_SUPPORTED_ATTRS_WORD2; - case 1: return NFSD4_1_SUPPORTED_ATTRS_WORD2; - case 0: return NFSD4_SUPPORTED_ATTRS_WORD2; - } + return bmval_is_subset(bmval, nfsd_suppattrs[minorversion]); } /* These will return ERR_INVAL if specified in GETATTR or READDIR. */ #define NFSD_WRITEONLY_ATTRS_WORD1 \ (FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) -/* These are the only attrs allowed in CREATE/OPEN/SETATTR. */ +/* + * These are the only attrs allowed in CREATE/OPEN/SETATTR. Don't add + * a writeable attribute here without also adding code to parse it to + * nfsd4_decode_fattr(). 
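 * [ed: this patch does exactly that for FATTR4_WORD2_MODE_UMASK: the bit
 * is added to WORD2 below and decoded in nfsd4_decode_fattr(), which
 * stores the client-supplied umask in current->fs->umask]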
+ */ #define NFSD_WRITEABLE_ATTRS_WORD0 \ (FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL) #define NFSD_WRITEABLE_ATTRS_WORD1 \ (FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) #ifdef CONFIG_NFSD_V4_SECURITY_LABEL -#define NFSD_WRITEABLE_ATTRS_WORD2 FATTR4_WORD2_SECURITY_LABEL +#define MAYBE_FATTR4_WORD2_SECURITY_LABEL \ + FATTR4_WORD2_SECURITY_LABEL #else -#define NFSD_WRITEABLE_ATTRS_WORD2 0 +#define MAYBE_FATTR4_WORD2_SECURITY_LABEL 0 #endif +#define NFSD_WRITEABLE_ATTRS_WORD2 \ + (FATTR4_WORD2_MODE_UMASK \ + | MAYBE_FATTR4_WORD2_SECURITY_LABEL) #define NFSD_SUPPATTR_EXCLCREAT_WORD0 \ NFSD_WRITEABLE_ATTRS_WORD0 diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index a2b65fc56dd6..e6bfd96734c0 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -661,8 +661,8 @@ nfsd(void *vrqstp) mutex_lock(&nfsd_mutex); /* At this point, the thread shares current->fs - * with the init process. We need to create files with a - * umask of 0 instead of init's umask. */ + * with the init process. We need to create files with the + * umask as defined by the client instead of init's umask. */ if (unshare_fs_struct() < 0) { printk("Unable to start nfsd thread: out of memory\n"); goto out; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index c9399366f9df..4516e8b7d776 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -603,8 +603,8 @@ extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s, struct nfsd_net *nn); -struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, - struct kmem_cache *slab); +struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab, + void (*sc_free)(struct nfs4_stid *)); void nfs4_unhash_stid(struct nfs4_stid *s); void nfs4_put_stid(struct nfs4_stid *s); void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 8ca642fe9b21..26c6fdb4bf67 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -26,7 +26,7 @@ #include <linux/jhash.h> #include <linux/ima.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/exportfs.h> #include <linux/writeback.h> #include <linux/security.h> @@ -509,8 +509,7 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst, u64 dst_pos, u64 count) { - return nfserrno(vfs_clone_file_range(src, src_pos, dst, dst_pos, - count)); + return nfserrno(do_clone_file_range(src, src_pos, dst, dst_pos, count)); } ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst, @@ -1451,7 +1450,6 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) { - struct inode *inode; mm_segment_t oldfs; __be32 err; int host_err; @@ -1463,10 +1461,9 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) path.mnt = fhp->fh_export->ex_path.mnt; path.dentry = fhp->fh_dentry; - inode = d_inode(path.dentry); err = nfserr_inval; - if (!inode->i_op->readlink) + if (!d_is_symlink(path.dentry)) goto out; touch_atime(&path); @@ -1475,7 +1472,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) */ oldfs = get_fs(); set_fs(KERNEL_DS); - host_err = inode->i_op->readlink(path.dentry, (char __user *)buf, 
*lenp); + host_err = vfs_readlink(path.dentry, (char __user *)buf, *lenp); set_fs(oldfs); if (host_err < 0) diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 2b71c60fe982..515d13c196da 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -568,7 +568,6 @@ const struct inode_operations nilfs_special_inode_operations = { }; const struct inode_operations nilfs_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = page_get_link, .permission = nilfs_permission, }; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index c95d369e90aa..12eeae62a2b1 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -189,7 +189,7 @@ static int nilfs_sync_super(struct super_block *sb, int flag) set_buffer_dirty(nilfs->ns_sbh[0]); if (nilfs_test_opt(nilfs, BARRIER)) { err = __sync_dirty_buffer(nilfs->ns_sbh[0], - WRITE_SYNC | WRITE_FLUSH_FUA); + REQ_SYNC | REQ_PREFLUSH | REQ_FUA); } else { err = sync_dirty_buffer(nilfs->ns_sbh[0]); } diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 6faaf710e563..5a4ec309e283 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -85,7 +85,7 @@ static int dnotify_handle_event(struct fsnotify_group *group, struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie) { struct dnotify_mark *dn_mark; diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index e0e5f7c3c99f..bbc175d4213d 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -90,10 +90,10 @@ static int fanotify_get_response(struct fsnotify_group *group, static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmnt_mark, u32 event_mask, - void *data, int data_type) + const void *data, int data_type) { __u32 marks_mask, marks_ignored_mask; - struct path *path = data; + const struct path *path = data; pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p" " data_type=%d\n", __func__, inode_mark, vfsmnt_mark, @@ -140,7 +140,7 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, } struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, - struct path *path) + const struct path *path) { struct fanotify_event_info *event; @@ -177,7 +177,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *fanotify_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie) { int ret = 0; diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 2a5fb14115df..4500a74f8d38 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -47,4 +47,4 @@ static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) } struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, - struct path *path); + const struct path *path); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index db39de2dd4cb..b41515d3f081 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -86,7 +86,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode) } /* Notify this dentry's parent about a child's events. 
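 * [ed: the signature changes below are part of a wider constification
 * pass: the path and event data handed to fsnotify backends are now
 * passed as const pointers end to end]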
*/ -int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask) +int __fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask) { struct dentry *parent; struct inode *p_inode; @@ -125,7 +125,7 @@ EXPORT_SYMBOL_GPL(__fsnotify_parent); static int send_to_group(struct inode *to_tell, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, + __u32 mask, const void *data, int data_is, u32 cookie, const unsigned char *file_name) { @@ -187,7 +187,7 @@ static int send_to_group(struct inode *to_tell, * out to all of the registered fsnotify_group. Those groups can then use the * notification event in whatever means they feel necessary. */ -int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, +int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, const unsigned char *file_name, u32 cookie) { struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; @@ -199,7 +199,7 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); if (data_is == FSNOTIFY_EVENT_PATH) - mnt = real_mount(((struct path *)data)->mnt); + mnt = real_mount(((const struct path *)data)->mnt); else mnt = NULL; diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 741077deef3b..a3645249f7ec 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -150,12 +150,10 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, */ void fsnotify_unmount_inodes(struct super_block *sb) { - struct inode *inode, *next_i, *need_iput = NULL; + struct inode *inode, *iput_inode = NULL; spin_lock(&sb->s_inode_list_lock); - list_for_each_entry_safe(inode, next_i, &sb->s_inodes, i_sb_list) { - struct inode *need_iput_tmp; - + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { /* * We cannot __iget() an inode in state I_FREEING, * I_WILL_FREE, or I_NEW which is fine because by that point @@ -178,49 +176,24 @@ void fsnotify_unmount_inodes(struct super_block *sb) continue; } - need_iput_tmp = need_iput; - need_iput = NULL; - - /* In case fsnotify_inode_delete() drops a reference. */ - if (inode != need_iput_tmp) - __iget(inode); - else - need_iput_tmp = NULL; + __iget(inode); spin_unlock(&inode->i_lock); - - /* In case the dropping of a reference would nuke next_i. */ - while (&next_i->i_sb_list != &sb->s_inodes) { - spin_lock(&next_i->i_lock); - if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) && - atomic_read(&next_i->i_count)) { - __iget(next_i); - need_iput = next_i; - spin_unlock(&next_i->i_lock); - break; - } - spin_unlock(&next_i->i_lock); - next_i = list_next_entry(next_i, i_sb_list); - } - - /* - * We can safely drop s_inode_list_lock here because either - * we actually hold references on both inode and next_i or - * end of list. Also no new inodes will be added since the - * umount has begun. 
- */ spin_unlock(&sb->s_inode_list_lock); - if (need_iput_tmp) - iput(need_iput_tmp); + if (iput_inode) + iput(iput_inode); /* for each watch, send FS_UNMOUNT and then remove it */ fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0); fsnotify_inode_delete(inode); - iput(inode); + iput_inode = inode; spin_lock(&sb->s_inode_list_lock); } spin_unlock(&sb->s_inode_list_lock); + + if (iput_inode) + iput(iput_inode); } diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index ed855ef6f077..a6f5907a3fee 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -26,7 +26,7 @@ extern int inotify_handle_event(struct fsnotify_group *group, struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie); extern const struct fsnotify_ops inotify_fsnotify_ops; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 2cd900c2c737..19e7ec109a75 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -66,7 +66,7 @@ int inotify_handle_event(struct fsnotify_group *group, struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie) { struct inotify_inode_mark *i_mark; @@ -80,7 +80,7 @@ int inotify_handle_event(struct fsnotify_group *group, if ((inode_mark->mask & FS_EXCL_UNLINK) && (data_type == FSNOTIFY_EVENT_PATH)) { - struct path *path = data; + const struct path *path = data; if (d_unlinked(path->dentry)) return 0; diff --git a/fs/notify/mark.c b/fs/notify/mark.c index d3fea0bd89e2..6043306e8e21 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -510,18 +510,6 @@ void fsnotify_detach_group_marks(struct fsnotify_group *group) } } -void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old) -{ - assert_spin_locked(&old->lock); - new->inode = old->inode; - new->mnt = old->mnt; - if (old->group) - fsnotify_get_group(old->group); - new->group = old->group; - new->mask = old->mask; - new->free_mark = old->free_mark; -} - /* * Nothing fancy, just initialize lists and locks and counters. */ diff --git a/fs/nsfs.c b/fs/nsfs.c index 8718af895eab..8c9fb29c6673 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -118,7 +118,7 @@ again: return ret; } -static int open_related_ns(struct ns_common *ns, +int open_related_ns(struct ns_common *ns, struct ns_common *(*get_ns)(struct ns_common *ns)) { struct path path = {}; diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index fe251f187ff8..cc91856b5e2d 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -29,6 +29,7 @@ #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/bit_spinlock.h> +#include <linux/bio.h> #include "aops.h" #include "attrib.h" @@ -764,7 +765,7 @@ lock_retry_remap: } // TODO: Instantiate the hole. // clear_buffer_new(bh); - // unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + // clean_bdev_bh_alias(bh); ntfs_error(vol->sb, "Writing into sparse regions is " "not supported yet. 
Sorry."); err = -EOPNOTSUPP; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index bf72a2c58b75..358ed7e1195a 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -30,7 +30,7 @@ #include <linux/writeback.h> #include <asm/page.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "attrib.h" #include "bitmap.h" @@ -740,8 +740,7 @@ map_buffer_cached: set_buffer_uptodate(bh); if (unlikely(was_hole)) { /* We allocated the buffer. */ - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); if (bh_end <= pos || bh_pos >= end) mark_buffer_dirty(bh); else @@ -784,7 +783,7 @@ map_buffer_cached: continue; } /* We allocated the buffer. */ - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + clean_bdev_bh_alias(bh); /* * If the buffer is fully outside the write, zero it, * set it uptodate, and mark it dirty so it gets diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c index 761f12f7f3ef..353379ff6057 100644 --- a/fs/ntfs/logfile.c +++ b/fs/ntfs/logfile.c @@ -27,6 +27,7 @@ #include <linux/buffer_head.h> #include <linux/bitops.h> #include <linux/log2.h> +#include <linux/bio.h> #include "attrib.h" #include "aops.h" diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index d3c009626032..b6f402194f02 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -23,6 +23,7 @@ #include <linux/buffer_head.h> #include <linux/slab.h> #include <linux/swap.h> +#include <linux/bio.h> #include "attrib.h" #include "aops.h" diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index f72712f6c28d..d4ec0d8961a6 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5194,7 +5194,7 @@ int ocfs2_change_extent_flag(handle_t *handle, rec = &el->l_recs[index]; if (new_flags && (rec->e_flags & new_flags)) { mlog(ML_ERROR, "Owner %llu tried to set %d flags on an " - "extent that already had them", + "extent that already had them\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), new_flags); goto out; @@ -5202,7 +5202,7 @@ int ocfs2_change_extent_flag(handle_t *handle, if (clear_flags && !(rec->e_flags & clear_flags)) { mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an " - "extent that didn't have them", + "extent that didn't have them\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), clear_flags); goto out; @@ -5713,8 +5713,7 @@ int ocfs2_remove_btree_range(struct inode *inode, struct ocfs2_refcount_tree *ref_tree = NULL; if ((flags & OCFS2_EXT_REFCOUNTED) && len) { - BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & - OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); if (!refcount_tree_locked) { ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index c5c5b9748ea3..11556b7d93ec 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -464,6 +464,15 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) trace_ocfs2_bmap((unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)block); + /* + * The swap code (ab-)uses ->bmap to get a block mapping and then + * bypasseÑ• the file system for actual I/O. We really can't allow + * that on refcounted inodes, so we have to skip out here. And yes, + * 0 is the magic code for a bmap error.. + */ + if (ocfs2_is_refcount_inode(inode)) + return 0; + /* We don't need to lock journal system files, since they aren't * accessed concurrently from multiple nodes. 
*/ @@ -630,7 +639,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, if (!buffer_mapped(bh)) { map_bh(bh, inode->i_sb, *p_blkno); - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + clean_bdev_bh_alias(bh); } if (PageUptodate(page)) { @@ -1950,8 +1959,7 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos, } int ocfs2_write_end_nolock(struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + loff_t pos, unsigned len, unsigned copied, void *fsdata) { int i, ret; unsigned from, to, start = pos & (PAGE_SIZE - 1); @@ -2064,7 +2072,7 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping, int ret; struct inode *inode = mapping->host; - ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata); + ret = ocfs2_write_end_nolock(mapping, pos, len, copied, fsdata); up_write(&OCFS2_I(inode)->ip_alloc_sem); ocfs2_inode_unlock(inode, 1); @@ -2241,7 +2249,7 @@ static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock, dwc->dw_zero_count++; } - ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc); + ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc); BUG_ON(ret != len); ret = 0; unlock: @@ -2254,10 +2262,10 @@ out: return ret; } -static void ocfs2_dio_end_io_write(struct inode *inode, - struct ocfs2_dio_write_ctxt *dwc, - loff_t offset, - ssize_t bytes) +static int ocfs2_dio_end_io_write(struct inode *inode, + struct ocfs2_dio_write_ctxt *dwc, + loff_t offset, + ssize_t bytes) { struct ocfs2_cached_dealloc_ctxt dealloc; struct ocfs2_extent_tree et; @@ -2308,7 +2316,7 @@ static void ocfs2_dio_end_io_write(struct inode *inode, mlog_errno(ret); } - di = (struct ocfs2_dinode *)di_bh; + di = (struct ocfs2_dinode *)di_bh->b_data; ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); @@ -2365,6 +2373,8 @@ out: if (locked) inode_unlock(inode); ocfs2_dio_free_write_ctx(inode, dwc); + + return ret; } /* @@ -2379,21 +2389,19 @@ static int ocfs2_dio_end_io(struct kiocb *iocb, { struct inode *inode = file_inode(iocb->ki_filp); int level; - - if (bytes <= 0) - return 0; + int ret = 0; /* this io's submitter should not have unlocked this before we could */ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); - if (private) - ocfs2_dio_end_io_write(inode, private, offset, bytes); + if (bytes > 0 && private) + ret = ocfs2_dio_end_io_write(inode, private, offset, bytes); ocfs2_iocb_clear_rw_locked(iocb); level = ocfs2_iocb_rw_locked_level(iocb); ocfs2_rw_unlock(inode, level); - return 0; + return ret; } static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index b1c9f28a57b1..8614ff069d99 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -44,8 +44,7 @@ int walk_page_buffers( handle_t *handle, struct buffer_head *bh)); int ocfs2_write_end_nolock(struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata); + loff_t pos, unsigned len, unsigned copied, void *fsdata); typedef enum { OCFS2_WRITE_BUFFER = 0, diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 8f040f88ade4..d9ebe11c8990 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -26,6 +26,7 @@ #include <linux/fs.h> #include <linux/types.h> #include <linux/highmem.h> +#include <linux/bio.h> #include <cluster/masklog.h> diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 636abcbd4650..f6e871760f8d 100644 --- 
a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -627,7 +627,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg, slot = o2nm_this_node(); bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1, REQ_OP_WRITE, - WRITE_SYNC); + REQ_SYNC); if (IS_ERR(bio)) { status = PTR_ERR(bio); mlog_errno(status); @@ -741,7 +741,7 @@ static inline void o2hb_prepare_block(struct o2hb_region *reg, hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block; memset(hb_block, 0, reg->hr_block_bytes); /* TODO: time stuff */ - cputime = CURRENT_TIME.tv_sec; + cputime = ktime_get_real_seconds(); if (!cputime) cputime = 1; @@ -1250,7 +1250,7 @@ static int o2hb_thread(void *data) mlog(ML_HEARTBEAT, "start = %lld, end = %lld, msec = %u, ret = %d\n", - before_hb.tv64, after_hb.tv64, elapsed_msec, ret); + before_hb, after_hb, elapsed_msec, ret); if (!kthread_should_stop() && elapsed_msec < reg->hr_timeout_ms) { diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index dfe162f5fd4c..d331c2386b94 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -24,7 +24,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/string.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "masklog.h" diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 8abab16b4602..d4b5c81f0445 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -62,7 +62,7 @@ #include <linux/export.h> #include <net/tcp.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "heartbeat.h" #include "tcp.h" diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 3f828a187049..a464c8088170 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -1609,8 +1609,6 @@ way_up_top: __dlm_insert_mle(dlm, mle); response = DLM_MASTER_RESP_NO; } else { - // mlog(0, "mle was found\n"); - set_maybe = 1; spin_lock(&tmpmle->spinlock); if (tmpmle->master == dlm->node_num) { mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n"); @@ -1625,8 +1623,7 @@ way_up_top: response = DLM_MASTER_RESP_NO; } else response = DLM_MASTER_RESP_MAYBE; - if (set_maybe) - set_bit(request->node_idx, tmpmle->maybe_map); + set_bit(request->node_idx, tmpmle->maybe_map); spin_unlock(&tmpmle->spinlock); } spin_unlock(&dlm->master_lock); @@ -1644,12 +1641,6 @@ send_response: * dlm_assert_master_worker() isn't called, we drop it here. 
*/ if (dispatch_assert) { - if (response != DLM_MASTER_RESP_YES) - mlog(ML_ERROR, "invalid response %d\n", response); - if (!res) { - mlog(ML_ERROR, "bad lockres while trying to assert!\n"); - BUG(); - } mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", dlm->node_num, res->lockname.len, res->lockname.name); spin_lock(&res->spinlock); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index dd5cb8bcefd1..74407c6dd592 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2966,8 +2966,6 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data, spin_unlock(&dlm->spinlock); dlm_kick_recovery_thread(dlm); break; - default: - BUG(); } mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n", diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 1079fae5aa12..9ab9e1892b5f 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -45,7 +45,7 @@ #include <linux/backing-dev.h> #include <linux/poll.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "stackglue.h" #include "userdlm.h" diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 83d576f6a287..77d1632e905d 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3303,6 +3303,16 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb, mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name, lockres->l_level, new_level); + /* + * On DLM_LKF_VALBLK, fsdlm behaves differently with o2cb. It always + * expects DLM_LKF_VALBLK being set if the LKB has LVB, so that + * we can recover correctly from node failure. Otherwise, we may get + * invalid LVB in LKB, but without DLM_SBF_VALNOTVALID being set. + */ + if (!ocfs2_is_o2cb_active() && + lockres->l_ops->flags & LOCK_TYPE_USES_LVB) + lvb = 1; + if (lvb) dlm_flags |= DLM_LKF_VALBLK; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 000c234d7bbd..c4889655d32b 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1030,7 +1030,7 @@ int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, * Only quota files call this without a bh, and they can't be * refcounted. */ - BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!di_bh && ocfs2_is_refcount_inode(inode)); BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE)); clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); @@ -1667,9 +1667,9 @@ static void ocfs2_calc_trunc_pos(struct inode *inode, *done = ret; } -static int ocfs2_remove_inode_range(struct inode *inode, - struct buffer_head *di_bh, u64 byte_start, - u64 byte_len) +int ocfs2_remove_inode_range(struct inode *inode, + struct buffer_head *di_bh, u64 byte_start, + u64 byte_len) { int ret = 0, flags = 0, done = 0, i; u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos; @@ -1719,8 +1719,7 @@ static int ocfs2_remove_inode_range(struct inode *inode, * within one cluster(means is not exactly aligned to clustersize). 
*/ - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) { - + if (ocfs2_is_refcount_inode(inode)) { ret = ocfs2_cow_file_pos(inode, di_bh, byte_start); if (ret) { mlog_errno(ret); @@ -2036,7 +2035,7 @@ int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, struct super_block *sb = inode->i_sb; if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) || - !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) || + !ocfs2_is_refcount_inode(inode) || OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) return 0; @@ -2440,6 +2439,31 @@ out: return offset; } +static int ocfs2_file_clone_range(struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len) +{ + return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out, + len, false); +} + +static ssize_t ocfs2_file_dedupe_range(struct file *src_file, + u64 loff, + u64 len, + struct file *dst_file, + u64 dst_loff) +{ + int error; + + error = ocfs2_reflink_remap_range(src_file, loff, dst_file, dst_loff, + len, true); + if (error) + return error; + return len; +} + const struct inode_operations ocfs2_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, @@ -2479,6 +2503,8 @@ const struct file_operations ocfs2_fops = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, + .clone_file_range = ocfs2_file_clone_range, + .dedupe_file_range = ocfs2_file_dedupe_range, }; const struct file_operations ocfs2_dops = { @@ -2524,6 +2550,8 @@ const struct file_operations ocfs2_fops_no_plocks = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, + .clone_file_range = ocfs2_file_clone_range, + .dedupe_file_range = ocfs2_file_dedupe_range, }; const struct file_operations ocfs2_dops_no_plocks = { diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index e8c62f22215c..897fd9a2e51d 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -82,4 +82,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd, int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, size_t count); +int ocfs2_remove_inode_range(struct inode *inode, + struct buffer_head *di_bh, u64 byte_start, + u64 byte_len); #endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index c56a7679df93..382401d3e88f 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -703,7 +703,7 @@ static int ocfs2_remove_inode(struct inode *inode, goto bail_commit; } - di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); + di->i_dtime = cpu_to_le64(ktime_get_real_seconds()); di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 5af68fcdf9d3..9b955f732bca 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -181,4 +181,10 @@ static inline struct ocfs2_inode_info *cache_info_to_inode(struct ocfs2_caching_ return container_of(ci, struct ocfs2_inode_info, ip_metadata_cache); } +/* Does this inode have the reflink flag set? 
*/ +static inline bool ocfs2_is_refcount_inode(struct inode *inode) +{ + return (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); +} + #endif /* OCFS2_INODE_H */ diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index a244f14c6b87..d5e5fa7f0743 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1947,7 +1947,7 @@ static void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) */ seqno++; os->os_count++; - os->os_scantime = CURRENT_TIME; + os->os_scantime = ktime_get_seconds(); unlock: ocfs2_orphan_scan_unlock(osb, seqno); out: @@ -2004,7 +2004,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb) struct ocfs2_orphan_scan *os; os = &osb->osb_orphan_scan; - os->os_scantime = CURRENT_TIME; + os->os_scantime = ktime_get_seconds(); if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb)) atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); else { diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 71545ad4628c..429088786e93 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -120,8 +120,7 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, ret = VM_FAULT_NOPAGE; goto out; } - ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, - fsdata); + ret = ocfs2_write_end_nolock(mapping, pos, len, len, fsdata); BUG_ON(ret != len); ret = VM_FAULT_LOCKED; out: diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 4e8f32eb0bdb..e52a2852d50d 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -235,10 +235,7 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) { - - BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & - OCFS2_HAS_REFCOUNT_FL)); - + BUG_ON(!ocfs2_is_refcount_inode(inode)); BUG_ON(!context->refcount_loc); ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, @@ -581,10 +578,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) { - - BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & - OCFS2_HAS_REFCOUNT_FL)); - + BUG_ON(!ocfs2_is_refcount_inode(inode)); BUG_ON(!context->refcount_loc); ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 8d887c75765c..3b0a10d9b36f 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -516,6 +516,7 @@ static int __ocfs2_mknod_locked(struct inode *dir, struct ocfs2_extent_list *fel; u16 feat; struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct timespec64 ts; *new_fe_bh = NULL; @@ -564,10 +565,11 @@ static int __ocfs2_mknod_locked(struct inode *dir, fe->i_last_eb_blk = 0; strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL); + ktime_get_real_ts64(&ts); fe->i_atime = fe->i_ctime = fe->i_mtime = - cpu_to_le64(CURRENT_TIME.tv_sec); + cpu_to_le64(ts.tv_sec); fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec = - cpu_to_le32(CURRENT_TIME.tv_nsec); + cpu_to_le32(ts.tv_nsec); fe->i_dtime = 0; /* diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index e63af7ddfe68..7e5958b0be6b 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -224,7 +224,7 @@ struct ocfs2_orphan_scan { struct ocfs2_super *os_osb; struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */ struct delayed_work os_orphan_scan_work; - struct timespec os_scantime; /* time this node ran the scan */ + time64_t os_scantime; /* time this 
node ran the scan */ u32 os_count; /* tracks node specific scans */ u32 os_seqno; /* tracks cluster wide scans */ atomic_t os_state; /* ACTIVE or INACTIVE */ diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 87e577a49b0d..cec495a921e3 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -634,7 +634,15 @@ static void qsync_work_fn(struct work_struct *work) dqi_sync_work.work); struct super_block *sb = oinfo->dqi_gqinode->i_sb; - dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type); + /* + * We have to be careful here not to deadlock on s_umount as umount + * disabling quotas may be in progress and it waits for this work to + * complete. If trylock fails, we'll do the sync next time... + */ + if (down_read_trylock(&sb->s_umount)) { + dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type); + up_read(&sb->s_umount); + } schedule_delayed_work(&oinfo->dqi_sync_work, msecs_to_jiffies(oinfo->dqi_syncms)); } diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 8a54fd8a4fa5..32c5a40c1257 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -454,7 +454,7 @@ out: /* Sync changes in local quota file into global quota file and * reinitialize local quota file. * The function expects local quota file to be already locked and - * dqonoff_mutex locked. */ + * s_umount locked in shared mode. */ static int ocfs2_recover_local_quota_file(struct inode *lqinode, int type, struct ocfs2_quota_recovery *rec) @@ -597,7 +597,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for " "slot %u\n", osb->dev_str, slot_num); - mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + down_read(&sb->s_umount); for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (list_empty(&(rec->r_list[type]))) continue; @@ -674,7 +674,7 @@ out_put: break; } out: - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + up_read(&sb->s_umount); kfree(rec); return status; } @@ -840,7 +840,10 @@ static int ocfs2_local_free_info(struct super_block *sb, int type) } ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk); - /* dqonoff_mutex protects us against racing with recovery thread... */ + /* + * s_umount held in exclusive mode protects us against racing with + * recovery thread... 
+ */ if (oinfo->dqi_rec) { ocfs2_free_quota_recovery(oinfo->dqi_rec); mark_clean = 0; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 19238512a324..f8933cb53d68 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -34,6 +34,7 @@ #include "xattr.h" #include "namei.h" #include "ocfs2_trace.h" +#include "file.h" #include <linux/bio.h> #include <linux/blkdev.h> @@ -410,7 +411,7 @@ static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno) goto out; } - BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); di = (struct ocfs2_dinode *)di_bh->b_data; *ref_blkno = le64_to_cpu(di->i_refcount_loc); @@ -478,7 +479,6 @@ again: if (ret) { mlog_errno(ret); ocfs2_unlock_refcount_tree(osb, tree, rw); - ocfs2_refcount_tree_put(tree); goto out; } @@ -570,7 +570,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode, u32 num_got; u64 suballoc_loc, first_blkno; - BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); + BUG_ON(ocfs2_is_refcount_inode(inode)); trace_ocfs2_create_refcount_tree( (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -708,7 +708,7 @@ static int ocfs2_set_refcount_tree(struct inode *inode, struct ocfs2_refcount_block *rb; struct ocfs2_refcount_tree *ref_tree; - BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); + BUG_ON(ocfs2_is_refcount_inode(inode)); ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, &ref_tree, &ref_root_bh); @@ -775,7 +775,7 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc); u16 bit = 0; - if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) + if (!ocfs2_is_refcount_inode(inode)) return 0; BUG_ON(!ref_blkno); @@ -2299,11 +2299,10 @@ int ocfs2_decrease_refcount(struct inode *inode, { int ret; u64 ref_blkno; - struct ocfs2_inode_info *oi = OCFS2_I(inode); struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_tree *tree; - BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); ret = ocfs2_get_refcount_block(inode, &ref_blkno); if (ret) { @@ -2533,7 +2532,6 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, int *ref_blocks) { int ret; - struct ocfs2_inode_info *oi = OCFS2_I(inode); struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_tree *tree; u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); @@ -2544,7 +2542,7 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, goto out; } - BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), refcount_loc, &tree); @@ -3412,14 +3410,13 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode, { int ret; u32 cow_start = 0, cow_len = 0; - struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_tree *ref_tree; struct ocfs2_cow_context *context = NULL; - BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list, cpos, write_len, max_cpos, @@ -3629,11 +3626,10 @@ int ocfs2_refcount_cow_xattr(struct inode *inode, { int ret; struct ocfs2_xattr_value_root *xv = vb->vb_xv; - struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_cow_context *context = NULL; 
u32 cow_start, cow_len; - BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list, cpos, write_len, UINT_MAX, @@ -3696,6 +3692,9 @@ int ocfs2_add_refcount_flag(struct inode *inode, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_alloc_context *meta_ac = NULL; + /* We need to be able to handle at least an extent tree split. */ + ref_blocks = ocfs2_extend_meta_needed(data_et->et_root_el); + ret = ocfs2_calc_refcount_meta_credits(inode->i_sb, ref_ci, ref_root_bh, p_cluster, num_clusters, @@ -3807,7 +3806,7 @@ static int ocfs2_attach_refcount_tree(struct inode *inode, ocfs2_init_dealloc_ctxt(&dealloc); - if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) { + if (!ocfs2_is_refcount_inode(inode)) { ret = ocfs2_create_refcount_tree(inode, di_bh); if (ret) { mlog_errno(ret); @@ -3934,6 +3933,13 @@ static int ocfs2_add_refcounted_extent(struct inode *inode, ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh, p_cluster, num_clusters, meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, num_clusters)); if (ret) mlog_errno(ret); @@ -4442,3 +4448,434 @@ out: return error; } + +/* Update destination inode size, if necessary. */ +static int ocfs2_reflink_update_dest(struct inode *dest, + struct buffer_head *d_bh, + loff_t newlen) +{ + handle_t *handle; + int ret; + + dest->i_blocks = ocfs2_inode_sector_count(dest); + + if (newlen <= i_size_read(dest)) + return 0; + + handle = ocfs2_start_trans(OCFS2_SB(dest->i_sb), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + return ret; + } + + /* Extend i_size if needed. */ + spin_lock(&OCFS2_I(dest)->ip_lock); + if (newlen > i_size_read(dest)) + i_size_write(dest, newlen); + spin_unlock(&OCFS2_I(dest)->ip_lock); + dest->i_ctime = dest->i_mtime = current_time(dest); + + ret = ocfs2_mark_inode_dirty(handle, dest, d_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + +out_commit: + ocfs2_commit_trans(OCFS2_SB(dest->i_sb), handle); + return ret; +} + +/* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */ +static int ocfs2_reflink_remap_extent(struct inode *s_inode, + struct buffer_head *s_bh, + loff_t pos_in, + struct inode *t_inode, + struct buffer_head *t_bh, + loff_t pos_out, + loff_t len, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + struct ocfs2_extent_tree s_et; + struct ocfs2_extent_tree t_et; + struct ocfs2_dinode *dis; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_tree *ref_tree; + struct ocfs2_super *osb; + loff_t pstart, plen; + u32 p_cluster, num_clusters, slast, spos, tpos; + unsigned int ext_flags; + int ret = 0; + + osb = OCFS2_SB(s_inode->i_sb); + dis = (struct ocfs2_dinode *)s_bh->b_data; + ocfs2_init_dinode_extent_tree(&s_et, INODE_CACHE(s_inode), s_bh); + ocfs2_init_dinode_extent_tree(&t_et, INODE_CACHE(t_inode), t_bh); + + spos = ocfs2_bytes_to_clusters(s_inode->i_sb, pos_in); + tpos = ocfs2_bytes_to_clusters(t_inode->i_sb, pos_out); + slast = ocfs2_clusters_for_bytes(s_inode->i_sb, pos_in + len); + + while (spos < slast) { + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } + + /* Look up the extent. 
*/ + ret = ocfs2_get_clusters(s_inode, spos, &p_cluster, + &num_clusters, &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + num_clusters = min_t(u32, num_clusters, slast - spos); + + /* Punch out the dest range. */ + pstart = ocfs2_clusters_to_bytes(t_inode->i_sb, tpos); + plen = ocfs2_clusters_to_bytes(t_inode->i_sb, num_clusters); + ret = ocfs2_remove_inode_range(t_inode, t_bh, pstart, plen); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (p_cluster == 0) + goto next_loop; + + /* Lock the refcount btree... */ + ret = ocfs2_lock_refcount_tree(osb, + le64_to_cpu(dis->i_refcount_loc), + 1, &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Mark s_inode's extent as refcounted. */ + if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) { + ret = ocfs2_add_refcount_flag(s_inode, &s_et, + &ref_tree->rf_ci, + ref_root_bh, spos, + p_cluster, num_clusters, + dealloc, NULL); + if (ret) { + mlog_errno(ret); + goto out_unlock_refcount; + } + } + + /* Map in the new extent. */ + ext_flags |= OCFS2_EXT_REFCOUNTED; + ret = ocfs2_add_refcounted_extent(t_inode, &t_et, + &ref_tree->rf_ci, + ref_root_bh, + tpos, p_cluster, + num_clusters, + ext_flags, + dealloc); + if (ret) { + mlog_errno(ret); + goto out_unlock_refcount; + } + + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); +next_loop: + spos += num_clusters; + tpos += num_clusters; + } + +out: + return ret; +out_unlock_refcount: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); + return ret; +} + +/* Set up refcount tree and remap s_inode to t_inode. */ +static int ocfs2_reflink_remap_blocks(struct inode *s_inode, + struct buffer_head *s_bh, + loff_t pos_in, + struct inode *t_inode, + struct buffer_head *t_bh, + loff_t pos_out, + loff_t len) +{ + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_super *osb; + struct ocfs2_dinode *dis; + struct ocfs2_dinode *dit; + int ret; + + osb = OCFS2_SB(s_inode->i_sb); + dis = (struct ocfs2_dinode *)s_bh->b_data; + dit = (struct ocfs2_dinode *)t_bh->b_data; + ocfs2_init_dealloc_ctxt(&dealloc); + + /* + * If we're reflinking the entire file and the source is inline + * data, just copy the contents. + */ + if (pos_in == pos_out && pos_in == 0 && len == i_size_read(s_inode) && + i_size_read(t_inode) <= len && + (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) { + ret = ocfs2_duplicate_inline_data(s_inode, s_bh, t_inode, t_bh); + if (ret) + mlog_errno(ret); + goto out; + } + + /* + * If both inodes belong to two different refcount groups then + * forget it because we don't know how (or want) to go merging + * refcount trees. + */ + ret = -EOPNOTSUPP; + if (ocfs2_is_refcount_inode(s_inode) && + ocfs2_is_refcount_inode(t_inode) && + le64_to_cpu(dis->i_refcount_loc) != + le64_to_cpu(dit->i_refcount_loc)) + goto out; + + /* Neither inode has a refcount tree. Add one to s_inode. */ + if (!ocfs2_is_refcount_inode(s_inode) && + !ocfs2_is_refcount_inode(t_inode)) { + ret = ocfs2_create_refcount_tree(s_inode, s_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* Ensure that both inodes end up with the same refcount tree. 
*/ + if (!ocfs2_is_refcount_inode(s_inode)) { + ret = ocfs2_set_refcount_tree(s_inode, s_bh, + le64_to_cpu(dit->i_refcount_loc)); + if (ret) { + mlog_errno(ret); + goto out; + } + } + if (!ocfs2_is_refcount_inode(t_inode)) { + ret = ocfs2_set_refcount_tree(t_inode, t_bh, + le64_to_cpu(dis->i_refcount_loc)); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* Turn off inline data in the dest file. */ + if (OCFS2_I(t_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = ocfs2_convert_inline_data_to_extents(t_inode, t_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* Actually remap extents now. */ + ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh, + pos_out, len, &dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + +out: + if (ocfs2_dealloc_has_cluster(&dealloc)) { + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + } + + return ret; +} + +/* Lock an inode and grab a bh pointing to the inode. */ +static int ocfs2_reflink_inodes_lock(struct inode *s_inode, + struct buffer_head **bh1, + struct inode *t_inode, + struct buffer_head **bh2) +{ + struct inode *inode1; + struct inode *inode2; + struct ocfs2_inode_info *oi1; + struct ocfs2_inode_info *oi2; + bool same_inode = (s_inode == t_inode); + int status; + + /* First grab the VFS and rw locks. */ + lock_two_nondirectories(s_inode, t_inode); + inode1 = s_inode; + inode2 = t_inode; + if (inode1->i_ino > inode2->i_ino) + swap(inode1, inode2); + + status = ocfs2_rw_lock(inode1, 1); + if (status) { + mlog_errno(status); + goto out_i1; + } + if (!same_inode) { + status = ocfs2_rw_lock(inode2, 1); + if (status) { + mlog_errno(status); + goto out_i2; + } + } + + /* Now go for the cluster locks */ + oi1 = OCFS2_I(inode1); + oi2 = OCFS2_I(inode2); + + trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno, + (unsigned long long)oi2->ip_blkno); + + if (*bh1) + *bh1 = NULL; + if (*bh2) + *bh2 = NULL; + + /* We always want to lock the one with the lower lockid first. */ + if (oi1->ip_blkno > oi2->ip_blkno) + mlog_errno(-ENOLCK); + + /* lock id1 */ + status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_REFLINK_TARGET); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto out_rw2; + } + + /* lock id2 */ + if (!same_inode) { + status = ocfs2_inode_lock_nested(inode2, bh2, 1, + OI_LS_REFLINK_TARGET); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto out_cl1; + } + } else + *bh2 = *bh1; + + trace_ocfs2_double_lock_end( + (unsigned long long)OCFS2_I(inode1)->ip_blkno, + (unsigned long long)OCFS2_I(inode2)->ip_blkno); + + return 0; + +out_cl1: + ocfs2_inode_unlock(inode1, 1); + brelse(*bh1); + *bh1 = NULL; +out_rw2: + ocfs2_rw_unlock(inode2, 1); +out_i2: + ocfs2_rw_unlock(inode1, 1); +out_i1: + unlock_two_nondirectories(s_inode, t_inode); + return status; +} + +/* Unlock both inodes and release buffers. */ +static void ocfs2_reflink_inodes_unlock(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh) +{ + ocfs2_inode_unlock(s_inode, 1); + ocfs2_rw_unlock(s_inode, 1); + brelse(s_bh); + if (s_inode != t_inode) { + ocfs2_inode_unlock(t_inode, 1); + ocfs2_rw_unlock(t_inode, 1); + brelse(t_bh); + } + unlock_two_nondirectories(s_inode, t_inode); +} + +/* Link a range of blocks from one file to another. 
*/ +int ocfs2_reflink_remap_range(struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len, + bool is_dedupe) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb); + struct buffer_head *in_bh = NULL, *out_bh = NULL; + bool same_inode = (inode_in == inode_out); + ssize_t ret; + + if (!ocfs2_refcount_tree(osb)) + return -EOPNOTSUPP; + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + + /* Lock both files against IO */ + ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh); + if (ret) + return ret; + + /* Check file eligibility and prepare for block sharing. */ + ret = -EINVAL; + if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) || + (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE)) + goto out_unlock; + + ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out, + &len, is_dedupe); + if (ret <= 0) + goto out_unlock; + + /* Lock out changes to the allocation maps and remap. */ + down_write(&OCFS2_I(inode_in)->ip_alloc_sem); + if (!same_inode) + down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem, + SINGLE_DEPTH_NESTING); + + ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out, + out_bh, pos_out, len); + + /* Zap any page cache for the destination file's range. */ + if (!ret) + truncate_inode_pages_range(&inode_out->i_data, pos_out, + PAGE_ALIGN(pos_out + len) - 1); + + up_write(&OCFS2_I(inode_in)->ip_alloc_sem); + if (!same_inode) + up_write(&OCFS2_I(inode_out)->ip_alloc_sem); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + + /* + * Empty the extent map so that we may get the right extent + * record from the disk. + */ + ocfs2_extent_map_trunc(inode_in, 0); + ocfs2_extent_map_trunc(inode_out, 0); + + ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + + ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh); + return 0; + +out_unlock: + ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh); + return ret; +} diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 6422bbcdb525..4af55bf4b35b 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -115,4 +115,11 @@ int ocfs2_reflink_ioctl(struct inode *inode, const char __user *oldname, const char __user *newname, bool preserve); +int ocfs2_reflink_remap_range(struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len, + bool is_dedupe); + #endif /* OCFS2_REFCOUNTTREE_H */ diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index c9e828ec3c8e..dae9eb7c441e 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -24,7 +24,7 @@ #include <linux/slab.h> #include <linux/reboot.h> #include <linux/sched.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "stackglue.h" diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 52c07346bea3..820359096c7a 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -48,6 +48,12 @@ static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; */ static struct ocfs2_stack_plugin *active_stack; +inline int ocfs2_is_o2cb_active(void) +{ + return !strcmp(active_stack->sp_name, OCFS2_STACK_PLUGIN_O2CB); +} +EXPORT_SYMBOL_GPL(ocfs2_is_o2cb_active); + static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name) { struct ocfs2_stack_plugin *p; diff --git 
a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index f2dce10fae54..e3036e1790e8 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -298,6 +298,9 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); +/* In ocfs2_downconvert_lock(), we need to know which stack we are using */ +int ocfs2_is_o2cb_active(void); + extern struct kset *ocfs2_kset; #endif /* STACKGLUE_H */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index f56fe39fab04..a24e42f95341 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -337,7 +337,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) out += snprintf(buf + out, len - out, "Disabled\n"); else out += snprintf(buf + out, len - out, "%lu seconds ago\n", - (get_seconds() - os->os_scantime.tv_sec)); + (unsigned long)(ktime_get_seconds() - os->os_scantime)); out += snprintf(buf + out, len - out, "%10s => %3s %10s\n", "Slots", "Num", "RecoGen"); @@ -985,7 +985,6 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb) for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (!sb_has_quota_loaded(sb, type)) continue; - /* Cancel periodic syncing before we grab dqonoff_mutex */ oinfo = sb_dqinfo(sb, type)->dqi_priv; cancel_delayed_work_sync(&oinfo->dqi_sync_work); inode = igrab(sb->s_dquot.files[type]); diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index 6ad8eecefe21..94cfacc9bad7 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -87,7 +87,6 @@ const struct address_space_operations ocfs2_fast_symlink_aops = { }; const struct inode_operations ocfs2_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = page_get_link, .getattr = ocfs2_getattr, .setattr = ocfs2_setattr, diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index cb157a34a656..3c5384d9b3a5 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2577,7 +2577,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) return 0; - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) { + if (ocfs2_is_refcount_inode(inode)) { ret = ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), le64_to_cpu(di->i_refcount_loc), 1, &ref_tree, &ref_root_bh); @@ -3608,7 +3608,7 @@ int ocfs2_xattr_set(struct inode *inode, } /* Check whether the value is refcounted and do some preparation. 
*/ - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL && + if (ocfs2_is_refcount_inode(inode) && (!xis.not_found || !xbs.not_found)) { ret = ocfs2_prepare_refcount_xattr(inode, di, &xi, &xis, &xbs, &ref_tree, diff --git a/fs/open.c b/fs/open.c index d3ed8171e8e0..9921f70bc5ca 100644 --- a/fs/open.c +++ b/fs/open.c @@ -19,7 +19,7 @@ #include <linux/mount.h> #include <linux/fcntl.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/fs.h> #include <linux/personality.h> #include <linux/pagemap.h> diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index c003a667ed1a..13215f26e321 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -16,7 +16,7 @@ #include <asm/openprom.h> #include <asm/oplib.h> #include <asm/prom.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> static DEFINE_MUTEX(op_mutex); diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c index f419dd999581..c4ab6fdf17a0 100644 --- a/fs/orangefs/devorangefs-req.c +++ b/fs/orangefs/devorangefs-req.c @@ -355,7 +355,6 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, __u64 tag; } head; int total = ret = iov_iter_count(iter); - int n; int downcall_size = sizeof(struct orangefs_downcall_s); int head_size = sizeof(head); @@ -372,8 +371,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, return -EFAULT; } - n = copy_from_iter(&head, head_size, iter); - if (n < head_size) { + if (!copy_from_iter_full(&head, head_size, iter)) { gossip_err("%s: failed to copy head.\n", __func__); return -EFAULT; } @@ -408,8 +406,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, return ret; } - n = copy_from_iter(&op->downcall, downcall_size, iter); - if (n != downcall_size) { + if (!copy_from_iter_full(&op->downcall, downcall_size, iter)) { gossip_err("%s: failed to copy downcall.\n", __func__); goto Efault; } @@ -463,10 +460,8 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, goto Enomem; } memset(op->downcall.trailer_buf, 0, op->downcall.trailer_size); - n = copy_from_iter(op->downcall.trailer_buf, - op->downcall.trailer_size, - iter); - if (n != op->downcall.trailer_size) { + if (!copy_from_iter_full(op->downcall.trailer_buf, + op->downcall.trailer_size, iter)) { gossip_err("%s: failed to copy trailer.\n", __func__); vfree(op->downcall.trailer_buf); goto Efault; diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 02cc6139ec90..e6bbc8083d77 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -724,7 +724,7 @@ static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl) { int rc = -EINVAL; - if (ORANGEFS_SB(filp->f_inode->i_sb)->flags & ORANGEFS_OPT_LOCAL_LOCK) { + if (ORANGEFS_SB(file_inode(filp)->i_sb)->flags & ORANGEFS_OPT_LOCAL_LOCK) { if (cmd == F_GETLK) { rc = 0; posix_test_lock(filp, fl); diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 462d10933e48..5cd617980fbf 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -8,6 +8,7 @@ * Linux VFS inode operations. 
*/ +#include <linux/bvec.h> #include "protocol.h" #include "orangefs-kernel.h" #include "orangefs-bufmap.h" diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index 0748a26598fc..791912da97d7 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -434,6 +434,7 @@ static ssize_t orangefs_debug_write(struct file *file, char *debug_string; struct orangefs_kernel_op_s *new_op = NULL; struct client_debug_mask c_mask = { NULL, 0, 0 }; + char *s; gossip_debug(GOSSIP_DEBUGFS_DEBUG, "orangefs_debug_write: %pD\n", @@ -521,8 +522,9 @@ static ssize_t orangefs_debug_write(struct file *file, } mutex_lock(&orangefs_debug_lock); - memset(file->f_inode->i_private, 0, ORANGEFS_MAX_DEBUG_STRING_LEN); - sprintf((char *)file->f_inode->i_private, "%s\n", debug_string); + s = file_inode(file)->i_private; + memset(s, 0, ORANGEFS_MAX_DEBUG_STRING_LEN); + sprintf(s, "%s\n", debug_string); mutex_unlock(&orangefs_debug_lock); *ppos += count; diff --git a/fs/orangefs/symlink.c b/fs/orangefs/symlink.c index 10b0b06e075e..02b1bbdbcc42 100644 --- a/fs/orangefs/symlink.c +++ b/fs/orangefs/symlink.c @@ -9,7 +9,6 @@ #include "orangefs-bufmap.h" const struct inode_operations orangefs_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = simple_get_link, .setattr = orangefs_setattr, .getattr = orangefs_getattr, diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index 34355818a2e0..0daac5112f7a 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -8,3 +8,17 @@ config OVERLAY_FS merged with the 'upper' object. For more information see Documentation/filesystems/overlayfs.txt + +config OVERLAY_FS_REDIRECT_DIR + bool "Overlayfs: turn on redirect dir feature by default" + depends on OVERLAY_FS + help + If this config option is enabled then overlay filesystems will use + redirects when renaming directories by default. In this case it is + still possible to turn off redirects globally with the + "redirect_dir=off" module option or on a filesystem instance basis + with the "redirect_dir=off" mount option. + + Note that redirects are not backward compatible. That is, mounting + an overlay which has redirects on a kernel that doesn't support this + feature will have unexpected results.
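The redirect_dir behavior described in the help text above is implemented by the fs/overlayfs/dir.c changes later in this diff: when a merged or lower directory is renamed, ovl_set_redirect() records the origin path in an OVL_XATTR_REDIRECT xattr on the upper directory. A minimal userspace sketch of inspecting such a redirect follows; it assumes the xattr name "trusted.overlay.redirect" (the expansion of OVL_XATTR_REDIRECT), a hypothetical upper-layer path, and CAP_SYS_ADMIN (required to read trusted.* xattrs). It is an illustration, not part of the patch.

#include <stdio.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(void)
{
	/* redirect_max defaults to 256 in this patch; one extra byte for NUL */
	char value[257];
	ssize_t len;

	/* "/upper/renamed-dir" is an example path, not taken from the patch */
	len = getxattr("/upper/renamed-dir", "trusted.overlay.redirect",
		       value, sizeof(value) - 1);
	if (len < 0) {
		perror("getxattr");
		return 1;
	}
	value[len] = '\0';
	printf("redirect target: %s\n", value);
	return 0;
}

Per ovl_get_redirect() below, a value beginning with '/' is an absolute redirect naming the origin path from the root of the overlay, while a bare name covers a rename within the same parent directory (the samedir case).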
diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile index 900daed3e91d..99373bbc1478 100644 --- a/fs/overlayfs/Makefile +++ b/fs/overlayfs/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_OVERLAY_FS) += overlay.o -overlay-objs := super.o inode.o dir.o readdir.o copy_up.o +overlay-objs := super.o namei.o util.o inode.o dir.o readdir.o copy_up.o diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 36795eed40b0..f57043dace62 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -33,7 +33,7 @@ static int ovl_check_fd(const void *data, struct file *f, unsigned int fd) { const struct dentry *dentry = data; - if (f->f_inode == d_inode(dentry)) + if (file_inode(f) == d_inode(dentry)) pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n", f, fd, current->pid, current->comm); return 0; @@ -153,6 +153,13 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) goto out_fput; } + /* Try to use clone_file_range to clone up within the same fs */ + error = vfs_clone_file_range(old_file, 0, new_file, 0, len); + if (!error) + goto out; + /* Couldn't clone, so now we try to copy the data */ + error = 0; + /* FIXME: copy up sparse files efficiently */ while (len) { size_t this_len = OVL_COPY_UP_CHUNK_SIZE; @@ -177,7 +184,7 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) len -= bytes; } - +out: if (!error) error = vfs_fsync(new_file, 0); fput(new_file); @@ -231,10 +238,15 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, struct inode *udir = upperdir->d_inode; struct dentry *newdentry = NULL; struct dentry *upper = NULL; - umode_t mode = stat->mode; int err; const struct cred *old_creds = NULL; struct cred *new_creds = NULL; + struct cattr cattr = { + /* Can't properly set mode on creation because of the umask */ + .mode = stat->mode & S_IFMT, + .rdev = stat->rdev, + .link = link + }; newdentry = ovl_lookup_temp(workdir, dentry); err = PTR_ERR(newdentry); @@ -254,10 +266,7 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, if (new_creds) old_creds = override_creds(new_creds); - /* Can't properly set mode on creation because of the umask */ - stat->mode &= S_IFMT; - err = ovl_create_real(wdir, newdentry, stat, link, NULL, true); - stat->mode = mode; + err = ovl_create_real(wdir, newdentry, &cattr, NULL, true); if (new_creds) { revert_creds(old_creds); @@ -296,12 +305,6 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, ovl_dentry_update(dentry, newdentry); ovl_inode_update(d_inode(dentry), d_inode(newdentry)); newdentry = NULL; - - /* - * Non-directores become opaque when copied up. - */ - if (!S_ISDIR(stat->mode)) - ovl_dentry_set_opaque(dentry, true); out2: dput(upper); out1: @@ -317,20 +320,14 @@ out_cleanup: /* * Copy up a single dentry * - * Directory renames only allowed on "pure upper" (already created on - * upper filesystem, never copied up). Directories which are on lower or - * are merged may not be renamed. For these -EXDEV is returned and - * userspace has to deal with it. This means, when copying up a - * directory we can rely on it and ancestors being stable. - * - * Non-directory renames start with copy up of source if necessary. The - * actual rename will only proceed once the copy up was successful. Copy - * up uses upper parent i_mutex for exclusion. Since rename can change - * d_parent it is possible that the copy up will lock the old parent. 
At - * that point the file will have already been copied up anyway. + * All renames start with copy up of source if necessary. The actual + * rename will only proceed once the copy up was successful. Copy up uses + * upper parent i_mutex for exclusion. Since rename can change d_parent it + * is possible that the copy up will lock the old parent. At that point + * the file will have already been copied up anyway. */ -int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, - struct path *lowerpath, struct kstat *stat) +static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, + struct path *lowerpath, struct kstat *stat) { DEFINE_DELAYED_CALL(done); struct dentry *workdir = ovl_workdir(dentry); @@ -339,7 +336,6 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, struct path parentpath; struct dentry *lowerdentry = lowerpath->dentry; struct dentry *upperdir; - struct dentry *upperdentry; const char *link = NULL; if (WARN_ON(!workdir)) @@ -365,8 +361,7 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, pr_err("overlayfs: failed to lock workdir+upperdir\n"); goto out_unlock; } - upperdentry = ovl_dentry_upper(dentry); - if (upperdentry) { + if (ovl_dentry_upper(dentry)) { /* Raced with another copy-up? Nothing to do, then... */ err = 0; goto out_unlock; @@ -385,7 +380,7 @@ out_unlock: return err; } -int ovl_copy_up(struct dentry *dentry) +int ovl_copy_up_flags(struct dentry *dentry, int flags) { int err = 0; const struct cred *old_cred = ovl_override_creds(dentry->d_sb); @@ -415,6 +410,9 @@ int ovl_copy_up(struct dentry *dentry) ovl_path_lower(next, &lowerpath); err = vfs_getattr(&lowerpath, &stat); + /* maybe truncate regular file. this has no effect on dirs */ + if (flags & O_TRUNC) + stat.size = 0; if (!err) err = ovl_copy_up_one(parent, next, &lowerpath, &stat); @@ -425,3 +423,8 @@ int ovl_copy_up(struct dentry *dentry) return err; } + +int ovl_copy_up(struct dentry *dentry) +{ + return ovl_copy_up_flags(dentry, 0); +} diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 306b6c161840..16e06dd89457 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -12,11 +12,18 @@ #include <linux/xattr.h> #include <linux/security.h> #include <linux/cred.h> +#include <linux/module.h> #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> #include <linux/atomic.h> +#include <linux/ratelimit.h> #include "overlayfs.h" +static unsigned short ovl_redirect_max = 256; +module_param_named(redirect_max, ovl_redirect_max, ushort, 0644); +MODULE_PARM_DESC(ovl_redirect_max, + "Maximum length of absolute redirect xattr value"); + void ovl_cleanup(struct inode *wdir, struct dentry *wdentry) { int err; @@ -75,8 +82,7 @@ static struct dentry *ovl_whiteout(struct dentry *workdir, } int ovl_create_real(struct inode *dir, struct dentry *newdentry, - struct kstat *stat, const char *link, - struct dentry *hardlink, bool debug) + struct cattr *attr, struct dentry *hardlink, bool debug) { int err; @@ -86,13 +92,13 @@ int ovl_create_real(struct inode *dir, struct dentry *newdentry, if (hardlink) { err = ovl_do_link(hardlink, dir, newdentry, debug); } else { - switch (stat->mode & S_IFMT) { + switch (attr->mode & S_IFMT) { case S_IFREG: - err = ovl_do_create(dir, newdentry, stat->mode, debug); + err = ovl_do_create(dir, newdentry, attr->mode, debug); break; case S_IFDIR: - err = ovl_do_mkdir(dir, newdentry, stat->mode, debug); + err = ovl_do_mkdir(dir, newdentry, attr->mode, debug); break; case S_IFCHR: @@ -100,11 +106,11 @@ int 
ovl_create_real(struct inode *dir, struct dentry *newdentry, case S_IFIFO: case S_IFSOCK: err = ovl_do_mknod(dir, newdentry, - stat->mode, stat->rdev, debug); + attr->mode, attr->rdev, debug); break; case S_IFLNK: - err = ovl_do_symlink(dir, newdentry, link, debug); + err = ovl_do_symlink(dir, newdentry, attr->link, debug); break; default: @@ -121,20 +127,15 @@ int ovl_create_real(struct inode *dir, struct dentry *newdentry, return err; } -static int ovl_set_opaque(struct dentry *upperdentry) -{ - return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0); -} - -static void ovl_remove_opaque(struct dentry *upperdentry) +static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry) { int err; - err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE); - if (err) { - pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n", - upperdentry->d_name.name, err); - } + err = ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0); + if (!err) + ovl_dentry_set_opaque(dentry); + + return err; } static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry, @@ -182,9 +183,13 @@ static void ovl_instantiate(struct dentry *dentry, struct inode *inode, d_instantiate(dentry, inode); } +static bool ovl_type_merge(struct dentry *dentry) +{ + return OVL_TYPE_MERGE(ovl_path_type(dentry)); +} + static int ovl_create_upper(struct dentry *dentry, struct inode *inode, - struct kstat *stat, const char *link, - struct dentry *hardlink) + struct cattr *attr, struct dentry *hardlink) { struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); struct inode *udir = upperdir->d_inode; @@ -192,7 +197,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, int err; if (!hardlink && !IS_POSIXACL(udir)) - stat->mode &= ~current_umask(); + attr->mode &= ~current_umask(); inode_lock_nested(udir, I_MUTEX_PARENT); newdentry = lookup_one_len(dentry->d_name.name, upperdir, @@ -200,10 +205,15 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, err = PTR_ERR(newdentry); if (IS_ERR(newdentry)) goto out_unlock; - err = ovl_create_real(udir, newdentry, stat, link, hardlink, false); + err = ovl_create_real(udir, newdentry, attr, hardlink, false); if (err) goto out_dput; + if (ovl_type_merge(dentry->d_parent)) { + /* Setting opaque here is just an optimization, allow to fail */ + ovl_set_opaque(dentry, newdentry); + } + ovl_instantiate(dentry, inode, newdentry, !!hardlink); newdentry = NULL; out_dput: @@ -270,7 +280,8 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, if (IS_ERR(opaquedir)) goto out_unlock; - err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true); + err = ovl_create_real(wdir, opaquedir, + &(struct cattr){.mode = stat.mode}, NULL, true); if (err) goto out_dput; @@ -278,7 +289,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, if (err) goto out_cleanup; - err = ovl_set_opaque(opaquedir); + err = ovl_set_opaque(dentry, opaquedir); if (err) goto out_cleanup; @@ -370,7 +381,7 @@ out_free: } static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, - struct kstat *stat, const char *link, + struct cattr *cattr, struct dentry *hardlink) { struct dentry *workdir = ovl_workdir(dentry); @@ -387,7 +398,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (!hardlink) { err = posix_acl_create(dentry->d_parent->d_inode, - &stat->mode, &default_acl, &acl); + &cattr->mode, &default_acl, &acl); if (err) return err; } @@ -407,7 +418,7 @@ static int 
ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (IS_ERR(upper)) goto out_dput; - err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true); + err = ovl_create_real(wdir, newdentry, cattr, hardlink, true); if (err) goto out_dput2; @@ -415,10 +426,11 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, * mode could have been mutilated due to umask (e.g. sgid directory) */ if (!hardlink && - !S_ISLNK(stat->mode) && newdentry->d_inode->i_mode != stat->mode) { + !S_ISLNK(cattr->mode) && + newdentry->d_inode->i_mode != cattr->mode) { struct iattr attr = { .ia_valid = ATTR_MODE, - .ia_mode = stat->mode, + .ia_mode = cattr->mode, }; inode_lock(newdentry->d_inode); err = notify_change(newdentry, &attr, NULL); @@ -438,8 +450,8 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, goto out_cleanup; } - if (!hardlink && S_ISDIR(stat->mode)) { - err = ovl_set_opaque(newdentry); + if (!hardlink && S_ISDIR(cattr->mode)) { + err = ovl_set_opaque(dentry, newdentry); if (err) goto out_cleanup; @@ -475,8 +487,7 @@ out_cleanup: } static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, - struct kstat *stat, const char *link, - struct dentry *hardlink) + struct cattr *attr, struct dentry *hardlink) { int err; const struct cred *old_cred; @@ -494,7 +505,7 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, override_cred->fsgid = inode->i_gid; if (!hardlink) { err = security_dentry_create_files_as(dentry, - stat->mode, &dentry->d_name, old_cred, + attr->mode, &dentry->d_name, old_cred, override_cred); if (err) { put_cred(override_cred); @@ -504,12 +515,12 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, put_cred(override_creds(override_cred)); put_cred(override_cred); - if (!ovl_dentry_is_opaque(dentry)) - err = ovl_create_upper(dentry, inode, stat, link, + if (!ovl_dentry_is_whiteout(dentry)) + err = ovl_create_upper(dentry, inode, attr, hardlink); else - err = ovl_create_over_whiteout(dentry, inode, stat, - link, hardlink); + err = ovl_create_over_whiteout(dentry, inode, attr, + hardlink); } out_revert_creds: revert_creds(old_cred); @@ -528,8 +539,9 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, { int err; struct inode *inode; - struct kstat stat = { + struct cattr attr = { .rdev = rdev, + .link = link, }; err = ovl_want_write(dentry); @@ -537,14 +549,14 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, goto out; err = -ENOMEM; - inode = ovl_new_inode(dentry->d_sb, mode); + inode = ovl_new_inode(dentry->d_sb, mode, rdev); if (!inode) goto out_drop_write; inode_init_owner(inode, dentry->d_parent->d_inode, mode); - stat.mode = inode->i_mode; + attr.mode = inode->i_mode; - err = ovl_create_or_link(dentry, inode, &stat, link, NULL); + err = ovl_create_or_link(dentry, inode, &attr, NULL); if (err) iput(inode); @@ -598,7 +610,7 @@ static int ovl_link(struct dentry *old, struct inode *newdir, inode = d_inode(old); ihold(inode); - err = ovl_create_or_link(new, inode, NULL, NULL, ovl_dentry_upper(old)); + err = ovl_create_or_link(new, inode, NULL, ovl_dentry_upper(old)); if (err) iput(inode); @@ -684,8 +696,17 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); struct inode *dir = upperdir->d_inode; struct dentry *upper; + struct dentry *opaquedir = NULL; int err; + /* Redirect dir can be !ovl_lower_positive && OVL_TYPE_MERGE */ 
+ if (is_dir && ovl_dentry_get_redirect(dentry)) { + opaquedir = ovl_check_empty_and_clear(dentry); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) + goto out; + } + inode_lock_nested(dir, I_MUTEX_PARENT); upper = lookup_one_len(dentry->d_name.name, upperdir, dentry->d_name.len); @@ -694,14 +715,15 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) goto out_unlock; err = -ESTALE; - if (upper == ovl_dentry_upper(dentry)) { - if (is_dir) - err = vfs_rmdir(dir, upper); - else - err = vfs_unlink(dir, upper, NULL); - ovl_dentry_version_inc(dentry->d_parent); - } - dput(upper); + if ((opaquedir && upper != opaquedir) || + (!opaquedir && upper != ovl_dentry_upper(dentry))) + goto out_dput_upper; + + if (is_dir) + err = vfs_rmdir(dir, upper); + else + err = vfs_unlink(dir, upper, NULL); + ovl_dentry_version_inc(dentry->d_parent); /* * Keeping this dentry hashed would mean having to release @@ -711,34 +733,21 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) */ if (!err) d_drop(dentry); +out_dput_upper: + dput(upper); out_unlock: inode_unlock(dir); - + dput(opaquedir); +out: return err; } -static inline int ovl_check_sticky(struct dentry *dentry) -{ - struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode; - struct inode *inode = ovl_dentry_real(dentry)->d_inode; - - if (check_sticky(dir, inode)) - return -EPERM; - - return 0; -} - static int ovl_do_remove(struct dentry *dentry, bool is_dir) { enum ovl_path_type type; int err; const struct cred *old_cred; - - err = ovl_check_sticky(dentry); - if (err) - goto out; - err = ovl_want_write(dentry); if (err) goto out; @@ -750,7 +759,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) type = ovl_path_type(dentry); old_cred = ovl_override_creds(dentry->d_sb); - if (OVL_TYPE_PURE_UPPER(type)) + if (!ovl_lower_positive(dentry)) err = ovl_remove_upper(dentry, is_dir); else err = ovl_remove_and_whiteout(dentry, is_dir); @@ -777,13 +786,114 @@ static int ovl_rmdir(struct inode *dir, struct dentry *dentry) return ovl_do_remove(dentry, true); } -static int ovl_rename2(struct inode *olddir, struct dentry *old, - struct inode *newdir, struct dentry *new, - unsigned int flags) +static bool ovl_type_merge_or_lower(struct dentry *dentry) +{ + enum ovl_path_type type = ovl_path_type(dentry); + + return OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type); +} + +static bool ovl_can_move(struct dentry *dentry) +{ + return ovl_redirect_dir(dentry->d_sb) || + !d_is_dir(dentry) || !ovl_type_merge_or_lower(dentry); +} + +static char *ovl_get_redirect(struct dentry *dentry, bool samedir) +{ + char *buf, *ret; + struct dentry *d, *tmp; + int buflen = ovl_redirect_max + 1; + + if (samedir) { + ret = kstrndup(dentry->d_name.name, dentry->d_name.len, + GFP_KERNEL); + goto out; + } + + buf = ret = kmalloc(buflen, GFP_TEMPORARY); + if (!buf) + goto out; + + buflen--; + buf[buflen] = '\0'; + for (d = dget(dentry); !IS_ROOT(d);) { + const char *name; + int thislen; + + spin_lock(&d->d_lock); + name = ovl_dentry_get_redirect(d); + if (name) { + thislen = strlen(name); + } else { + name = d->d_name.name; + thislen = d->d_name.len; + } + + /* If path is too long, fall back to userspace move */ + if (thislen + (name[0] != '/') > buflen) { + ret = ERR_PTR(-EXDEV); + spin_unlock(&d->d_lock); + goto out_put; + } + + buflen -= thislen; + memcpy(&buf[buflen], name, thislen); + tmp = dget_dlock(d->d_parent); + spin_unlock(&d->d_lock); + + dput(d); + d = tmp; + + /* Absolute redirect: finished */ + if (buf[buflen] == '/') + break; + 
buflen--; + buf[buflen] = '/'; + } + ret = kstrdup(&buf[buflen], GFP_KERNEL); +out_put: + dput(d); + kfree(buf); +out: + return ret ? ret : ERR_PTR(-ENOMEM); +} + +static int ovl_set_redirect(struct dentry *dentry, bool samedir) +{ + int err; + const char *redirect = ovl_dentry_get_redirect(dentry); + + if (redirect && (samedir || redirect[0] == '/')) + return 0; + + redirect = ovl_get_redirect(dentry, samedir); + if (IS_ERR(redirect)) + return PTR_ERR(redirect); + + err = ovl_do_setxattr(ovl_dentry_upper(dentry), OVL_XATTR_REDIRECT, + redirect, strlen(redirect), 0); + if (!err) { + spin_lock(&dentry->d_lock); + ovl_dentry_set_redirect(dentry, redirect); + spin_unlock(&dentry->d_lock); + } else { + kfree(redirect); + if (err == -EOPNOTSUPP) + ovl_clear_redirect_dir(dentry->d_sb); + else + pr_warn_ratelimited("overlay: failed to set redirect (%i)\n", err); + /* Fall back to userspace copy-up */ + err = -EXDEV; + } + return err; +} + +static int ovl_rename(struct inode *olddir, struct dentry *old, + struct inode *newdir, struct dentry *new, + unsigned int flags) { int err; - enum ovl_path_type old_type; - enum ovl_path_type new_type; struct dentry *old_upperdir; struct dentry *new_upperdir; struct dentry *olddentry; @@ -794,7 +904,8 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, bool cleanup_whiteout = false; bool overwrite = !(flags & RENAME_EXCHANGE); bool is_dir = d_is_dir(old); - bool new_is_dir = false; + bool new_is_dir = d_is_dir(new); + bool samedir = olddir == newdir; struct dentry *opaquedir = NULL; const struct cred *old_cred = NULL; @@ -804,46 +915,12 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, flags &= ~RENAME_NOREPLACE; - err = ovl_check_sticky(old); - if (err) - goto out; - /* Don't copy up directory trees */ - old_type = ovl_path_type(old); err = -EXDEV; - if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir) + if (!ovl_can_move(old)) + goto out; + if (!overwrite && !ovl_can_move(new)) goto out; - - if (new->d_inode) { - err = ovl_check_sticky(new); - if (err) - goto out; - - if (d_is_dir(new)) - new_is_dir = true; - - new_type = ovl_path_type(new); - err = -EXDEV; - if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) - goto out; - - err = 0; - if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) { - if (ovl_dentry_lower(old)->d_inode == - ovl_dentry_lower(new)->d_inode) - goto out; - } - if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) { - if (ovl_dentry_upper(old)->d_inode == - ovl_dentry_upper(new)->d_inode) - goto out; - } - } else { - if (ovl_dentry_is_opaque(new)) - new_type = __OVL_PATH_UPPER; - else - new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE; - } err = ovl_want_write(old); if (err) @@ -862,12 +939,9 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, goto out_drop_write; } - old_opaque = !OVL_TYPE_PURE_UPPER(old_type); - new_opaque = !OVL_TYPE_PURE_UPPER(new_type); - old_cred = ovl_override_creds(old->d_sb); - if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) { + if (overwrite && new_is_dir && ovl_type_merge_or_lower(new)) { opaquedir = ovl_check_empty_and_clear(new); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) { @@ -877,15 +951,15 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, } if (overwrite) { - if (old_opaque) { - if (new->d_inode || !new_opaque) { + if (ovl_lower_positive(old)) { + if (!ovl_dentry_is_whiteout(new)) { /* Whiteout source */ flags |= RENAME_WHITEOUT; } else { /* Switch whiteouts */ flags |= RENAME_EXCHANGE; } - } else 
if (is_dir && !new->d_inode && new_opaque) { + } else if (is_dir && ovl_dentry_is_whiteout(new)) { flags |= RENAME_EXCHANGE; cleanup_whiteout = true; } @@ -896,7 +970,6 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, trap = lock_rename(new_upperdir, old_upperdir); - olddentry = lookup_one_len(old->d_name.name, old_upperdir, old->d_name.len); err = PTR_ERR(olddentry); @@ -913,6 +986,9 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, if (IS_ERR(newdentry)) goto out_dput_old; + old_opaque = ovl_dentry_is_opaque(old); + new_opaque = ovl_dentry_is_opaque(new); + err = -ESTALE; if (ovl_dentry_upper(new)) { if (opaquedir) { @@ -933,54 +1009,31 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, if (newdentry == trap) goto out_dput; - if (is_dir && !old_opaque && new_opaque) { - err = ovl_set_opaque(olddentry); + if (WARN_ON(olddentry->d_inode == newdentry->d_inode)) + goto out_dput; + + err = 0; + if (is_dir) { + if (ovl_type_merge_or_lower(old)) + err = ovl_set_redirect(old, samedir); + else if (!old_opaque && ovl_type_merge(new->d_parent)) + err = ovl_set_opaque(old, olddentry); if (err) goto out_dput; } - if (!overwrite && new_is_dir && old_opaque && !new_opaque) { - err = ovl_set_opaque(newdentry); + if (!overwrite && new_is_dir) { + if (ovl_type_merge_or_lower(new)) + err = ovl_set_redirect(new, samedir); + else if (!new_opaque && ovl_type_merge(old->d_parent)) + err = ovl_set_opaque(new, newdentry); if (err) goto out_dput; } - if (old_opaque || new_opaque) { - err = ovl_do_rename(old_upperdir->d_inode, olddentry, - new_upperdir->d_inode, newdentry, - flags); - } else { - /* No debug for the plain case */ - BUG_ON(flags & ~RENAME_EXCHANGE); - err = vfs_rename(old_upperdir->d_inode, olddentry, - new_upperdir->d_inode, newdentry, - NULL, flags); - } - - if (err) { - if (is_dir && !old_opaque && new_opaque) - ovl_remove_opaque(olddentry); - if (!overwrite && new_is_dir && old_opaque && !new_opaque) - ovl_remove_opaque(newdentry); + err = ovl_do_rename(old_upperdir->d_inode, olddentry, + new_upperdir->d_inode, newdentry, flags); + if (err) goto out_dput; - } - - if (is_dir && old_opaque && !new_opaque) - ovl_remove_opaque(olddentry); - if (!overwrite && new_is_dir && !old_opaque && new_opaque) - ovl_remove_opaque(newdentry); - - /* - * Old dentry now lives in different location. Dentries in - * lowerstack are stale. We cannot drop them here because - * access to them is lockless. This could be only pure upper - * or opaque directory - numlower is zero. Or upper non-dir - * entry - its pureness is tracked by flag opaque. 
- */ - if (old_opaque != new_opaque) { - ovl_dentry_set_opaque(old, new_opaque); - if (!overwrite) - ovl_dentry_set_opaque(new, old_opaque); - } if (cleanup_whiteout) ovl_cleanup(old_upperdir->d_inode, newdentry); @@ -1009,7 +1062,7 @@ const struct inode_operations ovl_dir_inode_operations = { .symlink = ovl_symlink, .unlink = ovl_unlink, .rmdir = ovl_rmdir, - .rename = ovl_rename2, + .rename = ovl_rename, .link = ovl_link, .setattr = ovl_setattr, .create = ovl_create, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 7fb53d055537..08643ac44a02 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -13,34 +13,6 @@ #include <linux/posix_acl.h> #include "overlayfs.h" -static int ovl_copy_up_truncate(struct dentry *dentry) -{ - int err; - struct dentry *parent; - struct kstat stat; - struct path lowerpath; - const struct cred *old_cred; - - parent = dget_parent(dentry); - err = ovl_copy_up(parent); - if (err) - goto out_dput_parent; - - ovl_path_lower(dentry, &lowerpath); - - old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_getattr(&lowerpath, &stat); - if (!err) { - stat.size = 0; - err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat); - } - revert_creds(old_cred); - -out_dput_parent: - dput(parent); - return err; -} - int ovl_setattr(struct dentry *dentry, struct iattr *attr) { int err; @@ -64,27 +36,10 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) if (err) goto out; - if (attr->ia_valid & ATTR_SIZE) { - struct inode *realinode = d_inode(ovl_dentry_real(dentry)); - - err = -ETXTBSY; - if (atomic_read(&realinode->i_writecount) < 0) - goto out_drop_write; - } - err = ovl_copy_up(dentry); if (!err) { - struct inode *winode = NULL; - upperdentry = ovl_dentry_upper(dentry); - if (attr->ia_valid & ATTR_SIZE) { - winode = d_inode(upperdentry); - err = get_write_access(winode); - if (err) - goto out_drop_write; - } - if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) attr->ia_valid &= ~ATTR_MODE; @@ -95,11 +50,7 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) if (!err) ovl_copyattr(upperdentry->d_inode, dentry->d_inode); inode_unlock(upperdentry->d_inode); - - if (winode) - put_write_access(winode); } -out_drop_write: ovl_drop_write(dentry); out: return err; @@ -302,10 +253,7 @@ int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags) if (ovl_open_need_copy_up(file_flags, type, realpath.dentry)) { err = ovl_want_write(dentry); if (!err) { - if (file_flags & O_TRUNC) - err = ovl_copy_up_truncate(dentry); - else - err = ovl_copy_up(dentry); + err = ovl_copy_up_flags(dentry, file_flags); ovl_drop_write(dentry); } } @@ -348,13 +296,12 @@ static const struct inode_operations ovl_file_inode_operations = { static const struct inode_operations ovl_symlink_inode_operations = { .setattr = ovl_setattr, .get_link = ovl_get_link, - .readlink = generic_readlink, .getattr = ovl_getattr, .listxattr = ovl_listxattr, .update_time = ovl_update_time, }; -static void ovl_fill_inode(struct inode *inode, umode_t mode) +static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev) { inode->i_ino = get_next_ino(); inode->i_mode = mode; @@ -363,8 +310,11 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode) inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE; #endif - mode &= S_IFMT; - switch (mode) { + switch (mode & S_IFMT) { + case S_IFREG: + inode->i_op = &ovl_file_inode_operations; + break; + case S_IFDIR: inode->i_op = &ovl_dir_inode_operations; inode->i_fop = &ovl_dir_operations; @@ -375,26 +325,19 @@ static 
void ovl_fill_inode(struct inode *inode, umode_t mode) break; default: - WARN(1, "illegal file type: %i\n", mode); - /* Fall through */ - - case S_IFREG: - case S_IFSOCK: - case S_IFBLK: - case S_IFCHR: - case S_IFIFO: inode->i_op = &ovl_file_inode_operations; + init_special_inode(inode, mode, rdev); break; } } -struct inode *ovl_new_inode(struct super_block *sb, umode_t mode) +struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev) { struct inode *inode; inode = new_inode(sb); if (inode) - ovl_fill_inode(inode, mode); + ovl_fill_inode(inode, mode, rdev); return inode; } @@ -418,7 +361,7 @@ struct inode *ovl_get_inode(struct super_block *sb, struct inode *realinode) inode = iget5_locked(sb, (unsigned long) realinode, ovl_inode_test, ovl_inode_set, realinode); if (inode && inode->i_state & I_NEW) { - ovl_fill_inode(inode, realinode->i_mode); + ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev); set_nlink(inode, realinode->i_nlink); unlock_new_inode(inode); } diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c new file mode 100644 index 000000000000..023bb0b03352 --- /dev/null +++ b/fs/overlayfs/namei.c @@ -0,0 +1,410 @@ +/* + * Copyright (C) 2011 Novell Inc. + * Copyright (C) 2016 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/xattr.h> +#include <linux/ratelimit.h> +#include "overlayfs.h" +#include "ovl_entry.h" + +struct ovl_lookup_data { + struct qstr name; + bool is_dir; + bool opaque; + bool stop; + bool last; + char *redirect; +}; + +static int ovl_check_redirect(struct dentry *dentry, struct ovl_lookup_data *d, + size_t prelen, const char *post) +{ + int res; + char *s, *next, *buf = NULL; + + res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, NULL, 0); + if (res < 0) { + if (res == -ENODATA || res == -EOPNOTSUPP) + return 0; + goto fail; + } + buf = kzalloc(prelen + res + strlen(post) + 1, GFP_TEMPORARY); + if (!buf) + return -ENOMEM; + + if (res == 0) + goto invalid; + + res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, buf, res); + if (res < 0) + goto fail; + if (res == 0) + goto invalid; + if (buf[0] == '/') { + for (s = buf; *s++ == '/'; s = next) { + next = strchrnul(s, '/'); + if (s == next) + goto invalid; + } + } else { + if (strchr(buf, '/') != NULL) + goto invalid; + + memmove(buf + prelen, buf, res); + memcpy(buf, d->name.name, prelen); + } + + strcat(buf, post); + kfree(d->redirect); + d->redirect = buf; + d->name.name = d->redirect; + d->name.len = strlen(d->redirect); + + return 0; + +err_free: + kfree(buf); + return 0; +fail: + pr_warn_ratelimited("overlayfs: failed to get redirect (%i)\n", res); + goto err_free; +invalid: + pr_warn_ratelimited("overlayfs: invalid redirect (%s)\n", buf); + goto err_free; +} + +static bool ovl_is_opaquedir(struct dentry *dentry) +{ + int res; + char val; + + if (!d_is_dir(dentry)) + return false; + + res = vfs_getxattr(dentry, OVL_XATTR_OPAQUE, &val, 1); + if (res == 1 && val == 'y') + return true; + + return false; +} + +static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, + const char *name, unsigned int namelen, + size_t prelen, const char *post, + struct dentry **ret) +{ + struct dentry *this; + int err; + + this = lookup_one_len_unlocked(name, base, namelen); + if (IS_ERR(this)) { + err = PTR_ERR(this); + this = NULL; + if (err == -ENOENT || err == 
-ENAMETOOLONG) + goto out; + goto out_err; + } + if (!this->d_inode) + goto put_and_out; + + if (ovl_dentry_weird(this)) { + /* Don't support traversing automounts and other weirdness */ + err = -EREMOTE; + goto out_err; + } + if (ovl_is_whiteout(this)) { + d->stop = d->opaque = true; + goto put_and_out; + } + if (!d_can_lookup(this)) { + d->stop = true; + if (d->is_dir) + goto put_and_out; + goto out; + } + d->is_dir = true; + if (!d->last && ovl_is_opaquedir(this)) { + d->stop = d->opaque = true; + goto out; + } + err = ovl_check_redirect(this, d, prelen, post); + if (err) + goto out_err; +out: + *ret = this; + return 0; + +put_and_out: + dput(this); + this = NULL; + goto out; + +out_err: + dput(this); + return err; +} + +static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d, + struct dentry **ret) +{ + /* Counting down from the end, since the prefix can change */ + size_t rem = d->name.len - 1; + struct dentry *dentry = NULL; + int err; + + if (d->name.name[0] != '/') + return ovl_lookup_single(base, d, d->name.name, d->name.len, + 0, "", ret); + + while (!IS_ERR_OR_NULL(base) && d_can_lookup(base)) { + const char *s = d->name.name + d->name.len - rem; + const char *next = strchrnul(s, '/'); + size_t thislen = next - s; + bool end = !next[0]; + + /* Verify we did not go off the rails */ + if (WARN_ON(s[-1] != '/')) + return -EIO; + + err = ovl_lookup_single(base, d, s, thislen, + d->name.len - rem, next, &base); + dput(dentry); + if (err) + return err; + dentry = base; + if (end) + break; + + rem -= thislen + 1; + + if (WARN_ON(rem >= d->name.len)) + return -EIO; + } + *ret = dentry; + return 0; +} + +/* + * Returns next layer in stack starting from top. + * Returns -1 if this is the last layer. + */ +int ovl_path_next(int idx, struct dentry *dentry, struct path *path) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + BUG_ON(idx < 0); + if (idx == 0) { + ovl_path_upper(dentry, path); + if (path->dentry) + return oe->numlower ? 1 : -1; + idx++; + } + BUG_ON(idx > oe->numlower); + *path = oe->lowerstack[idx - 1]; + + return (idx < oe->numlower) ? 
idx + 1 : -1; +} + +struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct ovl_entry *oe; + const struct cred *old_cred; + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_entry *poe = dentry->d_parent->d_fsdata; + struct path *stack = NULL; + struct dentry *upperdir, *upperdentry = NULL; + unsigned int ctr = 0; + struct inode *inode = NULL; + bool upperopaque = false; + char *upperredirect = NULL; + struct dentry *this; + unsigned int i; + int err; + struct ovl_lookup_data d = { + .name = dentry->d_name, + .is_dir = false, + .opaque = false, + .stop = false, + .last = !poe->numlower, + .redirect = NULL, + }; + + if (dentry->d_name.len > ofs->namelen) + return ERR_PTR(-ENAMETOOLONG); + + old_cred = ovl_override_creds(dentry->d_sb); + upperdir = ovl_upperdentry_dereference(poe); + if (upperdir) { + err = ovl_lookup_layer(upperdir, &d, &upperdentry); + if (err) + goto out; + + if (upperdentry && unlikely(ovl_dentry_remote(upperdentry))) { + dput(upperdentry); + err = -EREMOTE; + goto out; + } + + if (d.redirect) { + upperredirect = kstrdup(d.redirect, GFP_KERNEL); + if (!upperredirect) + goto out_put_upper; + if (d.redirect[0] == '/') + poe = dentry->d_sb->s_root->d_fsdata; + } + upperopaque = d.opaque; + } + + if (!d.stop && poe->numlower) { + err = -ENOMEM; + stack = kcalloc(ofs->numlower, sizeof(struct path), + GFP_TEMPORARY); + if (!stack) + goto out_put_upper; + } + + for (i = 0; !d.stop && i < poe->numlower; i++) { + struct path lowerpath = poe->lowerstack[i]; + + d.last = i == poe->numlower - 1; + err = ovl_lookup_layer(lowerpath.dentry, &d, &this); + if (err) + goto out_put; + + if (!this) + continue; + + stack[ctr].dentry = this; + stack[ctr].mnt = lowerpath.mnt; + ctr++; + + if (d.stop) + break; + + if (d.redirect && + d.redirect[0] == '/' && + poe != dentry->d_sb->s_root->d_fsdata) { + poe = dentry->d_sb->s_root->d_fsdata; + + /* Find the current layer on the root dentry */ + for (i = 0; i < poe->numlower; i++) + if (poe->lowerstack[i].mnt == lowerpath.mnt) + break; + if (WARN_ON(i == poe->numlower)) + break; + } + } + + oe = ovl_alloc_entry(ctr); + err = -ENOMEM; + if (!oe) + goto out_put; + + if (upperdentry || ctr) { + struct dentry *realdentry; + struct inode *realinode; + + realdentry = upperdentry ? 
upperdentry : stack[0].dentry; + realinode = d_inode(realdentry); + + err = -ENOMEM; + if (upperdentry && !d_is_dir(upperdentry)) { + inode = ovl_get_inode(dentry->d_sb, realinode); + } else { + inode = ovl_new_inode(dentry->d_sb, realinode->i_mode, + realinode->i_rdev); + if (inode) + ovl_inode_init(inode, realinode, !!upperdentry); + } + if (!inode) + goto out_free_oe; + ovl_copyattr(realdentry->d_inode, inode); + } + + revert_creds(old_cred); + oe->opaque = upperopaque; + oe->redirect = upperredirect; + oe->__upperdentry = upperdentry; + memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr); + kfree(stack); + kfree(d.redirect); + dentry->d_fsdata = oe; + d_add(dentry, inode); + + return NULL; + +out_free_oe: + kfree(oe); +out_put: + for (i = 0; i < ctr; i++) + dput(stack[i].dentry); + kfree(stack); +out_put_upper: + dput(upperdentry); + kfree(upperredirect); +out: + kfree(d.redirect); + revert_creds(old_cred); + return ERR_PTR(err); +} + +bool ovl_lower_positive(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + struct ovl_entry *poe = dentry->d_parent->d_fsdata; + const struct qstr *name = &dentry->d_name; + unsigned int i; + bool positive = false; + bool done = false; + + /* + * If dentry is negative, then lower is positive iff this is a + * whiteout. + */ + if (!dentry->d_inode) + return oe->opaque; + + /* Negative upper -> positive lower */ + if (!oe->__upperdentry) + return true; + + /* Positive upper -> have to look up lower to see whether it exists */ + for (i = 0; !done && !positive && i < poe->numlower; i++) { + struct dentry *this; + struct dentry *lowerdir = poe->lowerstack[i].dentry; + + this = lookup_one_len_unlocked(name->name, lowerdir, + name->len); + if (IS_ERR(this)) { + switch (PTR_ERR(this)) { + case -ENOENT: + case -ENAMETOOLONG: + break; + + default: + /* + * Assume something is there, we just couldn't + * access it. + */ + positive = true; + break; + } + } else { + if (this->d_inode) { + positive = !ovl_is_whiteout(this); + done = true; + } + dput(this); + } + } + + return positive; +} diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index e218e741cb99..8af450b0e57a 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -9,23 +9,17 @@ #include <linux/kernel.h> -struct ovl_entry; - enum ovl_path_type { - __OVL_PATH_PURE = (1 << 0), - __OVL_PATH_UPPER = (1 << 1), - __OVL_PATH_MERGE = (1 << 2), + __OVL_PATH_UPPER = (1 << 0), + __OVL_PATH_MERGE = (1 << 1), }; #define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER) #define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE) -#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE) -#define OVL_TYPE_MERGE_OR_LOWER(type) \ - (OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type)) - #define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay." 
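The trusted.overlay.* names built from this prefix (the existing "opaque" attribute, and the new "redirect" attribute defined just below) are ordinary extended attributes stored on the upper filesystem. As a rough illustration that is not part of this patch: they can be read from userspace with the stock getxattr(2) call, given CAP_SYS_ADMIN for the trusted namespace; the path used here is hypothetical.

#include <stdio.h>
#include <sys/xattr.h>

int main(void)
{
	/* Hypothetical upper-layer directory left behind by a rename. */
	const char *path = "/upper/renamed-dir";
	char buf[4096];
	ssize_t n;

	/* trusted.* xattrs are only readable with CAP_SYS_ADMIN. */
	n = getxattr(path, "trusted.overlay.redirect", buf, sizeof(buf) - 1);
	if (n < 0) {
		perror("trusted.overlay.redirect");
		return 1;
	}
	buf[n] = '\0';
	/* A plain name means "same parent directory"; a value starting
	 * with '/' is a path from the layer root -- the two forms that
	 * ovl_check_redirect() above accepts. */
	printf("redirect -> %s\n", buf);
	return 0;
}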
#define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX "opaque" +#define OVL_XATTR_REDIRECT OVL_XATTR_PREFIX "redirect" #define OVL_ISUPPER_MASK 1UL @@ -143,35 +137,43 @@ static inline struct inode *ovl_inode_real(struct inode *inode, bool *is_upper) return (struct inode *) (x & ~OVL_ISUPPER_MASK); } +/* util.c */ +int ovl_want_write(struct dentry *dentry); +void ovl_drop_write(struct dentry *dentry); +struct dentry *ovl_workdir(struct dentry *dentry); +const struct cred *ovl_override_creds(struct super_block *sb); +struct ovl_entry *ovl_alloc_entry(unsigned int numlower); +bool ovl_dentry_remote(struct dentry *dentry); +bool ovl_dentry_weird(struct dentry *dentry); enum ovl_path_type ovl_path_type(struct dentry *dentry); -u64 ovl_dentry_version_get(struct dentry *dentry); -void ovl_dentry_version_inc(struct dentry *dentry); void ovl_path_upper(struct dentry *dentry, struct path *path); void ovl_path_lower(struct dentry *dentry, struct path *path); enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); -int ovl_path_next(int idx, struct dentry *dentry, struct path *path); struct dentry *ovl_dentry_upper(struct dentry *dentry); struct dentry *ovl_dentry_lower(struct dentry *dentry); struct dentry *ovl_dentry_real(struct dentry *dentry); -struct vfsmount *ovl_entry_mnt_real(struct ovl_entry *oe, struct inode *inode, - bool is_upper); struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry); void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache); -struct dentry *ovl_workdir(struct dentry *dentry); -int ovl_want_write(struct dentry *dentry); -void ovl_drop_write(struct dentry *dentry); bool ovl_dentry_is_opaque(struct dentry *dentry); -void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque); -bool ovl_is_whiteout(struct dentry *dentry); -const struct cred *ovl_override_creds(struct super_block *sb); +bool ovl_dentry_is_whiteout(struct dentry *dentry); +void ovl_dentry_set_opaque(struct dentry *dentry); +bool ovl_redirect_dir(struct super_block *sb); +void ovl_clear_redirect_dir(struct super_block *sb); +const char *ovl_dentry_get_redirect(struct dentry *dentry); +void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect); void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry); +void ovl_inode_init(struct inode *inode, struct inode *realinode, + bool is_upper); void ovl_inode_update(struct inode *inode, struct inode *upperinode); -struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags); +void ovl_dentry_version_inc(struct dentry *dentry); +u64 ovl_dentry_version_get(struct dentry *dentry); +bool ovl_is_whiteout(struct dentry *dentry); struct file *ovl_path_open(struct path *path, int flags); -struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry, - struct kstat *stat, const char *link); +/* namei.c */ +int ovl_path_next(int idx, struct dentry *dentry, struct path *path); +struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); +bool ovl_lower_positive(struct dentry *dentry); /* readdir.c */ extern const struct file_operations ovl_dir_operations; @@ -195,7 +197,7 @@ int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags); int ovl_update_time(struct inode *inode, struct timespec *ts, int flags); bool ovl_is_private_xattr(const char *name); -struct inode *ovl_new_inode(struct super_block *sb, umode_t mode); +struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev); struct inode 
*ovl_get_inode(struct super_block *sb, struct inode *realinode); static inline void ovl_copyattr(struct inode *from, struct inode *to) { @@ -210,14 +212,18 @@ static inline void ovl_copyattr(struct inode *from, struct inode *to) /* dir.c */ extern const struct inode_operations ovl_dir_inode_operations; struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry); +struct cattr { + dev_t rdev; + umode_t mode; + const char *link; +}; int ovl_create_real(struct inode *dir, struct dentry *newdentry, - struct kstat *stat, const char *link, + struct cattr *attr, struct dentry *hardlink, bool debug); void ovl_cleanup(struct inode *dir, struct dentry *dentry); /* copy_up.c */ int ovl_copy_up(struct dentry *dentry); -int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, - struct path *lowerpath, struct kstat *stat); +int ovl_copy_up_flags(struct dentry *dentry, int flags); int ovl_copy_xattr(struct dentry *old, struct dentry *new); int ovl_set_attr(struct dentry *upper, struct kstat *stat); diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h new file mode 100644 index 000000000000..d14bca1850d9 --- /dev/null +++ b/fs/overlayfs/ovl_entry.h @@ -0,0 +1,53 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * Copyright (C) 2016 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +struct ovl_config { + char *lowerdir; + char *upperdir; + char *workdir; + bool default_permissions; + bool redirect_dir; +}; + +/* private information held for overlayfs's superblock */ +struct ovl_fs { + struct vfsmount *upper_mnt; + unsigned numlower; + struct vfsmount **lower_mnt; + struct dentry *workdir; + long namelen; + /* pathnames of lower and upper dirs, for show_options */ + struct ovl_config config; + /* creds of process who forced instantiation of super block */ + const struct cred *creator_cred; +}; + +/* private information held for every overlayfs dentry */ +struct ovl_entry { + struct dentry *__upperdentry; + struct ovl_dir_cache *cache; + union { + struct { + u64 version; + const char *redirect; + bool opaque; + }; + struct rcu_head rcu; + }; + unsigned numlower; + struct path lowerstack[]; +}; + +struct ovl_entry *ovl_alloc_entry(unsigned int numlower); + +static inline struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe) +{ + return lockless_dereference(oe->__upperdentry); +} diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 0e100856c7b8..20f48abbb82f 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -9,280 +9,29 @@ #include <linux/fs.h> #include <linux/namei.h> -#include <linux/pagemap.h> #include <linux/xattr.h> -#include <linux/security.h> #include <linux/mount.h> -#include <linux/slab.h> #include <linux/parser.h> #include <linux/module.h> -#include <linux/sched.h> #include <linux/statfs.h> #include <linux/seq_file.h> #include <linux/posix_acl_xattr.h> #include "overlayfs.h" +#include "ovl_entry.h" MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Overlay filesystem"); MODULE_LICENSE("GPL"); -struct ovl_config { - char *lowerdir; - char *upperdir; - char *workdir; - bool default_permissions; -}; - -/* private information held for overlayfs's superblock */ -struct ovl_fs { - struct vfsmount *upper_mnt; - unsigned numlower; - struct vfsmount **lower_mnt; - struct dentry *workdir; - long lower_namelen; - /* pathnames of lower and upper dirs, for 
show_options */ - struct ovl_config config; - /* creds of process who forced instantiation of super block */ - const struct cred *creator_cred; -}; struct ovl_dir_cache; -/* private information held for every overlayfs dentry */ -struct ovl_entry { - struct dentry *__upperdentry; - struct ovl_dir_cache *cache; - union { - struct { - u64 version; - bool opaque; - }; - struct rcu_head rcu; - }; - unsigned numlower; - struct path lowerstack[]; -}; - #define OVL_MAX_STACK 500 -static struct dentry *__ovl_dentry_lower(struct ovl_entry *oe) -{ - return oe->numlower ? oe->lowerstack[0].dentry : NULL; -} - -enum ovl_path_type ovl_path_type(struct dentry *dentry) -{ - struct ovl_entry *oe = dentry->d_fsdata; - enum ovl_path_type type = 0; - - if (oe->__upperdentry) { - type = __OVL_PATH_UPPER; - - /* - * Non-dir dentry can hold lower dentry from previous - * location. Its purity depends only on opaque flag. - */ - if (oe->numlower && S_ISDIR(dentry->d_inode->i_mode)) - type |= __OVL_PATH_MERGE; - else if (!oe->opaque) - type |= __OVL_PATH_PURE; - } else { - if (oe->numlower > 1) - type |= __OVL_PATH_MERGE; - } - return type; -} - -static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe) -{ - return lockless_dereference(oe->__upperdentry); -} - -void ovl_path_upper(struct dentry *dentry, struct path *path) -{ - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; - struct ovl_entry *oe = dentry->d_fsdata; - - path->mnt = ofs->upper_mnt; - path->dentry = ovl_upperdentry_dereference(oe); -} - -enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path) -{ - enum ovl_path_type type = ovl_path_type(dentry); - - if (!OVL_TYPE_UPPER(type)) - ovl_path_lower(dentry, path); - else - ovl_path_upper(dentry, path); - - return type; -} - -struct dentry *ovl_dentry_upper(struct dentry *dentry) -{ - struct ovl_entry *oe = dentry->d_fsdata; - - return ovl_upperdentry_dereference(oe); -} - -struct dentry *ovl_dentry_lower(struct dentry *dentry) -{ - struct ovl_entry *oe = dentry->d_fsdata; - - return __ovl_dentry_lower(oe); -} - -struct dentry *ovl_dentry_real(struct dentry *dentry) -{ - struct ovl_entry *oe = dentry->d_fsdata; - struct dentry *realdentry; - - realdentry = ovl_upperdentry_dereference(oe); - if (!realdentry) - realdentry = __ovl_dentry_lower(oe); - - return realdentry; -} - -static void ovl_inode_init(struct inode *inode, struct inode *realinode, - bool is_upper) -{ - WRITE_ONCE(inode->i_private, (unsigned long) realinode | - (is_upper ? OVL_ISUPPER_MASK : 0)); -} - -struct vfsmount *ovl_entry_mnt_real(struct ovl_entry *oe, struct inode *inode, - bool is_upper) -{ - if (is_upper) { - struct ovl_fs *ofs = inode->i_sb->s_fs_info; - - return ofs->upper_mnt; - } else { - return oe->numlower ? oe->lowerstack[0].mnt : NULL; - } -} - -struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry) -{ - struct ovl_entry *oe = dentry->d_fsdata; - - return oe->cache; -} - -void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache) -{ - struct ovl_entry *oe = dentry->d_fsdata; - - oe->cache = cache; -} - -void ovl_path_lower(struct dentry *dentry, struct path *path) -{ - struct ovl_entry *oe = dentry->d_fsdata; - - *path = oe->numlower ? 
oe->lowerstack[0] : (struct path) { NULL, NULL }; -} - -int ovl_want_write(struct dentry *dentry) -{ - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; - return mnt_want_write(ofs->upper_mnt); -} - -void ovl_drop_write(struct dentry *dentry) -{ - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; - mnt_drop_write(ofs->upper_mnt); -} - -struct dentry *ovl_workdir(struct dentry *dentry) -{ - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; - return ofs->workdir; -} - -bool ovl_dentry_is_opaque(struct dentry *dentry) -{ - struct ovl_entry *oe = dentry->d_fsdata; - return oe->opaque; -} - -void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque) -{ - struct ovl_entry *oe = dentry->d_fsdata; - oe->opaque = opaque; -} - -void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry) -{ - struct ovl_entry *oe = dentry->d_fsdata; - - WARN_ON(!inode_is_locked(upperdentry->d_parent->d_inode)); - WARN_ON(oe->__upperdentry); - /* - * Make sure upperdentry is consistent before making it visible to - * ovl_upperdentry_dereference(). - */ - smp_wmb(); - oe->__upperdentry = upperdentry; -} - -void ovl_inode_update(struct inode *inode, struct inode *upperinode) -{ - WARN_ON(!upperinode); - WARN_ON(!inode_unhashed(inode)); - WRITE_ONCE(inode->i_private, - (unsigned long) upperinode | OVL_ISUPPER_MASK); - if (!S_ISDIR(upperinode->i_mode)) - __insert_inode_hash(inode, (unsigned long) upperinode); -} - -void ovl_dentry_version_inc(struct dentry *dentry) -{ - struct ovl_entry *oe = dentry->d_fsdata; - - WARN_ON(!inode_is_locked(dentry->d_inode)); - oe->version++; -} - -u64 ovl_dentry_version_get(struct dentry *dentry) -{ - struct ovl_entry *oe = dentry->d_fsdata; - - WARN_ON(!inode_is_locked(dentry->d_inode)); - return oe->version; -} - -bool ovl_is_whiteout(struct dentry *dentry) -{ - struct inode *inode = dentry->d_inode; - - return inode && IS_WHITEOUT(inode); -} - -const struct cred *ovl_override_creds(struct super_block *sb) -{ - struct ovl_fs *ofs = sb->s_fs_info; - - return override_creds(ofs->creator_cred); -} - -static bool ovl_is_opaquedir(struct dentry *dentry) -{ - int res; - char val; - - if (!d_is_dir(dentry)) - return false; - - res = vfs_getxattr(dentry, OVL_XATTR_OPAQUE, &val, 1); - if (res == 1 && val == 'y') - return true; - - return false; -} +static bool ovl_redirect_dir_def = IS_ENABLED(CONFIG_OVERLAY_FS_REDIRECT_DIR); +module_param_named(redirect_dir, ovl_redirect_dir_def, bool, 0644); +MODULE_PARM_DESC(ovl_redirect_dir_def, + "Default to on or off for the redirect_dir feature"); static void ovl_dentry_release(struct dentry *dentry) { @@ -292,6 +41,7 @@ static void ovl_dentry_release(struct dentry *dentry) unsigned int i; dput(oe->__upperdentry); + kfree(oe->redirect); for (i = 0; i < oe->numlower; i++) dput(oe->lowerstack[i].dentry); kfree_rcu(oe, rcu); @@ -304,7 +54,7 @@ static struct dentry *ovl_d_real(struct dentry *dentry, { struct dentry *real; - if (d_is_dir(dentry)) { + if (!d_is_reg(dentry)) { if (!inode || inode == d_inode(dentry)) return dentry; goto bug; @@ -392,226 +142,6 @@ static const struct dentry_operations ovl_reval_dentry_operations = { .d_weak_revalidate = ovl_dentry_weak_revalidate, }; -static struct ovl_entry *ovl_alloc_entry(unsigned int numlower) -{ - size_t size = offsetof(struct ovl_entry, lowerstack[numlower]); - struct ovl_entry *oe = kzalloc(size, GFP_KERNEL); - - if (oe) - oe->numlower = numlower; - - return oe; -} - -static bool ovl_dentry_remote(struct dentry *dentry) -{ - return dentry->d_flags & - (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE | 
- DCACHE_OP_REAL); -} - -static bool ovl_dentry_weird(struct dentry *dentry) -{ - return dentry->d_flags & (DCACHE_NEED_AUTOMOUNT | - DCACHE_MANAGE_TRANSIT | - DCACHE_OP_HASH | - DCACHE_OP_COMPARE); -} - -static inline struct dentry *ovl_lookup_real(struct dentry *dir, - const struct qstr *name) -{ - struct dentry *dentry; - - dentry = lookup_one_len_unlocked(name->name, dir, name->len); - - if (IS_ERR(dentry)) { - if (PTR_ERR(dentry) == -ENOENT) - dentry = NULL; - } else if (!dentry->d_inode) { - dput(dentry); - dentry = NULL; - } else if (ovl_dentry_weird(dentry)) { - dput(dentry); - /* Don't support traversing automounts and other weirdness */ - dentry = ERR_PTR(-EREMOTE); - } - return dentry; -} - -/* - * Returns next layer in stack starting from top. - * Returns -1 if this is the last layer. - */ -int ovl_path_next(int idx, struct dentry *dentry, struct path *path) -{ - struct ovl_entry *oe = dentry->d_fsdata; - - BUG_ON(idx < 0); - if (idx == 0) { - ovl_path_upper(dentry, path); - if (path->dentry) - return oe->numlower ? 1 : -1; - idx++; - } - BUG_ON(idx > oe->numlower); - *path = oe->lowerstack[idx - 1]; - - return (idx < oe->numlower) ? idx + 1 : -1; -} - -struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags) -{ - struct ovl_entry *oe; - const struct cred *old_cred; - struct ovl_entry *poe = dentry->d_parent->d_fsdata; - struct path *stack = NULL; - struct dentry *upperdir, *upperdentry = NULL; - unsigned int ctr = 0; - struct inode *inode = NULL; - bool upperopaque = false; - struct dentry *this, *prev = NULL; - unsigned int i; - int err; - - old_cred = ovl_override_creds(dentry->d_sb); - upperdir = ovl_upperdentry_dereference(poe); - if (upperdir) { - this = ovl_lookup_real(upperdir, &dentry->d_name); - err = PTR_ERR(this); - if (IS_ERR(this)) - goto out; - - if (this) { - if (unlikely(ovl_dentry_remote(this))) { - dput(this); - err = -EREMOTE; - goto out; - } - if (ovl_is_whiteout(this)) { - dput(this); - this = NULL; - upperopaque = true; - } else if (poe->numlower && ovl_is_opaquedir(this)) { - upperopaque = true; - } - } - upperdentry = prev = this; - } - - if (!upperopaque && poe->numlower) { - err = -ENOMEM; - stack = kcalloc(poe->numlower, sizeof(struct path), GFP_KERNEL); - if (!stack) - goto out_put_upper; - } - - for (i = 0; !upperopaque && i < poe->numlower; i++) { - bool opaque = false; - struct path lowerpath = poe->lowerstack[i]; - - this = ovl_lookup_real(lowerpath.dentry, &dentry->d_name); - err = PTR_ERR(this); - if (IS_ERR(this)) { - /* - * If it's positive, then treat ENAMETOOLONG as ENOENT. - */ - if (err == -ENAMETOOLONG && (upperdentry || ctr)) - continue; - goto out_put; - } - if (!this) - continue; - if (ovl_is_whiteout(this)) { - dput(this); - break; - } - /* - * Only makes sense to check opaque dir if this is not the - * lowermost layer. - */ - if (i < poe->numlower - 1 && ovl_is_opaquedir(this)) - opaque = true; - - if (prev && (!S_ISDIR(prev->d_inode->i_mode) || - !S_ISDIR(this->d_inode->i_mode))) { - /* - * FIXME: check for upper-opaqueness maybe better done - * in remove code. - */ - if (prev == upperdentry) - upperopaque = true; - dput(this); - break; - } - /* - * If this is a non-directory then stop here. 
- */ - if (!S_ISDIR(this->d_inode->i_mode)) - opaque = true; - - stack[ctr].dentry = this; - stack[ctr].mnt = lowerpath.mnt; - ctr++; - prev = this; - if (opaque) - break; - } - - oe = ovl_alloc_entry(ctr); - err = -ENOMEM; - if (!oe) - goto out_put; - - if (upperdentry || ctr) { - struct dentry *realdentry; - struct inode *realinode; - - realdentry = upperdentry ? upperdentry : stack[0].dentry; - realinode = d_inode(realdentry); - - err = -ENOMEM; - if (upperdentry && !d_is_dir(upperdentry)) { - inode = ovl_get_inode(dentry->d_sb, realinode); - } else { - inode = ovl_new_inode(dentry->d_sb, realinode->i_mode); - if (inode) - ovl_inode_init(inode, realinode, !!upperdentry); - } - if (!inode) - goto out_free_oe; - ovl_copyattr(realdentry->d_inode, inode); - } - - revert_creds(old_cred); - oe->opaque = upperopaque; - oe->__upperdentry = upperdentry; - memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr); - kfree(stack); - dentry->d_fsdata = oe; - d_add(dentry, inode); - - return NULL; - -out_free_oe: - kfree(oe); -out_put: - for (i = 0; i < ctr; i++) - dput(stack[i].dentry); - kfree(stack); -out_put_upper: - dput(upperdentry); -out: - revert_creds(old_cred); - return ERR_PTR(err); -} - -struct file *ovl_path_open(struct path *path, int flags) -{ - return dentry_open(path, flags | O_NOATIME, current_cred()); -} - static void ovl_put_super(struct super_block *sb) { struct ovl_fs *ufs = sb->s_fs_info; @@ -649,7 +179,7 @@ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) err = vfs_statfs(&path, buf); if (!err) { - buf->f_namelen = max(buf->f_namelen, ofs->lower_namelen); + buf->f_namelen = ofs->namelen; buf->f_type = OVERLAYFS_SUPER_MAGIC; } @@ -674,6 +204,9 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) } if (ufs->config.default_permissions) seq_puts(m, ",default_permissions"); + if (ufs->config.redirect_dir != ovl_redirect_dir_def) + seq_printf(m, ",redirect_dir=%s", + ufs->config.redirect_dir ? 
"on" : "off"); return 0; } @@ -700,6 +233,8 @@ enum { OPT_UPPERDIR, OPT_WORKDIR, OPT_DEFAULT_PERMISSIONS, + OPT_REDIRECT_DIR_ON, + OPT_REDIRECT_DIR_OFF, OPT_ERR, }; @@ -708,6 +243,8 @@ static const match_table_t ovl_tokens = { {OPT_UPPERDIR, "upperdir=%s"}, {OPT_WORKDIR, "workdir=%s"}, {OPT_DEFAULT_PERMISSIONS, "default_permissions"}, + {OPT_REDIRECT_DIR_ON, "redirect_dir=on"}, + {OPT_REDIRECT_DIR_OFF, "redirect_dir=off"}, {OPT_ERR, NULL} }; @@ -772,6 +309,14 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) config->default_permissions = true; break; + case OPT_REDIRECT_DIR_ON: + config->redirect_dir = true; + break; + + case OPT_REDIRECT_DIR_OFF: + config->redirect_dir = false; + break; + default: pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p); return -EINVAL; @@ -809,12 +354,9 @@ retry: strlen(OVL_WORKDIR_NAME)); if (!IS_ERR(work)) { - struct kstat stat = { - .mode = S_IFDIR | 0, - }; struct iattr attr = { .ia_valid = ATTR_MODE, - .ia_mode = stat.mode, + .ia_mode = S_IFDIR | 0, }; if (work->d_inode) { @@ -828,7 +370,9 @@ retry: goto retry; } - err = ovl_create_real(dir, work, &stat, NULL, NULL, true); + err = ovl_create_real(dir, work, + &(struct cattr){.mode = S_IFDIR | 0}, + NULL, true); if (err) goto out_dput; @@ -903,7 +447,7 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path) pr_err("overlayfs: filesystem on '%s' not supported\n", name); goto out_put; } - if (!S_ISDIR(path->dentry->d_inode->i_mode)) { + if (!d_is_dir(path->dentry)) { pr_err("overlayfs: '%s' not a directory\n", name); goto out_put; } @@ -936,22 +480,33 @@ static int ovl_mount_dir(const char *name, struct path *path) return err; } -static int ovl_lower_dir(const char *name, struct path *path, long *namelen, - int *stack_depth, bool *remote) +static int ovl_check_namelen(struct path *path, struct ovl_fs *ofs, + const char *name) { - int err; struct kstatfs statfs; + int err = vfs_statfs(path, &statfs); + + if (err) + pr_err("overlayfs: statfs failed on '%s'\n", name); + else + ofs->namelen = max(ofs->namelen, statfs.f_namelen); + + return err; +} + +static int ovl_lower_dir(const char *name, struct path *path, + struct ovl_fs *ofs, int *stack_depth, bool *remote) +{ + int err; err = ovl_mount_dir_noesc(name, path); if (err) goto out; - err = vfs_statfs(path, &statfs); - if (err) { - pr_err("overlayfs: statfs failed on '%s'\n", name); + err = ovl_check_namelen(path, ofs, name); + if (err) goto out_put; - } - *namelen = max(*namelen, statfs.f_namelen); + *stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth); if (ovl_dentry_remote(path->dentry)) @@ -1067,7 +622,7 @@ static int ovl_own_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size) { - return -EPERM; + return -EOPNOTSUPP; } static int ovl_own_xattr_set(const struct xattr_handler *handler, @@ -1075,7 +630,7 @@ static int ovl_own_xattr_set(const struct xattr_handler *handler, const char *name, const void *value, size_t size, int flags) { - return -EPERM; + return -EOPNOTSUPP; } static int ovl_other_xattr_get(const struct xattr_handler *handler, @@ -1153,6 +708,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (!ufs) goto out; + ufs->config.redirect_dir = ovl_redirect_dir_def; err = ovl_parse_opt((char *) data, &ufs->config); if (err) goto out_free_config; @@ -1183,6 +739,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) goto out_put_upperpath; } + err 
= ovl_check_namelen(&upperpath, ufs, ufs->config.upperdir); + if (err) + goto out_put_upperpath; + err = ovl_mount_dir(ufs->config.workdir, &workpath); if (err) goto out_put_upperpath; @@ -1214,15 +774,16 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) goto out_free_lowertmp; } + err = -ENOMEM; stack = kcalloc(stacklen, sizeof(struct path), GFP_KERNEL); if (!stack) goto out_free_lowertmp; + err = -EINVAL; lower = lowertmp; for (numlower = 0; numlower < stacklen; numlower++) { - err = ovl_lower_dir(lower, &stack[numlower], - &ufs->lower_namelen, &sb->s_stack_depth, - &remote); + err = ovl_lower_dir(lower, &stack[numlower], ufs, + &sb->s_stack_depth, &remote); if (err) goto out_put_lowerpath; @@ -1324,7 +885,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = ufs; sb->s_flags |= MS_POSIXACL | MS_NOREMOTELOCK; - root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR)); + root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, 0)); if (!root_dentry) goto out_free_oe; diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c new file mode 100644 index 000000000000..952286f4826c --- /dev/null +++ b/fs/overlayfs/util.c @@ -0,0 +1,265 @@ +/* + * Copyright (C) 2011 Novell Inc. + * Copyright (C) 2016 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/slab.h> +#include <linux/xattr.h> +#include "overlayfs.h" +#include "ovl_entry.h" + +int ovl_want_write(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + return mnt_want_write(ofs->upper_mnt); +} + +void ovl_drop_write(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + mnt_drop_write(ofs->upper_mnt); +} + +struct dentry *ovl_workdir(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + return ofs->workdir; +} + +const struct cred *ovl_override_creds(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + return override_creds(ofs->creator_cred); +} + +struct ovl_entry *ovl_alloc_entry(unsigned int numlower) +{ + size_t size = offsetof(struct ovl_entry, lowerstack[numlower]); + struct ovl_entry *oe = kzalloc(size, GFP_KERNEL); + + if (oe) + oe->numlower = numlower; + + return oe; +} + +bool ovl_dentry_remote(struct dentry *dentry) +{ + return dentry->d_flags & + (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE | + DCACHE_OP_REAL); +} + +bool ovl_dentry_weird(struct dentry *dentry) +{ + return dentry->d_flags & (DCACHE_NEED_AUTOMOUNT | + DCACHE_MANAGE_TRANSIT | + DCACHE_OP_HASH | + DCACHE_OP_COMPARE); +} + +enum ovl_path_type ovl_path_type(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + enum ovl_path_type type = 0; + + if (oe->__upperdentry) { + type = __OVL_PATH_UPPER; + + /* + * Non-dir dentry can hold lower dentry from previous + * location. 
+ */ + if (oe->numlower && d_is_dir(dentry)) + type |= __OVL_PATH_MERGE; + } else { + if (oe->numlower > 1) + type |= __OVL_PATH_MERGE; + } + return type; +} + +void ovl_path_upper(struct dentry *dentry, struct path *path) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_entry *oe = dentry->d_fsdata; + + path->mnt = ofs->upper_mnt; + path->dentry = ovl_upperdentry_dereference(oe); +} + +void ovl_path_lower(struct dentry *dentry, struct path *path) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + *path = oe->numlower ? oe->lowerstack[0] : (struct path) { NULL, NULL }; +} + +enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path) +{ + enum ovl_path_type type = ovl_path_type(dentry); + + if (!OVL_TYPE_UPPER(type)) + ovl_path_lower(dentry, path); + else + ovl_path_upper(dentry, path); + + return type; +} + +struct dentry *ovl_dentry_upper(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return ovl_upperdentry_dereference(oe); +} + +static struct dentry *__ovl_dentry_lower(struct ovl_entry *oe) +{ + return oe->numlower ? oe->lowerstack[0].dentry : NULL; +} + +struct dentry *ovl_dentry_lower(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return __ovl_dentry_lower(oe); +} + +struct dentry *ovl_dentry_real(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + struct dentry *realdentry; + + realdentry = ovl_upperdentry_dereference(oe); + if (!realdentry) + realdentry = __ovl_dentry_lower(oe); + + return realdentry; +} + +struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->cache; +} + +void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + oe->cache = cache; +} + +bool ovl_dentry_is_opaque(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + return oe->opaque; +} + +bool ovl_dentry_is_whiteout(struct dentry *dentry) +{ + return !dentry->d_inode && ovl_dentry_is_opaque(dentry); +} + +void ovl_dentry_set_opaque(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + oe->opaque = true; +} + +bool ovl_redirect_dir(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + return ofs->config.redirect_dir; +} + +void ovl_clear_redirect_dir(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + ofs->config.redirect_dir = false; +} + +const char *ovl_dentry_get_redirect(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->redirect; +} + +void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + kfree(oe->redirect); + oe->redirect = redirect; +} + +void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + WARN_ON(!inode_is_locked(upperdentry->d_parent->d_inode)); + WARN_ON(oe->__upperdentry); + /* + * Make sure upperdentry is consistent before making it visible to + * ovl_upperdentry_dereference(). + */ + smp_wmb(); + oe->__upperdentry = upperdentry; +} + +void ovl_inode_init(struct inode *inode, struct inode *realinode, bool is_upper) +{ + WRITE_ONCE(inode->i_private, (unsigned long) realinode | + (is_upper ? 
OVL_ISUPPER_MASK : 0)); +} + +void ovl_inode_update(struct inode *inode, struct inode *upperinode) +{ + WARN_ON(!upperinode); + WARN_ON(!inode_unhashed(inode)); + WRITE_ONCE(inode->i_private, + (unsigned long) upperinode | OVL_ISUPPER_MASK); + if (!S_ISDIR(upperinode->i_mode)) + __insert_inode_hash(inode, (unsigned long) upperinode); +} + +void ovl_dentry_version_inc(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + WARN_ON(!inode_is_locked(dentry->d_inode)); + oe->version++; +} + +u64 ovl_dentry_version_get(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + WARN_ON(!inode_is_locked(dentry->d_inode)); + return oe->version; +} + +bool ovl_is_whiteout(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + return inode && IS_WHITEOUT(inode); +} + +struct file *ovl_path_open(struct path *path, int flags) +{ + return dentry_open(path, flags | O_NOATIME, current_cred()); +} diff --git a/fs/pipe.c b/fs/pipe.c index 8e0d9f26dfad..73b84baf58f8 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -23,7 +23,7 @@ #include <linux/fcntl.h> #include <linux/memcontrol.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/ioctls.h> #include "internal.h" diff --git a/fs/pnode.c b/fs/pnode.c index 234a9ac49958..06a793f4ae38 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -67,49 +67,47 @@ int get_dominating_id(struct mount *mnt, const struct path *root) static int do_make_slave(struct mount *mnt) { - struct mount *peer_mnt = mnt, *master = mnt->mnt_master; - struct mount *slave_mnt; + struct mount *master, *slave_mnt; - /* - * slave 'mnt' to a peer mount that has the - * same root dentry. If none is available then - * slave it to anything that is available. - */ - while ((peer_mnt = next_peer(peer_mnt)) != mnt && - peer_mnt->mnt.mnt_root != mnt->mnt.mnt_root) ; - - if (peer_mnt == mnt) { - peer_mnt = next_peer(mnt); - if (peer_mnt == mnt) - peer_mnt = NULL; - } - if (mnt->mnt_group_id && IS_MNT_SHARED(mnt) && - list_empty(&mnt->mnt_share)) - mnt_release_group_id(mnt); - - list_del_init(&mnt->mnt_share); - mnt->mnt_group_id = 0; - - if (peer_mnt) - master = peer_mnt; - - if (master) { - list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave) - slave_mnt->mnt_master = master; - list_move(&mnt->mnt_slave, &master->mnt_slave_list); - list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev); - INIT_LIST_HEAD(&mnt->mnt_slave_list); + if (list_empty(&mnt->mnt_share)) { + if (IS_MNT_SHARED(mnt)) { + mnt_release_group_id(mnt); + CLEAR_MNT_SHARED(mnt); + } + master = mnt->mnt_master; + if (!master) { + struct list_head *p = &mnt->mnt_slave_list; + while (!list_empty(p)) { + slave_mnt = list_first_entry(p, + struct mount, mnt_slave); + list_del_init(&slave_mnt->mnt_slave); + slave_mnt->mnt_master = NULL; + } + return 0; + } } else { - struct list_head *p = &mnt->mnt_slave_list; - while (!list_empty(p)) { - slave_mnt = list_first_entry(p, - struct mount, mnt_slave); - list_del_init(&slave_mnt->mnt_slave); - slave_mnt->mnt_master = NULL; + struct mount *m; + /* + * slave 'mnt' to a peer mount that has the + * same root dentry. If none is available then + * slave it to anything that is available. 
+ */ + for (m = master = next_peer(mnt); m != mnt; m = next_peer(m)) { + if (m->mnt.mnt_root == mnt->mnt.mnt_root) { + master = m; + break; + } } + list_del_init(&mnt->mnt_share); + mnt->mnt_group_id = 0; + CLEAR_MNT_SHARED(mnt); } + list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave) + slave_mnt->mnt_master = master; + list_move(&mnt->mnt_slave, &master->mnt_slave_list); + list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev); + INIT_LIST_HEAD(&mnt->mnt_slave_list); mnt->mnt_master = master; - CLEAR_MNT_SHARED(mnt); return 0; } diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 595522022aca..c9d48dc78495 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -922,11 +922,10 @@ int simple_set_acl(struct inode *inode, struct posix_acl *acl, int type) int error; if (type == ACL_TYPE_ACCESS) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) - return 0; - if (error == 0) - acl = NULL; + error = posix_acl_update_mode(inode, + &inode->i_mode, &acl); + if (error) + return error; } inode->i_ctime = current_time(inode); diff --git a/fs/proc/array.c b/fs/proc/array.c index 81818adb8e9e..51a4213afa2e 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -245,7 +245,7 @@ void render_sigset_t(struct seq_file *m, const char *header, if (sigismember(set, i+2)) x |= 2; if (sigismember(set, i+3)) x |= 4; if (sigismember(set, i+4)) x |= 8; - seq_printf(m, "%x", x); + seq_putc(m, hex_asc[x]); } while (i >= 4); seq_putc(m, '\n'); @@ -342,10 +342,11 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) static inline void task_seccomp(struct seq_file *m, struct task_struct *p) { + seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p)); #ifdef CONFIG_SECCOMP - seq_put_decimal_ull(m, "Seccomp:\t", p->seccomp.mode); - seq_putc(m, '\n'); + seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode); #endif + seq_putc(m, '\n'); } static inline void task_context_switch_counts(struct seq_file *m, diff --git a/fs/proc/base.c b/fs/proc/base.c index ca651ac00660..87c9a9aacda3 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -47,7 +47,7 @@ * Overall revision about smaps. */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/errno.h> #include <linux/time.h> @@ -104,9 +104,12 @@ * in /proc for a task before it execs a suid executable. */ +static u8 nlink_tid; +static u8 nlink_tgid; + struct pid_entry { const char *name; - int len; + unsigned int len; umode_t mode; const struct inode_operations *iop; const struct file_operations *fop; @@ -139,13 +142,13 @@ struct pid_entry { * Count the number of hardlinks for the pid_entry table, excluding the . * and .. links. 
*/ -static unsigned int pid_entry_count_dirs(const struct pid_entry *entries, +static unsigned int __init pid_entry_nlink(const struct pid_entry *entries, unsigned int n) { unsigned int i; unsigned int count; - count = 0; + count = 2; for (i = 0; i < n; ++i) { if (S_ISDIR(entries[i].mode)) ++count; @@ -1243,7 +1246,7 @@ static const struct file_operations proc_oom_score_adj_operations = { }; #ifdef CONFIG_AUDITSYSCALL -#define TMPBUFLEN 21 +#define TMPBUFLEN 11 static ssize_t proc_loginuid_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { @@ -1664,7 +1667,8 @@ const struct inode_operations proc_pid_link_inode_operations = { /* building an inode */ -struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task) +struct inode *proc_pid_make_inode(struct super_block * sb, + struct task_struct *task, umode_t mode) { struct inode * inode; struct proc_inode *ei; @@ -1678,6 +1682,7 @@ struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *t /* Common stuff */ ei = PROC_I(inode); + inode->i_mode = mode; inode->i_ino = get_next_ino(); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_op = &proc_def_inode_operations; @@ -1967,7 +1972,7 @@ out: struct map_files_info { fmode_t mode; - unsigned long len; + unsigned int len; unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ }; @@ -2004,7 +2009,9 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, struct proc_inode *ei; struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task); + inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK | + ((mode & FMODE_READ ) ? S_IRUSR : 0) | + ((mode & FMODE_WRITE) ? S_IWUSR : 0)); if (!inode) return -ENOENT; @@ -2013,12 +2020,6 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, inode->i_op = &proc_map_files_link_inode_operations; inode->i_size = 64; - inode->i_mode = S_IFLNK; - - if (mode & FMODE_READ) - inode->i_mode |= S_IRUSR; - if (mode & FMODE_WRITE) - inode->i_mode |= S_IWUSR; d_set_d_op(dentry, &tid_map_files_dentry_operations); d_add(dentry, inode); @@ -2372,12 +2373,11 @@ static int proc_pident_instantiate(struct inode *dir, struct inode *inode; struct proc_inode *ei; - inode = proc_pid_make_inode(dir->i_sb, task); + inode = proc_pid_make_inode(dir->i_sb, task, p->mode); if (!inode) goto out; ei = PROC_I(inode); - inode->i_mode = p->mode; if (S_ISDIR(inode->i_mode)) set_nlink(inode, 2); /* Use getattr to fix if necessary */ if (p->iop) @@ -2412,14 +2412,14 @@ static struct dentry *proc_pident_lookup(struct inode *dir, * Yes, it does not scale. And it should not. Don't add * new entries into /proc/<tgid>/ without very good reasons. 
*/ - last = &ents[nents - 1]; - for (p = ents; p <= last; p++) { + last = &ents[nents]; + for (p = ents; p < last; p++) { if (p->len != dentry->d_name.len) continue; if (!memcmp(dentry->d_name.name, p->name, p->len)) break; } - if (p > last) + if (p >= last) goto out; error = proc_pident_instantiate(dir, dentry, task, p); @@ -2444,7 +2444,7 @@ static int proc_pident_readdir(struct file *file, struct dir_context *ctx, if (ctx->pos >= nents + 2) goto out; - for (p = ents + (ctx->pos - 2); p <= ents + nents - 1; p++) { + for (p = ents + (ctx->pos - 2); p < ents + nents; p++) { if (!proc_fill_cache(file, ctx, p->name, p->len, proc_pident_instantiate, task, p)) break; @@ -3059,17 +3059,15 @@ static int proc_pid_instantiate(struct inode *dir, { struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task); + inode = proc_pid_make_inode(dir->i_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) goto out; - inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; inode->i_op = &proc_tgid_base_inode_operations; inode->i_fop = &proc_tgid_base_operations; inode->i_flags|=S_IMMUTABLE; - set_nlink(inode, 2 + pid_entry_count_dirs(tgid_base_stuff, - ARRAY_SIZE(tgid_base_stuff))); + set_nlink(inode, nlink_tgid); d_set_d_op(dentry, &pid_dentry_operations); @@ -3181,6 +3179,8 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) iter.tgid += 1, iter = next_tgid(ns, iter)) { char name[PROC_NUMBUF]; int len; + + cond_resched(); if (!has_pid_permissions(ns, iter.task, 2)) continue; @@ -3352,17 +3352,15 @@ static int proc_task_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task); + inode = proc_pid_make_inode(dir->i_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) goto out; - inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; inode->i_op = &proc_tid_base_inode_operations; inode->i_fop = &proc_tid_base_operations; inode->i_flags|=S_IMMUTABLE; - set_nlink(inode, 2 + pid_entry_count_dirs(tid_base_stuff, - ARRAY_SIZE(tid_base_stuff))); + set_nlink(inode, nlink_tid); d_set_d_op(dentry, &pid_dentry_operations); @@ -3552,3 +3550,9 @@ static const struct file_operations proc_task_operations = { .iterate_shared = proc_task_readdir, .llseek = generic_file_llseek, }; + +void __init set_proc_pid_nlink(void) +{ + nlink_tid = pid_entry_nlink(tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); + nlink_tgid = pid_entry_nlink(tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); +} diff --git a/fs/proc/fd.c b/fs/proc/fd.c index d21dafef3102..4274f83bf100 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -183,14 +183,13 @@ proc_fd_instantiate(struct inode *dir, struct dentry *dentry, struct proc_inode *ei; struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task); + inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK); if (!inode) goto out; ei = PROC_I(inode); ei->fd = fd; - inode->i_mode = S_IFLNK; inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; @@ -322,14 +321,13 @@ proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry, struct proc_inode *ei; struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task); + inode = proc_pid_make_inode(dir->i_sb, task, S_IFREG | S_IRUSR); if (!inode) goto out; ei = PROC_I(inode); ei->fd = fd; - inode->i_mode = S_IFREG | S_IRUSR; inode->i_fop = &proc_fdinfo_file_operations; d_set_d_op(dentry, &tid_fd_dentry_operations); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 5f2dc2032c79..f6a01f09f79d 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c 
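A side note on the proc_pident_lookup() hunk above: the scan used to keep a pointer to the final element and test p <= last; it is rewritten to the conventional past-the-end form, where last = &ents[nents] points one element beyond the table and p >= last cleanly means "not found". A small self-contained sketch of the idiom (the table contents here are made up):

#include <stdio.h>
#include <string.h>

struct entry { const char *name; };

/* Stand-in for tgid_base_stuff/tid_base_stuff; names are hypothetical. */
static const struct entry ents[] = {
	{ "stat" }, { "status" }, { "maps" },
};

int main(void)
{
	const struct entry *p;
	const struct entry *last = ents + sizeof(ents) / sizeof(ents[0]);
	const char *needle = "status";

	/* 'last' is one past the final element: every entry is visited
	 * and falling off the end is detected with p >= last. */
	for (p = ents; p < last; p++)
		if (strcmp(p->name, needle) == 0)
			break;

	if (p >= last)
		printf("%s: not found\n", needle);
	else
		printf("found %s at index %td\n", p->name, p - ents);
	return 0;
}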
@@ -22,7 +22,7 @@ #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/completion.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" @@ -479,6 +479,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name) } return ent; } +EXPORT_SYMBOL(proc_create_mount_point); struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, diff --git a/fs/proc/inode.c b/fs/proc/inode.c index e69ebe648a34..842a5ff5b85c 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -24,7 +24,7 @@ #include <linux/mount.h> #include <linux/magic.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" @@ -138,6 +138,16 @@ static void unuse_pde(struct proc_dir_entry *pde) /* pde is locked */ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) { + /* + * close() (proc_reg_release()) can't delete an entry and proceed: + * ->release hook needs to be available at the right moment. + * + * rmmod (remove_proc_entry() et al) can't delete an entry and proceed: + * "struct file" needs to be available at the right moment. + * + * Therefore, first process to enter this function does ->release() and + * signals its completion to the other process which does nothing. + */ if (pdeo->closing) { /* somebody else is doing that, just wait */ DECLARE_COMPLETION_ONSTACK(c); @@ -147,12 +157,13 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) spin_lock(&pde->pde_unload_lock); } else { struct file *file; - pdeo->closing = 1; + pdeo->closing = true; spin_unlock(&pde->pde_unload_lock); file = pdeo->file; pde->proc_fops->release(file_inode(file), file); spin_lock(&pde->pde_unload_lock); - list_del_init(&pdeo->lh); + /* After ->release. */ + list_del(&pdeo->lh); if (pdeo->c) complete(pdeo->c); kfree(pdeo); @@ -167,6 +178,8 @@ void proc_entry_rundown(struct proc_dir_entry *de) if (atomic_add_return(BIAS, &de->in_use) != BIAS) wait_for_completion(&c); + /* ->pde_openers list can't grow from now on. */ + spin_lock(&de->pde_unload_lock); while (!list_empty(&de->pde_openers)) { struct pde_opener *pdeo; @@ -312,16 +325,17 @@ static int proc_reg_open(struct inode *inode, struct file *file) struct pde_opener *pdeo; /* - * What for, you ask? Well, we can have open, rmmod, remove_proc_entry - * sequence. ->release won't be called because ->proc_fops will be - * cleared. Depending on complexity of ->release, consequences vary. + * Ensure that + * 1) PDE's ->release hook will be called no matter what + * either normally by close()/->release, or forcefully by + * rmmod/remove_proc_entry. + * + * 2) rmmod isn't blocked by opening file in /proc and sitting on + * the descriptor (including "rmmod foo </proc/foo" scenario). * - * We can't wait for mercy when close will be done for real, it's - * deadlockable: rmmod foo </proc/foo . So, we're going to do ->release - * by hand in remove_proc_entry(). For this, save opener's credentials - * for later. + * Save every "struct file" with custom ->release hook. */ - pdeo = kzalloc(sizeof(struct pde_opener), GFP_KERNEL); + pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL); if (!pdeo) return -ENOMEM; @@ -338,7 +352,8 @@ static int proc_reg_open(struct inode *inode, struct file *file) if (rv == 0 && release) { /* To know what to release. */ pdeo->file = file; - /* Strictly for "too late" ->release in proc_reg_release(). 
*/ + pdeo->closing = false; + pdeo->c = NULL; spin_lock(&pde->pde_unload_lock); list_add(&pdeo->lh, &pde->pde_openers); spin_unlock(&pde->pde_unload_lock); @@ -410,7 +425,6 @@ static const char *proc_get_link(struct dentry *dentry, } const struct inode_operations proc_link_inode_operations = { - .readlink = generic_readlink, .get_link = proc_get_link, }; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 5378441ec1b7..2de5194ba378 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -162,7 +162,7 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, extern const struct dentry_operations pid_dentry_operations; extern int pid_getattr(struct vfsmount *, struct dentry *, struct kstat *); extern int proc_setattr(struct dentry *, struct iattr *); -extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *); +extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); extern int pid_revalidate(struct dentry *, unsigned int); extern int pid_delete_dentry(const struct dentry *); extern int proc_pid_readdir(struct file *, struct dir_context *); @@ -195,7 +195,6 @@ static inline bool is_empty_pde(const struct proc_dir_entry *pde) { return S_ISDIR(pde->mode) && !pde->proc_iops; } -struct proc_dir_entry *proc_create_mount_point(const char *name); /* * inode.c @@ -203,7 +202,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name); struct pde_opener { struct file *file; struct list_head lh; - int closing; + bool closing; struct completion *c; }; extern const struct inode_operations proc_link_inode_operations; @@ -211,6 +210,7 @@ extern const struct inode_operations proc_link_inode_operations; extern const struct inode_operations proc_pid_link_inode_operations; extern void proc_init_inodecache(void); +void set_proc_pid_nlink(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); extern int proc_fill_super(struct super_block *, void *data, int flags); extern void proc_entry_rundown(struct proc_dir_entry *); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 5c89a07e3d7f..0b80ad87b4d6 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -23,7 +23,7 @@ #include <linux/bootmem.h> #include <linux/init.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/io.h> #include <linux/list.h> #include <linux/ioport.h> diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index 05f8dcdb086e..f9387bb7631b 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -14,7 +14,7 @@ #include <linux/fs.h> #include <linux/syslog.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/io.h> extern wait_queue_head_t log_wait; diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 51b8b0a8ad91..766f0c637ad1 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -92,12 +92,11 @@ static int proc_ns_instantiate(struct inode *dir, struct inode *inode; struct proc_inode *ei; - inode = proc_pid_make_inode(dir->i_sb, task); + inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK | S_IRWXUGO); if (!inode) goto out; ei = PROC_I(inode); - inode->i_mode = S_IFLNK|S_IRWXUGO; inode->i_op = &proc_ns_link_inode_operations; ei->ns_ops = ns_ops; diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index f8595e8b5cd0..75634379f82e 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -25,7 +25,7 @@ #include <linux/seq_file.h> #include <linux/hugetlb.h> #include <linux/vmalloc.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/pgtable.h> 
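/*
 * Editor's note -- a condensed restatement (not new code) of the
 * close_pdeo() protocol visible in the fs/proc/inode.c hunks above.
 * Whichever task arrives first claims the teardown, drops the spinlock,
 * runs ->release(), and only then unlinks and frees the opener; any
 * concurrent closer parks on an on-stack completion instead of racing.
 * The types and fields are the real ones from fs/proc/internal.h; the
 * lock is held on entry and on exit, exactly as in the patched code.
 */
static void close_pdeo_sketch(struct proc_dir_entry *pde,
			      struct pde_opener *pdeo)
{
	if (pdeo->closing) {
		/* Somebody else owns the ->release(); just wait for it. */
		DECLARE_COMPLETION_ONSTACK(c);

		pdeo->c = &c;
		spin_unlock(&pde->pde_unload_lock);
		wait_for_completion(&c);
		spin_lock(&pde->pde_unload_lock);
	} else {
		struct file *file = pdeo->file;

		pdeo->closing = true;	/* claim the teardown */
		spin_unlock(&pde->pde_unload_lock);
		pde->proc_fops->release(file_inode(file), file);
		spin_lock(&pde->pde_unload_lock);
		list_del(&pdeo->lh);	/* strictly after ->release() */
		if (pdeo->c)		/* wake any parked closer */
			complete(pdeo->c);
		kfree(pdeo);
	}
}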
#include <asm/tlb.h> #include <asm/div64.h> diff --git a/fs/proc/page.c b/fs/proc/page.c index 3ecd445e830d..2726536489b1 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -13,7 +13,7 @@ #include <linux/mmu_notifier.h> #include <linux/page_idle.h> #include <linux/kernel-page-flags.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" #define KPMSIZE sizeof(u64) @@ -173,7 +173,8 @@ u64 stable_page_flags(struct page *page) u |= kpf_copy_bit(k, KPF_ACTIVE, PG_active); u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim); - u |= kpf_copy_bit(k, KPF_SWAPCACHE, PG_swapcache); + if (PageSwapCache(page)) + u |= 1 << KPF_SWAPCACHE; u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked); u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 7ae6b1da7cab..ffd72a6c6e04 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -8,7 +8,7 @@ * proc net directory handling functions */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/errno.h> #include <linux/time.h> diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 55313d994895..d4e37acd4821 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -709,7 +709,7 @@ static int proc_sys_readdir(struct file *file, struct dir_context *ctx) ctl_dir = container_of(head, struct ctl_dir, header); if (!dir_emit_dots(file, ctx)) - return 0; + goto out; pos = 2; @@ -719,6 +719,7 @@ static int proc_sys_readdir(struct file *file, struct dir_context *ctx) break; } } +out: sysctl_head_finish(head); return 0; } diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index 15f327bed8c6..901bd06f437d 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -4,7 +4,7 @@ * Copyright 1997, Theodore Ts'o */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/module.h> #include <linux/init.h> #include <linux/errno.h> diff --git a/fs/proc/root.c b/fs/proc/root.c index 8d3e484055a6..1988440b2049 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -6,7 +6,7 @@ * proc root directory handling functions */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/errno.h> #include <linux/time.h> @@ -122,6 +122,7 @@ void __init proc_root_init(void) int err; proc_init_inodecache(); + set_proc_pid_nlink(); err = register_filesystem(&proc_fs_type); if (err) return; diff --git a/fs/proc/self.c b/fs/proc/self.c index 40245954c450..39857f6db5cf 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -6,18 +6,6 @@ /* * /proc/self: */ -static int proc_self_readlink(struct dentry *dentry, char __user *buffer, - int buflen) -{ - struct pid_namespace *ns = dentry->d_sb->s_fs_info; - pid_t tgid = task_tgid_nr_ns(current, ns); - char tmp[PROC_NUMBUF]; - if (!tgid) - return -ENOENT; - sprintf(tmp, "%d", tgid); - return readlink_copy(buffer, buflen, tmp); -} - static const char *proc_self_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) @@ -38,7 +26,6 @@ static const char *proc_self_get_link(struct dentry *dentry, } static const struct inode_operations proc_self_inode_operations = { - .readlink = proc_self_readlink, .get_link = proc_self_get_link, }; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 35b92d81692f..8f96a49178d0 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -17,7 +17,7 @@ #include <linux/shmem_fs.h> #include <asm/elf.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/tlbflush.h> #include "internal.h" @@ -1588,6 +1588,7 @@ static int gather_pte_stats(pmd_t *pmd, 
unsigned long addr, } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(orig_pte, ptl); + cond_resched(); return 0; } #ifdef CONFIG_HUGETLB_PAGE diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index 595b90a9766c..20614b62a9b7 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -6,19 +6,6 @@ /* * /proc/thread_self: */ -static int proc_thread_self_readlink(struct dentry *dentry, char __user *buffer, - int buflen) -{ - struct pid_namespace *ns = dentry->d_sb->s_fs_info; - pid_t tgid = task_tgid_nr_ns(current, ns); - pid_t pid = task_pid_nr_ns(current, ns); - char tmp[PROC_NUMBUF + 6 + PROC_NUMBUF]; - if (!pid) - return -ENOENT; - sprintf(tmp, "%d/task/%d", tgid, pid); - return readlink_copy(buffer, buflen, tmp); -} - static const char *proc_thread_self_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) @@ -40,7 +27,6 @@ static const char *proc_thread_self_get_link(struct dentry *dentry, } static const struct inode_operations proc_thread_self_inode_operations = { - .readlink = proc_thread_self_readlink, .get_link = proc_thread_self_get_link, }; diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 8ab782d8b33d..5105b1599981 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -22,7 +22,7 @@ #include <linux/list.h> #include <linux/vmalloc.h> #include <linux/pagemap.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/io.h> #include "internal.h" diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index be40813eff52..b42e5bd6d8ff 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -86,4 +86,4 @@ config PSTORE_RAM Note that for historical reasons, the module will be named "ramoops.ko". - For more information, see Documentation/ramoops.txt. + For more information, see Documentation/admin-guide/ramoops.rst. diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index d4887705bb61..899d0ba0bd6c 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -27,6 +27,9 @@ #include <asm/barrier.h> #include "internal.h" +/* This doesn't need to be atomic: speed is chosen over correctness here. */ +static u64 pstore_ftrace_stamp; + static void notrace pstore_ftrace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, @@ -42,6 +45,7 @@ static void notrace pstore_ftrace_call(unsigned long ip, rec.ip = ip; rec.parent_ip = parent_ip; + pstore_ftrace_write_timestamp(&rec, pstore_ftrace_stamp++); pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id()); psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec, 0, sizeof(rec), psinfo); @@ -71,10 +75,13 @@ static ssize_t pstore_ftrace_knob_write(struct file *f, const char __user *buf, if (!on ^ pstore_ftrace_enabled) goto out; - if (on) + if (on) { + ftrace_ops_set_global_filter(&pstore_ftrace_ops); ret = register_ftrace_function(&pstore_ftrace_ops); - else + } else { ret = unregister_ftrace_function(&pstore_ftrace_ops); + } + if (ret) { pr_err("%s: unable to %sregister ftrace ops: %zd\n", __func__, on ? 
"" : "un", ret); diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 1781dc50762e..57c0646479f5 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -107,9 +107,11 @@ static int pstore_ftrace_seq_show(struct seq_file *s, void *v) struct pstore_ftrace_seq_data *data = v; struct pstore_ftrace_record *rec = (void *)(ps->data + data->off); - seq_printf(s, "%d %08lx %08lx %pf <- %pF\n", - pstore_ftrace_decode_cpu(rec), rec->ip, rec->parent_ip, - (void *)rec->ip, (void *)rec->parent_ip); + seq_printf(s, "CPU:%d ts:%llu %08lx %08lx %pf <- %pF\n", + pstore_ftrace_decode_cpu(rec), + pstore_ftrace_read_timestamp(rec), + rec->ip, rec->parent_ip, (void *)rec->ip, + (void *)rec->parent_ip); return 0; } @@ -197,11 +199,14 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) if (err) return err; - if (p->psi->erase) + if (p->psi->erase) { + mutex_lock(&p->psi->read_mutex); p->psi->erase(p->type, p->id, p->count, d_inode(dentry)->i_ctime, p->psi); - else + mutex_unlock(&p->psi->read_mutex); + } else { return -EPERM; + } return simple_unlink(dir, dentry); } diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index e38a22b31282..da416e6591c9 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -5,40 +5,6 @@ #include <linux/time.h> #include <linux/pstore.h> -#if NR_CPUS <= 2 && defined(CONFIG_ARM_THUMB) -#define PSTORE_CPU_IN_IP 0x1 -#elif NR_CPUS <= 4 && defined(CONFIG_ARM) -#define PSTORE_CPU_IN_IP 0x3 -#endif - -struct pstore_ftrace_record { - unsigned long ip; - unsigned long parent_ip; -#ifndef PSTORE_CPU_IN_IP - unsigned int cpu; -#endif -}; - -static inline void -pstore_ftrace_encode_cpu(struct pstore_ftrace_record *rec, unsigned int cpu) -{ -#ifndef PSTORE_CPU_IN_IP - rec->cpu = cpu; -#else - rec->ip |= cpu; -#endif -} - -static inline unsigned int -pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec) -{ -#ifndef PSTORE_CPU_IN_IP - return rec->cpu; -#else - return rec->ip & PSTORE_CPU_IN_IP; -#endif -} - #ifdef CONFIG_PSTORE_FTRACE extern void pstore_register_ftrace(void); extern void pstore_unregister_ftrace(void); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 14984d902a99..729677e18e36 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -493,6 +493,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, if (!is_locked) { pr_err("pstore dump routine blocked in %s path, may corrupt error record\n" , in_nmi() ? 
"NMI" : why); + return; } } else { spin_lock_irqsave(&psinfo->buf_lock, flags); @@ -584,8 +585,8 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) } else { spin_lock_irqsave(&psinfo->buf_lock, flags); } - memcpy(psinfo->buf, s, c); - psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, 0, c, psinfo); + psinfo->write_buf(PSTORE_TYPE_CONSOLE, 0, &id, 0, + s, 0, c, psinfo); spin_unlock_irqrestore(&psinfo->buf_lock, flags); s += c; c = e - s; diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 6ad831b9d1b8..1d887efaaf71 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -85,10 +85,10 @@ MODULE_PARM_DESC(ramoops_ecc, "bytes ECC)"); struct ramoops_context { - struct persistent_ram_zone **przs; - struct persistent_ram_zone *cprz; - struct persistent_ram_zone *fprz; - struct persistent_ram_zone *mprz; + struct persistent_ram_zone **dprzs; /* Oops dump zones */ + struct persistent_ram_zone *cprz; /* Console zone */ + struct persistent_ram_zone **fprzs; /* Ftrace zones */ + struct persistent_ram_zone *mprz; /* PMSG zone */ phys_addr_t phys_addr; unsigned long size; unsigned int memtype; @@ -97,12 +97,14 @@ struct ramoops_context { size_t ftrace_size; size_t pmsg_size; int dump_oops; + u32 flags; struct persistent_ram_ecc_info ecc_info; unsigned int max_dump_cnt; unsigned int dump_write_cnt; /* _read_cnt need clear on ramoops_pstore_open */ unsigned int dump_read_cnt; unsigned int console_read_cnt; + unsigned int max_ftrace_cnt; unsigned int ftrace_read_cnt; unsigned int pmsg_read_cnt; struct pstore_info pstore; @@ -180,16 +182,69 @@ static bool prz_ok(struct persistent_ram_zone *prz) persistent_ram_ecc_string(prz, NULL, 0)); } +static ssize_t ftrace_log_combine(struct persistent_ram_zone *dest, + struct persistent_ram_zone *src) +{ + size_t dest_size, src_size, total, dest_off, src_off; + size_t dest_idx = 0, src_idx = 0, merged_idx = 0; + void *merged_buf; + struct pstore_ftrace_record *drec, *srec, *mrec; + size_t record_size = sizeof(struct pstore_ftrace_record); + + dest_off = dest->old_log_size % record_size; + dest_size = dest->old_log_size - dest_off; + + src_off = src->old_log_size % record_size; + src_size = src->old_log_size - src_off; + + total = dest_size + src_size; + merged_buf = kmalloc(total, GFP_KERNEL); + if (!merged_buf) + return -ENOMEM; + + drec = (struct pstore_ftrace_record *)(dest->old_log + dest_off); + srec = (struct pstore_ftrace_record *)(src->old_log + src_off); + mrec = (struct pstore_ftrace_record *)(merged_buf); + + while (dest_size > 0 && src_size > 0) { + if (pstore_ftrace_read_timestamp(&drec[dest_idx]) < + pstore_ftrace_read_timestamp(&srec[src_idx])) { + mrec[merged_idx++] = drec[dest_idx++]; + dest_size -= record_size; + } else { + mrec[merged_idx++] = srec[src_idx++]; + src_size -= record_size; + } + } + + while (dest_size > 0) { + mrec[merged_idx++] = drec[dest_idx++]; + dest_size -= record_size; + } + + while (src_size > 0) { + mrec[merged_idx++] = srec[src_idx++]; + src_size -= record_size; + } + + kfree(dest->old_log); + dest->old_log = merged_buf; + dest->old_log_size = total; + + return 0; +} + static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, int *count, struct timespec *time, char **buf, bool *compressed, ssize_t *ecc_notice_size, struct pstore_info *psi) { - ssize_t size; + ssize_t size = 0; struct ramoops_context *cxt = psi->data; struct persistent_ram_zone *prz = NULL; int header_length = 0; + bool free_prz = false; /* Ramoops headers provide time stamps for PSTORE_TYPE_DMESG, but * 
PSTORE_TYPE_CONSOLE and PSTORE_TYPE_FTRACE don't currently have @@ -201,7 +256,7 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, /* Find the next valid persistent_ram_zone for DMESG */ while (cxt->dump_read_cnt < cxt->max_dump_cnt && !prz) { - prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt, + prz = ramoops_get_next_prz(cxt->dprzs, &cxt->dump_read_cnt, cxt->max_dump_cnt, id, type, PSTORE_TYPE_DMESG, 1); if (!prz_ok(prz)) @@ -219,14 +274,56 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, if (!prz_ok(prz)) prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt, 1, id, type, PSTORE_TYPE_CONSOLE, 0); - if (!prz_ok(prz)) - prz = ramoops_get_next_prz(&cxt->fprz, &cxt->ftrace_read_cnt, - 1, id, type, PSTORE_TYPE_FTRACE, 0); + if (!prz_ok(prz)) prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt, 1, id, type, PSTORE_TYPE_PMSG, 0); - if (!prz_ok(prz)) - return 0; + + /* ftrace is last since it may want to dynamically allocate memory. */ + if (!prz_ok(prz) && cxt->fprzs) { + if (!(cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)) { + prz = ramoops_get_next_prz(cxt->fprzs, + &cxt->ftrace_read_cnt, 1, id, type, + PSTORE_TYPE_FTRACE, 0); + } else { + /* + * Build a new dummy record which combines all the + * per-cpu records including metadata and ecc info. + */ + struct persistent_ram_zone *tmp_prz, *prz_next; + + tmp_prz = kzalloc(sizeof(struct persistent_ram_zone), + GFP_KERNEL); + if (!tmp_prz) + return -ENOMEM; + free_prz = true; + + while (cxt->ftrace_read_cnt < cxt->max_ftrace_cnt) { + prz_next = ramoops_get_next_prz(cxt->fprzs, + &cxt->ftrace_read_cnt, + cxt->max_ftrace_cnt, id, + type, PSTORE_TYPE_FTRACE, 0); + + if (!prz_ok(prz_next)) + continue; + + tmp_prz->ecc_info = prz_next->ecc_info; + tmp_prz->corrected_bytes += + prz_next->corrected_bytes; + tmp_prz->bad_blocks += prz_next->bad_blocks; + size = ftrace_log_combine(tmp_prz, prz_next); + if (size) + goto out; + } + *id = 0; + prz = tmp_prz; + } + } + + if (!prz_ok(prz)) { + size = 0; + goto out; + } size = persistent_ram_old_size(prz) - header_length; @@ -234,12 +331,21 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, *ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0); *buf = kmalloc(size + *ecc_notice_size + 1, GFP_KERNEL); - if (*buf == NULL) - return -ENOMEM; + if (*buf == NULL) { + size = -ENOMEM; + goto out; + } memcpy(*buf, (char *)persistent_ram_old(prz) + header_length, size); + persistent_ram_ecc_string(prz, *buf + size, *ecc_notice_size + 1); +out: + if (free_prz) { + kfree(prz->old_log); + kfree(prz); + } + return size; } @@ -283,15 +389,23 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, persistent_ram_write(cxt->cprz, buf, size); return 0; } else if (type == PSTORE_TYPE_FTRACE) { - if (!cxt->fprz) + int zonenum; + + if (!cxt->fprzs) return -ENOMEM; - persistent_ram_write(cxt->fprz, buf, size); + /* + * Choose zone by if we're using per-cpu buffers. 
+ */ + if (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU) + zonenum = smp_processor_id(); + else + zonenum = 0; + + persistent_ram_write(cxt->fprzs[zonenum], buf, size); return 0; } else if (type == PSTORE_TYPE_PMSG) { - if (!cxt->mprz) - return -ENOMEM; - persistent_ram_write(cxt->mprz, buf, size); - return 0; + pr_warn_ratelimited("PMSG shouldn't call %s\n", __func__); + return -EINVAL; } if (type != PSTORE_TYPE_DMESG) @@ -316,10 +430,10 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, if (part != 1) return -ENOSPC; - if (!cxt->przs) + if (!cxt->dprzs) return -ENOSPC; - prz = cxt->przs[cxt->dump_write_cnt]; + prz = cxt->dprzs[cxt->dump_write_cnt]; hlen = ramoops_write_kmsg_hdr(prz, compressed); if (size + hlen > prz->buffer_size) @@ -359,13 +473,15 @@ static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count, case PSTORE_TYPE_DMESG: if (id >= cxt->max_dump_cnt) return -EINVAL; - prz = cxt->przs[id]; + prz = cxt->dprzs[id]; break; case PSTORE_TYPE_CONSOLE: prz = cxt->cprz; break; case PSTORE_TYPE_FTRACE: - prz = cxt->fprz; + if (id >= cxt->max_ftrace_cnt) + return -EINVAL; + prz = cxt->fprzs[id]; break; case PSTORE_TYPE_PMSG: prz = cxt->mprz; @@ -396,68 +512,113 @@ static void ramoops_free_przs(struct ramoops_context *cxt) { int i; - if (!cxt->przs) - return; + /* Free dump PRZs */ + if (cxt->dprzs) { + for (i = 0; i < cxt->max_dump_cnt; i++) + persistent_ram_free(cxt->dprzs[i]); - for (i = 0; i < cxt->max_dump_cnt; i++) - persistent_ram_free(cxt->przs[i]); + kfree(cxt->dprzs); + cxt->max_dump_cnt = 0; + } - kfree(cxt->przs); - cxt->max_dump_cnt = 0; + /* Free ftrace PRZs */ + if (cxt->fprzs) { + for (i = 0; i < cxt->max_ftrace_cnt; i++) + persistent_ram_free(cxt->fprzs[i]); + kfree(cxt->fprzs); + cxt->max_ftrace_cnt = 0; + } } -static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, - phys_addr_t *paddr, size_t dump_mem_sz) +static int ramoops_init_przs(const char *name, + struct device *dev, struct ramoops_context *cxt, + struct persistent_ram_zone ***przs, + phys_addr_t *paddr, size_t mem_sz, + ssize_t record_size, + unsigned int *cnt, u32 sig, u32 flags) { int err = -ENOMEM; int i; + size_t zone_sz; + struct persistent_ram_zone **prz_ar; - if (!cxt->record_size) + /* Allocate nothing for 0 mem_sz or 0 record_size. */ + if (mem_sz == 0 || record_size == 0) { + *cnt = 0; return 0; + } - if (*paddr + dump_mem_sz - cxt->phys_addr > cxt->size) { - dev_err(dev, "no room for dumps\n"); - return -ENOMEM; + /* + * If we have a negative record size, calculate it based on + * mem_sz / *cnt. If we have a positive record size, calculate + * cnt from mem_sz / record_size. 
+ */ + if (record_size < 0) { + if (*cnt == 0) + return 0; + record_size = mem_sz / *cnt; + if (record_size == 0) { + dev_err(dev, "%s record size == 0 (%zu / %u)\n", + name, mem_sz, *cnt); + goto fail; + } + } else { + *cnt = mem_sz / record_size; + if (*cnt == 0) { + dev_err(dev, "%s record count == 0 (%zu / %zu)\n", + name, mem_sz, record_size); + goto fail; + } } - cxt->max_dump_cnt = dump_mem_sz / cxt->record_size; - if (!cxt->max_dump_cnt) - return -ENOMEM; + if (*paddr + mem_sz - cxt->phys_addr > cxt->size) { + dev_err(dev, "no room for %s mem region (0x%zx@0x%llx) in (0x%lx@0x%llx)\n", + name, + mem_sz, (unsigned long long)*paddr, + cxt->size, (unsigned long long)cxt->phys_addr); + goto fail; + } - cxt->przs = kzalloc(sizeof(*cxt->przs) * cxt->max_dump_cnt, - GFP_KERNEL); - if (!cxt->przs) { - dev_err(dev, "failed to initialize a prz array for dumps\n"); - goto fail_mem; + zone_sz = mem_sz / *cnt; + if (!zone_sz) { + dev_err(dev, "%s zone size == 0\n", name); + goto fail; } - for (i = 0; i < cxt->max_dump_cnt; i++) { - cxt->przs[i] = persistent_ram_new(*paddr, cxt->record_size, 0, + prz_ar = kcalloc(*cnt, sizeof(**przs), GFP_KERNEL); + if (!prz_ar) + goto fail; + + for (i = 0; i < *cnt; i++) { + prz_ar[i] = persistent_ram_new(*paddr, zone_sz, sig, &cxt->ecc_info, - cxt->memtype); - if (IS_ERR(cxt->przs[i])) { - err = PTR_ERR(cxt->przs[i]); - dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n", - cxt->record_size, (unsigned long long)*paddr, err); + cxt->memtype, flags); + if (IS_ERR(prz_ar[i])) { + err = PTR_ERR(prz_ar[i]); + dev_err(dev, "failed to request %s mem region (0x%zx@0x%llx): %d\n", + name, record_size, + (unsigned long long)*paddr, err); while (i > 0) { i--; - persistent_ram_free(cxt->przs[i]); + persistent_ram_free(prz_ar[i]); } - goto fail_prz; + kfree(prz_ar); + goto fail; } - *paddr += cxt->record_size; + *paddr += zone_sz; } + *przs = prz_ar; return 0; -fail_prz: - kfree(cxt->przs); -fail_mem: - cxt->max_dump_cnt = 0; + +fail: + *cnt = 0; return err; } -static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt, +static int ramoops_init_prz(const char *name, + struct device *dev, struct ramoops_context *cxt, struct persistent_ram_zone **prz, phys_addr_t *paddr, size_t sz, u32 sig) { @@ -465,18 +626,19 @@ static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt, return 0; if (*paddr + sz - cxt->phys_addr > cxt->size) { - dev_err(dev, "no room for mem region (0x%zx@0x%llx) in (0x%lx@0x%llx)\n", - sz, (unsigned long long)*paddr, + dev_err(dev, "no room for %s mem region (0x%zx@0x%llx) in (0x%lx@0x%llx)\n", + name, sz, (unsigned long long)*paddr, cxt->size, (unsigned long long)cxt->phys_addr); return -ENOMEM; } - *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info, cxt->memtype); + *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info, + cxt->memtype, 0); if (IS_ERR(*prz)) { int err = PTR_ERR(*prz); - dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n", - sz, (unsigned long long)*paddr, err); + dev_err(dev, "failed to request %s mem region (0x%zx@0x%llx): %d\n", + name, sz, (unsigned long long)*paddr, err); return err; } @@ -543,6 +705,7 @@ static int ramoops_parse_dt(struct platform_device *pdev, parse_size("ftrace-size", pdata->ftrace_size); parse_size("pmsg-size", pdata->pmsg_size); parse_size("ecc-size", pdata->ecc_info.ecc_size); + parse_size("flags", pdata->flags); #undef parse_size @@ -561,6 +724,7 @@ static int ramoops_probe(struct platform_device *pdev) if (dev_of_node(dev) && !pdata) { 
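/*
 * Editor's note on the ramoops_init_przs() contract shown above (a
 * worked example; the byte counts are made up, not taken from the
 * patch). The sign of record_size picks which quantity is derived:
 *
 *   record_size > 0: *cnt = mem_sz / record_size
 *       e.g. 0x40000 bytes of dump space at 0x10000 per record
 *       yields 4 dump zones;
 *   record_size < 0: record_size = mem_sz / *cnt
 *       e.g. the ftrace caller passes -1 with *cnt = max_ftrace_cnt
 *       (nr_cpu_ids in per-cpu mode), so 0x20000 bytes across 4 CPUs
 *       yields one 0x8000-byte zone per CPU.
 *
 * A zero result in either direction is reported per-name via dev_err()
 * and the function bails out through the common "fail" label, which
 * also resets *cnt to 0.
 */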
pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) { + pr_err("cannot allocate platform data buffer\n"); err = -ENOMEM; goto fail_out; } @@ -570,11 +734,20 @@ static int ramoops_probe(struct platform_device *pdev) goto fail_out; } - /* Only a single ramoops area allowed at a time, so fail extra + /* + * Only a single ramoops area allowed at a time, so fail extra * probes. */ - if (cxt->max_dump_cnt) + if (cxt->max_dump_cnt) { + pr_err("already initialized\n"); goto fail_out; + } + + /* Make sure we didn't get bogus platform data pointer. */ + if (!pdata) { + pr_err("NULL platform data\n"); + goto fail_out; + } if (!pdata->mem_size || (!pdata->record_size && !pdata->console_size && !pdata->ftrace_size && !pdata->pmsg_size)) { @@ -600,27 +773,37 @@ static int ramoops_probe(struct platform_device *pdev) cxt->ftrace_size = pdata->ftrace_size; cxt->pmsg_size = pdata->pmsg_size; cxt->dump_oops = pdata->dump_oops; + cxt->flags = pdata->flags; cxt->ecc_info = pdata->ecc_info; paddr = cxt->phys_addr; dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size - cxt->pmsg_size; - err = ramoops_init_przs(dev, cxt, &paddr, dump_mem_sz); + err = ramoops_init_przs("dump", dev, cxt, &cxt->dprzs, &paddr, + dump_mem_sz, cxt->record_size, + &cxt->max_dump_cnt, 0, 0); if (err) goto fail_out; - err = ramoops_init_prz(dev, cxt, &cxt->cprz, &paddr, + err = ramoops_init_prz("console", dev, cxt, &cxt->cprz, &paddr, cxt->console_size, 0); if (err) goto fail_init_cprz; - err = ramoops_init_prz(dev, cxt, &cxt->fprz, &paddr, cxt->ftrace_size, - LINUX_VERSION_CODE); + cxt->max_ftrace_cnt = (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU) + ? nr_cpu_ids + : 1; + err = ramoops_init_przs("ftrace", dev, cxt, &cxt->fprzs, &paddr, + cxt->ftrace_size, -1, + &cxt->max_ftrace_cnt, LINUX_VERSION_CODE, + (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU) + ? PRZ_FLAG_NO_LOCK : 0); if (err) goto fail_init_fprz; - err = ramoops_init_prz(dev, cxt, &cxt->mprz, &paddr, cxt->pmsg_size, 0); + err = ramoops_init_prz("pmsg", dev, cxt, &cxt->mprz, &paddr, + cxt->pmsg_size, 0); if (err) goto fail_init_mprz; @@ -680,7 +863,6 @@ fail_clear: cxt->pstore.bufsize = 0; persistent_ram_free(cxt->mprz); fail_init_mprz: - persistent_ram_free(cxt->fprz); fail_init_fprz: persistent_ram_free(cxt->cprz); fail_init_cprz: @@ -699,7 +881,6 @@ static int ramoops_remove(struct platform_device *pdev) cxt->pstore.bufsize = 0; persistent_ram_free(cxt->mprz); - persistent_ram_free(cxt->fprz); persistent_ram_free(cxt->cprz); ramoops_free_przs(cxt); @@ -741,6 +922,8 @@ static void ramoops_register_dummy(void) dummy_data->ftrace_size = ramoops_ftrace_size; dummy_data->pmsg_size = ramoops_pmsg_size; dummy_data->dump_oops = dump_oops; + dummy_data->flags = RAMOOPS_FLAG_FTRACE_PER_CPU; + /* * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC * (using 1 byte for ECC isn't much of use anyway). 
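/*
 * Editor's note -- condensed sketch of the locking change in the
 * fs/pstore/ram_core.c diff that follows: the single file-scope
 * buffer_lock becomes a per-zone prz->buffer_lock, and zones created
 * with PRZ_FLAG_NO_LOCK (the per-cpu ftrace zones, each written only
 * from its own CPU) skip taking it. Shape of buffer_start_add() after
 * the patch, lightly abridged from the hunks below:
 */
static size_t buffer_start_add_sketch(struct persistent_ram_zone *prz,
				      size_t a)
{
	unsigned long flags = 0;
	int old, new;

	if (!(prz->flags & PRZ_FLAG_NO_LOCK))
		raw_spin_lock_irqsave(&prz->buffer_lock, flags);

	old = atomic_read(&prz->buffer->start);
	new = old + a;
	while (unlikely(new >= prz->buffer_size))
		new -= prz->buffer_size;	/* wrap the ring buffer */
	atomic_set(&prz->buffer->start, new);

	if (!(prz->flags & PRZ_FLAG_NO_LOCK))
		raw_spin_unlock_irqrestore(&prz->buffer_lock, flags);

	return old;	/* old start is where the new record begins */
}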
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 3975deec02f8..a857338b7dab 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -48,16 +48,15 @@ static inline size_t buffer_start(struct persistent_ram_zone *prz) return atomic_read(&prz->buffer->start); } -static DEFINE_RAW_SPINLOCK(buffer_lock); - /* increase and wrap the start pointer, returning the old value */ static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a) { int old; int new; - unsigned long flags; + unsigned long flags = 0; - raw_spin_lock_irqsave(&buffer_lock, flags); + if (!(prz->flags & PRZ_FLAG_NO_LOCK)) + raw_spin_lock_irqsave(&prz->buffer_lock, flags); old = atomic_read(&prz->buffer->start); new = old + a; @@ -65,7 +64,8 @@ static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a) new -= prz->buffer_size; atomic_set(&prz->buffer->start, new); - raw_spin_unlock_irqrestore(&buffer_lock, flags); + if (!(prz->flags & PRZ_FLAG_NO_LOCK)) + raw_spin_unlock_irqrestore(&prz->buffer_lock, flags); return old; } @@ -75,9 +75,10 @@ static void buffer_size_add(struct persistent_ram_zone *prz, size_t a) { size_t old; size_t new; - unsigned long flags; + unsigned long flags = 0; - raw_spin_lock_irqsave(&buffer_lock, flags); + if (!(prz->flags & PRZ_FLAG_NO_LOCK)) + raw_spin_lock_irqsave(&prz->buffer_lock, flags); old = atomic_read(&prz->buffer->size); if (old == prz->buffer_size) @@ -89,7 +90,8 @@ static void buffer_size_add(struct persistent_ram_zone *prz, size_t a) atomic_set(&prz->buffer->size, new); exit: - raw_spin_unlock_irqrestore(&buffer_lock, flags); + if (!(prz->flags & PRZ_FLAG_NO_LOCK)) + raw_spin_unlock_irqrestore(&prz->buffer_lock, flags); } static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz, @@ -465,7 +467,8 @@ static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size, } static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, - struct persistent_ram_ecc_info *ecc_info) + struct persistent_ram_ecc_info *ecc_info, + unsigned long flags) { int ret; @@ -493,6 +496,8 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, prz->buffer->sig = sig; persistent_ram_zap(prz); + prz->buffer_lock = __RAW_SPIN_LOCK_UNLOCKED(buffer_lock); + prz->flags = flags; return 0; } @@ -517,7 +522,7 @@ void persistent_ram_free(struct persistent_ram_zone *prz) struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, u32 sig, struct persistent_ram_ecc_info *ecc_info, - unsigned int memtype) + unsigned int memtype, u32 flags) { struct persistent_ram_zone *prz; int ret = -ENOMEM; @@ -532,7 +537,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, if (ret) goto err; - ret = persistent_ram_post_init(prz, sig, ecc_info); + ret = persistent_ram_post_init(prz, sig, ecc_info, flags); if (ret) goto err; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 1bfac28b7e7d..406fed92362a 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -119,8 +119,7 @@ * spinlock to internal buffers before writing. * * Lock ordering (including related VFS locks) is the following: - * dqonoff_mutex > i_mutex > journal_lock > dquot->dq_lock > dqio_mutex - * dqonoff_mutex > i_mutex comes from dquot_quota_sync, dquot_enable, etc. 
+ * s_umount > i_mutex > journal_lock > dquot->dq_lock > dqio_mutex */ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock); @@ -572,7 +571,8 @@ int dquot_scan_active(struct super_block *sb, struct dquot *dquot, *old_dquot = NULL; int ret = 0; - mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + WARN_ON_ONCE(!rwsem_is_locked(&sb->s_umount)); + spin_lock(&dq_list_lock); list_for_each_entry(dquot, &inuse_list, dq_inuse) { if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) @@ -603,7 +603,6 @@ int dquot_scan_active(struct super_block *sb, spin_unlock(&dq_list_lock); out: dqput(old_dquot); - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); return ret; } EXPORT_SYMBOL(dquot_scan_active); @@ -617,7 +616,8 @@ int dquot_writeback_dquots(struct super_block *sb, int type) int cnt; int err, ret = 0; - mutex_lock(&dqopt->dqonoff_mutex); + WARN_ON_ONCE(!rwsem_is_locked(&sb->s_umount)); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; @@ -653,7 +653,6 @@ int dquot_writeback_dquots(struct super_block *sb, int type) && info_dirty(&dqopt->info[cnt])) sb->dq_op->write_info(sb, cnt); dqstats_inc(DQST_SYNCS); - mutex_unlock(&dqopt->dqonoff_mutex); return ret; } @@ -683,7 +682,6 @@ int dquot_quota_sync(struct super_block *sb, int type) * Now when everything is written we can discard the pagecache so * that userspace sees the changes. */ - mutex_lock(&dqopt->dqonoff_mutex); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; @@ -693,7 +691,6 @@ int dquot_quota_sync(struct super_block *sb, int type) truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); inode_unlock(dqopt->files[cnt]); } - mutex_unlock(&dqopt->dqonoff_mutex); return 0; } @@ -935,7 +932,7 @@ static int dqinit_needed(struct inode *inode, int type) return 0; } -/* This routine is guarded by dqonoff_mutex mutex */ +/* This routine is guarded by s_umount semaphore */ static void add_dquot_ref(struct super_block *sb, int type) { struct inode *inode, *old_inode = NULL; @@ -2050,21 +2047,13 @@ int dquot_get_next_id(struct super_block *sb, struct kqid *qid) struct quota_info *dqopt = sb_dqopt(sb); int err; - mutex_lock(&dqopt->dqonoff_mutex); - if (!sb_has_quota_active(sb, qid->type)) { - err = -ESRCH; - goto out; - } - if (!dqopt->ops[qid->type]->get_next_id) { - err = -ENOSYS; - goto out; - } + if (!sb_has_quota_active(sb, qid->type)) + return -ESRCH; + if (!dqopt->ops[qid->type]->get_next_id) + return -ENOSYS; mutex_lock(&dqopt->dqio_mutex); err = dqopt->ops[qid->type]->get_next_id(sb, qid); mutex_unlock(&dqopt->dqio_mutex); -out: - mutex_unlock(&dqopt->dqonoff_mutex); - return err; } EXPORT_SYMBOL(dquot_get_next_id); @@ -2107,6 +2096,10 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) struct quota_info *dqopt = sb_dqopt(sb); struct inode *toputinode[MAXQUOTAS]; + /* s_umount should be held in exclusive mode */ + if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount))) + up_read(&sb->s_umount); + /* Cannot turn off usage accounting without turning off limits, or * suspend quotas and simultaneously turn quotas off. */ if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED)) @@ -2114,18 +2107,14 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) DQUOT_USAGE_ENABLED))) return -EINVAL; - /* We need to serialize quota_off() for device */ - mutex_lock(&dqopt->dqonoff_mutex); - /* * Skip everything if there's nothing to do. 
We have to do this because * sometimes we are called when fill_super() failed and calling * sync_fs() in such cases does no good. */ - if (!sb_any_quota_loaded(sb)) { - mutex_unlock(&dqopt->dqonoff_mutex); + if (!sb_any_quota_loaded(sb)) return 0; - } + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { toputinode[cnt] = NULL; if (type != -1 && cnt != type) @@ -2179,7 +2168,6 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) dqopt->info[cnt].dqi_bgrace = 0; dqopt->ops[cnt] = NULL; } - mutex_unlock(&dqopt->dqonoff_mutex); /* Skip syncing and setting flags if quota files are hidden */ if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) @@ -2196,20 +2184,14 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) * must also discard the blockdev buffers so that we see the * changes done by userspace on the next quotaon() */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (toputinode[cnt]) { - mutex_lock(&dqopt->dqonoff_mutex); - /* If quota was reenabled in the meantime, we have - * nothing to do */ - if (!sb_has_quota_loaded(sb, cnt)) { - inode_lock(toputinode[cnt]); - toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | + /* This can happen when suspending quotas on remount-ro... */ + if (toputinode[cnt] && !sb_has_quota_loaded(sb, cnt)) { + inode_lock(toputinode[cnt]); + toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | S_NOATIME | S_NOQUOTA); - truncate_inode_pages(&toputinode[cnt]->i_data, - 0); - inode_unlock(toputinode[cnt]); - mark_inode_dirty_sync(toputinode[cnt]); - } - mutex_unlock(&dqopt->dqonoff_mutex); + truncate_inode_pages(&toputinode[cnt]->i_data, 0); + inode_unlock(toputinode[cnt]); + mark_inode_dirty_sync(toputinode[cnt]); } if (sb->s_bdev) invalidate_bdev(sb->s_bdev); @@ -2281,6 +2263,10 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, error = -EINVAL; goto out_fmt; } + if (sb_has_quota_loaded(sb, type)) { + error = -EBUSY; + goto out_fmt; + } if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { /* As we bypass the pagecache we must now flush all the @@ -2292,11 +2278,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, sync_filesystem(sb); invalidate_bdev(sb->s_bdev); } - mutex_lock(&dqopt->dqonoff_mutex); - if (sb_has_quota_loaded(sb, type)) { - error = -EBUSY; - goto out_lock; - } if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { /* We don't want quota and atime on quota files (deadlocks @@ -2317,7 +2298,7 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, error = -EIO; dqopt->files[type] = igrab(inode); if (!dqopt->files[type]) - goto out_lock; + goto out_file_flags; error = -EINVAL; if (!fmt->qf_ops->check_quota_file(sb, type)) goto out_file_init; @@ -2340,14 +2321,13 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, spin_unlock(&dq_state_lock); add_dquot_ref(sb, type); - mutex_unlock(&dqopt->dqonoff_mutex); return 0; out_file_init: dqopt->files[type] = NULL; iput(inode); -out_lock: +out_file_flags: if (oldflags != -1) { inode_lock(inode); /* Set the flags back (in the case of accidental quotaon() @@ -2356,7 +2336,6 @@ out_lock: inode->i_flags |= oldflags; inode_unlock(inode); } - mutex_unlock(&dqopt->dqonoff_mutex); out_fmt: put_quota_format(fmt); @@ -2371,15 +2350,16 @@ int dquot_resume(struct super_block *sb, int type) int ret = 0, cnt; unsigned int flags; + /* s_umount should be held in exclusive mode */ + if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount))) + up_read(&sb->s_umount); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != 
type) continue; - - mutex_lock(&dqopt->dqonoff_mutex); - if (!sb_has_quota_suspended(sb, cnt)) { - mutex_unlock(&dqopt->dqonoff_mutex); + if (!sb_has_quota_suspended(sb, cnt)) continue; - } + inode = dqopt->files[cnt]; dqopt->files[cnt] = NULL; spin_lock(&dq_state_lock); @@ -2388,7 +2368,6 @@ int dquot_resume(struct super_block *sb, int type) cnt); dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, cnt); spin_unlock(&dq_state_lock); - mutex_unlock(&dqopt->dqonoff_mutex); flags = dquot_generic_flag(flags, cnt); ret = vfs_load_quota_inode(inode, cnt, @@ -2401,7 +2380,7 @@ int dquot_resume(struct super_block *sb, int type) EXPORT_SYMBOL(dquot_resume); int dquot_quota_on(struct super_block *sb, int type, int format_id, - struct path *path) + const struct path *path) { int error = security_quota_on(path->dentry); if (error) @@ -2424,42 +2403,30 @@ EXPORT_SYMBOL(dquot_quota_on); int dquot_enable(struct inode *inode, int type, int format_id, unsigned int flags) { - int ret = 0; struct super_block *sb = inode->i_sb; - struct quota_info *dqopt = sb_dqopt(sb); /* Just unsuspend quotas? */ BUG_ON(flags & DQUOT_SUSPENDED); + /* s_umount should be held in exclusive mode */ + if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount))) + up_read(&sb->s_umount); if (!flags) return 0; /* Just updating flags needed? */ if (sb_has_quota_loaded(sb, type)) { - mutex_lock(&dqopt->dqonoff_mutex); - /* Now do a reliable test... */ - if (!sb_has_quota_loaded(sb, type)) { - mutex_unlock(&dqopt->dqonoff_mutex); - goto load_quota; - } if (flags & DQUOT_USAGE_ENABLED && - sb_has_quota_usage_enabled(sb, type)) { - ret = -EBUSY; - goto out_lock; - } + sb_has_quota_usage_enabled(sb, type)) + return -EBUSY; if (flags & DQUOT_LIMITS_ENABLED && - sb_has_quota_limits_enabled(sb, type)) { - ret = -EBUSY; - goto out_lock; - } + sb_has_quota_limits_enabled(sb, type)) + return -EBUSY; spin_lock(&dq_state_lock); sb_dqopt(sb)->flags |= dquot_state_flag(flags, type); spin_unlock(&dq_state_lock); -out_lock: - mutex_unlock(&dqopt->dqonoff_mutex); - return ret; + return 0; } -load_quota: return vfs_load_quota_inode(inode, type, format_id, flags); } EXPORT_SYMBOL(dquot_enable); @@ -2751,7 +2718,6 @@ int dquot_get_state(struct super_block *sb, struct qc_state *state) struct quota_info *dqopt = sb_dqopt(sb); int type; - mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); memset(state, 0, sizeof(*state)); for (type = 0; type < MAXQUOTAS; type++) { if (!sb_has_quota_active(sb, type)) @@ -2773,7 +2739,6 @@ int dquot_get_state(struct super_block *sb, struct qc_state *state) tstate->nextents = 1; /* We don't know... 
*/ spin_unlock(&dq_data_lock); } - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); return 0; } EXPORT_SYMBOL(dquot_get_state); @@ -2787,18 +2752,13 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii) if ((ii->i_fieldmask & QC_WARNS_MASK) || (ii->i_fieldmask & QC_RT_SPC_TIMER)) return -EINVAL; - mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); - if (!sb_has_quota_active(sb, type)) { - err = -ESRCH; - goto out; - } + if (!sb_has_quota_active(sb, type)) + return -ESRCH; mi = sb_dqopt(sb)->info + type; if (ii->i_fieldmask & QC_FLAGS) { if ((ii->i_flags & QCI_ROOT_SQUASH && - mi->dqi_format->qf_fmt_id != QFMT_VFS_OLD)) { - err = -EINVAL; - goto out; - } + mi->dqi_format->qf_fmt_id != QFMT_VFS_OLD)) + return -EINVAL; } spin_lock(&dq_data_lock); if (ii->i_fieldmask & QC_SPC_TIMER) @@ -2815,8 +2775,6 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii) mark_info_dirty(sb, type); /* Force write to disk */ sb->dq_op->write_info(sb, type); -out: - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); return err; } EXPORT_SYMBOL(dquot_set_dqinfo); diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c index 8b252673d454..e99b1a72d9a7 100644 --- a/fs/quota/netlink.c +++ b/fs/quota/netlink.c @@ -12,14 +12,8 @@ static const struct genl_multicast_group quota_mcgrps[] = { }; /* Netlink family structure for quota */ -static struct genl_family quota_genl_family = { - /* - * Needed due to multicast group ID abuse - old code assumed - * the family ID was also a valid multicast group ID (which - * isn't true) and userspace might thus rely on it. Assign a - * static ID for this group to make dealing with that easier. - */ - .id = GENL_ID_VFS_DQUOT, +static struct genl_family quota_genl_family __ro_after_init = { + .module = THIS_MODULE, .hdrsize = 0, .name = "VFS_DQUOT", .version = 1, diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 2d445425aad7..07e08c7d05ca 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -80,7 +80,7 @@ unsigned int qtype_enforce_flag(int type) } static int quota_quotaon(struct super_block *sb, int type, qid_t id, - struct path *path) + const struct path *path) { if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_enable) return -ENOSYS; @@ -104,13 +104,9 @@ static int quota_getfmt(struct super_block *sb, int type, void __user *addr) { __u32 fmt; - mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); - if (!sb_has_quota_active(sb, type)) { - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + if (!sb_has_quota_active(sb, type)) return -ESRCH; - } fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id; - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); if (copy_to_user(addr, &fmt, sizeof(fmt))) return -EFAULT; return 0; @@ -700,7 +696,7 @@ static int quota_rmxquota(struct super_block *sb, void __user *addr) /* Copy parameters and call proper function */ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, - void __user *addr, struct path *path) + void __user *addr, const struct path *path) { int ret; @@ -789,9 +785,14 @@ static int quotactl_cmd_write(int cmd) } return 1; } - #endif /* CONFIG_BLOCK */ +/* Return true if quotactl command is manipulating quota on/off state */ +static bool quotactl_cmd_onoff(int cmd) +{ + return (cmd == Q_QUOTAON) || (cmd == Q_QUOTAOFF); +} + /* * look up a superblock on which quota ops will be performed * - use the name of a block device to find the superblock thereon @@ -809,7 +810,9 @@ static struct super_block *quotactl_block(const char __user *special, int cmd) putname(tmp); if (IS_ERR(bdev)) return ERR_CAST(bdev); - if 
(quotactl_cmd_write(cmd)) + if (quotactl_cmd_onoff(cmd)) + sb = get_super_exclusive_thawed(bdev); + else if (quotactl_cmd_write(cmd)) sb = get_super_thawed(bdev); else sb = get_super(bdev); @@ -872,7 +875,10 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, ret = do_quotactl(sb, type, cmds, id, addr, pathp); - drop_super(sb); + if (!quotactl_cmd_onoff(cmds)) + drop_super(sb); + else + drop_super_exclusive(sb); out: if (pathp && !IS_ERR(pathp)) path_put(pathp); diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 2bcbf4e77982..2ef7ce75c062 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -23,7 +23,7 @@ #include <linux/sched.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" static int ramfs_nommu_setattr(struct dentry *, struct iattr *); diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 8621c039b536..26e45863e499 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -35,7 +35,7 @@ #include <linux/parser.h> #include <linux/magic.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" #define RAMFS_DEFAULT_MODE 0755 diff --git a/fs/read_write.c b/fs/read_write.c index 190e0d362581..5816d4c4cab0 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -20,7 +20,7 @@ #include <linux/fs.h> #include "internal.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/unistd.h> typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *); @@ -1538,28 +1538,43 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, if (len == 0) return 0; - ret = mnt_want_write_file(file_out); - if (ret) - return ret; + sb_start_write(inode_out->i_sb); - ret = -EOPNOTSUPP; - if (file_out->f_op->copy_file_range) + /* + * Try cloning first, this is supported by more file systems, and + * more efficient if both clone and copy are supported (e.g. NFS). + */ + if (file_in->f_op->clone_file_range) { + ret = file_in->f_op->clone_file_range(file_in, pos_in, + file_out, pos_out, len); + if (ret == 0) { + ret = len; + goto done; + } + } + + if (file_out->f_op->copy_file_range) { ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out, len, flags); - if (ret == -EOPNOTSUPP) - ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, - len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); + if (ret != -EOPNOTSUPP) + goto done; + } + + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); +done: if (ret > 0) { fsnotify_access(file_in); add_rchar(current, ret); fsnotify_modify(file_out); add_wchar(current, ret); } + inc_syscr(current); inc_syscw(current); - mnt_drop_write_file(file_out); + sb_end_write(inode_out->i_sb); return ret; } @@ -1650,6 +1665,115 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) return security_file_permission(file, write ? MAY_WRITE : MAY_READ); } +/* + * Check that the two inodes are eligible for cloning, the ranges make + * sense, and then flush all dirty data. Caller must ensure that the + * inodes have been locked against any other modifications. + * + * Returns: 0 for "nothing to clone", 1 for "something to clone", or + * the usual negative error code. 
+ */ +int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, + struct inode *inode_out, loff_t pos_out, + u64 *len, bool is_dedupe) +{ + loff_t bs = inode_out->i_sb->s_blocksize; + loff_t blen; + loff_t isize; + bool same_inode = (inode_in == inode_out); + int ret; + + /* Don't touch certain kinds of inodes */ + if (IS_IMMUTABLE(inode_out)) + return -EPERM; + + if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) + return -ETXTBSY; + + /* Don't reflink dirs, pipes, sockets... */ + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) + return -EISDIR; + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) + return -EINVAL; + + /* Are we going all the way to the end? */ + isize = i_size_read(inode_in); + if (isize == 0) + return 0; + + /* Zero length dedupe exits immediately; reflink goes to EOF. */ + if (*len == 0) { + if (is_dedupe || pos_in == isize) + return 0; + if (pos_in > isize) + return -EINVAL; + *len = isize - pos_in; + } + + /* Ensure offsets don't wrap and the input is inside i_size */ + if (pos_in + *len < pos_in || pos_out + *len < pos_out || + pos_in + *len > isize) + return -EINVAL; + + /* Don't allow dedupe past EOF in the dest file */ + if (is_dedupe) { + loff_t disize; + + disize = i_size_read(inode_out); + if (pos_out >= disize || pos_out + *len > disize) + return -EINVAL; + } + + /* If we're linking to EOF, continue to the block boundary. */ + if (pos_in + *len == isize) + blen = ALIGN(isize, bs) - pos_in; + else + blen = *len; + + /* Only reflink if we're aligned to block boundaries */ + if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || + !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) + return -EINVAL; + + /* Don't allow overlapped reflink within the same file */ + if (same_inode) { + if (pos_out + blen > pos_in && pos_out < pos_in + blen) + return -EINVAL; + } + + /* Wait for the completion of any pending IOs on both files */ + inode_dio_wait(inode_in); + if (!same_inode) + inode_dio_wait(inode_out); + + ret = filemap_write_and_wait_range(inode_in->i_mapping, + pos_in, pos_in + *len - 1); + if (ret) + return ret; + + ret = filemap_write_and_wait_range(inode_out->i_mapping, + pos_out, pos_out + *len - 1); + if (ret) + return ret; + + /* + * Check that the extents are the same. + */ + if (is_dedupe) { + bool is_same = false; + + ret = vfs_dedupe_file_range_compare(inode_in, pos_in, + inode_out, pos_out, *len, &is_same); + if (ret) + return ret; + if (!is_same) + return -EBADE; + } + + return 1; +} +EXPORT_SYMBOL(vfs_clone_file_prep_inodes); + int vfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len) { @@ -1657,15 +1781,19 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in, struct inode *inode_out = file_inode(file_out); int ret; - if (inode_in->i_sb != inode_out->i_sb || - file_in->f_path.mnt != file_out->f_path.mnt) - return -EXDEV; - if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) return -EISDIR; if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) return -EINVAL; + /* + * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on + * the same mount. Practically, they only need to be on the same file + * system. 
+ */ + if (inode_in->i_sb != inode_out->i_sb) + return -EXDEV; + if (!(file_in->f_mode & FMODE_READ) || !(file_out->f_mode & FMODE_WRITE) || (file_out->f_flags & O_APPEND)) @@ -1685,10 +1813,6 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in, if (pos_in + len > i_size_read(inode_in)) return -EINVAL; - ret = mnt_want_write_file(file_out); - if (ret) - return ret; - ret = file_in->f_op->clone_file_range(file_in, pos_in, file_out, pos_out, len); if (!ret) { @@ -1696,11 +1820,106 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in, fsnotify_modify(file_out); } - mnt_drop_write_file(file_out); return ret; } EXPORT_SYMBOL(vfs_clone_file_range); +/* + * Read a page's worth of file data into the page cache. Return the page + * locked. + */ +static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) +{ + struct address_space *mapping; + struct page *page; + pgoff_t n; + + n = offset >> PAGE_SHIFT; + mapping = inode->i_mapping; + page = read_mapping_page(mapping, n, NULL); + if (IS_ERR(page)) + return page; + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + lock_page(page); + return page; +} + +/* + * Compare extents of two files to see if they are the same. + * Caller must have locked both inodes to prevent write races. + */ +int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, + struct inode *dest, loff_t destoff, + loff_t len, bool *is_same) +{ + loff_t src_poff; + loff_t dest_poff; + void *src_addr; + void *dest_addr; + struct page *src_page; + struct page *dest_page; + loff_t cmp_len; + bool same; + int error; + + error = -EINVAL; + same = true; + while (len) { + src_poff = srcoff & (PAGE_SIZE - 1); + dest_poff = destoff & (PAGE_SIZE - 1); + cmp_len = min(PAGE_SIZE - src_poff, + PAGE_SIZE - dest_poff); + cmp_len = min(cmp_len, len); + if (cmp_len <= 0) + goto out_error; + + src_page = vfs_dedupe_get_page(src, srcoff); + if (IS_ERR(src_page)) { + error = PTR_ERR(src_page); + goto out_error; + } + dest_page = vfs_dedupe_get_page(dest, destoff); + if (IS_ERR(dest_page)) { + error = PTR_ERR(dest_page); + unlock_page(src_page); + put_page(src_page); + goto out_error; + } + src_addr = kmap_atomic(src_page); + dest_addr = kmap_atomic(dest_page); + + flush_dcache_page(src_page); + flush_dcache_page(dest_page); + + if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) + same = false; + + kunmap_atomic(dest_addr); + kunmap_atomic(src_addr); + unlock_page(dest_page); + unlock_page(src_page); + put_page(dest_page); + put_page(src_page); + + if (!same) + break; + + srcoff += cmp_len; + destoff += cmp_len; + len -= cmp_len; + } + + *is_same = same; + return 0; + +out_error: + return error; +} +EXPORT_SYMBOL(vfs_dedupe_file_range_compare); + int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) { struct file_dedupe_range_info *info; @@ -1737,6 +1956,9 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) goto out; ret = 0; + if (off + len > i_size_read(src)) + return -EINVAL; + /* pre-format output fields to sane values */ for (i = 0; i < count; i++) { same->info[i].bytes_deduped = 0ULL; diff --git a/fs/readdir.c b/fs/readdir.c index 9d0212c374d6..0e8a7f355f7a 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -19,7 +19,7 @@ #include <linux/syscalls.h> #include <linux/unistd.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> int iterate_dir(struct file *file, struct dir_context *ctx) { diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 
58b2dedb2a3a..cfeae9b0a2b7 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -19,6 +19,7 @@ #include <linux/quotaops.h> #include <linux/swap.h> #include <linux/uio.h> +#include <linux/bio.h> int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index bc2dde2423c2..aa40c242f1db 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1111,7 +1111,8 @@ static int flush_commit_list(struct super_block *s, mark_buffer_dirty(jl->j_commit_bh) ; depth = reiserfs_write_unlock_nested(s); if (reiserfs_barrier_flush(s)) - __sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA); + __sync_dirty_buffer(jl->j_commit_bh, + REQ_PREFLUSH | REQ_FUA); else sync_dirty_buffer(jl->j_commit_bh); reiserfs_write_lock_nested(s, depth); @@ -1269,7 +1270,8 @@ static int _update_journal_header_block(struct super_block *sb, depth = reiserfs_write_unlock_nested(sb); if (reiserfs_barrier_flush(sb)) - __sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA); + __sync_dirty_buffer(journal->j_header_bh, + REQ_PREFLUSH | REQ_FUA); else sync_dirty_buffer(journal->j_header_bh); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index e6a2b406af36..bd39a998843d 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -1665,7 +1665,6 @@ const struct inode_operations reiserfs_dir_inode_operations = { * stuff added */ const struct inode_operations reiserfs_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = page_get_link, .setattr = reiserfs_setattr, .listxattr = reiserfs_listxattr, diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index a97e352d05d3..0037aea97d39 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -11,6 +11,7 @@ #include <linux/time.h> #include <linux/string.h> #include <linux/pagemap.h> +#include <linux/bio.h> #include "reiserfs.h" #include <linux/buffer_head.h> #include <linux/quotaops.h> diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 0a6ad4e71e88..e314cb30a181 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -802,7 +802,7 @@ static int reiserfs_acquire_dquot(struct dquot *); static int reiserfs_release_dquot(struct dquot *); static int reiserfs_mark_dquot_dirty(struct dquot *); static int reiserfs_write_info(struct super_block *, int); -static int reiserfs_quota_on(struct super_block *, int, int, struct path *); +static int reiserfs_quota_on(struct super_block *, int, int, const struct path *); static const struct dquot_operations reiserfs_quota_operations = { .write_dquot = reiserfs_write_dquot, @@ -2348,7 +2348,7 @@ static int reiserfs_quota_on_mount(struct super_block *sb, int type) * Standard function to be called on quota_on */ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, - struct path *path) + const struct path *path) { int err; struct inode *inode; diff --git a/fs/romfs/super.c b/fs/romfs/super.c index d0f8a38dfafa..0186fe6d39f3 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -74,6 +74,7 @@ #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/uaccess.h> +#include <linux/major.h> #include "internal.h" static struct kmem_cache *romfs_inode_cachep; @@ -416,7 +417,22 @@ static void romfs_destroy_inode(struct inode *inode) static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; - u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + u64 id = 0; + + /* When calling huge_encode_dev(), + * use sb->s_bdev->bd_dev when, + * - 
CONFIG_ROMFS_ON_BLOCK defined + * use sb->s_dev when, + * - CONFIG_ROMFS_ON_BLOCK undefined and + * - CONFIG_ROMFS_ON_MTD defined + * leave id as 0 when, + * - CONFIG_ROMFS_ON_BLOCK undefined and + * - CONFIG_ROMFS_ON_MTD undefined + */ + if (sb->s_bdev) + id = huge_encode_dev(sb->s_bdev->bd_dev); + else if (sb->s_dev) + id = huge_encode_dev(sb->s_dev); buf->f_type = ROMFS_MAGIC; buf->f_namelen = ROMFS_MAXFN; @@ -489,6 +505,11 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags |= MS_RDONLY | MS_NOATIME; sb->s_op = &romfs_super_ops; +#ifdef CONFIG_ROMFS_ON_MTD + /* Use same dev ID from the underlying mtdblock device */ + if (sb->s_mtd) + sb->s_dev = MKDEV(MTD_BLOCK_MAJOR, sb->s_mtd->index); +#endif /* read the image superblock and check it */ rsb = kmalloc(512, GFP_KERNEL); if (!rsb) diff --git a/fs/select.c b/fs/select.c index 3d4f85defeab..305c0daf5d67 100644 --- a/fs/select.c +++ b/fs/select.c @@ -31,7 +31,7 @@ #include <net/busy_poll.h> #include <linux/vmalloc.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* diff --git a/fs/seq_file.c b/fs/seq_file.c index 368bfb92b115..ca69fb99e41a 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -15,7 +15,7 @@ #include <linux/printk.h> #include <linux/string_helpers.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/page.h> static void seq_set_overflow(struct seq_file *m) @@ -190,6 +190,13 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) */ m->version = file->f_version; + /* + * if request is to read from zero offset, reset iterator to first + * record as it might have been already advanced by previous requests + */ + if (*ppos == 0) + m->index = 0; + /* Don't assume *ppos is where we left it */ if (unlikely(*ppos != m->read_pos)) { while ((err = traverse(m, *ppos)) == -EAGAIN) diff --git a/fs/splice.c b/fs/splice.c index 5a7750bd2eea..4ef78aa8ef61 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -17,6 +17,7 @@ * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu> * */ +#include <linux/bvec.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/pagemap.h> @@ -203,6 +204,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, buf->len = spd->partial[page_nr].len; buf->private = spd->partial[page_nr].private; buf->ops = spd->ops; + buf->flags = 0; pipe->nrbufs++; page_nr++; @@ -1086,7 +1088,13 @@ EXPORT_SYMBOL(do_splice_direct); static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) { - while (pipe->nrbufs == pipe->buffers) { + for (;;) { + if (unlikely(!pipe->readers)) { + send_sig(SIGPIPE, current, 0); + return -EPIPE; + } + if (pipe->nrbufs != pipe->buffers) + return 0; if (flags & SPLICE_F_NONBLOCK) return -EAGAIN; if (signal_pending(current)) @@ -1095,7 +1103,6 @@ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) pipe_wait(pipe); pipe->waiting_writers--; } - return 0; } static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index ce62a380314f..2751476e6b6e 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -31,6 +31,7 @@ #include <linux/slab.h> #include <linux/string.h> #include <linux/buffer_head.h> +#include <linux/bio.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c index 79b9c31a0c8f..befeba0fa70a 100644 --- a/fs/squashfs/symlink.c +++ b/fs/squashfs/symlink.c @@ -118,7 +118,6 @@ const struct address_space_operations squashfs_symlink_aops = { }; const struct 
inode_operations squashfs_symlink_inode_ops = { - .readlink = generic_readlink, .get_link = page_get_link, .listxattr = squashfs_listxattr }; diff --git a/fs/stat.c b/fs/stat.c index bc045c7994e1..a268b7f27adf 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -15,7 +15,7 @@ #include <linux/syscalls.h> #include <linux/pagemap.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/unistd.h> void generic_fillattr(struct inode *inode, struct kstat *stat) @@ -329,12 +329,14 @@ retry: struct inode *inode = d_backing_inode(path.dentry); error = empty ? -ENOENT : -EINVAL; - if (inode->i_op->readlink) { + /* + * AFS mountpoints allow readlink(2) but are not symlinks + */ + if (d_is_symlink(path.dentry) || inode->i_op->readlink) { error = security_inode_readlink(path.dentry); if (!error) { touch_atime(&path); - error = inode->i_op->readlink(path.dentry, - buf, bufsiz); + error = vfs_readlink(path.dentry, buf, bufsiz); } } path_put(&path); diff --git a/fs/statfs.c b/fs/statfs.c index 083dc0ac9140..13ae259d4879 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -63,7 +63,7 @@ static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf) return retval; } -int vfs_statfs(struct path *path, struct kstatfs *buf) +int vfs_statfs(const struct path *path, struct kstatfs *buf) { int error; diff --git a/fs/super.c b/fs/super.c index c183835566c1..1709ed029a2c 100644 --- a/fs/super.c +++ b/fs/super.c @@ -244,7 +244,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, mutex_init(&s->s_vfs_rename_mutex); lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); mutex_init(&s->s_dquot.dqio_mutex); - mutex_init(&s->s_dquot.dqonoff_mutex); s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; @@ -558,6 +557,13 @@ void drop_super(struct super_block *sb) EXPORT_SYMBOL(drop_super); +void drop_super_exclusive(struct super_block *sb) +{ + up_write(&sb->s_umount); + put_super(sb); +} +EXPORT_SYMBOL(drop_super_exclusive); + /** * iterate_supers - call function for all active superblocks * @f: function to call @@ -628,15 +634,7 @@ void iterate_supers_type(struct file_system_type *type, EXPORT_SYMBOL(iterate_supers_type); -/** - * get_super - get the superblock of a device - * @bdev: device to get the superblock for - * - * Scans the superblock list and finds the superblock of the file system - * mounted on the device given. %NULL is returned if no match is found. - */ - -struct super_block *get_super(struct block_device *bdev) +static struct super_block *__get_super(struct block_device *bdev, bool excl) { struct super_block *sb; @@ -651,11 +649,17 @@ rescan: if (sb->s_bdev == bdev) { sb->s_count++; spin_unlock(&sb_lock); - down_read(&sb->s_umount); + if (!excl) + down_read(&sb->s_umount); + else + down_write(&sb->s_umount); /* still alive? */ if (sb->s_root && (sb->s_flags & MS_BORN)) return sb; - up_read(&sb->s_umount); + if (!excl) + up_read(&sb->s_umount); + else + up_write(&sb->s_umount); /* nope, got unmounted */ spin_lock(&sb_lock); __put_super(sb); @@ -666,32 +670,67 @@ rescan: return NULL; } -EXPORT_SYMBOL(get_super); - /** - * get_super_thawed - get thawed superblock of a device + * get_super - get the superblock of a device * @bdev: device to get the superblock for * * Scans the superblock list and finds the superblock of the file system - * mounted on the device. The superblock is returned once it is thawed - * (or immediately if it was not frozen). %NULL is returned if no match - * is found. + * mounted on the device given. 
%NULL is returned if no match is found. */ -struct super_block *get_super_thawed(struct block_device *bdev) +struct super_block *get_super(struct block_device *bdev) +{ + return __get_super(bdev, false); +} +EXPORT_SYMBOL(get_super); + +static struct super_block *__get_super_thawed(struct block_device *bdev, + bool excl) { while (1) { - struct super_block *s = get_super(bdev); + struct super_block *s = __get_super(bdev, excl); if (!s || s->s_writers.frozen == SB_UNFROZEN) return s; - up_read(&s->s_umount); + if (!excl) + up_read(&s->s_umount); + else + up_write(&s->s_umount); wait_event(s->s_writers.wait_unfrozen, s->s_writers.frozen == SB_UNFROZEN); put_super(s); } } + +/** + * get_super_thawed - get thawed superblock of a device + * @bdev: device to get the superblock for + * + * Scans the superblock list and finds the superblock of the file system + * mounted on the device. The superblock is returned once it is thawed + * (or immediately if it was not frozen). %NULL is returned if no match + * is found. + */ +struct super_block *get_super_thawed(struct block_device *bdev) +{ + return __get_super_thawed(bdev, false); +} EXPORT_SYMBOL(get_super_thawed); /** + * get_super_exclusive_thawed - get thawed superblock of a device + * @bdev: device to get the superblock for + * + * Scans the superblock list and finds the superblock of the file system + * mounted on the device. The superblock is returned once it is thawed + * (or immediately if it was not frozen) and s_umount semaphore is held + * in exclusive mode. %NULL is returned if no match is found. + */ +struct super_block *get_super_exclusive_thawed(struct block_device *bdev) +{ + return __get_super_thawed(bdev, true); +} +EXPORT_SYMBOL(get_super_exclusive_thawed); + +/** * get_active_super - get an active reference to the superblock of a device * @bdev: device to get the superblock for * diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index d62c423a5a2d..858fb72f9e0f 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -145,7 +145,6 @@ static inline void write3byte(struct sysv_sb_info *sbi, } static const struct inode_operations sysv_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = page_get_link, .getattr = sysv_getattr, }; diff --git a/fs/timerfd.c b/fs/timerfd.c index 9ae4abb4110b..c173cc196175 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -55,7 +55,7 @@ static inline bool isalarm(struct timerfd_ctx *ctx) /* * This gets called when the timer event triggers. We set the "expired" * flag, but we do not re-arm the timer (in case it's necessary, - * tintv.tv64 != 0) until the timer is accessed. + * tintv != 0) until the timer is accessed. 
*/ static void timerfd_triggered(struct timerfd_ctx *ctx) { @@ -93,7 +93,7 @@ static enum alarmtimer_restart timerfd_alarmproc(struct alarm *alarm, */ void timerfd_clock_was_set(void) { - ktime_t moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 }); + ktime_t moffs = ktime_mono_to_real(0); struct timerfd_ctx *ctx; unsigned long flags; @@ -102,8 +102,8 @@ void timerfd_clock_was_set(void) if (!ctx->might_cancel) continue; spin_lock_irqsave(&ctx->wqh.lock, flags); - if (ctx->moffs.tv64 != moffs.tv64) { - ctx->moffs.tv64 = KTIME_MAX; + if (ctx->moffs != moffs) { + ctx->moffs = KTIME_MAX; ctx->ticks++; wake_up_locked(&ctx->wqh); } @@ -124,9 +124,9 @@ static void timerfd_remove_cancel(struct timerfd_ctx *ctx) static bool timerfd_canceled(struct timerfd_ctx *ctx) { - if (!ctx->might_cancel || ctx->moffs.tv64 != KTIME_MAX) + if (!ctx->might_cancel || ctx->moffs != KTIME_MAX) return false; - ctx->moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 }); + ctx->moffs = ktime_mono_to_real(0); return true; } @@ -155,7 +155,7 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) else remaining = hrtimer_expires_remaining_adjusted(&ctx->t.tmr); - return remaining.tv64 < 0 ? ktime_set(0, 0): remaining; + return remaining < 0 ? 0: remaining; } static int timerfd_setup(struct timerfd_ctx *ctx, int flags, @@ -184,7 +184,7 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags, ctx->t.tmr.function = timerfd_tmrproc; } - if (texp.tv64 != 0) { + if (texp != 0) { if (isalarm(ctx)) { if (flags & TFD_TIMER_ABSTIME) alarm_start(&ctx->t.alarm, texp); @@ -261,9 +261,9 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, if (ctx->ticks) { ticks = ctx->ticks; - if (ctx->expired && ctx->tintv.tv64) { + if (ctx->expired && ctx->tintv) { /* - * If tintv.tv64 != 0, this is a periodic timer that + * If tintv != 0, this is a periodic timer that * needs to be re-armed. We avoid doing it in the timer * callback to avoid DoS attacks specifying a very * short timer period. @@ -410,7 +410,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) else hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS); - ctx->moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 }); + ctx->moffs = ktime_mono_to_real(0); ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS)); @@ -469,7 +469,7 @@ static int do_timerfd_settime(int ufd, int flags, * We do not update "ticks" and "expired" since the timer will be * re-programmed again in the following timerfd_setup() call. */ - if (ctx->expired && ctx->tintv.tv64) { + if (ctx->expired && ctx->tintv) { if (isalarm(ctx)) alarm_forward_now(&ctx->t.alarm, ctx->tintv); else @@ -499,7 +499,7 @@ static int do_timerfd_gettime(int ufd, struct itimerspec *t) ctx = f.file->private_data; spin_lock_irq(&ctx->wqh.lock); - if (ctx->expired && ctx->tintv.tv64) { + if (ctx->expired && ctx->tintv) { ctx->expired = 0; if (isalarm(ctx)) { diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index 7ff7712f284e..b0d0623c83ed 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig @@ -50,3 +50,14 @@ config UBIFS_ATIME_SUPPORT strictatime is the "heavy", relatime is "lighter", etc. If unsure, say 'N' + +config UBIFS_FS_ENCRYPTION + bool "UBIFS Encryption" + depends on UBIFS_FS && BLOCK + select FS_ENCRYPTION + default n + help + Enable encryption of UBIFS files and directories. This + feature is similar to ecryptfs, but it is more memory + efficient since it avoids caching the encrypted and + decrypted pages in the page cache. 
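The timerfd.c hunks above track the v4.10 ktime_t cleanup: ktime_t is now a plain 64-bit nanosecond count instead of a union, so every .tv64 access turns into a direct comparison or assignment. A minimal standalone sketch of the pattern (illustrative only; the typedef mirrors the post-4.10 kernel definition):

	#include <inttypes.h>
	#include <stdint.h>
	#include <stdio.h>

	typedef int64_t ktime_t;	/* post-4.10: plain s64 nanoseconds */
	#define KTIME_MAX INT64_MAX

	int main(void)
	{
		ktime_t moffs = 0;	/* was: (ktime_t){ .tv64 = 0 } */

		if (moffs != KTIME_MAX)	/* was: moffs.tv64 != KTIME_MAX */
			printf("offset = %" PRId64 " ns\n", moffs);
		return 0;
	}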
diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile index c54a24360f85..6f3251c2bf08 100644 --- a/fs/ubifs/Makefile +++ b/fs/ubifs/Makefile @@ -5,3 +5,4 @@ ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o xattr.o debug.o ubifs-y += misc.o +ubifs-$(CONFIG_UBIFS_FS_ENCRYPTION) += crypto.o diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c new file mode 100644 index 000000000000..3402720f2b28 --- /dev/null +++ b/fs/ubifs/crypto.c @@ -0,0 +1,97 @@ +#include "ubifs.h" + +static int ubifs_crypt_get_context(struct inode *inode, void *ctx, size_t len) +{ + return ubifs_xattr_get(inode, UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len); +} + +static int ubifs_crypt_set_context(struct inode *inode, const void *ctx, + size_t len, void *fs_data) +{ + return ubifs_xattr_set(inode, UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, 0); +} + +static bool ubifs_crypt_empty_dir(struct inode *inode) +{ + return ubifs_check_dir_empty(inode) == 0; +} + +static unsigned int ubifs_crypt_max_namelen(struct inode *inode) +{ + if (S_ISLNK(inode->i_mode)) + return UBIFS_MAX_INO_DATA; + else + return UBIFS_MAX_NLEN; +} + +static int ubifs_key_prefix(struct inode *inode, u8 **key) +{ + static char prefix[] = "ubifs:"; + + *key = prefix; + + return sizeof(prefix) - 1; +} + +int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn, + unsigned int in_len, unsigned int *out_len, int block) +{ + struct ubifs_info *c = inode->i_sb->s_fs_info; + void *p = &dn->data; + struct page *ret; + unsigned int pad_len = round_up(in_len, UBIFS_CIPHER_BLOCK_SIZE); + + ubifs_assert(pad_len <= *out_len); + dn->compr_size = cpu_to_le16(in_len); + + /* pad to full block cipher length */ + if (pad_len != in_len) + memset(p + in_len, 0, pad_len - in_len); + + ret = fscrypt_encrypt_page(inode, virt_to_page(&dn->data), pad_len, + offset_in_page(&dn->data), block, GFP_NOFS); + if (IS_ERR(ret)) { + ubifs_err(c, "fscrypt_encrypt_page failed: %ld", PTR_ERR(ret)); + return PTR_ERR(ret); + } + *out_len = pad_len; + + return 0; +} + +int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn, + unsigned int *out_len, int block) +{ + struct ubifs_info *c = inode->i_sb->s_fs_info; + int err; + unsigned int clen = le16_to_cpu(dn->compr_size); + unsigned int dlen = *out_len; + + if (clen <= 0 || clen > UBIFS_BLOCK_SIZE || clen > dlen) { + ubifs_err(c, "bad compr_size: %i", clen); + return -EINVAL; + } + + ubifs_assert(dlen <= UBIFS_BLOCK_SIZE); + err = fscrypt_decrypt_page(inode, virt_to_page(&dn->data), dlen, + offset_in_page(&dn->data), block); + if (err) { + ubifs_err(c, "fscrypt_decrypt_page failed: %i", err); + return err; + } + *out_len = clen; + + return 0; +} + +struct fscrypt_operations ubifs_crypt_operations = { + .flags = FS_CFLG_OWN_PAGES, + .get_context = ubifs_crypt_get_context, + .set_context = ubifs_crypt_set_context, + .is_encrypted = __ubifs_crypt_is_encrypted, + .empty_dir = ubifs_crypt_empty_dir, + .max_namelen = ubifs_crypt_max_namelen, + .key_prefix = ubifs_key_prefix, +}; diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 69e287e20732..1e712a364680 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -233,7 +233,7 @@ static void dump_ch(const struct ubifs_ch *ch) void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode) { const struct ubifs_inode *ui = ubifs_inode(inode); - struct qstr nm = { .name = NULL }; + struct fscrypt_name nm = {0}; union 
ubifs_key key; struct ubifs_dent_node *dent, *pdent = NULL; int count = 2; @@ -289,8 +289,8 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode) pr_err("\t%d: %s (%s)\n", count++, dent->name, get_dent_type(dent->type)); - nm.name = dent->name; - nm.len = le16_to_cpu(dent->nlen); + fname_name(&nm) = dent->name; + fname_len(&nm) = le16_to_cpu(dent->nlen); kfree(pdent); pdent = dent; key_read(c, &dent->key, &key); @@ -1107,7 +1107,7 @@ int dbg_check_dir(struct ubifs_info *c, const struct inode *dir) unsigned int nlink = 2; union ubifs_key key; struct ubifs_dent_node *dent, *pdent = NULL; - struct qstr nm = { .name = NULL }; + struct fscrypt_name nm = {0}; loff_t size = UBIFS_INO_NODE_SZ; if (!dbg_is_chk_gen(c)) @@ -1128,9 +1128,9 @@ int dbg_check_dir(struct ubifs_info *c, const struct inode *dir) return err; } - nm.name = dent->name; - nm.len = le16_to_cpu(dent->nlen); - size += CALC_DENT_SIZE(nm.len); + fname_name(&nm) = dent->name; + fname_len(&nm) = le16_to_cpu(dent->nlen); + size += CALC_DENT_SIZE(fname_len(&nm)); if (dent->type == UBIFS_ITYPE_DIR) nlink += 1; kfree(pdent); diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index ca16c5d7bab1..528369f3e472 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -85,11 +85,26 @@ static int inherit_flags(const struct inode *dir, umode_t mode) * initializes it. Returns new inode in case of success and an error code in * case of failure. */ -struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, +struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, umode_t mode) { + int err; struct inode *inode; struct ubifs_inode *ui; + bool encrypted = false; + + if (ubifs_crypt_is_encrypted(dir)) { + err = fscrypt_get_encryption_info(dir); + if (err) { + ubifs_err(c, "fscrypt_get_encryption_info failed: %i", err); + return ERR_PTR(err); + } + + if (!fscrypt_has_encryption_key(dir)) + return ERR_PTR(-EPERM); + + encrypted = true; + } inode = new_inode(c->vfs_sb); ui = ubifs_inode(inode); @@ -165,18 +180,29 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, */ ui->creat_sqnum = ++c->max_sqnum; spin_unlock(&c->cnt_lock); + + if (encrypted) { + err = fscrypt_inherit_context(dir, inode, &encrypted, true); + if (err) { + ubifs_err(c, "fscrypt_inherit_context failed: %i", err); + make_bad_inode(inode); + iput(inode); + return ERR_PTR(err); + } + } + return inode; } static int dbg_check_name(const struct ubifs_info *c, const struct ubifs_dent_node *dent, - const struct qstr *nm) + const struct fscrypt_name *nm) { if (!dbg_is_chk_gen(c)) return 0; - if (le16_to_cpu(dent->nlen) != nm->len) + if (le16_to_cpu(dent->nlen) != fname_len(nm)) return -EINVAL; - if (memcmp(dent->name, nm->name, nm->len)) + if (memcmp(dent->name, fname_name(nm), fname_len(nm))) return -EINVAL; return 0; } @@ -189,30 +215,61 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, struct inode *inode = NULL; struct ubifs_dent_node *dent; struct ubifs_info *c = dir->i_sb->s_fs_info; + struct fscrypt_name nm; dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino); - if (dentry->d_name.len > UBIFS_MAX_NLEN) - return ERR_PTR(-ENAMETOOLONG); + if (ubifs_crypt_is_encrypted(dir)) { + err = fscrypt_get_encryption_info(dir); + + /* + * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is + * created while the directory was encrypted and we + * have access to the key. 
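+ *
+ * (Note: fscrypt_setup_filename() below resolves the user-visible
+ * name either to an on-disk name or, when no key is available, to a
+ * hash/minor_hash pair that drives the double-hash lookup via
+ * ubifs_tnc_lookup_dh().)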
+ */ + if (fscrypt_has_encryption_key(dir)) + fscrypt_set_encrypted_dentry(dentry); + fscrypt_set_d_op(dentry); + if (err && err != -ENOKEY) + return ERR_PTR(err); + } + + err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &nm); + if (err) + return ERR_PTR(err); + + if (fname_len(&nm) > UBIFS_MAX_NLEN) { + err = -ENAMETOOLONG; + goto out_fname; + } dent = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS); - if (!dent) - return ERR_PTR(-ENOMEM); + if (!dent) { + err = -ENOMEM; + goto out_fname; + } - dent_key_init(c, &key, dir->i_ino, &dentry->d_name); + if (nm.hash) { + ubifs_assert(fname_len(&nm) == 0); + ubifs_assert(fname_name(&nm) == NULL); + dent_key_init_hash(c, &key, dir->i_ino, nm.hash); + err = ubifs_tnc_lookup_dh(c, &key, dent, nm.minor_hash); + } else { + dent_key_init(c, &key, dir->i_ino, &nm); + err = ubifs_tnc_lookup_nm(c, &key, dent, &nm); + } - err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name); if (err) { if (err == -ENOENT) { dbg_gen("not found"); goto done; } - goto out; + goto out_dent; } - if (dbg_check_name(c, dent, &dentry->d_name)) { + if (dbg_check_name(c, dent, &nm)) { err = -EINVAL; - goto out; + goto out_dent; } inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum)); @@ -225,11 +282,12 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, ubifs_err(c, "dead directory entry '%pd', error %d", dentry, err); ubifs_ro_mode(c, err); - goto out; + goto out_dent; } done: kfree(dent); + fscrypt_free_filename(&nm); /* * Note, d_splice_alias() would be required instead if we supported * NFS. @@ -237,8 +295,10 @@ done: d_add(dentry, inode); return NULL; -out: +out_dent: kfree(dent); +out_fname: + fscrypt_free_filename(&nm); return ERR_PTR(err); } @@ -247,10 +307,11 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, { struct inode *inode; struct ubifs_info *c = dir->i_sb->s_fs_info; - int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, .dirtied_ino = 1 }; struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct fscrypt_name nm; + int err, sz_change; /* * Budget request settings: new inode, new direntry, changing the @@ -264,10 +325,16 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (err) return err; + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); + if (err) + goto out_budg; + + sz_change = CALC_DENT_SIZE(fname_len(&nm)); + inode = ubifs_new_inode(c, dir, mode); if (IS_ERR(inode)) { err = PTR_ERR(inode); - goto out_budg; + goto out_fname; } err = ubifs_init_security(dir, inode, &dentry->d_name); @@ -278,12 +345,13 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; dir->i_mtime = dir->i_ctime = inode->i_ctime; - err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) goto out_cancel; mutex_unlock(&dir_ui->ui_mutex); ubifs_release_budget(c, &req); + fscrypt_free_filename(&nm); insert_inode_hash(inode); d_instantiate(dentry, inode); return 0; @@ -295,6 +363,8 @@ out_cancel: out_inode: make_bad_inode(inode); iput(inode); +out_fname: + fscrypt_free_filename(&nm); out_budg: ubifs_release_budget(c, &req); ubifs_err(c, "cannot create regular file, error %d", err); @@ -310,6 +380,7 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry, struct ubifs_budget_req ino_req = { .dirtied_ino = 1 }; struct ubifs_inode *ui, *dir_ui = ubifs_inode(dir); int 
err, instantiated = 0; + struct fscrypt_name nm; /* * Budget request settings: new dirty inode, new direntry, @@ -319,13 +390,20 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry, dbg_gen("dent '%pd', mode %#hx in dir ino %lu", dentry, mode, dir->i_ino); - err = ubifs_budget_space(c, &req); + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); if (err) return err; + err = ubifs_budget_space(c, &req); + if (err) { + fscrypt_free_filename(&nm); + return err; + } + err = ubifs_budget_space(c, &ino_req); if (err) { ubifs_release_budget(c, &req); + fscrypt_free_filename(&nm); return err; } @@ -361,7 +439,7 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry, mutex_unlock(&ui->ui_mutex); mutex_lock(&dir_ui->ui_mutex); - err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); if (err) goto out_cancel; mutex_unlock(&dir_ui->ui_mutex); @@ -380,6 +458,7 @@ out_budg: ubifs_release_budget(c, &req); if (!instantiated) ubifs_release_budget(c, &ino_req); + fscrypt_free_filename(&nm); ubifs_err(c, "cannot create temporary file, error %d", err); return err; } @@ -439,12 +518,14 @@ static unsigned int vfs_dent_type(uint8_t type) */ static int ubifs_readdir(struct file *file, struct dir_context *ctx) { - int err = 0; - struct qstr nm; + int fstr_real_len = 0, err = 0; + struct fscrypt_name nm; + struct fscrypt_str fstr = {0}; union ubifs_key key; struct ubifs_dent_node *dent; struct inode *dir = file_inode(file); struct ubifs_info *c = dir->i_sb->s_fs_info; + bool encrypted = ubifs_crypt_is_encrypted(dir); dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos); @@ -455,6 +536,18 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) */ return 0; + if (encrypted) { + err = fscrypt_get_encryption_info(dir); + if (err && err != -ENOKEY) + return err; + + err = fscrypt_fname_alloc_buffer(dir, UBIFS_MAX_NLEN, &fstr); + if (err) + return err; + + fstr_real_len = fstr.len; + } + if (file->f_version == 0) { /* * The file was seek'ed, which means that @file->private_data @@ -476,12 +569,15 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) /* File positions 0 and 1 correspond to "." and ".." */ if (ctx->pos < 2) { ubifs_assert(!file->private_data); - if (!dir_emit_dots(file, ctx)) + if (!dir_emit_dots(file, ctx)) { + if (encrypted) + fscrypt_fname_free_buffer(&fstr); return 0; + } /* Find the first entry in TNC and save it */ lowest_dent_key(c, &key, dir->i_ino); - nm.name = NULL; + fname_len(&nm) = 0; dent = ubifs_tnc_next_ent(c, &key, &nm); if (IS_ERR(dent)) { err = PTR_ERR(dent); @@ -499,7 +595,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) * Find the entry corresponding to @ctx->pos or the closest one. 
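 *
 * (Note: for encrypted directories the raw disk name found here is
 * converted with fscrypt_fname_disk_to_usr() before dir_emit(), so
 * without the key userspace sees an encoded placeholder instead of
 * the plaintext name.)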
*/ dent_key_init_hash(c, &key, dir->i_ino, ctx->pos); - nm.name = NULL; + fname_len(&nm) = 0; dent = ubifs_tnc_next_ent(c, &key, &nm); if (IS_ERR(dent)) { err = PTR_ERR(dent); @@ -516,15 +612,33 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) ubifs_assert(le64_to_cpu(dent->ch.sqnum) > ubifs_inode(dir)->creat_sqnum); - nm.len = le16_to_cpu(dent->nlen); - if (!dir_emit(ctx, dent->name, nm.len, + fname_len(&nm) = le16_to_cpu(dent->nlen); + fname_name(&nm) = dent->name; + + if (encrypted) { + fstr.len = fstr_real_len; + + err = fscrypt_fname_disk_to_usr(dir, key_hash_flash(c, + &dent->key), + le32_to_cpu(dent->cookie), + &nm.disk_name, &fstr); + if (err) + goto out; + } else { + fstr.len = fname_len(&nm); + fstr.name = fname_name(&nm); + } + + if (!dir_emit(ctx, fstr.name, fstr.len, le64_to_cpu(dent->inum), - vfs_dent_type(dent->type))) + vfs_dent_type(dent->type))) { + if (encrypted) + fscrypt_fname_free_buffer(&fstr); return 0; + } /* Switch to the next entry */ key_read(c, &dent->key, &key); - nm.name = dent->name; dent = ubifs_tnc_next_ent(c, &key, &nm); if (IS_ERR(dent)) { err = PTR_ERR(dent); @@ -541,6 +655,9 @@ out: kfree(file->private_data); file->private_data = NULL; + if (encrypted) + fscrypt_fname_free_buffer(&fstr); + if (err != -ENOENT) ubifs_err(c, "cannot find next direntry, error %d", err); else @@ -601,6 +718,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2, .dirtied_ino_d = ALIGN(ui->data_len, 8) }; + struct fscrypt_name nm; /* * Budget request settings: new direntry, changing the target inode, @@ -613,13 +731,21 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, ubifs_assert(inode_is_locked(dir)); ubifs_assert(inode_is_locked(inode)); - err = dbg_check_synced_i_size(c, inode); + if (ubifs_crypt_is_encrypted(dir) && + !fscrypt_has_permitted_context(dir, inode)) + return -EPERM; + + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); if (err) return err; + err = dbg_check_synced_i_size(c, inode); + if (err) + goto out_fname; + err = ubifs_budget_space(c, &req); if (err) - return err; + goto out_fname; lock_2_inodes(dir, inode); inc_nlink(inode); @@ -628,13 +754,14 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; dir->i_mtime = dir->i_ctime = inode->i_ctime; - err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) goto out_cancel; unlock_2_inodes(dir, inode); ubifs_release_budget(c, &req); d_instantiate(dentry, inode); + fscrypt_free_filename(&nm); return 0; out_cancel: @@ -644,6 +771,8 @@ out_cancel: unlock_2_inodes(dir, inode); ubifs_release_budget(c, &req); iput(inode); +out_fname: + fscrypt_free_filename(&nm); return err; } @@ -652,10 +781,10 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) struct ubifs_info *c = dir->i_sb->s_fs_info; struct inode *inode = d_inode(dentry); struct ubifs_inode *dir_ui = ubifs_inode(dir); - int sz_change = CALC_DENT_SIZE(dentry->d_name.len); - int err, budgeted = 1; + int err, sz_change, budgeted = 1; struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; unsigned int saved_nlink = inode->i_nlink; + struct fscrypt_name nm; /* * Budget request settings: deletion direntry, deletion inode (+1 for @@ -667,16 +796,29 @@ static int ubifs_unlink(struct inode *dir, struct dentry 
*dentry) dbg_gen("dent '%pd' from ino %lu (nlink %d) in dir ino %lu", dentry, inode->i_ino, inode->i_nlink, dir->i_ino); + + if (ubifs_crypt_is_encrypted(dir)) { + err = fscrypt_get_encryption_info(dir); + if (err && err != -ENOKEY) + return err; + } + + err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &nm); + if (err) + return err; + + sz_change = CALC_DENT_SIZE(fname_len(&nm)); + ubifs_assert(inode_is_locked(dir)); ubifs_assert(inode_is_locked(inode)); err = dbg_check_synced_i_size(c, inode); if (err) - return err; + goto out_fname; err = ubifs_budget_space(c, &req); if (err) { if (err != -ENOSPC) - return err; + goto out_fname; budgeted = 0; } @@ -686,7 +828,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; dir->i_mtime = dir->i_ctime = inode->i_ctime; - err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); if (err) goto out_cancel; unlock_2_inodes(dir, inode); @@ -698,6 +840,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) c->bi.nospace = c->bi.nospace_rp = 0; smp_wmb(); } + fscrypt_free_filename(&nm); return 0; out_cancel: @@ -707,21 +850,23 @@ out_cancel: unlock_2_inodes(dir, inode); if (budgeted) ubifs_release_budget(c, &req); +out_fname: + fscrypt_free_filename(&nm); return err; } /** * check_dir_empty - check if a directory is empty or not. - * @c: UBIFS file-system description object * @dir: VFS inode object of the directory to check * * This function checks if directory @dir is empty. Returns zero if the * directory is empty, %-ENOTEMPTY if it is not, and other negative error codes * in case of of errors. */ -static int check_dir_empty(struct ubifs_info *c, struct inode *dir) +int ubifs_check_dir_empty(struct inode *dir) { - struct qstr nm = { .name = NULL }; + struct ubifs_info *c = dir->i_sb->s_fs_info; + struct fscrypt_name nm = { 0 }; struct ubifs_dent_node *dent; union ubifs_key key; int err; @@ -743,10 +888,10 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) { struct ubifs_info *c = dir->i_sb->s_fs_info; struct inode *inode = d_inode(dentry); - int sz_change = CALC_DENT_SIZE(dentry->d_name.len); - int err, budgeted = 1; + int err, sz_change, budgeted = 1; struct ubifs_inode *dir_ui = ubifs_inode(dir); struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; + struct fscrypt_name nm; /* * Budget request settings: deletion direntry, deletion inode and @@ -758,14 +903,26 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) inode->i_ino, dir->i_ino); ubifs_assert(inode_is_locked(dir)); ubifs_assert(inode_is_locked(inode)); - err = check_dir_empty(c, d_inode(dentry)); + err = ubifs_check_dir_empty(d_inode(dentry)); + if (err) + return err; + + if (ubifs_crypt_is_encrypted(dir)) { + err = fscrypt_get_encryption_info(dir); + if (err && err != -ENOKEY) + return err; + } + + err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &nm); if (err) return err; + sz_change = CALC_DENT_SIZE(fname_len(&nm)); + err = ubifs_budget_space(c, &req); if (err) { if (err != -ENOSPC) - return err; + goto out_fname; budgeted = 0; } @@ -776,7 +933,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; dir->i_mtime = dir->i_ctime = inode->i_ctime; - err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); if (err) goto out_cancel; unlock_2_inodes(dir, inode); @@ -788,6 
+945,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) c->bi.nospace = c->bi.nospace_rp = 0; smp_wmb(); } + fscrypt_free_filename(&nm); return 0; out_cancel: @@ -798,6 +956,8 @@ out_cancel: unlock_2_inodes(dir, inode); if (budgeted) ubifs_release_budget(c, &req); +out_fname: + fscrypt_free_filename(&nm); return err; } @@ -806,8 +966,9 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; struct ubifs_inode *dir_ui = ubifs_inode(dir); struct ubifs_info *c = dir->i_sb->s_fs_info; - int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); + int err, sz_change; struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 }; + struct fscrypt_name nm; /* * Budget request settings: new inode, new direntry and changing parent @@ -821,10 +982,16 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (err) return err; + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); + if (err) + goto out_budg; + + sz_change = CALC_DENT_SIZE(fname_len(&nm)); + inode = ubifs_new_inode(c, dir, S_IFDIR | mode); if (IS_ERR(inode)) { err = PTR_ERR(inode); - goto out_budg; + goto out_fname; } err = ubifs_init_security(dir, inode, &dentry->d_name); @@ -838,7 +1005,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; dir->i_mtime = dir->i_ctime = inode->i_ctime; - err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) { ubifs_err(c, "cannot create directory, error %d", err); goto out_cancel; @@ -847,6 +1014,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) ubifs_release_budget(c, &req); d_instantiate(dentry, inode); + fscrypt_free_filename(&nm); return 0; out_cancel: @@ -857,6 +1025,8 @@ out_cancel: out_inode: make_bad_inode(inode); iput(inode); +out_fname: + fscrypt_free_filename(&nm); out_budg: ubifs_release_budget(c, &req); return err; @@ -870,11 +1040,12 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry, struct ubifs_inode *dir_ui = ubifs_inode(dir); struct ubifs_info *c = dir->i_sb->s_fs_info; union ubifs_dev_desc *dev = NULL; - int sz_change = CALC_DENT_SIZE(dentry->d_name.len); + int sz_change; int err, devlen = 0; struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, .new_ino_d = ALIGN(devlen, 8), .dirtied_ino = 1 }; + struct fscrypt_name nm; /* * Budget request settings: new inode, new direntry and changing parent @@ -896,11 +1067,17 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry, return err; } + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); + if (err) + goto out_budg; + + sz_change = CALC_DENT_SIZE(fname_len(&nm)); + inode = ubifs_new_inode(c, dir, mode); if (IS_ERR(inode)) { kfree(dev); err = PTR_ERR(inode); - goto out_budg; + goto out_fname; } init_special_inode(inode, inode->i_mode, rdev); @@ -917,7 +1094,7 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry, dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; dir->i_mtime = dir->i_ctime = inode->i_ctime; - err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) goto out_cancel; mutex_unlock(&dir_ui->ui_mutex); @@ -925,6 +1102,7 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry, ubifs_release_budget(c, &req); insert_inode_hash(inode); d_instantiate(dentry, inode); + fscrypt_free_filename(&nm); return 0; 
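	/*
	 * (Note: create, mkdir, mknod and symlink in this series share one
	 * shape -- fscrypt_setup_filename() first, dent size taken from
	 * fname_len(), and fscrypt_free_filename() on every exit path,
	 * including the out_fname labels.)
	 */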
out_cancel: @@ -934,6 +1112,8 @@ out_cancel: out_inode: make_bad_inode(inode); iput(inode); +out_fname: + fscrypt_free_filename(&nm); out_budg: ubifs_release_budget(c, &req); return err; @@ -947,10 +1127,27 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, struct ubifs_inode *dir_ui = ubifs_inode(dir); struct ubifs_info *c = dir->i_sb->s_fs_info; int err, len = strlen(symname); - int sz_change = CALC_DENT_SIZE(dentry->d_name.len); + int sz_change = CALC_DENT_SIZE(len); + struct fscrypt_str disk_link = FSTR_INIT((char *)symname, len + 1); + struct fscrypt_symlink_data *sd = NULL; struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, .new_ino_d = ALIGN(len, 8), .dirtied_ino = 1 }; + struct fscrypt_name nm; + + if (ubifs_crypt_is_encrypted(dir)) { + err = fscrypt_get_encryption_info(dir); + if (err) + goto out_budg; + + if (!fscrypt_has_encryption_key(dir)) { + err = -EPERM; + goto out_budg; + } + + disk_link.len = (fscrypt_fname_encrypted_size(dir, len) + + sizeof(struct fscrypt_symlink_data)); + } /* * Budget request settings: new inode, new direntry and changing parent @@ -960,36 +1157,65 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, dbg_gen("dent '%pd', target '%s' in dir ino %lu", dentry, symname, dir->i_ino); - if (len > UBIFS_MAX_INO_DATA) + if (disk_link.len > UBIFS_MAX_INO_DATA) return -ENAMETOOLONG; err = ubifs_budget_space(c, &req); if (err) return err; + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); + if (err) + goto out_budg; + inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO); if (IS_ERR(inode)) { err = PTR_ERR(inode); - goto out_budg; + goto out_fname; } ui = ubifs_inode(inode); - ui->data = kmalloc(len + 1, GFP_NOFS); + ui->data = kmalloc(disk_link.len, GFP_NOFS); if (!ui->data) { err = -ENOMEM; goto out_inode; } - memcpy(ui->data, symname, len); - ((char *)ui->data)[len] = '\0'; - inode->i_link = ui->data; + if (ubifs_crypt_is_encrypted(dir)) { + struct qstr istr = QSTR_INIT(symname, len); + struct fscrypt_str ostr; + + sd = kzalloc(disk_link.len, GFP_NOFS); + if (!sd) { + err = -ENOMEM; + goto out_inode; + } + + ostr.name = sd->encrypted_path; + ostr.len = disk_link.len; + + err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr); + if (err) { + kfree(sd); + goto out_inode; + } + + sd->len = cpu_to_le16(ostr.len); + disk_link.name = (char *)sd; + } else { + inode->i_link = ui->data; + } + + memcpy(ui->data, disk_link.name, disk_link.len); + ((char *)ui->data)[disk_link.len - 1] = '\0'; + /* * The terminating zero byte is not written to the flash media and it * is put just to make later in-memory string processing simpler. Thus, * data length is @len, not @len + %1. 
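 *
 * (Note: for encrypted symlinks @ui->data holds a
 * struct fscrypt_symlink_data rather than plaintext, which is why the
 * stored length becomes disk_link.len - 1 below.)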
*/ - ui->data_len = len; - inode->i_size = ubifs_inode(inode)->ui_size = len; + ui->data_len = disk_link.len - 1; + inode->i_size = ubifs_inode(inode)->ui_size = disk_link.len - 1; err = ubifs_init_security(dir, inode, &dentry->d_name); if (err) @@ -999,7 +1225,7 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; dir->i_mtime = dir->i_ctime = inode->i_ctime; - err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) goto out_cancel; mutex_unlock(&dir_ui->ui_mutex); @@ -1007,6 +1233,7 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, ubifs_release_budget(c, &req); insert_inode_hash(inode); d_instantiate(dentry, inode); + fscrypt_free_filename(&nm); return 0; out_cancel: @@ -1016,6 +1243,8 @@ out_cancel: out_inode: make_bad_inode(inode); iput(inode); +out_fname: + fscrypt_free_filename(&nm); out_budg: ubifs_release_budget(c, &req); return err; @@ -1078,15 +1307,14 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, struct ubifs_inode *whiteout_ui = NULL; int err, release, sync = 0, move = (new_dir != old_dir); int is_dir = S_ISDIR(old_inode->i_mode); - int unlink = !!new_inode; - int new_sz = CALC_DENT_SIZE(new_dentry->d_name.len); - int old_sz = CALC_DENT_SIZE(old_dentry->d_name.len); + int unlink = !!new_inode, new_sz, old_sz; struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1, .dirtied_ino = 3 }; struct ubifs_budget_req ino_req = { .dirtied_ino = 1, .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; struct timespec time; unsigned int uninitialized_var(saved_nlink); + struct fscrypt_name old_nm, new_nm; if (flags & ~RENAME_NOREPLACE) return -EINVAL; @@ -1107,17 +1335,41 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlink) ubifs_assert(inode_is_locked(new_inode)); + if (old_dir != new_dir) { + if (ubifs_crypt_is_encrypted(new_dir) && + !fscrypt_has_permitted_context(new_dir, old_inode)) + return -EPERM; + } + if (unlink && is_dir) { - err = check_dir_empty(c, new_inode); + err = ubifs_check_dir_empty(new_inode); if (err) return err; } - err = ubifs_budget_space(c, &req); + err = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_nm); if (err) return err; + + err = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_nm); + if (err) { + fscrypt_free_filename(&old_nm); + return err; + } + + new_sz = CALC_DENT_SIZE(fname_len(&new_nm)); + old_sz = CALC_DENT_SIZE(fname_len(&old_nm)); + + err = ubifs_budget_space(c, &req); + if (err) { + fscrypt_free_filename(&old_nm); + fscrypt_free_filename(&new_nm); + return err; + } err = ubifs_budget_space(c, &ino_req); if (err) { + fscrypt_free_filename(&old_nm); + fscrypt_free_filename(&new_nm); ubifs_release_budget(c, &req); return err; } @@ -1239,8 +1491,8 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, iput(whiteout); } - err = ubifs_jnl_rename(c, old_dir, old_dentry, new_dir, new_dentry, whiteout, - sync); + err = ubifs_jnl_rename(c, old_dir, old_inode, &old_nm, new_dir, + new_inode, &new_nm, whiteout, sync); if (err) goto out_cancel; @@ -1256,6 +1508,9 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, ubifs_release_budget(c, &ino_req); if (IS_SYNC(old_inode)) err = old_inode->i_sb->s_op->write_inode(old_inode, NULL); + + fscrypt_free_filename(&old_nm); + fscrypt_free_filename(&new_nm); return err; out_cancel: @@ -1284,6 +1539,8 @@ out_cancel: 
unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); ubifs_release_budget(c, &ino_req); ubifs_release_budget(c, &req); + fscrypt_free_filename(&old_nm); + fscrypt_free_filename(&new_nm); return err; } @@ -1298,9 +1555,27 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, struct inode *snd_inode = d_inode(new_dentry); struct timespec time; int err; + struct fscrypt_name fst_nm, snd_nm; ubifs_assert(fst_inode && snd_inode); + if ((ubifs_crypt_is_encrypted(old_dir) || + ubifs_crypt_is_encrypted(new_dir)) && + (old_dir != new_dir) && + (!fscrypt_has_permitted_context(new_dir, fst_inode) || + !fscrypt_has_permitted_context(old_dir, snd_inode))) + return -EPERM; + + err = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &fst_nm); + if (err) + return err; + + err = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &snd_nm); + if (err) { + fscrypt_free_filename(&fst_nm); + return err; + } + lock_4_inodes(old_dir, new_dir, NULL, NULL); time = ubifs_current_time(old_dir); @@ -1320,12 +1595,14 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, } } - err = ubifs_jnl_xrename(c, old_dir, old_dentry, new_dir, new_dentry, - sync); + err = ubifs_jnl_xrename(c, old_dir, fst_inode, &fst_nm, new_dir, + snd_inode, &snd_nm, sync); unlock_4_inodes(old_dir, new_dir, NULL, NULL); ubifs_release_budget(c, &req); + fscrypt_free_filename(&fst_nm); + fscrypt_free_filename(&snd_nm); return err; } @@ -1384,6 +1661,14 @@ int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, return 0; } +static int ubifs_dir_open(struct inode *dir, struct file *file) +{ + if (ubifs_crypt_is_encrypted(dir)) + return fscrypt_get_encryption_info(dir) ? -EACCES : 0; + + return 0; +} + const struct inode_operations ubifs_dir_inode_operations = { .lookup = ubifs_lookup, .create = ubifs_create, @@ -1410,6 +1695,7 @@ const struct file_operations ubifs_dir_operations = { .iterate_shared = ubifs_readdir, .fsync = ubifs_fsync, .unlocked_ioctl = ubifs_ioctl, + .open = ubifs_dir_open, #ifdef CONFIG_COMPAT .compat_ioctl = ubifs_compat_ioctl, #endif diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index b4fbeefba246..b0d783774c96 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -78,6 +78,13 @@ static int read_block(struct inode *inode, void *addr, unsigned int block, goto dump; dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + + if (ubifs_crypt_is_encrypted(inode)) { + err = ubifs_decrypt(inode, dn, &dlen, block); + if (err) + goto dump; + } + out_len = UBIFS_BLOCK_SIZE; err = ubifs_decompress(c, &dn->data, dlen, addr, &out_len, le16_to_cpu(dn->compr_type)); @@ -650,6 +657,13 @@ static int populate_page(struct ubifs_info *c, struct page *page, dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; out_len = UBIFS_BLOCK_SIZE; + + if (ubifs_crypt_is_encrypted(inode)) { + err = ubifs_decrypt(inode, dn, &dlen, page_block); + if (err) + goto out_err; + } + err = ubifs_decompress(c, &dn->data, dlen, addr, &out_len, le16_to_cpu(dn->compr_type)); if (err || len != out_len) @@ -1594,6 +1608,15 @@ static const struct vm_operations_struct ubifs_file_vm_ops = { static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) { int err; + struct inode *inode = file->f_mapping->host; + + if (ubifs_crypt_is_encrypted(inode)) { + err = fscrypt_get_encryption_info(inode); + if (err) + return -EACCES; + if (!fscrypt_has_encryption_key(inode)) + return -ENOKEY; + } err = generic_file_mmap(file, vma); if (err) @@ -1605,6 +1628,88 @@ static int ubifs_file_mmap(struct file *file, 
struct vm_area_struct *vma) return 0; } +static int ubifs_file_open(struct inode *inode, struct file *filp) +{ + int ret; + struct dentry *dir; + struct ubifs_info *c = inode->i_sb->s_fs_info; + + if (ubifs_crypt_is_encrypted(inode)) { + ret = fscrypt_get_encryption_info(inode); + if (ret) + return -EACCES; + if (!fscrypt_has_encryption_key(inode)) + return -ENOKEY; + } + + dir = dget_parent(file_dentry(filp)); + if (ubifs_crypt_is_encrypted(d_inode(dir)) && + !fscrypt_has_permitted_context(d_inode(dir), inode)) { + ubifs_err(c, "Inconsistent encryption contexts: %lu/%lu", + (unsigned long) d_inode(dir)->i_ino, + (unsigned long) inode->i_ino); + dput(dir); + ubifs_ro_mode(c, -EPERM); + return -EPERM; + } + dput(dir); + + return 0; +} + +static const char *ubifs_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + int err; + struct fscrypt_symlink_data *sd; + struct ubifs_inode *ui = ubifs_inode(inode); + struct fscrypt_str cstr; + struct fscrypt_str pstr; + + if (!ubifs_crypt_is_encrypted(inode)) + return ui->data; + + if (!dentry) + return ERR_PTR(-ECHILD); + + err = fscrypt_get_encryption_info(inode); + if (err) + return ERR_PTR(err); + + sd = (struct fscrypt_symlink_data *)ui->data; + cstr.name = sd->encrypted_path; + cstr.len = le16_to_cpu(sd->len); + + if (cstr.len == 0) + return ERR_PTR(-ENOENT); + + if ((cstr.len + sizeof(struct fscrypt_symlink_data) - 1) > ui->data_len) + return ERR_PTR(-EIO); + + err = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr); + if (err) + return ERR_PTR(err); + + err = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); + if (err) { + fscrypt_fname_free_buffer(&pstr); + return ERR_PTR(err); + } + + pstr.name[pstr.len] = '\0'; + + // XXX this probably won't happen anymore... + if (pstr.name[0] == '\0') { + fscrypt_fname_free_buffer(&pstr); + return ERR_PTR(-ENOENT); + } + + set_delayed_call(done, kfree_link, pstr.name); + return pstr.name; +} + + const struct address_space_operations ubifs_file_address_operations = { .readpage = ubifs_readpage, .writepage = ubifs_writepage, @@ -1628,8 +1733,7 @@ const struct inode_operations ubifs_file_inode_operations = { }; const struct inode_operations ubifs_symlink_inode_operations = { - .readlink = generic_readlink, - .get_link = simple_get_link, + .get_link = ubifs_get_link, .setattr = ubifs_setattr, .getattr = ubifs_getattr, .listxattr = ubifs_listxattr, @@ -1647,6 +1751,7 @@ const struct file_operations ubifs_file_operations = { .unlocked_ioctl = ubifs_ioctl, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, + .open = ubifs_file_open, #ifdef CONFIG_COMPAT .compat_ioctl = ubifs_compat_ioctl, #endif diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index e845c64b6ce1..7b35e3d6cde7 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -846,10 +846,6 @@ int ubifs_gc_start_commit(struct ubifs_info *c) */ while (1) { lp = ubifs_fast_find_freeable(c); - if (IS_ERR(lp)) { - err = PTR_ERR(lp); - goto out; - } if (!lp) break; ubifs_assert(!(lp->flags & LPROPS_TAKEN)); diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 97be41215332..3be28900bf37 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -452,16 +452,22 @@ static enum hrtimer_restart wbuf_timer_callback_nolock(struct hrtimer *timer) */ static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) { + ktime_t softlimit = ms_to_ktime(dirty_writeback_interval * 10); + unsigned long long delta = dirty_writeback_interval; + + /* centi to milli, milli to nano, then 10% */ + delta *= 10ULL * NSEC_PER_MSEC / 10ULL; + 
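+	/*
+	 * (Worked example: with the default dirty_writeback_interval of
+	 * 500 centisecs, softlimit is 5 s and delta becomes
+	 * 500 * NSEC_PER_MSEC = 0.5 s, i.e. up to 10% of slack.)
+	 */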
ubifs_assert(!hrtimer_active(&wbuf->timer)); + ubifs_assert(delta <= ULONG_MAX); if (wbuf->no_timer) return; dbg_io("set timer for jhead %s, %llu-%llu millisecs", dbg_jhead(wbuf->jhead), - div_u64(ktime_to_ns(wbuf->softlimit), USEC_PER_SEC), - div_u64(ktime_to_ns(wbuf->softlimit) + wbuf->delta, - USEC_PER_SEC)); - hrtimer_start_range_ns(&wbuf->timer, wbuf->softlimit, wbuf->delta, + div_u64(ktime_to_ns(softlimit), USEC_PER_SEC), + div_u64(ktime_to_ns(softlimit) + delta, USEC_PER_SEC)); + hrtimer_start_range_ns(&wbuf->timer, softlimit, delta, HRTIMER_MODE_REL); } @@ -1059,10 +1065,6 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); wbuf->timer.function = wbuf_timer_callback_nolock; - wbuf->softlimit = ktime_set(WBUF_TIMEOUT_SOFTLIMIT, 0); - wbuf->delta = WBUF_TIMEOUT_HARDLIMIT - WBUF_TIMEOUT_SOFTLIMIT; - wbuf->delta *= 1000000000ULL; - ubifs_assert(wbuf->delta <= ULONG_MAX); return 0; } diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 3c7b29de0ca7..da519ba205f6 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -181,6 +181,26 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) mnt_drop_write_file(file); return err; } + case FS_IOC_SET_ENCRYPTION_POLICY: { +#ifdef CONFIG_UBIFS_FS_ENCRYPTION + struct ubifs_info *c = inode->i_sb->s_fs_info; + + err = ubifs_enable_encryption(c); + if (err) + return err; + + return fscrypt_ioctl_set_policy(file, (const void __user *)arg); +#else + return -EOPNOTSUPP; +#endif + } + case FS_IOC_GET_ENCRYPTION_POLICY: { +#ifdef CONFIG_UBIFS_FS_ENCRYPTION + return fscrypt_ioctl_get_policy(file, (void __user *)arg); +#else + return -EOPNOTSUPP; +#endif + } default: return -ENOTTY; @@ -197,6 +217,9 @@ long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FS_IOC32_SETFLAGS: cmd = FS_IOC_SETFLAGS; break; + case FS_IOC_SET_ENCRYPTION_POLICY: + case FS_IOC_GET_ENCRYPTION_POLICY: + break; default: return -ENOIOCTLCMD; } diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 91bc76dc559e..294519b98874 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -78,16 +78,6 @@ static inline void zero_ino_node_unused(struct ubifs_ino_node *ino) static inline void zero_dent_node_unused(struct ubifs_dent_node *dent) { dent->padding1 = 0; - memset(dent->padding2, 0, 4); -} - -/** - * zero_data_node_unused - zero out unused fields of an on-flash data node. - * @data: the data node to zero out - */ -static inline void zero_data_node_unused(struct ubifs_data_node *data) -{ - memset(data->padding, 0, 2); } /** @@ -511,6 +501,14 @@ static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui) ui->dirty = 0; } +static void set_dent_cookie(struct ubifs_info *c, struct ubifs_dent_node *dent) +{ + if (c->double_hash) + dent->cookie = prandom_u32(); + else + dent->cookie = 0; +} + /** * ubifs_jnl_update - update inode. * @c: UBIFS file-system description object @@ -539,7 +537,7 @@ static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui) * success. In case of failure, a negative error code is returned. 
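 *
 * (Note: @nm is a struct fscrypt_name, so for an encrypted @dir the
 * bytes copied into dent->name are the encrypted disk name, and
 * set_dent_cookie() attaches a random cookie when the double_hash
 * feature is enabled.)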
*/ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, - const struct qstr *nm, const struct inode *inode, + const struct fscrypt_name *nm, const struct inode *inode, int deletion, int xent) { int err, dlen, ilen, len, lnum, ino_offs, dent_offs; @@ -551,11 +549,11 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, struct ubifs_ino_node *ino; union ubifs_key dent_key, ino_key; - dbg_jnl("ino %lu, dent '%.*s', data len %d in dir ino %lu", - inode->i_ino, nm->len, nm->name, ui->data_len, dir->i_ino); + //dbg_jnl("ino %lu, dent '%.*s', data len %d in dir ino %lu", + // inode->i_ino, nm->len, nm->name, ui->data_len, dir->i_ino); ubifs_assert(mutex_is_locked(&host_ui->ui_mutex)); - dlen = UBIFS_DENT_NODE_SZ + nm->len + 1; + dlen = UBIFS_DENT_NODE_SZ + fname_len(nm) + 1; ilen = UBIFS_INO_NODE_SZ; /* @@ -596,9 +594,11 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, key_write(c, &dent_key, dent->key); dent->inum = deletion ? 0 : cpu_to_le64(inode->i_ino); dent->type = get_dent_type(inode->i_mode); - dent->nlen = cpu_to_le16(nm->len); - memcpy(dent->name, nm->name, nm->len); - dent->name[nm->len] = '\0'; + dent->nlen = cpu_to_le16(fname_len(nm)); + memcpy(dent->name, fname_name(nm), fname_len(nm)); + dent->name[fname_len(nm)] = '\0'; + set_dent_cookie(c, dent); + zero_dent_node_unused(dent); ubifs_prep_grp_node(c, dent, dlen, 0); @@ -697,14 +697,18 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, const union ubifs_key *key, const void *buf, int len) { struct ubifs_data_node *data; - int err, lnum, offs, compr_type, out_len; + int err, lnum, offs, compr_type, out_len, compr_len; int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1; struct ubifs_inode *ui = ubifs_inode(inode); + bool encrypted = ubifs_crypt_is_encrypted(inode); dbg_jnlk(key, "ino %lu, blk %u, len %d, key ", (unsigned long)key_inum(c, key), key_block(c, key), len); ubifs_assert(len <= UBIFS_BLOCK_SIZE); + if (encrypted) + dlen += UBIFS_CIPHER_BLOCK_SIZE; + data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN); if (!data) { /* @@ -722,7 +726,6 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, data->ch.node_type = UBIFS_DATA_NODE; key_write(c, key, &data->key); data->size = cpu_to_le32(len); - zero_data_node_unused(data); if (!(ui->flags & UBIFS_COMPR_FL)) /* Compression is disabled for this inode */ @@ -730,9 +733,19 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, else compr_type = ui->compr_type; - out_len = dlen - UBIFS_DATA_NODE_SZ; - ubifs_compress(c, buf, len, &data->data, &out_len, &compr_type); - ubifs_assert(out_len <= UBIFS_BLOCK_SIZE); + out_len = compr_len = dlen - UBIFS_DATA_NODE_SZ; + ubifs_compress(c, buf, len, &data->data, &compr_len, &compr_type); + ubifs_assert(compr_len <= UBIFS_BLOCK_SIZE); + + if (encrypted) { + err = ubifs_encrypt(inode, data, compr_len, &out_len, key_block(c, key)); + if (err) + goto out_free; + + } else { + data->compr_size = 0; + out_len = compr_len; + } dlen = UBIFS_DATA_NODE_SZ + out_len; data->compr_type = cpu_to_le16(compr_type); @@ -911,9 +924,11 @@ int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode) * ubifs_jnl_xrename - cross rename two directory entries. 
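 * (Used for RENAME_EXCHANGE: both directory entries are written below in a single journal group, so after a power cut the swap is either wholly on the flash or not at all.)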
* @c: UBIFS file-system description object * @fst_dir: parent inode of 1st directory entry to exchange - * @fst_dentry: 1st directory entry to exchange + * @fst_inode: 1st inode to exchange + * @fst_nm: name of 1st inode to exchange * @snd_dir: parent inode of 2nd directory entry to exchange - * @snd_dentry: 2nd directory entry to exchange + * @snd_inode: 2nd inode to exchange + * @snd_nm: name of 2nd inode to exchange * @sync: non-zero if the write-buffer has to be synchronized * * This function implements the cross rename operation which may involve @@ -922,29 +937,29 @@ int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode) * returned. */ int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir, - const struct dentry *fst_dentry, + const struct inode *fst_inode, + const struct fscrypt_name *fst_nm, const struct inode *snd_dir, - const struct dentry *snd_dentry, int sync) + const struct inode *snd_inode, + const struct fscrypt_name *snd_nm, int sync) { union ubifs_key key; struct ubifs_dent_node *dent1, *dent2; int err, dlen1, dlen2, lnum, offs, len, plen = UBIFS_INO_NODE_SZ; int aligned_dlen1, aligned_dlen2; int twoparents = (fst_dir != snd_dir); - const struct inode *fst_inode = d_inode(fst_dentry); - const struct inode *snd_inode = d_inode(snd_dentry); void *p; - dbg_jnl("dent '%pd' in dir ino %lu between dent '%pd' in dir ino %lu", - fst_dentry, fst_dir->i_ino, snd_dentry, snd_dir->i_ino); + //dbg_jnl("dent '%pd' in dir ino %lu between dent '%pd' in dir ino %lu", + // fst_dentry, fst_dir->i_ino, snd_dentry, snd_dir->i_ino); ubifs_assert(ubifs_inode(fst_dir)->data_len == 0); ubifs_assert(ubifs_inode(snd_dir)->data_len == 0); ubifs_assert(mutex_is_locked(&ubifs_inode(fst_dir)->ui_mutex)); ubifs_assert(mutex_is_locked(&ubifs_inode(snd_dir)->ui_mutex)); - dlen1 = UBIFS_DENT_NODE_SZ + snd_dentry->d_name.len + 1; - dlen2 = UBIFS_DENT_NODE_SZ + fst_dentry->d_name.len + 1; + dlen1 = UBIFS_DENT_NODE_SZ + fname_len(snd_nm) + 1; + dlen2 = UBIFS_DENT_NODE_SZ + fname_len(fst_nm) + 1; aligned_dlen1 = ALIGN(dlen1, 8); aligned_dlen2 = ALIGN(dlen2, 8); @@ -963,24 +978,24 @@ int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir, /* Make new dent for 1st entry */ dent1->ch.node_type = UBIFS_DENT_NODE; - dent_key_init_flash(c, &dent1->key, snd_dir->i_ino, &snd_dentry->d_name); + dent_key_init_flash(c, &dent1->key, snd_dir->i_ino, snd_nm); dent1->inum = cpu_to_le64(fst_inode->i_ino); dent1->type = get_dent_type(fst_inode->i_mode); - dent1->nlen = cpu_to_le16(snd_dentry->d_name.len); - memcpy(dent1->name, snd_dentry->d_name.name, snd_dentry->d_name.len); - dent1->name[snd_dentry->d_name.len] = '\0'; + dent1->nlen = cpu_to_le16(fname_len(snd_nm)); + memcpy(dent1->name, fname_name(snd_nm), fname_len(snd_nm)); + dent1->name[fname_len(snd_nm)] = '\0'; zero_dent_node_unused(dent1); ubifs_prep_grp_node(c, dent1, dlen1, 0); /* Make new dent for 2nd entry */ dent2 = (void *)dent1 + aligned_dlen1; dent2->ch.node_type = UBIFS_DENT_NODE; - dent_key_init_flash(c, &dent2->key, fst_dir->i_ino, &fst_dentry->d_name); + dent_key_init_flash(c, &dent2->key, fst_dir->i_ino, fst_nm); dent2->inum = cpu_to_le64(snd_inode->i_ino); dent2->type = get_dent_type(snd_inode->i_mode); - dent2->nlen = cpu_to_le16(fst_dentry->d_name.len); - memcpy(dent2->name, fst_dentry->d_name.name, fst_dentry->d_name.len); - dent2->name[fst_dentry->d_name.len] = '\0'; + dent2->nlen = cpu_to_le16(fname_len(fst_nm)); + memcpy(dent2->name, fname_name(fst_nm), fname_len(fst_nm)); + 
dent2->name[fname_len(fst_nm)] = '\0'; zero_dent_node_unused(dent2); ubifs_prep_grp_node(c, dent2, dlen2, 0); @@ -1004,14 +1019,14 @@ int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir, } release_head(c, BASEHD); - dent_key_init(c, &key, snd_dir->i_ino, &snd_dentry->d_name); - err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, &snd_dentry->d_name); + dent_key_init(c, &key, snd_dir->i_ino, snd_nm); + err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, snd_nm); if (err) goto out_ro; offs += aligned_dlen1; - dent_key_init(c, &key, fst_dir->i_ino, &fst_dentry->d_name); - err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, &fst_dentry->d_name); + dent_key_init(c, &key, fst_dir->i_ino, fst_nm); + err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, fst_nm); if (err) goto out_ro; @@ -1063,31 +1078,31 @@ out_free: * returned. */ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, - const struct dentry *old_dentry, + const struct inode *old_inode, + const struct fscrypt_name *old_nm, const struct inode *new_dir, - const struct dentry *new_dentry, + const struct inode *new_inode, + const struct fscrypt_name *new_nm, const struct inode *whiteout, int sync) { void *p; union ubifs_key key; struct ubifs_dent_node *dent, *dent2; int err, dlen1, dlen2, ilen, lnum, offs, len; - const struct inode *old_inode = d_inode(old_dentry); - const struct inode *new_inode = d_inode(new_dentry); int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ; int last_reference = !!(new_inode && new_inode->i_nlink == 0); int move = (old_dir != new_dir); struct ubifs_inode *uninitialized_var(new_ui); - dbg_jnl("dent '%pd' in dir ino %lu to dent '%pd' in dir ino %lu", - old_dentry, old_dir->i_ino, new_dentry, new_dir->i_ino); + //dbg_jnl("dent '%pd' in dir ino %lu to dent '%pd' in dir ino %lu", + // old_dentry, old_dir->i_ino, new_dentry, new_dir->i_ino); ubifs_assert(ubifs_inode(old_dir)->data_len == 0); ubifs_assert(ubifs_inode(new_dir)->data_len == 0); ubifs_assert(mutex_is_locked(&ubifs_inode(old_dir)->ui_mutex)); ubifs_assert(mutex_is_locked(&ubifs_inode(new_dir)->ui_mutex)); - dlen1 = UBIFS_DENT_NODE_SZ + new_dentry->d_name.len + 1; - dlen2 = UBIFS_DENT_NODE_SZ + old_dentry->d_name.len + 1; + dlen1 = UBIFS_DENT_NODE_SZ + fname_len(new_nm) + 1; + dlen2 = UBIFS_DENT_NODE_SZ + fname_len(old_nm) + 1; if (new_inode) { new_ui = ubifs_inode(new_inode); ubifs_assert(mutex_is_locked(&new_ui->ui_mutex)); @@ -1113,19 +1128,19 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, /* Make new dent */ dent->ch.node_type = UBIFS_DENT_NODE; - dent_key_init_flash(c, &dent->key, new_dir->i_ino, &new_dentry->d_name); + dent_key_init_flash(c, &dent->key, new_dir->i_ino, new_nm); dent->inum = cpu_to_le64(old_inode->i_ino); dent->type = get_dent_type(old_inode->i_mode); - dent->nlen = cpu_to_le16(new_dentry->d_name.len); - memcpy(dent->name, new_dentry->d_name.name, new_dentry->d_name.len); - dent->name[new_dentry->d_name.len] = '\0'; + dent->nlen = cpu_to_le16(fname_len(new_nm)); + memcpy(dent->name, fname_name(new_nm), fname_len(new_nm)); + dent->name[fname_len(new_nm)] = '\0'; + set_dent_cookie(c, dent); zero_dent_node_unused(dent); ubifs_prep_grp_node(c, dent, dlen1, 0); dent2 = (void *)dent + aligned_dlen1; dent2->ch.node_type = UBIFS_DENT_NODE; - dent_key_init_flash(c, &dent2->key, old_dir->i_ino, - &old_dentry->d_name); + dent_key_init_flash(c, &dent2->key, old_dir->i_ino, old_nm); if (whiteout) { dent2->inum = cpu_to_le64(whiteout->i_ino); @@ -1135,9 +1150,10 @@ int 
ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, dent2->inum = 0; dent2->type = DT_UNKNOWN; } - dent2->nlen = cpu_to_le16(old_dentry->d_name.len); - memcpy(dent2->name, old_dentry->d_name.name, old_dentry->d_name.len); - dent2->name[old_dentry->d_name.len] = '\0'; + dent2->nlen = cpu_to_le16(fname_len(old_nm)); + memcpy(dent2->name, fname_name(old_nm), fname_len(old_nm)); + dent2->name[fname_len(old_nm)] = '\0'; + set_dent_cookie(c, dent2); zero_dent_node_unused(dent2); ubifs_prep_grp_node(c, dent2, dlen2, 0); @@ -1178,15 +1194,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, } release_head(c, BASEHD); - dent_key_init(c, &key, new_dir->i_ino, &new_dentry->d_name); - err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, &new_dentry->d_name); + dent_key_init(c, &key, new_dir->i_ino, new_nm); + err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, new_nm); if (err) goto out_ro; offs += aligned_dlen1; if (whiteout) { - dent_key_init(c, &key, old_dir->i_ino, &old_dentry->d_name); - err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, &old_dentry->d_name); + dent_key_init(c, &key, old_dir->i_ino, old_nm); + err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, old_nm); if (err) goto out_ro; @@ -1196,8 +1212,8 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, if (err) goto out_ro; - dent_key_init(c, &key, old_dir->i_ino, &old_dentry->d_name); - err = ubifs_tnc_remove_nm(c, &key, &old_dentry->d_name); + dent_key_init(c, &key, old_dir->i_ino, old_nm); + err = ubifs_tnc_remove_nm(c, &key, old_nm); if (err) goto out_ro; } @@ -1251,35 +1267,60 @@ out_free: } /** - * recomp_data_node - re-compress a truncated data node. + * truncate_data_node - re-compress/encrypt a truncated data node. + * @c: UBIFS file-system description object + * @inode: inode which refers to the data node + * @block: data block number * @dn: data node to re-compress * @new_len: new length * * This function is used when an inode is truncated and the last data node of - * the inode has to be re-compressed and re-written. + * the inode has to be re-compressed/encrypted and re-written. 
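+ * Since data is compressed first and then encrypted on the flash, the node is re-processed in reverse order below: decrypt, decompress, re-compress the shortened data, then re-encrypt it before it is written back.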
*/ -static int recomp_data_node(const struct ubifs_info *c, - struct ubifs_data_node *dn, int *new_len) +static int truncate_data_node(const struct ubifs_info *c, const struct inode *inode, + unsigned int block, struct ubifs_data_node *dn, + int *new_len) { void *buf; - int err, len, compr_type, out_len; + int err, dlen, compr_type, out_len, old_dlen; out_len = le32_to_cpu(dn->size); buf = kmalloc(out_len * WORST_COMPR_FACTOR, GFP_NOFS); if (!buf) return -ENOMEM; - len = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + dlen = old_dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; compr_type = le16_to_cpu(dn->compr_type); - err = ubifs_decompress(c, &dn->data, len, buf, &out_len, compr_type); - if (err) - goto out; - ubifs_compress(c, buf, *new_len, &dn->data, &out_len, &compr_type); + if (ubifs_crypt_is_encrypted(inode)) { + err = ubifs_decrypt(inode, dn, &dlen, block); + if (err) + goto out; + } + + if (compr_type != UBIFS_COMPR_NONE) { + err = ubifs_decompress(c, &dn->data, dlen, buf, &out_len, compr_type); + if (err) + goto out; + + ubifs_compress(c, buf, *new_len, &dn->data, &out_len, &compr_type); + } + + if (ubifs_crypt_is_encrypted(inode)) { + err = ubifs_encrypt(inode, dn, out_len, &old_dlen, block); + if (err) + goto out; + + out_len = old_dlen; + } else { + dn->compr_size = 0; + } + ubifs_assert(out_len <= UBIFS_BLOCK_SIZE); dn->compr_type = cpu_to_le16(compr_type); dn->size = cpu_to_le32(*new_len); *new_len = UBIFS_DATA_NODE_SZ + out_len; + err = 0; out: kfree(buf); return err; @@ -1347,17 +1388,9 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, if (le32_to_cpu(dn->size) <= dlen) dlen = 0; /* Nothing to do */ else { - int compr_type = le16_to_cpu(dn->compr_type); - - if (compr_type != UBIFS_COMPR_NONE) { - err = recomp_data_node(c, dn, &dlen); - if (err) - goto out_free; - } else { - dn->size = cpu_to_le32(dlen); - dlen += UBIFS_DATA_NODE_SZ; - } - zero_data_node_unused(dn); + err = truncate_data_node(c, inode, blk, dn, &dlen); + if (err) + goto out_free; } } } @@ -1442,7 +1475,8 @@ out_free: * error code in case of failure. */ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, - const struct inode *inode, const struct qstr *nm) + const struct inode *inode, + const struct fscrypt_name *nm) { int err, xlen, hlen, len, lnum, xent_offs, aligned_xlen; struct ubifs_dent_node *xent; @@ -1451,9 +1485,9 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, int sync = IS_DIRSYNC(host); struct ubifs_inode *host_ui = ubifs_inode(host); - dbg_jnl("host %lu, xattr ino %lu, name '%s', data len %d", - host->i_ino, inode->i_ino, nm->name, - ubifs_inode(inode)->data_len); + //dbg_jnl("host %lu, xattr ino %lu, name '%s', data len %d", + // host->i_ino, inode->i_ino, nm->name, + // ubifs_inode(inode)->data_len); ubifs_assert(inode->i_nlink == 0); ubifs_assert(mutex_is_locked(&host_ui->ui_mutex)); @@ -1461,7 +1495,7 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, * Since we are deleting the inode, we do not bother to attach any data * to it and assume its length is %UBIFS_INO_NODE_SZ. 
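 * The journal entry written below packs three 8-byte-aligned nodes: the deletion entry (xlen), the xattr inode (%UBIFS_INO_NODE_SZ) and the updated host inode (hlen).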
*/ - xlen = UBIFS_DENT_NODE_SZ + nm->len + 1; + xlen = UBIFS_DENT_NODE_SZ + fname_len(nm) + 1; aligned_xlen = ALIGN(xlen, 8); hlen = host_ui->data_len + UBIFS_INO_NODE_SZ; len = aligned_xlen + UBIFS_INO_NODE_SZ + ALIGN(hlen, 8); @@ -1482,9 +1516,9 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, key_write(c, &xent_key, xent->key); xent->inum = 0; xent->type = get_dent_type(inode->i_mode); - xent->nlen = cpu_to_le16(nm->len); - memcpy(xent->name, nm->name, nm->len); - xent->name[nm->len] = '\0'; + xent->nlen = cpu_to_le16(fname_len(nm)); + memcpy(xent->name, fname_name(nm), fname_len(nm)); + xent->name[fname_len(nm)] = '\0'; zero_dent_node_unused(xent); ubifs_prep_grp_node(c, xent, xlen, 0); diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h index c0a95e393347..7547be512db2 100644 --- a/fs/ubifs/key.h +++ b/fs/ubifs/key.h @@ -69,7 +69,7 @@ static inline uint32_t key_r5_hash(const char *s, int len) uint32_t a = 0; const signed char *str = (const signed char *)s; - while (*str) { + while (len--) { a += *str << 4; a += *str >> 4; a *= 11; @@ -153,13 +153,13 @@ static inline void highest_ino_key(const struct ubifs_info *c, * @c: UBIFS file-system description object * @key: key to initialize * @inum: parent inode number - * @nm: direntry name and length + * @nm: direntry name and length. Not a string when encrypted! */ static inline void dent_key_init(const struct ubifs_info *c, union ubifs_key *key, ino_t inum, - const struct qstr *nm) + const struct fscrypt_name *nm) { - uint32_t hash = c->key_hash(nm->name, nm->len); + uint32_t hash = c->key_hash(fname_name(nm), fname_len(nm)); ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); key->u32[0] = inum; @@ -191,10 +191,11 @@ static inline void dent_key_init_hash(const struct ubifs_info *c, * @nm: direntry name and length */ static inline void dent_key_init_flash(const struct ubifs_info *c, void *k, - ino_t inum, const struct qstr *nm) + ino_t inum, + const struct fscrypt_name *nm) { union ubifs_key *key = k; - uint32_t hash = c->key_hash(nm->name, nm->len); + uint32_t hash = c->key_hash(fname_name(nm), fname_len(nm)); ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); key->j32[0] = cpu_to_le32(inum); @@ -225,9 +226,9 @@ static inline void lowest_dent_key(const struct ubifs_info *c, */ static inline void xent_key_init(const struct ubifs_info *c, union ubifs_key *key, ino_t inum, - const struct qstr *nm) + const struct fscrypt_name *nm) { - uint32_t hash = c->key_hash(nm->name, nm->len); + uint32_t hash = c->key_hash(fname_name(nm), fname_len(nm)); ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); key->u32[0] = inum; @@ -242,10 +243,10 @@ static inline void xent_key_init(const struct ubifs_info *c, * @nm: extended attribute entry name and length */ static inline void xent_key_init_flash(const struct ubifs_info *c, void *k, - ino_t inum, const struct qstr *nm) + ino_t inum, const struct fscrypt_name *nm) { union ubifs_key *key = k; - uint32_t hash = c->key_hash(nm->name, nm->len); + uint32_t hash = c->key_hash(fname_name(nm), fname_len(nm)); ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); key->j32[0] = cpu_to_le32(inum); diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index fb0f44cd1e28..ae5c02f22f3e 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -61,7 +61,7 @@ struct replay_entry { struct list_head list; union ubifs_key key; union { - struct qstr nm; + struct fscrypt_name nm; struct { loff_t old_size; loff_t new_size; @@ -327,7 +327,7 @@ static void destroy_replay_list(struct ubifs_info *c) list_for_each_entry_safe(r, 
tmp, &c->replay_list, list) { if (is_hash_key(c, &r->key)) - kfree(r->nm.name); + kfree(fname_name(&r->nm)); list_del(&r->list); kfree(r); } @@ -430,10 +430,10 @@ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len, r->deletion = !!deletion; r->sqnum = sqnum; key_copy(c, key, &r->key); - r->nm.len = nlen; + fname_len(&r->nm) = nlen; memcpy(nbuf, name, nlen); nbuf[nlen] = '\0'; - r->nm.name = nbuf; + fname_name(&r->nm) = nbuf; list_add_tail(&r->list, &c->replay_list); return 0; @@ -456,7 +456,7 @@ int ubifs_validate_entry(struct ubifs_info *c, if (le32_to_cpu(dent->ch.len) != nlen + UBIFS_DENT_NODE_SZ + 1 || dent->type >= UBIFS_ITYPES_CNT || nlen > UBIFS_MAX_NLEN || dent->name[nlen] != 0 || - strnlen(dent->name, nlen) != nlen || + (key_type == UBIFS_XENT_KEY && strnlen(dent->name, nlen) != nlen) || le64_to_cpu(dent->inum) > MAX_INUM) { ubifs_err(c, "bad %s node", key_type == UBIFS_DENT_KEY ? "directory entry" : "extended attribute entry"); diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 3cbb904a6d7d..7f1ead29e727 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -163,6 +163,7 @@ static int create_default_filesystem(struct ubifs_info *c) tmp64 = (long long)max_buds * c->leb_size; if (big_lpt) sup_flags |= UBIFS_FLG_BIGLPT; + sup_flags |= UBIFS_FLG_DOUBLE_HASH; sup->ch.node_type = UBIFS_SB_NODE; sup->key_hash = UBIFS_KEY_HASH_R5; @@ -465,6 +466,16 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup) goto failed; } + if (!c->double_hash && c->fmt_version >= 5) { + err = 16; + goto failed; + } + + if (c->encrypted && c->fmt_version < 5) { + err = 17; + goto failed; + } + return 0; failed: @@ -620,6 +631,24 @@ int ubifs_read_superblock(struct ubifs_info *c) memcpy(&c->uuid, &sup->uuid, 16); c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT); c->space_fixup = !!(sup_flags & UBIFS_FLG_SPACE_FIXUP); + c->double_hash = !!(sup_flags & UBIFS_FLG_DOUBLE_HASH); + c->encrypted = !!(sup_flags & UBIFS_FLG_ENCRYPTION); + + if ((sup_flags & ~UBIFS_FLG_MASK) != 0) { + ubifs_err(c, "Unknown feature flags found: %#x", + sup_flags & ~UBIFS_FLG_MASK); + err = -EINVAL; + goto out; + } + +#ifndef CONFIG_UBIFS_FS_ENCRYPTION + if (c->encrypted) { + ubifs_err(c, "file system contains encrypted files but UBIFS" + " was built without crypto support."); + err = -EINVAL; + goto out; + } +#endif /* Automatically increase file system size to the maximum size */ c->old_leb_cnt = c->leb_cnt; @@ -807,3 +836,33 @@ int ubifs_fixup_free_space(struct ubifs_info *c) ubifs_msg(c, "free space fixup complete"); return err; } + +int ubifs_enable_encryption(struct ubifs_info *c) +{ + int err; + struct ubifs_sb_node *sup; + + if (c->encrypted) + return 0; + + if (c->ro_mount || c->ro_media) + return -EROFS; + + if (c->fmt_version < 5) { + ubifs_err(c, "on-flash format version 5 is needed for encryption"); + return -EINVAL; + } + + sup = ubifs_read_sb_node(c); + if (IS_ERR(sup)) + return PTR_ERR(sup); + + sup->flags |= cpu_to_le32(UBIFS_FLG_ENCRYPTION); + + err = ubifs_write_sb_node(c, sup); + if (!err) + c->encrypted = 1; + kfree(sup); + + return err; +} diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 4ec051089186..e08aa04fc835 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -198,7 +198,6 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) } memcpy(ui->data, ino->data, ui->data_len); ((char *)ui->data)[ui->data_len] = '\0'; - inode->i_link = ui->data; break; case S_IFBLK: case S_IFCHR: @@ -380,6 +379,9 @@ out: } done: clear_inode(inode); +#ifdef 
CONFIG_UBIFS_FS_ENCRYPTION + fscrypt_put_encryption_info(inode, NULL); +#endif } static void ubifs_dirty_inode(struct inode *inode, int flags) @@ -1207,7 +1209,8 @@ static int mount_ubifs(struct ubifs_info *c) bu_init(c); if (!c->ro_mount) { - c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, + c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ + + UBIFS_CIPHER_BLOCK_SIZE, GFP_KERNEL); if (!c->write_reserve_buf) goto out_free; @@ -1620,7 +1623,8 @@ static int ubifs_remount_rw(struct ubifs_info *c) goto out; } - c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, GFP_KERNEL); + c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ + + UBIFS_CIPHER_BLOCK_SIZE, GFP_KERNEL); if (!c->write_reserve_buf) { err = -ENOMEM; goto out; @@ -1995,6 +1999,12 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) return c; } +#ifndef CONFIG_UBIFS_FS_ENCRYPTION +struct fscrypt_operations ubifs_crypt_operations = { + .is_encrypted = __ubifs_crypt_is_encrypted, +}; +#endif + static int ubifs_fill_super(struct super_block *sb, void *data, int silent) { struct ubifs_info *c = sb->s_fs_info; @@ -2041,6 +2051,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE; sb->s_op = &ubifs_super_operations; sb->s_xattr = ubifs_xattr_handlers; + sb->s_cop = &ubifs_crypt_operations; mutex_lock(&c->umount_mutex); err = mount_ubifs(c); diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index fa9a20cc60d6..709aa098dd46 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -34,6 +34,11 @@ #include <linux/slab.h> #include "ubifs.h" +static int try_read_node(const struct ubifs_info *c, void *buf, int type, + int len, int lnum, int offs); +static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_zbranch *zbr, void *node); + /* * Returned codes of 'matches_name()' and 'fallible_matches_name()' functions. * @NAME_LESS: name corresponding to the first argument is less than second @@ -378,7 +383,7 @@ static void lnc_free(struct ubifs_zbranch *zbr) } /** - * tnc_read_node_nm - read a "hashed" leaf node. + * tnc_read_hashed_node - read a "hashed" leaf node. * @c: UBIFS file-system description object * @zbr: key and position of the node * @node: node is returned here @@ -388,8 +393,8 @@ static void lnc_free(struct ubifs_zbranch *zbr) * added to LNC. Returns zero in case of success or a negative negative error * code in case of failure. */ -static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr, - void *node) +static int tnc_read_hashed_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *node) { int err; @@ -402,7 +407,19 @@ static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr, return 0; } - err = ubifs_tnc_read_node(c, zbr, node); + if (c->replaying) { + err = fallible_read_node(c, &zbr->key, zbr, node); + /* + * When the node was not found, return -ENOENT, 0 otherwise. + * Negative return codes stay as-is. + */ + if (err == 0) + err = -ENOENT; + else if (err == 1) + err = 0; + } else { + err = ubifs_tnc_read_node(c, zbr, node); + } if (err) return err; @@ -519,7 +536,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key, * of failure, a negative error code is returned. 
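 * Note that the comparison below is a raw memcmp() of the on-disk bytes: for an encrypted directory both the stored name and @nm are ciphertext, which is sufficient here because collision resolution only needs a stable ordering, not a readable name.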
*/ static int matches_name(struct ubifs_info *c, struct ubifs_zbranch *zbr, - const struct qstr *nm) + const struct fscrypt_name *nm) { struct ubifs_dent_node *dent; int nlen, err; @@ -542,11 +559,11 @@ static int matches_name(struct ubifs_info *c, struct ubifs_zbranch *zbr, dent = zbr->leaf; nlen = le16_to_cpu(dent->nlen); - err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len)); + err = memcmp(dent->name, fname_name(nm), min_t(int, nlen, fname_len(nm))); if (err == 0) { - if (nlen == nm->len) + if (nlen == fname_len(nm)) return NAME_MATCHES; - else if (nlen < nm->len) + else if (nlen < fname_len(nm)) return NAME_LESS; else return NAME_GREATER; @@ -689,7 +706,7 @@ static int tnc_prev(struct ubifs_info *c, struct ubifs_znode **zn, int *n) */ static int resolve_collision(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode **zn, int *n, - const struct qstr *nm) + const struct fscrypt_name *nm) { int err; @@ -807,7 +824,7 @@ static int resolve_collision(struct ubifs_info *c, const union ubifs_key *key, */ static int fallible_matches_name(struct ubifs_info *c, struct ubifs_zbranch *zbr, - const struct qstr *nm) + const struct fscrypt_name *nm) { struct ubifs_dent_node *dent; int nlen, err; @@ -835,11 +852,11 @@ static int fallible_matches_name(struct ubifs_info *c, dent = zbr->leaf; nlen = le16_to_cpu(dent->nlen); - err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len)); + err = memcmp(dent->name, fname_name(nm), min_t(int, nlen, fname_len(nm))); if (err == 0) { - if (nlen == nm->len) + if (nlen == fname_len(nm)) return NAME_MATCHES; - else if (nlen < nm->len) + else if (nlen < fname_len(nm)) return NAME_LESS; else return NAME_GREATER; @@ -878,7 +895,8 @@ out_free: static int fallible_resolve_collision(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode **zn, int *n, - const struct qstr *nm, int adding) + const struct fscrypt_name *nm, + int adding) { struct ubifs_znode *o_znode = NULL, *znode = *zn; int uninitialized_var(o_n), err, cmp, unsure = 0, nn = *n; @@ -1453,7 +1471,7 @@ again: * In this case the leaf node cache gets used, so we pass the * address of the zbranch and keep the mutex locked */ - err = tnc_read_node_nm(c, zt, node); + err = tnc_read_hashed_node(c, zt, node); goto out; } if (safely) { @@ -1782,19 +1800,19 @@ int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu) * @node: the node is returned here * @nm: node name * - * This function look up and reads a node which contains name hash in the key. + * This function looks up and reads a node which contains name hash in the key. * Since the hash may have collisions, there may be many nodes with the same * key, so we have to sequentially look to all of them until the needed one is * found. This function returns zero in case of success, %-ENOENT if the node * was not found, and a negative error code in case of failure. 
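 * The key itself only carries the R5 hash of the name (computed over the raw on-disk bytes, see key_r5_hash()), so colliding entries can only be told apart by comparing the stored names themselves.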
*/ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, - void *node, const struct qstr *nm) + void *node, const struct fscrypt_name *nm) { int found, n, err; struct ubifs_znode *znode; - dbg_tnck(key, "name '%.*s' key ", nm->len, nm->name); + //dbg_tnck(key, "name '%.*s' key ", nm->len, nm->name); mutex_lock(&c->tnc_mutex); found = ubifs_lookup_level0(c, key, &znode, &n); if (!found) { @@ -1816,7 +1834,7 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, goto out_unlock; } - err = tnc_read_node_nm(c, &znode->zbranch[n], node); + err = tnc_read_hashed_node(c, &znode->zbranch[n], node); out_unlock: mutex_unlock(&c->tnc_mutex); @@ -1830,14 +1848,14 @@ out_unlock: * @node: the node is returned here * @nm: node name * - * This function look up and reads a node which contains name hash in the key. + * This function looks up and reads a node which contains name hash in the key. * Since the hash may have collisions, there may be many nodes with the same * key, so we have to sequentially look to all of them until the needed one is * found. This function returns zero in case of success, %-ENOENT if the node * was not found, and a negative error code in case of failure. */ int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, - void *node, const struct qstr *nm) + void *node, const struct fscrypt_name *nm) { int err, len; const struct ubifs_dent_node *dent = node; @@ -1851,16 +1869,105 @@ int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, return err; len = le16_to_cpu(dent->nlen); - if (nm->len == len && !memcmp(dent->name, nm->name, len)) + if (fname_len(nm) == len && !memcmp(dent->name, fname_name(nm), len)) return 0; /* * Unluckily, there are hash collisions and we have to iterate over * them look at each direntry with colliding name hash sequentially. */ + return do_lookup_nm(c, key, node, nm); } +static int do_lookup_dh(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_dent_node *dent, uint32_t cookie) +{ + int n, err, type = key_type(c, key); + struct ubifs_znode *znode; + struct ubifs_zbranch *zbr; + union ubifs_key *dkey, start_key; + + ubifs_assert(is_hash_key(c, key)); + + lowest_dent_key(c, &start_key, key_inum(c, key)); + + mutex_lock(&c->tnc_mutex); + err = ubifs_lookup_level0(c, &start_key, &znode, &n); + if (unlikely(err < 0)) + goto out_unlock; + + for (;;) { + if (!err) { + err = tnc_next(c, &znode, &n); + if (err) + goto out_unlock; + } + + zbr = &znode->zbranch[n]; + dkey = &zbr->key; + + if (key_inum(c, dkey) != key_inum(c, key) || + key_type(c, dkey) != type) { + err = -ENOENT; + goto out_unlock; + } + + err = tnc_read_hashed_node(c, zbr, dent); + if (err) + goto out_unlock; + + if (key_hash(c, key) == key_hash(c, dkey) && + le32_to_cpu(dent->cookie) == cookie) + goto out_unlock; + } + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_lookup_dh - look up a "double hashed" node. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * @cookie: node cookie for collision resolution + * + * This function looks up and reads a node which contains name hash in the key. + * Since the hash may have collisions, there may be many nodes with the same + * key, so we have to sequentially look to all of them until the needed one + * with the same cookie value is found. 
+ * This function returns zero in case of success, %-ENOENT if the node + * was not found, and a negative error code in case of failure. + */ +int ubifs_tnc_lookup_dh(struct ubifs_info *c, const union ubifs_key *key, + void *node, uint32_t cookie) +{ + int err; + const struct ubifs_dent_node *dent = node; + + if (!c->double_hash) + return -EOPNOTSUPP; + + /* + * We assume that in most of the cases there are no name collisions and + * 'ubifs_tnc_lookup()' returns us the right direntry. + */ + err = ubifs_tnc_lookup(c, key, node); + if (err) + return err; + + if (le32_to_cpu(dent->cookie) == cookie) + return 0; + + /* + * Unluckily, there are hash collisions and we have to iterate over + * them and look at each direntry with colliding name hash sequentially. + */ + return do_lookup_dh(c, key, node, cookie); +} + /** * correct_parent_keys - correct parent znodes' keys. * @c: UBIFS file-system description object @@ -2279,14 +2386,15 @@ out_unlock: * may have collisions, like directory entry keys. */ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, - int lnum, int offs, int len, const struct qstr *nm) + int lnum, int offs, int len, + const struct fscrypt_name *nm) { int found, n, err = 0; struct ubifs_znode *znode; mutex_lock(&c->tnc_mutex); - dbg_tnck(key, "LEB %d:%d, name '%.*s', key ", - lnum, offs, nm->len, nm->name); + //dbg_tnck(key, "LEB %d:%d, name '%.*s', key ", + // lnum, offs, nm->len, nm->name); found = lookup_level0_dirty(c, key, &znode, &n); if (found < 0) { err = found; @@ -2344,7 +2452,7 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, * by passing 'ubifs_tnc_remove_nm()' the same key but * an unmatchable name. */ - struct qstr noname = { .name = "" }; + struct fscrypt_name noname = { .disk_name = { .name = "", .len = 1 } }; err = dbg_check_tnc(c, 0); mutex_unlock(&c->tnc_mutex); @@ -2514,13 +2622,13 @@ out_unlock: * Returns %0 on success or negative error code on failure. */ int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key, - const struct qstr *nm) + const struct fscrypt_name *nm) { int n, err; struct ubifs_znode *znode; mutex_lock(&c->tnc_mutex); - dbg_tnck(key, "%.*s, key ", nm->len, nm->name); + //dbg_tnck(key, "%.*s, key ", nm->len, nm->name); err = lookup_level0_dirty(c, key, &znode, &n); if (err < 0) goto out_unlock; @@ -2669,7 +2777,7 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum) { union ubifs_key key1, key2; struct ubifs_dent_node *xent, *pxent = NULL; - struct qstr nm = { .name = NULL }; + struct fscrypt_name nm = {0}; dbg_tnc("ino %lu", (unsigned long)inum); @@ -2694,8 +2802,8 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum) dbg_tnc("xent '%s', ino %lu", xent->name, (unsigned long)xattr_inum); - nm.name = xent->name; - nm.len = le16_to_cpu(xent->nlen); + fname_name(&nm) = xent->name; + fname_len(&nm) = le16_to_cpu(xent->nlen); err = ubifs_tnc_remove_nm(c, &key1, &nm); if (err) { kfree(xent); @@ -2747,7 +2855,7 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum) */ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, union ubifs_key *key, - const struct qstr *nm) + const struct fscrypt_name *nm) { int n, err, type = key_type(c, key); struct ubifs_znode *znode; @@ -2755,7 +2863,7 @@ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, struct ubifs_zbranch *zbr; union ubifs_key *dkey; - dbg_tnck(key, "%s ", nm->name ? (char *)nm->name : "(lowest)"); + //dbg_tnck(key, "%s ", nm->name ? 
(char *)nm->name : "(lowest)"); ubifs_assert(is_hash_key(c, key)); mutex_lock(&c->tnc_mutex); @@ -2763,10 +2871,14 @@ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, if (unlikely(err < 0)) goto out_unlock; - if (nm->name) { + if (fname_len(nm) > 0) { if (err) { /* Handle collisions */ - err = resolve_collision(c, key, &znode, &n, nm); + if (c->replaying) + err = fallible_resolve_collision(c, key, &znode, &n, + nm, 0); + else + err = resolve_collision(c, key, &znode, &n, nm); dbg_tnc("rc returned %d, znode %p, n %d", err, znode, n); if (unlikely(err < 0)) @@ -2813,7 +2925,7 @@ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, goto out_free; } - err = tnc_read_node_nm(c, zbr, dent); + err = tnc_read_hashed_node(c, zbr, dent); if (unlikely(err)) goto out_free; diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h index e24380cf46ed..e8c23c9d4f4a 100644 --- a/fs/ubifs/ubifs-media.h +++ b/fs/ubifs/ubifs-media.h @@ -46,7 +46,7 @@ * UBIFS went into mainline kernel with format version 4. The older formats * were development formats. */ -#define UBIFS_FORMAT_VERSION 4 +#define UBIFS_FORMAT_VERSION 5 /* * Read-only compatibility version. If the UBIFS format is changed, older UBIFS @@ -301,6 +301,13 @@ enum { #define UBIFS_MAX_NODE_SZ UBIFS_MAX_INO_NODE_SZ /* + * xattr name of UBIFS encryption context, we don't use a prefix + * nor a long name to not waste space on the flash. + */ +#define UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT "c" + + +/* * On-flash inode flags. * * UBIFS_COMPR_FL: use compression for this inode @@ -309,6 +316,7 @@ enum { * UBIFS_APPEND_FL: writes to the inode may only append data * UBIFS_DIRSYNC_FL: I/O on this directory inode has to be synchronous * UBIFS_XATTR_FL: this inode is the inode for an extended attribute value + * UBIFS_CRYPT_FL: use encryption for this inode * * Note, these are on-flash flags which correspond to ioctl flags * (@FS_COMPR_FL, etc). They have the same values now, but generally, do not @@ -321,6 +329,7 @@ enum { UBIFS_APPEND_FL = 0x08, UBIFS_DIRSYNC_FL = 0x10, UBIFS_XATTR_FL = 0x20, + UBIFS_CRYPT_FL = 0x40, }; /* Inode flag bits used by UBIFS */ @@ -409,12 +418,19 @@ enum { * * UBIFS_FLG_BIGLPT: if "big" LPT model is used if set * UBIFS_FLG_SPACE_FIXUP: first-mount "fixup" of free space within LEBs needed + * UBIFS_FLG_DOUBLE_HASH: store a 32-bit cookie in directory entry nodes to + * support 64-bit cookies for lookups by hash + * UBIFS_FLG_ENCRYPTION: this filesystem contains encrypted files */ enum { UBIFS_FLG_BIGLPT = 0x02, UBIFS_FLG_SPACE_FIXUP = 0x04, + UBIFS_FLG_DOUBLE_HASH = 0x08, + UBIFS_FLG_ENCRYPTION = 0x10, }; +#define UBIFS_FLG_MASK (UBIFS_FLG_BIGLPT|UBIFS_FLG_SPACE_FIXUP|UBIFS_FLG_DOUBLE_HASH|UBIFS_FLG_ENCRYPTION) + /** * struct ubifs_ch - common header node. * @magic: UBIFS node magic number (%UBIFS_NODE_MAGIC) @@ -521,7 +537,8 @@ struct ubifs_ino_node { * @padding1: reserved for future, zeroes * @type: type of the target inode (%UBIFS_ITYPE_REG, %UBIFS_ITYPE_DIR, etc) * @nlen: name length - * @padding2: reserved for future, zeroes + * @cookie: A 32-bit random number, used to construct a 64-bit * identifier. * @name: zero-terminated name * * Note, do not forget to amend 'zero_dent_node_unused()' function when * changing the padding fields. @@ -534,7 +551,7 @@ struct ubifs_dent_node { __u8 padding1; __u8 type; __le16 nlen; - __u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! 
*/ + __le32 cookie; __u8 name[]; } __packed; @@ -544,18 +561,16 @@ struct ubifs_dent_node { * @key: node key * @size: uncompressed data size in bytes * @compr_type: compression type (%UBIFS_COMPR_NONE, %UBIFS_COMPR_LZO, etc) - * @padding: reserved for future, zeroes + * @compr_size: compressed data size in bytes, only valid when data is encrypted * @data: data * - * Note, do not forget to amend 'zero_data_node_unused()' function when - * changing the padding fields. */ struct ubifs_data_node { struct ubifs_ch ch; __u8 key[UBIFS_MAX_KEY_LEN]; __le32 size; __le16 compr_type; - __u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */ + __le16 compr_size; __u8 data[]; } __packed; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 096035eb29d0..ca72382ce6cc 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -38,6 +38,8 @@ #include <linux/backing-dev.h> #include <linux/security.h> #include <linux/xattr.h> +#include <linux/fscrypto.h> +#include <linux/random.h> #include "ubifs-media.h" /* Version of this UBIFS implementation */ @@ -83,10 +85,6 @@ */ #define BGT_NAME_PATTERN "ubifs_bgt%d_%d" -/* Write-buffer synchronization timeout interval in seconds */ -#define WBUF_TIMEOUT_SOFTLIMIT 3 -#define WBUF_TIMEOUT_HARDLIMIT 5 - /* Maximum possible inode number (only 32-bit inodes are supported now) */ #define MAX_INUM 0xFFFFFFFF @@ -138,6 +136,12 @@ */ #define WORST_COMPR_FACTOR 2 +#ifdef CONFIG_UBIFS_FS_ENCRYPTION +#define UBIFS_CIPHER_BLOCK_SIZE FS_CRYPTO_BLOCK_SIZE +#else +#define UBIFS_CIPHER_BLOCK_SIZE 0 +#endif + /* * How much memory is needed for a buffer where we compress a data node. */ @@ -645,9 +649,6 @@ typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c, * @io_mutex: serializes write-buffer I/O * @lock: serializes @buf, @lnum, @offs, @avail, @used, @next_ino and @inodes * fields - * @softlimit: soft write-buffer timeout interval - * @delta: hard and soft timeouts delta (the timer expire interval is @softlimit - * and @softlimit + @delta) * @timer: write-buffer timer * @no_timer: non-zero if this write-buffer does not have a timer * @need_sync: non-zero if the timer expired and the wbuf needs sync'ing @@ -676,8 +677,6 @@ struct ubifs_wbuf { int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad); struct mutex io_mutex; spinlock_t lock; - ktime_t softlimit; - unsigned long long delta; struct hrtimer timer; unsigned int no_timer:1; unsigned int need_sync:1; @@ -1007,6 +1006,8 @@ struct ubifs_debug_info; * * @big_lpt: flag that LPT is too big to write whole during commit * @space_fixup: flag indicating that free space in LEBs needs to be cleaned up + * @double_hash: flag indicating that we can do lookups by hash + * @encrypted: flag indicating that this file system contains encrypted files * @no_chk_data_crc: do not check CRCs when reading data nodes (except during * recovery) * @bulk_read: enable bulk-reads @@ -1249,6 +1250,8 @@ struct ubifs_info { unsigned int big_lpt:1; unsigned int space_fixup:1; + unsigned int double_hash:1; + unsigned int encrypted:1; unsigned int no_chk_data_crc:1; unsigned int bulk_read:1; unsigned int default_compr:2; @@ -1515,25 +1518,29 @@ int ubifs_consolidate_log(struct ubifs_info *c); /* journal.c */ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, - const struct qstr *nm, const struct inode *inode, + const struct fscrypt_name *nm, const struct inode *inode, int deletion, int xent); int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, const union ubifs_key *key, const void *buf, 
int len); int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode); int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode); int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir, - const struct dentry *fst_dentry, + const struct inode *fst_inode, + const struct fscrypt_name *fst_nm, const struct inode *snd_dir, - const struct dentry *snd_dentry, int sync); + const struct inode *snd_inode, + const struct fscrypt_name *snd_nm, int sync); int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, - const struct dentry *old_dentry, + const struct inode *old_inode, + const struct fscrypt_name *old_nm, const struct inode *new_dir, - const struct dentry *new_dentry, + const struct inode *new_inode, + const struct fscrypt_name *new_nm, const struct inode *whiteout, int sync); int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, loff_t old_size, loff_t new_size); int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, - const struct inode *inode, const struct qstr *nm); + const struct inode *inode, const struct fscrypt_name *nm); int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode1, const struct inode *inode2); @@ -1568,7 +1575,9 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c); int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode **zn, int *n); int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, - void *node, const struct qstr *nm); + void *node, const struct fscrypt_name *nm); +int ubifs_tnc_lookup_dh(struct ubifs_info *c, const union ubifs_key *key, + void *node, uint32_t secondary_hash); int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, void *node, int *lnum, int *offs); int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum, @@ -1576,16 +1585,16 @@ int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum, int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key, int old_lnum, int old_offs, int lnum, int offs, int len); int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, - int lnum, int offs, int len, const struct qstr *nm); + int lnum, int offs, int len, const struct fscrypt_name *nm); int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key); int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key, - const struct qstr *nm); + const struct fscrypt_name *nm); int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key, union ubifs_key *to_key); int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum); struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, union ubifs_key *key, - const struct qstr *nm); + const struct fscrypt_name *nm); void ubifs_tnc_close(struct ubifs_info *c); int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level, int lnum, int offs, int is_idx); @@ -1642,6 +1651,7 @@ int ubifs_read_superblock(struct ubifs_info *c); struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c); int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup); int ubifs_fixup_free_space(struct ubifs_info *c); +int ubifs_enable_encryption(struct ubifs_info *c); /* replay.c */ int ubifs_validate_entry(struct ubifs_info *c, @@ -1733,16 +1743,21 @@ int ubifs_update_time(struct inode *inode, struct timespec *time, int flags); #endif /* dir.c */ -struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, 
+struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, umode_t mode); int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); +int ubifs_check_dir_empty(struct inode *dir); /* xattr.c */ extern const struct xattr_handler *ubifs_xattr_handlers[]; ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size); int ubifs_init_security(struct inode *dentry, struct inode *inode, const struct qstr *qstr); +int ubifs_xattr_set(struct inode *host, const char *name, const void *value, + size_t size, int flags); +ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf, + size_t size); /* super.c */ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum); @@ -1781,6 +1796,66 @@ int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len, #include "misc.h" #include "key.h" +#ifndef CONFIG_UBIFS_FS_ENCRYPTION +#define fscrypt_set_d_op(i) +#define fscrypt_get_ctx fscrypt_notsupp_get_ctx +#define fscrypt_release_ctx fscrypt_notsupp_release_ctx +#define fscrypt_encrypt_page fscrypt_notsupp_encrypt_page +#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page +#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages +#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page +#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page +#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range +#define fscrypt_ioctl_set_policy fscrypt_notsupp_ioctl_set_policy +#define fscrypt_ioctl_get_policy fscrypt_notsupp_ioctl_get_policy +#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context +#define fscrypt_inherit_context fscrypt_notsupp_inherit_context +#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info +#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info +#define fscrypt_setup_filename fscrypt_notsupp_setup_filename +#define fscrypt_free_filename fscrypt_notsupp_free_filename +#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size +#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer +#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer +#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr +#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk +static inline int ubifs_encrypt(const struct inode *inode, + struct ubifs_data_node *dn, + unsigned int in_len, unsigned int *out_len, + int block) +{ + ubifs_assert(0); + return -EOPNOTSUPP; +} +static inline int ubifs_decrypt(const struct inode *inode, + struct ubifs_data_node *dn, + unsigned int *out_len, int block) +{ + ubifs_assert(0); + return -EOPNOTSUPP; +} +#else +/* crypto.c */ +int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn, + unsigned int in_len, unsigned int *out_len, int block); +int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn, + unsigned int *out_len, int block); +#endif + +extern struct fscrypt_operations ubifs_crypt_operations; + +static inline bool __ubifs_crypt_is_encrypted(struct inode *inode) +{ + struct ubifs_inode *ui = ubifs_inode(inode); + + return ui->flags & UBIFS_CRYPT_FL; +} + +static inline bool ubifs_crypt_is_encrypted(const struct inode *inode) +{ + return __ubifs_crypt_is_encrypted((struct inode *)inode); +} + /* Normal UBIFS messages */ __printf(2, 3) void ubifs_msg(const struct ubifs_info *c, const char *fmt, ...); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index d9f9615bfd71..efe00fcb8b75 100644 --- 
a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -97,7 +97,7 @@ static const struct file_operations empty_fops; * of failure. */ static int create_xattr(struct ubifs_info *c, struct inode *host, - const struct qstr *nm, const void *value, int size) + const struct fscrypt_name *nm, const void *value, int size) { int err, names_len; struct inode *inode; @@ -117,7 +117,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, * extended attributes if the name list becomes larger. This limitation * is artificial for UBIFS, though. */ - names_len = host_ui->xattr_names + host_ui->xattr_cnt + nm->len + 1; + names_len = host_ui->xattr_names + host_ui->xattr_cnt + fname_len(nm) + 1; if (names_len > XATTR_LIST_MAX) { ubifs_err(c, "cannot add one more xattr name to inode %lu, total names length would become %d, max. is %d", host->i_ino, names_len, XATTR_LIST_MAX); @@ -154,9 +154,18 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, mutex_lock(&host_ui->ui_mutex); host->i_ctime = ubifs_current_time(host); host_ui->xattr_cnt += 1; - host_ui->xattr_size += CALC_DENT_SIZE(nm->len); + host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm)); host_ui->xattr_size += CALC_XATTR_BYTES(size); - host_ui->xattr_names += nm->len; + host_ui->xattr_names += fname_len(nm); + + /* + * We handle UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT here because we + * have to set the UBIFS_CRYPT_FL flag on the host inode. + * To avoid multiple updates of the same inode in the same operation, + * let's do it here. + */ + if (strcmp(fname_name(nm), UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT) == 0) + host_ui->flags |= UBIFS_CRYPT_FL; err = ubifs_jnl_update(c, host, nm, inode, 0, 1); if (err) @@ -170,9 +179,10 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, out_cancel: host_ui->xattr_cnt -= 1; - host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); + host_ui->xattr_size -= CALC_DENT_SIZE(fname_len(nm)); host_ui->xattr_size -= CALC_XATTR_BYTES(size); - host_ui->xattr_names -= nm->len; + host_ui->xattr_names -= fname_len(nm); + host_ui->flags &= ~UBIFS_CRYPT_FL; mutex_unlock(&host_ui->ui_mutex); out_free: make_bad_inode(inode); @@ -269,22 +279,28 @@ static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum) return ERR_PTR(-EINVAL); } -static int __ubifs_setxattr(struct inode *host, const char *name, - const void *value, size_t size, int flags) +int ubifs_xattr_set(struct inode *host, const char *name, const void *value, + size_t size, int flags) { struct inode *inode; struct ubifs_info *c = host->i_sb->s_fs_info; - struct qstr nm = QSTR_INIT(name, strlen(name)); + struct fscrypt_name nm = { .disk_name = FSTR_INIT((char *)name, strlen(name))}; struct ubifs_dent_node *xent; union ubifs_key key; int err; - ubifs_assert(inode_is_locked(host)); + /* + * Creating an encryption context is done unlocked since we + * operate on a new inode which is not visible to other users + * at this point. 
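+	 * This is also why the inode_is_locked() assertion below is skipped for the UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT ("c") xattr.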
+ */ + if (strcmp(name, UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT) != 0) + ubifs_assert(inode_is_locked(host)); if (size > UBIFS_MAX_INO_DATA) return -ERANGE; - if (nm.len > UBIFS_MAX_NLEN) + if (fname_len(&nm) > UBIFS_MAX_NLEN) return -ENAMETOOLONG; xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); @@ -329,18 +345,18 @@ out_free: return err; } -static ssize_t __ubifs_getxattr(struct inode *host, const char *name, - void *buf, size_t size) +ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf, + size_t size) { struct inode *inode; struct ubifs_info *c = host->i_sb->s_fs_info; - struct qstr nm = QSTR_INIT(name, strlen(name)); + struct fscrypt_name nm = { .disk_name = FSTR_INIT((char *)name, strlen(name))}; struct ubifs_inode *ui; struct ubifs_dent_node *xent; union ubifs_key key; int err; - if (nm.len > UBIFS_MAX_NLEN) + if (fname_len(&nm) > UBIFS_MAX_NLEN) return -ENAMETOOLONG; xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); @@ -387,6 +403,20 @@ out_unlock: return err; } +static bool xattr_visible(const char *name) +{ + /* File encryption related xattrs are for internal use only */ + if (strcmp(name, UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT) == 0) + return false; + + /* Show trusted namespace only for "power" users */ + if (strncmp(name, XATTR_TRUSTED_PREFIX, + XATTR_TRUSTED_PREFIX_LEN) == 0 && !capable(CAP_SYS_ADMIN)) + return false; + + return true; +} + ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) { union ubifs_key key; @@ -395,7 +425,7 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) struct ubifs_inode *host_ui = ubifs_inode(host); struct ubifs_dent_node *xent, *pxent = NULL; int err, len, written = 0; - struct qstr nm = { .name = NULL }; + struct fscrypt_name nm = {0}; dbg_gen("ino %lu ('%pd'), buffer size %zd", host->i_ino, dentry, size); @@ -419,15 +449,12 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) break; } - nm.name = xent->name; - nm.len = le16_to_cpu(xent->nlen); + fname_name(&nm) = xent->name; + fname_len(&nm) = le16_to_cpu(xent->nlen); - /* Show trusted namespace only for "power" users */ - if (strncmp(xent->name, XATTR_TRUSTED_PREFIX, - XATTR_TRUSTED_PREFIX_LEN) || - capable(CAP_SYS_ADMIN)) { - memcpy(buffer + written, nm.name, nm.len + 1); - written += nm.len + 1; + if (xattr_visible(xent->name)) { + memcpy(buffer + written, fname_name(&nm), fname_len(&nm) + 1); + written += fname_len(&nm) + 1; } kfree(pxent); @@ -446,7 +473,7 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) } static int remove_xattr(struct ubifs_info *c, struct inode *host, - struct inode *inode, const struct qstr *nm) + struct inode *inode, const struct fscrypt_name *nm) { int err; struct ubifs_inode *host_ui = ubifs_inode(host); @@ -463,9 +490,9 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host, mutex_lock(&host_ui->ui_mutex); host->i_ctime = ubifs_current_time(host); host_ui->xattr_cnt -= 1; - host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); + host_ui->xattr_size -= CALC_DENT_SIZE(fname_len(nm)); host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); - host_ui->xattr_names -= nm->len; + host_ui->xattr_names -= fname_len(nm); err = ubifs_jnl_delete_xattr(c, host, inode, nm); if (err) @@ -477,27 +504,27 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host, out_cancel: host_ui->xattr_cnt += 1; - host_ui->xattr_size += CALC_DENT_SIZE(nm->len); + host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm)); host_ui->xattr_size += 
CALC_XATTR_BYTES(ui->data_len); - host_ui->xattr_names += nm->len; + host_ui->xattr_names += fname_len(nm); mutex_unlock(&host_ui->ui_mutex); ubifs_release_budget(c, &req); make_bad_inode(inode); return err; } -static int __ubifs_removexattr(struct inode *host, const char *name) +static int ubifs_xattr_remove(struct inode *host, const char *name) { struct inode *inode; struct ubifs_info *c = host->i_sb->s_fs_info; - struct qstr nm = QSTR_INIT(name, strlen(name)); + struct fscrypt_name nm = { .disk_name = FSTR_INIT((char *)name, strlen(name))}; struct ubifs_dent_node *xent; union ubifs_key key; int err; ubifs_assert(inode_is_locked(host)); - if (nm.len > UBIFS_MAX_NLEN) + if (fname_len(&nm) > UBIFS_MAX_NLEN) return -ENAMETOOLONG; xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); @@ -548,7 +575,8 @@ static int init_xattrs(struct inode *inode, const struct xattr *xattr_array, } strcpy(name, XATTR_SECURITY_PREFIX); strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); - err = __ubifs_setxattr(inode, name, xattr->value, xattr->value_len, 0); + err = ubifs_xattr_set(inode, name, xattr->value, + xattr->value_len, 0); kfree(name); if (err < 0) break; @@ -572,7 +600,7 @@ int ubifs_init_security(struct inode *dentry, struct inode *inode, return err; } -static int ubifs_xattr_get(const struct xattr_handler *handler, +static int xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size) { @@ -580,10 +608,10 @@ static int ubifs_xattr_get(const struct xattr_handler *handler, inode->i_ino, dentry, size); name = xattr_full_name(handler, name); - return __ubifs_getxattr(inode, name, buffer, size); + return ubifs_xattr_get(inode, name, buffer, size); } -static int ubifs_xattr_set(const struct xattr_handler *handler, +static int xattr_set(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -594,27 +622,27 @@ static int ubifs_xattr_set(const struct xattr_handler *handler, name = xattr_full_name(handler, name); if (value) - return __ubifs_setxattr(inode, name, value, size, flags); + return ubifs_xattr_set(inode, name, value, size, flags); else - return __ubifs_removexattr(inode, name); + return ubifs_xattr_remove(inode, name); } static const struct xattr_handler ubifs_user_xattr_handler = { .prefix = XATTR_USER_PREFIX, - .get = ubifs_xattr_get, - .set = ubifs_xattr_set, + .get = xattr_get, + .set = xattr_set, }; static const struct xattr_handler ubifs_trusted_xattr_handler = { .prefix = XATTR_TRUSTED_PREFIX, - .get = ubifs_xattr_get, - .set = ubifs_xattr_set, + .get = xattr_get, + .set = xattr_set, }; static const struct xattr_handler ubifs_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, - .get = ubifs_xattr_get, - .set = ubifs_xattr_set, + .get = xattr_get, + .set = xattr_set, }; const struct xattr_handler *ubifs_xattr_handlers[] = { diff --git a/fs/udf/dir.c b/fs/udf/dir.c index aaec13c95253..2d0e028067eb 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -30,6 +30,7 @@ #include <linux/errno.h> #include <linux/mm.h> #include <linux/slab.h> +#include <linux/bio.h> #include "udf_i.h" #include "udf_sb.h" diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 988d5352bdb8..7aa48bd7cbaf 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -16,6 +16,7 @@ #include <linux/fs.h> #include <linux/string.h> +#include <linux/bio.h> struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, struct udf_fileident_bh 
*fibh, diff --git a/fs/udf/inode.c b/fs/udf/inode.c index aad46401ede5..0f3db71753aa 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -38,6 +38,7 @@ #include <linux/crc-itu-t.h> #include <linux/mpage.h> #include <linux/uio.h> +#include <linux/bio.h> #include "udf_i.h" #include "udf_sb.h" diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 67e085d591d8..a0376a2c1c29 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -15,6 +15,7 @@ #include <linux/buffer_head.h> #include <linux/capability.h> #include <linux/bitops.h> +#include <linux/bio.h> #include <asm/byteorder.h> #include "ufs_fs.h" @@ -306,8 +307,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg, (unsigned long long)(pos + newb), pos); bh->b_blocknr = newb + pos; - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); mark_buffer_dirty(bh); ++j; bh = bh->b_this_page; diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 190d64be22ed..7e41aee7b69a 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -25,7 +25,7 @@ * David S. Miller (davem@caip.rutgers.edu), 1995 */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/errno.h> #include <linux/fs.h> @@ -1070,8 +1070,7 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size) if (buffer_new(bh)) { clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); /* * we do not zeroize the fragment, because * if it is mapped to a hole it already contains zeroes @@ -1192,7 +1191,7 @@ out: return err; } -void ufs_truncate_blocks(struct inode *inode) +static void ufs_truncate_blocks(struct inode *inode) { if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) diff --git a/fs/ufs/super.c b/fs/ufs/super.c index f04ab232d08d..131b2b77c818 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -71,7 +71,7 @@ #include <stdarg.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/errno.h> #include <linux/fs.h> diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 85959d8324df..43953e03c356 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -63,6 +63,7 @@ struct userfaultfd_wait_queue { struct uffd_msg msg; wait_queue_t wq; struct userfaultfd_ctx *ctx; + bool waken; }; struct userfaultfd_wake_range { @@ -86,6 +87,12 @@ static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode, if (len && (start > uwq->msg.arg.pagefault.address || start + len <= uwq->msg.arg.pagefault.address)) goto out; + WRITE_ONCE(uwq->waken, true); + /* + * The implicit smp_mb__before_spinlock in try_to_wake_up() + * renders uwq->waken visible to other CPUs before the task is + * woken. + */ ret = wake_up_state(wq->private, mode); if (ret) /* @@ -257,18 +264,19 @@ out: * fatal_signal_pending()s, and the mmap_sem must be released before * returning it. */ -int handle_userfault(struct fault_env *fe, unsigned long reason) +int handle_userfault(struct vm_fault *vmf, unsigned long reason) { - struct mm_struct *mm = fe->vma->vm_mm; + struct mm_struct *mm = vmf->vma->vm_mm; struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue uwq; int ret; bool must_wait, return_to_userland; + long blocking_state; BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); ret = VM_FAULT_SIGBUS; - ctx = fe->vma->vm_userfaultfd_ctx.ctx; + ctx = vmf->vma->vm_userfaultfd_ctx.ctx; if (!ctx) goto out; @@ -301,17 +309,18 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) * without first stopping userland access to the memory. For * VM_UFFD_MISSING userfaults this is enough for now. 
*/ - if (unlikely(!(fe->flags & FAULT_FLAG_ALLOW_RETRY))) { + if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) { /* * Validate the invariant that nowait must allow retry * to be sure not to return SIGBUS erroneously on * nowait invocations. */ - BUG_ON(fe->flags & FAULT_FLAG_RETRY_NOWAIT); + BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT); #ifdef CONFIG_DEBUG_VM if (printk_ratelimit()) { printk(KERN_WARNING - "FAULT_FLAG_ALLOW_RETRY missing %x\n", fe->flags); + "FAULT_FLAG_ALLOW_RETRY missing %x\n", + vmf->flags); dump_stack(); } #endif @@ -323,7 +332,7 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) * and wait. */ ret = VM_FAULT_RETRY; - if (fe->flags & FAULT_FLAG_RETRY_NOWAIT) + if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) goto out; /* take the reference before dropping the mmap_sem */ @@ -331,12 +340,15 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; - uwq.msg = userfault_msg(fe->address, fe->flags, reason); + uwq.msg = userfault_msg(vmf->address, vmf->flags, reason); uwq.ctx = ctx; + uwq.waken = false; return_to_userland = - (fe->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == + (vmf->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); + blocking_state = return_to_userland ? TASK_INTERRUPTIBLE : + TASK_KILLABLE; spin_lock(&ctx->fault_pending_wqh.lock); /* @@ -349,11 +361,11 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) * following the spin_unlock to happen before the list_add in * __add_wait_queue. */ - set_current_state(return_to_userland ? TASK_INTERRUPTIBLE : - TASK_KILLABLE); + set_current_state(blocking_state); spin_unlock(&ctx->fault_pending_wqh.lock); - must_wait = userfaultfd_must_wait(ctx, fe->address, fe->flags, reason); + must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, + reason); up_read(&mm->mmap_sem); if (likely(must_wait && !ACCESS_ONCE(ctx->released) && @@ -362,6 +374,29 @@ wake_up_poll(&ctx->fd_wqh, POLLIN); schedule(); ret |= VM_FAULT_MAJOR; + + /* + * False wakeups can originate even from rwsem before + * up_read(); however, userfaults will wait either for a + * targeted wakeup on the specific uwq waitqueue from + * wake_userfault() or for signals or for uffd + * release. + */ + while (!READ_ONCE(uwq.waken)) { + /* + * This needs the full smp_store_mb() + * guarantee as the state write must be + * visible to other CPUs before reading + * uwq.waken from other CPUs. + */ + set_current_state(blocking_state); + if (READ_ONCE(uwq.waken) || + READ_ONCE(ctx->released) || + (return_to_userland ? 
signal_pending(current) : + fatal_signal_pending(current))) + break; + schedule(); + } } __set_current_state(TASK_RUNNING); diff --git a/fs/utimes.c b/fs/utimes.c index 22307cdf7014..32b15b3f6629 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -8,7 +8,7 @@ #include <linux/stat.h> #include <linux/utime.h> #include <linux/syscalls.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/unistd.h> #ifdef __ARCH_WANT_SYS_UTIME @@ -48,7 +48,7 @@ static bool nsec_valid(long nsec) return nsec >= 0 && nsec <= 999999999; } -static int utimes_common(struct path *path, struct timespec *times) +static int utimes_common(const struct path *path, struct timespec *times) { int error; struct iattr newattrs; diff --git a/fs/xattr.c b/fs/xattr.c index 2d13b4e62fae..7e3317cf4045 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -22,7 +22,7 @@ #include <linux/vmalloc.h> #include <linux/posix_acl_xattr.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> static const char * strcmp_prefix(const char *a, const char *a_prefix) diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index e5ebc3770460..33db69be4832 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -39,6 +39,7 @@ #include "xfs_rmap_btree.h" #include "xfs_btree.h" #include "xfs_refcount_btree.h" +#include "xfs_ialloc_btree.h" /* * Per-AG Block Reservations @@ -200,22 +201,30 @@ __xfs_ag_resv_init( struct xfs_mount *mp = pag->pag_mount; struct xfs_ag_resv *resv; int error; + xfs_extlen_t reserved; - resv = xfs_perag_resv(pag, type); if (used > ask) ask = used; - resv->ar_asked = ask; - resv->ar_reserved = resv->ar_orig_reserved = ask - used; - mp->m_ag_max_usable -= ask; + reserved = ask - used; - trace_xfs_ag_resv_init(pag, type, ask); - - error = xfs_mod_fdblocks(mp, -(int64_t)resv->ar_reserved, true); - if (error) + error = xfs_mod_fdblocks(mp, -(int64_t)reserved, true); + if (error) { trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno, error, _RET_IP_); + xfs_warn(mp, +"Per-AG reservation for AG %u failed. Filesystem may run out of space.", + pag->pag_agno); + return error; + } - return error; + mp->m_ag_max_usable -= ask; + + resv = xfs_perag_resv(pag, type); + resv->ar_asked = ask; + resv->ar_reserved = resv->ar_orig_reserved = reserved; + + trace_xfs_ag_resv_init(pag, type, ask); + return 0; } /* Create a per-AG block reservation. */ @@ -223,6 +232,8 @@ int xfs_ag_resv_init( struct xfs_perag *pag) { + struct xfs_mount *mp = pag->pag_mount; + xfs_agnumber_t agno = pag->pag_agno; xfs_extlen_t ask; xfs_extlen_t used; int error = 0; @@ -231,23 +242,45 @@ xfs_ag_resv_init( if (pag->pag_meta_resv.ar_asked == 0) { ask = used = 0; - error = xfs_refcountbt_calc_reserves(pag->pag_mount, - pag->pag_agno, &ask, &used); + error = xfs_refcountbt_calc_reserves(mp, agno, &ask, &used); if (error) goto out; - error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, - ask, used); + error = xfs_finobt_calc_reserves(mp, agno, &ask, &used); if (error) goto out; + + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, + ask, used); + if (error) { + /* + * Because we didn't have per-AG reservations when the + * finobt feature was added we might not be able to + * reserve all needed blocks. Warn and fall back to the + * old and potentially buggy code in that case, but + * ensure we do have the reservation for the refcountbt. 
+ */ + ask = used = 0; + + mp->m_inotbt_nores = true; + + error = xfs_refcountbt_calc_reserves(mp, agno, &ask, + &used); + if (error) + goto out; + + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, + ask, used); + if (error) + goto out; + } } /* Create the AGFL metadata reservation */ if (pag->pag_agfl_resv.ar_asked == 0) { ask = used = 0; - error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno, - &ask, &used); + error = xfs_rmapbt_calc_reserves(mp, agno, &ask, &used); if (error) goto out; @@ -256,6 +289,16 @@ xfs_ag_resv_init( goto out; } +#ifdef DEBUG + /* need to read in the AGF for the ASSERT below to work */ + error = xfs_alloc_pagf_init(pag->pag_mount, NULL, pag->pag_agno, 0); + if (error) + return error; + + ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + + xfs_perag_resv(pag, XFS_AG_RESV_AGFL)->ar_reserved <= + pag->pagf_freeblks + pag->pagf_flcount); +#endif out: return error; } diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index effb64cf714f..9f06a211e157 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -95,10 +95,7 @@ unsigned int xfs_alloc_set_aside( struct xfs_mount *mp) { - unsigned int blocks; - - blocks = 4 + (mp->m_sb.sb_agcount * XFS_ALLOC_AGFL_RESERVE); - return blocks; + return mp->m_sb.sb_agcount * (XFS_ALLOC_AGFL_RESERVE + 4); } /* @@ -365,36 +362,12 @@ xfs_alloc_fix_len( return; ASSERT(rlen >= args->minlen && rlen <= args->maxlen); ASSERT(rlen % args->prod == args->mod); + ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >= + rlen + args->minleft); args->len = rlen; } /* - * Fix up length if there is too little space left in the a.g. - * Return 1 if ok, 0 if too little, should give up. - */ -STATIC int -xfs_alloc_fix_minleft( - xfs_alloc_arg_t *args) /* allocation argument structure */ -{ - xfs_agf_t *agf; /* a.g. freelist header */ - int diff; /* free space difference */ - - if (args->minleft == 0) - return 1; - agf = XFS_BUF_TO_AGF(args->agbp); - diff = be32_to_cpu(agf->agf_freeblks) - - args->len - args->minleft; - if (diff >= 0) - return 1; - args->len += diff; /* shrink the allocated space */ - /* casts to (int) catch length underflows */ - if ((int)args->len >= (int)args->minlen) - return 1; - args->agbno = NULLAGBLOCK; - return 0; -} - -/* * Update the two btrees, logically removing from freespace the extent * starting at rbno, rlen blocks. The extent is contained within the * actual (current) free extent fbno for flen blocks. @@ -689,8 +662,6 @@ xfs_alloc_ag_vextent( xfs_alloc_arg_t *args) /* argument structure for allocation */ { int error=0; - xfs_extlen_t reservation; - xfs_extlen_t oldmax; ASSERT(args->minlen > 0); ASSERT(args->maxlen > 0); @@ -699,20 +670,6 @@ xfs_alloc_ag_vextent( ASSERT(args->alignment > 0); /* - * Clamp maxlen to the amount of free space minus any reservations - * that have been made. - */ - oldmax = args->maxlen; - reservation = xfs_ag_resv_needed(args->pag, args->resv); - if (args->maxlen > args->pag->pagf_freeblks - reservation) - args->maxlen = args->pag->pagf_freeblks - reservation; - if (args->maxlen == 0) { - args->agbno = NULLAGBLOCK; - args->maxlen = oldmax; - return 0; - } - - /* * Branch to correct routine based on the type. 
*/ args->wasfromfl = 0; @@ -731,8 +688,6 @@ xfs_alloc_ag_vextent( /* NOTREACHED */ } - args->maxlen = oldmax; - if (error || args->agbno == NULLAGBLOCK) return error; @@ -841,9 +796,6 @@ xfs_alloc_ag_vextent_exact( args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen) - args->agbno; xfs_alloc_fix_len(args); - if (!xfs_alloc_fix_minleft(args)) - goto not_found; - ASSERT(args->agbno + args->len <= tend); /* @@ -1149,12 +1101,7 @@ restart: XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); args->len = blen; - if (!xfs_alloc_fix_minleft(args)) { - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - trace_xfs_alloc_near_nominleft(args); - return 0; - } - blen = args->len; + /* * We are allocating starting at bnew for blen blocks. */ @@ -1346,12 +1293,6 @@ restart: */ args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); - if (!xfs_alloc_fix_minleft(args)) { - trace_xfs_alloc_near_nominleft(args); - xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - return 0; - } rlen = args->len; (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, args->datatype, ltbnoa, ltlena, &ltnew); @@ -1553,8 +1494,6 @@ restart: } xfs_alloc_fix_len(args); - if (!xfs_alloc_fix_minleft(args)) - goto out_nominleft; rlen = args->len; XFS_WANT_CORRUPTED_GOTO(args->mp, rlen <= flen, error0); /* @@ -2056,7 +1995,7 @@ xfs_alloc_space_available( int flags) { struct xfs_perag *pag = args->pag; - xfs_extlen_t longest; + xfs_extlen_t alloc_len, longest; xfs_extlen_t reservation; /* blocks that are still reserved */ int available; @@ -2066,17 +2005,28 @@ xfs_alloc_space_available( reservation = xfs_ag_resv_needed(pag, args->resv); /* do we have enough contiguous free space for the allocation? */ + alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop; longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free, reservation); - if ((args->minlen + args->alignment + args->minalignslop - 1) > longest) + if (longest < alloc_len) return false; /* do we have enough free space remaining for the allocation? */ available = (int)(pag->pagf_freeblks + pag->pagf_flcount - - reservation - min_free - args->total); - if (available < (int)args->minleft || available <= 0) + reservation - min_free - args->minleft); + if (available < (int)max(args->total, alloc_len)) return false; + /* + * Clamp maxlen to the amount of free space available for the actual + * extent allocation. 
+ */ + if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) { + args->maxlen = available; + ASSERT(args->maxlen > 0); + ASSERT(args->maxlen >= args->minlen); + } + return true; } @@ -2122,7 +2072,8 @@ xfs_alloc_fix_freelist( } need = xfs_alloc_min_freelist(mp, pag); - if (!xfs_alloc_space_available(args, need, flags)) + if (!xfs_alloc_space_available(args, need, flags | + XFS_ALLOC_FLAG_CHECK)) goto out_agbp_relse; /* @@ -2455,12 +2406,15 @@ xfs_agf_verify( be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp))) return false; - if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS || + if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS || be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS) return false; if (xfs_sb_version_hasrmapbt(&mp->m_sb) && - be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS) + (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS)) return false; /* @@ -2477,7 +2431,8 @@ xfs_agf_verify( return false; if (xfs_sb_version_hasreflink(&mp->m_sb) && - be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS) + (be32_to_cpu(agf->agf_refcount_level) < 1 || + be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS)) return false; return true; @@ -2634,12 +2589,10 @@ xfs_alloc_vextent( xfs_agblock_t agsize; /* allocation group size */ int error; int flags; /* XFS_ALLOC_FLAG_... locking flags */ - xfs_extlen_t minleft;/* minimum left value, temp copy */ xfs_mount_t *mp; /* mount structure pointer */ xfs_agnumber_t sagno; /* starting allocation group number */ xfs_alloctype_t type; /* input allocation type */ int bump_rotor = 0; - int no_min = 0; xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */ mp = args->mp; @@ -2668,7 +2621,6 @@ xfs_alloc_vextent( trace_xfs_alloc_vextent_badargs(args); return 0; } - minleft = args->minleft; switch (type) { case XFS_ALLOCTYPE_THIS_AG: @@ -2679,9 +2631,7 @@ xfs_alloc_vextent( */ args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); args->pag = xfs_perag_get(mp, args->agno); - args->minleft = 0; error = xfs_alloc_fix_freelist(args, 0); - args->minleft = minleft; if (error) { trace_xfs_alloc_vextent_nofix(args); goto error0; @@ -2746,9 +2696,7 @@ xfs_alloc_vextent( */ for (;;) { args->pag = xfs_perag_get(mp, args->agno); - if (no_min) args->minleft = 0; error = xfs_alloc_fix_freelist(args, flags); - args->minleft = minleft; if (error) { trace_xfs_alloc_vextent_nofix(args); goto error0; @@ -2788,20 +2736,17 @@ xfs_alloc_vextent( * or switch to non-trylock mode. 
*/ if (args->agno == sagno) { - if (no_min == 1) { + if (flags == 0) { args->agbno = NULLAGBLOCK; trace_xfs_alloc_vextent_allfailed(args); break; } - if (flags == 0) { - no_min = 1; - } else { - flags = 0; - if (type == XFS_ALLOCTYPE_START_BNO) { - args->agbno = XFS_FSB_TO_AGBNO(mp, - args->fsbno); - args->type = XFS_ALLOCTYPE_NEAR_BNO; - } + + flags = 0; + if (type == XFS_ALLOCTYPE_START_BNO) { + args->agbno = XFS_FSB_TO_AGBNO(mp, + args->fsbno); + args->type = XFS_ALLOCTYPE_NEAR_BNO; } } xfs_perag_put(args->pag); diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 7c404a6b0ae3..1d0f48a501a3 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -56,7 +56,7 @@ typedef unsigned int xfs_alloctype_t; #define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/ #define XFS_ALLOC_FLAG_NORMAP 0x00000004 /* don't modify the rmapbt */ #define XFS_ALLOC_FLAG_NOSHRINK 0x00000008 /* don't shrink the freelist */ - +#define XFS_ALLOC_FLAG_CHECK 0x00000010 /* test only, don't modify args */ /* * Argument structure for xfs_alloc routines. diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 5ba2dac5e67c..efb467b10a71 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -421,13 +421,17 @@ xfs_allocbt_init_cursor( ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT); - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); cur->bc_tp = tp; cur->bc_mp = mp; cur->bc_btnum = btnum; cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_ops = &xfs_allocbt_ops; + if (btnum == XFS_BTNUM_BNO) + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); + else + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2); if (btnum == XFS_BTNUM_CNT) { cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index af1ecb19121e..6622d46ddec3 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -131,9 +131,6 @@ xfs_attr_get( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - if (!xfs_inode_hasattr(ip)) - return -ENOATTR; - error = xfs_attr_args_init(&args, ip, name, flags); if (error) return error; @@ -392,9 +389,6 @@ xfs_attr_remove( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; - if (!xfs_inode_hasattr(dp)) - return -ENOATTR; - error = xfs_attr_args_init(&args, dp, name, flags); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 8ea91f363093..2852521fc8ec 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -253,6 +253,7 @@ xfs_attr3_leaf_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_attr_leafblock *leaf = bp->b_addr; + struct xfs_perag *pag = bp->b_pag; struct xfs_attr3_icleaf_hdr ichdr; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); @@ -273,7 +274,12 @@ xfs_attr3_leaf_verify( if (ichdr.magic != XFS_ATTR_LEAF_MAGIC) return false; } - if (ichdr.count == 0) + /* + * In recovery there is a transient state where count == 0 is valid + * because we may have transitioned an empty shortform attr to a leaf + * if the attr didn't fit in shortform. 
+ */ + if (pag && pag->pagf_init && ichdr.count == 0) return false; /* XXX: need to range check rest of attr header values */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 4f2aed04f827..f7dda0c237b0 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -51,7 +51,7 @@ int xfs_attr_shortform_getvalue(struct xfs_da_args *args); int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); int xfs_attr_shortform_remove(struct xfs_da_args *args); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); -int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); +int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes); void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp); /* @@ -77,7 +77,7 @@ int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); -int xfs_attr3_leaf_list_int(struct xfs_buf *bp, +void xfs_attr3_leaf_list_int(struct xfs_buf *bp, struct xfs_attr_list_context *context); /* diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index c6eb21940783..bfc00de5c6f1 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -49,6 +49,8 @@ #include "xfs_rmap.h" #include "xfs_ag_resv.h" #include "xfs_refcount.h" +#include "xfs_rmap_btree.h" +#include "xfs_icache.h" kmem_zone_t *xfs_bmap_free_item_zone; @@ -190,8 +192,12 @@ xfs_bmap_worst_indlen( int maxrecs; /* maximum record count at this level */ xfs_mount_t *mp; /* mount structure */ xfs_filblks_t rval; /* return value */ + xfs_filblks_t orig_len; mp = ip->i_mount; + + /* Calculate the worst-case size of the bmbt. */ + orig_len = len; maxrecs = mp->m_bmap_dmxr[0]; for (level = 0, rval = 0; level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); @@ -199,12 +205,20 @@ xfs_bmap_worst_indlen( len += maxrecs - 1; do_div(len, maxrecs); rval += len; - if (len == 1) - return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - + if (len == 1) { + rval += XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - level - 1; + break; + } if (level == 0) maxrecs = mp->m_bmap_dmxr[1]; } + + /* Calculate the worst-case size of the rmapbt. 
*/ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + rval += 1 + xfs_rmapbt_calc_size(mp, orig_len) + + mp->m_rmap_maxlevels; + return rval; } @@ -504,7 +518,7 @@ void xfs_bmap_trace_exlist( xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t cnt, /* count of entries in the list */ - int whichfork, /* data or attr fork */ + int whichfork, /* data or attr or cow fork */ unsigned long caller_ip) { xfs_extnum_t idx; /* extent record index */ @@ -513,11 +527,13 @@ xfs_bmap_trace_exlist( if (whichfork == XFS_ATTR_FORK) state |= BMAP_ATTRFORK; + else if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT(cnt == xfs_iext_count(ifp)); for (idx = 0; idx < cnt; idx++) - trace_xfs_extlist(ip, idx, whichfork, caller_ip); + trace_xfs_extlist(ip, idx, state, caller_ip); } /* @@ -811,7 +827,7 @@ try_another_ag: XFS_BTREE_LONG_PTRS); arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); for (cnt = i = 0; i < nextents; i++) { ep = xfs_iext_get_ext(ifp, i); if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) { @@ -1137,6 +1153,10 @@ xfs_bmap_add_attrfork( goto trans_cancel; if (XFS_IFORK_Q(ip)) goto trans_cancel; + if (ip->i_d.di_anextents != 0) { + error = -EFSCORRUPTED; + goto trans_cancel; + } if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { /* * For inodes coming from pre-6.2 filesystems. @@ -1144,7 +1164,6 @@ xfs_bmap_add_attrfork( ASSERT(ip->i_d.di_aformat == 0); ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; } - ASSERT(ip->i_d.di_anextents == 0); xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -1296,7 +1315,7 @@ xfs_bmap_read_extents( /* * Here with bp and block set to the leftmost leaf node in the tree. */ - room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + room = xfs_iext_count(ifp); i = 0; /* * Loop over all leaf nodes. Copy information to the extent records. @@ -1361,8 +1380,9 @@ xfs_bmap_read_extents( return error; block = XFS_BUF_TO_BLOCK(bp); } - ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); - ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork)); + if (i != XFS_IFORK_NEXTENTS(ip, whichfork)) + return -EFSCORRUPTED; + ASSERT(i == xfs_iext_count(ifp)); XFS_BMAP_TRACE_EXLIST(ip, i, whichfork); return 0; error0: @@ -1370,97 +1390,6 @@ error0: return -EFSCORRUPTED; } - -/* - * Search the extent records for the entry containing block bno. - * If bno lies in a hole, point to the next entry. If bno lies - * past eof, *eofp will be set, and *prevp will contain the last - * entry (null if none). Else, *lastxp will be set to the index - * of the found entry; *gotp will contain the entry. - */ -STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ -xfs_bmap_search_multi_extents( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fileoff_t bno, /* block number searched for */ - int *eofp, /* out: end of file found */ - xfs_extnum_t *lastxp, /* out: last extent index */ - xfs_bmbt_irec_t *gotp, /* out: extent entry found */ - xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ -{ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - xfs_extnum_t lastx; /* last extent index */ - - /* - * Initialize the extent entry structure to catch access to - * uninitialized br_startblock field. 
- */ - gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL; - gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL; - gotp->br_state = XFS_EXT_INVALID; - gotp->br_startblock = 0xffffa5a5a5a5a5a5LL; - prevp->br_startoff = NULLFILEOFF; - - ep = xfs_iext_bno_to_ext(ifp, bno, &lastx); - if (lastx > 0) { - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp); - } - if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { - xfs_bmbt_get_all(ep, gotp); - *eofp = 0; - } else { - if (lastx > 0) { - *gotp = *prevp; - } - *eofp = 1; - ep = NULL; - } - *lastxp = lastx; - return ep; -} - -/* - * Search the extents list for the inode, for the extent containing bno. - * If bno lies in a hole, point to the next entry. If bno lies past eof, - * *eofp will be set, and *prevp will contain the last entry (null if none). - * Else, *lastxp will be set to the index of the found - * entry; *gotp will contain the entry. - */ -xfs_bmbt_rec_host_t * /* pointer to found extent entry */ -xfs_bmap_search_extents( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t bno, /* block number searched for */ - int fork, /* data or attr fork */ - int *eofp, /* out: end of file found */ - xfs_extnum_t *lastxp, /* out: last extent index */ - xfs_bmbt_irec_t *gotp, /* out: extent entry found */ - xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ -{ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - - XFS_STATS_INC(ip->i_mount, xs_look_exlist); - ifp = XFS_IFORK_PTR(ip, fork); - - ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp); - - if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) && - !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) { - xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, - "Access to block zero in inode %llu " - "start_block: %llx start_off: %llx " - "blkcnt: %llx extent-state: %x lastx: %x", - (unsigned long long)ip->i_ino, - (unsigned long long)gotp->br_startblock, - (unsigned long long)gotp->br_startoff, - (unsigned long long)gotp->br_blockcount, - gotp->br_state, *lastxp); - *lastxp = NULLEXTNUM; - *eofp = 1; - return NULL; - } - return ep; -} - /* * Returns the file-relative block number of the first unused block(s) * in the file with at least "len" logically contiguous blocks free. 
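The two search helpers removed above are superseded in the rest of this patch by the incore extent list API. A minimal sketch of the replacement idiom, assuming a fork ifp whose extents are already read in; the local names are illustrative, but both calls appear verbatim in the converted callers below:

	struct xfs_bmbt_irec	got;	/* decoded extent record */
	xfs_extnum_t		idx;	/* index into the incore extent list */
	bool			eof = false;

	/* find the extent containing bno, or the next one after it */
	if (!xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got))
		eof = true;	/* bno is past the last extent */

	/* step to the following record; false means we ran off the end */
	if (!xfs_iext_get_extent(ifp, ++idx, &got))
		eof = true;

Note that callers now operate on a decoded struct xfs_bmbt_irec copy rather than on a raw xfs_bmbt_rec_host_t pointer, which is why the removed functions' return value has no equivalent in the converted code.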
@@ -1497,7 +1426,7 @@ xfs_bmap_first_unused( (error = xfs_iread_extents(tp, ip, whichfork))) return error; lowest = *first_unused; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) { xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); off = xfs_bmbt_get_startoff(ep); @@ -1523,44 +1452,44 @@ xfs_bmap_first_unused( */ int /* error */ xfs_bmap_last_before( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_fileoff_t *last_block, /* last block */ - int whichfork) /* data or attr fork */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t *last_block, /* last block */ + int whichfork) /* data or attr fork */ { - xfs_fileoff_t bno; /* input file offset */ - int eof; /* hit end of file */ - xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ - int error; /* error return value */ - xfs_bmbt_irec_t got; /* current extent value */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t lastx; /* last extent used */ - xfs_bmbt_irec_t prev; /* previous extent value */ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_bmbt_irec got; + xfs_extnum_t idx; + int error; - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) - return -EIO; - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + switch (XFS_IFORK_FORMAT(ip, whichfork)) { + case XFS_DINODE_FMT_LOCAL: *last_block = 0; return 0; + case XFS_DINODE_FMT_BTREE: + case XFS_DINODE_FMT_EXTENTS: + break; + default: + return -EIO; } - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - return error; - bno = *last_block - 1; - ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, - &prev); - if (eof || xfs_bmbt_get_startoff(ep) > bno) { - if (prev.br_startoff == NULLFILEOFF) - *last_block = 0; - else - *last_block = prev.br_startoff + prev.br_blockcount; + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; } - /* - * Otherwise *last_block is already the right answer. - */ + + if (xfs_iext_lookup_extent(ip, ifp, *last_block - 1, &idx, &got)) { + if (got.br_startoff <= *last_block - 1) + return 0; + } + + if (xfs_iext_get_extent(ifp, idx - 1, &got)) { + *last_block = got.br_startoff + got.br_blockcount; + return 0; + } + + *last_block = 0; return 0; } @@ -1582,7 +1511,7 @@ xfs_bmap_last_extent( return error; } - nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); if (nextents == 0) { *is_empty = 1; return 0; @@ -1735,7 +1664,7 @@ xfs_bmap_add_extent_delay_real( &bma->ip->i_d.di_nextents); ASSERT(bma->idx >= 0); - ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(bma->idx <= xfs_iext_count(ifp)); ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!bma->cur || (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); @@ -1794,7 +1723,7 @@ xfs_bmap_add_extent_delay_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. 
*/ - if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + if (bma->idx < xfs_iext_count(ifp) - 1) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT); @@ -2300,7 +2229,7 @@ xfs_bmap_add_extent_unwritten_real( ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); ASSERT(*idx >= 0); - ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(*idx <= xfs_iext_count(ifp)); ASSERT(!isnullstartblock(new->br_startblock)); XFS_STATS_INC(mp, xs_add_exlist); @@ -2356,7 +2285,7 @@ xfs_bmap_add_extent_unwritten_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + if (*idx < xfs_iext_count(&ip->i_df) - 1) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); if (isnullstartblock(RIGHT.br_startblock)) @@ -2836,7 +2765,7 @@ xfs_bmap_add_extent_hole_delay( * Check and set flags if the current (right) segment exists. * If it doesn't exist, we're converting the hole at end-of-file. */ - if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + if (*idx < xfs_iext_count(ifp)) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); @@ -2966,7 +2895,7 @@ xfs_bmap_add_extent_hole_real( ifp = XFS_IFORK_PTR(bma->ip, whichfork); ASSERT(bma->idx >= 0); - ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(bma->idx <= xfs_iext_count(ifp)); ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!bma->cur || !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); @@ -2992,7 +2921,7 @@ xfs_bmap_add_extent_hole_real( * Check and set flags if this segment has a current value. * Not true if we're inserting into the "hole" at eof. */ - if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + if (bma->idx < xfs_iext_count(ifp)) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right); if (isnullstartblock(right.br_startblock)) @@ -3700,7 +3629,7 @@ xfs_bmap_btalloc( align = xfs_get_cowextsz_hint(ap->ip); else if (xfs_alloc_is_userdata(ap->datatype)) align = xfs_get_extsz_hint(ap->ip); - if (unlikely(align)) { + if (align) { error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0, ap->eof, 0, ap->conv, &ap->offset, &ap->length); @@ -3772,7 +3701,7 @@ xfs_bmap_btalloc( args.minlen = ap->minlen; } /* apply extent size hints if obtained earlier */ - if (unlikely(align)) { + if (align) { args.prod = align; if ((args.mod = (xfs_extlen_t)do_mod(ap->offset, args.prod))) args.mod = (xfs_extlen_t)(args.prod - args.mod); @@ -3883,7 +3812,6 @@ xfs_bmap_btalloc( args.fsbno = 0; args.type = XFS_ALLOCTYPE_FIRST_AG; args.total = ap->minlen; - args.minleft = 0; if ((error = xfs_alloc_vextent(&args))) return error; ap->dfops->dop_low = true; @@ -4145,12 +4073,11 @@ xfs_bmapi_read( struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; struct xfs_bmbt_irec got; - struct xfs_bmbt_irec prev; xfs_fileoff_t obno; xfs_fileoff_t end; - xfs_extnum_t lastx; + xfs_extnum_t idx; int error; - int eof; + bool eof = false; int n = 0; int whichfork = xfs_bmapi_whichfork(flags); @@ -4190,7 +4117,8 @@ xfs_bmapi_read( return error; } - xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev); + if (!xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got)) + eof = true; end = bno + len; obno = bno; @@ -4221,10 +4149,8 @@ xfs_bmapi_read( break; /* Else go on to the next record. 
*/ - if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); - else - eof = 1; + if (!xfs_iext_get_extent(ifp, ++idx, &got)) + eof = true; } *nmap = n; return 0; @@ -4234,10 +4160,10 @@ int xfs_bmapi_reserve_delalloc( struct xfs_inode *ip, int whichfork, - xfs_fileoff_t aoff, + xfs_fileoff_t off, xfs_filblks_t len, + xfs_filblks_t prealloc, struct xfs_bmbt_irec *got, - struct xfs_bmbt_irec *prev, xfs_extnum_t *lastx, int eof) { @@ -4248,10 +4174,17 @@ xfs_bmapi_reserve_delalloc( char rt = XFS_IS_REALTIME_INODE(ip); xfs_extlen_t extsz; int error; + xfs_fileoff_t aoff = off; - alen = XFS_FILBLKS_MIN(len, MAXEXTLEN); + /* + * Cap the alloc length. Keep track of prealloc so we know whether to + * tag the inode before we return. + */ + alen = XFS_FILBLKS_MIN(len + prealloc, MAXEXTLEN); if (!eof) alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); + if (prealloc && alen >= len) + prealloc = alen - len; /* Figure out the extent size, adjust alen */ if (whichfork == XFS_COW_FORK) @@ -4259,7 +4192,12 @@ xfs_bmapi_reserve_delalloc( else extsz = xfs_get_extsz_hint(ip); if (extsz) { - error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof, + struct xfs_bmbt_irec prev; + + if (!xfs_iext_get_extent(ifp, *lastx - 1, &prev)) + prev.br_startoff = NULLFILEOFF; + + error = xfs_bmap_extsize_align(mp, got, &prev, extsz, rt, eof, 1, 0, &aoff, &alen); ASSERT(!error); } @@ -4312,6 +4250,16 @@ xfs_bmapi_reserve_delalloc( */ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got); + /* + * Tag the inode if blocks were preallocated. Note that COW fork + * preallocation can occur at the start or end of the extent, even when + * prealloc == 0, so we must also check the aligned offset and length. + */ + if (whichfork == XFS_DATA_FORK && prealloc) + xfs_inode_set_eofblocks_tag(ip); + if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len)) + xfs_inode_set_cowblocks_tag(ip); + ASSERT(got->br_startoff <= aoff); ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen); ASSERT(isnullstartblock(got->br_startblock)); @@ -4349,7 +4297,7 @@ xfs_bmapi_allocate( if (bma->wasdel) { bma->length = (xfs_extlen_t)bma->got.br_blockcount; bma->offset = bma->got.br_startoff; - if (bma->idx != NULLEXTNUM && bma->idx) { + if (bma->idx) { xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &bma->prev); } @@ -4395,8 +4343,6 @@ xfs_bmapi_allocate( if (error) return error; - if (bma->dfops->dop_low) - bma->minleft = 0; if (bma->cur) bma->cur->bc_private.b.firstblock = *bma->firstblock; if (bma->blkno == NULLFSBLOCK) @@ -4563,13 +4509,11 @@ xfs_bmapi_write( struct xfs_ifork *ifp; struct xfs_bmalloca bma = { NULL }; /* args for xfs_bmap_alloc */ xfs_fileoff_t end; /* end of mapped file region */ - int eof; /* after the end of extents */ + bool eof = false; /* after the end of extents */ int error; /* error return */ int n; /* current extent index */ xfs_fileoff_t obno; /* old block number (offset) */ int whichfork; /* data or attr fork */ - char inhole; /* current location is hole in file */ - char wasdelay; /* old extent was delayed */ #ifdef DEBUG xfs_fileoff_t orig_bno; /* original block number value */ @@ -4641,12 +4585,14 @@ xfs_bmapi_write( goto error0; } - xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got, - &bma.prev); n = 0; end = bno + len; obno = bno; + if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.idx, &bma.got)) + eof = true; + if (!xfs_iext_get_extent(ifp, bma.idx - 1, &bma.prev)) + bma.prev.br_startoff = NULLFILEOFF; bma.tp = tp; 
bma.ip = ip; bma.total = total; bma.datatype = 0; bma.dfops = dfops; bma.firstblock = firstblock; while (bno < end && n < *nmap) { - inhole = eof || bma.got.br_startoff > bno; - wasdelay = !inhole && isnullstartblock(bma.got.br_startblock); + bool need_alloc = false, wasdelay = false; - /* - * Make sure we only reflink into a hole. - */ - if (flags & XFS_BMAPI_REMAP) - ASSERT(inhole); - if (flags & XFS_BMAPI_COWFORK) - ASSERT(!inhole); + /* in hole or beyond EOF? */ + if (eof || bma.got.br_startoff > bno) { + if (flags & XFS_BMAPI_DELALLOC) { + /* + * For the COW fork we can reasonably get a + * request for converting an extent that races + * with other threads already having converted + * part of it, since converting COW to + * regular blocks is not protected by the + * IOLOCK. + */ + ASSERT(flags & XFS_BMAPI_COWFORK); + if (!(flags & XFS_BMAPI_COWFORK)) { + error = -EIO; + goto error0; + } + + if (eof || bno >= end) + break; + } else { + need_alloc = true; + } + } else { + /* + * Make sure we only reflink into a hole. + */ + ASSERT(!(flags & XFS_BMAPI_REMAP)); + if (isnullstartblock(bma.got.br_startblock)) + wasdelay = true; + } /* * First, deal with the hole before the allocated space * that we found, if any. */ - if (inhole || wasdelay) { + if (need_alloc || wasdelay) { bma.eof = eof; bma.conv = !!(flags & XFS_BMAPI_CONVERT); bma.wasdel = wasdelay; @@ -4733,11 +4701,8 @@ xfs_bmapi_write( /* Else go on to the next record. */ bma.prev = bma.got; - if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) { - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx), - &bma.got); - } else - eof = 1; + if (!xfs_iext_get_extent(ifp, ++bma.idx, &bma.got)) + eof = true; } *nmap = n; @@ -4885,7 +4850,7 @@ xfs_bmap_del_extent_delay( da_new = 0; ASSERT(*idx >= 0); - ASSERT(*idx < ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(*idx <= xfs_iext_count(ifp)); ASSERT(del->br_blockcount > 0); ASSERT(got->br_startoff <= del->br_startoff); ASSERT(got_endoff >= del_endoff); @@ -4902,8 +4867,11 @@ xfs_bmap_del_extent_delay( * sb counters as we might have to borrow some blocks for the * indirect block accounting. */ - xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del->br_blockcount), 0, + error = xfs_trans_reserve_quota_nblks(NULL, ip, + -((long)del->br_blockcount), 0, isrt ? 
XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + if (error) + return error; ip->i_delayed_blks -= del->br_blockcount; if (whichfork == XFS_COW_FORK) @@ -5013,7 +4981,7 @@ xfs_bmap_del_extent_cow( got_endoff = got->br_startoff + got->br_blockcount; ASSERT(*idx >= 0); - ASSERT(*idx < ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(*idx <= xfs_iext_count(ifp)); ASSERT(del->br_blockcount > 0); ASSERT(got->br_startoff <= del->br_startoff); ASSERT(got_endoff >= del_endoff); @@ -5119,8 +5087,7 @@ xfs_bmap_del_extent( state |= BMAP_COWFORK; ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / - (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT((*idx >= 0) && (*idx < xfs_iext_count(ifp))); ASSERT(del->br_blockcount > 0); ep = xfs_iext_get_ext(ifp, *idx); xfs_bmbt_get_all(ep, &got); @@ -5434,8 +5401,6 @@ __xfs_bunmapi( { xfs_btree_cur_t *cur; /* bmap btree cursor */ xfs_bmbt_irec_t del; /* extent being deleted */ - int eof; /* is deleting at eof */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ int error; /* error return value */ xfs_extnum_t extno; /* extent number in list */ xfs_bmbt_irec_t got; /* current extent record */ @@ -5445,8 +5410,6 @@ __xfs_bunmapi( int logflags; /* transaction logging flags */ xfs_extlen_t mod; /* rt extent offset */ xfs_mount_t *mp; /* mount structure */ - xfs_extnum_t nextents; /* number of file extents */ - xfs_bmbt_irec_t prev; /* previous extent record */ xfs_fileoff_t start; /* first file offset deleted */ int tmp_logflags; /* partial logging flags */ int wasdel; /* was a delayed alloc extent */ @@ -5477,8 +5440,7 @@ __xfs_bunmapi( if (!(ifp->if_flags & XFS_IFEXTENTS) && (error = xfs_iread_extents(tp, ip, whichfork))) return error; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (nextents == 0) { + if (xfs_iext_count(ifp) == 0) { *rlen = 0; return 0; } @@ -5486,18 +5448,17 @@ __xfs_bunmapi( isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); start = bno; bno = start + len - 1; - ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, - &prev); /* * Check to see if the given block number is past the end of the * file, back up to the last block if so... */ - if (eof) { - ep = xfs_iext_get_ext(ifp, --lastx); - xfs_bmbt_get_all(ep, &got); + if (!xfs_iext_lookup_extent(ip, ifp, bno, &lastx, &got)) { + ASSERT(lastx > 0); + xfs_iext_get_extent(ifp, --lastx, &got); bno = got.br_startoff + got.br_blockcount - 1; } + logflags = 0; if (ifp->if_flags & XFS_IFBROOT) { ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); @@ -5528,8 +5489,7 @@ __xfs_bunmapi( if (got.br_startoff > bno) { if (--lastx < 0) break; - ep = xfs_iext_get_ext(ifp, lastx); - xfs_bmbt_get_all(ep, &got); + xfs_iext_get_extent(ifp, lastx, &got); } /* * Is the last block of this extent before the range @@ -5543,7 +5503,6 @@ __xfs_bunmapi( * Then deal with the (possibly delayed) allocated space * we found. */ - ASSERT(ep != NULL); del = got; wasdel = isnullstartblock(del.br_startblock); if (got.br_startoff < start) { @@ -5624,15 +5583,12 @@ __xfs_bunmapi( */ ASSERT(bno >= del.br_blockcount); bno -= del.br_blockcount; - if (got.br_startoff > bno) { - if (--lastx >= 0) { - ep = xfs_iext_get_ext(ifp, - lastx); - xfs_bmbt_get_all(ep, &got); - } - } + if (got.br_startoff > bno && --lastx >= 0) + xfs_iext_get_extent(ifp, lastx, &got); continue; } else if (del.br_state == XFS_EXT_UNWRITTEN) { + struct xfs_bmbt_irec prev; + /* * This one is already unwritten. * It must have a written left neighbor. 
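The __xfs_bunmapi hunks above walk the file backwards with the same accessor. A minimal sketch of the backwards step as it appears in the converted loop (the context variables are illustrative):

	/* back up one record when the current extent starts past bno */
	if (got.br_startoff > bno && --lastx >= 0)
		xfs_iext_get_extent(ifp, lastx, &got);

xfs_iext_get_extent() is expected to return false for an out-of-range index, which is what allows the old open-coded bounds checks around xfs_iext_get_ext() to be dropped here.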
@@ -5640,8 +5596,7 @@ __xfs_bunmapi( * try again. */ ASSERT(lastx > 0); - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, - lastx - 1), &prev); + xfs_iext_get_extent(ifp, lastx - 1, &prev); ASSERT(prev.br_state == XFS_EXT_NORM); ASSERT(!isnullstartblock(prev.br_startblock)); ASSERT(del.br_startblock == @@ -5739,13 +5694,9 @@ nodelete: */ if (bno != (xfs_fileoff_t)-1 && bno >= start) { if (lastx >= 0) { - ep = xfs_iext_get_ext(ifp, lastx); - if (xfs_bmbt_get_startoff(ep) > bno) { - if (--lastx >= 0) - ep = xfs_iext_get_ext(ifp, - lastx); - } - xfs_bmbt_get_all(ep, &got); + xfs_iext_get_extent(ifp, lastx, &got); + if (got.br_startoff > bno && --lastx >= 0) + xfs_iext_get_extent(ifp, lastx, &got); } extno++; } @@ -5963,7 +5914,7 @@ xfs_bmse_shift_one( mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); - total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + total_extents = xfs_iext_count(ifp); xfs_bmbt_get_all(gotp, &got); @@ -6140,7 +6091,7 @@ xfs_bmap_shift_extents( * are collapsing out, so we cannot use the count of real extents here. * Instead we have to calculate it from the incore fork. */ - total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + total_extents = xfs_iext_count(ifp); if (total_extents == 0) { *done = 1; goto del_cursor; @@ -6200,7 +6151,7 @@ xfs_bmap_shift_extents( * count can change. Update the total and grade the next record. */ if (direction == SHIFT_LEFT) { - total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + total_extents = xfs_iext_count(ifp); stop_extent = total_extents; } diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 7cae6ec27fa6..cdef87db5262 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -110,6 +110,9 @@ struct xfs_extent_free_item /* Map something in the CoW fork. */ #define XFS_BMAPI_COWFORK 0x200 +/* Only convert delalloc space, don't allocate entirely new extents */ +#define XFS_BMAPI_DELALLOC 0x400 + #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ @@ -120,7 +123,8 @@ struct xfs_extent_free_item { XFS_BMAPI_CONVERT, "CONVERT" }, \ { XFS_BMAPI_ZERO, "ZERO" }, \ { XFS_BMAPI_REMAP, "REMAP" }, \ - { XFS_BMAPI_COWFORK, "COWFORK" } + { XFS_BMAPI_COWFORK, "COWFORK" }, \ + { XFS_BMAPI_DELALLOC, "DELALLOC" } static inline int xfs_bmapi_aflag(int w) @@ -237,14 +241,9 @@ int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_defer_ops *dfops, enum shift_direction direction, int num_exts); int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset); -struct xfs_bmbt_rec_host * - xfs_bmap_search_extents(struct xfs_inode *ip, xfs_fileoff_t bno, - int fork, int *eofp, xfs_extnum_t *lastxp, - struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp); int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, - xfs_fileoff_t aoff, xfs_filblks_t len, - struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *prev, - xfs_extnum_t *lastx, int eof); + xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, + struct xfs_bmbt_irec *got, xfs_extnum_t *lastx, int eof); enum xfs_bmap_intent_type { XFS_BMAP_MAP = 1, diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 8007d2ba9aef..d9be241fc86f 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -502,12 +502,11 @@ try_another_ag: if (args.fsbno == NULLFSBLOCK && args.minleft) { /* * Could not find an AG with enough free space to satisfy - * a full btree split. Try again without minleft and if + * a full btree split. 
Try again and if * successful activate the lowspace algorithm. */ args.fsbno = 0; args.type = XFS_ALLOCTYPE_FIRST_AG; - args.minleft = 0; error = xfs_alloc_vextent(&args); if (error) goto error0; @@ -796,13 +795,14 @@ xfs_bmbt_init_cursor( struct xfs_btree_cur *cur; ASSERT(whichfork != XFS_COW_FORK); - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); cur->bc_tp = tp; cur->bc_mp = mp; cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; cur->bc_btnum = XFS_BTNUM_BMAP; cur->bc_blocklog = mp->m_sb.sb_blocklog; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2); cur->bc_ops = &xfs_bmbt_ops; cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 0e80993c8a59..21e6a6ab6b9a 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1769,8 +1769,28 @@ xfs_btree_lookup_get_block( if (error) return error; + /* Check the inode owner since the verifiers don't. */ + if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) && + (cur->bc_flags & XFS_BTREE_LONG_PTRS) && + be64_to_cpu((*blkp)->bb_u.l.bb_owner) != + cur->bc_private.b.ip->i_ino) + goto out_bad; + + /* Did we get the level we were looking for? */ + if (be16_to_cpu((*blkp)->bb_level) != level) + goto out_bad; + + /* Check that internal nodes have at least one record. */ + if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0) + goto out_bad; + xfs_btree_setbuf(cur, level, bp); return 0; + +out_bad: + *blkp = NULL; + xfs_trans_brelse(cur->bc_tp, bp); + return -EFSCORRUPTED; } /* diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index c2b01d1c79ee..b69b947c4c1b 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -96,46 +96,10 @@ union xfs_btree_rec { /* * Generic stats interface */ -#define __XFS_BTREE_STATS_INC(mp, type, stat) \ - XFS_STATS_INC(mp, xs_ ## type ## _2_ ## stat) #define XFS_BTREE_STATS_INC(cur, stat) \ -do { \ - struct xfs_mount *__mp = cur->bc_mp; \ - switch (cur->bc_btnum) { \ - case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(__mp, abtb, stat); break; \ - case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(__mp, abtc, stat); break; \ - case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(__mp, bmbt, stat); break; \ - case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \ - case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \ - case XFS_BTNUM_RMAP: __XFS_BTREE_STATS_INC(__mp, rmap, stat); break; \ - case XFS_BTNUM_REFC: __XFS_BTREE_STATS_INC(__mp, refcbt, stat); break; \ - case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ - } \ -} while (0) - -#define __XFS_BTREE_STATS_ADD(mp, type, stat, val) \ - XFS_STATS_ADD(mp, xs_ ## type ## _2_ ## stat, val) -#define XFS_BTREE_STATS_ADD(cur, stat, val) \ -do { \ - struct xfs_mount *__mp = cur->bc_mp; \ - switch (cur->bc_btnum) { \ - case XFS_BTNUM_BNO: \ - __XFS_BTREE_STATS_ADD(__mp, abtb, stat, val); break; \ - case XFS_BTNUM_CNT: \ - __XFS_BTREE_STATS_ADD(__mp, abtc, stat, val); break; \ - case XFS_BTNUM_BMAP: \ - __XFS_BTREE_STATS_ADD(__mp, bmbt, stat, val); break; \ - case XFS_BTNUM_INO: \ - __XFS_BTREE_STATS_ADD(__mp, ibt, stat, val); break; \ - case XFS_BTNUM_FINO: \ - __XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \ - case XFS_BTNUM_RMAP: \ - __XFS_BTREE_STATS_ADD(__mp, rmap, stat, val); break; \ - case XFS_BTNUM_REFC: \ - __XFS_BTREE_STATS_ADD(__mp, refcbt, stat, val); break; \ - case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ - } \ -} while (0) + 
XFS_STATS_INC_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat) +#define XFS_BTREE_STATS_ADD(cur, stat, val) \ + XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val) #define XFS_BTREE_MAXLEVELS 9 /* max of all btrees */ @@ -253,6 +217,7 @@ typedef struct xfs_btree_cur __uint8_t bc_nlevels; /* number of levels in the tree */ __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ xfs_btnum_t bc_btnum; /* identifies which btree type */ + int bc_statoff; /* offset of btree stats array */ union { struct { /* needed for BNO, CNT, INO */ struct xfs_buf *agbp; /* agf/agi buffer pointer */ diff --git a/fs/xfs/libxfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h index fad1676ad8cd..a416c7cb23ea 100644 --- a/fs/xfs/libxfs/xfs_cksum.h +++ b/fs/xfs/libxfs/xfs_cksum.h @@ -6,10 +6,11 @@ /* * Calculate the intermediate checksum for a buffer that has the CRC field * inside it. The offset of the 32bit crc field is passed as the - * cksum_offset parameter. + * cksum_offset parameter. We do not modify the buffer during verification, + * hence we have to split the CRC calculation across the cksum_offset. */ static inline __uint32_t -xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset) +xfs_start_cksum_safe(char *buffer, size_t length, unsigned long cksum_offset) { __uint32_t zero = 0; __uint32_t crc; @@ -26,6 +27,20 @@ xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset) } /* + * Fast CRC method where the buffer is modified. Callers must have exclusive + * access to the buffer while the calculation takes place. + */ +static inline __uint32_t +xfs_start_cksum_update(char *buffer, size_t length, unsigned long cksum_offset) +{ + /* zero the CRC field */ + *(__le32 *)(buffer + cksum_offset) = 0; + + /* single pass CRC calculation for the entire buffer */ + return crc32c(XFS_CRC_SEED, buffer, length); +} + +/* * Convert the intermediate checksum to the final ondisk format. * * The CRC32c calculation uses LE format even on BE machines, but returns the @@ -40,11 +55,14 @@ xfs_end_cksum(__uint32_t crc) /* * Helper to generate the checksum for a buffer. + * + * This modifies the buffer temporarily - callers must have exclusive + * access to the buffer while the calculation takes place. */ static inline void xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset) { - __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + __uint32_t crc = xfs_start_cksum_update(buffer, length, cksum_offset); *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc); } @@ -55,7 +73,7 @@ xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset) static inline int xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset) { - __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + __uint32_t crc = xfs_start_cksum_safe(buffer, length, cksum_offset); return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc); } diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 20a96dd5af7e..2f389d366e93 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -36,21 +36,29 @@ struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR }; /* - * @mode, if set, indicates that the type field needs to be set up. - * This uses the transformation from file mode to DT_* as defined in linux/fs.h - * for file type specification. This will be propagated into the directory - * structure if appropriate for the given operation and filesystem config. 
+ * Convert inode mode to directory entry filetype */ -const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = { - [0] = XFS_DIR3_FT_UNKNOWN, - [S_IFREG >> S_SHIFT] = XFS_DIR3_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = XFS_DIR3_FT_DIR, - [S_IFCHR >> S_SHIFT] = XFS_DIR3_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = XFS_DIR3_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = XFS_DIR3_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = XFS_DIR3_FT_SOCK, - [S_IFLNK >> S_SHIFT] = XFS_DIR3_FT_SYMLINK, -}; +unsigned char xfs_mode_to_ftype(int mode) +{ + switch (mode & S_IFMT) { + case S_IFREG: + return XFS_DIR3_FT_REG_FILE; + case S_IFDIR: + return XFS_DIR3_FT_DIR; + case S_IFCHR: + return XFS_DIR3_FT_CHRDEV; + case S_IFBLK: + return XFS_DIR3_FT_BLKDEV; + case S_IFIFO: + return XFS_DIR3_FT_FIFO; + case S_IFSOCK: + return XFS_DIR3_FT_SOCK; + case S_IFLNK: + return XFS_DIR3_FT_SYMLINK; + default: + return XFS_DIR3_FT_UNKNOWN; + } +} /* * ASCII case-insensitive (ie. A-Z) support for directories that was @@ -93,7 +101,7 @@ xfs_ascii_ci_compname( return result; } -static struct xfs_nameops xfs_ascii_ci_nameops = { +static const struct xfs_nameops xfs_ascii_ci_nameops = { .hashname = xfs_ascii_ci_hashname, .compname = xfs_ascii_ci_compname, }; @@ -631,7 +639,8 @@ xfs_dir2_isblock( if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) return rval; rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize; - ASSERT(rval == 0 || args->dp->i_d.di_size == args->geo->blksize); + if (rval != 0 && args->dp->i_d.di_size != args->geo->blksize) + return -EFSCORRUPTED; *vp = rval; return 0; } diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index becc926c3e3d..d6e6d9d16f6c 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -18,6 +18,9 @@ #ifndef __XFS_DIR2_H__ #define __XFS_DIR2_H__ +#include "xfs_da_format.h" +#include "xfs_da_btree.h" + struct xfs_defer_ops; struct xfs_da_args; struct xfs_inode; @@ -32,10 +35,9 @@ struct xfs_dir2_data_unused; extern struct xfs_name xfs_name_dotdot; /* - * directory filetype conversion tables. 
+ * Convert inode mode to directory entry filetype */ -#define S_SHIFT 12 -extern const unsigned char xfs_mode_to_ftype[]; +extern unsigned char xfs_mode_to_ftype(int mode); /* * directory operations vector for encode/decode routines @@ -157,6 +159,9 @@ extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r); extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, struct xfs_buf *bp); +extern void xfs_dir2_data_freescan_int(struct xfs_da_geometry *geo, + const struct xfs_dir_ops *ops, + struct xfs_dir2_data_hdr *hdr, int *loghead); extern void xfs_dir2_data_freescan(struct xfs_inode *dp, struct xfs_dir2_data_hdr *hdr, int *loghead); extern void xfs_dir2_data_log_entry(struct xfs_da_args *args, @@ -177,6 +182,8 @@ extern struct xfs_dir2_data_free *xfs_dir2_data_freefind( struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf, struct xfs_dir2_data_unused *dup); +extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); + extern const struct xfs_buf_ops xfs_dir3_block_buf_ops; extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops; extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops; diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 725fc7841fde..d478065b9544 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -329,7 +329,7 @@ xfs_dir3_data_read( err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, XFS_DATA_FORK, &xfs_dir3_data_buf_ops); - if (!err && tp) + if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF); return err; } @@ -505,8 +505,9 @@ xfs_dir2_data_freeremove( * Given a data block, reconstruct its bestfree map. */ void -xfs_dir2_data_freescan( - struct xfs_inode *dp, +xfs_dir2_data_freescan_int( + struct xfs_da_geometry *geo, + const struct xfs_dir_ops *ops, struct xfs_dir2_data_hdr *hdr, int *loghead) { @@ -516,7 +517,6 @@ xfs_dir2_data_freescan( struct xfs_dir2_data_free *bf; char *endp; /* end of block's data */ char *p; /* current entry pointer */ - struct xfs_da_geometry *geo = dp->i_mount->m_dir_geo; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || @@ -526,13 +526,13 @@ xfs_dir2_data_freescan( /* * Start by clearing the table. */ - bf = dp->d_ops->data_bestfree_p(hdr); + bf = ops->data_bestfree_p(hdr); memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT); *loghead = 1; /* * Set up pointers. */ - p = (char *)dp->d_ops->data_entry_p(hdr); + p = (char *)ops->data_entry_p(hdr); if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { btp = xfs_dir2_block_tail_p(geo, hdr); @@ -559,12 +559,22 @@ xfs_dir2_data_freescan( else { dep = (xfs_dir2_data_entry_t *)p; ASSERT((char *)dep - (char *)hdr == - be16_to_cpu(*dp->d_ops->data_entry_tag_p(dep))); - p += dp->d_ops->data_entsize(dep->namelen); + be16_to_cpu(*ops->data_entry_tag_p(dep))); + p += ops->data_entsize(dep->namelen); } } } +void +xfs_dir2_data_freescan( + struct xfs_inode *dp, + struct xfs_dir2_data_hdr *hdr, + int *loghead) +{ + return xfs_dir2_data_freescan_int(dp->i_mount->m_dir_geo, dp->d_ops, + hdr, loghead); +} + /* * Initialize a data block at the given block number in the directory. * Give back the buffer for the created block. 
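The xfs_cksum.h hunk above splits the old xfs_start_cksum() into xfs_start_cksum_safe(), which must leave the buffer untouched for the verify path, and xfs_start_cksum_update(), which zeroes the CRC field in place for writers that already hold the buffer exclusively. Below is a minimal userspace sketch of that split, not the kernel code itself: crc32c() here is a toy stand-in for the real algorithm, and the 64-byte block with its CRC at byte 32 is an invented layout for illustration.

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>

/* toy CRC stand-in; XFS uses a real crc32c implementation */
static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
{
	const unsigned char *p = buf;
	while (len--)
		crc = (crc << 5) ^ (crc >> 27) ^ *p++;
	return crc;
}

/* verify path: CRC around the embedded field without writing to it */
static uint32_t cksum_safe(const char *buf, size_t len, size_t off)
{
	uint32_t zero = 0;
	uint32_t crc;

	crc = crc32c(~0U, buf, off);
	crc = crc32c(crc, (const char *)&zero, sizeof(zero));
	return crc32c(crc, buf + off + sizeof(zero),
		      len - off - sizeof(zero));
}

/* update path: exclusive access, so zero the field and do one pass */
static uint32_t cksum_update(char *buf, size_t len, size_t off)
{
	memset(buf + off, 0, sizeof(uint32_t));
	return crc32c(~0U, buf, len);
}

int main(void)
{
	char block[64] = "example metadata block";
	size_t off = 32;			/* pretend the CRC lives at byte 32 */
	uint32_t c1, c2;

	c1 = cksum_update(block, sizeof(block), off);
	memcpy(block + off, &c1, sizeof(c1));	/* stamp the checksum */

	/* verification must succeed without modifying the buffer */
	c2 = cksum_safe(block, sizeof(block), off);
	printf("%s\n", c1 == c2 ? "match" : "mismatch");
	return 0;
}

The split verify-side calculation exists so concurrent readers can validate a buffer without exclusive access; the single-pass update-side calculation is the cheaper option when the caller already owns the buffer, which is exactly the trade-off the comments in the hunk describe.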
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index ef9f6ead96a4..d04547fcf274 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -21,7 +21,6 @@ struct dir_context; /* xfs_dir2.c */ -extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, xfs_dir2_db_t *dbp); extern int xfs_dir_cilookup_result(struct xfs_da_args *args, diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 51b4e0de1fdc..f272abff11e1 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2344,7 +2344,8 @@ xfs_imap( imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno); imap->im_len = XFS_FSB_TO_BB(mp, 1); - imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog); + imap->im_boffset = (unsigned short)(offset << + mp->m_sb.sb_inodelog); return 0; } @@ -2372,7 +2373,7 @@ out_map: imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno); imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster); - imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog); + imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); /* * If the inode number maps to a block outside the bounds @@ -2450,8 +2451,6 @@ xfs_ialloc_log_agi( ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); #endif - xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF); - /* * Compute byte offsets for the first and last fields in the first * region and log the agi buffer. This only logs up through @@ -2512,8 +2511,15 @@ xfs_agi_verify( if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) return false; - if (be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS) + if (be32_to_cpu(agi->agi_level) < 1 || + be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS) + return false; + + if (xfs_sb_version_hasfinobt(&mp->m_sb) && + (be32_to_cpu(agi->agi_free_level) < 1 || + be32_to_cpu(agi->agi_free_level) > XFS_BTREE_MAXLEVELS)) return false; + /* * during growfs operations, the perag is not fully initialised, * so we can't use it for any useful checking. 
growfs ensures we can't @@ -2592,6 +2598,8 @@ xfs_read_agi( XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops); if (error) return error; + if (tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_AGI_BUF); xfs_buf_set_ref(*bpp, XFS_AGI_REF); return 0; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index eab68ae2e011..7c471881c9a6 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -82,11 +82,12 @@ xfs_finobt_set_root( } STATIC int -xfs_inobt_alloc_block( +__xfs_inobt_alloc_block( struct xfs_btree_cur *cur, union xfs_btree_ptr *start, union xfs_btree_ptr *new, - int *stat) + int *stat, + enum xfs_ag_resv_type resv) { xfs_alloc_arg_t args; /* block allocation args */ int error; /* error return value */ @@ -103,6 +104,7 @@ xfs_inobt_alloc_block( args.maxlen = 1; args.prod = 1; args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.resv = resv; error = xfs_alloc_vextent(&args); if (error) { @@ -123,6 +125,27 @@ xfs_inobt_alloc_block( } STATIC int +xfs_inobt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE); +} + +STATIC int +xfs_finobt_alloc_block( + struct xfs_btree_cur *cur, + union xfs_btree_ptr *start, + union xfs_btree_ptr *new, + int *stat) +{ + return __xfs_inobt_alloc_block(cur, start, new, stat, + XFS_AG_RESV_METADATA); +} + +STATIC int xfs_inobt_free_block( struct xfs_btree_cur *cur, struct xfs_buf *bp) @@ -328,7 +351,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = { .dup_cursor = xfs_inobt_dup_cursor, .set_root = xfs_finobt_set_root, - .alloc_block = xfs_inobt_alloc_block, + .alloc_block = xfs_finobt_alloc_block, .free_block = xfs_inobt_free_block, .get_minrecs = xfs_inobt_get_minrecs, .get_maxrecs = xfs_inobt_get_maxrecs, @@ -357,7 +380,7 @@ xfs_inobt_init_cursor( struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); struct xfs_btree_cur *cur; - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); cur->bc_tp = tp; cur->bc_mp = mp; @@ -365,9 +388,11 @@ xfs_inobt_init_cursor( if (btnum == XFS_BTNUM_INO) { cur->bc_nlevels = be32_to_cpu(agi->agi_level); cur->bc_ops = &xfs_inobt_ops; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2); } else { cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); cur->bc_ops = &xfs_finobt_ops; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2); } cur->bc_blocklog = mp->m_sb.sb_blocklog; @@ -478,3 +503,64 @@ xfs_inobt_rec_check_count( return 0; } #endif /* DEBUG */ + +static xfs_extlen_t +xfs_inobt_max_size( + struct xfs_mount *mp) +{ + /* Bail out if we're uninitialized, which can happen in mkfs. */ + if (mp->m_inobt_mxr[0] == 0) + return 0; + + return xfs_btree_calc_size(mp, mp->m_inobt_mnr, + (uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock / + XFS_INODES_PER_CHUNK); +} + +static int +xfs_inobt_count_blocks( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_btnum_t btnum, + xfs_extlen_t *tree_blocks) +{ + struct xfs_buf *agbp; + struct xfs_btree_cur *cur; + int error; + + error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); + if (error) + return error; + + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, btnum); + error = xfs_btree_count_blocks(cur, tree_blocks); + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + xfs_buf_relse(agbp); + + return error; +} + +/* + * Figure out how many blocks to reserve and how many are used by this btree. 
+ */ +int +xfs_finobt_calc_reserves( + struct xfs_mount *mp, + xfs_agnumber_t agno, + xfs_extlen_t *ask, + xfs_extlen_t *used) +{ + xfs_extlen_t tree_len = 0; + int error; + + if (!xfs_sb_version_hasfinobt(&mp->m_sb)) + return 0; + + error = xfs_inobt_count_blocks(mp, agno, XFS_BTNUM_FINO, &tree_len); + if (error) + return error; + + *ask += xfs_inobt_max_size(mp); + *used += tree_len; + return 0; +} diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index bd88453217ce..aa81e2e63f3f 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -72,4 +72,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *, #define xfs_inobt_rec_check_count(mp, rec) 0 #endif /* DEBUG */ +int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_extlen_t *ask, xfs_extlen_t *used); + #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 134424fac434..d93f9d918cfc 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -29,6 +29,7 @@ #include "xfs_icache.h" #include "xfs_trans.h" #include "xfs_ialloc.h" +#include "xfs_dir2.h" /* * Check that none of the inode's in the buffer have a next @@ -383,15 +384,28 @@ xfs_log_dinode_to_disk( static bool xfs_dinode_verify( struct xfs_mount *mp, - struct xfs_inode *ip, + xfs_ino_t ino, struct xfs_dinode *dip) { + uint16_t mode; uint16_t flags; uint64_t flags2; if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) return false; + /* don't allow invalid i_size */ + if (be64_to_cpu(dip->di_size) & (1ULL << 63)) + return false; + + mode = be16_to_cpu(dip->di_mode); + if (mode && xfs_mode_to_ftype(mode) == XFS_DIR3_FT_UNKNOWN) + return false; + + /* No zero-length symlinks/dirs. 
*/ + if ((S_ISLNK(mode) || S_ISDIR(mode)) && dip->di_size == 0) + return false; + /* only version 3 or greater inodes are extensively verified here */ if (dip->di_version < 3) return true; @@ -401,7 +415,7 @@ xfs_dinode_verify( if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, XFS_DINODE_CRC_OFF)) return false; - if (be64_to_cpu(dip->di_ino) != ip->i_ino) + if (be64_to_cpu(dip->di_ino) != ino) return false; if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid)) return false; @@ -436,7 +450,7 @@ xfs_dinode_calc_crc( return; ASSERT(xfs_sb_version_hascrc(&mp->m_sb)); - crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize, + crc = xfs_start_cksum_update((char *)dip, mp->m_sb.sb_inodesize, XFS_DINODE_CRC_OFF); dip->di_crc = xfs_end_cksum(crc); } @@ -493,7 +507,7 @@ xfs_iread( return error; /* even unallocated inodes are verified */ - if (!xfs_dinode_verify(mp, ip, dip)) { + if (!xfs_dinode_verify(mp, ip->i_ino, dip)) { xfs_alert(mp, "%s: validation failed for inode %lld", __func__, ip->i_ino); diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 3cfe12a4f58a..6848a0afbce7 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -58,8 +58,8 @@ struct xfs_icdinode { */ struct xfs_imap { xfs_daddr_t im_blkno; /* starting BB of inode chunk */ - ushort im_len; /* length in BBs of inode chunk */ - ushort im_boffset; /* inode offset in block in bytes */ + unsigned short im_len; /* length in BBs of inode chunk */ + unsigned short im_boffset; /* inode offset in block in bytes */ }; int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 5dd56d3dbb3a..222e103356c6 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -775,6 +775,13 @@ xfs_idestroy_fork( } } +/* Count number of incore extents based on if_bytes */ +xfs_extnum_t +xfs_iext_count(struct xfs_ifork *ifp) +{ + return ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); +} + /* * Convert in-core extents to on-disk form * @@ -803,7 +810,7 @@ xfs_iextents_copy( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(ifp->if_bytes > 0); - nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nrecs = xfs_iext_count(ifp); XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork); ASSERT(nrecs > 0); @@ -941,7 +948,7 @@ xfs_iext_get_ext( xfs_extnum_t idx) /* index of target extent */ { ASSERT(idx >= 0); - ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); + ASSERT(idx < xfs_iext_count(ifp)); if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { return ifp->if_u1.if_ext_irec->er_extbuf; @@ -1017,7 +1024,7 @@ xfs_iext_add( int new_size; /* size of extents after adding */ xfs_extnum_t nextents; /* number of extents in file */ - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); ASSERT((idx >= 0) && (idx <= nextents)); byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t); new_size = ifp->if_bytes + byte_diff; @@ -1241,7 +1248,7 @@ xfs_iext_remove( trace_xfs_iext_remove(ip, idx, state, _RET_IP_); ASSERT(ext_diff > 0); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t); if (new_size == 0) { @@ -1270,7 +1277,7 @@ xfs_iext_remove_inline( ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); ASSERT(idx < XFS_INLINE_EXTS); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); ASSERT(((nextents - ext_diff) > 0) && (nextents -
ext_diff) < XFS_INLINE_EXTS); @@ -1309,7 +1316,7 @@ xfs_iext_remove_direct( ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); new_size = ifp->if_bytes - (ext_diff * sizeof(xfs_bmbt_rec_t)); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); if (new_size == 0) { xfs_iext_destroy(ifp); @@ -1546,7 +1553,7 @@ xfs_iext_indirect_to_direct( int size; /* size of file extents */ ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); ASSERT(nextents <= XFS_LINEAR_EXTS); size = nextents * sizeof(xfs_bmbt_rec_t); @@ -1620,7 +1627,7 @@ xfs_iext_bno_to_ext( xfs_extnum_t nextents; /* number of file extents */ xfs_fileoff_t startoff = 0; /* start offset of extent */ - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); if (nextents == 0) { *idxp = 0; return NULL; @@ -1733,8 +1740,8 @@ xfs_iext_idx_to_irec( ASSERT(ifp->if_flags & XFS_IFEXTIREC); ASSERT(page_idx >= 0); - ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); - ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc); + ASSERT(page_idx <= xfs_iext_count(ifp)); + ASSERT(page_idx < xfs_iext_count(ifp) || realloc); nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; erp_idx = 0; @@ -1782,7 +1789,7 @@ xfs_iext_irec_init( xfs_extnum_t nextents; /* number of extents in file */ ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); ASSERT(nextents <= XFS_LINEAR_EXTS); erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS); @@ -1906,7 +1913,7 @@ xfs_iext_irec_compact( ASSERT(ifp->if_flags & XFS_IFEXTIREC); nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); if (nextents == 0) { xfs_iext_destroy(ifp); @@ -1996,3 +2003,49 @@ xfs_ifork_init_cow( ip->i_cformat = XFS_DINODE_FMT_EXTENTS; ip->i_cnextents = 0; } + +/* + * Lookup the extent covering bno. + * + * If there is an extent covering bno return the extent index, and store the + * expanded extent structure in *gotp, and the extent index in *idx. + * If there is no extent covering bno, but there is an extent after it (e.g. + * it lies in a hole) return that extent in *gotp and its index in *idx + * instead. + * If bno is beyond the last extent return false, and return the index after + * the last valid index in *idxp. + */ +bool +xfs_iext_lookup_extent( + struct xfs_inode *ip, + struct xfs_ifork *ifp, + xfs_fileoff_t bno, + xfs_extnum_t *idxp, + struct xfs_bmbt_irec *gotp) +{ + struct xfs_bmbt_rec_host *ep; + + XFS_STATS_INC(ip->i_mount, xs_look_exlist); + + ep = xfs_iext_bno_to_ext(ifp, bno, idxp); + if (!ep) + return false; + xfs_bmbt_get_all(ep, gotp); + return true; +} + +/* + * Return true if there is an extent at index idx, and return the expanded + * extent structure at idx in that case. Else return false. 
+ */ +bool +xfs_iext_get_extent( + struct xfs_ifork *ifp, + xfs_extnum_t idx, + struct xfs_bmbt_irec *gotp) +{ + if (idx < 0 || idx >= xfs_iext_count(ifp)) + return false; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), gotp); + return true; +} diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index c9476f50e32d..7fb8365326d1 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -152,6 +152,7 @@ void xfs_init_local_fork(struct xfs_inode *, int, const void *, int); struct xfs_bmbt_rec_host * xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t); +xfs_extnum_t xfs_iext_count(struct xfs_ifork *); void xfs_iext_insert(struct xfs_inode *, xfs_extnum_t, xfs_extnum_t, struct xfs_bmbt_irec *, int); void xfs_iext_add(struct xfs_ifork *, xfs_extnum_t, int); @@ -181,6 +182,12 @@ void xfs_iext_irec_compact_pages(struct xfs_ifork *); void xfs_iext_irec_compact_full(struct xfs_ifork *); void xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int); +bool xfs_iext_lookup_extent(struct xfs_inode *ip, + struct xfs_ifork *ifp, xfs_fileoff_t bno, + xfs_extnum_t *idxp, struct xfs_bmbt_irec *gotp); +bool xfs_iext_get_extent(struct xfs_ifork *ifp, xfs_extnum_t idx, + struct xfs_bmbt_irec *gotp); + extern struct kmem_zone *xfs_ifork_zone; extern void xfs_ifork_init_cow(struct xfs_inode *ip); diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 083cdd6d6c28..7ae571f8e34a 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -481,8 +481,8 @@ static inline uint xfs_log_dinode_size(int version) typedef struct xfs_buf_log_format { unsigned short blf_type; /* buf log item type indicator */ unsigned short blf_size; /* size of this item */ - ushort blf_flags; /* misc state */ - ushort blf_len; /* number of blocks in this buf */ + unsigned short blf_flags; /* misc state */ + unsigned short blf_len; /* number of blocks in this buf */ __int64_t blf_blkno; /* starting blkno of this buf */ unsigned int blf_map_size; /* used size of data bitmap in words */ unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */ diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 8e385f91d660..d9f65e2d5cc8 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -52,7 +52,7 @@ typedef struct xlog_recover { struct list_head r_itemq; /* q for items */ } xlog_recover_t; -#define ITEM_TYPE(i) (*(ushort *)(i)->ri_buf[0].i_addr) +#define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr) /* * This is the number of entries in the l_buf_cancel_table used during diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 453bb2757ec2..50add5272807 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -354,6 +354,7 @@ xfs_refcountbt_init_cursor( cur->bc_btnum = XFS_BTNUM_REFC; cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_ops = &xfs_refcountbt_ops; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2); cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); @@ -408,13 +409,14 @@ xfs_refcountbt_calc_size( */ xfs_extlen_t xfs_refcountbt_max_size( - struct xfs_mount *mp) + struct xfs_mount *mp, + xfs_agblock_t agblocks) { /* Bail out if we're uninitialized, which can happen in mkfs. 
*/ if (mp->m_refc_mxr[0] == 0) return 0; - return xfs_refcountbt_calc_size(mp, mp->m_sb.sb_agblocks); + return xfs_refcountbt_calc_size(mp, agblocks); } /* @@ -429,22 +431,24 @@ xfs_refcountbt_calc_reserves( { struct xfs_buf *agbp; struct xfs_agf *agf; + xfs_agblock_t agblocks; xfs_extlen_t tree_len; int error; if (!xfs_sb_version_hasreflink(&mp->m_sb)) return 0; - *ask += xfs_refcountbt_max_size(mp); error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); if (error) return error; agf = XFS_BUF_TO_AGF(agbp); + agblocks = be32_to_cpu(agf->agf_length); tree_len = be32_to_cpu(agf->agf_refcount_blocks); xfs_buf_relse(agbp); + *ask += xfs_refcountbt_max_size(mp, agblocks); *used += tree_len; return error; diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h index 3be7768bd51a..9db008b955b7 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.h +++ b/fs/xfs/libxfs/xfs_refcount_btree.h @@ -66,7 +66,8 @@ extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp); extern xfs_extlen_t xfs_refcountbt_calc_size(struct xfs_mount *mp, unsigned long long len); -extern xfs_extlen_t xfs_refcountbt_max_size(struct xfs_mount *mp); +extern xfs_extlen_t xfs_refcountbt_max_size(struct xfs_mount *mp, + xfs_agblock_t agblocks); extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 83e672ff7577..74e5a54bc428 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -484,6 +484,7 @@ xfs_rmapbt_init_cursor( cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_ops = &xfs_rmapbt_ops; cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); cur->bc_private.a.agbp = agbp; cur->bc_private.a.agno = agno; @@ -549,13 +550,14 @@ xfs_rmapbt_calc_size( */ xfs_extlen_t xfs_rmapbt_max_size( - struct xfs_mount *mp) + struct xfs_mount *mp, + xfs_agblock_t agblocks) { /* Bail out if we're uninitialized, which can happen in mkfs. */ if (mp->m_rmap_mxr[0] == 0) return 0; - return xfs_rmapbt_calc_size(mp, mp->m_sb.sb_agblocks); + return xfs_rmapbt_calc_size(mp, agblocks); } /* @@ -570,25 +572,24 @@ xfs_rmapbt_calc_reserves( { struct xfs_buf *agbp; struct xfs_agf *agf; - xfs_extlen_t pool_len; + xfs_agblock_t agblocks; xfs_extlen_t tree_len; int error; if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) return 0; - /* Reserve 1% of the AG or enough for 1 block per record. */ - pool_len = max(mp->m_sb.sb_agblocks / 100, xfs_rmapbt_max_size(mp)); - *ask += pool_len; - error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); if (error) return error; agf = XFS_BUF_TO_AGF(agbp); + agblocks = be32_to_cpu(agf->agf_length); tree_len = be32_to_cpu(agf->agf_rmap_blocks); xfs_buf_relse(agbp); + /* Reserve 1% of the AG or enough for 1 block per record. 
*/ + *ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks)); *used += tree_len; return error; diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index 2a9ac472fb15..19c08e933049 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -60,7 +60,8 @@ extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp); extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp, unsigned long long len); -extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp); +extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp, + xfs_agblock_t agblocks); extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index e2e1106c9fad..ea45584a9913 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1016,4 +1016,3 @@ xfs_rtfree_extent( } return 0; } - diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index a70aec910626..584ec896a533 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -242,7 +242,7 @@ xfs_mount_validate_sb( sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG || sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG || sbp->sb_blocksize != (1 << sbp->sb_blocklog) || - sbp->sb_dirblklog > XFS_MAX_BLOCKSIZE_LOG || + sbp->sb_dirblklog + sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG || sbp->sb_inodesize < XFS_DINODE_MIN_SIZE || sbp->sb_inodesize > XFS_DINODE_MAX_SIZE || sbp->sb_inodelog < XFS_DINODE_MIN_LOG || @@ -262,6 +262,12 @@ xfs_mount_validate_sb( return -EFSCORRUPTED; } + if (xfs_sb_version_hascrc(&mp->m_sb) && + sbp->sb_blocksize < XFS_MIN_CRC_BLOCKSIZE) { + xfs_notice(mp, "v5 SB sanity check failed"); + return -EFSCORRUPTED; + } + /* * Until this is fixed only page-sized or smaller data blocks work. */ @@ -338,13 +344,16 @@ xfs_sb_quota_from_disk(struct xfs_sb *sbp) XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD; sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD); - if (sbp->sb_qflags & XFS_PQUOTA_ACCT) { + if (sbp->sb_qflags & XFS_PQUOTA_ACCT && + sbp->sb_gquotino != NULLFSINO) { /* * In older version of superblock, on-disk superblock only * has sb_gquotino, and in-core superblock has both sb_gquotino * and sb_pquotino. But, only one of them is supported at any * point of time. So, if PQUOTA is set in disk superblock, - * copy over sb_gquotino to sb_pquotino. + * copy over sb_gquotino to sb_pquotino. The NULLFSINO test + * above is to make sure we don't do this twice and wipe them + * both out! */ sbp->sb_pquotino = sbp->sb_gquotino; sbp->sb_gquotino = NULLFSINO; diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 8d74870468c2..717909f2f7b7 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -57,7 +57,6 @@ typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */ #define NULLAGBLOCK ((xfs_agblock_t)-1) #define NULLAGNUMBER ((xfs_agnumber_t)-1) -#define NULLEXTNUM ((xfs_extnum_t)-1) #define NULLCOMMITLSN ((xfs_lsn_t)-1) @@ -75,11 +74,14 @@ typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */ * Minimum and maximum blocksize and sectorsize. * The blocksize upper limit is pretty much arbitrary. * The sectorsize upper limit is due to sizeof(sb_sectsize). + * CRC enabled filesystems use 512 byte inodes, meaning 512 byte block sizes + * cannot be used. */ #define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */ #define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e.
65536 bytes */ #define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG) #define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG) +#define XFS_MIN_CRC_BLOCKSIZE (1 << (XFS_MIN_BLOCKSIZE_LOG + 1)) #define XFS_MIN_SECTORSIZE_LOG 9 /* i.e. 512 bytes */ #define XFS_MAX_SECTORSIZE_LOG 15 /* i.e. 32768 bytes */ #define XFS_MIN_SECTORSIZE (1 << XFS_MIN_SECTORSIZE_LOG) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3e57a56cf829..631e7c0e0a29 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -37,11 +37,6 @@ #include <linux/pagevec.h> #include <linux/writeback.h> -/* flags for direct write completions */ -#define XFS_DIO_FLAG_UNWRITTEN (1 << 0) -#define XFS_DIO_FLAG_APPEND (1 << 1) -#define XFS_DIO_FLAG_COW (1 << 2) - /* * structure owned by writepages passed to individual writepage calls */ @@ -495,8 +490,8 @@ xfs_submit_ioend( ioend->io_bio->bi_private = ioend; ioend->io_bio->bi_end_io = xfs_end_bio; - bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, - (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0); + ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); + /* * If we are failing the IO now, just mark the ioend with an * error and finish it. This will run IO completion immediately @@ -567,8 +562,7 @@ xfs_chain_bio( bio_chain(ioend->io_bio, new); bio_get(ioend->io_bio); /* for xfs_destroy_ioend */ - bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, - (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0); + ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); submit_bio(ioend->io_bio); ioend->io_bio = new; } @@ -777,7 +771,7 @@ xfs_map_cow( { struct xfs_inode *ip = XFS_I(inode); struct xfs_bmbt_irec imap; - bool is_cow = false, need_alloc = false; + bool is_cow = false; int error; /* @@ -795,7 +789,7 @@ xfs_map_cow( * Else we need to check if there is a COW mapping at this offset. */ xfs_ilock(ip, XFS_ILOCK_SHARED); - is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap, &need_alloc); + is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (!is_cow) @@ -805,7 +799,7 @@ xfs_map_cow( * And if the COW mapping has a delayed extent here we need to * allocate real space for it now. */ - if (need_alloc) { + if (isnullstartblock(imap.br_startblock)) { error = xfs_iomap_write_allocate(ip, XFS_COW_FORK, offset, &imap); if (error) @@ -1158,63 +1152,27 @@ xfs_vm_releasepage( * block_invalidatepage() can send pages that are still marked dirty * but otherwise have invalidated buffers. * - * We've historically freed buffers on the latter. Instead, quietly - * filter out all dirty pages to avoid spurious buffer state warnings. - * This can likely be removed once shrink_active_list() is fixed. + * We want to release the latter to avoid unnecessary buildup of the + * LRU, skip the former and warn if we've left any lingering + * delalloc/unwritten buffers on clean pages. Skip pages with delalloc + * or unwritten buffers and warn if the page is not dirty. Otherwise + * try to release the buffers. */ - if (PageDirty(page)) - return 0; - xfs_count_page_state(page, &delalloc, &unwritten); - if (WARN_ON_ONCE(delalloc)) + if (delalloc) { + WARN_ON_ONCE(!PageDirty(page)); return 0; - if (WARN_ON_ONCE(unwritten)) + } + if (unwritten) { + WARN_ON_ONCE(!PageDirty(page)); return 0; + } return try_to_free_buffers(page); } /* - * When we map a DIO buffer, we may need to pass flags to - * xfs_end_io_direct_write to tell it what kind of write IO we are doing. - * - * Note that for DIO, an IO to the highest supported file block offset (i.e. 
- * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64 - * bit variable. Hence if we see this overflow, we have to assume that the IO is - * extending the file size. We won't know for sure until IO completion is run - * and the actual max write offset is communicated to the IO completion - * routine. - */ -static void -xfs_map_direct( - struct inode *inode, - struct buffer_head *bh_result, - struct xfs_bmbt_irec *imap, - xfs_off_t offset, - bool is_cow) -{ - uintptr_t *flags = (uintptr_t *)&bh_result->b_private; - xfs_off_t size = bh_result->b_size; - - trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size, - ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : is_cow ? XFS_IO_COW : - XFS_IO_OVERWRITE, imap); - - if (ISUNWRITTEN(imap)) { - *flags |= XFS_DIO_FLAG_UNWRITTEN; - set_buffer_defer_completion(bh_result); - } else if (is_cow) { - *flags |= XFS_DIO_FLAG_COW; - set_buffer_defer_completion(bh_result); - } - if (offset + size > i_size_read(inode) || offset + size < 0) { - *flags |= XFS_DIO_FLAG_APPEND; - set_buffer_defer_completion(bh_result); - } -} - -/* * If this is O_DIRECT or the mpage code calling tell them how large the mapping * is, so that we can avoid repeated get_blocks calls. * @@ -1254,52 +1212,12 @@ xfs_map_trim_size( bh_result->b_size = mapping_size; } -/* Bounce unaligned directio writes to the page cache. */ static int -xfs_bounce_unaligned_dio_write( - struct xfs_inode *ip, - xfs_fileoff_t offset_fsb, - struct xfs_bmbt_irec *imap) -{ - struct xfs_bmbt_irec irec; - xfs_fileoff_t delta; - bool shared; - bool x; - int error; - - irec = *imap; - if (offset_fsb > irec.br_startoff) { - delta = offset_fsb - irec.br_startoff; - irec.br_blockcount -= delta; - irec.br_startblock += delta; - irec.br_startoff = offset_fsb; - } - error = xfs_reflink_trim_around_shared(ip, &irec, &shared, &x); - if (error) - return error; - - /* - * We're here because we're trying to do a directio write to a - * region that isn't aligned to a filesystem block. If any part - * of the extent is shared, fall back to buffered mode to handle - * the RMW. This is done by returning -EREMCHG ("remote addr - * changed"), which is caught further up the call stack. - */ - if (shared) { - trace_xfs_reflink_bounce_dio_write(ip, imap); - return -EREMCHG; - } - return 0; -} - -STATIC int -__xfs_get_blocks( +xfs_get_blocks( struct inode *inode, sector_t iblock, struct buffer_head *bh_result, - int create, - bool direct, - bool dax_fault) + int create) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1310,11 +1228,8 @@ __xfs_get_blocks( int nimaps = 1; xfs_off_t offset; ssize_t size; - int new = 0; - bool is_cow = false; - bool need_alloc = false; - BUG_ON(create && !direct); + BUG_ON(create); if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -1323,7 +1238,7 @@ __xfs_get_blocks( ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); size = bh_result->b_size; - if (!create && offset >= i_size_read(inode)) + if (offset >= i_size_read(inode)) return 0; /* @@ -1338,52 +1253,12 @@ __xfs_get_blocks( end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); offset_fsb = XFS_B_TO_FSBT(mp, offset); - if (create && direct && xfs_is_reflink_inode(ip)) - is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap, - &need_alloc); - if (!is_cow) { - error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, - &imap, &nimaps, XFS_BMAPI_ENTIRE); - /* - * Truncate an overwrite extent if there's a pending CoW - * reservation before the end of this extent. 
This - * forces us to come back to get_blocks to take care of - * the CoW. - */ - if (create && direct && nimaps && - imap.br_startblock != HOLESTARTBLOCK && - imap.br_startblock != DELAYSTARTBLOCK && - !ISUNWRITTEN(&imap)) - xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, - &imap); - } - ASSERT(!need_alloc); + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + &imap, &nimaps, XFS_BMAPI_ENTIRE); if (error) goto out_unlock; - /* for DAX, we convert unwritten extents directly */ - if (create && - (!nimaps || - (imap.br_startblock == HOLESTARTBLOCK || - imap.br_startblock == DELAYSTARTBLOCK) || - (IS_DAX(inode) && ISUNWRITTEN(&imap)))) { - /* - * xfs_iomap_write_direct() expects the shared lock. It - * is unlocked on return. - */ - if (lockmode == XFS_ILOCK_EXCL) - xfs_ilock_demote(ip, lockmode); - - error = xfs_iomap_write_direct(ip, offset, size, - &imap, nimaps); - if (error) - return error; - new = 1; - - trace_xfs_get_blocks_alloc(ip, offset, size, - ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN - : XFS_IO_DELALLOC, &imap); - } else if (nimaps) { + if (nimaps) { trace_xfs_get_blocks_found(ip, offset, size, ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap); @@ -1393,12 +1268,6 @@ __xfs_get_blocks( goto out_unlock; } - if (IS_DAX(inode) && create) { - ASSERT(!ISUNWRITTEN(&imap)); - /* zeroing is not needed at a higher layer */ - new = 0; - } - /* trim mapping down to size requested */ xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size); @@ -1408,50 +1277,14 @@ __xfs_get_blocks( */ if (imap.br_startblock != HOLESTARTBLOCK && imap.br_startblock != DELAYSTARTBLOCK && - (create || !ISUNWRITTEN(&imap))) { - if (create && direct && !is_cow) { - error = xfs_bounce_unaligned_dio_write(ip, offset_fsb, - &imap); - if (error) - return error; - } - + !ISUNWRITTEN(&imap)) xfs_map_buffer(inode, bh_result, &imap, offset); - if (ISUNWRITTEN(&imap)) - set_buffer_unwritten(bh_result); - /* direct IO needs special help */ - if (create) { - if (dax_fault) - ASSERT(!ISUNWRITTEN(&imap)); - else - xfs_map_direct(inode, bh_result, &imap, offset, - is_cow); - } - } /* * If this is a realtime file, data may be on a different device. * to that pointed to from the buffer_head b_bdev currently. */ bh_result->b_bdev = xfs_find_bdev_for_inode(inode); - - /* - * If we previously allocated a block out beyond eof and we are now - * coming back to use it then we will need to flag it as new even if it - * has a disk address. - * - * With sub-block writes into unwritten extents we also need to mark - * the buffer as new so that the unwritten parts of the buffer gets - * correctly zeroed. 
- */ - if (create && - ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || - (offset >= i_size_read(inode)) || - (new || ISUNWRITTEN(&imap)))) - set_buffer_new(bh_result); - - BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK); - return 0; out_unlock: @@ -1459,110 +1292,6 @@ out_unlock: return error; } -int -xfs_get_blocks( - struct inode *inode, - sector_t iblock, - struct buffer_head *bh_result, - int create) -{ - return __xfs_get_blocks(inode, iblock, bh_result, create, false, false); } -int -xfs_get_blocks_direct( - struct inode *inode, - sector_t iblock, - struct buffer_head *bh_result, - int create) -{ - return __xfs_get_blocks(inode, iblock, bh_result, create, true, false); -} - -int -xfs_get_blocks_dax_fault( - struct inode *inode, - sector_t iblock, - struct buffer_head *bh_result, - int create) -{ - return __xfs_get_blocks(inode, iblock, bh_result, create, true, true); -} - -/* - * Complete a direct I/O write request. - * - * xfs_map_direct passes us some flags in the private data to tell us what to - * do. If no flags are set, then the write IO is an overwrite wholly within - * the existing allocated file size and so there is nothing for us to do. - * - * Note that in this case the completion can be called in interrupt context, - * whereas if we have flags set we will always be called in task context - * (i.e. from a workqueue). - */ -int -xfs_end_io_direct_write( - struct kiocb *iocb, - loff_t offset, - ssize_t size, - void *private) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct xfs_inode *ip = XFS_I(inode); - uintptr_t flags = (uintptr_t)private; - int error = 0; - - trace_xfs_end_io_direct_write(ip, offset, size); - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EIO; - - if (size <= 0) - return size; - - /* - * The flags tell us whether we are doing unwritten extent conversions - * or an append transaction that updates the on-disk file size. These - * cases are the only cases where we should *potentially* be needing - * to update the VFS inode size. - */ - if (flags == 0) { - ASSERT(offset + size <= i_size_read(inode)); - return 0; - } - - /* - * We need to update the in-core inode size here so that we don't end up - * with the on-disk inode size being outside the in-core inode size. We - * have no other method of updating EOF for AIO, so always do it here - * if necessary. - * - * We need to lock the test/set EOF update as we can be racing with - * other IO completions here to update the EOF. Failing to serialise - * here can result in EOF moving backwards and Bad Things Happen when - * that occurs. - */ - spin_lock(&ip->i_flags_lock); - if (offset + size > i_size_read(inode)) - i_size_write(inode, offset + size); - spin_unlock(&ip->i_flags_lock); - - if (flags & XFS_DIO_FLAG_COW) - error = xfs_reflink_end_cow(ip, offset, size); - if (flags & XFS_DIO_FLAG_UNWRITTEN) { - trace_xfs_end_io_direct_write_unwritten(ip, offset, size); - - error = xfs_iomap_write_unwritten(ip, offset, size); - } - if (flags & XFS_DIO_FLAG_APPEND) { - trace_xfs_end_io_direct_write_append(ip, offset, size); - - error = xfs_setfilesize(ip, offset, size); - } - - return error; -} - STATIC ssize_t xfs_vm_direct_IO( struct kiocb *iocb, @@ -1583,7 +1312,6 @@ xfs_vm_bmap( struct xfs_inode *ip = XFS_I(inode); trace_xfs_vm_bmap(XFS_I(inode)); - xfs_ilock(ip, XFS_IOLOCK_SHARED); /* * The swap code (ab-)uses ->bmap to get a block mapping and then * bypasses the file system for actual I/O. We really can't allow * that on reflink inodes, so we have to skip out here.
And yes, * 0 is the magic code for a bmap error.. */ - if (xfs_is_reflink_inode(ip)) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); + if (xfs_is_reflink_inode(ip)) return 0; - } + filemap_write_and_wait(mapping); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); return generic_block_bmap(mapping, block, xfs_get_blocks); } diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index b3c6634f9518..cc174ec6c2fd 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -55,15 +55,6 @@ struct xfs_ioend { extern const struct address_space_operations xfs_address_space_operations; -int xfs_get_blocks(struct inode *inode, sector_t offset, - struct buffer_head *map_bh, int create); -int xfs_get_blocks_direct(struct inode *inode, sector_t offset, - struct buffer_head *map_bh, int create); -int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset, - struct buffer_head *map_bh, int create); - -int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset, - ssize_t size, void *private); int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); extern void xfs_count_page_state(struct page *, int *, int *); diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index e3da5d448bcf..d14691aa02b4 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -112,8 +112,8 @@ typedef struct attrlist_cursor_kern { *========================================================================*/ -/* Return 0 on success, or -errno; other state communicated via *context */ -typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int, +/* void; state communicated via *context */ +typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int, unsigned char *, int, int); typedef struct xfs_attr_list_context { diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 25e76cd6c053..97c45b6eb91e 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -74,7 +74,6 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) xfs_attr_sf_entry_t *sfe; xfs_inode_t *dp; int sbsize, nsbuf, count, i; - int error; ASSERT(context != NULL); dp = context->dp; @@ -102,13 +101,11 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) (XFS_ISRESET_CURSOR(cursor) && (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) { for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { - error = context->put_listent(context, - sfe->flags, - sfe->nameval, - (int)sfe->namelen, - (int)sfe->valuelen); - if (error) - return error; + context->put_listent(context, + sfe->flags, + sfe->nameval, + (int)sfe->namelen, + (int)sfe->valuelen); /* * Either search callback finished early or * didn't fit it all in the buffer after all. 
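The xfs_attr.h and xfs_attr_list.c hunks in this range convert put_listent_func_t from an int return to void, so iteration now stops only when the callback sets context->seen_enough. Below is a self-contained sketch of that convention; the structure and function names are invented for illustration, not the real XFS types.

#include <stdio.h>

struct list_ctx {
	int	count;		/* entries emitted so far */
	int	max;		/* capacity of the caller's buffer */
	int	seen_enough;	/* set by the callback to stop iteration */
};

typedef void (*put_listent_fn)(struct list_ctx *ctx, const char *name);

static void put_listent(struct list_ctx *ctx, const char *name)
{
	if (ctx->count >= ctx->max) {
		ctx->seen_enough = 1;	/* buffer full: a stop, not an error */
		return;
	}
	printf("attr: %s\n", name);
	ctx->count++;
}

static void list_attrs(struct list_ctx *ctx, put_listent_fn put,
		       const char **names, int n)
{
	for (int i = 0; i < n; i++) {
		put(ctx, names[i]);
		if (ctx->seen_enough)	/* the only termination signal */
			break;
	}
}

int main(void)
{
	const char *names[] = { "user.a", "user.b", "user.c" };
	struct list_ctx ctx = { .count = 0, .max = 2, .seen_enough = 0 };

	list_attrs(&ctx, put_listent, names, 3);
	printf("emitted %d, truncated=%d\n", ctx.count, ctx.seen_enough);
	return 0;
}

Folding the status into the context removes the old ambiguity where a full output buffer was reported both through a return code and through seen_enough, which is why the callers in the following hunks can drop their error plumbing.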
@@ -193,15 +190,11 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) cursor->hashval = sbp->hash; cursor->offset = 0; } - error = context->put_listent(context, - sbp->flags, - sbp->name, - sbp->namelen, - sbp->valuelen); - if (error) { - kmem_free(sbuf); - return error; - } + context->put_listent(context, + sbp->flags, + sbp->name, + sbp->namelen, + sbp->valuelen); if (context->seen_enough) break; cursor->offset++; @@ -335,11 +328,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) */ for (;;) { leaf = bp->b_addr; - error = xfs_attr3_leaf_list_int(bp, context); - if (error) { - xfs_trans_brelse(NULL, bp); - return error; - } + xfs_attr3_leaf_list_int(bp, context); xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); if (context->seen_enough || leafhdr.forw == 0) break; @@ -356,7 +345,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) /* * Copy out attribute list entries for attr_list(), for leaf attribute lists. */ -int +void xfs_attr3_leaf_list_int( struct xfs_buf *bp, struct xfs_attr_list_context *context) @@ -366,7 +355,6 @@ xfs_attr3_leaf_list_int( struct xfs_attr3_icleaf_hdr ichdr; struct xfs_attr_leaf_entry *entries; struct xfs_attr_leaf_entry *entry; - int retval; int i; struct xfs_mount *mp = context->dp->i_mount; @@ -399,7 +387,7 @@ xfs_attr3_leaf_list_int( } if (i == ichdr.count) { trace_xfs_attr_list_notfound(context); - return 0; + return; } } else { entry = &entries[0]; @@ -410,7 +398,6 @@ xfs_attr3_leaf_list_int( /* * We have found our place, start copying out the new attributes. */ - retval = 0; for (; i < ichdr.count; entry++, i++) { char *name; int namelen, valuelen; @@ -439,16 +426,14 @@ xfs_attr3_leaf_list_int( valuelen = be32_to_cpu(name_rmt->valuelen); } - retval = context->put_listent(context, entry->flags, + context->put_listent(context, entry->flags, name, namelen, valuelen); - if (retval) - break; if (context->seen_enough) break; cursor->offset++; } trace_xfs_attr_list_leaf_end(context); - return retval; + return; } /* @@ -467,9 +452,9 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context) if (error) return error; - error = xfs_attr3_leaf_list_int(bp, context); + xfs_attr3_leaf_list_int(bp, context); xfs_trans_brelse(NULL, bp); - return error; + return 0; } int @@ -513,7 +498,7 @@ xfs_attr_list_int( * Take care to check values and protect against them changing later, * we may be reading them directly out of a user buffer. 
*/ -STATIC int +STATIC void xfs_attr_put_listent( xfs_attr_list_context_t *context, int flags, @@ -536,10 +521,10 @@ xfs_attr_put_listent( */ if (((context->flags & ATTR_SECURE) == 0) != ((flags & XFS_ATTR_SECURE) == 0)) - return 0; + return; if (((context->flags & ATTR_ROOT) == 0) != ((flags & XFS_ATTR_ROOT) == 0)) - return 0; + return; arraytop = sizeof(*alist) + context->count * sizeof(alist->al_offset[0]); @@ -548,7 +533,7 @@ xfs_attr_put_listent( trace_xfs_attr_list_full(context); alist->al_more = 1; context->seen_enough = 1; - return 0; + return; } aep = (attrlist_ent_t *)&context->alist[context->firstu]; @@ -558,7 +543,7 @@ xfs_attr_put_listent( alist->al_offset[context->count++] = context->firstu; alist->al_count = context->count; trace_xfs_attr_list_add(context); - return 0; + return; } /* diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 552465e011ec..c1417919ab0a 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -359,9 +359,7 @@ xfs_bmap_count_blocks( mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { - xfs_bmap_count_leaves(ifp, 0, - ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t), - count); + xfs_bmap_count_leaves(ifp, 0, xfs_iext_count(ifp), count); return 0; } @@ -426,7 +424,7 @@ xfs_getbmapx_fix_eof_hole( ifp = XFS_IFORK_PTR(ip, whichfork); if (!moretocome && xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && - (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1) + (lastx == xfs_iext_count(ifp) - 1)) out->bmv_oflags |= BMV_OF_LAST; } @@ -530,7 +528,6 @@ xfs_getbmap( xfs_bmbt_irec_t *map; /* buffer for user's data */ xfs_mount_t *mp; /* file system mount point */ int nex; /* # of user extents can do */ - int nexleft; /* # of user extents left */ int subnex; /* # of bmapi's can do */ int nmap; /* number of map entries */ struct getbmapx *out; /* output structure */ @@ -688,10 +685,8 @@ xfs_getbmap( goto out_free_map; } - nexleft = nex; - do { - nmap = (nexleft > subnex) ? subnex : nexleft; + nmap = (nex > subnex) ? subnex : nex; error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset), XFS_BB_TO_FSB(mp, bmv->bmv_length), map, &nmap, bmapi_flags); @@ -699,8 +694,8 @@ xfs_getbmap( goto out_free_map; ASSERT(nmap <= subnex); - for (i = 0; i < nmap && nexleft && bmv->bmv_length && - cur_ext < bmv->bmv_count; i++) { + for (i = 0; i < nmap && bmv->bmv_length && + cur_ext < bmv->bmv_count - 1; i++) { out[cur_ext].bmv_oflags = 0; if (map[i].br_state == XFS_EXT_UNWRITTEN) out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC; @@ -762,16 +757,27 @@ xfs_getbmap( continue; } + /* + * In order to report shared extents accurately, + * we report each distinct shared/unshared part + * of a single bmbt record using multiple bmap + * extents. To make that happen, we iterate the + * same map array item multiple times, each + * time trimming out the subextent that we just + * reported. + * + * Because of this, we must check the out array + * index (cur_ext) directly against bmv_count-1 + to avoid overflows.
+ */ if (inject_map.br_startblock != NULLFSBLOCK) { map[i] = inject_map; i--; - } else - nexleft--; + } bmv->bmv_entries++; cur_ext++; } - } while (nmap && nexleft && bmv->bmv_length && - cur_ext < bmv->bmv_count); + } while (nmap && bmv->bmv_length && cur_ext < bmv->bmv_count - 1); out_free_map: kmem_free(map); @@ -1792,6 +1798,7 @@ xfs_swap_extent_forks( struct xfs_ifork tempifp, *ifp, *tifp; int aforkblks = 0; int taforkblks = 0; + xfs_extnum_t nextents; __uint64_t tmp; int error; @@ -1877,14 +1884,13 @@ xfs_swap_extent_forks( switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: - /* If the extents fit in the inode, fix the - * pointer. Otherwise it's already NULL or - * pointing to the extent. + /* + * If the extents fit in the inode, fix the pointer. Otherwise + * it's already NULL or pointing to the extent. */ - if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) { - ifp->if_u1.if_extents = - ifp->if_u2.if_inline_ext; - } + nextents = xfs_iext_count(&ip->i_df); + if (nextents <= XFS_INLINE_EXTS) + ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; (*src_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: @@ -1896,14 +1902,13 @@ xfs_swap_extent_forks( switch (tip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: - /* If the extents fit in the inode, fix the - * pointer. Otherwise it's already NULL or - * pointing to the extent. + /* + * If the extents fit in the inode, fix the pointer. Otherwise + * it's already NULL or pointing to the extent. */ - if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) { - tifp->if_u1.if_extents = - tifp->if_u2.if_inline_ext; - } + nextents = xfs_iext_count(&tip->i_df); + if (nextents <= XFS_INLINE_EXTS) + tifp->if_u1.if_extents = tifp->if_u2.if_inline_ext; (*target_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: @@ -1938,8 +1943,8 @@ xfs_swap_extents( * page cache safely. Once we have done this we can take the ilocks and * do the rest of the checks. 
*/ - lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; - xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); + lock_two_nondirectories(VFS_I(ip), VFS_I(tip)); + lock_flags = XFS_MMAPLOCK_EXCL; xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL); /* Verify that both files have the same format */ @@ -2079,15 +2084,13 @@ xfs_swap_extents( trace_xfs_swap_extent_after(ip, 0); trace_xfs_swap_extent_after(tip, 1); +out_unlock: xfs_iunlock(ip, lock_flags); xfs_iunlock(tip, lock_flags); + unlock_two_nondirectories(VFS_I(ip), VFS_I(tip)); return error; out_trans_cancel: xfs_trans_cancel(tp); - -out_unlock: - xfs_iunlock(ip, lock_flags); - xfs_iunlock(tip, lock_flags); - return error; + goto out_unlock; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index b5b9bffe3520..ac3b4db519df 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -219,7 +219,6 @@ _xfs_buf_alloc( init_completion(&bp->b_iowait); INIT_LIST_HEAD(&bp->b_lru); INIT_LIST_HEAD(&bp->b_list); - RB_CLEAR_NODE(&bp->b_rbnode); sema_init(&bp->b_sema, 0); /* held, no waiters */ spin_lock_init(&bp->b_lock); XB_SET_OWNER(bp); @@ -423,6 +422,7 @@ retry: out_free_pages: for (i = 0; i < bp->b_page_count; i++) __free_page(bp->b_pages[i]); + bp->b_flags &= ~_XBF_PAGES; return error; } @@ -473,6 +473,62 @@ _xfs_buf_map_pages( /* * Finding and Reading Buffers */ +static int +_xfs_buf_obj_cmp( + struct rhashtable_compare_arg *arg, + const void *obj) +{ + const struct xfs_buf_map *map = arg->key; + const struct xfs_buf *bp = obj; + + /* + * The key hashing in the lookup path depends on the key being the + * first element of the compare_arg, make sure to assert this. + */ + BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0); + + if (bp->b_bn != map->bm_bn) + return 1; + + if (unlikely(bp->b_length != map->bm_len)) { + /* + * found a block number match. If the range doesn't + * match, the only way this is allowed is if the buffer + * in the cache is stale and the transaction that made + * it stale has not yet committed. i.e. we are + * reallocating a busy extent. Skip this buffer and + * continue searching for an exact match. 
+ */ + ASSERT(bp->b_flags & XBF_STALE); + return 1; + } + return 0; +} + +static const struct rhashtable_params xfs_buf_hash_params = { + .min_size = 32, /* empty AGs have minimal footprint */ + .nelem_hint = 16, + .key_len = sizeof(xfs_daddr_t), + .key_offset = offsetof(struct xfs_buf, b_bn), + .head_offset = offsetof(struct xfs_buf, b_rhash_head), + .automatic_shrinking = true, + .obj_cmpfn = _xfs_buf_obj_cmp, +}; + +int +xfs_buf_hash_init( + struct xfs_perag *pag) +{ + spin_lock_init(&pag->pag_buf_lock); + return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params); +} + +void +xfs_buf_hash_destroy( + struct xfs_perag *pag) +{ + rhashtable_destroy(&pag->pag_buf_hash); +} /* * Look up, and creates if absent, a lockable buffer for @@ -488,27 +544,24 @@ _xfs_buf_find( xfs_buf_t *new_bp) { struct xfs_perag *pag; - struct rb_node **rbp; - struct rb_node *parent; xfs_buf_t *bp; - xfs_daddr_t blkno = map[0].bm_bn; + struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; xfs_daddr_t eofs; - int numblks = 0; int i; for (i = 0; i < nmaps; i++) - numblks += map[i].bm_len; + cmap.bm_len += map[i].bm_len; /* Check for IOs smaller than the sector size / not sector aligned */ - ASSERT(!(BBTOB(numblks) < btp->bt_meta_sectorsize)); - ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask)); + ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize)); + ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask)); /* * Corrupted block numbers can get through to here, unfortunately, so we * have to check that the buffer falls within the filesystem bounds. */ eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); - if (blkno < 0 || blkno >= eofs) { + if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) { /* * XXX (dgc): we should really be returning -EFSCORRUPTED here, * but none of the higher level infrastructure supports @@ -516,53 +569,29 @@ _xfs_buf_find( */ xfs_alert(btp->bt_mount, "%s: Block out of range: block 0x%llx, EOFS 0x%llx ", - __func__, blkno, eofs); + __func__, cmap.bm_bn, eofs); WARN_ON(1); return NULL; } - /* get tree root */ pag = xfs_perag_get(btp->bt_mount, - xfs_daddr_to_agno(btp->bt_mount, blkno)); + xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn)); - /* walk tree */ spin_lock(&pag->pag_buf_lock); - rbp = &pag->pag_buf_tree.rb_node; - parent = NULL; - bp = NULL; - while (*rbp) { - parent = *rbp; - bp = rb_entry(parent, struct xfs_buf, b_rbnode); - - if (blkno < bp->b_bn) - rbp = &(*rbp)->rb_left; - else if (blkno > bp->b_bn) - rbp = &(*rbp)->rb_right; - else { - /* - * found a block number match. If the range doesn't - * match, the only way this is allowed is if the buffer - * in the cache is stale and the transaction that made - * it stale has not yet committed. i.e. we are - * reallocating a busy extent. Skip this buffer and - * continue searching to the right for an exact match. 
- */ - if (bp->b_length != numblks) { - ASSERT(bp->b_flags & XBF_STALE); - rbp = &(*rbp)->rb_right; - continue; - } - atomic_inc(&bp->b_hold); - goto found; - } + bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap, + xfs_buf_hash_params); + if (bp) { + atomic_inc(&bp->b_hold); + goto found; } /* No match found */ if (new_bp) { - rb_link_node(&new_bp->b_rbnode, parent, rbp); - rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree); /* the buffer keeps the perag reference until it is freed */ new_bp->b_pag = pag; + rhashtable_insert_fast(&pag->pag_buf_hash, + &new_bp->b_rhash_head, + xfs_buf_hash_params); spin_unlock(&pag->pag_buf_lock); } else { XFS_STATS_INC(btp->bt_mount, xb_miss_locked); @@ -930,7 +959,6 @@ xfs_buf_rele( if (!pag) { ASSERT(list_empty(&bp->b_lru)); - ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); if (atomic_dec_and_test(&bp->b_hold)) { xfs_buf_ioacct_dec(bp); xfs_buf_free(bp); @@ -938,8 +966,6 @@ xfs_buf_rele( return; } - ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); - ASSERT(atomic_read(&bp->b_hold) > 0); release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock); @@ -983,7 +1009,8 @@ xfs_buf_rele( } ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); - rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); + rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head, + xfs_buf_hash_params); spin_unlock(&pag->pag_buf_lock); xfs_perag_put(pag); freebuf = true; @@ -1304,7 +1331,7 @@ _xfs_buf_ioapply( if (bp->b_flags & XBF_WRITE) { op = REQ_OP_WRITE; if (bp->b_flags & XBF_SYNCIO) - op_flags = WRITE_SYNC; + op_flags = REQ_SYNC; if (bp->b_flags & XBF_FUA) op_flags |= REQ_FUA; if (bp->b_flags & XBF_FLUSH) @@ -1711,8 +1738,7 @@ xfs_free_buftarg( percpu_counter_destroy(&btp->bt_io_count); list_lru_destroy(&btp->bt_lru); - if (mp->m_flags & XFS_MOUNT_BARRIER) - xfs_blkdev_issue_flush(btp); + xfs_blkdev_issue_flush(btp); kmem_free(btp); } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 1c2e52b2d926..8a9d3a9599f0 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -71,6 +71,7 @@ typedef unsigned int xfs_buf_flags_t; { XBF_READ, "READ" }, \ { XBF_WRITE, "WRITE" }, \ { XBF_READ_AHEAD, "READ_AHEAD" }, \ + { XBF_NO_IOACCT, "NO_IOACCT" }, \ { XBF_ASYNC, "ASYNC" }, \ { XBF_DONE, "DONE" }, \ { XBF_STALE, "STALE" }, \ @@ -150,7 +151,7 @@ typedef struct xfs_buf { * which is the only bit that is touched if we hit the semaphore * fast-path on locking. 
*/ - struct rb_node b_rbnode; /* rbtree node */ + struct rhash_head b_rhash_head; /* pag buffer hash node */ xfs_daddr_t b_bn; /* block number of buffer */ int b_length; /* size of buffer in BBs */ atomic_t b_hold; /* reference count */ diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 29816981b50a..003a99b83bd8 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -677,7 +677,6 @@ xfs_readdir( args.dp = dp; args.geo = dp->i_mount->m_dir_geo; - xfs_ilock(dp, XFS_IOLOCK_SHARED); if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) rval = xfs_dir2_sf_getdents(&args, ctx); else if ((rval = xfs_dir2_isblock(&args, &v))) @@ -686,7 +685,6 @@ xfs_readdir( rval = xfs_dir2_block_getdents(&args, ctx); else rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize); - xfs_iunlock(dp, XFS_IOLOCK_SHARED); return rval; } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 7a30b8f11db7..9d06cc30e875 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -710,6 +710,10 @@ xfs_dq_get_next_id( /* Simple advance */ next_id = *id + 1; + /* If we'd wrap past the max ID, stop */ + if (next_id < *id) + return -ENOENT; + /* If new ID is within the current chunk, advancing it sufficed */ if (next_id % mp->m_quotainfo->qi_dqperchunk) { *id = next_id; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 6e4f7f900fea..bbb9eb6811b2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -48,40 +48,6 @@ static const struct vm_operations_struct xfs_file_vm_ops; /* - * Locking primitives for read and write IO paths to ensure we consistently use - * and order the inode->i_mutex, ip->i_lock and ip->i_iolock. - */ -static inline void -xfs_rw_ilock( - struct xfs_inode *ip, - int type) -{ - if (type & XFS_IOLOCK_EXCL) - inode_lock(VFS_I(ip)); - xfs_ilock(ip, type); -} - -static inline void -xfs_rw_iunlock( - struct xfs_inode *ip, - int type) -{ - xfs_iunlock(ip, type); - if (type & XFS_IOLOCK_EXCL) - inode_unlock(VFS_I(ip)); -} - -static inline void -xfs_rw_ilock_demote( - struct xfs_inode *ip, - int type) -{ - xfs_ilock_demote(ip, type); - if (type & XFS_IOLOCK_EXCL) - inode_unlock(VFS_I(ip)); -} - -/* * Clear the specified ranges to zero through either the pagecache or DAX. * Holes and unwritten extents will be left as-is as they already are zeroed. */ @@ -183,19 +149,16 @@ xfs_file_fsync( xfs_iflags_clear(ip, XFS_ITRUNCATED); - if (mp->m_flags & XFS_MOUNT_BARRIER) { - /* - * If we have an RT and/or log subvolume we need to make sure - * to flush the write cache the device used for file data - * first. This is to ensure newly written file data make - * it to disk before logging the new inode size in case of - * an extending write. - */ - if (XFS_IS_REALTIME_INODE(ip)) - xfs_blkdev_issue_flush(mp->m_rtdev_targp); - else if (mp->m_logdev_targp != mp->m_ddev_targp) - xfs_blkdev_issue_flush(mp->m_ddev_targp); - } + /* + * If we have an RT and/or log subvolume we need to make sure to flush + * the write cache the device used for file data first. This is to + * ensure newly written file data make it to disk before logging the new + * inode size in case of an extending write. + */ + if (XFS_IS_REALTIME_INODE(ip)) + xfs_blkdev_issue_flush(mp->m_rtdev_targp); + else if (mp->m_logdev_targp != mp->m_ddev_targp) + xfs_blkdev_issue_flush(mp->m_ddev_targp); /* * All metadata updates are logged, which means that we just have to @@ -230,10 +193,8 @@ xfs_file_fsync( * an already allocated file and thus do not have any metadata to * commit. 
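In the xfs_dq_get_next_id() hunk above, the added test stops quota iteration instead of silently wrapping the unsigned ID back to zero. The idiom in isolation, as a minimal sketch:

	/* sketch: unsigned overflow shows up as the sum becoming smaller */
	u32 next_id = id + 1;

	if (next_id < id)		/* wrapped past the maximum ID */
		return -ENOENT;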
*/ - if ((mp->m_flags & XFS_MOUNT_BARRIER) && - mp->m_logdev_targp == mp->m_ddev_targp && - !XFS_IS_REALTIME_INODE(ip) && - !log_flushed) + if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) && + mp->m_logdev_targp == mp->m_ddev_targp) xfs_blkdev_issue_flush(mp->m_ddev_targp); return error; @@ -244,62 +205,21 @@ xfs_file_dio_aio_read( struct kiocb *iocb, struct iov_iter *to) { - struct address_space *mapping = iocb->ki_filp->f_mapping; - struct inode *inode = mapping->host; - struct xfs_inode *ip = XFS_I(inode); - loff_t isize = i_size_read(inode); + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); size_t count = iov_iter_count(to); - loff_t end = iocb->ki_pos + count - 1; - struct iov_iter data; - struct xfs_buftarg *target; - ssize_t ret = 0; + ssize_t ret; trace_xfs_file_direct_read(ip, count, iocb->ki_pos); if (!count) return 0; /* skip atime */ - if (XFS_IS_REALTIME_INODE(ip)) - target = ip->i_mount->m_rtdev_targp; - else - target = ip->i_mount->m_ddev_targp; - - /* DIO must be aligned to device logical sector size */ - if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { - if (iocb->ki_pos == isize) - return 0; - return -EINVAL; - } - file_accessed(iocb->ki_filp); - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - if (mapping->nrpages) { - ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end); - if (ret) - goto out_unlock; + xfs_ilock(ip, XFS_IOLOCK_SHARED); + ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); - /* - * Invalidate whole pages. This can return an error if we fail - * to invalidate a page, but this should never happen on XFS. - * Warn if it does fail. - */ - ret = invalidate_inode_pages2_range(mapping, - iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); - WARN_ON_ONCE(ret); - ret = 0; - } - - data = *to; - ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, - xfs_get_blocks_direct, NULL, NULL, 0); - if (ret >= 0) { - iocb->ki_pos += ret; - iov_iter_advance(to, ret); - } - -out_unlock: - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -317,9 +237,9 @@ xfs_file_dax_read( if (!count) return 0; /* skip atime */ - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops); - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_ilock(ip, XFS_IOLOCK_SHARED); + ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); file_accessed(iocb->ki_filp); return ret; @@ -335,9 +255,9 @@ xfs_file_buffered_aio_read( trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos); - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + xfs_ilock(ip, XFS_IOLOCK_SHARED); ret = generic_file_read_iter(iocb, to); - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -418,15 +338,18 @@ restart: if (error <= 0) return error; - error = xfs_break_layouts(inode, iolock, true); + error = xfs_break_layouts(inode, iolock); if (error) return error; - /* For changing security info in file_remove_privs() we need i_mutex */ + /* + * For changing security info in file_remove_privs() we need i_rwsem + * exclusively. 
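xfs_file_fsync() above now flushes the data device unconditionally, dropping the XFS_MOUNT_BARRIER gate. xfs_blkdev_issue_flush() itself is not part of this diff; a sketch of what it is assumed to reduce to:

#include <linux/blkdev.h>

/* assumption: a thin wrapper over the block layer's cache flush */
static void demo_blkdev_issue_flush(struct xfs_buftarg *btp)
{
	blkdev_issue_flush(btp->bt_bdev, GFP_NOFS, NULL);
}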
+ */ if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { - xfs_rw_iunlock(ip, *iolock); + xfs_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, *iolock); + xfs_ilock(ip, *iolock); goto restart; } /* @@ -451,9 +374,9 @@ restart: spin_unlock(&ip->i_flags_lock); if (!drained_dio) { if (*iolock == XFS_IOLOCK_SHARED) { - xfs_rw_iunlock(ip, *iolock); + xfs_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, *iolock); + xfs_ilock(ip, *iolock); iov_iter_reexpand(from, count); } /* @@ -496,6 +419,58 @@ restart: return 0; } +static int +xfs_dio_write_end_io( + struct kiocb *iocb, + ssize_t size, + unsigned flags) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_inode *ip = XFS_I(inode); + loff_t offset = iocb->ki_pos; + bool update_size = false; + int error = 0; + + trace_xfs_end_io_direct_write(ip, offset, size); + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + if (size <= 0) + return size; + + /* + * We need to update the in-core inode size here so that we don't end up + * with the on-disk inode size being outside the in-core inode size. We + * have no other method of updating EOF for AIO, so always do it here + * if necessary. + * + * We need to lock the test/set EOF update as we can be racing with + * other IO completions here to update the EOF. Failing to serialise + * here can result in EOF moving backwards and Bad Things Happen when + * that occurs. + */ + spin_lock(&ip->i_flags_lock); + if (offset + size > i_size_read(inode)) { + i_size_write(inode, offset + size); + update_size = true; + } + spin_unlock(&ip->i_flags_lock); + + if (flags & IOMAP_DIO_COW) { + error = xfs_reflink_end_cow(ip, offset, size); + if (error) + return error; + } + + if (flags & IOMAP_DIO_UNWRITTEN) + error = xfs_iomap_write_unwritten(ip, offset, size); + else if (update_size) + error = xfs_setfilesize(ip, offset, size); + + return error; +} + /* * xfs_file_dio_aio_write - handle direct IO writes * @@ -535,9 +510,7 @@ xfs_file_dio_aio_write( int unaligned_io = 0; int iolock; size_t count = iov_iter_count(from); - loff_t end; - struct iov_iter data; - struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? + struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ @@ -559,29 +532,12 @@ xfs_file_dio_aio_write( iolock = XFS_IOLOCK_SHARED; } - xfs_rw_ilock(ip, iolock); + xfs_ilock(ip, iolock); ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; count = iov_iter_count(from); - end = iocb->ki_pos + count - 1; - - if (mapping->nrpages) { - ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end); - if (ret) - goto out; - - /* - * Invalidate whole pages. This can return an error if we fail - * to invalidate a page, but this should never happen on XFS. - * Warn if it does fail. 
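xfs_dio_write_end_io() above treats the in-core size update as a locked test-and-set: two IO completions can race here, and an unserialised update could move EOF backwards. The pattern on its own:

	/* sketch: only ever move the in-core size forward, atomically */
	bool update_size = false;

	spin_lock(&ip->i_flags_lock);
	if (offset + size > i_size_read(inode)) {
		i_size_write(inode, offset + size);
		update_size = true;
	}
	spin_unlock(&ip->i_flags_lock);
	/* update_size then decides whether xfs_setfilesize() runs */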
- */ - ret = invalidate_inode_pages2_range(mapping, - iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); - WARN_ON_ONCE(ret); - ret = 0; - } /* * If we are doing unaligned IO, wait for all other IO to drain, @@ -591,7 +547,7 @@ xfs_file_dio_aio_write( if (unaligned_io) inode_dio_wait(inode); else if (iolock == XFS_IOLOCK_EXCL) { - xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); iolock = XFS_IOLOCK_SHARED; } @@ -604,24 +560,9 @@ xfs_file_dio_aio_write( goto out; } - data = *from; - ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, - xfs_get_blocks_direct, xfs_end_io_direct_write, - NULL, DIO_ASYNC_EXTEND); - - /* see generic_file_direct_write() for why this is necessary */ - if (mapping->nrpages) { - invalidate_inode_pages2_range(mapping, - iocb->ki_pos >> PAGE_SHIFT, - end >> PAGE_SHIFT); - } - - if (ret > 0) { - iocb->ki_pos += ret; - iov_iter_advance(from, ret); - } + ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io); out: - xfs_rw_iunlock(ip, iolock); + xfs_iunlock(ip, iolock); /* * No fallback to buffered IO on errors for XFS, direct IO will either @@ -643,7 +584,7 @@ xfs_file_dax_write( size_t count; loff_t pos; - xfs_rw_ilock(ip, iolock); + xfs_ilock(ip, iolock); ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; @@ -652,15 +593,13 @@ xfs_file_dax_write( count = iov_iter_count(from); trace_xfs_file_dax_write(ip, count, pos); - - ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops); + ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops); if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { i_size_write(inode, iocb->ki_pos); error = xfs_setfilesize(ip, pos, ret); } - out: - xfs_rw_iunlock(ip, iolock); + xfs_iunlock(ip, iolock); return error ? error : ret; } @@ -677,7 +616,7 @@ xfs_file_buffered_aio_write( int enospc = 0; int iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, iolock); + xfs_ilock(ip, iolock); ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) @@ -721,7 +660,7 @@ write_retry: current->backing_dev_info = NULL; out: - xfs_rw_iunlock(ip, iolock); + xfs_iunlock(ip, iolock); return ret; } @@ -797,7 +736,7 @@ xfs_file_fallocate( return -EOPNOTSUPP; xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock, false); + error = xfs_break_layouts(inode, &iolock); if (error) goto out_unlock; @@ -909,24 +848,6 @@ out_unlock: return error; } -STATIC ssize_t -xfs_file_copy_range( - struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - size_t len, - unsigned int flags) -{ - int error; - - error = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, false); - if (error) - return error; - return len; -} - STATIC int xfs_file_clone_range( struct file *file_in, @@ -939,7 +860,6 @@ xfs_file_clone_range( len, false); } -#define XFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) STATIC ssize_t xfs_file_dedupe_range( struct file *src_file, @@ -950,14 +870,6 @@ xfs_file_dedupe_range( { int error; - /* - * Limit the total length we will dedupe for each operation. - * This is intended to bound the total time spent in this - * ioctl to something sane. 
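In the xfs_file_dio_aio_write() hunk above, unaligned IO still drains every other direct IO, but aligned writes taken under an exclusive iolock are demoted so concurrent direct writers can proceed; with the iolock now backed by i_rwsem, the demote becomes a plain downgrade_write(), as the xfs_ilock_demote() hunk further down shows. Recapped:

	/* sketch: exclusive -> shared without ever dropping the lock */
	if (unaligned_io)
		inode_dio_wait(inode);		/* full serialisation */
	else if (iolock == XFS_IOLOCK_EXCL) {
		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
		iolock = XFS_IOLOCK_SHARED;
	}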
- */ - if (len > XFS_MAX_DEDUPE_LEN) - len = XFS_MAX_DEDUPE_LEN; - error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff, len, true); if (error) @@ -1474,7 +1386,7 @@ xfs_filemap_page_mkwrite( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops); + ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops); } else { ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops); ret = block_page_mkwrite_return(ret); @@ -1501,15 +1413,9 @@ xfs_filemap_fault( return xfs_filemap_page_mkwrite(vma, vmf); xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - if (IS_DAX(inode)) { - /* - * we do not want to trigger unwritten extent conversion on read - * faults - that is unnecessary overhead and would also require - * changes to xfs_get_blocks_direct() to map unwritten extent - * ioend for conversion on read-only mappings. - */ - ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops); - } else + if (IS_DAX(inode)) + ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops); + else ret = filemap_fault(vma, vmf); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); @@ -1545,7 +1451,7 @@ xfs_filemap_pmd_fault( } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault); + ret = dax_iomap_pmd_fault(vma, addr, pmd, flags, &xfs_iomap_ops); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (flags & FAULT_FLAG_WRITE) @@ -1625,7 +1531,6 @@ const struct file_operations xfs_file_operations = { .fsync = xfs_file_fsync, .get_unmapped_area = thp_get_unmapped_area, .fallocate = xfs_file_fallocate, - .copy_file_range = xfs_file_copy_range, .clone_file_range = xfs_file_clone_range, .dedupe_file_range = xfs_file_dedupe_range, }; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 93d12fa2670d..242e8091296d 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -631,6 +631,20 @@ xfs_growfs_data_private( xfs_set_low_space_thresholds(mp); mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); + /* + * If we expanded the last AG, free the per-AG reservation + * so we can reinitialize it with the new size. + */ + if (new) { + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, agno); + error = xfs_ag_resv_free(pag); + xfs_perag_put(pag); + if (error) + goto out; + } + /* Reserve AG metadata blocks. 
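The xfs_growfs_data_private() hunk above handles growing the last AG: its per-AG reservation was sized for the old geometry, so it is released first and re-established by the xfs_fs_reserve_ag_blocks() call that follows. Assembled from the hunk:

	if (new) {				/* the last AG grew */
		struct xfs_perag	*pag;

		pag = xfs_perag_get(mp, agno);
		error = xfs_ag_resv_free(pag);	/* drop the stale reservation */
		xfs_perag_put(pag);
		if (error)
			goto out;
	}
	error = xfs_fs_reserve_ag_blocks(mp);	/* re-reserve at the new size */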
*/ error = xfs_fs_reserve_ag_blocks(mp); if (error && error != -ENOSPC) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index f295049db681..70ca4f608321 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -70,8 +70,6 @@ xfs_inode_alloc( ASSERT(!xfs_isiflocked(ip)); ASSERT(ip->i_ino == 0); - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); - /* initialise the xfs inode */ ip->i_ino = ino; ip->i_mount = mp; @@ -123,7 +121,6 @@ __xfs_inode_free( { /* asserts to verify all state is correct here */ ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!xfs_isiflocked(ip)); XFS_STATS_DEC(ip->i_mount, vn_active); call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); @@ -133,6 +130,8 @@ void xfs_inode_free( struct xfs_inode *ip) { + ASSERT(!xfs_isiflocked(ip)); + /* * Because we use RCU freeing we need to ensure the inode always * appears to be reclaimed with an invalid inode number when in the @@ -393,8 +392,8 @@ xfs_iget_cache_hit( xfs_inode_clear_reclaim_tag(pag, ip->i_ino); inode->i_state = I_NEW; - ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + ASSERT(!rwsem_is_locked(&inode->i_rwsem)); + init_rwsem(&inode->i_rwsem); spin_unlock(&ip->i_flags_lock); spin_unlock(&pag->pag_ici_lock); @@ -981,6 +980,7 @@ restart: if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); + /* xfs_iflush_abort() drops the flush lock */ xfs_iflush_abort(ip, false); goto reclaim; } @@ -989,10 +989,10 @@ restart: goto out_ifunlock; xfs_iunpin_wait(ip); } - if (xfs_iflags_test(ip, XFS_ISTALE)) - goto reclaim; - if (xfs_inode_clean(ip)) + if (xfs_iflags_test(ip, XFS_ISTALE) || xfs_inode_clean(ip)) { + xfs_ifunlock(ip); goto reclaim; + } /* * Never flush out dirty data during non-blocking reclaim, as it would @@ -1030,25 +1030,24 @@ restart: xfs_buf_relse(bp); } - xfs_iflock(ip); reclaim: + ASSERT(!xfs_isiflocked(ip)); + /* * Because we use RCU freeing we need to ensure the inode always appears * to be reclaimed with an invalid inode number when in the free state. - * We do this as early as possible under the ILOCK and flush lock so - * that xfs_iflush_cluster() can be guaranteed to detect races with us - * here. By doing this, we guarantee that once xfs_iflush_cluster has - * locked both the XFS_ILOCK and the flush lock that it will see either - * a valid, flushable inode that will serialise correctly against the - * locks below, or it will see a clean (and invalid) inode that it can - * skip. + * We do this as early as possible under the ILOCK so that + * xfs_iflush_cluster() can be guaranteed to detect races with us here. + * By doing this, we guarantee that once xfs_iflush_cluster has locked + * XFS_ILOCK that it will see either a valid, flushable inode that will + * serialise correctly, or it will see a clean (and invalid) inode that + * it can skip. */ spin_lock(&ip->i_flags_lock); ip->i_flags = XFS_IRECLAIM; ip->i_ino = 0; spin_unlock(&ip->i_flags_lock); - xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); XFS_STATS_INC(ip->i_mount, xs_ig_reclaims); @@ -1580,10 +1579,15 @@ xfs_inode_free_cowblocks( struct xfs_eofblocks *eofb = args; bool need_iolock = true; int match; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); - if (!xfs_reflink_has_real_cow_blocks(ip)) { + /* + * Just clear the tag if we have an empty cow fork or none at all. It's + * possible the inode was fully unshared since it was originally tagged. 
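The reclaim hunk above now requires the flush lock to be dropped before teardown and publishes the inode as dead while the ILOCK is still held, so xfs_iflush_cluster()'s RCU lookups see either a valid flushable inode or a clean invalid one. The publication step:

	/* sketch: racing lockless lookups must see an invalid inode
	 * number before the RCU free can happen */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	spin_unlock(&ip->i_flags_lock);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);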
+ */ + if (!xfs_is_reflink_inode(ip) || !ifp->if_bytes) { trace_xfs_inode_free_cowblocks_invalid(ip); xfs_inode_clear_cowblocks_tag(ip); return 0; @@ -1593,7 +1597,8 @@ xfs_inode_free_cowblocks( * If the mapping is dirty or under writeback we cannot touch the * CoW fork. Leave it alone if we're in the midst of a directio. */ - if (mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) || + if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) || + mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) || mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) || atomic_read(&VFS_I(ip)->i_dio_count)) return 0; diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index d45ca72af6fb..865ad1373e5e 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -133,7 +133,7 @@ xfs_icreate_item_committing( /* * This is the ops vector shared by all buf log items. */ -static struct xfs_item_ops xfs_icreate_item_ops = { +static const struct xfs_item_ops xfs_icreate_item_ops = { .iop_size = xfs_icreate_item_size, .iop_format = xfs_icreate_item_format, .iop_pin = xfs_icreate_item_pin, diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 4e560e6a12c1..de32f0fe47c8 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -142,31 +142,31 @@ xfs_ilock_attr_map_shared( } /* - * The xfs inode contains 3 multi-reader locks: the i_iolock the i_mmap_lock and - * the i_lock. This routine allows various combinations of the locks to be - * obtained. + * In addition to i_rwsem in the VFS inode, the xfs inode contains 2 + * multi-reader locks: i_mmap_lock and the i_lock. This routine allows + * various combinations of the locks to be obtained. * * The 3 locks should always be ordered so that the IO lock is obtained first, * the mmap lock second and the ilock last in order to prevent deadlock. * * Basic locking order: * - * i_iolock -> i_mmap_lock -> page_lock -> i_ilock + * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock * * mmap_sem locking order: * - * i_iolock -> page lock -> mmap_sem + * i_rwsem -> page lock -> mmap_sem * mmap_sem -> i_mmap_lock -> page_lock * * The difference in mmap_sem locking order mean that we cannot hold the * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can * fault in pages during copy in/out (for buffered IO) or require the mmap_sem * in get_user_pages() to map the user pages into the kernel address space for - * direct IO. Similarly the i_iolock cannot be taken inside a page fault because + * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because * page faults already hold the mmap_sem. * * Hence to serialise fully against both syscall and mmap based IO, we need to - * take both the i_iolock and the i_mmap_lock. These locks should *only* be both + * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both * taken in places where we need to invalidate the page cache in a race * free manner (e.g. truncate, hole punch and other extent manipulation * functions). 
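The rewritten locking comment above keeps the old discipline with i_rwsem standing in for i_iolock. When a caller needs more than one of these locks, xfs_ilock() accepts the combined flags and acquires them in the documented order; for example (a sketch, not taken from this patch):

	/* order: i_rwsem -> i_mmaplock -> i_lock */
	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL);
	/* ... invalidate the page cache, manipulate extents ... */
	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL | XFS_ILOCK_EXCL);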
@@ -191,10 +191,13 @@ xfs_ilock( (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); - if (lock_flags & XFS_IOLOCK_EXCL) - mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); - else if (lock_flags & XFS_IOLOCK_SHARED) - mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + if (lock_flags & XFS_IOLOCK_EXCL) { + down_write_nested(&VFS_I(ip)->i_rwsem, + XFS_IOLOCK_DEP(lock_flags)); + } else if (lock_flags & XFS_IOLOCK_SHARED) { + down_read_nested(&VFS_I(ip)->i_rwsem, + XFS_IOLOCK_DEP(lock_flags)); + } if (lock_flags & XFS_MMAPLOCK_EXCL) mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); @@ -240,10 +243,10 @@ xfs_ilock_nowait( ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); if (lock_flags & XFS_IOLOCK_EXCL) { - if (!mrtryupdate(&ip->i_iolock)) + if (!down_write_trylock(&VFS_I(ip)->i_rwsem)) goto out; } else if (lock_flags & XFS_IOLOCK_SHARED) { - if (!mrtryaccess(&ip->i_iolock)) + if (!down_read_trylock(&VFS_I(ip)->i_rwsem)) goto out; } @@ -271,9 +274,9 @@ out_undo_mmaplock: mrunlock_shared(&ip->i_mmaplock); out_undo_iolock: if (lock_flags & XFS_IOLOCK_EXCL) - mrunlock_excl(&ip->i_iolock); + up_write(&VFS_I(ip)->i_rwsem); else if (lock_flags & XFS_IOLOCK_SHARED) - mrunlock_shared(&ip->i_iolock); + up_read(&VFS_I(ip)->i_rwsem); out: return 0; } @@ -310,9 +313,9 @@ xfs_iunlock( ASSERT(lock_flags != 0); if (lock_flags & XFS_IOLOCK_EXCL) - mrunlock_excl(&ip->i_iolock); + up_write(&VFS_I(ip)->i_rwsem); else if (lock_flags & XFS_IOLOCK_SHARED) - mrunlock_shared(&ip->i_iolock); + up_read(&VFS_I(ip)->i_rwsem); if (lock_flags & XFS_MMAPLOCK_EXCL) mrunlock_excl(&ip->i_mmaplock); @@ -345,7 +348,7 @@ xfs_ilock_demote( if (lock_flags & XFS_MMAPLOCK_EXCL) mrdemote(&ip->i_mmaplock); if (lock_flags & XFS_IOLOCK_EXCL) - mrdemote(&ip->i_iolock); + downgrade_write(&VFS_I(ip)->i_rwsem); trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); } @@ -370,8 +373,9 @@ xfs_isilocked( if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { if (!(lock_flags & XFS_IOLOCK_SHARED)) - return !!ip->i_iolock.mr_writer; - return rwsem_is_locked(&ip->i_iolock.mr_lock); + return !debug_locks || + lockdep_is_held_type(&VFS_I(ip)->i_rwsem, 0); + return rwsem_is_locked(&VFS_I(ip)->i_rwsem); } ASSERT(0); @@ -421,11 +425,7 @@ xfs_lock_inumorder(int lock_mode, int subclass) if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS); - ASSERT(xfs_lockdep_subclass_ok(subclass + - XFS_IOLOCK_PARENT_VAL)); class += subclass << XFS_IOLOCK_SHIFT; - if (lock_mode & XFS_IOLOCK_PARENT) - class += XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT; } if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) { @@ -477,8 +477,6 @@ xfs_lock_inodes( XFS_ILOCK_EXCL)); ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED | XFS_ILOCK_SHARED))); - ASSERT(!(lock_mode & XFS_IOLOCK_EXCL) || - inodes <= XFS_IOLOCK_MAX_SUBCLASS + 1); ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) || inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1); ASSERT(!(lock_mode & XFS_ILOCK_EXCL) || @@ -581,10 +579,8 @@ xfs_lock_two_inodes( int attempts = 0; xfs_log_item_t *lp; - if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { - ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))); - ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); - } else if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) + ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))); + if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ASSERT(!(lock_mode & 
(XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); ASSERT(ip0->i_ino != ip1->i_ino); @@ -715,7 +711,6 @@ xfs_lookup( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; - xfs_ilock(dp, XFS_IOLOCK_SHARED); error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); if (error) goto out_unlock; @@ -724,14 +719,12 @@ xfs_lookup( if (error) goto out_free_name; - xfs_iunlock(dp, XFS_IOLOCK_SHARED); return 0; out_free_name: if (ci_name) kmem_free(ci_name->name); out_unlock: - xfs_iunlock(dp, XFS_IOLOCK_SHARED); *ipp = NULL; return error; } @@ -1215,8 +1208,7 @@ xfs_create( if (error) goto out_release_inode; - xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | - XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; xfs_defer_init(&dfops, &first_block); @@ -1252,7 +1244,7 @@ xfs_create( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = false; error = xfs_dir_createname(tp, dp, name, ip->i_ino, @@ -1325,7 +1317,7 @@ xfs_create( xfs_qm_dqrele(pdqp); if (unlock_dp_on_error) - xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; } @@ -1466,11 +1458,10 @@ xfs_link( if (error) goto std_return; - xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, tdp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); /* * If we are using project inheritance, we only allow hard link @@ -1801,22 +1792,23 @@ xfs_inactive_ifree( int error; /* - * The ifree transaction might need to allocate blocks for record - * insertion to the finobt. We don't want to fail here at ENOSPC, so - * allow ifree to dip into the reserved block pool if necessary. - * - * Freeing large sets of inodes generally means freeing inode chunks, - * directory and file data blocks, so this should be relatively safe. - * Only under severe circumstances should it be possible to free enough - * inodes to exhaust the reserve block pool via finobt expansion while - * at the same time not creating free space in the filesystem. + * We try to use a per-AG reservation for any block needed by the finobt + * tree, but as the finobt feature predates the per-AG reservation + * support a degraded file system might not have enough space for the + * reservation at mount time. In that case try to dip into the reserved + * pool and pray. * * Send a warning if the reservation does happen to fail, as the inode * now remains allocated and sits on the unlinked list until the fs is * repaired. 
*/ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, - XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); + if (unlikely(mp->m_inotbt_nores)) { + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, + XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, + &tp); + } else { + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp); + } if (error) { if (error == -ENOSPC) { xfs_warn_ratelimited(mp, @@ -2041,7 +2033,6 @@ xfs_iunlink( agi->agi_unlinked[bucket_index] = cpu_to_be32(agino); offset = offsetof(xfs_agi_t, agi_unlinked) + (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF); xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); return 0; @@ -2133,7 +2124,6 @@ xfs_iunlink_remove( agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino); offset = offsetof(xfs_agi_t, agi_unlinked) + (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF); xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); } else { @@ -2579,10 +2569,9 @@ xfs_remove( goto std_return; } - xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* @@ -2963,12 +2952,6 @@ xfs_rename( * whether the target directory is the same as the source * directory, we can lock from 2 to 4 inodes. */ - if (!new_parent) - xfs_ilock(src_dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); - else - xfs_lock_two_inodes(src_dp, target_dp, - XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); - xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); /* @@ -2976,9 +2959,9 @@ xfs_rename( * we can rely on either trans_commit or trans_cancel to unlock * them. */ - xfs_trans_ijoin(tp, src_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); if (new_parent) - xfs_trans_ijoin(tp, target_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); if (target_ip) xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index f14c1de2549d..10dcf27b4c85 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -56,7 +56,6 @@ typedef struct xfs_inode { /* Transaction and locking information. */ struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ - mrlock_t i_iolock; /* inode IO lock */ mrlock_t i_mmaplock; /* inode mmap IO lock */ atomic_t i_pincount; /* inode pin count */ spinlock_t i_flags_lock; /* inode i_flags lock */ @@ -246,6 +245,11 @@ static inline bool xfs_is_reflink_inode(struct xfs_inode *ip) * Synchronize processes attempting to flush the in-core inode back to disk. */ +static inline int xfs_isiflocked(struct xfs_inode *ip) +{ + return xfs_iflags_test(ip, XFS_IFLOCK); +} + extern void __xfs_iflock(struct xfs_inode *ip); static inline int xfs_iflock_nowait(struct xfs_inode *ip) @@ -261,16 +265,12 @@ static inline void xfs_iflock(struct xfs_inode *ip) static inline void xfs_ifunlock(struct xfs_inode *ip) { + ASSERT(xfs_isiflocked(ip)); xfs_iflags_clear(ip, XFS_IFLOCK); smp_mb(); wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT); } -static inline int xfs_isiflocked(struct xfs_inode *ip) -{ - return xfs_iflags_test(ip, XFS_IFLOCK); -} - /* * Flags for inode locking. 
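In the xfs_inactive_ifree() hunk above, the block reservation becomes conditional: with a working per-AG finobt reservation the transaction needs no extra blocks, and only the degraded case, flagged by m_inotbt_nores (per the xfs_mount.h hunk below, set when no per-AG finobt reservation is available), dips into the reserve pool. As straight-line code:

	/* post-patch logic, assembled from the hunk above */
	if (unlikely(mp->m_inotbt_nores))
		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
				XFS_IFREE_SPACE_RES(mp), 0,
				XFS_TRANS_RESERVE, &tp);
	else
		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
				0, 0, 0, &tp);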
* Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) @@ -332,7 +332,7 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) * IOLOCK values * * 0-3 subclass value - * 4-7 PARENT subclass values + * 4-7 unused * * MMAPLOCK values * @@ -347,10 +347,8 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) * */ #define XFS_IOLOCK_SHIFT 16 -#define XFS_IOLOCK_PARENT_VAL 4 -#define XFS_IOLOCK_MAX_SUBCLASS (XFS_IOLOCK_PARENT_VAL - 1) +#define XFS_IOLOCK_MAX_SUBCLASS 3 #define XFS_IOLOCK_DEP_MASK 0x000f0000 -#define XFS_IOLOCK_PARENT (XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT) #define XFS_MMAPLOCK_SHIFT 20 #define XFS_MMAPLOCK_NUMORDER 0 diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 9610e9c00952..d90e7811ccdd 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -164,7 +164,7 @@ xfs_inode_item_format_data_fork( struct xfs_bmbt_rec *p; ASSERT(ip->i_df.if_u1.if_extents != NULL); - ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0); + ASSERT(xfs_iext_count(&ip->i_df) > 0); p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT); data_bytes = xfs_iextents_copy(ip, p, XFS_DATA_FORK); @@ -261,7 +261,7 @@ xfs_inode_item_format_attr_fork( ip->i_afp->if_bytes > 0) { struct xfs_bmbt_rec *p; - ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) == + ASSERT(xfs_iext_count(ip->i_afp) == ip->i_d.di_anextents); ASSERT(ip->i_afp->if_u1.if_extents != NULL); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index c245bed3249b..c67cfb451fd3 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -287,7 +287,7 @@ xfs_readlink_by_handle( return PTR_ERR(dentry); /* Restrict this handle operation to symlinks only. */ - if (!d_inode(dentry)->i_op->readlink) { + if (!d_is_symlink(dentry)) { error = -EINVAL; goto out_dput; } @@ -297,7 +297,7 @@ xfs_readlink_by_handle( goto out_dput; } - error = d_inode(dentry)->i_op->readlink(dentry, hreq->ohandle, olen); + error = vfs_readlink(dentry, hreq->ohandle, olen); out_dput: dput(dentry); @@ -639,7 +639,7 @@ xfs_ioc_space( return error; xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock, false); + error = xfs_break_layouts(inode, &iolock); if (error) goto out_unlock; @@ -910,16 +910,14 @@ xfs_ioc_fsgetxattr( if (attr) { if (ip->i_afp) { if (ip->i_afp->if_flags & XFS_IFEXTENTS) - fa.fsx_nextents = ip->i_afp->if_bytes / - sizeof(xfs_bmbt_rec_t); + fa.fsx_nextents = xfs_iext_count(ip->i_afp); else fa.fsx_nextents = ip->i_d.di_anextents; } else fa.fsx_nextents = 0; } else { if (ip->i_df.if_flags & XFS_IFEXTENTS) - fa.fsx_nextents = ip->i_df.if_bytes / - sizeof(xfs_bmbt_rec_t); + fa.fsx_nextents = xfs_iext_count(&ip->i_df); else fa.fsx_nextents = ip->i_d.di_nextents; } diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 321f57721b92..7c49938c5aed 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -19,7 +19,7 @@ #include <linux/ioctl.h> #include <linux/mount.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "xfs.h" #include "xfs_fs.h" #include "xfs_format.h" diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 436e109bb01e..1aa3abd67b36 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -395,11 +395,12 @@ xfs_iomap_prealloc_size( struct xfs_inode *ip, loff_t offset, loff_t count, - xfs_extnum_t idx, - struct xfs_bmbt_irec *prev) + xfs_extnum_t idx) { struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + struct xfs_bmbt_irec prev; int 
shift = 0; int64_t freesp; xfs_fsblock_t qblocks; @@ -419,8 +420,8 @@ xfs_iomap_prealloc_size( */ if ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) || XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) || - idx == 0 || - prev->br_startoff + prev->br_blockcount < offset_fsb) + !xfs_iext_get_extent(ifp, idx - 1, &prev) || + prev.br_startoff + prev.br_blockcount < offset_fsb) return mp->m_writeio_blocks; /* @@ -439,8 +440,8 @@ xfs_iomap_prealloc_size( * always extends to MAXEXTLEN rather than falling short due to things * like stripe unit/width alignment of real extents. */ - if (prev->br_blockcount <= (MAXEXTLEN >> 1)) - alloc_blocks = prev->br_blockcount << 1; + if (prev.br_blockcount <= (MAXEXTLEN >> 1)) + alloc_blocks = prev.br_blockcount << 1; else alloc_blocks = XFS_B_TO_FSB(mp, offset); if (!alloc_blocks) @@ -535,11 +536,11 @@ xfs_file_iomap_begin_delay( xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t maxbytes_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); - xfs_fileoff_t end_fsb, orig_end_fsb; + xfs_fileoff_t end_fsb; int error = 0, eof = 0; struct xfs_bmbt_irec got; - struct xfs_bmbt_irec prev; xfs_extnum_t idx; + xfs_fsblock_t prealloc_blocks = 0; ASSERT(!XFS_IS_REALTIME_INODE(ip)); ASSERT(!xfs_get_extsz_hint(ip)); @@ -563,8 +564,7 @@ xfs_file_iomap_begin_delay( goto out_unlock; } - xfs_bmap_search_extents(ip, offset_fsb, XFS_DATA_FORK, &eof, &idx, - &got, &prev); + eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got); if (!eof && got.br_startoff <= offset_fsb) { if (xfs_is_reflink_inode(ip)) { bool shared; @@ -595,35 +595,32 @@ xfs_file_iomap_begin_delay( * the lower level functions are updated. */ count = min_t(loff_t, count, 1024 * PAGE_SIZE); - end_fsb = orig_end_fsb = - min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); + end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); if (eof) { - xfs_fsblock_t prealloc_blocks; - - prealloc_blocks = - xfs_iomap_prealloc_size(ip, offset, count, idx, &prev); + prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count, idx); if (prealloc_blocks) { xfs_extlen_t align; xfs_off_t end_offset; + xfs_fileoff_t p_end_fsb; end_offset = XFS_WRITEIO_ALIGN(mp, offset + count - 1); - end_fsb = XFS_B_TO_FSBT(mp, end_offset) + - prealloc_blocks; + p_end_fsb = XFS_B_TO_FSBT(mp, end_offset) + + prealloc_blocks; align = xfs_eof_alignment(ip, 0); if (align) - end_fsb = roundup_64(end_fsb, align); + p_end_fsb = roundup_64(p_end_fsb, align); - end_fsb = min(end_fsb, maxbytes_fsb); - ASSERT(end_fsb > offset_fsb); + p_end_fsb = min(p_end_fsb, maxbytes_fsb); + ASSERT(p_end_fsb > offset_fsb); + prealloc_blocks = p_end_fsb - end_fsb; } } retry: error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb, - end_fsb - offset_fsb, &got, - &prev, &idx, eof); + end_fsb - offset_fsb, prealloc_blocks, &got, &idx, eof); switch (error) { case 0: break; @@ -631,8 +628,8 @@ retry: case -EDQUOT: /* retry without any preallocation */ trace_xfs_delalloc_enospc(ip, offset, count); - if (end_fsb != orig_end_fsb) { - end_fsb = orig_end_fsb; + if (prealloc_blocks) { + prealloc_blocks = 0; goto retry; } /*FALLTHRU*/ @@ -640,13 +637,6 @@ retry: goto out_unlock; } - /* - * Tag the inode as speculatively preallocated so we can reclaim this - * space on demand, if necessary. 
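xfs_iomap_prealloc_size() above now fetches the preceding extent itself via xfs_iext_get_extent(ifp, idx - 1, &prev) rather than receiving it from the caller; the sizing rule itself is unchanged. In isolation:

	/* sketch: speculative preallocation roughly doubles the extent
	 * behind the write until the MAXEXTLEN cap takes over */
	if (prev.br_blockcount <= (MAXEXTLEN >> 1))
		alloc_blocks = prev.br_blockcount << 1;
	else
		alloc_blocks = XFS_B_TO_FSB(mp, offset);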
- */ - if (end_fsb != orig_end_fsb) - xfs_inode_set_eofblocks_tag(ip); - trace_xfs_iomap_alloc(ip, offset, count, 0, &got); done: if (isnullstartblock(got.br_startblock)) @@ -691,7 +681,7 @@ xfs_iomap_write_allocate( xfs_trans_t *tp; int nimaps; int error = 0; - int flags = 0; + int flags = XFS_BMAPI_DELALLOC; int nres; if (whichfork == XFS_COW_FORK) @@ -960,6 +950,19 @@ static inline bool imap_needs_alloc(struct inode *inode, (IS_DAX(inode) && ISUNWRITTEN(imap)); } +static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags) +{ + /* + * COW writes will allocate delalloc space, so we need to make sure + * to take the lock exclusively here. + */ + if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO))) + return true; + if ((flags & IOMAP_DIRECT) && (flags & IOMAP_WRITE)) + return true; + return false; +} + static int xfs_file_iomap_begin( struct inode *inode, @@ -979,18 +982,14 @@ xfs_file_iomap_begin( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if ((flags & IOMAP_WRITE) && !IS_DAX(inode) && - !xfs_get_extsz_hint(ip)) { + if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) && + !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) { /* Reserve delalloc blocks for regular writeback. */ return xfs_file_iomap_begin_delay(inode, offset, length, flags, iomap); } - /* - * COW writes will allocate delalloc space, so we need to make sure - * to take the lock exclusively here. - */ - if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { + if (need_excl_ilock(ip, flags)) { lockmode = XFS_ILOCK_EXCL; xfs_ilock(ip, XFS_ILOCK_EXCL); } else { @@ -1003,17 +1002,41 @@ xfs_file_iomap_begin( offset_fsb = XFS_B_TO_FSBT(mp, offset); end_fsb = XFS_B_TO_FSB(mp, offset + length); + if (xfs_is_reflink_inode(ip) && + (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT)) { + shared = xfs_reflink_find_cow_mapping(ip, offset, &imap); + if (shared) { + xfs_iunlock(ip, lockmode); + goto alloc_done; + } + ASSERT(!isnullstartblock(imap.br_startblock)); + } + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, 0); if (error) goto out_unlock; - if (flags & IOMAP_REPORT) { + if ((flags & IOMAP_REPORT) || + (xfs_is_reflink_inode(ip) && + (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT))) { /* Trim the mapping to the nearest shared extent boundary. */ error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed); if (error) goto out_unlock; + + /* + * We're here because we're trying to do a directio write to a + * region that isn't aligned to a filesystem block. If the + * extent is shared, fall back to buffered mode to handle the + * RMW. 
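The comment above describes bouncing a misaligned direct write on a shared extent back to buffered IO; the code just below returns -EREMCHG for that case. The write path is assumed to consume the code roughly like this (the caller is not shown in this diff):

	/* assumption: caller-side handling of the -EREMCHG bounce */
	ret = xfs_file_dio_aio_write(iocb, from);
	if (ret == -EREMCHG)
		ret = xfs_file_buffered_aio_write(iocb, from);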
+ */ + if (!(flags & IOMAP_REPORT) && shared) { + trace_xfs_reflink_bounce_dio_write(ip, &imap); + error = -EREMCHG; + goto out_unlock; + } } if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { @@ -1048,6 +1071,7 @@ xfs_file_iomap_begin( if (error) return error; +alloc_done: iomap->flags = IOMAP_F_NEW; trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); } else { diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 405a65cd9d6b..22c16155f1b4 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -98,12 +98,27 @@ xfs_init_security( static void xfs_dentry_to_name( struct xfs_name *namep, + struct dentry *dentry) +{ + namep->name = dentry->d_name.name; + namep->len = dentry->d_name.len; + namep->type = XFS_DIR3_FT_UNKNOWN; +} + +static int +xfs_dentry_mode_to_name( + struct xfs_name *namep, struct dentry *dentry, int mode) { namep->name = dentry->d_name.name; namep->len = dentry->d_name.len; - namep->type = xfs_mode_to_ftype[(mode & S_IFMT) >> S_SHIFT]; + namep->type = xfs_mode_to_ftype(mode); + + if (unlikely(namep->type == XFS_DIR3_FT_UNKNOWN)) + return -EFSCORRUPTED; + + return 0; } STATIC void @@ -119,7 +134,7 @@ xfs_cleanup_inode( * xfs_init_security we must back out. * ENOSPC can hit here, among other things. */ - xfs_dentry_to_name(&teardown, dentry, 0); + xfs_dentry_to_name(&teardown, dentry); xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); } @@ -154,8 +169,12 @@ xfs_generic_create( if (error) return error; + /* Verify mode is valid also for tmpfile case */ + error = xfs_dentry_mode_to_name(&name, dentry, mode); + if (unlikely(error)) + goto out_free_acl; + if (!tmpfile) { - xfs_dentry_to_name(&name, dentry, mode); error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); } else { error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip); @@ -248,7 +267,7 @@ xfs_vn_lookup( if (dentry->d_name.len >= MAXNAMELEN) return ERR_PTR(-ENAMETOOLONG); - xfs_dentry_to_name(&name, dentry, 0); + xfs_dentry_to_name(&name, dentry); error = xfs_lookup(XFS_I(dir), &name, &cip, NULL); if (unlikely(error)) { if (unlikely(error != -ENOENT)) @@ -275,7 +294,7 @@ xfs_vn_ci_lookup( if (dentry->d_name.len >= MAXNAMELEN) return ERR_PTR(-ENAMETOOLONG); - xfs_dentry_to_name(&xname, dentry, 0); + xfs_dentry_to_name(&xname, dentry); error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name); if (unlikely(error)) { if (unlikely(error != -ENOENT)) @@ -310,7 +329,9 @@ xfs_vn_link( struct xfs_name name; int error; - xfs_dentry_to_name(&name, dentry, inode->i_mode); + error = xfs_dentry_mode_to_name(&name, dentry, inode->i_mode); + if (unlikely(error)) + return error; error = xfs_link(XFS_I(dir), XFS_I(inode), &name); if (unlikely(error)) @@ -329,7 +350,7 @@ xfs_vn_unlink( struct xfs_name name; int error; - xfs_dentry_to_name(&name, dentry, 0); + xfs_dentry_to_name(&name, dentry); error = xfs_remove(XFS_I(dir), &name, XFS_I(d_inode(dentry))); if (error) @@ -359,7 +380,9 @@ xfs_vn_symlink( mode = S_IFLNK | (irix_symlink_mode ? 
0777 & ~current_umask() : S_IRWXUGO); - xfs_dentry_to_name(&name, dentry, mode); + error = xfs_dentry_mode_to_name(&name, dentry, mode); + if (unlikely(error)) + goto out; error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip); if (unlikely(error)) @@ -395,6 +418,7 @@ xfs_vn_rename( { struct inode *new_inode = d_inode(ndentry); int omode = 0; + int error; struct xfs_name oname; struct xfs_name nname; @@ -405,8 +429,14 @@ xfs_vn_rename( if (flags & RENAME_EXCHANGE) omode = d_inode(ndentry)->i_mode; - xfs_dentry_to_name(&oname, odentry, omode); - xfs_dentry_to_name(&nname, ndentry, d_inode(odentry)->i_mode); + error = xfs_dentry_mode_to_name(&oname, odentry, omode); + if (omode && unlikely(error)) + return error; + + error = xfs_dentry_mode_to_name(&nname, ndentry, + d_inode(odentry)->i_mode); + if (unlikely(error)) + return error; return xfs_rename(XFS_I(odir), &oname, XFS_I(d_inode(odentry)), XFS_I(ndir), &nname, @@ -983,15 +1013,13 @@ xfs_vn_setattr( struct xfs_inode *ip = XFS_I(d_inode(dentry)); uint iolock = XFS_IOLOCK_EXCL; - xfs_ilock(ip, iolock); - error = xfs_break_layouts(d_inode(dentry), &iolock, true); - if (!error) { - xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - iolock |= XFS_MMAPLOCK_EXCL; + error = xfs_break_layouts(d_inode(dentry), &iolock); + if (error) + return error; - error = xfs_vn_setattr_size(dentry, iattr); - } - xfs_iunlock(ip, iolock); + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + error = xfs_vn_setattr_size(dentry, iattr); + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); } else { error = xfs_vn_setattr_nonsize(dentry, iattr); } @@ -1122,7 +1150,6 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { }; static const struct inode_operations xfs_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = xfs_vn_get_link, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, @@ -1131,7 +1158,6 @@ static const struct inode_operations xfs_symlink_inode_operations = { }; static const struct inode_operations xfs_inline_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = xfs_vn_get_link_inline, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 68640fb63a54..7a989de224f4 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -78,11 +78,12 @@ typedef __u32 xfs_nlink_t; #include <linux/freezer.h> #include <linux/list_sort.h> #include <linux/ratelimit.h> +#include <linux/rhashtable.h> #include <asm/page.h> #include <asm/div64.h> #include <asm/param.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/byteorder.h> #include <asm/unaligned.h> @@ -330,11 +331,11 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) } #define ASSERT_ALWAYS(expr) \ - (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) + (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) #ifdef DEBUG #define ASSERT(expr) \ - (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) + (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) #ifndef STATIC # define STATIC noinline @@ -345,7 +346,7 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) #ifdef XFS_WARN #define ASSERT(expr) \ - (unlikely(expr) ? (void)0 : asswarn(#expr, __FILE__, __LINE__)) + (likely(expr) ? 
(void)0 : asswarn(#expr, __FILE__, __LINE__)) #ifndef STATIC # define STATIC static noinline diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3b74fa011bb1..b1469f0a91a6 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1668,7 +1668,7 @@ xlog_cksum( __uint32_t crc; /* first generate the crc for the record header ... */ - crc = xfs_start_cksum((char *)rhead, + crc = xfs_start_cksum_update((char *)rhead, sizeof(struct xlog_rec_header), offsetof(struct xlog_rec_header, h_crc)); @@ -1862,26 +1862,21 @@ xlog_sync( bp->b_io_length = BTOBB(count); bp->b_fspriv = iclog; - bp->b_flags &= ~(XBF_FUA | XBF_FLUSH); - bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE); + bp->b_flags &= ~XBF_FLUSH; + bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA); - if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) { - bp->b_flags |= XBF_FUA; - - /* - * Flush the data device before flushing the log to make - * sure all meta data written back from the AIL actually made - * it to disk before stamping the new log tail LSN into the - * log buffer. For an external log we need to issue the - * flush explicitly, and unfortunately synchronously here; - * for an internal log we can simply use the block layer - * state machine for preflushes. - */ - if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp) - xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); - else - bp->b_flags |= XBF_FLUSH; - } + /* + * Flush the data device before flushing the log to make sure all meta + * data written back from the AIL actually made it to disk before + * stamping the new log tail LSN into the log buffer. For an external + * log we need to issue the flush explicitly, and unfortunately + * synchronously here; for an internal log we can simply use the block + * layer state machine for preflushes. 
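xlog_sync() above now sets XBF_FUA unconditionally, and the earlier _xfs_buf_ioapply() hunk maps XBF_SYNCIO to REQ_SYNC. At the bio layer the combination is assumed to come out roughly as:

	/* assumption: flag mapping for a synchronous log write;
	 * internal_log is a hypothetical stand-in for the XBF_FLUSH case */
	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_FUA;
	if (internal_log)
		bio->bi_opf |= REQ_PREFLUSH;
	submit_bio(bio);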
+ */ + if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp) + xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); + else + bp->b_flags |= XBF_FLUSH; ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); @@ -1906,10 +1901,8 @@ xlog_sync( xfs_buf_associate_memory(bp, (char *)&iclog->ic_header + count, split); bp->b_fspriv = iclog; - bp->b_flags &= ~(XBF_FUA | XBF_FLUSH); - bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE); - if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) - bp->b_flags |= XBF_FUA; + bp->b_flags &= ~XBF_FLUSH; + bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA); ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); @@ -3324,12 +3317,8 @@ xfs_log_force( xfs_mount_t *mp, uint flags) { - int error; - trace_xfs_log_force(mp, 0, _RET_IP_); - error = _xfs_log_force(mp, flags, NULL); - if (error) - xfs_warn(mp, "%s: error %d returned.", __func__, error); + _xfs_log_force(mp, flags, NULL); } /* @@ -3473,12 +3462,8 @@ xfs_log_force_lsn( xfs_lsn_t lsn, uint flags) { - int error; - trace_xfs_log_force(mp, lsn, _RET_IP_); - error = _xfs_log_force_lsn(mp, lsn, flags, NULL); - if (error) - xfs_warn(mp, "%s: error %d returned.", __func__, error); + _xfs_log_force_lsn(mp, lsn, flags, NULL); } /* diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 9b3d7c76915d..4a98762ec8b4 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2025,7 +2025,7 @@ xlog_peek_buffer_cancelled( struct xlog *log, xfs_daddr_t blkno, uint len, - ushort flags) + unsigned short flags) { struct list_head *bucket; struct xfs_buf_cancel *bcp; @@ -2065,7 +2065,7 @@ xlog_check_buffer_cancelled( struct xlog *log, xfs_daddr_t blkno, uint len, - ushort flags) + unsigned short flags) { struct xfs_buf_cancel *bcp; @@ -5113,19 +5113,21 @@ xlog_recover_process( struct list_head *buffer_list) { int error; + __le32 old_crc = rhead->h_crc; __le32 crc; + crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); /* * Nothing else to do if this is a CRC verification pass. Just return * if this a record with a non-zero crc. Unfortunately, mkfs always - * sets h_crc to 0 so we must consider this valid even on v5 supers. + * sets old_crc to 0 so we must consider this valid even on v5 supers. * Otherwise, return EFSBADCRC on failure so the callers up the stack * know precisely what failed. */ if (pass == XLOG_RECOVER_CRCPASS) { - if (rhead->h_crc && crc != rhead->h_crc) + if (old_crc && crc != old_crc) return -EFSBADCRC; return 0; } @@ -5136,11 +5138,11 @@ xlog_recover_process( * zero CRC check prevents warnings from being emitted when upgrading * the kernel from one that does not add CRCs by default. 
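The recovery hunk above stashes old_crc before calling xlog_cksum() because the renamed helper, xfs_start_cksum_update(), is assumed to zero the CRC field in place before checksumming; the saved copy is what later gets compared and reported. A sketch of such a helper:

	/* assumption: the "update" variant clears the stored CRC first */
	static inline __uint32_t
	demo_start_cksum_update(char *buffer, size_t length,
				unsigned long crc_offset)
	{
		*(__le32 *)(buffer + crc_offset) = 0;
		return crc32c(XFS_CRC_SEED, buffer, length);
	}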
*/ - if (crc != rhead->h_crc) { - if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { + if (crc != old_crc) { + if (old_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { xfs_alert(log->l_mp, "log record CRC mismatch: found 0x%x, expected 0x%x.", - le32_to_cpu(rhead->h_crc), + le32_to_cpu(old_crc), le32_to_cpu(crc)); xfs_hex_dump(dp, 32); } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b341f10cf481..9b9540db17a6 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -157,6 +157,7 @@ xfs_free_perag( spin_unlock(&mp->m_perag_lock); ASSERT(pag); ASSERT(atomic_read(&pag->pag_ref) == 0); + xfs_buf_hash_destroy(pag); call_rcu(&pag->rcu_head, __xfs_free_perag); } } @@ -212,8 +213,8 @@ xfs_initialize_perag( spin_lock_init(&pag->pag_ici_lock); mutex_init(&pag->pag_ici_reclaim_lock); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); - spin_lock_init(&pag->pag_buf_lock); - pag->pag_buf_tree = RB_ROOT; + if (xfs_buf_hash_init(pag)) + goto out_unwind; if (radix_tree_preload(GFP_NOFS)) goto out_unwind; @@ -239,9 +240,11 @@ xfs_initialize_perag( return 0; out_unwind: + xfs_buf_hash_destroy(pag); kmem_free(pag); for (; index > first_initialised; index--) { pag = radix_tree_delete(&mp->m_perag_tree, index); + xfs_buf_hash_destroy(pag); kmem_free(pag); } return error; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 819b80b15bfb..7f351f706b7a 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -140,6 +140,7 @@ typedef struct xfs_mount { int m_fixedfsid[2]; /* unchanged for life of FS */ uint m_dmevmask; /* DMI events for this FS */ __uint64_t m_flags; /* global mount flags */ + bool m_inotbt_nores; /* no per-AG finobt resv. */ int m_ialloc_inos; /* inodes in inode allocation */ int m_ialloc_blks; /* blocks in inode allocation */ int m_ialloc_min_blks;/* min blocks in sparse inode @@ -393,8 +394,8 @@ typedef struct xfs_perag { unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ /* buffer cache index */ - spinlock_t pag_buf_lock; /* lock for pag_buf_tree */ - struct rb_root pag_buf_tree; /* ordered tree of active buffers */ + spinlock_t pag_buf_lock; /* lock for pag_buf_hash */ + struct rhashtable pag_buf_hash; /* for rcu-safe freeing */ struct rcu_head rcu_head; @@ -424,6 +425,9 @@ xfs_perag_resv( } } +int xfs_buf_hash_init(xfs_perag_t *pag); +void xfs_buf_hash_destroy(xfs_perag_t *pag); + extern void xfs_uuid_table_free(void); extern int xfs_log_sbcount(xfs_mount_t *); extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 93a7aafa56d6..2f2dc3c09ad0 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -32,8 +32,7 @@ int xfs_break_layouts( struct inode *inode, - uint *iolock, - bool with_imutex) + uint *iolock) { struct xfs_inode *ip = XFS_I(inode); int error; @@ -42,12 +41,8 @@ xfs_break_layouts( while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { xfs_iunlock(ip, *iolock); - if (with_imutex && (*iolock & XFS_IOLOCK_EXCL)) - inode_unlock(inode); error = break_layout(inode, true); *iolock = XFS_IOLOCK_EXCL; - if (with_imutex) - inode_lock(inode); xfs_ilock(ip, *iolock); } diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h index e8339f74966b..b587cb99b2b7 100644 --- a/fs/xfs/xfs_pnfs.h +++ b/fs/xfs/xfs_pnfs.h @@ -8,10 +8,10 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length, int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps, struct iattr *iattr); -int xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex); +int 
xfs_break_layouts(struct inode *inode, uint *iolock); #else static inline int -xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex) +xfs_break_layouts(struct inode *inode, uint *iolock) { return 0; } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index a60d9e2739d1..b669b123287b 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1135,7 +1135,7 @@ xfs_qm_get_rtblks( return error; } rtblks = 0; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); for (idx = 0; idx < nextents; idx++) rtblks += xfs_bmbt_get_blockcount(xfs_iext_get_ext(ifp, idx)); *O_rtblks = (xfs_qcnt_t)rtblks; @@ -1177,7 +1177,8 @@ xfs_qm_dqusage_adjust( * the case in all other instances. It's OK that we do this because * quotacheck is done only at mount time. */ - error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip); + error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, XFS_ILOCK_EXCL, + &ip); if (error) { *res = BULKSTAT_RV_NOTHING; return error; diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index fe86a668a57e..6e4c7446c3d4 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -526,13 +526,14 @@ xfs_cui_recover( xfs_refcount_finish_one_cleanup(tp, rcur, error); error = xfs_defer_finish(&tp, &dfops, NULL); if (error) - goto abort_error; + goto abort_defer; set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); error = xfs_trans_commit(tp); return error; abort_error: xfs_refcount_finish_one_cleanup(tp, rcur, error); +abort_defer: xfs_defer_cancel(&dfops); xfs_trans_cancel(tp); return error; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index a279b4e7f5fe..07593a362cd0 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -243,12 +243,11 @@ xfs_reflink_reserve_cow( struct xfs_bmbt_irec *imap, bool *shared) { - struct xfs_bmbt_irec got, prev; - xfs_fileoff_t end_fsb, orig_end_fsb; - int eof = 0, error = 0; - bool trimmed; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_bmbt_irec got; + int error = 0; + bool eof = false, trimmed; xfs_extnum_t idx; - xfs_extlen_t align; /* * Search the COW fork extent list first. This serves two purposes: @@ -258,8 +257,9 @@ xfs_reflink_reserve_cow( * extent list is generally faster than going out to the shared extent * tree. 
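xfs_reflink_reserve_cow() above is the first of several functions in this diff converted from xfs_bmap_search_extents() to the slimmer incore-extent API (the lookup itself lands just below). The general shape, reused in the CoW cancel/end hunks that follow:

	/* sketch: walk a fork's incore extents from offset_fsb onward */
	struct xfs_bmbt_irec	got;
	xfs_extnum_t		idx;

	if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
		return 0;		/* hole at and after the offset */
	while (got.br_startoff < end_fsb) {
		/* ... operate on got ... */
		if (!xfs_iext_get_extent(ifp, ++idx, &got))
			break;		/* ran off the end of the fork */
	}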
*/ - xfs_bmap_search_extents(ip, imap->br_startoff, XFS_COW_FORK, &eof, &idx, - &got, &prev); + + if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &idx, &got)) + eof = true; if (!eof && got.br_startoff <= imap->br_startoff) { trace_xfs_reflink_cow_found(ip, imap); xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); @@ -285,33 +285,12 @@ xfs_reflink_reserve_cow( if (error) return error; - end_fsb = orig_end_fsb = imap->br_startoff + imap->br_blockcount; - - align = xfs_eof_alignment(ip, xfs_get_cowextsz_hint(ip)); - if (align) - end_fsb = roundup_64(end_fsb, align); - -retry: error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff, - end_fsb - imap->br_startoff, &got, &prev, &idx, eof); - switch (error) { - case 0: - break; - case -ENOSPC: - case -EDQUOT: - /* retry without any preallocation */ + imap->br_blockcount, 0, &got, &idx, eof); + if (error == -ENOSPC || error == -EDQUOT) trace_xfs_reflink_cow_enospc(ip, imap); - if (end_fsb != orig_end_fsb) { - end_fsb = orig_end_fsb; - goto retry; - } - /*FALLTHRU*/ - default: + if (error) return error; - } - - if (end_fsb != orig_end_fsb) - xfs_inode_set_cowblocks_tag(ip); trace_xfs_reflink_cow_alloc(ip, &got); return 0; @@ -418,87 +397,65 @@ xfs_reflink_allocate_cow_range( } /* - * Find the CoW reservation (and whether or not it needs block allocation) - * for a given byte offset of a file. + * Find the CoW reservation for a given byte offset of a file. */ bool xfs_reflink_find_cow_mapping( struct xfs_inode *ip, xfs_off_t offset, - struct xfs_bmbt_irec *imap, - bool *need_alloc) + struct xfs_bmbt_irec *imap) { - struct xfs_bmbt_irec irec; - struct xfs_ifork *ifp; - struct xfs_bmbt_rec_host *gotp; - xfs_fileoff_t bno; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + xfs_fileoff_t offset_fsb; + struct xfs_bmbt_irec got; xfs_extnum_t idx; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); ASSERT(xfs_is_reflink_inode(ip)); - /* Find the extent in the CoW fork. */ - ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - bno = XFS_B_TO_FSBT(ip->i_mount, offset); - gotp = xfs_iext_bno_to_ext(ifp, bno, &idx); - if (!gotp) + offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got)) return false; - - xfs_bmbt_get_all(gotp, &irec); - if (bno >= irec.br_startoff + irec.br_blockcount || - bno < irec.br_startoff) + if (got.br_startoff > offset_fsb) return false; trace_xfs_reflink_find_cow_mapping(ip, offset, 1, XFS_IO_OVERWRITE, - &irec); - - /* If it's still delalloc, we must allocate later. */ - *imap = irec; - *need_alloc = !!(isnullstartblock(irec.br_startblock)); - + &got); + *imap = got; return true; } /* * Trim an extent to end at the next CoW reservation past offset_fsb. */ -int +void xfs_reflink_trim_irec_to_next_cow( struct xfs_inode *ip, xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap) { - struct xfs_bmbt_irec irec; - struct xfs_ifork *ifp; - struct xfs_bmbt_rec_host *gotp; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_bmbt_irec got; xfs_extnum_t idx; if (!xfs_is_reflink_inode(ip)) - return 0; + return; /* Find the extent in the CoW fork. */ - ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - gotp = xfs_iext_bno_to_ext(ifp, offset_fsb, &idx); - if (!gotp) - return 0; - xfs_bmbt_get_all(gotp, &irec); + if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got)) + return; /* This is the extent before; try sliding up one. 
*/ - if (irec.br_startoff < offset_fsb) { - idx++; - if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) - return 0; - gotp = xfs_iext_get_ext(ifp, idx); - xfs_bmbt_get_all(gotp, &irec); + if (got.br_startoff < offset_fsb) { + if (!xfs_iext_get_extent(ifp, idx + 1, &got)) + return; } - if (irec.br_startoff >= imap->br_startoff + imap->br_blockcount) - return 0; + if (got.br_startoff >= imap->br_startoff + imap->br_blockcount) + return; - imap->br_blockcount = irec.br_startoff - imap->br_startoff; + imap->br_blockcount = got.br_startoff - imap->br_startoff; trace_xfs_reflink_trim_irec(ip, imap); - - return 0; } /* @@ -512,18 +469,15 @@ xfs_reflink_cancel_cow_blocks( xfs_fileoff_t end_fsb) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - struct xfs_bmbt_irec got, prev, del; + struct xfs_bmbt_irec got, del; xfs_extnum_t idx; xfs_fsblock_t firstfsb; struct xfs_defer_ops dfops; - int error = 0, eof = 0; + int error = 0; if (!xfs_is_reflink_inode(ip)) return 0; - - xfs_bmap_search_extents(ip, offset_fsb, XFS_COW_FORK, &eof, &idx, - &got, &prev); - if (eof) + if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got)) return 0; while (got.br_startoff < end_fsb) { @@ -566,9 +520,8 @@ xfs_reflink_cancel_cow_blocks( xfs_bmap_del_extent_cow(ip, &idx, &got, &del); } - if (++idx >= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)) + if (!xfs_iext_get_extent(ifp, ++idx, &got)) break; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got); } /* clear tag if cow fork is emptied */ @@ -638,13 +591,13 @@ xfs_reflink_end_cow( xfs_off_t count) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - struct xfs_bmbt_irec got, prev, del; + struct xfs_bmbt_irec got, del; struct xfs_trans *tp; xfs_fileoff_t offset_fsb; xfs_fileoff_t end_fsb; xfs_fsblock_t firstfsb; struct xfs_defer_ops dfops; - int error, eof = 0; + int error; unsigned int resblks; xfs_filblks_t rlen; xfs_extnum_t idx; @@ -668,13 +621,11 @@ xfs_reflink_end_cow( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - xfs_bmap_search_extents(ip, end_fsb - 1, XFS_COW_FORK, &eof, &idx, - &got, &prev); - /* If there is a hole at end_fsb - 1 go to the previous extent */ - if (eof || got.br_startoff > end_fsb) { + if (!xfs_iext_lookup_extent(ip, ifp, end_fsb - 1, &idx, &got) || + got.br_startoff > end_fsb) { ASSERT(idx > 0); - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, --idx), &got); + xfs_iext_get_extent(ifp, --idx, &got); } /* Walk backwards until we're out of the I/O range... */ @@ -722,11 +673,9 @@ xfs_reflink_end_cow( error = xfs_defer_finish(&tp, &dfops, ip); if (error) goto out_defer; - next_extent: - if (idx < 0) + if (!xfs_iext_get_extent(ifp, idx, &got)) break; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got); } error = xfs_trans_commit(tp); @@ -1165,111 +1114,6 @@ err: } /* - * Read a page's worth of file data into the page cache. Return the page - * locked. - */ -static struct page * -xfs_get_page( - struct inode *inode, - xfs_off_t offset) -{ - struct address_space *mapping; - struct page *page; - pgoff_t n; - - n = offset >> PAGE_SHIFT; - mapping = inode->i_mapping; - page = read_mapping_page(mapping, n, NULL); - if (IS_ERR(page)) - return page; - if (!PageUptodate(page)) { - put_page(page); - return ERR_PTR(-EIO); - } - lock_page(page); - return page; -} - -/* - * Compare extents of two files to see if they are the same. 
- */ -static int -xfs_compare_extents( - struct inode *src, - xfs_off_t srcoff, - struct inode *dest, - xfs_off_t destoff, - xfs_off_t len, - bool *is_same) -{ - xfs_off_t src_poff; - xfs_off_t dest_poff; - void *src_addr; - void *dest_addr; - struct page *src_page; - struct page *dest_page; - xfs_off_t cmp_len; - bool same; - int error; - - error = -EINVAL; - same = true; - while (len) { - src_poff = srcoff & (PAGE_SIZE - 1); - dest_poff = destoff & (PAGE_SIZE - 1); - cmp_len = min(PAGE_SIZE - src_poff, - PAGE_SIZE - dest_poff); - cmp_len = min(cmp_len, len); - ASSERT(cmp_len > 0); - - trace_xfs_reflink_compare_extents(XFS_I(src), srcoff, cmp_len, - XFS_I(dest), destoff); - - src_page = xfs_get_page(src, srcoff); - if (IS_ERR(src_page)) { - error = PTR_ERR(src_page); - goto out_error; - } - dest_page = xfs_get_page(dest, destoff); - if (IS_ERR(dest_page)) { - error = PTR_ERR(dest_page); - unlock_page(src_page); - put_page(src_page); - goto out_error; - } - src_addr = kmap_atomic(src_page); - dest_addr = kmap_atomic(dest_page); - - flush_dcache_page(src_page); - flush_dcache_page(dest_page); - - if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) - same = false; - - kunmap_atomic(dest_addr); - kunmap_atomic(src_addr); - unlock_page(dest_page); - unlock_page(src_page); - put_page(dest_page); - put_page(src_page); - - if (!same) - break; - - srcoff += cmp_len; - destoff += cmp_len; - len -= cmp_len; - } - - *is_same = same; - return 0; - -out_error: - trace_xfs_reflink_compare_extents_error(XFS_I(dest), error, _RET_IP_); - return error; -} - -/* * Link a range of blocks from one file to another. */ int @@ -1286,14 +1130,11 @@ xfs_reflink_remap_range( struct inode *inode_out = file_inode(file_out); struct xfs_inode *dest = XFS_I(inode_out); struct xfs_mount *mp = src->i_mount; - loff_t bs = inode_out->i_sb->s_blocksize; bool same_inode = (inode_in == inode_out); xfs_fileoff_t sfsbno, dfsbno; xfs_filblks_t fsblen; xfs_extlen_t cowextsize; - loff_t isize; ssize_t ret; - loff_t blen; if (!xfs_sb_version_hasreflink(&mp->m_sb)) return -EOPNOTSUPP; @@ -1302,34 +1143,14 @@ xfs_reflink_remap_range( return -EIO; /* Lock both files against IO */ - if (same_inode) { - xfs_ilock(src, XFS_IOLOCK_EXCL); + lock_two_nondirectories(inode_in, inode_out); + if (same_inode) xfs_ilock(src, XFS_MMAPLOCK_EXCL); - } else { - xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL); + else xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL); - } - - /* Don't touch certain kinds of inodes */ - ret = -EPERM; - if (IS_IMMUTABLE(inode_out)) - goto out_unlock; - ret = -ETXTBSY; - if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) - goto out_unlock; - - - /* Don't reflink dirs, pipes, sockets... */ - ret = -EISDIR; - if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) - goto out_unlock; + /* Check file eligibility and prepare for block sharing. */ ret = -EINVAL; - if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode)) - goto out_unlock; - if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) - goto out_unlock; - /* Don't reflink realtime inodes */ if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest)) goto out_unlock; @@ -1338,91 +1159,18 @@ xfs_reflink_remap_range( if (IS_DAX(inode_in) || IS_DAX(inode_out)) goto out_unlock; - /* Are we going all the way to the end? 
*/ - isize = i_size_read(inode_in); - if (isize == 0) { - ret = 0; - goto out_unlock; - } - - if (len == 0) - len = isize - pos_in; - - /* Ensure offsets don't wrap and the input is inside i_size */ - if (pos_in + len < pos_in || pos_out + len < pos_out || - pos_in + len > isize) - goto out_unlock; - - /* Don't allow dedupe past EOF in the dest file */ - if (is_dedupe) { - loff_t disize; - - disize = i_size_read(inode_out); - if (pos_out >= disize || pos_out + len > disize) - goto out_unlock; - } - - /* If we're linking to EOF, continue to the block boundary. */ - if (pos_in + len == isize) - blen = ALIGN(isize, bs) - pos_in; - else - blen = len; - - /* Only reflink if we're aligned to block boundaries */ - if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || - !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) - goto out_unlock; - - /* Don't allow overlapped reflink within the same file */ - if (same_inode) { - if (pos_out + blen > pos_in && pos_out < pos_in + blen) - goto out_unlock; - } - - /* Wait for the completion of any pending IOs on both files */ - inode_dio_wait(inode_in); - if (!same_inode) - inode_dio_wait(inode_out); - - ret = filemap_write_and_wait_range(inode_in->i_mapping, - pos_in, pos_in + len - 1); - if (ret) - goto out_unlock; - - ret = filemap_write_and_wait_range(inode_out->i_mapping, - pos_out, pos_out + len - 1); - if (ret) + ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out, + &len, is_dedupe); + if (ret <= 0) goto out_unlock; trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); - /* - * Check that the extents are the same. - */ - if (is_dedupe) { - bool is_same = false; - - ret = xfs_compare_extents(inode_in, pos_in, inode_out, pos_out, - len, &is_same); - if (ret) - goto out_unlock; - if (!is_same) { - ret = -EBADE; - goto out_unlock; - } - } - + /* Set flags and remap blocks. */ ret = xfs_reflink_set_inode_flag(src, dest); if (ret) goto out_unlock; - /* - * Invalidate the page cache so that we can clear any CoW mappings - * in the destination file. - */ - truncate_inode_pages_range(&inode_out->i_data, pos_out, - PAGE_ALIGN(pos_out + len) - 1); - dfsbno = XFS_B_TO_FSBT(mp, pos_out); sfsbno = XFS_B_TO_FSBT(mp, pos_in); fsblen = XFS_B_TO_FSB(mp, len); @@ -1431,6 +1179,10 @@ xfs_reflink_remap_range( if (ret) goto out_unlock; + /* Zap any page cache for the destination file's range. */ + truncate_inode_pages_range(&inode_out->i_data, pos_out, + PAGE_ALIGN(pos_out + len) - 1); + /* * Carry the cowextsize hint from src to dest if we're sharing the * entire source file to the entire destination file, the source file @@ -1447,11 +1199,9 @@ xfs_reflink_remap_range( out_unlock: xfs_iunlock(src, XFS_MMAPLOCK_EXCL); - xfs_iunlock(src, XFS_IOLOCK_EXCL); - if (src->i_ino != dest->i_ino) { + if (!same_inode) xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); - xfs_iunlock(dest, XFS_IOLOCK_EXCL); - } + unlock_two_nondirectories(inode_in, inode_out); if (ret) trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); return ret; @@ -1697,37 +1447,3 @@ out: trace_xfs_reflink_unshare_error(ip, error, _RET_IP_); return error; } - -/* - * Does this inode have any real CoW reservations? - */ -bool -xfs_reflink_has_real_cow_blocks( - struct xfs_inode *ip) -{ - struct xfs_bmbt_irec irec; - struct xfs_ifork *ifp; - struct xfs_bmbt_rec_host *gotp; - xfs_extnum_t idx; - - if (!xfs_is_reflink_inode(ip)) - return false; - - /* Go find the old extent in the CoW fork. 
*/ - ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - gotp = xfs_iext_bno_to_ext(ifp, 0, &idx); - while (gotp) { - xfs_bmbt_get_all(gotp, &irec); - - if (!isnullstartblock(irec.br_startblock)) - return true; - - /* Roll on... */ - idx++; - if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) - break; - gotp = xfs_iext_get_ext(ifp, idx); - } - - return false; -} diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index fad11607c9ad..aa6a4d64bd35 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -31,8 +31,8 @@ extern int xfs_reflink_reserve_cow(struct xfs_inode *ip, extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset, - struct xfs_bmbt_irec *imap, bool *need_alloc); -extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, + struct xfs_bmbt_irec *imap); +extern void xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap); extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip, @@ -50,6 +50,4 @@ extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip, extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len); -extern bool xfs_reflink_has_real_cow_blocks(struct xfs_inode *ip); - #endif /* __XFS_REFLINK_H */ diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 12d48cd8f8a4..f11282c96887 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -80,9 +80,9 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) } /* extra precision counters */ for_each_possible_cpu(i) { - xs_xstrat_bytes += per_cpu_ptr(stats, i)->xs_xstrat_bytes; - xs_write_bytes += per_cpu_ptr(stats, i)->xs_write_bytes; - xs_read_bytes += per_cpu_ptr(stats, i)->xs_read_bytes; + xs_xstrat_bytes += per_cpu_ptr(stats, i)->s.xs_xstrat_bytes; + xs_write_bytes += per_cpu_ptr(stats, i)->s.xs_write_bytes; + xs_read_bytes += per_cpu_ptr(stats, i)->s.xs_read_bytes; } len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", @@ -106,9 +106,9 @@ void xfs_stats_clearall(struct xfsstats __percpu *stats) for_each_possible_cpu(c) { preempt_disable(); /* save vn_active, it's a universal truth! */ - vn_active = per_cpu_ptr(stats, c)->vn_active; + vn_active = per_cpu_ptr(stats, c)->s.vn_active; memset(per_cpu_ptr(stats, c), 0, sizeof(*stats)); - per_cpu_ptr(stats, c)->vn_active = vn_active; + per_cpu_ptr(stats, c)->s.vn_active = vn_active; preempt_enable(); } } diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 79ad2e69fc33..375840f5a99a 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -22,9 +22,37 @@ #include <linux/percpu.h> /* + * The btree stats arrays have fixed offsets for the different stats. We + * store the base index in the btree cursor via XFS_STATS_CALC_INDEX() and + * that allows us to use fixed offsets into the stats array for each btree + * stat. These index offsets are defined in the order they will be emitted + * in the stats files, so it is possible to add new btree stat types by + * appending to the enum list below. 
+ */ +enum { + __XBTS_lookup = 0, + __XBTS_compare = 1, + __XBTS_insrec = 2, + __XBTS_delrec = 3, + __XBTS_newroot = 4, + __XBTS_killroot = 5, + __XBTS_increment = 6, + __XBTS_decrement = 7, + __XBTS_lshift = 8, + __XBTS_rshift = 9, + __XBTS_split = 10, + __XBTS_join = 11, + __XBTS_alloc = 12, + __XBTS_free = 13, + __XBTS_moves = 14, + + __XBTS_MAX = 15, +}; + +/* * XFS global statistics */ -struct xfsstats { +struct __xfsstats { # define XFSSTAT_END_EXTENT_ALLOC 4 __uint32_t xs_allocx; __uint32_t xs_allocb; @@ -117,118 +145,20 @@ struct xfsstats { __uint32_t xb_page_found; __uint32_t xb_get_read; /* Version 2 btree counters */ -#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF+15) - __uint32_t xs_abtb_2_lookup; - __uint32_t xs_abtb_2_compare; - __uint32_t xs_abtb_2_insrec; - __uint32_t xs_abtb_2_delrec; - __uint32_t xs_abtb_2_newroot; - __uint32_t xs_abtb_2_killroot; - __uint32_t xs_abtb_2_increment; - __uint32_t xs_abtb_2_decrement; - __uint32_t xs_abtb_2_lshift; - __uint32_t xs_abtb_2_rshift; - __uint32_t xs_abtb_2_split; - __uint32_t xs_abtb_2_join; - __uint32_t xs_abtb_2_alloc; - __uint32_t xs_abtb_2_free; - __uint32_t xs_abtb_2_moves; -#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2+15) - __uint32_t xs_abtc_2_lookup; - __uint32_t xs_abtc_2_compare; - __uint32_t xs_abtc_2_insrec; - __uint32_t xs_abtc_2_delrec; - __uint32_t xs_abtc_2_newroot; - __uint32_t xs_abtc_2_killroot; - __uint32_t xs_abtc_2_increment; - __uint32_t xs_abtc_2_decrement; - __uint32_t xs_abtc_2_lshift; - __uint32_t xs_abtc_2_rshift; - __uint32_t xs_abtc_2_split; - __uint32_t xs_abtc_2_join; - __uint32_t xs_abtc_2_alloc; - __uint32_t xs_abtc_2_free; - __uint32_t xs_abtc_2_moves; -#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2+15) - __uint32_t xs_bmbt_2_lookup; - __uint32_t xs_bmbt_2_compare; - __uint32_t xs_bmbt_2_insrec; - __uint32_t xs_bmbt_2_delrec; - __uint32_t xs_bmbt_2_newroot; - __uint32_t xs_bmbt_2_killroot; - __uint32_t xs_bmbt_2_increment; - __uint32_t xs_bmbt_2_decrement; - __uint32_t xs_bmbt_2_lshift; - __uint32_t xs_bmbt_2_rshift; - __uint32_t xs_bmbt_2_split; - __uint32_t xs_bmbt_2_join; - __uint32_t xs_bmbt_2_alloc; - __uint32_t xs_bmbt_2_free; - __uint32_t xs_bmbt_2_moves; -#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2+15) - __uint32_t xs_ibt_2_lookup; - __uint32_t xs_ibt_2_compare; - __uint32_t xs_ibt_2_insrec; - __uint32_t xs_ibt_2_delrec; - __uint32_t xs_ibt_2_newroot; - __uint32_t xs_ibt_2_killroot; - __uint32_t xs_ibt_2_increment; - __uint32_t xs_ibt_2_decrement; - __uint32_t xs_ibt_2_lshift; - __uint32_t xs_ibt_2_rshift; - __uint32_t xs_ibt_2_split; - __uint32_t xs_ibt_2_join; - __uint32_t xs_ibt_2_alloc; - __uint32_t xs_ibt_2_free; - __uint32_t xs_ibt_2_moves; -#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2+15) - __uint32_t xs_fibt_2_lookup; - __uint32_t xs_fibt_2_compare; - __uint32_t xs_fibt_2_insrec; - __uint32_t xs_fibt_2_delrec; - __uint32_t xs_fibt_2_newroot; - __uint32_t xs_fibt_2_killroot; - __uint32_t xs_fibt_2_increment; - __uint32_t xs_fibt_2_decrement; - __uint32_t xs_fibt_2_lshift; - __uint32_t xs_fibt_2_rshift; - __uint32_t xs_fibt_2_split; - __uint32_t xs_fibt_2_join; - __uint32_t xs_fibt_2_alloc; - __uint32_t xs_fibt_2_free; - __uint32_t xs_fibt_2_moves; -#define XFSSTAT_END_RMAP_V2 (XFSSTAT_END_FIBT_V2+15) - __uint32_t xs_rmap_2_lookup; - __uint32_t xs_rmap_2_compare; - __uint32_t xs_rmap_2_insrec; - __uint32_t xs_rmap_2_delrec; - __uint32_t xs_rmap_2_newroot; - __uint32_t xs_rmap_2_killroot; - __uint32_t xs_rmap_2_increment; - __uint32_t xs_rmap_2_decrement; - __uint32_t 
xs_rmap_2_lshift; - __uint32_t xs_rmap_2_rshift; - __uint32_t xs_rmap_2_split; - __uint32_t xs_rmap_2_join; - __uint32_t xs_rmap_2_alloc; - __uint32_t xs_rmap_2_free; - __uint32_t xs_rmap_2_moves; -#define XFSSTAT_END_REFCOUNT (XFSSTAT_END_RMAP_V2 + 15) - __uint32_t xs_refcbt_2_lookup; - __uint32_t xs_refcbt_2_compare; - __uint32_t xs_refcbt_2_insrec; - __uint32_t xs_refcbt_2_delrec; - __uint32_t xs_refcbt_2_newroot; - __uint32_t xs_refcbt_2_killroot; - __uint32_t xs_refcbt_2_increment; - __uint32_t xs_refcbt_2_decrement; - __uint32_t xs_refcbt_2_lshift; - __uint32_t xs_refcbt_2_rshift; - __uint32_t xs_refcbt_2_split; - __uint32_t xs_refcbt_2_join; - __uint32_t xs_refcbt_2_alloc; - __uint32_t xs_refcbt_2_free; - __uint32_t xs_refcbt_2_moves; +#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF + __XBTS_MAX) + __uint32_t xs_abtb_2[__XBTS_MAX]; +#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2 + __XBTS_MAX) + __uint32_t xs_abtc_2[__XBTS_MAX]; +#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2 + __XBTS_MAX) + __uint32_t xs_bmbt_2[__XBTS_MAX]; +#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2 + __XBTS_MAX) + __uint32_t xs_ibt_2[__XBTS_MAX]; +#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2 + __XBTS_MAX) + __uint32_t xs_fibt_2[__XBTS_MAX]; +#define XFSSTAT_END_RMAP_V2 (XFSSTAT_END_FIBT_V2 + __XBTS_MAX) + __uint32_t xs_rmap_2[__XBTS_MAX]; +#define XFSSTAT_END_REFCOUNT (XFSSTAT_END_RMAP_V2 + __XBTS_MAX) + __uint32_t xs_refcbt_2[__XBTS_MAX]; #define XFSSTAT_END_XQMSTAT (XFSSTAT_END_REFCOUNT + 6) __uint32_t xs_qm_dqreclaims; __uint32_t xs_qm_dqreclaim_misses; @@ -245,26 +175,58 @@ struct xfsstats { __uint64_t xs_read_bytes; }; +struct xfsstats { + union { + struct __xfsstats s; + uint32_t a[XFSSTAT_END_XQMSTAT]; + }; +}; + +/* + * simple wrapper for getting the array index of s struct member offset + */ +#define XFS_STATS_CALC_INDEX(member) \ + (offsetof(struct __xfsstats, member) / (int)sizeof(__uint32_t)) + + int xfs_stats_format(struct xfsstats __percpu *stats, char *buf); void xfs_stats_clearall(struct xfsstats __percpu *stats); extern struct xstats xfsstats; #define XFS_STATS_INC(mp, v) \ do { \ - per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v++; \ - per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v++; \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v++; \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v++; \ } while (0) #define XFS_STATS_DEC(mp, v) \ do { \ - per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v--; \ - per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v--; \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v--; \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v--; \ } while (0) #define XFS_STATS_ADD(mp, v, inc) \ do { \ - per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v += (inc); \ - per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v += (inc); \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v += (inc); \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v += (inc); \ +} while (0) + +#define XFS_STATS_INC_OFF(mp, off) \ +do { \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off]++; \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off]++; \ +} while (0) + +#define XFS_STATS_DEC_OFF(mp, off) \ +do { \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off]--; \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off]--; \ +} while (0) + +#define XFS_STATS_ADD_OFF(mp, off, inc) \ +do { \ + per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off] += (inc); \ + per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off] += (inc); \ } while (0) #if
defined(CONFIG_PROC_FS) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ade4691e3f74..eecbaac08eba 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -104,9 +104,6 @@ static const match_table_t tokens = { {Opt_sysvgroups,"sysvgroups"}, /* group-ID from current process */ {Opt_allocsize, "allocsize=%s"},/* preferred allocation size */ {Opt_norecovery,"norecovery"}, /* don't run XFS recovery */ - {Opt_barrier, "barrier"}, /* use writer barriers for log write and - * unwritten extent conversion */ - {Opt_nobarrier, "nobarrier"}, /* .. disable */ {Opt_inode64, "inode64"}, /* inodes can be allocated anywhere */ {Opt_inode32, "inode32"}, /* inode allocation limited to * XFS_MAXINUMBER_32 */ @@ -134,6 +131,12 @@ static const match_table_t tokens = { {Opt_nodiscard, "nodiscard"}, /* Do not discard unused blocks */ {Opt_dax, "dax"}, /* Enable direct access to bdev pages */ + + /* Deprecated mount options scheduled for removal */ + {Opt_barrier, "barrier"}, /* use writer barriers for log write and + * unwritten extent conversion */ + {Opt_nobarrier, "nobarrier"}, /* .. disable */ + {Opt_err, NULL}, }; @@ -301,12 +304,6 @@ xfs_parseargs( case Opt_nouuid: mp->m_flags |= XFS_MOUNT_NOUUID; break; - case Opt_barrier: - mp->m_flags |= XFS_MOUNT_BARRIER; - break; - case Opt_nobarrier: - mp->m_flags &= ~XFS_MOUNT_BARRIER; - break; case Opt_ikeep: mp->m_flags |= XFS_MOUNT_IKEEP; break; @@ -374,6 +371,14 @@ xfs_parseargs( mp->m_flags |= XFS_MOUNT_DAX; break; #endif + case Opt_barrier: + xfs_warn(mp, "%s option is deprecated, ignoring.", p); + mp->m_flags |= XFS_MOUNT_BARRIER; + break; + case Opt_nobarrier: + xfs_warn(mp, "%s option is deprecated, ignoring.", p); + mp->m_flags &= ~XFS_MOUNT_BARRIER; + break; default: xfs_warn(mp, "unknown mount option [%s].", p); return -EINVAL; @@ -943,7 +948,7 @@ xfs_fs_destroy_inode( trace_xfs_destroy_inode(ip); - ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); + ASSERT(!rwsem_is_locked(&inode->i_rwsem)); XFS_STATS_INC(ip->i_mount, vn_rele); XFS_STATS_INC(ip->i_mount, vn_remove); @@ -1238,9 +1243,11 @@ xfs_fs_remount( token = match_token(p, tokens, args); switch (token) { case Opt_barrier: + xfs_warn(mp, "%s option is deprecated, ignoring.", p); mp->m_flags |= XFS_MOUNT_BARRIER; break; case Opt_nobarrier: + xfs_warn(mp, "%s option is deprecated, ignoring.", p); mp->m_flags &= ~XFS_MOUNT_BARRIER; break; case Opt_inode64: diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 58142aeeeea6..f2cb45ed1d54 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -238,8 +238,7 @@ xfs_symlink( if (error) goto out_release_inode; - xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | - XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; /* @@ -287,7 +286,7 @@ xfs_symlink( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. 
*/ - xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = false; /* @@ -412,7 +411,7 @@ out_release_inode: xfs_qm_dqrele(pdqp); if (unlock_dp_on_error) - xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; } diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 276d3023d60f..de6195e38910 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -396,7 +396,7 @@ max_retries_show( int retries; struct xfs_error_cfg *cfg = to_error_cfg(kobject); - if (cfg->retry_timeout == XFS_ERR_RETRY_FOREVER) + if (cfg->max_retries == XFS_ERR_RETRY_FOREVER) retries = -1; else retries = cfg->max_retries; @@ -422,7 +422,7 @@ max_retries_store( return -EINVAL; if (val == -1) - cfg->retry_timeout = XFS_ERR_RETRY_FOREVER; + cfg->max_retries = XFS_ERR_RETRY_FOREVER; else cfg->max_retries = val; return count; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 0907752be62d..69c5bcd9a51b 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -355,7 +355,6 @@ DEFINE_BUF_EVENT(xfs_buf_rele); DEFINE_BUF_EVENT(xfs_buf_iodone); DEFINE_BUF_EVENT(xfs_buf_submit); DEFINE_BUF_EVENT(xfs_buf_submit_wait); -DEFINE_BUF_EVENT(xfs_buf_bawrite); DEFINE_BUF_EVENT(xfs_buf_lock); DEFINE_BUF_EVENT(xfs_buf_lock_done); DEFINE_BUF_EVENT(xfs_buf_trylock_fail); @@ -367,19 +366,15 @@ DEFINE_BUF_EVENT(xfs_buf_delwri_queue); DEFINE_BUF_EVENT(xfs_buf_delwri_queued); DEFINE_BUF_EVENT(xfs_buf_delwri_split); DEFINE_BUF_EVENT(xfs_buf_get_uncached); -DEFINE_BUF_EVENT(xfs_bdstrat_shut); DEFINE_BUF_EVENT(xfs_buf_item_relse); DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); DEFINE_BUF_EVENT(xfs_buf_error_relse); DEFINE_BUF_EVENT(xfs_buf_wait_buftarg); -DEFINE_BUF_EVENT(xfs_trans_read_buf_io); DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); /* not really buffer traces, but the buf provides useful information */ DEFINE_BUF_EVENT(xfs_btree_corrupt); -DEFINE_BUF_EVENT(xfs_da_btree_corrupt); DEFINE_BUF_EVENT(xfs_reset_dqcounts); -DEFINE_BUF_EVENT(xfs_inode_item_push); /* pass flags explicitly */ DECLARE_EVENT_CLASS(xfs_buf_flags_class, @@ -541,7 +536,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin); DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold); DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); -DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered); DECLARE_EVENT_CLASS(xfs_filestream_class, TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno), @@ -680,7 +674,6 @@ DEFINE_INODE_EVENT(xfs_ioctl_setattr); DEFINE_INODE_EVENT(xfs_dir_fsync); DEFINE_INODE_EVENT(xfs_file_fsync); DEFINE_INODE_EVENT(xfs_destroy_inode); -DEFINE_INODE_EVENT(xfs_evict_inode); DEFINE_INODE_EVENT(xfs_update_time); DEFINE_INODE_EVENT(xfs_dquot_dqalloc); @@ -798,7 +791,6 @@ TRACE_EVENT(xfs_irec_merge_post, DEFINE_EVENT(xfs_iref_class, name, \ TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \ TP_ARGS(ip, caller_ip)) -DEFINE_IREF_EVENT(xfs_ihold); DEFINE_IREF_EVENT(xfs_irele); DEFINE_IREF_EVENT(xfs_inode_pin); DEFINE_IREF_EVENT(xfs_inode_unpin); @@ -939,7 +931,6 @@ DEFINE_DQUOT_EVENT(xfs_dqget_miss); DEFINE_DQUOT_EVENT(xfs_dqget_freeing); DEFINE_DQUOT_EVENT(xfs_dqget_dup); DEFINE_DQUOT_EVENT(xfs_dqput); -DEFINE_DQUOT_EVENT(xfs_dqput_wait); DEFINE_DQUOT_EVENT(xfs_dqput_free); DEFINE_DQUOT_EVENT(xfs_dqrele); DEFINE_DQUOT_EVENT(xfs_dqflush); @@ -1815,7 +1806,6 @@ DEFINE_ATTR_EVENT(xfs_attr_sf_addname); DEFINE_ATTR_EVENT(xfs_attr_sf_create); DEFINE_ATTR_EVENT(xfs_attr_sf_lookup); DEFINE_ATTR_EVENT(xfs_attr_sf_remove); 
-DEFINE_ATTR_EVENT(xfs_attr_sf_removename); DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf); DEFINE_ATTR_EVENT(xfs_attr_leaf_add); @@ -1844,7 +1834,6 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall); DEFINE_ATTR_EVENT(xfs_attr_node_addname); DEFINE_ATTR_EVENT(xfs_attr_node_get); -DEFINE_ATTR_EVENT(xfs_attr_node_lookup); DEFINE_ATTR_EVENT(xfs_attr_node_replace); DEFINE_ATTR_EVENT(xfs_attr_node_removename); @@ -2440,11 +2429,9 @@ DEFINE_DEFER_EVENT(xfs_defer_finish_done); DEFINE_DEFER_ERROR_EVENT(xfs_defer_trans_roll_error); DEFINE_DEFER_ERROR_EVENT(xfs_defer_finish_error); -DEFINE_DEFER_ERROR_EVENT(xfs_defer_op_finish_error); DEFINE_DEFER_PENDING_EVENT(xfs_defer_intake_work); DEFINE_DEFER_PENDING_EVENT(xfs_defer_intake_cancel); -DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_commit); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_cancel); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish); DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort); @@ -3092,87 +3079,6 @@ DEFINE_EVENT(xfs_double_io_class, name, \ struct xfs_inode *dest, xfs_off_t doffset), \ TP_ARGS(src, soffset, len, dest, doffset)) -/* two-file vfs io tracepoint class */ -DECLARE_EVENT_CLASS(xfs_double_vfs_io_class, - TP_PROTO(struct inode *src, u64 soffset, u64 len, - struct inode *dest, u64 doffset), - TP_ARGS(src, soffset, len, dest, doffset), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(unsigned long, src_ino) - __field(loff_t, src_isize) - __field(loff_t, src_offset) - __field(size_t, len) - __field(unsigned long, dest_ino) - __field(loff_t, dest_isize) - __field(loff_t, dest_offset) - ), - TP_fast_assign( - __entry->dev = src->i_sb->s_dev; - __entry->src_ino = src->i_ino; - __entry->src_isize = i_size_read(src); - __entry->src_offset = soffset; - __entry->len = len; - __entry->dest_ino = dest->i_ino; - __entry->dest_isize = i_size_read(dest); - __entry->dest_offset = doffset; - ), - TP_printk("dev %d:%d count %zd " - "ino 0x%lx isize 0x%llx offset 0x%llx -> " - "ino 0x%lx isize 0x%llx offset 0x%llx", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->len, - __entry->src_ino, - __entry->src_isize, - __entry->src_offset, - __entry->dest_ino, - __entry->dest_isize, - __entry->dest_offset) -) - -#define DEFINE_DOUBLE_VFS_IO_EVENT(name) \ -DEFINE_EVENT(xfs_double_vfs_io_class, name, \ - TP_PROTO(struct inode *src, u64 soffset, u64 len, \ - struct inode *dest, u64 doffset), \ - TP_ARGS(src, soffset, len, dest, doffset)) - -/* CoW write tracepoint */ -DECLARE_EVENT_CLASS(xfs_copy_on_write_class, - TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk, - xfs_extlen_t len, xfs_fsblock_t new_pblk), - TP_ARGS(ip, lblk, pblk, len, new_pblk), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_ino_t, ino) - __field(xfs_fileoff_t, lblk) - __field(xfs_fsblock_t, pblk) - __field(xfs_extlen_t, len) - __field(xfs_fsblock_t, new_pblk) - ), - TP_fast_assign( - __entry->dev = VFS_I(ip)->i_sb->s_dev; - __entry->ino = ip->i_ino; - __entry->lblk = lblk; - __entry->pblk = pblk; - __entry->len = len; - __entry->new_pblk = new_pblk; - ), - TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx pblk 0x%llx " - "len 0x%x new_pblk %llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __entry->lblk, - __entry->pblk, - __entry->len, - __entry->new_pblk) -) - -#define DEFINE_COW_EVENT(name) \ -DEFINE_EVENT(xfs_copy_on_write_class, name, \ - TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk, \ - xfs_extlen_t len, xfs_fsblock_t new_pblk), \ - TP_ARGS(ip, lblk, pblk, len, new_pblk)) - /* inode/irec events */ 
DECLARE_EVENT_CLASS(xfs_inode_irec_class, TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec), @@ -3292,8 +3198,6 @@ DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range); DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_reflink_main_loop_error); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_read_iomap_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error); @@ -3302,9 +3206,6 @@ DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents); DEFINE_INODE_ERROR_EVENT(xfs_reflink_compare_extents_error); /* ioctl tracepoints */ -DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_reflink); -DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_clone_range); -DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_file_extent_same); TRACE_EVENT(xfs_ioctl_clone, TP_PROTO(struct inode *src, struct inode *dest), TP_ARGS(src, dest), @@ -3334,11 +3235,7 @@ TRACE_EVENT(xfs_ioctl_clone, /* unshare tracepoints */ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare); -DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cow_eof_block); -DEFINE_PAGE_EVENT(xfs_reflink_unshare_page); DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_cow_eof_block_error); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_dirty_page_error); /* copy on write */ DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared); @@ -3361,14 +3258,8 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_allocate_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error); -DEFINE_COW_EVENT(xfs_reflink_fork_buf); -DEFINE_COW_EVENT(xfs_reflink_finish_fork_buf); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_fork_buf_error); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_finish_fork_buf_error); -DEFINE_INODE_EVENT(xfs_reflink_cancel_pending_cow); DEFINE_INODE_IREC_EVENT(xfs_reflink_cancel_cow); -DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_pending_cow_error); /* rmap swapext tracepoints */ DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap); diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 62900938f26d..0594db435972 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -130,7 +130,7 @@ const struct xattr_handler *xfs_xattr_handlers[] = { NULL }; -static int +static void __xfs_xattr_put_listent( struct xfs_attr_list_context *context, char *prefix, @@ -148,7 +148,7 @@ __xfs_xattr_put_listent( if (arraytop > context->firstu) { context->count = -1; /* insufficient space */ context->seen_enough = 1; - return 0; + return; } offset = (char *)context->alist + context->count; strncpy(offset, prefix, prefix_len); @@ -159,10 +159,10 @@ __xfs_xattr_put_listent( compute_size: context->count += prefix_len + namelen + 1; - return 0; + return; } -static int +static void xfs_xattr_put_listent( struct xfs_attr_list_context *context, int flags, @@ -180,23 +180,19 @@ xfs_xattr_put_listent( if (namelen == SGI_ACL_FILE_SIZE && strncmp(name, SGI_ACL_FILE, SGI_ACL_FILE_SIZE) == 0) { - int ret = __xfs_xattr_put_listent( + __xfs_xattr_put_listent( context, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN, XATTR_POSIX_ACL_ACCESS, strlen(XATTR_POSIX_ACL_ACCESS)); - if (ret) - return ret; } else if (namelen == SGI_ACL_DEFAULT_SIZE && strncmp(name, SGI_ACL_DEFAULT, SGI_ACL_DEFAULT_SIZE) == 0) { - int ret = __xfs_xattr_put_listent( + __xfs_xattr_put_listent( context, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN, XATTR_POSIX_ACL_DEFAULT, 
strlen(XATTR_POSIX_ACL_DEFAULT)); - if (ret) - return ret; } #endif @@ -205,7 +201,7 @@ xfs_xattr_put_listent( * see them. */ if (!capable(CAP_SYS_ADMIN)) - return 0; + return; prefix = XATTR_TRUSTED_PREFIX; prefix_len = XATTR_TRUSTED_PREFIX_LEN; @@ -217,8 +213,9 @@ xfs_xattr_put_listent( prefix_len = XATTR_USER_PREFIX_LEN; } - return __xfs_xattr_put_listent(context, prefix, prefix_len, name, - namelen); + __xfs_xattr_put_listent(context, prefix, prefix_len, name, + namelen); + return; } ssize_t
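
Editor's note on the xfs_xattr.c hunk ending above: with the put_listent callbacks made void, a too-small buffer is reported purely through context->count = -1 plus context->seen_enough, and the caller turns that into an errno. A sketch of the consuming side, with the context initialisation elided and the body condensed (an assumption about shape, not a quote of the tree):

ssize_t
xfs_vn_listxattr(
	struct dentry	*dentry,
	char		*data,
	size_t		size)
{
	struct xfs_attr_list_context context = { /* ... see xfs_xattr.c */ };
	int		error;

	error = xfs_attr_list_int(&context);
	if (error)
		return error;
	if (context.count < 0)	/* a listent call flagged insufficient space */
		return -ERANGE;
	return context.count;
}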
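
Editor's note on the xfs_mount.c/xfs_mount.h hunks earlier in this diff: they replace the per-AG buffer cache rb-tree (pag_buf_tree) with an rhashtable (pag_buf_hash) so that freeing can be RCU-safe. A minimal sketch of what the new xfs_buf_hash_init()/xfs_buf_hash_destroy() helpers can look like on the rhashtable API; the parameter values and the b_rhash_head member name are illustrative assumptions, not a quote of the upstream implementation:

#include <linux/rhashtable.h>

/* Key the table on the buffer's starting disk address. */
static const struct rhashtable_params xfs_buf_hash_params = {
	.min_size		= 32,	/* keep empty AGs cheap */
	.nelem_hint		= 16,
	.key_len		= sizeof(xfs_daddr_t),
	.key_offset		= offsetof(struct xfs_buf, b_bn),
	.head_offset		= offsetof(struct xfs_buf, b_rhash_head),
	.automatic_shrinking	= true,
};

int
xfs_buf_hash_init(
	struct xfs_perag	*pag)
{
	spin_lock_init(&pag->pag_buf_lock);
	return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
}

void
xfs_buf_hash_destroy(
	struct xfs_perag	*pag)
{
	rhashtable_destroy(&pag->pag_buf_hash);
}

This is also why xfs_initialize_perag() gained an error path: rhashtable_init() can fail, unlike the old RB_ROOT assignment.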
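
Editor's note on the xfs_reflink.c hunks earlier in this diff: they convert open-coded extent walks from xfs_bmap_search_extents()/xfs_bmbt_get_all() to the bool-returning helpers xfs_iext_lookup_extent() and xfs_iext_get_extent(). Condensed from those hunks, the resulting iteration idiom looks like this (offset_fsb and end_fsb stand in for whatever range the caller is processing):

	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
	struct xfs_bmbt_irec	got;
	xfs_extnum_t		idx;

	/* Find the extent at or after offset_fsb; false means hole to EOF. */
	if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
		return 0;

	while (got.br_startoff < end_fsb) {
		/* ... operate on the extent described by "got" ... */

		/* Advance; false means we ran off the end of the fork. */
		if (!xfs_iext_get_extent(ifp, ++idx, &got))
			break;
	}

The explicit eof flag and the "ifp->if_bytes / sizeof(xfs_bmbt_rec_t)" bounds arithmetic both disappear: the helpers' return values carry that information.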
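
Editor's note on the xfs_reflink_remap_range() hunk earlier in this diff: it deletes roughly a hundred lines of open-coded validation (immutable/swapfile checks, regular-file and block-alignment checks, i_size and dedupe-EOF range validation, dirty-page flushing) in favour of the generic vfs_clone_file_prep_inodes() helper, which also clamps *len. Its return contract, as implied by the hunk's "if (ret <= 0)": negative errno on failure, 0 when the clamped length is zero (nothing to do), positive when the remap should proceed.

	ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
					 &len, is_dedupe);
	if (ret <= 0)		/* error, or zero-length remap: bail out */
		goto out_unlock;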
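
Editor's note on the xfs_stats.h rework earlier in this diff: it folds seven identical 15-counter blocks into per-btree arrays indexed by the __XBTS_* enum, and wraps struct __xfsstats in a union so the same storage can be addressed either by member name (s.xs_allocx) or as a flat __uint32_t array (a[]). Per the comment in that hunk, a btree cursor stores its base index and shared btree code then bumps counters at fixed offsets; a sketch, assuming the cursor field is named bc_statoff as in the companion btree patch:

	/* at cursor init: remember where this btree's counters live */
	cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);

	/* in shared btree code: one macro serves every btree type */
	XFS_STATS_INC_OFF(cur->bc_mp, cur->bc_statoff + __XBTS_lookup);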