From 684666e51585f3b136a3f505f8173d7580bc52cd Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Mon, 23 Jan 2017 18:50:23 +0100 Subject: jfs: atomically read inode size See i_size_read() comments in include/linux/fs.h Signed-off-by: Fabian Frederick Signed-off-by: Dave Kleikamp --- fs/jfs/resize.c | 4 ++-- fs/jfs/super.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c index bd9b641ada2c..7ddcb445a3d9 100644 --- a/fs/jfs/resize.c +++ b/fs/jfs/resize.c @@ -98,7 +98,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) goto out; } - VolumeSize = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; + VolumeSize = i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits; if (VolumeSize) { if (newLVSize > VolumeSize) { @@ -211,7 +211,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) txQuiesce(sb); /* Reset size of direct inode */ - sbi->direct_inode->i_size = sb->s_bdev->bd_inode->i_size; + sbi->direct_inode->i_size = i_size_read(sb->s_bdev->bd_inode); if (sbi->mntflag & JFS_INLINELOG) { /* diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 2be7c9ce6663..51881eb8ccff 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -283,7 +283,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, } case Opt_resize_nosize: { - *newLVSize = sb->s_bdev->bd_inode->i_size >> + *newLVSize = i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits; if (*newLVSize == 0) pr_err("JFS: Cannot determine volume size\n"); @@ -549,7 +549,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) goto out_unload; } inode->i_ino = 0; - inode->i_size = sb->s_bdev->bd_inode->i_size; + inode->i_size = i_size_read(sb->s_bdev->bd_inode); inode->i_mapping->a_ops = &jfs_metapage_aops; hlist_add_fake(&inode->i_hash); mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); -- cgit v1.2.3 From 00422483ad415d6267d36711f0e51f4bbbd653ed Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Thu, 29 Jun 2017 06:34:53 -0700 Subject: nfs: add export operations This support for opening files on NFS by file handle, both through the open_by_handle syscall, and for re-exporting NFS (for example using a different version). The support is very basic for now, as each open by handle will have to do an NFSv4 open operation on the wire. In the future this will hopefully be mitigated by an open file cache, as well as various optimizations in NFS for this specific case. Signed-off-by: Peng Tao [hch: incorporated various changes, resplit the patches, new changelog] Signed-off-by: Christoph Hellwig --- fs/nfs/Makefile | 2 +- fs/nfs/export.c | 177 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/nfs/internal.h | 2 + fs/nfs/super.c | 2 + 4 files changed, 182 insertions(+), 1 deletion(-) create mode 100644 fs/nfs/export.c (limited to 'fs') diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 98f4e5728a67..1fb118902d57 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o CFLAGS_nfstrace.o += -I$(src) nfs-y := client.o dir.o file.o getroot.o inode.o super.o \ io.o direct.o pagelist.o read.o symlink.o unlink.o \ - write.o namespace.o mount_clnt.o nfstrace.o + write.o namespace.o mount_clnt.o nfstrace.o export.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o nfs-$(CONFIG_SYSCTL) += sysctl.o nfs-$(CONFIG_NFS_FSCACHE) += fscache.o fscache-index.o diff --git a/fs/nfs/export.c b/fs/nfs/export.c new file mode 100644 index 000000000000..249cb96cc5b5 --- /dev/null +++ b/fs/nfs/export.c @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2015, Primary Data, Inc. All rights reserved. + * + * Tao Peng + */ +#include +#include +#include +#include + +#include "internal.h" +#include "nfstrace.h" + +#define NFSDBG_FACILITY NFSDBG_VFS + +enum { + FILEID_HIGH_OFF = 0, /* inode fileid high */ + FILEID_LOW_OFF, /* inode fileid low */ + FILE_I_TYPE_OFF, /* inode type */ + EMBED_FH_OFF /* embeded server fh */ +}; + + +static struct nfs_fh *nfs_exp_embedfh(__u32 *p) +{ + return (struct nfs_fh *)(p + EMBED_FH_OFF); +} + +/* + * Let's break subtree checking for now... otherwise we'll have to embed parent fh + * but there might not be enough space. + */ +static int +nfs_encode_fh(struct inode *inode, __u32 *p, int *max_len, struct inode *parent) +{ + struct nfs_fh *server_fh = NFS_FH(inode); + struct nfs_fh *clnt_fh = nfs_exp_embedfh(p); + size_t fh_size = offsetof(struct nfs_fh, data) + server_fh->size; + int len = EMBED_FH_OFF + XDR_QUADLEN(fh_size); + + dprintk("%s: max fh len %d inode %p parent %p", + __func__, *max_len, inode, parent); + + if (*max_len < len || IS_AUTOMOUNT(inode)) { + dprintk("%s: fh len %d too small, required %d\n", + __func__, *max_len, len); + *max_len = len; + return FILEID_INVALID; + } + if (IS_AUTOMOUNT(inode)) { + *max_len = FILEID_INVALID; + goto out; + } + + p[FILEID_HIGH_OFF] = NFS_FILEID(inode) >> 32; + p[FILEID_LOW_OFF] = NFS_FILEID(inode); + p[FILE_I_TYPE_OFF] = inode->i_mode & S_IFMT; + p[len - 1] = 0; /* Padding */ + nfs_copy_fh(clnt_fh, server_fh); + *max_len = len; +out: + dprintk("%s: result fh fileid %llu mode %u size %d\n", + __func__, NFS_FILEID(inode), inode->i_mode, *max_len); + return *max_len; +} + +static struct dentry * +nfs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + struct nfs4_label *label = NULL; + struct nfs_fattr *fattr = NULL; + struct nfs_fh *server_fh = nfs_exp_embedfh(fid->raw); + size_t fh_size = offsetof(struct nfs_fh, data) + server_fh->size; + const struct nfs_rpc_ops *rpc_ops; + struct dentry *dentry; + struct inode *inode; + int len = EMBED_FH_OFF + XDR_QUADLEN(fh_size); + u32 *p = fid->raw; + int ret; + + /* NULL translates to ESTALE */ + if (fh_len < len || fh_type != len) + return NULL; + + fattr = nfs_alloc_fattr(); + if (fattr == NULL) { + dentry = ERR_PTR(-ENOMEM); + goto out; + } + + fattr->fileid = ((u64)p[FILEID_HIGH_OFF] << 32) + p[FILEID_LOW_OFF]; + fattr->mode = p[FILE_I_TYPE_OFF]; + fattr->valid |= NFS_ATTR_FATTR_FILEID | NFS_ATTR_FATTR_TYPE; + + dprintk("%s: fileid %llu mode %d\n", __func__, fattr->fileid, fattr->mode); + + inode = nfs_ilookup(sb, fattr, server_fh); + if (inode) + goto out_found; + + label = nfs4_label_alloc(NFS_SB(sb), GFP_KERNEL); + if (IS_ERR(label)) { + dentry = ERR_CAST(label); + goto out_free_fattr; + } + + rpc_ops = NFS_SB(sb)->nfs_client->rpc_ops; + ret = rpc_ops->getattr(NFS_SB(sb), server_fh, fattr, label); + if (ret) { + dprintk("%s: getattr failed %d\n", __func__, ret); + dentry = ERR_PTR(ret); + goto out_free_label; + } + + inode = nfs_fhget(sb, server_fh, fattr, label); + +out_found: + dentry = d_obtain_alias(inode); + +out_free_label: + nfs4_label_free(label); +out_free_fattr: + nfs_free_fattr(fattr); +out: + return dentry; +} + +static struct dentry * +nfs_get_parent(struct dentry *dentry) +{ + int ret; + struct inode *inode = d_inode(dentry), *pinode; + struct super_block *sb = inode->i_sb; + struct nfs_server *server = NFS_SB(sb); + struct nfs_fattr *fattr = NULL; + struct nfs4_label *label = NULL; + struct dentry *parent; + struct nfs_rpc_ops const *ops = server->nfs_client->rpc_ops; + struct nfs_fh fh; + + if (!ops->lookupp) + return ERR_PTR(-EACCES); + + fattr = nfs_alloc_fattr(); + if (fattr == NULL) { + parent = ERR_PTR(-ENOMEM); + goto out; + } + + label = nfs4_label_alloc(server, GFP_KERNEL); + if (IS_ERR(label)) { + parent = ERR_CAST(label); + goto out_free_fattr; + } + + ret = ops->lookupp(inode, &fh, fattr, label); + if (ret) { + parent = ERR_PTR(ret); + goto out_free_label; + } + + pinode = nfs_fhget(sb, &fh, fattr, label); + parent = d_obtain_alias(pinode); +out_free_label: + nfs4_label_free(label); +out_free_fattr: + nfs_free_fattr(fattr); +out: + return parent; +} + +const struct export_operations nfs_export_ops = { + .encode_fh = nfs_encode_fh, + .fh_to_dentry = nfs_fh_to_dentry, + .get_parent = nfs_get_parent, +}; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index c5054edb0157..2ebd57498986 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -10,6 +10,8 @@ #define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) +extern const struct export_operations nfs_export_ops; + struct nfs_string; /* Maximum number of readahead requests diff --git a/fs/nfs/super.c b/fs/nfs/super.c index b4176393f049..b5271644b472 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2339,6 +2339,7 @@ void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info) */ sb->s_flags |= MS_POSIXACL; sb->s_time_gran = 1; + sb->s_export_op = &nfs_export_ops; } nfs_initialise_sb(sb); @@ -2360,6 +2361,7 @@ static void nfs_clone_super(struct super_block *sb, sb->s_xattr = old_sb->s_xattr; sb->s_op = old_sb->s_op; sb->s_time_gran = 1; + sb->s_export_op = old_sb->s_export_op; if (server->nfs_client->rpc_ops->version != 2) { /* The VFS shouldn't apply the umask to mode bits. We will do -- cgit v1.2.3 From 8fc646b44385ff0a9853f6590497e43049eeb311 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 11 Jul 2017 15:58:35 +0300 Subject: ovl: fix random return value on mount On failure to prepare_creds(), mount fails with a random return value, as err was last set to an integer cast of a valid lower mnt pointer or set to 0 if inodes index feature is enabled. Reported-by: Dan Carpenter Fixes: 3fe6e52f0626 ("ovl: override creds with the ones from ...") Cc: # v4.7 Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 44dc2d6ffe0f..1cf5d3538309 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1090,6 +1090,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) else sb->s_d_op = &ovl_dentry_operations; + err = -ENOMEM; ufs->creator_cred = cred = prepare_creds(); if (!cred) goto out_put_indexdir; -- cgit v1.2.3 From ea3dad18dc5f778cfd931311a91a9315aa0065a3 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 11 Jul 2017 15:58:34 +0300 Subject: ovl: mark parent impure on ovl_link() When linking a file with copy up origin into a new parent, mark the new parent dir "impure". Fixes: ee1d6d37b6b8 ("ovl: mark upper dir with type origin entries "impure"") Cc: # v4.12 Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/dir.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 641d9ee97f91..48b70e6490f3 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -481,17 +481,30 @@ out_cleanup: } static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, - struct cattr *attr, struct dentry *hardlink) + struct cattr *attr, struct dentry *hardlink, + bool origin) { int err; const struct cred *old_cred; struct cred *override_cred; + struct dentry *parent = dentry->d_parent; - err = ovl_copy_up(dentry->d_parent); + err = ovl_copy_up(parent); if (err) return err; old_cred = ovl_override_creds(dentry->d_sb); + + /* + * When linking a file with copy up origin into a new parent, mark the + * new parent dir "impure". + */ + if (origin) { + err = ovl_set_impure(parent, ovl_dentry_upper(parent)); + if (err) + goto out_revert_creds; + } + err = -ENOMEM; override_cred = prepare_creds(); if (override_cred) { @@ -550,7 +563,7 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, inode_init_owner(inode, dentry->d_parent->d_inode, mode); attr.mode = inode->i_mode; - err = ovl_create_or_link(dentry, inode, &attr, NULL); + err = ovl_create_or_link(dentry, inode, &attr, NULL, false); if (err) iput(inode); @@ -609,7 +622,8 @@ static int ovl_link(struct dentry *old, struct inode *newdir, inode = d_inode(old); ihold(inode); - err = ovl_create_or_link(new, inode, NULL, ovl_dentry_upper(old)); + err = ovl_create_or_link(new, inode, NULL, ovl_dentry_upper(old), + ovl_type_origin(old)); if (err) iput(inode); -- cgit v1.2.3 From 961af647fc9ebcdb3e98c8f11b399ebe01169fb1 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 11 Jul 2017 15:58:36 +0300 Subject: ovl: fix origin verification of index dir Commit 54fb347e836f ("ovl: verify index dir matches upper dir") introduced a new ovl_fh flag OVL_FH_FLAG_PATH_UPPER to indicate an upper file handle, but forgot to add the flag to the mask of valid flags, so index dir origin verification always discards existing origin and stores a new one. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/overlayfs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 60d26605e039..032120a761c4 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -47,7 +47,8 @@ enum ovl_flag { /* Is the real inode encoded in fid an upper inode? */ #define OVL_FH_FLAG_PATH_UPPER (1 << 2) -#define OVL_FH_FLAG_ALL (OVL_FH_FLAG_BIG_ENDIAN | OVL_FH_FLAG_ANY_ENDIAN) +#define OVL_FH_FLAG_ALL (OVL_FH_FLAG_BIG_ENDIAN | OVL_FH_FLAG_ANY_ENDIAN | \ + OVL_FH_FLAG_PATH_UPPER) #if defined(__LITTLE_ENDIAN) #define OVL_FH_FLAG_CPU_ENDIAN 0 -- cgit v1.2.3 From a59f97ff66f0058702ed6f6e26dd8fa3c34caf62 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 11 Jul 2017 15:58:37 +0300 Subject: ovl: remove unneeded check for IS_ERR() ovl_workdir_create() returns a valid index dentry or NULL. Reported-by: Dan Carpenter Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/super.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 1cf5d3538309..c88493b01d8d 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -1058,10 +1058,6 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) ufs->indexdir = ovl_workdir_create(sb, ufs, workpath.dentry, OVL_INDEXDIR_NAME, true); - err = PTR_ERR(ufs->indexdir); - if (IS_ERR(ufs->indexdir)) - goto out_put_lower_mnt; - if (ufs->indexdir) { /* Verify upper root is index dir origin */ err = ovl_verify_origin(ufs->indexdir, ufs->upper_mnt, -- cgit v1.2.3 From 301bfa483016d48b7fb9cbad87c0a04a15c25b90 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 11 Jul 2017 17:53:48 -0400 Subject: NFS: Don't run wake_up_bit() when nobody is waiting... "perf lock" shows fairly heavy contention for the bit waitqueue locks when doing an I/O heavy workload. Use a bit to tell whether or not there has been contention for a lock so that we can optimise away the bit waitqueue options in those cases. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/pagelist.c | 17 ++++++++++++++++- include/linux/nfs_page.h | 2 ++ 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 8a23e2b40b04..de9066a92c0d 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -155,9 +155,12 @@ nfs_page_group_lock(struct nfs_page *req, bool nonblock) if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags)) return 0; - if (!nonblock) + if (!nonblock) { + set_bit(PG_CONTENDED1, &head->wb_flags); + smp_mb__after_atomic(); return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, TASK_UNINTERRUPTIBLE); + } return -EAGAIN; } @@ -175,6 +178,10 @@ nfs_page_group_lock_wait(struct nfs_page *req) WARN_ON_ONCE(head != head->wb_head); + if (!test_bit(PG_HEADLOCK, &head->wb_flags)) + return; + set_bit(PG_CONTENDED1, &head->wb_flags); + smp_mb__after_atomic(); wait_on_bit(&head->wb_flags, PG_HEADLOCK, TASK_UNINTERRUPTIBLE); } @@ -193,6 +200,8 @@ nfs_page_group_unlock(struct nfs_page *req) smp_mb__before_atomic(); clear_bit(PG_HEADLOCK, &head->wb_flags); smp_mb__after_atomic(); + if (!test_bit(PG_CONTENDED1, &head->wb_flags)) + return; wake_up_bit(&head->wb_flags, PG_HEADLOCK); } @@ -383,6 +392,8 @@ void nfs_unlock_request(struct nfs_page *req) smp_mb__before_atomic(); clear_bit(PG_BUSY, &req->wb_flags); smp_mb__after_atomic(); + if (!test_bit(PG_CONTENDED2, &req->wb_flags)) + return; wake_up_bit(&req->wb_flags, PG_BUSY); } @@ -465,6 +476,10 @@ void nfs_release_request(struct nfs_page *req) int nfs_wait_on_request(struct nfs_page *req) { + if (!test_bit(PG_BUSY, &req->wb_flags)) + return 0; + set_bit(PG_CONTENDED2, &req->wb_flags); + smp_mb__after_atomic(); return wait_on_bit_io(&req->wb_flags, PG_BUSY, TASK_UNINTERRUPTIBLE); } diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index abbee2d15dce..d67b67ae6c8b 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -33,6 +33,8 @@ enum { PG_UPTODATE, /* page group sync bit in read path */ PG_WB_END, /* page group sync bit in write path */ PG_REMOVE, /* page group sync bit in write path */ + PG_CONTENDED1, /* Is someone waiting for a lock? */ + PG_CONTENDED2, /* Is someone waiting for a lock? */ }; struct nfs_inode; -- cgit v1.2.3 From 9bcf66c72d726322441ec82962994e69157613e4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 22 Jun 2017 15:31:10 +0200 Subject: jfs: Don't clear SGID when inheriting ACLs When new directory 'DIR1' is created in a directory 'DIR0' with SGID bit set, DIR1 is expected to have SGID bit set (and owning group equal to the owning group of 'DIR0'). However when 'DIR0' also has some default ACLs that 'DIR1' inherits, setting these ACLs will result in SGID bit on 'DIR1' to get cleared if user is not member of the owning group. Fix the problem by moving posix_acl_update_mode() out of __jfs_set_acl() into jfs_set_acl(). That way the function will not be called when inheriting ACLs which is what we want as it prevents SGID bit clearing and the mode has been properly set by posix_acl_create() anyway. Fixes: 073931017b49d9458aa351605b43a7e34598caef CC: stable@vger.kernel.org CC: jfs-discussion@lists.sourceforge.net Signed-off-by: Jan Kara Signed-off-by: Dave Kleikamp --- fs/jfs/acl.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 7bc186f4ed4d..1be45c8d460d 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -77,13 +77,6 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, switch (type) { case ACL_TYPE_ACCESS: ea_name = XATTR_NAME_POSIX_ACL_ACCESS; - if (acl) { - rc = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (rc) - return rc; - inode->i_ctime = current_time(inode); - mark_inode_dirty(inode); - } break; case ACL_TYPE_DEFAULT: ea_name = XATTR_NAME_POSIX_ACL_DEFAULT; @@ -118,9 +111,17 @@ int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) tid = txBegin(inode->i_sb, 0); mutex_lock(&JFS_IP(inode)->commit_mutex); + if (type == ACL_TYPE_ACCESS && acl) { + rc = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (rc) + goto end_tx; + inode->i_ctime = current_time(inode); + mark_inode_dirty(inode); + } rc = __jfs_set_acl(tid, inode, type, acl); if (!rc) rc = txCommit(tid, 1, &inode, 0); +end_tx: txEnd(tid); mutex_unlock(&JFS_IP(inode)->commit_mutex); return rc; -- cgit v1.2.3 From f070e5ac9bc7de71c34402048ce5526dccbd347c Mon Sep 17 00:00:00 2001 From: "Ernesto A. Fernández" Date: Wed, 12 Jul 2017 06:55:35 -0300 Subject: jfs: preserve i_mode if __jfs_set_acl() fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When changing a file's acl mask, __jfs_set_acl() will first set the group bits of i_mode to the value of the mask, and only then set the actual extended attribute representing the new acl. If the second part fails (due to lack of space, for example) and the file had no acl attribute to begin with, the system will from now on assume that the mask permission bits are actual group permission bits, potentially granting access to the wrong users. Prevent this by only changing the inode mode after the acl has been set. Signed-off-by: Ernesto A. Fernández Signed-off-by: Dave Kleikamp --- fs/jfs/acl.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 1be45c8d460d..2e71b6e7e646 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -108,19 +108,26 @@ int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int rc; tid_t tid; + int update_mode = 0; + umode_t mode = inode->i_mode; tid = txBegin(inode->i_sb, 0); mutex_lock(&JFS_IP(inode)->commit_mutex); if (type == ACL_TYPE_ACCESS && acl) { - rc = posix_acl_update_mode(inode, &inode->i_mode, &acl); + rc = posix_acl_update_mode(inode, &mode, &acl); if (rc) goto end_tx; - inode->i_ctime = current_time(inode); - mark_inode_dirty(inode); + update_mode = 1; } rc = __jfs_set_acl(tid, inode, type, acl); - if (!rc) + if (!rc) { + if (update_mode) { + inode->i_mode = mode; + inode->i_ctime = current_time(inode); + mark_inode_dirty(inode); + } rc = txCommit(tid, 1, &inode, 0); + } end_tx: txEnd(tid); mutex_unlock(&JFS_IP(inode)->commit_mutex); -- cgit v1.2.3 From e33bf72361bdd764c827e160249a3e06d2a8e2fe Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 18 Jul 2017 20:34:02 +0100 Subject: Btrfs: fix dir item validation when replaying xattr deletes We were passing an incorrect slot number to the function that validates directory items when we are replaying xattr deletes from a log tree. The correct slot is stored at variable 'i' and not at 'path->slots[0]', so the call to the validation function was only correct for the first iteration of the loop, when 'i == path->slots[0]'. After this fix, the fstest generic/066 passes again. Fixes: 8ee8c2d62d5f ("btrfs: Verify dir_item in replay_xattr_deletes") Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f20ef211a73d..3a11ae63676e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2153,8 +2153,7 @@ process_leaf: u32 this_len = sizeof(*di) + name_len + data_len; char *name; - ret = verify_dir_item(fs_info, path->nodes[0], - path->slots[0], di); + ret = verify_dir_item(fs_info, path->nodes[0], i, di); if (ret) { ret = -EIO; goto out; -- cgit v1.2.3 From 89a6814d9b665b196aa3a102f96b6dc7e8cb669e Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Thu, 29 Jun 2017 11:48:26 -0400 Subject: mount: copy the port field into the cloned nfs_server structure. Doing this copy eliminates the "port=0" entry in the /proc/mounts entries Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=69241 Signed-off-by: Steve Dickson Signed-off-by: Anna Schumaker --- fs/nfs/client.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index ee5ddbd36088..efebe6cf4378 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -820,6 +820,7 @@ void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *sour target->caps = source->caps; target->options = source->options; target->auth_info = source->auth_info; + target->port = source->port; } EXPORT_SYMBOL_GPL(nfs_server_copy_userdata); -- cgit v1.2.3 From e39928f942ff7a9d1cb9005423e9e35a0a4bb2e0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 12 Jul 2017 19:10:56 -0400 Subject: NFS: Fix a COMMIT race in pNFS We must make sure that cinfo->ds->nwritten is in sync with the commit list, since it is checked as part of pnfs_scan_commit_lists(). Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/pnfs_nfs.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index d40755a0984b..3e6de85faf42 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -159,13 +159,18 @@ void pnfs_generic_recover_commit_reqs(struct list_head *dst, { struct pnfs_commit_bucket *b; struct pnfs_layout_segment *freeme; + int nwritten; int i; lockdep_assert_held(&cinfo->inode->i_lock); restart: for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { - if (pnfs_generic_transfer_commit_list(&b->written, dst, - cinfo, 0)) { + nwritten = pnfs_generic_transfer_commit_list(&b->written, + dst, cinfo, 0); + if (!nwritten) + continue; + cinfo->ds->nwritten -= nwritten; + if (list_empty(&b->written)) { freeme = b->wlseg; b->wlseg = NULL; spin_unlock(&cinfo->inode->i_lock); @@ -174,7 +179,6 @@ restart: goto restart; } } - cinfo->ds->nwritten = 0; } EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs); -- cgit v1.2.3 From 41181886459b281ed1346369f27b3c3bb8e2dde3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 12 Jul 2017 19:10:57 -0400 Subject: NFS: Fix another COMMIT race in pNFS We must make sure that cinfo->ds->ncommitting is in sync with the commit list, since it is checked as part of pnfs_commit_list(). Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/pnfs_nfs.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 3e6de85faf42..7ceb86627e54 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -187,6 +187,7 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx) struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; struct pnfs_commit_bucket *bucket; struct pnfs_layout_segment *freeme; + struct list_head *pos; LIST_HEAD(pages); int i; @@ -197,6 +198,8 @@ static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx) continue; freeme = bucket->clseg; bucket->clseg = NULL; + list_for_each(pos, &bucket->committing) + cinfo->ds->ncommitting--; list_splice_init(&bucket->committing, &pages); spin_unlock(&cinfo->inode->i_lock); nfs_retry_commit(&pages, freeme, cinfo, i); @@ -247,9 +250,12 @@ void pnfs_fetch_commit_bucket_list(struct list_head *pages, struct nfs_commit_info *cinfo) { struct pnfs_commit_bucket *bucket; + struct list_head *pos; bucket = &cinfo->ds->buckets[data->ds_commit_index]; spin_lock(&cinfo->inode->i_lock); + list_for_each(pos, &bucket->committing) + cinfo->ds->ncommitting--; list_splice_init(&bucket->committing, pages); data->lseg = bucket->clseg; bucket->clseg = NULL; @@ -334,7 +340,6 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, } } out: - cinfo->ds->ncommitting = 0; return PNFS_ATTEMPTED; } EXPORT_SYMBOL_GPL(pnfs_generic_commit_pagelist); -- cgit v1.2.3 From 4b75053e9bb6db4b700526d2d67c67a0d07f867e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 12 Jul 2017 19:10:58 -0400 Subject: pNFS/flexfiles: Handle expired layout segments in ff_layout_initiate_commit() If the layout has expired due to a fencing event, then we should not attempt to commit to the DS. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/flexfilelayout/flexfilelayout.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 1f2ac3dd0fe5..b0fa83a60754 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1842,6 +1842,10 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) int vers, ret; struct nfs_fh *fh; + if (!lseg || !(pnfs_is_valid_lseg(lseg) || + test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))) + goto out_err; + idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); ds = nfs4_ff_layout_prepare_ds(lseg, idx, true); if (!ds) -- cgit v1.2.3 From 213297369cf4900eba906dd32ce845074e30f487 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 12 Jul 2017 19:10:59 -0400 Subject: Revert commit 722f0b891198 ("pNFS: Don't send COMMITs to the DSes if...") Doing the test without taking any locks is racy, and so really it makes more sense to do it in the flexfiles code (which is the only case that cares). Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/pnfs_nfs.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 7ceb86627e54..25f28fa64c57 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -224,13 +224,6 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo, for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) { if (list_empty(&bucket->committing)) continue; - /* - * If the layout segment is invalid, then let - * pnfs_generic_retry_commit() clean up the bucket. - */ - if (bucket->clseg && !pnfs_is_valid_lseg(bucket->clseg) && - !test_bit(NFS_LSEG_LAYOUTRETURN, &bucket->clseg->pls_flags)) - break; data = nfs_commitdata_alloc(false); if (!data) break; -- cgit v1.2.3 From 1d88f183734c0d916428911df006e645a6162cab Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 20 Jul 2017 11:08:21 +0200 Subject: ovl: fix xattr get and set with selinux inode_doinit_with_dentry() in SELinux wants to read the upper inode's xattr to get security label, and ovl_xattr_get() calls ovl_dentry_real(), which depends on dentry->d_inode, but d_inode is null and not initialized yet at this point resulting in an Oops. Fix by getting the upperdentry info from the inode directly in this case. Reported-by: Eryu Guan Fixes: 09d8b586731b ("ovl: move __upperdentry to ovl_inode") Signed-off-by: Miklos Szeredi --- fs/overlayfs/inode.c | 32 +++++++++++++++++--------------- fs/overlayfs/overlayfs.h | 7 ++++--- fs/overlayfs/super.c | 8 ++++---- fs/overlayfs/util.c | 7 ++++++- 4 files changed, 31 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 69f4fc26ee39..5bc71642b226 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -202,37 +202,38 @@ bool ovl_is_private_xattr(const char *name) sizeof(OVL_XATTR_PREFIX) - 1) == 0; } -int ovl_xattr_set(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags) +int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, + const void *value, size_t size, int flags) { int err; - struct path realpath; - enum ovl_path_type type = ovl_path_real(dentry, &realpath); + struct dentry *upperdentry = ovl_i_dentry_upper(inode); + struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry); const struct cred *old_cred; err = ovl_want_write(dentry); if (err) goto out; - if (!value && !OVL_TYPE_UPPER(type)) { - err = vfs_getxattr(realpath.dentry, name, NULL, 0); + if (!value && !upperdentry) { + err = vfs_getxattr(realdentry, name, NULL, 0); if (err < 0) goto out_drop_write; } - err = ovl_copy_up(dentry); - if (err) - goto out_drop_write; + if (!upperdentry) { + err = ovl_copy_up(dentry); + if (err) + goto out_drop_write; - if (!OVL_TYPE_UPPER(type)) - ovl_path_upper(dentry, &realpath); + realdentry = ovl_dentry_upper(dentry); + } old_cred = ovl_override_creds(dentry->d_sb); if (value) - err = vfs_setxattr(realpath.dentry, name, value, size, flags); + err = vfs_setxattr(realdentry, name, value, size, flags); else { WARN_ON(flags != XATTR_REPLACE); - err = vfs_removexattr(realpath.dentry, name); + err = vfs_removexattr(realdentry, name); } revert_creds(old_cred); @@ -242,12 +243,13 @@ out: return err; } -int ovl_xattr_get(struct dentry *dentry, const char *name, +int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size) { - struct dentry *realdentry = ovl_dentry_real(dentry); ssize_t res; const struct cred *old_cred; + struct dentry *realdentry = + ovl_i_dentry_upper(inode) ?: ovl_dentry_lower(dentry); old_cred = ovl_override_creds(dentry->d_sb); res = vfs_getxattr(realdentry, name, value, size); diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 032120a761c4..e927a62c97ae 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -200,6 +200,7 @@ enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); struct dentry *ovl_dentry_upper(struct dentry *dentry); struct dentry *ovl_dentry_lower(struct dentry *dentry); struct dentry *ovl_dentry_real(struct dentry *dentry); +struct dentry *ovl_i_dentry_upper(struct inode *inode); struct inode *ovl_inode_upper(struct inode *inode); struct inode *ovl_inode_lower(struct inode *inode); struct inode *ovl_inode_real(struct inode *inode); @@ -271,9 +272,9 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr); int ovl_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); int ovl_permission(struct inode *inode, int mask); -int ovl_xattr_set(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags); -int ovl_xattr_get(struct dentry *dentry, const char *name, +int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, + const void *value, size_t size, int flags); +int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, void *value, size_t size); ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); struct posix_acl *ovl_get_acl(struct inode *inode, int type); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index c88493b01d8d..d86e89f97201 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -692,7 +692,7 @@ ovl_posix_acl_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size) { - return ovl_xattr_get(dentry, handler->name, buffer, size); + return ovl_xattr_get(dentry, inode, handler->name, buffer, size); } static int __maybe_unused @@ -742,7 +742,7 @@ ovl_posix_acl_xattr_set(const struct xattr_handler *handler, return err; } - err = ovl_xattr_set(dentry, handler->name, value, size, flags); + err = ovl_xattr_set(dentry, inode, handler->name, value, size, flags); if (!err) ovl_copyattr(ovl_inode_real(inode), inode); @@ -772,7 +772,7 @@ static int ovl_other_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size) { - return ovl_xattr_get(dentry, name, buffer, size); + return ovl_xattr_get(dentry, inode, name, buffer, size); } static int ovl_other_xattr_set(const struct xattr_handler *handler, @@ -780,7 +780,7 @@ static int ovl_other_xattr_set(const struct xattr_handler *handler, const char *name, const void *value, size_t size, int flags) { - return ovl_xattr_set(dentry, name, value, size, flags); + return ovl_xattr_set(dentry, inode, name, value, size, flags); } static const struct xattr_handler __maybe_unused diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index c492ba75c659..f46ad75dc96a 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -157,9 +157,14 @@ struct dentry *ovl_dentry_real(struct dentry *dentry) return ovl_dentry_upper(dentry) ?: ovl_dentry_lower(dentry); } +struct dentry *ovl_i_dentry_upper(struct inode *inode) +{ + return ovl_upperdentry_dereference(OVL_I(inode)); +} + struct inode *ovl_inode_upper(struct inode *inode) { - struct dentry *upperdentry = ovl_upperdentry_dereference(OVL_I(inode)); + struct dentry *upperdentry = ovl_i_dentry_upper(inode); return upperdentry ? d_inode(upperdentry) : NULL; } -- cgit v1.2.3 From 61b674710cd9afa2a8b17bdd1ac80670c9b79f1d Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 18 Jul 2017 21:07:42 +0300 Subject: ovl: do not cleanup directory and whiteout index entries Directory index entries are going to be used for looking up redirected upper dirs by lower dir fh when decoding an overlay file handle of a merge dir. Whiteout index entries are going to be used as an indication that an exported overlay file handle should be treated as stale (i.e. after unlink of the overlay inode). We don't know the verification rules for directory and whiteout index entries, because they have not been implemented yet, so fail to mount overlay rw if those entries are found to avoid corrupting an index that was created by a newer kernel. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/namei.c | 19 +++++++++++++++---- fs/overlayfs/readdir.c | 5 ++++- 2 files changed, 19 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 9bc0e580a5b3..229a88ff335c 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -397,8 +397,19 @@ int ovl_verify_index(struct dentry *index, struct path *lowerstack, if (!d_inode(index)) return 0; - err = -EISDIR; - if (d_is_dir(index)) + /* + * Directory index entries are going to be used for looking up + * redirected upper dirs by lower dir fh when decoding an overlay + * file handle of a merge dir. Whiteout index entries are going to be + * used as an indication that an exported overlay file handle should + * be treated as stale (i.e. after unlink of the overlay inode). + * We don't know the verification rules for directory and whiteout + * index entries, because they have not been implemented yet, so return + * EROFS if those entries are found to avoid corrupting an index that + * was created by a newer kernel. + */ + err = -EROFS; + if (d_is_dir(index) || ovl_is_whiteout(index)) goto fail; err = -EINVAL; @@ -436,8 +447,8 @@ out: return err; fail: - pr_warn_ratelimited("overlayfs: failed to verify index (%pd2, err=%i)\n", - index, err); + pr_warn_ratelimited("overlayfs: failed to verify index (%pd2, ftype=%x, err=%i)\n", + index, d_inode(index)->i_mode & S_IFMT, err); goto out; } diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 0298463cf9c3..3d424a51cabb 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -703,7 +703,10 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, err = PTR_ERR(index); break; } - if (ovl_verify_index(index, lowerstack, numlower)) { + err = ovl_verify_index(index, lowerstack, numlower); + if (err) { + if (err == -EROFS) + break; err = ovl_cleanup(dir, index); if (err) break; -- cgit v1.2.3 From 0e082555cec9510d276965fe391f709acb32c0f4 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Tue, 18 Jul 2017 21:07:43 +0300 Subject: ovl: check for bad and whiteout index on lookup Index should always be of the same file type as origin, except for the case of a whiteout index. A whiteout index should only exist if all lower aliases have been unlinked, which means that finding a lower origin on lookup whose index is a whiteout should be treated as a lookup error. Signed-off-by: Amir Goldstein Signed-off-by: Miklos Szeredi --- fs/overlayfs/namei.c | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 229a88ff335c..8aef2b304b2d 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -513,6 +513,7 @@ static struct dentry *ovl_lookup_index(struct dentry *dentry, goto out; } + inode = d_inode(index); if (d_is_negative(index)) { if (upper && d_inode(origin)->i_nlink > 1) { pr_warn_ratelimited("overlayfs: hard link with origin but no index (ino=%lu).\n", @@ -522,11 +523,22 @@ static struct dentry *ovl_lookup_index(struct dentry *dentry, dput(index); index = NULL; - } else if (upper && d_inode(index) != d_inode(upper)) { - inode = d_inode(index); - pr_warn_ratelimited("overlayfs: wrong index found (index ino: %lu, upper ino: %lu).\n", - d_inode(index)->i_ino, - d_inode(upper)->i_ino); + } else if (upper && d_inode(upper) != inode) { + pr_warn_ratelimited("overlayfs: wrong index found (index=%pd2, ino=%lu, upper ino=%lu).\n", + index, inode->i_ino, d_inode(upper)->i_ino); + goto fail; + } else if (ovl_dentry_weird(index) || ovl_is_whiteout(index) || + ((inode->i_mode ^ d_inode(origin)->i_mode) & S_IFMT)) { + /* + * Index should always be of the same file type as origin + * except for the case of a whiteout index. A whiteout + * index should only exist if all lower aliases have been + * unlinked, which means that finding a lower origin on lookup + * whose index is a whiteout should be treated as an error. + */ + pr_warn_ratelimited("overlayfs: bad index found (index=%pd2, ftype=%x, origin ftype=%x).\n", + index, d_inode(index)->i_mode & S_IFMT, + d_inode(origin)->i_mode & S_IFMT); goto fail; } -- cgit v1.2.3 From 1e86eabe73b73c82e1110c746ed3ec6d5e1c0a0d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 17 Jul 2017 14:30:45 -0700 Subject: xfs: check _btree_check_block value Check the _btree_check_block return value for the firstrec and lastrec functions, since we have the ability to signal that the repositioning did not succeed. Fixes-coverity-id: 114067 Fixes-coverity-id: 114068 Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_btree.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 4da85fff69ad..e0bcc4a59efd 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -728,7 +728,8 @@ xfs_btree_firstrec( * Get the block pointer for this level. */ block = xfs_btree_get_block(cur, level, &bp); - xfs_btree_check_block(cur, block, level, bp); + if (xfs_btree_check_block(cur, block, level, bp)) + return 0; /* * It's empty, there is no such record. */ @@ -757,7 +758,8 @@ xfs_btree_lastrec( * Get the block pointer for this level. */ block = xfs_btree_get_block(cur, level, &bp); - xfs_btree_check_block(cur, block, level, bp); + if (xfs_btree_check_block(cur, block, level, bp)) + return 0; /* * It's empty, there is no such record. */ -- cgit v1.2.3 From 4c1a67bd3606540b9b42caff34a1d5cd94b1cf65 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 17 Jul 2017 14:30:51 -0700 Subject: xfs: set firstfsb to NULLFSBLOCK before feeding it to _bmapi_write We must initialize the firstfsb parameter to _bmapi_write so that it doesn't incorrectly treat stack garbage as a restriction on which AGs it can search for free space. Fixes-coverity-id: 1402025 Fixes-coverity-id: 1415167 Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_bmap.c | 9 +++++++++ fs/xfs/xfs_reflink.c | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 0a9880777c9c..ee118ceb702f 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -6499,6 +6499,15 @@ xfs_bmap_finish_one( xfs_fsblock_t firstfsb; int error = 0; + /* + * firstfsb is tied to the transaction lifetime and is used to + * ensure correct AG locking order and schedule work item + * continuations. XFS_BUI_MAX_FAST_EXTENTS (== 1) restricts us + * to only making one bmap call per transaction, so it should + * be safe to have it as a local variable here. + */ + firstfsb = NULLFSBLOCK; + trace_xfs_bmap_deferred(tp->t_mountp, XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type, XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index ab2270a87196..d9b3d57a1921 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -329,7 +329,7 @@ xfs_reflink_convert_cow_extent( xfs_filblks_t count_fsb, struct xfs_defer_ops *dfops) { - xfs_fsblock_t first_block; + xfs_fsblock_t first_block = NULLFSBLOCK; int nimaps = 1; if (imap->br_state == XFS_EXT_NORM) -- cgit v1.2.3 From 10479e2dea83d4c421ad05dfc55d918aa8dfc0cd Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 17 Jul 2017 14:30:57 -0700 Subject: xfs: check _alloc_read_agf buffer pointer before using In some circumstances, _alloc_read_agf can return an error code of zero but also a null AGF buffer pointer. Check for this and jump out. Fixes-coverity-id: 1415250 Fixes-coverity-id: 1415320 Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_refcount.c | 4 ++++ fs/xfs/xfs_reflink.c | 2 ++ 2 files changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 900ea231f9a3..45b1c3b4e047 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1638,6 +1638,10 @@ xfs_refcount_recover_cow_leftovers( error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); if (error) goto out_trans; + if (!agbp) { + error = -ENOMEM; + goto out_trans; + } cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL); /* Find all the leftover CoW staging extents. */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index d9b3d57a1921..f45fbf0db9bb 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -170,6 +170,8 @@ xfs_reflink_find_shared( error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); if (error) return error; + if (!agbp) + return -ENOMEM; cur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, NULL); -- cgit v1.2.3 From ecc7b435d2beb025d7506af74cf749af2cef5734 Mon Sep 17 00:00:00 2001 From: Eryu Guan Date: Tue, 18 Jul 2017 13:32:32 +0800 Subject: nfs: count correct array for mnt3_counts array size Array size of mnt3_counts should be the size of array mnt3_procedures, not mnt_procedures, though they're same in size right now. Found this by code inspection. Fixes: 1c5876ddbdb4 ("sunrpc: move p_count out of struct rpc_procinfo") Cc: Christoph Hellwig Signed-off-by: Eryu Guan Reviewed-by: Christoph Hellwig Signed-off-by: Anna Schumaker --- fs/nfs/mount_clnt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 3efe946672be..60bad882c123 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -512,7 +512,7 @@ static const struct rpc_version mnt_version1 = { .counts = mnt_counts, }; -static unsigned int mnt3_counts[ARRAY_SIZE(mnt_procedures)]; +static unsigned int mnt3_counts[ARRAY_SIZE(mnt3_procedures)]; static const struct rpc_version mnt_version3 = { .number = 3, .nrprocs = ARRAY_SIZE(mnt3_procedures), -- cgit v1.2.3 From 15d4b73ac2232d6f2beb61d8b2400ea66e4da606 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 11 Jul 2017 17:54:32 -0400 Subject: NFS: Refactor NFS access to kernel access mask calculation Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 1255891e5695..24b3a6748062 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2375,16 +2375,31 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) } EXPORT_SYMBOL_GPL(nfs_access_add_cache); +#define NFS_MAY_READ (NFS4_ACCESS_READ) +#define NFS_MAY_WRITE (NFS4_ACCESS_MODIFY | \ + NFS4_ACCESS_EXTEND | \ + NFS4_ACCESS_DELETE) +#define NFS_MAY_LOOKUP (NFS4_ACCESS_LOOKUP) +#define NFS_MAY_EXECUTE (NFS4_ACCESS_EXECUTE) +static int +nfs_access_calc_mask(u32 access_result) +{ + int mask = 0; + + if (access_result & NFS_MAY_READ) + mask |= MAY_READ; + if (access_result & NFS_MAY_WRITE) + mask |= MAY_WRITE; + if (access_result & NFS_MAY_LOOKUP) + mask |= MAY_EXEC; + if (access_result & NFS_MAY_EXECUTE) + mask |= MAY_EXEC; + return mask; +} + void nfs_access_set_mask(struct nfs_access_entry *entry, u32 access_result) { - entry->mask = 0; - if (access_result & NFS4_ACCESS_READ) - entry->mask |= MAY_READ; - if (access_result & - (NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE)) - entry->mask |= MAY_WRITE; - if (access_result & (NFS4_ACCESS_LOOKUP|NFS4_ACCESS_EXECUTE)) - entry->mask |= MAY_EXEC; + entry->mask = nfs_access_calc_mask(access_result); } EXPORT_SYMBOL_GPL(nfs_access_set_mask); -- cgit v1.2.3 From eda3e20847788c453aa7ab478aeaceb56ed29cb6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 11 Jul 2017 17:54:33 -0400 Subject: NFSv3: Convert nfs3_proc_access() to use nfs_access_set_mask() Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/nfs3proc.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index df4a7d3ab915..d1e87ec0df84 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -220,15 +220,8 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); nfs_refresh_inode(inode, res.fattr); - if (status == 0) { - entry->mask = 0; - if (res.access & NFS3_ACCESS_READ) - entry->mask |= MAY_READ; - if (res.access & (NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND | NFS3_ACCESS_DELETE)) - entry->mask |= MAY_WRITE; - if (res.access & (NFS3_ACCESS_LOOKUP|NFS3_ACCESS_EXECUTE)) - entry->mask |= MAY_EXEC; - } + if (status == 0) + nfs_access_set_mask(entry, res.access); nfs_free_fattr(res.fattr); out: dprintk("NFS reply access: %d\n", status); -- cgit v1.2.3 From bd8b2441742b49c76bec707757bd9c028ea9838e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 11 Jul 2017 17:54:34 -0400 Subject: NFS: Store the raw NFS access mask in the inode's access cache Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 9 ++++++--- include/linux/nfs_fs.h | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 24b3a6748062..8fae8b00b8f5 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2399,7 +2399,7 @@ nfs_access_calc_mask(u32 access_result) void nfs_access_set_mask(struct nfs_access_entry *entry, u32 access_result) { - entry->mask = nfs_access_calc_mask(access_result); + entry->mask = access_result; } EXPORT_SYMBOL_GPL(nfs_access_set_mask); @@ -2407,6 +2407,7 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) { struct nfs_access_entry cache; bool may_block = (mask & MAY_NOT_BLOCK) == 0; + int cache_mask; int status; trace_nfs_access_enter(inode); @@ -2422,7 +2423,8 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) goto out; /* Be clever: ask server to check for all possible rights */ - cache.mask = MAY_EXEC | MAY_WRITE | MAY_READ; + cache.mask = NFS_MAY_LOOKUP | NFS_MAY_EXECUTE + | NFS_MAY_WRITE | NFS_MAY_READ; cache.cred = cred; cache.jiffies = jiffies; status = NFS_PROTO(inode)->access(inode, &cache); @@ -2436,7 +2438,8 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) } nfs_access_add_cache(inode, &cache); out_cached: - if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) != 0) + cache_mask = nfs_access_calc_mask(cache.mask); + if ((mask & ~cache_mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) != 0) status = -EACCES; out: trace_nfs_access_exit(inode, status); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index e52cc55ac300..5cc91d6381a3 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -51,7 +51,7 @@ struct nfs_access_entry { struct list_head lru; unsigned long jiffies; struct rpc_cred * cred; - int mask; + __u32 mask; struct rcu_head rcu_head; }; -- cgit v1.2.3 From ecbb903c56745d59c301db26dd7d8b74b520eb84 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 11 Jul 2017 17:54:35 -0400 Subject: NFS: Be more careful about mapping file permissions When mapping a directory, we want the MAY_WRITE permissions to reflect whether or not we have permission to modify, add and delete the directory entries. MAY_EXEC must map to lookup permissions. On the other hand, for files, we want MAY_WRITE to reflect a permission to modify and extend the file. Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/dir.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 8fae8b00b8f5..37a6180ee2e8 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2379,21 +2379,30 @@ EXPORT_SYMBOL_GPL(nfs_access_add_cache); #define NFS_MAY_WRITE (NFS4_ACCESS_MODIFY | \ NFS4_ACCESS_EXTEND | \ NFS4_ACCESS_DELETE) +#define NFS_FILE_MAY_WRITE (NFS4_ACCESS_MODIFY | \ + NFS4_ACCESS_EXTEND) +#define NFS_DIR_MAY_WRITE NFS_MAY_WRITE #define NFS_MAY_LOOKUP (NFS4_ACCESS_LOOKUP) #define NFS_MAY_EXECUTE (NFS4_ACCESS_EXECUTE) static int -nfs_access_calc_mask(u32 access_result) +nfs_access_calc_mask(u32 access_result, umode_t umode) { int mask = 0; if (access_result & NFS_MAY_READ) mask |= MAY_READ; - if (access_result & NFS_MAY_WRITE) - mask |= MAY_WRITE; - if (access_result & NFS_MAY_LOOKUP) - mask |= MAY_EXEC; - if (access_result & NFS_MAY_EXECUTE) - mask |= MAY_EXEC; + if (S_ISDIR(umode)) { + if ((access_result & NFS_DIR_MAY_WRITE) == NFS_DIR_MAY_WRITE) + mask |= MAY_WRITE; + if ((access_result & NFS_MAY_LOOKUP) == NFS_MAY_LOOKUP) + mask |= MAY_EXEC; + } else if (S_ISREG(umode)) { + if ((access_result & NFS_FILE_MAY_WRITE) == NFS_FILE_MAY_WRITE) + mask |= MAY_WRITE; + if ((access_result & NFS_MAY_EXECUTE) == NFS_MAY_EXECUTE) + mask |= MAY_EXEC; + } else if (access_result & NFS_MAY_WRITE) + mask |= MAY_WRITE; return mask; } @@ -2438,7 +2447,7 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) } nfs_access_add_cache(inode, &cache); out_cached: - cache_mask = nfs_access_calc_mask(cache.mask); + cache_mask = nfs_access_calc_mask(cache.mask, inode->i_mode); if ((mask & ~cache_mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) != 0) status = -EACCES; out: -- cgit v1.2.3 From 1ebf980127924c639e2b85c08468311ba1c95b70 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 20 Jul 2017 17:00:02 -0400 Subject: NFS/filelayout: Fix racy setting of fl->dsaddr in filelayout_check_deviceid() We must set fl->dsaddr once, and once only, even if there are multiple processes calling filelayout_check_deviceid() for the same layout segment. Reported-by: Olga Kornievskaia Signed-off-by: Trond Myklebust Signed-off-by: Anna Schumaker --- fs/nfs/filelayout/filelayout.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 080fc6b278bd..44c638b7876c 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -542,6 +542,10 @@ filelayout_check_deviceid(struct pnfs_layout_hdr *lo, struct nfs4_file_layout_dsaddr *dsaddr; int status = -EINVAL; + /* Is the deviceid already set? If so, we're good. */ + if (fl->dsaddr != NULL) + return 0; + /* find and reference the deviceid */ d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &fl->deviceid, lo->plh_lc_cred, gfp_flags); @@ -553,8 +557,6 @@ filelayout_check_deviceid(struct pnfs_layout_hdr *lo, if (filelayout_test_devid_unavailable(&dsaddr->id_node)) goto out_put; - fl->dsaddr = dsaddr; - if (fl->first_stripe_index >= dsaddr->stripe_count) { dprintk("%s Bad first_stripe_index %u\n", __func__, fl->first_stripe_index); @@ -570,6 +572,13 @@ filelayout_check_deviceid(struct pnfs_layout_hdr *lo, goto out_put; } status = 0; + + /* + * Atomic compare and xchange to ensure we don't scribble + * over a non-NULL pointer. + */ + if (cmpxchg(&fl->dsaddr, NULL, dsaddr) != NULL) + goto out_put; out: return status; out_put: -- cgit v1.2.3 From 144439376bfcd1d178e37bc08e27a58f82719bdb Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 19 Jul 2017 23:25:51 -0400 Subject: btrfs: fix lockup in find_free_extent with read-only block groups If we have a block group that is all of the following: 1) uncached in memory 2) is read-only 3) has a disk cache state that indicates we need to recreate the cache AND the file system has enough free space fragmentation such that the request for an extent of a given size can't be honored; AND have a single CPU core; AND it's the block group with the highest starting offset such that there are no opportunities (like reading from disk) for the loop to yield the CPU; We can end up with a lockup. The root cause is simple. Once we're in the position that we've read in all of the other block groups directly and none of those block groups can honor the request, there are no more opportunities to sleep. We end up trying to start a caching thread which never gets run if we only have one core. This *should* present as a hung task waiting on the caching thread to make some progress, but it doesn't. Instead, it degrades into a busy loop because of the placement of the read-only check. During the first pass through the loop, block_group->cached will be set to BTRFS_CACHE_STARTED and have_caching_bg will be set. Then we hit the read-only check and short circuit the loop. We're not yet in LOOP_CACHING_WAIT, so we skip that loop back before going through the loop again for other raid groups. Then we move to LOOP_CACHING_WAIT state. During the this pass through the loop, ->cached will still be BTRFS_CACHE_STARTED, which means it's not cached, so we'll enter cache_block_group, do a lot of nothing, and return, and also set have_caching_bg again. Then we hit the read-only check and short circuit the loop. The same thing happens as before except now we DO trigger the LOOP_CACHING_WAIT && have_caching_bg check and loop back up to the top. We do this forever. There are two fixes in this patch since they address the same underlying bug. The first is to add a cond_resched to the end of the loop to ensure that the caching thread always has an opportunity to run. This will fix the soft lockup issue, but find_free_extent will still loop doing nothing until the thread has completed. The second is to move the read-only check to the top of the loop. We're never going to return an allocation within a read-only block group so we may as well skip it early. The check for ->cached == BTRFS_CACHE_ERROR would cause the same problem except that BTRFS_CACHE_ERROR is considered a "done" state and we won't re-set have_caching_bg again. Many thanks to Stephan Kulow for his excellent help in the testing process. Signed-off-by: Jeff Mahoney Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 375f8c728d91..a6635f07b8f1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7589,6 +7589,10 @@ search: u64 offset; int cached; + /* If the block group is read-only, we can skip it entirely. */ + if (unlikely(block_group->ro)) + continue; + btrfs_grab_block_group(block_group, delalloc); search_start = block_group->key.objectid; @@ -7624,8 +7628,6 @@ have_block_group: if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) goto loop; - if (unlikely(block_group->ro)) - goto loop; /* * Ok we want to try and use the cluster allocator, so @@ -7839,6 +7841,7 @@ loop: failed_alloc = false; BUG_ON(index != get_block_group_index(block_group)); btrfs_release_block_group(block_group, delalloc); + cond_resched(); } up_read(&space_info->groups_sem); -- cgit v1.2.3 From 17024ad0a0fdfcfe53043afb969b813d3e020c21 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 20 Jul 2017 15:10:35 -0700 Subject: Btrfs: fix early ENOSPC due to delalloc If a lot of metadata is reserved for outstanding delayed allocations, we rely on shrink_delalloc() to reclaim metadata space in order to fulfill reservation tickets. However, shrink_delalloc() has a shortcut where if it determines that space can be overcommitted, it will stop early. This made sense before the ticketed enospc system, but now it means that shrink_delalloc() will often not reclaim enough space to fulfill any tickets, leading to an early ENOSPC. (Reservation tickets don't care about being able to overcommit, they need every byte accounted for.) Fix it by getting rid of the shortcut so that shrink_delalloc() reclaims all of the metadata it is supposed to. This fixes early ENOSPCs we were seeing when doing a btrfs receive to populate a new filesystem, as well as early ENOSPCs Christoph saw when doing a big cp -r onto Btrfs. Fixes: 957780eb2788 ("Btrfs: introduce ticketed enospc infrastructure") Tested-by: Christoph Anton Mitterer Cc: stable@vger.kernel.org Reviewed-by: Josef Bacik Signed-off-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a6635f07b8f1..e3b0b4196d3d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4825,10 +4825,6 @@ skip_async: else flush = BTRFS_RESERVE_NO_FLUSH; spin_lock(&space_info->lock); - if (can_overcommit(fs_info, space_info, orig, flush, false)) { - spin_unlock(&space_info->lock); - break; - } if (list_empty(&space_info->tickets) && list_empty(&space_info->priority_tickets)) { spin_unlock(&space_info->lock); -- cgit v1.2.3 From 0e4324a4c36b3eb5cd1f71cbbc38d888f919ebfc Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 21 Jul 2017 11:28:24 +0300 Subject: btrfs: round down size diff when shrinking/growing device Further testing showed that the fix introduced in 7dfb8be11b5d ("btrfs: Round down values which are written for total_bytes_size") was insufficient and it could still lead to discrepancies between the total_bytes in the super block and the device total bytes. So this patch also ensures that the difference between old/new sizes when shrinking/growing is also rounded down. This ensure that we won't be subtracting/adding a non-sectorsize multiples to the superblock/device total sizees. Fixes: 7dfb8be11b5d ("btrfs: Round down values which are written for total_bytes_size") Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c95f018d4a1e..b3c30719bf5d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2702,7 +2702,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, mutex_lock(&fs_info->chunk_mutex); old_total = btrfs_super_total_bytes(super_copy); - diff = new_size - device->total_bytes; + diff = round_down(new_size - device->total_bytes, fs_info->sectorsize); if (new_size <= device->total_bytes || device->is_tgtdev_for_dev_replace) { @@ -4406,7 +4406,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) u64 diff; new_size = round_down(new_size, fs_info->sectorsize); - diff = old_size - new_size; + diff = round_down(old_size - new_size, fs_info->sectorsize); if (device->is_tgtdev_for_dev_replace) return -EINVAL; -- cgit v1.2.3 From cfaf2d034360166e569a4929dd83ae9698bed856 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 24 Jul 2017 08:33:25 -0700 Subject: xfs: fix quotacheck dquot id overflow infinite loop If a dquot has an id of U32_MAX, the next lookup index increment overflows the uint32_t back to 0. This starts the lookup sequence over from the beginning, repeats indefinitely and results in a livelock. Update xfs_qm_dquot_walk() to explicitly check for the lookup overflow and exit the loop. Signed-off-by: Brian Foster Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_qm.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 6ce948c436d5..15751dc2a27d 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -111,6 +111,9 @@ restart: skipped = 0; break; } + /* we're done if id overflows back to zero */ + if (!next_index) + break; } if (skipped) { -- cgit v1.2.3 From 6215894e11de224183c89b001f5363912442b489 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Fri, 21 Jul 2017 11:04:23 -0700 Subject: xfs: check that dir block entries don't off the end of the buffer When we're checking the entries in a directory buffer, make sure that the entry length doesn't push us off the end of the buffer. Found via xfs/388 writing ones to the length fields. Signed-off-by: Darrick J. Wong Reviewed-by: Brian Foster --- fs/xfs/libxfs/xfs_dir2_data.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index d478065b9544..8727a43115ef 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -136,6 +136,8 @@ __xfs_dir3_data_check( */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { XFS_WANT_CORRUPTED_RETURN(mp, lastfree == 0); + XFS_WANT_CORRUPTED_RETURN(mp, endp >= + p + be16_to_cpu(dup->length)); XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == (char *)dup - (char *)hdr); @@ -164,6 +166,8 @@ __xfs_dir3_data_check( XFS_WANT_CORRUPTED_RETURN(mp, dep->namelen != 0); XFS_WANT_CORRUPTED_RETURN(mp, !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))); + XFS_WANT_CORRUPTED_RETURN(mp, endp >= + p + ops->data_entsize(dep->namelen)); XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(*ops->data_entry_tag_p(dep)) == (char *)dep - (char *)hdr); -- cgit v1.2.3 From 5b094d6dac0451ad89b1dc088395c7b399b7e9e8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 18 Jul 2017 11:16:51 -0700 Subject: xfs: fix multi-AG deadlock in xfs_bunmapi Just like in the allocator we must avoid touching multiple AGs out of order when freeing blocks, as freeing still locks the AGF and can cause the same AB-BA deadlocks as in the allocation path. Signed-off-by: Christoph Hellwig Reported-by: Nikolay Borisov Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/libxfs/xfs_bmap.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index ee118ceb702f..c09c16b1ad3b 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5435,6 +5435,7 @@ __xfs_bunmapi( xfs_fsblock_t sum; xfs_filblks_t len = *rlen; /* length to unmap in file */ xfs_fileoff_t max_len; + xfs_agnumber_t prev_agno = NULLAGNUMBER, agno; trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_); @@ -5534,6 +5535,17 @@ __xfs_bunmapi( */ del = got; wasdel = isnullstartblock(del.br_startblock); + + /* + * Make sure we don't touch multiple AGF headers out of order + * in a single transaction, as that could cause AB-BA deadlocks. + */ + if (!wasdel) { + agno = XFS_FSB_TO_AGNO(mp, del.br_startblock); + if (prev_agno != NULLAGNUMBER && prev_agno > agno) + break; + prev_agno = agno; + } if (got.br_startoff < start) { del.br_startoff = start; del.br_blockcount -= start - got.br_startoff; -- cgit v1.2.3 From 1e6f209515a08de29ec53b653eac73b50efd949c Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 25 Jul 2017 16:10:47 -0400 Subject: NFS: Use raw NFS access mask in nfs4_opendata_access() Commit bd8b2441742b ("NFS: Store the raw NFS access mask in the inode's access cache") changed how the access results are stored after an access() call. An NFS v4 OPEN might have access bits returned with the opendata, so we should use the NFS4_ACCESS values when determining the return value in nfs4_opendata_access(). Fixes: bd8b2441742b ("NFS: Store the raw NFS access mask in the inode's access cache") Reported-by: Eryu Guan Signed-off-by: Anna Schumaker Tested-by: Takashi Iwai --- fs/nfs/nfs4proc.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e1a26c653e78..583c2b38c908 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2236,7 +2236,7 @@ static int nfs4_opendata_access(struct rpc_cred *cred, int openflags) { struct nfs_access_entry cache; - u32 mask; + u32 mask, flags; /* access call failed or for some reason the server doesn't * support any access modes -- defer access call until later */ @@ -2250,16 +2250,20 @@ static int nfs4_opendata_access(struct rpc_cred *cred, */ if (openflags & __FMODE_EXEC) { /* ONLY check for exec rights */ - mask = MAY_EXEC; + if (S_ISDIR(state->inode->i_mode)) + mask = NFS4_ACCESS_LOOKUP; + else + mask = NFS4_ACCESS_EXECUTE; } else if ((fmode & FMODE_READ) && !opendata->file_created) - mask = MAY_READ; + mask = NFS4_ACCESS_READ; cache.cred = cred; cache.jiffies = jiffies; nfs_access_set_mask(&cache, opendata->o_res.access_result); nfs_access_add_cache(state->inode, &cache); - if ((mask & ~cache.mask & (MAY_READ | MAY_EXEC)) == 0) + flags = NFS4_ACCESS_READ | NFS4_ACCESS_EXECUTE | NFS4_ACCESS_LOOKUP; + if ((mask & ~cache.mask & flags) == 0) return 0; return -EACCES; -- cgit v1.2.3 From 442ce0499c0535f8972b68fa1fda357357a5c953 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 24 Jul 2017 13:18:50 +1000 Subject: NFS: invalidate file size when taking a lock. Prior to commit ca0daa277aca ("NFS: Cache aggressively when file is open for writing"), NFS would revalidate, or invalidate, the file size when taking a lock. Since that commit it only invalidates the file content. If the file size is changed on the server while wait for the lock, the client will have an incorrect understanding of the file size and could corrupt data. This particularly happens when writing beyond the (supposed) end of file and can be easily be demonstrated with posix_fallocate(). If an application opens an empty file, waits for a write lock, and then calls posix_fallocate(), glibc will determine that the underlying filesystem doesn't support fallocate (assuming version 4.1 or earlier) and will write out a '0' byte at the end of each 4K page in the region being fallocated that is after the end of the file. NFS will (usually) detect that these writes are beyond EOF and will expand them to cover the whole page, and then will merge the pages. Consequently, NFS will write out large blocks of zeroes beyond where it thought EOF was. If EOF had moved, the pre-existing part of the file will be over-written. Locking should have protected against this, but it doesn't. This patch restores the use of nfs_zap_caches() which invalidated the cached attributes. When posix_fallocate() asks for the file size, the request will go to the server and get a correct answer. cc: stable@vger.kernel.org (v4.8+) Fixes: ca0daa277aca ("NFS: Cache aggressively when file is open for writing") Signed-off-by: NeilBrown Signed-off-by: Anna Schumaker --- fs/nfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 5713eb32a45e..d264363559db 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -750,7 +750,7 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) */ nfs_sync_mapping(filp->f_mapping); if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) - nfs_zap_mapping(inode, filp->f_mapping); + nfs_zap_caches(inode); out: return status; } -- cgit v1.2.3 From 6ba80d4348bd8cce2a0a6bbc21e5e1e760de42a9 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 24 Jul 2017 13:18:50 +1000 Subject: NFS: Optimize fallocate by refreshing mapping when needed. posix_fallocate() will allocate space in an NFS file by considering the last byte of every 4K block. If it is before EOF, it will read the byte and if it is zero, a zero is written out. If it is after EOF, the zero is unconditionally written. For the blocks beyond EOF, if NFS believes its cache is valid, it will expand these writes to write full pages, and then will merge the pages. This results if (typically) 1MB writes. If NFS believes its cache is not valid (particularly if NFS_INO_INVALID_DATA or NFS_INO_REVAL_PAGECACHE are set - see nfs_write_pageuptodate()), it will send the individual 1-byte writes. This results in (typically) 256 times as many RPC requests, and can be substantially slower. Currently nfs_revalidate_mapping() is only used when reading a file or mmapping a file, as these are times when the content needs to be up-to-date. Writes don't generally need the cache to be up-to-date, but writes beyond EOF can benefit, particularly in the posix_fallocate() case. So this patch calls nfs_revalidate_mapping() when writing beyond EOF - i.e. when there is a gap between the end of the file and the start of the write. If the cache is thought to be out of date (as happens after taking a file lock), this will cause a GETATTR, and the two flags mentioned above will be cleared. With this, posix_fallocate() on a newly locked file does not generate excessive tiny writes. Signed-off-by: NeilBrown Signed-off-by: Anna Schumaker --- fs/nfs/file.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index d264363559db..af330c31f627 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -617,6 +617,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) if (result) goto out; } + if (iocb->ki_pos > i_size_read(inode)) + nfs_revalidate_mapping(inode, file->f_mapping); nfs_start_io_write(inode); result = generic_write_checks(iocb, from); -- cgit v1.2.3 From b7dbcc0e433f0f61acb89ed9861ec996be4f2b38 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 28 Jul 2017 12:33:54 -0400 Subject: NFSv4.1: Fix a race where CB_NOTIFY_LOCK fails to wake a waiter nfs4_retry_setlk() sets the task's state to TASK_INTERRUPTIBLE within the same region protected by the wait_queue's lock after checking for a notification from CB_NOTIFY_LOCK callback. However, after releasing that lock, a wakeup for that task may race in before the call to freezable_schedule_timeout_interruptible() and set TASK_WAKING, then freezable_schedule_timeout_interruptible() will set the state back to TASK_INTERRUPTIBLE before the task will sleep. The result is that the task will sleep for the entire duration of the timeout. Since we've already set TASK_INTERRUPTIBLE in the locked section, just use freezable_schedule_timout() instead. Fixes: a1d617d8f134 ("nfs: allow blocking locks to be awoken by lock callbacks") Signed-off-by: Benjamin Coddington Reviewed-by: Jeff Layton Cc: stable@vger.kernel.org # v4.9+ Signed-off-by: Anna Schumaker --- fs/nfs/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 583c2b38c908..93c1d7352238 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6495,7 +6495,7 @@ nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) set_current_state(TASK_INTERRUPTIBLE); spin_unlock_irqrestore(&q->lock, flags); - freezable_schedule_timeout_interruptible(NFS4_LOCK_MAXTIMEOUT); + freezable_schedule_timeout(NFS4_LOCK_MAXTIMEOUT); } finish_wait(q, &wait); -- cgit v1.2.3