summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/acl.c2
-rw-r--r--fs/9p/fid.h4
-rw-r--r--fs/9p/vfs_addr.c1
-rw-r--r--fs/9p/vfs_inode.c10
-rw-r--r--fs/9p/vfs_inode_dotl.c16
-rw-r--r--fs/Kconfig6
-rw-r--r--fs/Kconfig.binfmt3
-rw-r--r--fs/adfs/dir.c2
-rw-r--r--fs/affs/namei.c8
-rw-r--r--fs/attr.c19
-rw-r--r--fs/autofs4/waitq.c2
-rw-r--r--fs/binfmt_elf.c34
-rw-r--r--fs/binfmt_elf_fdpic.c38
-rw-r--r--fs/binfmt_em86.c3
-rw-r--r--fs/binfmt_flat.c525
-rw-r--r--fs/binfmt_misc.c12
-rw-r--r--fs/block_dev.c31
-rw-r--r--fs/btrfs/acl.c3
-rw-r--r--fs/btrfs/async-thread.c31
-rw-r--r--fs/btrfs/async-thread.h6
-rw-r--r--fs/btrfs/backref.c4
-rw-r--r--fs/btrfs/compression.c10
-rw-r--r--fs/btrfs/ctree.c91
-rw-r--r--fs/btrfs/ctree.h116
-rw-r--r--fs/btrfs/dedupe.h24
-rw-r--r--fs/btrfs/delayed-inode.c72
-rw-r--r--fs/btrfs/delayed-ref.c17
-rw-r--r--fs/btrfs/dev-replace.c4
-rw-r--r--fs/btrfs/disk-io.c101
-rw-r--r--fs/btrfs/disk-io.h3
-rw-r--r--fs/btrfs/extent-tree.c855
-rw-r--r--fs/btrfs/extent_io.c38
-rw-r--r--fs/btrfs/extent_map.c2
-rw-r--r--fs/btrfs/file-item.c4
-rw-r--r--fs/btrfs/file.c28
-rw-r--r--fs/btrfs/free-space-cache.c8
-rw-r--r--fs/btrfs/free-space-tree.c16
-rw-r--r--fs/btrfs/inode-map.c16
-rw-r--r--fs/btrfs/inode.c225
-rw-r--r--fs/btrfs/ioctl.c40
-rw-r--r--fs/btrfs/ordered-data.c2
-rw-r--r--fs/btrfs/props.c6
-rw-r--r--fs/btrfs/qgroup.c25
-rw-r--r--fs/btrfs/qgroup.h9
-rw-r--r--fs/btrfs/relocation.c65
-rw-r--r--fs/btrfs/root-tree.c10
-rw-r--r--fs/btrfs/scrub.c12
-rw-r--r--fs/btrfs/super.c218
-rw-r--r--fs/btrfs/sysfs.c2
-rw-r--r--fs/btrfs/tests/btrfs-tests.c67
-rw-r--r--fs/btrfs/tests/btrfs-tests.h36
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c23
-rw-r--r--fs/btrfs/tests/free-space-tests.c14
-rw-r--r--fs/btrfs/tests/free-space-tree-tests.c18
-rw-r--r--fs/btrfs/tests/inode-tests.c46
-rw-r--r--fs/btrfs/tests/qgroup-tests.c23
-rw-r--r--fs/btrfs/transaction.c37
-rw-r--r--fs/btrfs/transaction.h1
-rw-r--r--fs/btrfs/tree-log.c22
-rw-r--r--fs/btrfs/volumes.c138
-rw-r--r--fs/cachefiles/proc.c1
-rw-r--r--fs/ceph/addr.c77
-rw-r--r--fs/ceph/cache.c2
-rw-r--r--fs/ceph/caps.c873
-rw-r--r--fs/ceph/dir.c73
-rw-r--r--fs/ceph/file.c77
-rw-r--r--fs/ceph/inode.c44
-rw-r--r--fs/ceph/ioctl.c30
-rw-r--r--fs/ceph/mds_client.c360
-rw-r--r--fs/ceph/mds_client.h19
-rw-r--r--fs/ceph/snap.c10
-rw-r--r--fs/ceph/super.c43
-rw-r--r--fs/ceph/super.h48
-rw-r--r--fs/ceph/xattr.c101
-rw-r--r--fs/cifs/cifs_debug.c7
-rw-r--r--fs/cifs/cifs_fs_sb.h4
-rw-r--r--fs/cifs/cifsencrypt.c16
-rw-r--r--fs/cifs/cifsfs.c14
-rw-r--r--fs/cifs/connect.c53
-rw-r--r--fs/cifs/dir.c46
-rw-r--r--fs/cifs/inode.c22
-rw-r--r--fs/cifs/smb2ops.c32
-rw-r--r--fs/coda/pioctl.c1
-rw-r--r--fs/configfs/file.c6
-rw-r--r--fs/dax.c13
-rw-r--r--fs/dcache.c231
-rw-r--r--fs/debugfs/inode.c7
-rw-r--r--fs/devpts/inode.c3
-rw-r--r--fs/efivarfs/super.c4
-rw-r--r--fs/exec.c44
-rw-r--r--fs/ext4/mballoc.c1
-rw-r--r--fs/ext4/sysfs.c1
-rw-r--r--fs/f2fs/data.c1
-rw-r--r--fs/f2fs/super.c1
-rw-r--r--fs/fat/inode.c4
-rw-r--r--fs/fat/namei_msdos.c2
-rw-r--r--fs/fat/namei_vfat.c4
-rw-r--r--fs/freevxfs/Kconfig13
-rw-r--r--fs/freevxfs/vxfs.h185
-rw-r--r--fs/freevxfs/vxfs_bmap.c70
-rw-r--r--fs/freevxfs/vxfs_dir.h17
-rw-r--r--fs/freevxfs/vxfs_extern.h10
-rw-r--r--fs/freevxfs/vxfs_fshead.c37
-rw-r--r--fs/freevxfs/vxfs_fshead.h29
-rw-r--r--fs/freevxfs/vxfs_inode.c265
-rw-r--r--fs/freevxfs/vxfs_inode.h146
-rw-r--r--fs/freevxfs/vxfs_lookup.c226
-rw-r--r--fs/freevxfs/vxfs_olt.c15
-rw-r--r--fs/freevxfs/vxfs_olt.h70
-rw-r--r--fs/freevxfs/vxfs_super.c162
-rw-r--r--fs/fs-writeback.c5
-rw-r--r--fs/fscache/histogram.c1
-rw-r--r--fs/fscache/object-list.c1
-rw-r--r--fs/fscache/stats.c1
-rw-r--r--fs/fuse/dev.c32
-rw-r--r--fs/fuse/dir.c3
-rw-r--r--fs/fuse/file.c43
-rw-r--r--fs/fuse/fuse_i.h1
-rw-r--r--fs/fuse/inode.c2
-rw-r--r--fs/hfs/inode.c2
-rw-r--r--fs/hfs/string.c2
-rw-r--r--fs/hfsplus/inode.c2
-rw-r--r--fs/hfsplus/unicode.c2
-rw-r--r--fs/hostfs/hostfs_kern.c7
-rw-r--r--fs/hpfs/dentry.c2
-rw-r--r--fs/inode.c9
-rw-r--r--fs/ioctl.c1
-rw-r--r--fs/isofs/compress.c1
-rw-r--r--fs/isofs/inode.c14
-rw-r--r--fs/jffs2/dir.c8
-rw-r--r--fs/jffs2/readinode.c2
-rw-r--r--fs/jffs2/scan.c2
-rw-r--r--fs/jffs2/summary.c2
-rw-r--r--fs/jffs2/write.c4
-rw-r--r--fs/jfs/jfs_debug.c1
-rw-r--r--fs/jfs/jfs_logmgr.c1
-rw-r--r--fs/jfs/jfs_metapage.c1
-rw-r--r--fs/jfs/jfs_txnmgr.c2
-rw-r--r--fs/jfs/jfs_xtree.c1
-rw-r--r--fs/jfs/namei.c2
-rw-r--r--fs/kernfs/dir.c4
-rw-r--r--fs/kernfs/mount.c5
-rw-r--r--fs/lockd/procfs.c1
-rw-r--r--fs/logfs/dir.c6
-rw-r--r--fs/mpage.c2
-rw-r--r--fs/namei.c137
-rw-r--r--fs/namespace.c99
-rw-r--r--fs/ncpfs/dir.c2
-rw-r--r--fs/nfs/Makefile2
-rw-r--r--fs/nfs/blocklayout/dev.c110
-rw-r--r--fs/nfs/blocklayout/extent_tree.c27
-rw-r--r--fs/nfs/callback_proc.c64
-rw-r--r--fs/nfs/callback_xdr.c6
-rw-r--r--fs/nfs/client.c24
-rw-r--r--fs/nfs/dir.c67
-rw-r--r--fs/nfs/direct.c97
-rw-r--r--fs/nfs/file.c100
-rw-r--r--fs/nfs/filelayout/filelayout.c18
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c23
-rw-r--r--fs/nfs/inode.c138
-rw-r--r--fs/nfs/internal.h64
-rw-r--r--fs/nfs/io.c147
-rw-r--r--fs/nfs/nfs3client.c14
-rw-r--r--fs/nfs/nfs42proc.c24
-rw-r--r--fs/nfs/nfs42xdr.c12
-rw-r--r--fs/nfs/nfs4_fs.h1
-rw-r--r--fs/nfs/nfs4client.c26
-rw-r--r--fs/nfs/nfs4file.c16
-rw-r--r--fs/nfs/nfs4proc.c153
-rw-r--r--fs/nfs/nfs4trace.h4
-rw-r--r--fs/nfs/nfs4xdr.c11
-rw-r--r--fs/nfs/nfstrace.h5
-rw-r--r--fs/nfs/pnfs.c191
-rw-r--r--fs/nfs/pnfs.h34
-rw-r--r--fs/nfs/pnfs_nfs.c13
-rw-r--r--fs/nfs/super.c14
-rw-r--r--fs/nfs/write.c46
-rw-r--r--fs/nfsd/Kconfig19
-rw-r--r--fs/nfsd/Makefile1
-rw-r--r--fs/nfsd/blocklayout.c2
-rw-r--r--fs/nfsd/blocklayoutxdr.c4
-rw-r--r--fs/nfsd/export.c14
-rw-r--r--fs/nfsd/export.h2
-rw-r--r--fs/nfsd/flexfilelayout.c133
-rw-r--r--fs/nfsd/flexfilelayoutxdr.c115
-rw-r--r--fs/nfsd/flexfilelayoutxdr.h49
-rw-r--r--fs/nfsd/nfs4layouts.c16
-rw-r--r--fs/nfsd/nfs4proc.c48
-rw-r--r--fs/nfsd/nfs4state.c74
-rw-r--r--fs/nfsd/nfs4xdr.c81
-rw-r--r--fs/nfsd/nfsctl.c16
-rw-r--r--fs/nfsd/nfsd.h5
-rw-r--r--fs/nfsd/nfsfh.c18
-rw-r--r--fs/nfsd/nfsproc.c7
-rw-r--r--fs/nfsd/nfsxdr.c2
-rw-r--r--fs/nfsd/pnfs.h4
-rw-r--r--fs/nfsd/state.h1
-rw-r--r--fs/nfsd/stats.c1
-rw-r--r--fs/nfsd/vfs.c142
-rw-r--r--fs/nfsd/vfs.h3
-rw-r--r--fs/nfsd/xdr4.h5
-rw-r--r--fs/nilfs2/alloc.c45
-rw-r--r--fs/nilfs2/bmap.c4
-rw-r--r--fs/nilfs2/bmap.h2
-rw-r--r--fs/nilfs2/btnode.c4
-rw-r--r--fs/nilfs2/btree.c61
-rw-r--r--fs/nilfs2/btree.h2
-rw-r--r--fs/nilfs2/cpfile.c23
-rw-r--r--fs/nilfs2/cpfile.h3
-rw-r--r--fs/nilfs2/dat.c19
-rw-r--r--fs/nilfs2/dat.h1
-rw-r--r--fs/nilfs2/dir.c60
-rw-r--r--fs/nilfs2/direct.c10
-rw-r--r--fs/nilfs2/direct.h10
-rw-r--r--fs/nilfs2/gcinode.c9
-rw-r--r--fs/nilfs2/ifile.c7
-rw-r--r--fs/nilfs2/ifile.h1
-rw-r--r--fs/nilfs2/inode.c36
-rw-r--r--fs/nilfs2/ioctl.c48
-rw-r--r--fs/nilfs2/mdt.c6
-rw-r--r--fs/nilfs2/namei.c6
-rw-r--r--fs/nilfs2/nilfs.h48
-rw-r--r--fs/nilfs2/page.c45
-rw-r--r--fs/nilfs2/recovery.c72
-rw-r--r--fs/nilfs2/segbuf.c6
-rw-r--r--fs/nilfs2/segment.c61
-rw-r--r--fs/nilfs2/segment.h1
-rw-r--r--fs/nilfs2/sufile.c44
-rw-r--r--fs/nilfs2/sufile.h1
-rw-r--r--fs/nilfs2/super.c206
-rw-r--r--fs/nilfs2/sysfs.c74
-rw-r--r--fs/nilfs2/the_nilfs.c134
-rw-r--r--fs/nilfs2/the_nilfs.h11
-rw-r--r--fs/ntfs/inode.c2
-rw-r--r--fs/ntfs/namei.c2
-rw-r--r--fs/ocfs2/alloc.c37
-rw-r--r--fs/ocfs2/alloc.h2
-rw-r--r--fs/ocfs2/aops.c39
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h4
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c53
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c29
-rw-r--r--fs/ocfs2/dlm/dlmthread.c57
-rw-r--r--fs/ocfs2/quota_global.c2
-rw-r--r--fs/ocfs2/stack_user.c11
-rw-r--r--fs/ocfs2/suballoc.c20
-rw-r--r--fs/ocfs2/xattr.c2
-rw-r--r--fs/open.c8
-rw-r--r--fs/orangefs/dcache.c4
-rw-r--r--fs/orangefs/inode.c8
-rw-r--r--fs/orangefs/namei.c22
-rw-r--r--fs/orangefs/orangefs-kernel.h12
-rw-r--r--fs/orangefs/orangefs-mod.c2
-rw-r--r--fs/orangefs/orangefs-sysfs.c43
-rw-r--r--fs/orangefs/orangefs-utils.c38
-rw-r--r--fs/orangefs/protocol.h8
-rw-r--r--fs/orangefs/symlink.c2
-rw-r--r--fs/overlayfs/copy_up.c1
-rw-r--r--fs/overlayfs/dir.c226
-rw-r--r--fs/overlayfs/inode.c271
-rw-r--r--fs/overlayfs/overlayfs.h32
-rw-r--r--fs/overlayfs/super.c198
-rw-r--r--fs/posix_acl.c8
-rw-r--r--fs/proc/Makefile1
-rw-r--r--fs/proc/base.c192
-rw-r--r--fs/proc/inode.c15
-rw-r--r--fs/proc/internal.h3
-rw-r--r--fs/proc/meminfo.c22
-rw-r--r--fs/proc/proc_sysctl.c2
-rw-r--r--fs/proc/root.c61
-rw-r--r--fs/proc/stat.c10
-rw-r--r--fs/pstore/ram.c29
-rw-r--r--fs/quota/dquot.c24
-rw-r--r--fs/quota/quota.c14
-rw-r--r--fs/reiserfs/ibalance.c3
-rw-r--r--fs/super.c69
-rw-r--r--fs/sysfs/mount.c5
-rw-r--r--fs/sysv/namei.c2
-rw-r--r--fs/tracefs/inode.c7
-rw-r--r--fs/ubifs/gc.c4
-rw-r--r--fs/ubifs/super.c19
-rw-r--r--fs/ubifs/ubifs.h4
-rw-r--r--fs/ubifs/xattr.c6
-rw-r--r--fs/ufs/dir.c17
-rw-r--r--fs/xattr.c7
-rw-r--r--fs/xfs/Makefile3
-rw-r--r--fs/xfs/xfs_export.c2
-rw-r--r--fs/xfs/xfs_pnfs.h4
-rw-r--r--fs/xfs/xfs_stats.c1
288 files changed, 7025 insertions, 5054 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 0576eaeb60b9..5b6a1743ea17 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -266,7 +266,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
if (IS_ERR(acl))
return PTR_ERR(acl);
else if (acl) {
- retval = posix_acl_valid(acl);
+ retval = posix_acl_valid(inode->i_sb->s_user_ns, acl);
if (retval)
goto err_out;
}
diff --git a/fs/9p/fid.h b/fs/9p/fid.h
index 2b6787fcb626..12700df0bb51 100644
--- a/fs/9p/fid.h
+++ b/fs/9p/fid.h
@@ -24,6 +24,10 @@
#include <linux/list.h>
struct p9_fid *v9fs_fid_lookup(struct dentry *dentry);
+static inline struct p9_fid *v9fs_parent_fid(struct dentry *dentry)
+{
+ return v9fs_fid_lookup(dentry->d_parent);
+}
struct p9_fid *v9fs_fid_clone(struct dentry *dentry);
void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid);
struct p9_fid *v9fs_writeback_fid(struct dentry *dentry);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index c37fb9c08970..6181ad79e1a5 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -231,7 +231,6 @@ static int v9fs_launder_page(struct page *page)
/**
* v9fs_direct_IO - 9P address space operation for direct I/O
* @iocb: target I/O control block
- * @pos: offset in file to begin the operation
*
* The presence of v9fs_direct_IO() in the address space ops vector
* allowes open() O_DIRECT flags which would have failed otherwise.
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index e2e7c749925a..7da9a8354fad 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -595,7 +595,7 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags)
v9ses = v9fs_inode2v9ses(dir);
inode = d_inode(dentry);
- dfid = v9fs_fid_lookup(dentry->d_parent);
+ dfid = v9fs_parent_fid(dentry);
if (IS_ERR(dfid)) {
retval = PTR_ERR(dfid);
p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", retval);
@@ -653,7 +653,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
ofid = NULL;
fid = NULL;
name = (char *) dentry->d_name.name;
- dfid = v9fs_fid_lookup(dentry->d_parent);
+ dfid = v9fs_parent_fid(dentry);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
@@ -798,7 +798,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
v9ses = v9fs_inode2v9ses(dir);
/* We can walk d_parent because we hold the dir->i_mutex */
- dfid = v9fs_fid_lookup(dentry->d_parent);
+ dfid = v9fs_parent_fid(dentry);
if (IS_ERR(dfid))
return ERR_CAST(dfid);
@@ -975,13 +975,13 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (IS_ERR(oldfid))
return PTR_ERR(oldfid);
- olddirfid = v9fs_fid_clone(old_dentry->d_parent);
+ olddirfid = v9fs_parent_fid(old_dentry);
if (IS_ERR(olddirfid)) {
retval = PTR_ERR(olddirfid);
goto done;
}
- newdirfid = v9fs_fid_clone(new_dentry->d_parent);
+ newdirfid = v9fs_parent_fid(new_dentry);
if (IS_ERR(newdirfid)) {
retval = PTR_ERR(newdirfid);
goto clunk_olddir;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 1b51eaa5e2dd..2ed04c2fe7af 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -273,7 +273,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%hx\n",
name, flags, omode);
- dfid = v9fs_fid_lookup(dentry->d_parent);
+ dfid = v9fs_parent_fid(dentry);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
@@ -389,7 +389,6 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
umode_t mode;
struct inode *inode;
struct p9_qid qid;
- struct dentry *dir_dentry;
struct posix_acl *dacl = NULL, *pacl = NULL;
p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry);
@@ -400,8 +399,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
if (dir->i_mode & S_ISGID)
omode |= S_ISGID;
- dir_dentry = dentry->d_parent;
- dfid = v9fs_fid_lookup(dir_dentry);
+ dfid = v9fs_parent_fid(dentry);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
@@ -691,7 +689,7 @@ v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry,
p9_debug(P9_DEBUG_VFS, "%lu,%s,%s\n", dir->i_ino, name, symname);
v9ses = v9fs_inode2v9ses(dir);
- dfid = v9fs_fid_lookup(dentry->d_parent);
+ dfid = v9fs_parent_fid(dentry);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
@@ -762,7 +760,6 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
struct dentry *dentry)
{
int err;
- struct dentry *dir_dentry;
struct p9_fid *dfid, *oldfid;
struct v9fs_session_info *v9ses;
@@ -770,8 +767,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
dir->i_ino, old_dentry, dentry);
v9ses = v9fs_inode2v9ses(dir);
- dir_dentry = dentry->d_parent;
- dfid = v9fs_fid_lookup(dir_dentry);
+ dfid = v9fs_parent_fid(dentry);
if (IS_ERR(dfid))
return PTR_ERR(dfid);
@@ -822,7 +818,6 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
struct p9_fid *fid = NULL, *dfid = NULL;
struct inode *inode;
struct p9_qid qid;
- struct dentry *dir_dentry;
struct posix_acl *dacl = NULL, *pacl = NULL;
p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %hx MAJOR: %u MINOR: %u\n",
@@ -830,8 +825,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
MAJOR(rdev), MINOR(rdev));
v9ses = v9fs_inode2v9ses(dir);
- dir_dentry = dentry->d_parent;
- dfid = v9fs_fid_lookup(dir_dentry);
+ dfid = v9fs_parent_fid(dentry);
if (IS_ERR(dfid)) {
err = PTR_ERR(dfid);
p9_debug(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
diff --git a/fs/Kconfig b/fs/Kconfig
index 4524916fa200..2bc7ad775842 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -70,6 +70,12 @@ config FS_POSIX_ACL
config EXPORTFS
tristate
+config EXPORTFS_BLOCK_OPS
+ bool "Enable filesystem export operations for block IO"
+ help
+ This option enables the export operations for a filesystem to support
+ external block IO.
+
config FILE_LOCKING
bool "Enable POSIX file locking API" if EXPERT
default y
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 72c03354c14b..c7efddf6e038 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -89,7 +89,8 @@ config BINFMT_SCRIPT
config BINFMT_FLAT
bool "Kernel support for flat binaries"
- depends on !MMU && (!FRV || BROKEN)
+ depends on !MMU || M68K
+ depends on !FRV || BROKEN
help
Support uClinux FLAT format binaries.
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index fd4cf2c48e48..bec25f7017c0 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -207,7 +207,7 @@ adfs_hash(const struct dentry *parent, struct qstr *qstr)
*/
qstr->len = i = name_len;
name = qstr->name;
- hash = init_name_hash();
+ hash = init_name_hash(parent);
while (i--) {
char c;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 00d3002a6780..eb32029bc776 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -61,7 +61,7 @@ affs_get_toupper(struct super_block *sb)
* Note: the dentry argument is the parent dentry.
*/
static inline int
-__affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate)
+__affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr, toupper_t toupper, bool notruncate)
{
const u8 *name = qstr->name;
unsigned long hash;
@@ -72,7 +72,7 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate)
if (retval)
return retval;
- hash = init_name_hash();
+ hash = init_name_hash(dentry);
len = min(qstr->len, AFFSNAMEMAX);
for (; len > 0; name++, len--)
hash = partial_name_hash(toupper(*name), hash);
@@ -84,7 +84,7 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate)
static int
affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
{
- return __affs_hash_dentry(qstr, affs_toupper,
+ return __affs_hash_dentry(dentry, qstr, affs_toupper,
affs_nofilenametruncate(dentry));
}
@@ -92,7 +92,7 @@ affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
static int
affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
{
- return __affs_hash_dentry(qstr, affs_intl_toupper,
+ return __affs_hash_dentry(dentry, qstr, affs_intl_toupper,
affs_nofilenametruncate(dentry));
}
diff --git a/fs/attr.c b/fs/attr.c
index 25b24d0f6c88..42bb42bb3c72 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -255,6 +255,25 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
if (!(attr->ia_valid & ~(ATTR_KILL_SUID | ATTR_KILL_SGID)))
return 0;
+ /*
+ * Verify that uid/gid changes are valid in the target
+ * namespace of the superblock.
+ */
+ if (ia_valid & ATTR_UID &&
+ !kuid_has_mapping(inode->i_sb->s_user_ns, attr->ia_uid))
+ return -EOVERFLOW;
+ if (ia_valid & ATTR_GID &&
+ !kgid_has_mapping(inode->i_sb->s_user_ns, attr->ia_gid))
+ return -EOVERFLOW;
+
+ /* Don't allow modifications of files with invalid uids or
+ * gids unless those uids & gids are being made valid.
+ */
+ if (!(ia_valid & ATTR_UID) && !uid_valid(inode->i_uid))
+ return -EOVERFLOW;
+ if (!(ia_valid & ATTR_GID) && !gid_valid(inode->i_gid))
+ return -EOVERFLOW;
+
error = security_inode_setattr(dentry, attr);
if (error)
return error;
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 631f1554c87b..708214457d16 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -398,7 +398,7 @@ int autofs4_wait(struct autofs_sb_info *sbi,
}
}
qstr.name = name;
- qstr.hash = full_name_hash(name, qstr.len);
+ qstr.hash = full_name_hash(dentry, name, qstr.len);
if (mutex_lock_interruptible(&sbi->wq_mutex)) {
kfree(qstr.name);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index a7a28110dc80..7f6aff3f72eb 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -605,28 +605,30 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
* Do the same thing for the memory mapping - between
* elf_bss and last_bss is the bss section.
*/
- k = load_addr + eppnt->p_memsz + eppnt->p_vaddr;
+ k = load_addr + eppnt->p_vaddr + eppnt->p_memsz;
if (k > last_bss)
last_bss = k;
}
}
+ /*
+ * Now fill out the bss section: first pad the last page from
+ * the file up to the page boundary, and zero it from elf_bss
+ * up to the end of the page.
+ */
+ if (padzero(elf_bss)) {
+ error = -EFAULT;
+ goto out;
+ }
+ /*
+ * Next, align both the file and mem bss up to the page size,
+ * since this is where elf_bss was just zeroed up to, and where
+ * last_bss will end after the vm_brk() below.
+ */
+ elf_bss = ELF_PAGEALIGN(elf_bss);
+ last_bss = ELF_PAGEALIGN(last_bss);
+ /* Finally, if there is still more bss to allocate, do it. */
if (last_bss > elf_bss) {
- /*
- * Now fill out the bss section. First pad the last page up
- * to the page boundary, and then perform a mmap to make sure
- * that there are zero-mapped pages up to and including the
- * last bss page.
- */
- if (padzero(elf_bss)) {
- error = -EFAULT;
- goto out;
- }
-
- /* What we have mapped so far */
- elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
-
- /* Map the last of the bss segment */
error = vm_brk(elf_bss, last_bss - elf_bss);
if (error)
goto out;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 203589311bf8..464a972e88c1 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -67,8 +67,6 @@ static int create_elf_fdpic_tables(struct linux_binprm *, struct mm_struct *,
struct elf_fdpic_params *);
#ifndef CONFIG_MMU
-static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *,
- unsigned long *);
static int elf_fdpic_map_file_constdisp_on_uclinux(struct elf_fdpic_params *,
struct file *,
struct mm_struct *);
@@ -515,8 +513,9 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
sp = mm->start_stack;
/* stack the program arguments and environment */
- if (elf_fdpic_transfer_args_to_stack(bprm, &sp) < 0)
+ if (transfer_args_to_stack(bprm, &sp) < 0)
return -EFAULT;
+ sp &= ~15;
#endif
/*
@@ -711,39 +710,6 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
/*****************************************************************************/
/*
- * transfer the program arguments and environment from the holding pages onto
- * the stack
- */
-#ifndef CONFIG_MMU
-static int elf_fdpic_transfer_args_to_stack(struct linux_binprm *bprm,
- unsigned long *_sp)
-{
- unsigned long index, stop, sp;
- char *src;
- int ret = 0;
-
- stop = bprm->p >> PAGE_SHIFT;
- sp = *_sp;
-
- for (index = MAX_ARG_PAGES - 1; index >= stop; index--) {
- src = kmap(bprm->page[index]);
- sp -= PAGE_SIZE;
- if (copy_to_user((void *) sp, src, PAGE_SIZE) != 0)
- ret = -EFAULT;
- kunmap(bprm->page[index]);
- if (ret < 0)
- goto out;
- }
-
- *_sp = (*_sp - (MAX_ARG_PAGES * PAGE_SIZE - bprm->p)) & ~15;
-
-out:
- return ret;
-}
-#endif
-
-/*****************************************************************************/
-/*
* load the appropriate binary image (executable or interpreter) into memory
* - we assume no MMU is available
* - if no other PIC bits are set in params->hdr->e_flags
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index 490538536cb4..dd2d3f0cd55d 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -24,7 +24,8 @@
static int load_em86(struct linux_binprm *bprm)
{
- char *interp, *i_name, *i_arg;
+ const char *i_name, *i_arg;
+ char *interp;
struct file * file;
int retval;
struct elfhdr elf_ex;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index caf9e39bb82b..9b2917a30294 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -15,7 +15,8 @@
* JAN/99 -- coded full program relocation (gerg@snapgear.com)
*/
-#include <linux/export.h>
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
@@ -25,8 +26,6 @@
#include <linux/string.h>
#include <linux/fs.h>
#include <linux/file.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
#include <linux/ptrace.h>
#include <linux/user.h>
#include <linux/slab.h>
@@ -34,26 +33,16 @@
#include <linux/personality.h>
#include <linux/init.h>
#include <linux/flat.h>
-#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
#include <asm/byteorder.h>
-#include <asm/uaccess.h>
#include <asm/unaligned.h>
#include <asm/cacheflush.h>
#include <asm/page.h>
/****************************************************************************/
-#if 0
-#define DEBUG 1
-#endif
-
-#ifdef DEBUG
-#define DBG_FLT(a...) printk(a)
-#else
-#define DBG_FLT(a...)
-#endif
-
/*
* User data (data section and bss) needs to be aligned.
* We pick 0x20 here because it is the max value elf2flt has always
@@ -80,7 +69,7 @@ struct lib_info {
unsigned long text_len; /* Length of text segment */
unsigned long entry; /* Start address for this module */
unsigned long build_date; /* When this one was compiled */
- short loaded; /* Has this library been loaded? */
+ bool loaded; /* Has this library been loaded? */
} lib_list[MAX_SHARED_LIBS];
};
@@ -106,59 +95,67 @@ static struct linux_binfmt flat_format = {
static int flat_core_dump(struct coredump_params *cprm)
{
- printk("Process %s:%d received signr %d and should have core dumped\n",
- current->comm, current->pid, (int) cprm->siginfo->si_signo);
- return(1);
+ pr_warn("Process %s:%d received signr %d and should have core dumped\n",
+ current->comm, current->pid, cprm->siginfo->si_signo);
+ return 1;
}
/****************************************************************************/
/*
* create_flat_tables() parses the env- and arg-strings in new user
* memory and creates the pointer tables from them, and puts their
- * addresses on the "stack", returning the new stack pointer value.
+ * addresses on the "stack", recording the new stack pointer value.
*/
-static unsigned long create_flat_tables(
- unsigned long pp,
- struct linux_binprm * bprm)
+static int create_flat_tables(struct linux_binprm *bprm, unsigned long arg_start)
{
- unsigned long *argv,*envp;
- unsigned long * sp;
- char * p = (char*)pp;
- int argc = bprm->argc;
- int envc = bprm->envc;
- char uninitialized_var(dummy);
-
- sp = (unsigned long *)p;
- sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
- sp = (unsigned long *) ((unsigned long)sp & -FLAT_STACK_ALIGN);
- argv = sp + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
- envp = argv + (argc + 1);
+ char __user *p;
+ unsigned long __user *sp;
+ long i, len;
+
+ p = (char __user *)arg_start;
+ sp = (unsigned long __user *)current->mm->start_stack;
+
+ sp -= bprm->envc + 1;
+ sp -= bprm->argc + 1;
+ sp -= flat_argvp_envp_on_stack() ? 2 : 0;
+ sp -= 1; /* &argc */
+ current->mm->start_stack = (unsigned long)sp & -FLAT_STACK_ALIGN;
+ sp = (unsigned long __user *)current->mm->start_stack;
+
+ __put_user(bprm->argc, sp++);
if (flat_argvp_envp_on_stack()) {
- put_user((unsigned long) envp, sp + 2);
- put_user((unsigned long) argv, sp + 1);
- }
-
- put_user(argc, sp);
- current->mm->arg_start = (unsigned long) p;
- while (argc-->0) {
- put_user((unsigned long) p, argv++);
- do {
- get_user(dummy, p); p++;
- } while (dummy);
- }
- put_user((unsigned long) NULL, argv);
- current->mm->arg_end = current->mm->env_start = (unsigned long) p;
- while (envc-->0) {
- put_user((unsigned long)p, envp); envp++;
- do {
- get_user(dummy, p); p++;
- } while (dummy);
- }
- put_user((unsigned long) NULL, envp);
- current->mm->env_end = (unsigned long) p;
- return (unsigned long)sp;
+ unsigned long argv, envp;
+ argv = (unsigned long)(sp + 2);
+ envp = (unsigned long)(sp + 2 + bprm->argc + 1);
+ __put_user(argv, sp++);
+ __put_user(envp, sp++);
+ }
+
+ current->mm->arg_start = (unsigned long)p;
+ for (i = bprm->argc; i > 0; i--) {
+ __put_user((unsigned long)p, sp++);
+ len = strnlen_user(p, MAX_ARG_STRLEN);
+ if (!len || len > MAX_ARG_STRLEN)
+ return -EINVAL;
+ p += len;
+ }
+ __put_user(0, sp++);
+ current->mm->arg_end = (unsigned long)p;
+
+ current->mm->env_start = (unsigned long) p;
+ for (i = bprm->envc; i > 0; i--) {
+ __put_user((unsigned long)p, sp++);
+ len = strnlen_user(p, MAX_ARG_STRLEN);
+ if (!len || len > MAX_ARG_STRLEN)
+ return -EINVAL;
+ p += len;
+ }
+ __put_user(0, sp++);
+ current->mm->env_end = (unsigned long)p;
+
+ return 0;
}
/****************************************************************************/
@@ -190,17 +187,17 @@ static int decompress_exec(
loff_t fpos;
int ret, retval;
- DBG_FLT("decompress_exec(offset=%x,buf=%x,len=%x)\n",(int)offset, (int)dst, (int)len);
+ pr_debug("decompress_exec(offset=%lx,buf=%p,len=%lx)\n", offset, dst, len);
memset(&strm, 0, sizeof(strm));
strm.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL);
if (strm.workspace == NULL) {
- DBG_FLT("binfmt_flat: no memory for decompress workspace\n");
+ pr_debug("no memory for decompress workspace\n");
return -ENOMEM;
}
buf = kmalloc(LBUFSIZE, GFP_KERNEL);
if (buf == NULL) {
- DBG_FLT("binfmt_flat: no memory for read buffer\n");
+ pr_debug("no memory for read buffer\n");
retval = -ENOMEM;
goto out_free;
}
@@ -218,49 +215,49 @@ static int decompress_exec(
/* Check minimum size -- gzip header */
if (ret < 10) {
- DBG_FLT("binfmt_flat: file too small?\n");
+ pr_debug("file too small?\n");
goto out_free_buf;
}
/* Check gzip magic number */
if ((buf[0] != 037) || ((buf[1] != 0213) && (buf[1] != 0236))) {
- DBG_FLT("binfmt_flat: unknown compression magic?\n");
+ pr_debug("unknown compression magic?\n");
goto out_free_buf;
}
/* Check gzip method */
if (buf[2] != 8) {
- DBG_FLT("binfmt_flat: unknown compression method?\n");
+ pr_debug("unknown compression method?\n");
goto out_free_buf;
}
/* Check gzip flags */
if ((buf[3] & ENCRYPTED) || (buf[3] & CONTINUATION) ||
(buf[3] & RESERVED)) {
- DBG_FLT("binfmt_flat: unknown flags?\n");
+ pr_debug("unknown flags?\n");
goto out_free_buf;
}
ret = 10;
if (buf[3] & EXTRA_FIELD) {
ret += 2 + buf[10] + (buf[11] << 8);
- if (unlikely(LBUFSIZE <= ret)) {
- DBG_FLT("binfmt_flat: buffer overflow (EXTRA)?\n");
+ if (unlikely(ret >= LBUFSIZE)) {
+ pr_debug("buffer overflow (EXTRA)?\n");
goto out_free_buf;
}
}
if (buf[3] & ORIG_NAME) {
while (ret < LBUFSIZE && buf[ret++] != 0)
;
- if (unlikely(LBUFSIZE == ret)) {
- DBG_FLT("binfmt_flat: buffer overflow (ORIG_NAME)?\n");
+ if (unlikely(ret == LBUFSIZE)) {
+ pr_debug("buffer overflow (ORIG_NAME)?\n");
goto out_free_buf;
}
}
if (buf[3] & COMMENT) {
while (ret < LBUFSIZE && buf[ret++] != 0)
;
- if (unlikely(LBUFSIZE == ret)) {
- DBG_FLT("binfmt_flat: buffer overflow (COMMENT)?\n");
+ if (unlikely(ret == LBUFSIZE)) {
+ pr_debug("buffer overflow (COMMENT)?\n");
goto out_free_buf;
}
}
@@ -273,7 +270,7 @@ static int decompress_exec(
strm.total_out = 0;
if (zlib_inflateInit2(&strm, -MAX_WBITS) != Z_OK) {
- DBG_FLT("binfmt_flat: zlib init failed?\n");
+ pr_debug("zlib init failed?\n");
goto out_free_buf;
}
@@ -290,7 +287,7 @@ static int decompress_exec(
}
if (ret < 0) {
- DBG_FLT("binfmt_flat: decompression failed (%d), %s\n",
+ pr_debug("decompression failed (%d), %s\n",
ret, strm.msg);
goto out_zlib;
}
@@ -327,24 +324,23 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
r &= 0x00ffffff; /* Trim ID off here */
}
if (id >= MAX_SHARED_LIBS) {
- printk("BINFMT_FLAT: reference 0x%x to shared library %d",
- (unsigned) r, id);
+ pr_err("reference 0x%lx to shared library %d", r, id);
goto failed;
}
if (curid != id) {
if (internalp) {
- printk("BINFMT_FLAT: reloc address 0x%x not in same module "
- "(%d != %d)", (unsigned) r, curid, id);
+ pr_err("reloc address 0x%lx not in same module "
+ "(%d != %d)", r, curid, id);
goto failed;
- } else if ( ! p->lib_list[id].loaded &&
- load_flat_shared_library(id, p) < 0) {
- printk("BINFMT_FLAT: failed to load library %d", id);
+ } else if (!p->lib_list[id].loaded &&
+ load_flat_shared_library(id, p) < 0) {
+ pr_err("failed to load library %d", id);
goto failed;
}
/* Check versioning information (i.e. time stamps) */
if (p->lib_list[id].build_date && p->lib_list[curid].build_date &&
p->lib_list[curid].build_date < p->lib_list[id].build_date) {
- printk("BINFMT_FLAT: library %d is younger than %d", id, curid);
+ pr_err("library %d is younger than %d", id, curid);
goto failed;
}
}
@@ -358,8 +354,8 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
text_len = p->lib_list[id].text_len;
if (!flat_reloc_valid(r, start_brk - start_data + text_len)) {
- printk("BINFMT_FLAT: reloc outside program 0x%x (0 - 0x%x/0x%x)",
- (int) r,(int)(start_brk-start_data+text_len),(int)text_len);
+ pr_err("reloc outside program 0x%lx (0 - 0x%lx/0x%lx)",
+ r, start_brk-start_data+text_len, text_len);
goto failed;
}
@@ -369,10 +365,10 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp)
addr = r - text_len + start_data;
/* Range checked already above so doing the range tests is redundant...*/
- return(addr);
+ return addr;
failed:
- printk(", killing %s!\n", current->comm);
+ pr_cont(", killing %s!\n", current->comm);
send_sig(SIGSEGV, current, 0);
return RELOC_FAILED;
@@ -382,62 +378,57 @@ failed:
static void old_reloc(unsigned long rl)
{
-#ifdef DEBUG
- char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" };
-#endif
+ static const char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" };
flat_v2_reloc_t r;
- unsigned long *ptr;
-
+ unsigned long __user *ptr;
+ unsigned long val;
+
r.value = rl;
#if defined(CONFIG_COLDFIRE)
- ptr = (unsigned long *) (current->mm->start_code + r.reloc.offset);
+ ptr = (unsigned long __user *)(current->mm->start_code + r.reloc.offset);
#else
- ptr = (unsigned long *) (current->mm->start_data + r.reloc.offset);
+ ptr = (unsigned long __user *)(current->mm->start_data + r.reloc.offset);
#endif
+ get_user(val, ptr);
+
+ pr_debug("Relocation of variable at DATASEG+%x "
+ "(address %p, currently %lx) into segment %s\n",
+ r.reloc.offset, ptr, val, segment[r.reloc.type]);
-#ifdef DEBUG
- printk("Relocation of variable at DATASEG+%x "
- "(address %p, currently %x) into segment %s\n",
- r.reloc.offset, ptr, (int)*ptr, segment[r.reloc.type]);
-#endif
-
switch (r.reloc.type) {
case OLD_FLAT_RELOC_TYPE_TEXT:
- *ptr += current->mm->start_code;
+ val += current->mm->start_code;
break;
case OLD_FLAT_RELOC_TYPE_DATA:
- *ptr += current->mm->start_data;
+ val += current->mm->start_data;
break;
case OLD_FLAT_RELOC_TYPE_BSS:
- *ptr += current->mm->end_data;
+ val += current->mm->end_data;
break;
default:
- printk("BINFMT_FLAT: Unknown relocation type=%x\n", r.reloc.type);
+ pr_err("Unknown relocation type=%x\n", r.reloc.type);
break;
}
+ put_user(val, ptr);
-#ifdef DEBUG
- printk("Relocation became %x\n", (int)*ptr);
-#endif
-}
+ pr_debug("Relocation became %lx\n", val);
+}
/****************************************************************************/
-static int load_flat_file(struct linux_binprm * bprm,
+static int load_flat_file(struct linux_binprm *bprm,
struct lib_info *libinfo, int id, unsigned long *extra_stack)
{
- struct flat_hdr * hdr;
- unsigned long textpos = 0, datapos = 0, result;
- unsigned long realdatastart = 0;
- unsigned long text_len, data_len, bss_len, stack_len, flags;
- unsigned long full_data;
- unsigned long len, memp = 0;
- unsigned long memp_size, extra, rlim;
- unsigned long *reloc = 0, *rp;
+ struct flat_hdr *hdr;
+ unsigned long textpos, datapos, realdatastart;
+ unsigned long text_len, data_len, bss_len, stack_len, full_data, flags;
+ unsigned long len, memp, memp_size, extra, rlim;
+ unsigned long __user *reloc, *rp;
struct inode *inode;
- int i, rev, relocs = 0;
+ int i, rev, relocs;
loff_t fpos;
unsigned long start_code, end_code;
+ ssize_t result;
int ret;
hdr = ((struct flat_hdr *) bprm->buf); /* exec-header */
@@ -469,20 +460,30 @@ static int load_flat_file(struct linux_binprm * bprm,
}
if (flags & FLAT_FLAG_KTRACE)
- printk("BINFMT_FLAT: Loading file: %s\n", bprm->filename);
+ pr_info("Loading file: %s\n", bprm->filename);
if (rev != FLAT_VERSION && rev != OLD_FLAT_VERSION) {
- printk("BINFMT_FLAT: bad flat file version 0x%x (supported "
- "0x%lx and 0x%lx)\n",
- rev, FLAT_VERSION, OLD_FLAT_VERSION);
+ pr_err("bad flat file version 0x%x (supported 0x%lx and 0x%lx)\n",
+ rev, FLAT_VERSION, OLD_FLAT_VERSION);
ret = -ENOEXEC;
goto err;
}
-
+
/* Don't allow old format executables to use shared libraries */
if (rev == OLD_FLAT_VERSION && id != 0) {
- printk("BINFMT_FLAT: shared libraries are not available before rev 0x%x\n",
- (int) FLAT_VERSION);
+ pr_err("shared libraries are not available before rev 0x%lx\n",
+ FLAT_VERSION);
+ ret = -ENOEXEC;
+ goto err;
+ }
+
+ /*
+ * Make sure the header params are sane.
+ * 28 bits (256 MB) is way more than reasonable in this case.
+ * If some top bits are set we have probable binary corruption.
+ */
+ if ((text_len | data_len | bss_len | stack_len | full_data) >> 28) {
+ pr_err("bad header\n");
ret = -ENOEXEC;
goto err;
}
@@ -496,7 +497,7 @@ static int load_flat_file(struct linux_binprm * bprm,
#ifndef CONFIG_BINFMT_ZFLAT
if (flags & (FLAT_FLAG_GZIP|FLAT_FLAG_GZDATA)) {
- printk("Support for ZFLAT executables is not enabled.\n");
+ pr_err("Support for ZFLAT executables is not enabled.\n");
ret = -ENOEXEC;
goto err;
}
@@ -517,11 +518,9 @@ static int load_flat_file(struct linux_binprm * bprm,
/* Flush all traces of the currently running executable */
if (id == 0) {
- result = flush_old_exec(bprm);
- if (result) {
- ret = result;
+ ret = flush_old_exec(bprm);
+ if (ret)
goto err;
- }
/* OK, This is the point of no return */
set_personality(PER_LINUX_32BIT);
@@ -539,48 +538,48 @@ static int load_flat_file(struct linux_binprm * bprm,
* case, and then the fully copied to RAM case which lumps
* it all together.
*/
- if ((flags & (FLAT_FLAG_RAM|FLAT_FLAG_GZIP)) == 0) {
+ if (!IS_ENABLED(CONFIG_MMU) && !(flags & (FLAT_FLAG_RAM|FLAT_FLAG_GZIP))) {
/*
* this should give us a ROM ptr, but if it doesn't we don't
* really care
*/
- DBG_FLT("BINFMT_FLAT: ROM mapping of file (we hope)\n");
+ pr_debug("ROM mapping of file (we hope)\n");
textpos = vm_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC,
MAP_PRIVATE|MAP_EXECUTABLE, 0);
if (!textpos || IS_ERR_VALUE(textpos)) {
- if (!textpos)
- textpos = (unsigned long) -ENOMEM;
- printk("Unable to mmap process text, errno %d\n", (int)-textpos);
ret = textpos;
+ if (!textpos)
+ ret = -ENOMEM;
+ pr_err("Unable to mmap process text, errno %d\n", ret);
goto err;
}
len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
len = PAGE_ALIGN(len);
- realdatastart = vm_mmap(0, 0, len,
+ realdatastart = vm_mmap(NULL, 0, len,
PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
if (realdatastart == 0 || IS_ERR_VALUE(realdatastart)) {
+ ret = realdatastart;
if (!realdatastart)
- realdatastart = (unsigned long) -ENOMEM;
- printk("Unable to allocate RAM for process data, errno %d\n",
- (int)-realdatastart);
+ ret = -ENOMEM;
+ pr_err("Unable to allocate RAM for process data, "
+ "errno %d\n", ret);
vm_munmap(textpos, text_len);
- ret = realdatastart;
goto err;
}
datapos = ALIGN(realdatastart +
MAX_SHARED_LIBS * sizeof(unsigned long),
FLAT_DATA_ALIGN);
- DBG_FLT("BINFMT_FLAT: Allocated data+bss+stack (%d bytes): %x\n",
- (int)(data_len + bss_len + stack_len), (int)datapos);
+ pr_debug("Allocated data+bss+stack (%ld bytes): %lx\n",
+ data_len + bss_len + stack_len, datapos);
fpos = ntohl(hdr->data_start);
#ifdef CONFIG_BINFMT_ZFLAT
if (flags & FLAT_FLAG_GZDATA) {
- result = decompress_exec(bprm, fpos, (char *) datapos,
+ result = decompress_exec(bprm, fpos, (char *)datapos,
full_data, 0);
} else
#endif
@@ -589,29 +588,30 @@ static int load_flat_file(struct linux_binprm * bprm,
full_data);
}
if (IS_ERR_VALUE(result)) {
- printk("Unable to read data+bss, errno %d\n", (int)-result);
+ ret = result;
+ pr_err("Unable to read data+bss, errno %d\n", ret);
vm_munmap(textpos, text_len);
vm_munmap(realdatastart, len);
- ret = result;
goto err;
}
- reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len));
+ reloc = (unsigned long __user *)
+ (datapos + (ntohl(hdr->reloc_start) - text_len));
memp = realdatastart;
memp_size = len;
} else {
len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
len = PAGE_ALIGN(len);
- textpos = vm_mmap(0, 0, len,
+ textpos = vm_mmap(NULL, 0, len,
PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
if (!textpos || IS_ERR_VALUE(textpos)) {
- if (!textpos)
- textpos = (unsigned long) -ENOMEM;
- printk("Unable to allocate RAM for process text/data, errno %d\n",
- (int)-textpos);
ret = textpos;
+ if (!textpos)
+ ret = -ENOMEM;
+ pr_err("Unable to allocate RAM for process text/data, "
+ "errno %d\n", ret);
goto err;
}
@@ -620,7 +620,7 @@ static int load_flat_file(struct linux_binprm * bprm,
MAX_SHARED_LIBS * sizeof(unsigned long),
FLAT_DATA_ALIGN);
- reloc = (unsigned long *)
+ reloc = (unsigned long __user *)
(datapos + (ntohl(hdr->reloc_start) - text_len));
memp = textpos;
memp_size = len;
@@ -629,21 +629,59 @@ static int load_flat_file(struct linux_binprm * bprm,
* load it all in and treat it like a RAM load from now on
*/
if (flags & FLAT_FLAG_GZIP) {
- result = decompress_exec(bprm, sizeof (struct flat_hdr),
- (((char *) textpos) + sizeof (struct flat_hdr)),
+#ifndef CONFIG_MMU
+ result = decompress_exec(bprm, sizeof(struct flat_hdr),
+ (((char *)textpos) + sizeof(struct flat_hdr)),
(text_len + full_data
- - sizeof (struct flat_hdr)),
+ - sizeof(struct flat_hdr)),
0);
memmove((void *) datapos, (void *) realdatastart,
full_data);
+#else
+ /*
+ * This is used on MMU systems mainly for testing.
+ * Let's use a kernel buffer to simplify things.
+ */
+ long unz_text_len = text_len - sizeof(struct flat_hdr);
+ long unz_len = unz_text_len + full_data;
+ char *unz_data = vmalloc(unz_len);
+ if (!unz_data) {
+ result = -ENOMEM;
+ } else {
+ result = decompress_exec(bprm, sizeof(struct flat_hdr),
+ unz_data, unz_len, 0);
+ if (result == 0 &&
+ (copy_to_user((void __user *)textpos + sizeof(struct flat_hdr),
+ unz_data, unz_text_len) ||
+ copy_to_user((void __user *)datapos,
+ unz_data + unz_text_len, full_data)))
+ result = -EFAULT;
+ vfree(unz_data);
+ }
+#endif
} else if (flags & FLAT_FLAG_GZDATA) {
result = read_code(bprm->file, textpos, 0, text_len);
- if (!IS_ERR_VALUE(result))
+ if (!IS_ERR_VALUE(result)) {
+#ifndef CONFIG_MMU
result = decompress_exec(bprm, text_len, (char *) datapos,
full_data, 0);
- }
- else
+#else
+ char *unz_data = vmalloc(full_data);
+ if (!unz_data) {
+ result = -ENOMEM;
+ } else {
+ result = decompress_exec(bprm, text_len,
+ unz_data, full_data, 0);
+ if (result == 0 &&
+ copy_to_user((void __user *)datapos,
+ unz_data, full_data))
+ result = -EFAULT;
+ vfree(unz_data);
+ }
#endif
+ }
+ } else
+#endif /* CONFIG_BINFMT_ZFLAT */
{
result = read_code(bprm->file, textpos, 0, text_len);
if (!IS_ERR_VALUE(result))
@@ -652,21 +690,19 @@ static int load_flat_file(struct linux_binprm * bprm,
full_data);
}
if (IS_ERR_VALUE(result)) {
- printk("Unable to read code+data+bss, errno %d\n",(int)-result);
+ ret = result;
+ pr_err("Unable to read code+data+bss, errno %d\n", ret);
vm_munmap(textpos, text_len + data_len + extra +
MAX_SHARED_LIBS * sizeof(unsigned long));
- ret = result;
goto err;
}
}
- if (flags & FLAT_FLAG_KTRACE)
- printk("Mapping is %x, Entry point is %x, data_start is %x\n",
- (int)textpos, 0x00ffffff&ntohl(hdr->entry), ntohl(hdr->data_start));
+ start_code = textpos + sizeof(struct flat_hdr);
+ end_code = textpos + text_len;
+ text_len -= sizeof(struct flat_hdr); /* the real code len */
/* The main program needs a little extra setup in the task structure */
- start_code = textpos + sizeof (struct flat_hdr);
- end_code = textpos + text_len;
if (id == 0) {
current->mm->start_code = start_code;
current->mm->end_code = end_code;
@@ -681,19 +717,19 @@ static int load_flat_file(struct linux_binprm * bprm,
*/
current->mm->start_brk = datapos + data_len + bss_len;
current->mm->brk = (current->mm->start_brk + 3) & ~3;
+#ifndef CONFIG_MMU
current->mm->context.end_brk = memp + memp_size - stack_len;
+#endif
}
- if (flags & FLAT_FLAG_KTRACE)
- printk("%s %s: TEXT=%x-%x DATA=%x-%x BSS=%x-%x\n",
+ if (flags & FLAT_FLAG_KTRACE) {
+ pr_info("Mapping is %lx, Entry point is %x, data_start is %x\n",
+ textpos, 0x00ffffff&ntohl(hdr->entry), ntohl(hdr->data_start));
+ pr_info("%s %s: TEXT=%lx-%lx DATA=%lx-%lx BSS=%lx-%lx\n",
id ? "Lib" : "Load", bprm->filename,
- (int) start_code, (int) end_code,
- (int) datapos,
- (int) (datapos + data_len),
- (int) (datapos + data_len),
- (int) (((datapos + data_len + bss_len) + 3) & ~3));
-
- text_len -= sizeof(struct flat_hdr); /* the real code len */
+ start_code, end_code, datapos, datapos + data_len,
+ datapos + data_len, (datapos + data_len + bss_len + 3) & ~3);
+ }
/* Store the current module values into the global library structure */
libinfo->lib_list[id].start_code = start_code;
@@ -703,7 +739,7 @@ static int load_flat_file(struct linux_binprm * bprm,
libinfo->lib_list[id].loaded = 1;
libinfo->lib_list[id].entry = (0x00ffffff & ntohl(hdr->entry)) + textpos;
libinfo->lib_list[id].build_date = ntohl(hdr->build_date);
-
+
/*
* We just load the allocations into some temporary memory to
* help simplify all this mumbo jumbo
@@ -717,15 +753,20 @@ static int load_flat_file(struct linux_binprm * bprm,
* image.
*/
if (flags & FLAT_FLAG_GOTPIC) {
- for (rp = (unsigned long *)datapos; *rp != 0xffffffff; rp++) {
- unsigned long addr;
- if (*rp) {
- addr = calc_reloc(*rp, libinfo, id, 0);
+ for (rp = (unsigned long __user *)datapos; ; rp++) {
+ unsigned long addr, rp_val;
+ if (get_user(rp_val, rp))
+ return -EFAULT;
+ if (rp_val == 0xffffffff)
+ break;
+ if (rp_val) {
+ addr = calc_reloc(rp_val, libinfo, id, 0);
if (addr == RELOC_FAILED) {
ret = -ENOEXEC;
goto err;
}
- *rp = addr;
+ if (put_user(addr, rp))
+ return -EFAULT;
}
}
}
@@ -742,19 +783,23 @@ static int load_flat_file(struct linux_binprm * bprm,
* __start to address 4 so that is okay).
*/
if (rev > OLD_FLAT_VERSION) {
- unsigned long persistent = 0;
- for (i=0; i < relocs; i++) {
+ unsigned long __maybe_unused persistent = 0;
+ for (i = 0; i < relocs; i++) {
unsigned long addr, relval;
- /* Get the address of the pointer to be
- relocated (of course, the address has to be
- relocated first). */
- relval = ntohl(reloc[i]);
- if (flat_set_persistent (relval, &persistent))
+ /*
+ * Get the address of the pointer to be
+ * relocated (of course, the address has to be
+ * relocated first).
+ */
+ if (get_user(relval, reloc + i))
+ return -EFAULT;
+ relval = ntohl(relval);
+ if (flat_set_persistent(relval, &persistent))
continue;
addr = flat_get_relocate_addr(relval);
- rp = (unsigned long *) calc_reloc(addr, libinfo, id, 1);
- if (rp == (unsigned long *)RELOC_FAILED) {
+ rp = (unsigned long __user *)calc_reloc(addr, libinfo, id, 1);
+ if (rp == (unsigned long __user *)RELOC_FAILED) {
ret = -ENOEXEC;
goto err;
}
@@ -780,17 +825,23 @@ static int load_flat_file(struct linux_binprm * bprm,
}
}
} else {
- for (i=0; i < relocs; i++)
- old_reloc(ntohl(reloc[i]));
+ for (i = 0; i < relocs; i++) {
+ unsigned long relval;
+ if (get_user(relval, reloc + i))
+ return -EFAULT;
+ relval = ntohl(relval);
+ old_reloc(relval);
+ }
}
-
+
flush_icache_range(start_code, end_code);
/* zero the BSS, BRK and stack areas */
- memset((void*)(datapos + data_len), 0, bss_len +
- (memp + memp_size - stack_len - /* end brk */
- libinfo->lib_list[id].start_brk) + /* start brk */
- stack_len);
+ if (clear_user((void __user *)(datapos + data_len), bss_len +
+ (memp + memp_size - stack_len - /* end brk */
+ libinfo->lib_list[id].start_brk) + /* start brk */
+ stack_len))
+ return -EFAULT;
return 0;
err:
@@ -846,7 +897,7 @@ out:
allow_write_access(bprm.file);
fput(bprm.file);
- return(res);
+ return res;
}
#endif /* CONFIG_BINFMT_SHARED_FLAT */
@@ -857,18 +908,17 @@ out:
* libraries. There is no binary dependent code anywhere else.
*/
-static int load_flat_binary(struct linux_binprm * bprm)
+static int load_flat_binary(struct linux_binprm *bprm)
{
struct lib_info libinfo;
struct pt_regs *regs = current_pt_regs();
- unsigned long p = bprm->p;
- unsigned long stack_len;
+ unsigned long stack_len = 0;
unsigned long start_addr;
- unsigned long *sp;
int res;
int i, j;
memset(&libinfo, 0, sizeof(libinfo));
+
/*
* We have to add the size of our arguments to our stack size
* otherwise it's too easy for users to create stack overflows
@@ -876,38 +926,54 @@ static int load_flat_binary(struct linux_binprm * bprm)
* pedantic and include space for the argv/envp array as it may have
* a lot of entries.
*/
-#define TOP_OF_ARGS (PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *))
- stack_len = TOP_OF_ARGS - bprm->p; /* the strings */
- stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */
- stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */
- stack_len += FLAT_STACK_ALIGN - 1; /* reserve for upcoming alignment */
-
+#ifndef CONFIG_MMU
+ stack_len += PAGE_SIZE * MAX_ARG_PAGES - bprm->p; /* the strings */
+#endif
+ stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */
+ stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */
+ stack_len = ALIGN(stack_len, FLAT_STACK_ALIGN);
+
res = load_flat_file(bprm, &libinfo, 0, &stack_len);
if (res < 0)
return res;
-
+
/* Update data segment pointers for all libraries */
- for (i=0; i<MAX_SHARED_LIBS; i++)
- if (libinfo.lib_list[i].loaded)
- for (j=0; j<MAX_SHARED_LIBS; j++)
- (-(j+1))[(unsigned long *)(libinfo.lib_list[i].start_data)] =
- (libinfo.lib_list[j].loaded)?
- libinfo.lib_list[j].start_data:UNLOADED_LIB;
+ for (i = 0; i < MAX_SHARED_LIBS; i++) {
+ if (!libinfo.lib_list[i].loaded)
+ continue;
+ for (j = 0; j < MAX_SHARED_LIBS; j++) {
+ unsigned long val = libinfo.lib_list[j].loaded ?
+ libinfo.lib_list[j].start_data : UNLOADED_LIB;
+ unsigned long __user *p = (unsigned long __user *)
+ libinfo.lib_list[i].start_data;
+ p -= j + 1;
+ if (put_user(val, p))
+ return -EFAULT;
+ }
+ }
install_exec_creds(bprm);
set_binfmt(&flat_format);
- p = ((current->mm->context.end_brk + stack_len + 3) & ~3) - 4;
- DBG_FLT("p=%x\n", (int)p);
+#ifdef CONFIG_MMU
+ res = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
+ if (!res)
+ res = create_flat_tables(bprm, bprm->p);
+#else
+ /* Stash our initial stack pointer into the mm structure */
+ current->mm->start_stack =
+ ((current->mm->context.end_brk + stack_len + 3) & ~3) - 4;
+ pr_debug("sp=%lx\n", current->mm->start_stack);
- /* copy the arg pages onto the stack, this could be more efficient :-) */
- for (i = TOP_OF_ARGS - 1; i >= bprm->p; i--)
- * (char *) --p =
- ((char *) page_address(bprm->page[i/PAGE_SIZE]))[i % PAGE_SIZE];
+ /* copy the arg pages onto the stack */
+ res = transfer_args_to_stack(bprm, &current->mm->start_stack);
+ if (!res)
+ res = create_flat_tables(bprm, current->mm->start_stack);
+#endif
+ if (res)
+ return res;
- sp = (unsigned long *) create_flat_tables(p, bprm);
-
/* Fake some return addresses to ensure the call chain will
* initialise library in order for us. We are required to call
* lib 1 first, then 2, ... and finally the main program (id 0).
@@ -915,24 +981,24 @@ static int load_flat_binary(struct linux_binprm * bprm)
start_addr = libinfo.lib_list[0].entry;
#ifdef CONFIG_BINFMT_SHARED_FLAT
- for (i = MAX_SHARED_LIBS-1; i>0; i--) {
+ for (i = MAX_SHARED_LIBS-1; i > 0; i--) {
if (libinfo.lib_list[i].loaded) {
/* Push previos first to call address */
- --sp; put_user(start_addr, sp);
+ unsigned long __user *sp;
+ current->mm->start_stack -= sizeof(unsigned long);
+ sp = (unsigned long __user *)current->mm->start_stack;
+ __put_user(start_addr, sp);
start_addr = libinfo.lib_list[i].entry;
}
}
#endif
-
- /* Stash our initial stack pointer into the mm structure */
- current->mm->start_stack = (unsigned long )sp;
#ifdef FLAT_PLAT_INIT
FLAT_PLAT_INIT(regs);
#endif
- DBG_FLT("start_thread(regs=0x%x, entry=0x%x, start_stack=0x%x)\n",
- (int)regs, (int)start_addr, (int)current->mm->start_stack);
-
+
+ pr_debug("start_thread(regs=0x%p, entry=0x%lx, start_stack=0x%lx)\n",
+ regs, start_addr, current->mm->start_stack);
start_thread(regs, start_addr, current->mm->start_stack);
return 0;
@@ -945,9 +1011,6 @@ static int __init init_flat_binfmt(void)
register_binfmt(&flat_format);
return 0;
}
-
-/****************************************************************************/
-
core_initcall(init_flat_binfmt);
/****************************************************************************/
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 3a3ced779fc7..5417516f6e59 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -637,13 +637,12 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
break;
case 3:
/* Delete this handler. */
- root = dget(file->f_path.dentry->d_sb->s_root);
+ root = file_inode(file)->i_sb->s_root;
inode_lock(d_inode(root));
kill_node(e);
inode_unlock(d_inode(root));
- dput(root);
break;
default:
return res;
@@ -665,8 +664,8 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
{
Node *e;
struct inode *inode;
- struct dentry *root, *dentry;
- struct super_block *sb = file->f_path.dentry->d_sb;
+ struct super_block *sb = file_inode(file)->i_sb;
+ struct dentry *root = sb->s_root, *dentry;
int err = 0;
e = create_entry(buffer, count);
@@ -674,7 +673,6 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
if (IS_ERR(e))
return PTR_ERR(e);
- root = dget(sb->s_root);
inode_lock(d_inode(root));
dentry = lookup_one_len(e->name, root, strlen(e->name));
err = PTR_ERR(dentry);
@@ -712,7 +710,6 @@ out2:
dput(dentry);
out:
inode_unlock(d_inode(root));
- dput(root);
if (err) {
kfree(e);
@@ -753,14 +750,13 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer,
break;
case 3:
/* Delete all handlers. */
- root = dget(file->f_path.dentry->d_sb->s_root);
+ root = file_inode(file)->i_sb->s_root;
inode_lock(d_inode(root));
while (!list_empty(&entries))
kill_node(list_entry(entries.next, Node, list));
inode_unlock(d_inode(root));
- dput(root);
break;
default:
return res;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index d012be4ab977..d402899ba135 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -416,7 +416,8 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
result = blk_queue_enter(bdev->bd_queue, false);
if (result)
return result;
- result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
+ result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
+ REQ_OP_READ);
blk_queue_exit(bdev->bd_queue);
return result;
}
@@ -445,7 +446,6 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
struct page *page, struct writeback_control *wbc)
{
int result;
- int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE;
const struct block_device_operations *ops = bdev->bd_disk->fops;
if (!ops->rw_page || bdev_get_integrity(bdev))
@@ -455,7 +455,8 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
return result;
set_page_writeback(page);
- result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw);
+ result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
+ REQ_OP_WRITE);
if (result)
end_page_writeback(page);
else
@@ -614,7 +615,6 @@ static void init_once(void *foo)
memset(bdev, 0, sizeof(*bdev));
mutex_init(&bdev->bd_mutex);
- INIT_LIST_HEAD(&bdev->bd_inodes);
INIT_LIST_HEAD(&bdev->bd_list);
#ifdef CONFIG_SYSFS
INIT_LIST_HEAD(&bdev->bd_holder_disks);
@@ -624,24 +624,13 @@ static void init_once(void *foo)
mutex_init(&bdev->bd_fsfreeze_mutex);
}
-static inline void __bd_forget(struct inode *inode)
-{
- list_del_init(&inode->i_devices);
- inode->i_bdev = NULL;
- inode->i_mapping = &inode->i_data;
-}
-
static void bdev_evict_inode(struct inode *inode)
{
struct block_device *bdev = &BDEV_I(inode)->bdev;
- struct list_head *p;
truncate_inode_pages_final(&inode->i_data);
invalidate_inode_buffers(inode); /* is it needed here? */
clear_inode(inode);
spin_lock(&bdev_lock);
- while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
- __bd_forget(list_entry(p, struct inode, i_devices));
- }
list_del_init(&bdev->bd_list);
spin_unlock(&bdev_lock);
}
@@ -805,7 +794,6 @@ static struct block_device *bd_acquire(struct inode *inode)
bdgrab(bdev);
inode->i_bdev = bdev;
inode->i_mapping = bdev->bd_inode->i_mapping;
- list_add(&inode->i_devices, &bdev->bd_inodes);
}
spin_unlock(&bdev_lock);
}
@@ -821,7 +809,8 @@ void bd_forget(struct inode *inode)
spin_lock(&bdev_lock);
if (!sb_is_blkdev_sb(inode->i_sb))
bdev = inode->i_bdev;
- __bd_forget(inode);
+ inode->i_bdev = NULL;
+ inode->i_mapping = &inode->i_data;
spin_unlock(&bdev_lock);
if (bdev)
@@ -1287,11 +1276,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_disk = disk;
bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
- if (IS_ENABLED(CONFIG_BLK_DEV_DAX) &&
- blk_queue_dax(disk->queue))
- bdev->bd_inode->i_flags = S_DAX;
- else
- bdev->bd_inode->i_flags = 0;
+ bdev->bd_inode->i_flags = 0;
if (!partno) {
ret = -ENXIO;
@@ -1858,7 +1843,7 @@ struct block_device *lookup_bdev(const char *pathname)
if (!S_ISBLK(inode->i_mode))
goto fail;
error = -EACCES;
- if (path.mnt->mnt_flags & MNT_NODEV)
+ if (!may_open_dev(&path))
goto fail;
error = -ENOMEM;
bdev = bd_acquire(inode);
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 67a607709d4f..53bb7af4e5f0 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -55,8 +55,7 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
}
if (size > 0) {
acl = posix_acl_from_xattr(&init_user_ns, value, size);
- } else if (size == -ENOENT || size == -ENODATA || size == 0) {
- /* FIXME, who returns -ENOENT? I think nobody */
+ } else if (size == -ERANGE || size == -ENODATA || size == 0) {
acl = NULL;
} else {
acl = ERR_PTR(-EIO);
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 5fb60ea7eee2..e0f071f6b5a7 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -34,6 +34,10 @@
struct __btrfs_workqueue {
struct workqueue_struct *normal_wq;
+
+ /* File system this workqueue services */
+ struct btrfs_fs_info *fs_info;
+
/* List head pointing to ordered work list */
struct list_head ordered_list;
@@ -70,6 +74,18 @@ void btrfs_##name(struct work_struct *arg) \
normal_work_helper(work); \
}
+struct btrfs_fs_info *
+btrfs_workqueue_owner(struct __btrfs_workqueue *wq)
+{
+ return wq->fs_info;
+}
+
+struct btrfs_fs_info *
+btrfs_work_owner(struct btrfs_work *work)
+{
+ return work->wq->fs_info;
+}
+
BTRFS_WORK_HELPER(worker_helper);
BTRFS_WORK_HELPER(delalloc_helper);
BTRFS_WORK_HELPER(flush_delalloc_helper);
@@ -94,14 +110,15 @@ BTRFS_WORK_HELPER(scrubnc_helper);
BTRFS_WORK_HELPER(scrubparity_helper);
static struct __btrfs_workqueue *
-__btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active,
- int thresh)
+__btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
+ unsigned int flags, int limit_active, int thresh)
{
struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
if (!ret)
return NULL;
+ ret->fs_info = fs_info;
ret->limit_active = limit_active;
atomic_set(&ret->pending, 0);
if (thresh == 0)
@@ -143,7 +160,8 @@ __btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active,
static inline void
__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
-struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
+struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
+ const char *name,
unsigned int flags,
int limit_active,
int thresh)
@@ -153,7 +171,8 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
if (!ret)
return NULL;
- ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI,
+ ret->normal = __btrfs_alloc_workqueue(fs_info, name,
+ flags & ~WQ_HIGHPRI,
limit_active, thresh);
if (!ret->normal) {
kfree(ret);
@@ -161,8 +180,8 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
}
if (flags & WQ_HIGHPRI) {
- ret->high = __btrfs_alloc_workqueue(name, flags, limit_active,
- thresh);
+ ret->high = __btrfs_alloc_workqueue(fs_info, name, flags,
+ limit_active, thresh);
if (!ret->high) {
__btrfs_destroy_workqueue(ret->normal);
kfree(ret);
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index ad4d0647d1a6..8e52484cd461 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -21,6 +21,7 @@
#define __BTRFS_ASYNC_THREAD_
#include <linux/workqueue.h>
+struct btrfs_fs_info;
struct btrfs_workqueue;
/* Internal use only */
struct __btrfs_workqueue;
@@ -67,7 +68,8 @@ BTRFS_WORK_HELPER_PROTO(scrubnc_helper);
BTRFS_WORK_HELPER_PROTO(scrubparity_helper);
-struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
+struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
+ const char *name,
unsigned int flags,
int limit_active,
int thresh);
@@ -80,4 +82,6 @@ void btrfs_queue_work(struct btrfs_workqueue *wq,
void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
void btrfs_set_work_high_priority(struct btrfs_work *work);
+struct btrfs_fs_info *btrfs_work_owner(struct btrfs_work *work);
+struct btrfs_fs_info *btrfs_workqueue_owner(struct __btrfs_workqueue *wq);
#endif
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 8bb3509099e8..2b88439c2ee8 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -139,7 +139,7 @@ int __init btrfs_prelim_ref_init(void)
btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref",
sizeof(struct __prelim_ref),
0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ SLAB_MEM_SPREAD,
NULL);
if (!btrfs_prelim_ref_cache)
return -ENOMEM;
@@ -361,7 +361,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
goto out;
}
- if (btrfs_test_is_dummy_root(root)) {
+ if (btrfs_is_testing(fs_info)) {
srcu_read_unlock(&fs_info->subvol_srcu, index);
ret = -ENOENT;
goto out;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index cefedabf0a92..029db6e1105c 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -403,7 +403,10 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
}
ret = btrfs_map_bio(root, bio, 0, 1);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ bio->bi_error = ret;
+ bio_endio(bio);
+ }
bio_put(bio);
@@ -434,7 +437,10 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
}
ret = btrfs_map_bio(root, bio, 0, 1);
- BUG_ON(ret); /* -ENOMEM */
+ if (ret) {
+ bio->bi_error = ret;
+ bio_endio(bio);
+ }
bio_put(bio);
return 0;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index a85cf7d23309..d1c56c94dd5a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1153,14 +1153,14 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
ret = btrfs_reloc_cow_block(trans, root, buf, cow);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
}
@@ -1198,7 +1198,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
if (last_ref) {
ret = tree_mod_log_free_eb(root->fs_info, buf);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
}
@@ -1505,7 +1505,7 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf)
{
- if (btrfs_test_is_dummy_root(root))
+ if (btrfs_is_testing(root->fs_info))
return 0;
/* ensure we can see the force_cow */
@@ -1771,6 +1771,14 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
unsigned long map_len = 0;
int err;
+ if (low > high) {
+ btrfs_err(eb->fs_info,
+ "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
+ __func__, low, high, eb->start,
+ btrfs_header_owner(eb), btrfs_header_level(eb));
+ return -EINVAL;
+ }
+
while (low < high) {
mid = (low + high) / 2;
offset = p + mid * item_size;
@@ -1858,7 +1866,6 @@ static void root_sub_used(struct btrfs_root *root, u32 size)
/* given a node and slot number, this reads the blocks it points to. The
* extent buffer is returned with a reference taken (but unlocked).
- * NULL is returned on error.
*/
static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
struct extent_buffer *parent, int slot)
@@ -1866,19 +1873,16 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
int level = btrfs_header_level(parent);
struct extent_buffer *eb;
- if (slot < 0)
- return NULL;
- if (slot >= btrfs_header_nritems(parent))
- return NULL;
+ if (slot < 0 || slot >= btrfs_header_nritems(parent))
+ return ERR_PTR(-ENOENT);
BUG_ON(level == 0);
eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
btrfs_node_ptr_generation(parent, slot));
- if (IS_ERR(eb) || !extent_buffer_uptodate(eb)) {
- if (!IS_ERR(eb))
- free_extent_buffer(eb);
- eb = NULL;
+ if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ eb = ERR_PTR(-EIO);
}
return eb;
@@ -1931,8 +1935,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
/* promote the child to a root */
child = read_node_slot(root, mid, 0);
- if (!child) {
- ret = -EROFS;
+ if (IS_ERR(child)) {
+ ret = PTR_ERR(child);
btrfs_handle_fs_error(root->fs_info, ret, NULL);
goto enospc;
}
@@ -1970,6 +1974,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
return 0;
left = read_node_slot(root, parent, pslot - 1);
+ if (IS_ERR(left))
+ left = NULL;
+
if (left) {
btrfs_tree_lock(left);
btrfs_set_lock_blocking(left);
@@ -1980,7 +1987,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto enospc;
}
}
+
right = read_node_slot(root, parent, pslot + 1);
+ if (IS_ERR(right))
+ right = NULL;
+
if (right) {
btrfs_tree_lock(right);
btrfs_set_lock_blocking(right);
@@ -2135,6 +2146,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
return 1;
left = read_node_slot(root, parent, pslot - 1);
+ if (IS_ERR(left))
+ left = NULL;
/* first, try to make some room in the middle buffer */
if (left) {
@@ -2185,6 +2198,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
free_extent_buffer(left);
}
right = read_node_slot(root, parent, pslot + 1);
+ if (IS_ERR(right))
+ right = NULL;
/*
* then try to empty the right most buffer into the middle
@@ -3240,7 +3255,7 @@ static int push_node_left(struct btrfs_trans_handle *trans,
ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
push_items);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
copy_extent_buffer(dst, src,
@@ -3315,7 +3330,7 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
src_nritems - push_items, push_items);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
copy_extent_buffer(dst, src,
@@ -3519,7 +3534,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0,
mid, c_nritems - mid);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
copy_extent_buffer(split, c,
@@ -3773,7 +3788,11 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_assert_tree_locked(path->nodes[1]);
right = read_node_slot(root, upper, slot + 1);
- if (right == NULL)
+ /*
+ * slot + 1 is not valid or we fail to read the right node,
+ * no big deal, just return.
+ */
+ if (IS_ERR(right))
return 1;
btrfs_tree_lock(right);
@@ -4003,7 +4022,11 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
btrfs_assert_tree_locked(path->nodes[1]);
left = read_node_slot(root, path->nodes[1], slot - 1);
- if (left == NULL)
+ /*
+ * slot - 1 is not valid or we fail to read the left node,
+ * no big deal, just return.
+ */
+ if (IS_ERR(left))
return 1;
btrfs_tree_lock(left);
@@ -5210,7 +5233,10 @@ find_next_key:
}
btrfs_set_path_blocking(path);
cur = read_node_slot(root, cur, slot);
- BUG_ON(!cur); /* -ENOMEM */
+ if (IS_ERR(cur)) {
+ ret = PTR_ERR(cur);
+ goto out;
+ }
btrfs_tree_read_lock(cur);
@@ -5229,15 +5255,21 @@ out:
return ret;
}
-static void tree_move_down(struct btrfs_root *root,
+static int tree_move_down(struct btrfs_root *root,
struct btrfs_path *path,
int *level, int root_level)
{
+ struct extent_buffer *eb;
+
BUG_ON(*level == 0);
- path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
- path->slots[*level]);
+ eb = read_node_slot(root, path->nodes[*level], path->slots[*level]);
+ if (IS_ERR(eb))
+ return PTR_ERR(eb);
+
+ path->nodes[*level - 1] = eb;
path->slots[*level - 1] = 0;
(*level)--;
+ return 0;
}
static int tree_move_next_or_upnext(struct btrfs_root *root,
@@ -5282,8 +5314,7 @@ static int tree_advance(struct btrfs_root *root,
if (*level == 0 || !allow_down) {
ret = tree_move_next_or_upnext(root, path, level, root_level);
} else {
- tree_move_down(root, path, level, root_level);
- ret = 0;
+ ret = tree_move_down(root, path, level, root_level);
}
if (ret >= 0) {
if (*level == 0)
@@ -5457,8 +5488,10 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
left_root_level,
advance_left != ADVANCE_ONLY_NEXT,
&left_key);
- if (ret < 0)
+ if (ret == -1)
left_end_reached = ADVANCE;
+ else if (ret < 0)
+ goto out;
advance_left = 0;
}
if (advance_right && !right_end_reached) {
@@ -5466,8 +5499,10 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
right_root_level,
advance_right != ADVANCE_ONLY_NEXT,
&right_key);
- if (ret < 0)
+ if (ret == -1)
right_end_reached = ADVANCE;
+ else if (ret < 0)
+ goto out;
advance_right = 0;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b2620d1f883f..2fe8f89091a3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -117,6 +117,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
#define BTRFS_FS_STATE_REMOUNTING 1
#define BTRFS_FS_STATE_TRANS_ABORTED 2
#define BTRFS_FS_STATE_DEV_REPLACING 3
+#define BTRFS_FS_STATE_DUMMY_FS_INFO 4
#define BTRFS_BACKREF_REV_MAX 256
#define BTRFS_BACKREF_REV_SHIFT 56
@@ -144,21 +145,6 @@ struct btrfs_header {
u8 level;
} __attribute__ ((__packed__));
-#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \
- sizeof(struct btrfs_header)) / \
- sizeof(struct btrfs_key_ptr))
-#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header))
-#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->nodesize))
-#define BTRFS_FILE_EXTENT_INLINE_DATA_START \
- (offsetof(struct btrfs_file_extent_item, disk_bytenr))
-#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
- sizeof(struct btrfs_item) - \
- BTRFS_FILE_EXTENT_INLINE_DATA_START)
-#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \
- sizeof(struct btrfs_item) -\
- sizeof(struct btrfs_dir_item))
-
-
/*
* this is a very generous portion of the super block, giving us
* room to translate 14 chunks with 3 stripes each.
@@ -439,6 +425,8 @@ struct btrfs_space_info {
struct list_head list;
/* Protected by the spinlock 'lock'. */
struct list_head ro_bgs;
+ struct list_head priority_tickets;
+ struct list_head tickets;
struct rw_semaphore groups_sem;
/* for block groups in our same type */
@@ -1112,12 +1100,11 @@ struct btrfs_subvolume_writers {
#define BTRFS_ROOT_REF_COWS 1
#define BTRFS_ROOT_TRACK_DIRTY 2
#define BTRFS_ROOT_IN_RADIX 3
-#define BTRFS_ROOT_DUMMY_ROOT 4
-#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 5
-#define BTRFS_ROOT_DEFRAG_RUNNING 6
-#define BTRFS_ROOT_FORCE_COW 7
-#define BTRFS_ROOT_MULTI_LOG_TASKS 8
-#define BTRFS_ROOT_DIRTY 9
+#define BTRFS_ROOT_ORPHAN_ITEM_INSERTED 4
+#define BTRFS_ROOT_DEFRAG_RUNNING 5
+#define BTRFS_ROOT_FORCE_COW 6
+#define BTRFS_ROOT_MULTI_LOG_TASKS 7
+#define BTRFS_ROOT_DIRTY 8
/*
* in ram representation of the tree. extent_root is used for all allocations
@@ -1179,8 +1166,10 @@ struct btrfs_root {
u64 highest_objectid;
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */
u64 alloc_bytenr;
+#endif
u64 defrag_trans_start;
struct btrfs_key defrag_progress;
@@ -1257,6 +1246,39 @@ struct btrfs_root {
atomic_t qgroup_meta_rsv;
};
+static inline u32 __BTRFS_LEAF_DATA_SIZE(u32 blocksize)
+{
+ return blocksize - sizeof(struct btrfs_header);
+}
+
+static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_root *root)
+{
+ return __BTRFS_LEAF_DATA_SIZE(root->nodesize);
+}
+
+static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_root *root)
+{
+ return BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
+}
+
+static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_root *root)
+{
+ return BTRFS_LEAF_DATA_SIZE(root) / sizeof(struct btrfs_key_ptr);
+}
+
+#define BTRFS_FILE_EXTENT_INLINE_DATA_START \
+ (offsetof(struct btrfs_file_extent_item, disk_bytenr))
+static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_root *root)
+{
+ return BTRFS_MAX_ITEM_SIZE(root) -
+ BTRFS_FILE_EXTENT_INLINE_DATA_START;
+}
+
+static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_root *root)
+{
+ return BTRFS_MAX_ITEM_SIZE(root) - sizeof(struct btrfs_dir_item);
+}
+
/*
* Flags for mount options.
*
@@ -1297,21 +1319,21 @@ struct btrfs_root {
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt)
-#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \
+#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \
BTRFS_MOUNT_##opt)
-#define btrfs_set_and_info(root, opt, fmt, args...) \
+#define btrfs_set_and_info(fs_info, opt, fmt, args...) \
{ \
- if (!btrfs_test_opt(root, opt)) \
- btrfs_info(root->fs_info, fmt, ##args); \
- btrfs_set_opt(root->fs_info->mount_opt, opt); \
+ if (!btrfs_test_opt(fs_info, opt)) \
+ btrfs_info(fs_info, fmt, ##args); \
+ btrfs_set_opt(fs_info->mount_opt, opt); \
}
-#define btrfs_clear_and_info(root, opt, fmt, args...) \
+#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \
{ \
- if (btrfs_test_opt(root, opt)) \
- btrfs_info(root->fs_info, fmt, ##args); \
- btrfs_clear_opt(root->fs_info->mount_opt, opt); \
+ if (btrfs_test_opt(fs_info, opt)) \
+ btrfs_info(fs_info, fmt, ##args); \
+ btrfs_clear_opt(fs_info->mount_opt, opt); \
}
#ifdef CONFIG_BTRFS_DEBUG
@@ -1319,9 +1341,9 @@ static inline int
btrfs_should_fragment_free_space(struct btrfs_root *root,
struct btrfs_block_group_cache *block_group)
{
- return (btrfs_test_opt(root, FRAGMENT_METADATA) &&
+ return (btrfs_test_opt(root->fs_info, FRAGMENT_METADATA) &&
block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
- (btrfs_test_opt(root, FRAGMENT_DATA) &&
+ (btrfs_test_opt(root->fs_info, FRAGMENT_DATA) &&
block_group->flags & BTRFS_BLOCK_GROUP_DATA);
}
#endif
@@ -2624,6 +2646,15 @@ enum btrfs_reserve_flush_enum {
BTRFS_RESERVE_FLUSH_ALL,
};
+enum btrfs_flush_state {
+ FLUSH_DELAYED_ITEMS_NR = 1,
+ FLUSH_DELAYED_ITEMS = 2,
+ FLUSH_DELALLOC = 3,
+ FLUSH_DELALLOC_WAIT = 4,
+ ALLOC_CHUNK = 5,
+ COMMIT_TRANS = 6,
+};
+
int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes);
void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
@@ -2661,8 +2692,8 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv, u64 min_reserved,
enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
- struct btrfs_block_rsv *dst_rsv,
- u64 num_bytes);
+ struct btrfs_block_rsv *dst_rsv, u64 num_bytes,
+ int update_size);
int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *dest, u64 num_bytes,
int min_factor);
@@ -2875,9 +2906,6 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
/* root-item.c */
-int btrfs_find_root_ref(struct btrfs_root *tree_root,
- struct btrfs_path *path,
- u64 root_id, u64 ref_id);
int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *tree_root,
u64 root_id, u64 ref_id, u64 dirid, u64 sequence,
@@ -3351,23 +3379,23 @@ const char *btrfs_decode_error(int errno);
__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, const char *function,
+ const char *function,
unsigned int line, int errno);
/*
* Call btrfs_abort_transaction as early as possible when an error condition is
* detected, that way the exact line number is reported.
*/
-#define btrfs_abort_transaction(trans, root, errno) \
+#define btrfs_abort_transaction(trans, errno) \
do { \
/* Report first abort since mount */ \
if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
- &((root)->fs_info->fs_state))) { \
+ &((trans)->fs_info->fs_state))) { \
WARN(1, KERN_DEBUG \
"BTRFS: Transaction aborted (error %d)\n", \
(errno)); \
} \
- __btrfs_abort_transaction((trans), (root), __func__, \
+ __btrfs_abort_transaction((trans), __func__, \
__LINE__, (errno)); \
} while (0)
@@ -3599,13 +3627,13 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
void btrfs_test_destroy_inode(struct inode *inode);
#endif
-static inline int btrfs_test_is_dummy_root(struct btrfs_root *root)
+static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
{
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state)))
+ if (unlikely(test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO,
+ &fs_info->fs_state)))
return 1;
#endif
return 0;
}
-
#endif
diff --git a/fs/btrfs/dedupe.h b/fs/btrfs/dedupe.h
new file mode 100644
index 000000000000..83ebfe28da9e
--- /dev/null
+++ b/fs/btrfs/dedupe.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (C) 2016 Fujitsu. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_DEDUPE__
+#define __BTRFS_DEDUPE__
+
+/* later in-band dedupe will expand this struct */
+struct btrfs_dedupe_hash;
+#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index d3aaabbfada0..3eeb9cd8cfa5 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -34,7 +34,7 @@ int __init btrfs_delayed_inode_init(void)
delayed_node_cache = kmem_cache_create("btrfs_delayed_node",
sizeof(struct btrfs_delayed_node),
0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ SLAB_MEM_SPREAD,
NULL);
if (!delayed_node_cache)
return -ENOMEM;
@@ -553,7 +553,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
dst_rsv = &root->fs_info->delayed_block_rsv;
num_bytes = btrfs_calc_trans_metadata_size(root, 1);
- ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
+ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
if (!ret) {
trace_btrfs_space_reservation(root->fs_info, "delayed_item",
item->key.objectid,
@@ -598,6 +598,29 @@ static int btrfs_delayed_inode_reserve_metadata(
num_bytes = btrfs_calc_trans_metadata_size(root, 1);
/*
+ * If our block_rsv is the delalloc block reserve then check and see if
+ * we have our extra reservation for updating the inode. If not fall
+ * through and try to reserve space quickly.
+ *
+ * We used to try and steal from the delalloc block rsv or the global
+ * reserve, but we'd steal a full reservation, which isn't kind. We are
+ * here through delalloc which means we've likely just cowed down close
+ * to the leaf that contains the inode, so we would steal less just
+ * doing the fallback inode update, so if we do end up having to steal
+ * from the global block rsv we hopefully only steal one or two blocks
+ * worth which is less likely to hurt us.
+ */
+ if (src_rsv && src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags))
+ release = true;
+ else
+ src_rsv = NULL;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ }
+
+ /*
* btrfs_dirty_inode will update the inode under btrfs_join_transaction
* which doesn't reserve space for speed. This is a problem since we
* still need to reserve space for this update, so try to reserve the
@@ -626,51 +649,10 @@ static int btrfs_delayed_inode_reserve_metadata(
num_bytes, 1);
}
return ret;
- } else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
- spin_lock(&BTRFS_I(inode)->lock);
- if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
- &BTRFS_I(inode)->runtime_flags)) {
- spin_unlock(&BTRFS_I(inode)->lock);
- release = true;
- goto migrate;
- }
- spin_unlock(&BTRFS_I(inode)->lock);
-
- /* Ok we didn't have space pre-reserved. This shouldn't happen
- * too often but it can happen if we do delalloc to an existing
- * inode which gets dirtied because of the time update, and then
- * isn't touched again until after the transaction commits and
- * then we try to write out the data. First try to be nice and
- * reserve something strictly for us. If not be a pain and try
- * to steal from the delalloc block rsv.
- */
- ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
- BTRFS_RESERVE_NO_FLUSH);
- if (!ret)
- goto out;
-
- ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
- if (!ret)
- goto out;
-
- if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
- btrfs_debug(root->fs_info,
- "block rsv migrate returned %d", ret);
- WARN_ON(1);
- }
- /*
- * Ok this is a problem, let's just steal from the global rsv
- * since this really shouldn't happen that often.
- */
- ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
- dst_rsv, num_bytes);
- goto out;
}
-migrate:
- ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
+ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
-out:
/*
* Migrate only takes a reservation, it doesn't touch the size of the
* block_rsv. This is to simplify people who don't normally have things
@@ -1188,7 +1170,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
if (ret) {
btrfs_release_delayed_node(curr_node);
curr_node = NULL;
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
break;
}
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 430b3689b112..b6d210e7a993 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -606,7 +606,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
qrecord->num_bytes = num_bytes;
qrecord->old_roots = NULL;
- qexisting = btrfs_qgroup_insert_dirty_extent(delayed_refs,
+ qexisting = btrfs_qgroup_insert_dirty_extent(fs_info,
+ delayed_refs,
qrecord);
if (qexisting)
kfree(qrecord);
@@ -615,7 +616,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
spin_lock_init(&head_ref->lock);
mutex_init(&head_ref->mutex);
- trace_add_delayed_ref_head(ref, head_ref, action);
+ trace_add_delayed_ref_head(fs_info, ref, head_ref, action);
existing = htree_insert(&delayed_refs->href_root,
&head_ref->href_node);
@@ -682,7 +683,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
ref->type = BTRFS_TREE_BLOCK_REF_KEY;
full_ref->level = level;
- trace_add_delayed_tree_ref(ref, full_ref, action);
+ trace_add_delayed_tree_ref(fs_info, ref, full_ref, action);
ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref);
@@ -739,7 +740,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
full_ref->objectid = owner;
full_ref->offset = offset;
- trace_add_delayed_data_ref(ref, full_ref, action);
+ trace_add_delayed_data_ref(fs_info, ref, full_ref, action);
ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref);
@@ -940,28 +941,28 @@ int btrfs_delayed_ref_init(void)
btrfs_delayed_ref_head_cachep = kmem_cache_create(
"btrfs_delayed_ref_head",
sizeof(struct btrfs_delayed_ref_head), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ SLAB_MEM_SPREAD, NULL);
if (!btrfs_delayed_ref_head_cachep)
goto fail;
btrfs_delayed_tree_ref_cachep = kmem_cache_create(
"btrfs_delayed_tree_ref",
sizeof(struct btrfs_delayed_tree_ref), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ SLAB_MEM_SPREAD, NULL);
if (!btrfs_delayed_tree_ref_cachep)
goto fail;
btrfs_delayed_data_ref_cachep = kmem_cache_create(
"btrfs_delayed_data_ref",
sizeof(struct btrfs_delayed_data_ref), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ SLAB_MEM_SPREAD, NULL);
if (!btrfs_delayed_data_ref_cachep)
goto fail;
btrfs_delayed_extent_op_cachep = kmem_cache_create(
"btrfs_delayed_extent_op",
sizeof(struct btrfs_delayed_extent_op), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ SLAB_MEM_SPREAD, NULL);
if (!btrfs_delayed_extent_op_cachep)
goto fail;
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 63ef9cdf0144..e9bbff3c0029 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -142,7 +142,7 @@ no_valid_dev_replace_entry_found:
* missing
*/
if (!dev_replace->srcdev &&
- !btrfs_test_opt(dev_root, DEGRADED)) {
+ !btrfs_test_opt(dev_root->fs_info, DEGRADED)) {
ret = -EIO;
btrfs_warn(fs_info,
"cannot mount because device replace operation is ongoing and");
@@ -151,7 +151,7 @@ no_valid_dev_replace_entry_found:
src_devid);
}
if (!dev_replace->tgtdev &&
- !btrfs_test_opt(dev_root, DEGRADED)) {
+ !btrfs_test_opt(dev_root->fs_info, DEGRADED)) {
ret = -EIO;
btrfs_warn(fs_info,
"cannot mount because device replace operation is ongoing and");
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 9a726ded2c6d..87dad552e39a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -101,7 +101,7 @@ int __init btrfs_end_io_wq_init(void)
btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq",
sizeof(struct btrfs_end_io_wq),
0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ SLAB_MEM_SPREAD,
NULL);
if (!btrfs_end_io_wq_cache)
return -ENOMEM;
@@ -1140,7 +1140,7 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
u64 bytenr)
{
- if (btrfs_test_is_dummy_root(root))
+ if (btrfs_is_testing(root->fs_info))
return alloc_test_extent_buffer(root->fs_info, bytenr,
root->nodesize);
return alloc_extent_buffer(root->fs_info, bytenr);
@@ -1227,6 +1227,7 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
struct btrfs_root *root, struct btrfs_fs_info *fs_info,
u64 objectid)
{
+ bool dummy = test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
root->node = NULL;
root->commit_root = NULL;
root->sectorsize = sectorsize;
@@ -1281,14 +1282,14 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
root->log_transid = 0;
root->log_transid_committed = -1;
root->last_log_commit = 0;
- if (fs_info)
+ if (!dummy)
extent_io_tree_init(&root->dirty_log_pages,
fs_info->btree_inode->i_mapping);
memset(&root->root_key, 0, sizeof(root->root_key));
memset(&root->root_item, 0, sizeof(root->root_item));
memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
- if (fs_info)
+ if (!dummy)
root->defrag_trans_start = fs_info->generation;
else
root->defrag_trans_start = 0;
@@ -1309,17 +1310,20 @@ static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/* Should only be used by the testing infrastructure */
-struct btrfs_root *btrfs_alloc_dummy_root(u32 sectorsize, u32 nodesize)
+struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info,
+ u32 sectorsize, u32 nodesize)
{
struct btrfs_root *root;
- root = btrfs_alloc_root(NULL, GFP_KERNEL);
+ if (!fs_info)
+ return ERR_PTR(-EINVAL);
+
+ root = btrfs_alloc_root(fs_info, GFP_KERNEL);
if (!root)
return ERR_PTR(-ENOMEM);
/* We don't use the stripesize in selftest, set it as sectorsize */
- __setup_root(nodesize, sectorsize, sectorsize, root, NULL,
+ __setup_root(nodesize, sectorsize, sectorsize, root, fs_info,
BTRFS_ROOT_TREE_OBJECTID);
- set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state);
root->alloc_bytenr = 0;
return root;
@@ -1594,14 +1598,14 @@ int btrfs_init_fs_root(struct btrfs_root *root)
ret = get_anon_bdev(&root->anon_dev);
if (ret)
- goto free_writers;
+ goto fail;
mutex_lock(&root->objectid_mutex);
ret = btrfs_find_highest_objectid(root,
&root->highest_objectid);
if (ret) {
mutex_unlock(&root->objectid_mutex);
- goto free_root_dev;
+ goto fail;
}
ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID);
@@ -1609,14 +1613,8 @@ int btrfs_init_fs_root(struct btrfs_root *root)
mutex_unlock(&root->objectid_mutex);
return 0;
-
-free_root_dev:
- free_anon_bdev(root->anon_dev);
-free_writers:
- btrfs_free_subvolume_writers(root->subv_writers);
fail:
- kfree(root->free_ino_ctl);
- kfree(root->free_ino_pinned);
+ /* the caller is responsible to call free_fs_root */
return ret;
}
@@ -2310,17 +2308,19 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
fs_info->workers =
- btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI,
- max_active, 16);
+ btrfs_alloc_workqueue(fs_info, "worker",
+ flags | WQ_HIGHPRI, max_active, 16);
fs_info->delalloc_workers =
- btrfs_alloc_workqueue("delalloc", flags, max_active, 2);
+ btrfs_alloc_workqueue(fs_info, "delalloc",
+ flags, max_active, 2);
fs_info->flush_workers =
- btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0);
+ btrfs_alloc_workqueue(fs_info, "flush_delalloc",
+ flags, max_active, 0);
fs_info->caching_workers =
- btrfs_alloc_workqueue("cache", flags, max_active, 0);
+ btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);
/*
* a higher idle thresh on the submit workers makes it much more
@@ -2328,41 +2328,48 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info,
* devices
*/
fs_info->submit_workers =
- btrfs_alloc_workqueue("submit", flags,
+ btrfs_alloc_workqueue(fs_info, "submit", flags,
min_t(u64, fs_devices->num_devices,
max_active), 64);
fs_info->fixup_workers =
- btrfs_alloc_workqueue("fixup", flags, 1, 0);
+ btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0);
/*
* endios are largely parallel and should have a very
* low idle thresh
*/
fs_info->endio_workers =
- btrfs_alloc_workqueue("endio", flags, max_active, 4);
+ btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4);
fs_info->endio_meta_workers =
- btrfs_alloc_workqueue("endio-meta", flags, max_active, 4);
+ btrfs_alloc_workqueue(fs_info, "endio-meta", flags,
+ max_active, 4);
fs_info->endio_meta_write_workers =
- btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2);
+ btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags,
+ max_active, 2);
fs_info->endio_raid56_workers =
- btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4);
+ btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
+ max_active, 4);
fs_info->endio_repair_workers =
- btrfs_alloc_workqueue("endio-repair", flags, 1, 0);
+ btrfs_alloc_workqueue(fs_info, "endio-repair", flags, 1, 0);
fs_info->rmw_workers =
- btrfs_alloc_workqueue("rmw", flags, max_active, 2);
+ btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2);
fs_info->endio_write_workers =
- btrfs_alloc_workqueue("endio-write", flags, max_active, 2);
+ btrfs_alloc_workqueue(fs_info, "endio-write", flags,
+ max_active, 2);
fs_info->endio_freespace_worker =
- btrfs_alloc_workqueue("freespace-write", flags, max_active, 0);
+ btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
+ max_active, 0);
fs_info->delayed_workers =
- btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0);
+ btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
+ max_active, 0);
fs_info->readahead_workers =
- btrfs_alloc_workqueue("readahead", flags, max_active, 2);
+ btrfs_alloc_workqueue(fs_info, "readahead", flags,
+ max_active, 2);
fs_info->qgroup_rescan_workers =
- btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0);
+ btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
fs_info->extent_workers =
- btrfs_alloc_workqueue("extent-refs", flags,
+ btrfs_alloc_workqueue(fs_info, "extent-refs", flags,
min_t(u64, fs_devices->num_devices,
max_active), 8);
@@ -3010,8 +3017,8 @@ retry_root_backup:
if (IS_ERR(fs_info->transaction_kthread))
goto fail_cleaner;
- if (!btrfs_test_opt(tree_root, SSD) &&
- !btrfs_test_opt(tree_root, NOSSD) &&
+ if (!btrfs_test_opt(tree_root->fs_info, SSD) &&
+ !btrfs_test_opt(tree_root->fs_info, NOSSD) &&
!fs_info->fs_devices->rotating) {
btrfs_info(fs_info, "detected SSD devices, enabling SSD mode");
btrfs_set_opt(fs_info->mount_opt, SSD);
@@ -3024,9 +3031,9 @@ retry_root_backup:
btrfs_apply_pending_changes(fs_info);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
+ if (btrfs_test_opt(tree_root->fs_info, CHECK_INTEGRITY)) {
ret = btrfsic_mount(tree_root, fs_devices,
- btrfs_test_opt(tree_root,
+ btrfs_test_opt(tree_root->fs_info,
CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
1 : 0,
fs_info->check_integrity_print_mask);
@@ -3042,7 +3049,7 @@ retry_root_backup:
/* do not make disk changes in broken FS or nologreplay is given */
if (btrfs_super_log_root(disk_super) != 0 &&
- !btrfs_test_opt(tree_root, NOLOGREPLAY)) {
+ !btrfs_test_opt(tree_root->fs_info, NOLOGREPLAY)) {
ret = btrfs_replay_log(fs_info, fs_devices);
if (ret) {
err = ret;
@@ -3083,7 +3090,7 @@ retry_root_backup:
if (sb->s_flags & MS_RDONLY)
return 0;
- if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) &&
+ if (btrfs_test_opt(tree_root->fs_info, FREE_SPACE_TREE) &&
!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
btrfs_info(fs_info, "creating free space tree");
ret = btrfs_create_free_space_tree(fs_info);
@@ -3120,7 +3127,7 @@ retry_root_backup:
btrfs_qgroup_rescan_resume(fs_info);
- if (btrfs_test_opt(tree_root, CLEAR_CACHE) &&
+ if (btrfs_test_opt(tree_root->fs_info, CLEAR_CACHE) &&
btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
btrfs_info(fs_info, "clearing free space tree");
ret = btrfs_clear_free_space_tree(fs_info);
@@ -3141,7 +3148,7 @@ retry_root_backup:
close_ctree(tree_root);
return ret;
}
- } else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) ||
+ } else if (btrfs_test_opt(tree_root->fs_info, RESCAN_UUID_TREE) ||
fs_info->generation !=
btrfs_super_uuid_tree_generation(disk_super)) {
btrfs_info(fs_info, "checking UUID tree");
@@ -3218,7 +3225,7 @@ fail:
return err;
recovery_tree_root:
- if (!btrfs_test_opt(tree_root, USEBACKUPROOT))
+ if (!btrfs_test_opt(tree_root->fs_info, USEBACKUPROOT))
goto fail_tree_roots;
free_root_pointers(fs_info, 0);
@@ -3634,7 +3641,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
int total_errors = 0;
u64 flags;
- do_barriers = !btrfs_test_opt(root, NOBARRIER);
+ do_barriers = !btrfs_test_opt(root->fs_info, NOBARRIER);
backup_super_roots(root->fs_info);
sb = root->fs_info->super_for_commit;
@@ -3918,7 +3925,7 @@ void close_ctree(struct btrfs_root *root)
iput(fs_info->btree_inode);
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- if (btrfs_test_opt(root, CHECK_INTEGRITY))
+ if (btrfs_test_opt(root->fs_info, CHECK_INTEGRITY))
btrfsic_unmount(root, fs_info->fs_devices);
#endif
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index dbf3e1aab69e..b3207a0e09f7 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -90,7 +90,8 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
void btrfs_free_fs_root(struct btrfs_root *root);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-struct btrfs_root *btrfs_alloc_dummy_root(u32 sectorsize, u32 nodesize);
+struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info,
+ u32 sectorsize, u32 nodesize);
#endif
/*
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b480fd555774..61b494e8e604 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -111,6 +111,16 @@ static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
u64 num_bytes);
int btrfs_pin_extent(struct btrfs_root *root,
u64 bytenr, u64 num_bytes, int reserved);
+static int __reserve_metadata_bytes(struct btrfs_root *root,
+ struct btrfs_space_info *space_info,
+ u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush);
+static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ u64 num_bytes);
+static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ u64 num_bytes);
static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -2170,7 +2180,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
path, bytenr, parent, root_objectid,
owner, offset, refs_to_add);
if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
out:
btrfs_free_path(path);
return ret;
@@ -2194,7 +2204,7 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
ins.type = BTRFS_EXTENT_ITEM_KEY;
ref = btrfs_delayed_node_to_data_ref(node);
- trace_run_delayed_data_ref(node, ref, node->action);
+ trace_run_delayed_data_ref(root->fs_info, node, ref, node->action);
if (node->type == BTRFS_SHARED_DATA_REF_KEY)
parent = ref->parent;
@@ -2349,7 +2359,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
SKINNY_METADATA);
ref = btrfs_delayed_node_to_tree_ref(node);
- trace_run_delayed_tree_ref(node, ref, node->action);
+ trace_run_delayed_tree_ref(root->fs_info, node, ref, node->action);
if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
parent = ref->parent;
@@ -2413,7 +2423,8 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
*/
BUG_ON(extent_op);
head = btrfs_delayed_node_to_head(node);
- trace_run_delayed_ref_head(node, head, node->action);
+ trace_run_delayed_ref_head(root->fs_info, node, head,
+ node->action);
if (insert_reserved) {
btrfs_pin_extent(root, node->bytenr,
@@ -2768,7 +2779,7 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes)
u64 num_csums_per_leaf;
u64 num_csums;
- csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
+ csum_size = BTRFS_MAX_ITEM_SIZE(root);
num_csums_per_leaf = div64_u64(csum_size,
(u64)btrfs_super_csum_size(root->fs_info->super_copy));
num_csums = div64_u64(csum_bytes, root->sectorsize);
@@ -2960,7 +2971,7 @@ again:
trans->can_flush_pending_bgs = false;
ret = __btrfs_run_delayed_refs(trans, root, count);
if (ret < 0) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -3224,7 +3235,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
u64, u64, u64, u64, u64, u64);
- if (btrfs_test_is_dummy_root(root))
+ if (btrfs_is_testing(root->fs_info))
return 0;
ref_root = btrfs_header_owner(buf);
@@ -3419,7 +3430,7 @@ again:
* transaction, this only happens in really bad situations
* anyway.
*/
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_put;
}
WARN_ON(ret);
@@ -3437,7 +3448,7 @@ again:
spin_lock(&block_group->lock);
if (block_group->cached != BTRFS_CACHE_FINISHED ||
- !btrfs_test_opt(root, SPACE_CACHE)) {
+ !btrfs_test_opt(root->fs_info, SPACE_CACHE)) {
/*
* don't bother trying to write stuff out _if_
* a) we're not cached,
@@ -3514,7 +3525,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans,
struct btrfs_path *path;
if (list_empty(&cur_trans->dirty_bgs) ||
- !btrfs_test_opt(root, SPACE_CACHE))
+ !btrfs_test_opt(root->fs_info, SPACE_CACHE))
return 0;
path = btrfs_alloc_path();
@@ -3659,7 +3670,7 @@ again:
}
spin_unlock(&cur_trans->dirty_bgs_lock);
} else if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
}
}
@@ -3805,7 +3816,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
cache);
}
if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
}
/* if its not on the io list, we need to put the block group */
@@ -3913,6 +3924,7 @@ static const char *alloc_name(u64 flags)
static int update_space_info(struct btrfs_fs_info *info, u64 flags,
u64 total_bytes, u64 bytes_used,
+ u64 bytes_readonly,
struct btrfs_space_info **space_info)
{
struct btrfs_space_info *found;
@@ -3933,8 +3945,11 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->disk_total += total_bytes * factor;
found->bytes_used += bytes_used;
found->disk_used += bytes_used * factor;
+ found->bytes_readonly += bytes_readonly;
if (total_bytes > 0)
found->full = 0;
+ space_info_add_new_bytes(info, found, total_bytes -
+ bytes_used - bytes_readonly);
spin_unlock(&found->lock);
*space_info = found;
return 0;
@@ -3960,7 +3975,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->disk_used = bytes_used * factor;
found->bytes_pinned = 0;
found->bytes_reserved = 0;
- found->bytes_readonly = 0;
+ found->bytes_readonly = bytes_readonly;
found->bytes_may_use = 0;
found->full = 0;
found->max_extent_size = 0;
@@ -3969,6 +3984,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->flush = 0;
init_waitqueue_head(&found->wait);
INIT_LIST_HEAD(&found->ro_bgs);
+ INIT_LIST_HEAD(&found->tickets);
+ INIT_LIST_HEAD(&found->priority_tickets);
ret = kobject_init_and_add(&found->kobj, &space_info_ktype,
info->space_info_kobj, "%s",
@@ -4427,7 +4444,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans,
thresh = btrfs_calc_trunc_metadata_size(root, num_devs) +
btrfs_calc_trans_metadata_size(root, 1);
- if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
+ if (left < thresh && btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
left, thresh, type);
dump_space_info(info, 0, 0);
@@ -4470,7 +4487,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
space_info = __find_space_info(extent_root->fs_info, flags);
if (!space_info) {
ret = update_space_info(extent_root->fs_info, flags,
- 0, 0, &space_info);
+ 0, 0, 0, &space_info);
BUG_ON(ret); /* -ENOMEM */
}
BUG_ON(!space_info); /* Logic error */
@@ -4572,7 +4589,7 @@ out:
*/
if (trans->can_flush_pending_bgs &&
trans->chunk_bytes_reserved >= (u64)SZ_2M) {
- btrfs_create_pending_block_groups(trans, trans->root);
+ btrfs_create_pending_block_groups(trans, extent_root);
btrfs_trans_release_chunk_metadata(trans);
}
return ret;
@@ -4582,12 +4599,19 @@ static int can_overcommit(struct btrfs_root *root,
struct btrfs_space_info *space_info, u64 bytes,
enum btrfs_reserve_flush_enum flush)
{
- struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
- u64 profile = btrfs_get_alloc_profile(root, 0);
+ struct btrfs_block_rsv *global_rsv;
+ u64 profile;
u64 space_size;
u64 avail;
u64 used;
+ /* Don't overcommit when in mixed mode. */
+ if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
+ return 0;
+
+ BUG_ON(root->fs_info == NULL);
+ global_rsv = &root->fs_info->global_block_rsv;
+ profile = btrfs_get_alloc_profile(root, 0);
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_pinned + space_info->bytes_readonly;
@@ -4739,6 +4763,11 @@ skip_async:
spin_unlock(&space_info->lock);
break;
}
+ if (list_empty(&space_info->tickets) &&
+ list_empty(&space_info->priority_tickets)) {
+ spin_unlock(&space_info->lock);
+ break;
+ }
spin_unlock(&space_info->lock);
loops++;
@@ -4807,13 +4836,11 @@ commit:
return btrfs_commit_transaction(trans, root);
}
-enum flush_state {
- FLUSH_DELAYED_ITEMS_NR = 1,
- FLUSH_DELAYED_ITEMS = 2,
- FLUSH_DELALLOC = 3,
- FLUSH_DELALLOC_WAIT = 4,
- ALLOC_CHUNK = 5,
- COMMIT_TRANS = 6,
+struct reserve_ticket {
+ u64 bytes;
+ int error;
+ struct list_head list;
+ wait_queue_head_t wait;
};
static int flush_space(struct btrfs_root *root,
@@ -4866,6 +4893,8 @@ static int flush_space(struct btrfs_root *root,
break;
}
+ trace_btrfs_flush_space(root->fs_info, space_info->flags, num_bytes,
+ orig_bytes, state, ret);
return ret;
}
@@ -4873,17 +4902,22 @@ static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
struct btrfs_space_info *space_info)
{
+ struct reserve_ticket *ticket;
u64 used;
u64 expected;
- u64 to_reclaim;
+ u64 to_reclaim = 0;
to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
- spin_lock(&space_info->lock);
if (can_overcommit(root, space_info, to_reclaim,
- BTRFS_RESERVE_FLUSH_ALL)) {
- to_reclaim = 0;
- goto out;
- }
+ BTRFS_RESERVE_FLUSH_ALL))
+ return 0;
+
+ list_for_each_entry(ticket, &space_info->tickets, list)
+ to_reclaim += ticket->bytes;
+ list_for_each_entry(ticket, &space_info->priority_tickets, list)
+ to_reclaim += ticket->bytes;
+ if (to_reclaim)
+ return to_reclaim;
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_pinned + space_info->bytes_readonly +
@@ -4899,14 +4933,11 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
to_reclaim = 0;
to_reclaim = min(to_reclaim, space_info->bytes_may_use +
space_info->bytes_reserved);
-out:
- spin_unlock(&space_info->lock);
-
return to_reclaim;
}
static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
- struct btrfs_fs_info *fs_info, u64 used)
+ struct btrfs_root *root, u64 used)
{
u64 thresh = div_factor_fine(space_info->total_bytes, 98);
@@ -4914,73 +4945,177 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info,
if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
return 0;
- return (used >= thresh && !btrfs_fs_closing(fs_info) &&
- !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
+ if (!btrfs_calc_reclaim_metadata_size(root, space_info))
+ return 0;
+
+ return (used >= thresh && !btrfs_fs_closing(root->fs_info) &&
+ !test_bit(BTRFS_FS_STATE_REMOUNTING,
+ &root->fs_info->fs_state));
}
-static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info,
- struct btrfs_fs_info *fs_info,
- int flush_state)
+static void wake_all_tickets(struct list_head *head)
{
- u64 used;
-
- spin_lock(&space_info->lock);
- /*
- * We run out of space and have not got any free space via flush_space,
- * so don't bother doing async reclaim.
- */
- if (flush_state > COMMIT_TRANS && space_info->full) {
- spin_unlock(&space_info->lock);
- return 0;
- }
+ struct reserve_ticket *ticket;
- used = space_info->bytes_used + space_info->bytes_reserved +
- space_info->bytes_pinned + space_info->bytes_readonly +
- space_info->bytes_may_use;
- if (need_do_async_reclaim(space_info, fs_info, used)) {
- spin_unlock(&space_info->lock);
- return 1;
+ while (!list_empty(head)) {
+ ticket = list_first_entry(head, struct reserve_ticket, list);
+ list_del_init(&ticket->list);
+ ticket->error = -ENOSPC;
+ wake_up(&ticket->wait);
}
- spin_unlock(&space_info->lock);
-
- return 0;
}
+/*
+ * This is for normal flushers, we can wait all goddamned day if we want to. We
+ * will loop and continuously try to flush as long as we are making progress.
+ * We count progress as clearing off tickets each time we have to loop.
+ */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
+ struct reserve_ticket *last_ticket = NULL;
struct btrfs_fs_info *fs_info;
struct btrfs_space_info *space_info;
u64 to_reclaim;
int flush_state;
+ int commit_cycles = 0;
fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+ spin_lock(&space_info->lock);
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
space_info);
- if (!to_reclaim)
+ if (!to_reclaim) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
return;
+ }
+ last_ticket = list_first_entry(&space_info->tickets,
+ struct reserve_ticket, list);
+ spin_unlock(&space_info->lock);
flush_state = FLUSH_DELAYED_ITEMS_NR;
do {
+ struct reserve_ticket *ticket;
+ int ret;
+
+ ret = flush_space(fs_info->fs_root, space_info, to_reclaim,
+ to_reclaim, flush_state);
+ spin_lock(&space_info->lock);
+ if (list_empty(&space_info->tickets)) {
+ space_info->flush = 0;
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
+ space_info);
+ ticket = list_first_entry(&space_info->tickets,
+ struct reserve_ticket, list);
+ if (last_ticket == ticket) {
+ flush_state++;
+ } else {
+ last_ticket = ticket;
+ flush_state = FLUSH_DELAYED_ITEMS_NR;
+ if (commit_cycles)
+ commit_cycles--;
+ }
+
+ if (flush_state > COMMIT_TRANS) {
+ commit_cycles++;
+ if (commit_cycles > 2) {
+ wake_all_tickets(&space_info->tickets);
+ space_info->flush = 0;
+ } else {
+ flush_state = FLUSH_DELAYED_ITEMS_NR;
+ }
+ }
+ spin_unlock(&space_info->lock);
+ } while (flush_state <= COMMIT_TRANS);
+}
+
+void btrfs_init_async_reclaim_work(struct work_struct *work)
+{
+ INIT_WORK(work, btrfs_async_reclaim_metadata_space);
+}
+
+static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket)
+{
+ u64 to_reclaim;
+ int flush_state = FLUSH_DELAYED_ITEMS_NR;
+
+ spin_lock(&space_info->lock);
+ to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info->fs_root,
+ space_info);
+ if (!to_reclaim) {
+ spin_unlock(&space_info->lock);
+ return;
+ }
+ spin_unlock(&space_info->lock);
+
+ do {
flush_space(fs_info->fs_root, space_info, to_reclaim,
to_reclaim, flush_state);
flush_state++;
- if (!btrfs_need_do_async_reclaim(space_info, fs_info,
- flush_state))
+ spin_lock(&space_info->lock);
+ if (ticket->bytes == 0) {
+ spin_unlock(&space_info->lock);
return;
+ }
+ spin_unlock(&space_info->lock);
+
+ /*
+ * Priority flushers can't wait on delalloc without
+ * deadlocking.
+ */
+ if (flush_state == FLUSH_DELALLOC ||
+ flush_state == FLUSH_DELALLOC_WAIT)
+ flush_state = ALLOC_CHUNK;
} while (flush_state < COMMIT_TRANS);
}
-void btrfs_init_async_reclaim_work(struct work_struct *work)
+static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket, u64 orig_bytes)
+
{
- INIT_WORK(work, btrfs_async_reclaim_metadata_space);
+ DEFINE_WAIT(wait);
+ int ret = 0;
+
+ spin_lock(&space_info->lock);
+ while (ticket->bytes > 0 && ticket->error == 0) {
+ ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
+ if (ret) {
+ ret = -EINTR;
+ break;
+ }
+ spin_unlock(&space_info->lock);
+
+ schedule();
+
+ finish_wait(&ticket->wait, &wait);
+ spin_lock(&space_info->lock);
+ }
+ if (!ret)
+ ret = ticket->error;
+ if (!list_empty(&ticket->list))
+ list_del_init(&ticket->list);
+ if (ticket->bytes && ticket->bytes < orig_bytes) {
+ u64 num_bytes = orig_bytes - ticket->bytes;
+ space_info->bytes_may_use -= num_bytes;
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ space_info->flags, num_bytes, 0);
+ }
+ spin_unlock(&space_info->lock);
+
+ return ret;
}
/**
* reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
* @root - the root we're allocating for
- * @block_rsv - the block_rsv we're allocating for
+ * @space_info - the space info we want to allocate from
* @orig_bytes - the number of bytes we want
* @flush - whether or not we can flush to make our reservation
*
@@ -4991,81 +5126,36 @@ void btrfs_init_async_reclaim_work(struct work_struct *work)
* regain reservations will be made and this will fail if there is not enough
* space already.
*/
-static int reserve_metadata_bytes(struct btrfs_root *root,
- struct btrfs_block_rsv *block_rsv,
- u64 orig_bytes,
- enum btrfs_reserve_flush_enum flush)
+static int __reserve_metadata_bytes(struct btrfs_root *root,
+ struct btrfs_space_info *space_info,
+ u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush)
{
- struct btrfs_space_info *space_info = block_rsv->space_info;
+ struct reserve_ticket ticket;
u64 used;
- u64 num_bytes = orig_bytes;
- int flush_state = FLUSH_DELAYED_ITEMS_NR;
int ret = 0;
- bool flushing = false;
-
-again:
- ret = 0;
- spin_lock(&space_info->lock);
- /*
- * We only want to wait if somebody other than us is flushing and we
- * are actually allowed to flush all things.
- */
- while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
- space_info->flush) {
- spin_unlock(&space_info->lock);
- /*
- * If we have a trans handle we can't wait because the flusher
- * may have to commit the transaction, which would mean we would
- * deadlock since we are waiting for the flusher to finish, but
- * hold the current transaction open.
- */
- if (current->journal_info)
- return -EAGAIN;
- ret = wait_event_killable(space_info->wait, !space_info->flush);
- /* Must have been killed, return */
- if (ret)
- return -EINTR;
- spin_lock(&space_info->lock);
- }
+ ASSERT(orig_bytes);
+ ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);
+ spin_lock(&space_info->lock);
ret = -ENOSPC;
used = space_info->bytes_used + space_info->bytes_reserved +
space_info->bytes_pinned + space_info->bytes_readonly +
space_info->bytes_may_use;
/*
- * The idea here is that we've not already over-reserved the block group
- * then we can go ahead and save our reservation first and then start
- * flushing if we need to. Otherwise if we've already overcommitted
- * lets start flushing stuff first and then come back and try to make
- * our reservation.
+ * If we have enough space then hooray, make our reservation and carry
+ * on. If not see if we can overcommit, and if we can, hooray carry on.
+ * If not things get more complicated.
*/
- if (used <= space_info->total_bytes) {
- if (used + orig_bytes <= space_info->total_bytes) {
- space_info->bytes_may_use += orig_bytes;
- trace_btrfs_space_reservation(root->fs_info,
- "space_info", space_info->flags, orig_bytes, 1);
- ret = 0;
- } else {
- /*
- * Ok set num_bytes to orig_bytes since we aren't
- * overocmmitted, this way we only try and reclaim what
- * we need.
- */
- num_bytes = orig_bytes;
- }
- } else {
- /*
- * Ok we're over committed, set num_bytes to the overcommitted
- * amount plus the amount of bytes that we need for this
- * reservation.
- */
- num_bytes = used - space_info->total_bytes +
- (orig_bytes * 2);
- }
-
- if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
+ if (used + orig_bytes <= space_info->total_bytes) {
+ space_info->bytes_may_use += orig_bytes;
+ trace_btrfs_space_reservation(root->fs_info, "space_info",
+ space_info->flags, orig_bytes,
+ 1);
+ ret = 0;
+ } else if (can_overcommit(root, space_info, orig_bytes, flush)) {
space_info->bytes_may_use += orig_bytes;
trace_btrfs_space_reservation(root->fs_info, "space_info",
space_info->flags, orig_bytes,
@@ -5074,16 +5164,31 @@ again:
}
/*
- * Couldn't make our reservation, save our place so while we're trying
- * to reclaim space we can actually use it instead of somebody else
- * stealing it from us.
+ * If we couldn't make a reservation then setup our reservation ticket
+ * and kick the async worker if it's not already running.
*
- * We make the other tasks wait for the flush only when we can flush
- * all things.
+ * If we are a priority flusher then we just need to add our ticket to
+ * the list and we will do our own flushing further down.
*/
if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
- flushing = true;
- space_info->flush = 1;
+ ticket.bytes = orig_bytes;
+ ticket.error = 0;
+ init_waitqueue_head(&ticket.wait);
+ if (flush == BTRFS_RESERVE_FLUSH_ALL) {
+ list_add_tail(&ticket.list, &space_info->tickets);
+ if (!space_info->flush) {
+ space_info->flush = 1;
+ trace_btrfs_trigger_flush(root->fs_info,
+ space_info->flags,
+ orig_bytes, flush,
+ "enospc");
+ queue_work(system_unbound_wq,
+ &root->fs_info->async_reclaim_work);
+ }
+ } else {
+ list_add_tail(&ticket.list,
+ &space_info->priority_tickets);
+ }
} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
used += orig_bytes;
/*
@@ -5092,39 +5197,67 @@ again:
* the async reclaim as we will panic.
*/
if (!root->fs_info->log_root_recovering &&
- need_do_async_reclaim(space_info, root->fs_info, used) &&
- !work_busy(&root->fs_info->async_reclaim_work))
+ need_do_async_reclaim(space_info, root, used) &&
+ !work_busy(&root->fs_info->async_reclaim_work)) {
+ trace_btrfs_trigger_flush(root->fs_info,
+ space_info->flags,
+ orig_bytes, flush,
+ "preempt");
queue_work(system_unbound_wq,
&root->fs_info->async_reclaim_work);
+ }
}
spin_unlock(&space_info->lock);
-
if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
- goto out;
+ return ret;
- ret = flush_space(root, space_info, num_bytes, orig_bytes,
- flush_state);
- flush_state++;
+ if (flush == BTRFS_RESERVE_FLUSH_ALL)
+ return wait_reserve_ticket(root->fs_info, space_info, &ticket,
+ orig_bytes);
- /*
- * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
- * would happen. So skip delalloc flush.
- */
- if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
- (flush_state == FLUSH_DELALLOC ||
- flush_state == FLUSH_DELALLOC_WAIT))
- flush_state = ALLOC_CHUNK;
+ ret = 0;
+ priority_reclaim_metadata_space(root->fs_info, space_info, &ticket);
+ spin_lock(&space_info->lock);
+ if (ticket.bytes) {
+ if (ticket.bytes < orig_bytes) {
+ u64 num_bytes = orig_bytes - ticket.bytes;
+ space_info->bytes_may_use -= num_bytes;
+ trace_btrfs_space_reservation(root->fs_info,
+ "space_info", space_info->flags,
+ num_bytes, 0);
- if (!ret)
- goto again;
- else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
- flush_state < COMMIT_TRANS)
- goto again;
- else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
- flush_state <= COMMIT_TRANS)
- goto again;
+ }
+ list_del_init(&ticket.list);
+ ret = -ENOSPC;
+ }
+ spin_unlock(&space_info->lock);
+ ASSERT(list_empty(&ticket.list));
+ return ret;
+}
-out:
+/**
+ * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
+ * @root - the root we're allocating for
+ * @block_rsv - the block_rsv we're allocating for
+ * @orig_bytes - the number of bytes we want
+ * @flush - whether or not we can flush to make our reservation
+ *
+ * This will reserve orgi_bytes number of bytes from the space info associated
+ * with the block_rsv. If there is not enough space it will make an attempt to
+ * flush out space to make room. It will do this by flushing delalloc if
+ * possible or committing the transaction. If flush is 0 then no attempts to
+ * regain reservations will be made and this will fail if there is not enough
+ * space already.
+ */
+static int reserve_metadata_bytes(struct btrfs_root *root,
+ struct btrfs_block_rsv *block_rsv,
+ u64 orig_bytes,
+ enum btrfs_reserve_flush_enum flush)
+{
+ int ret;
+
+ ret = __reserve_metadata_bytes(root, block_rsv->space_info, orig_bytes,
+ flush);
if (ret == -ENOSPC &&
unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
struct btrfs_block_rsv *global_rsv =
@@ -5137,13 +5270,8 @@ out:
if (ret == -ENOSPC)
trace_btrfs_space_reservation(root->fs_info,
"space_info:enospc",
- space_info->flags, orig_bytes, 1);
- if (flushing) {
- spin_lock(&space_info->lock);
- space_info->flush = 0;
- wake_up_all(&space_info->wait);
- spin_unlock(&space_info->lock);
- }
+ block_rsv->space_info->flags,
+ orig_bytes, 1);
return ret;
}
@@ -5219,6 +5347,108 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
return 0;
}
+/*
+ * This is for space we already have accounted in space_info->bytes_may_use, so
+ * basically when we're returning space from block_rsv's.
+ */
+static void space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ u64 num_bytes)
+{
+ struct reserve_ticket *ticket;
+ struct list_head *head;
+ u64 used;
+ enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
+ bool check_overcommit = false;
+
+ spin_lock(&space_info->lock);
+ head = &space_info->priority_tickets;
+
+ /*
+ * If we are over our limit then we need to check and see if we can
+ * overcommit, and if we can't then we just need to free up our space
+ * and not satisfy any requests.
+ */
+ used = space_info->bytes_used + space_info->bytes_reserved +
+ space_info->bytes_pinned + space_info->bytes_readonly +
+ space_info->bytes_may_use;
+ if (used - num_bytes >= space_info->total_bytes)
+ check_overcommit = true;
+again:
+ while (!list_empty(head) && num_bytes) {
+ ticket = list_first_entry(head, struct reserve_ticket,
+ list);
+ /*
+ * We use 0 bytes because this space is already reserved, so
+ * adding the ticket space would be a double count.
+ */
+ if (check_overcommit &&
+ !can_overcommit(fs_info->extent_root, space_info, 0,
+ flush))
+ break;
+ if (num_bytes >= ticket->bytes) {
+ list_del_init(&ticket->list);
+ num_bytes -= ticket->bytes;
+ ticket->bytes = 0;
+ wake_up(&ticket->wait);
+ } else {
+ ticket->bytes -= num_bytes;
+ num_bytes = 0;
+ }
+ }
+
+ if (num_bytes && head == &space_info->priority_tickets) {
+ head = &space_info->tickets;
+ flush = BTRFS_RESERVE_FLUSH_ALL;
+ goto again;
+ }
+ space_info->bytes_may_use -= num_bytes;
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ space_info->flags, num_bytes, 0);
+ spin_unlock(&space_info->lock);
+}
+
+/*
+ * This is for newly allocated space that isn't accounted in
+ * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent
+ * we use this helper.
+ */
+static void space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ u64 num_bytes)
+{
+ struct reserve_ticket *ticket;
+ struct list_head *head = &space_info->priority_tickets;
+
+again:
+ while (!list_empty(head) && num_bytes) {
+ ticket = list_first_entry(head, struct reserve_ticket,
+ list);
+ if (num_bytes >= ticket->bytes) {
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ space_info->flags,
+ ticket->bytes, 1);
+ list_del_init(&ticket->list);
+ num_bytes -= ticket->bytes;
+ space_info->bytes_may_use += ticket->bytes;
+ ticket->bytes = 0;
+ wake_up(&ticket->wait);
+ } else {
+ trace_btrfs_space_reservation(fs_info, "space_info",
+ space_info->flags,
+ num_bytes, 1);
+ space_info->bytes_may_use += num_bytes;
+ ticket->bytes -= num_bytes;
+ num_bytes = 0;
+ }
+ }
+
+ if (num_bytes && head == &space_info->priority_tickets) {
+ head = &space_info->tickets;
+ goto again;
+ }
+}
+
static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
struct btrfs_block_rsv *dest, u64 num_bytes)
@@ -5253,18 +5483,15 @@ static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
}
spin_unlock(&dest->lock);
}
- if (num_bytes) {
- spin_lock(&space_info->lock);
- space_info->bytes_may_use -= num_bytes;
- trace_btrfs_space_reservation(fs_info, "space_info",
- space_info->flags, num_bytes, 0);
- spin_unlock(&space_info->lock);
- }
+ if (num_bytes)
+ space_info_add_old_bytes(fs_info, space_info,
+ num_bytes);
}
}
-static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
- struct btrfs_block_rsv *dst, u64 num_bytes)
+int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
+ struct btrfs_block_rsv *dst, u64 num_bytes,
+ int update_size)
{
int ret;
@@ -5272,7 +5499,7 @@ static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
if (ret)
return ret;
- block_rsv_add_bytes(dst, num_bytes, 1);
+ block_rsv_add_bytes(dst, num_bytes, update_size);
return 0;
}
@@ -5379,13 +5606,6 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
return ret;
}
-int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
- struct btrfs_block_rsv *dst_rsv,
- u64 num_bytes)
-{
- return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
-}
-
void btrfs_block_rsv_release(struct btrfs_root *root,
struct btrfs_block_rsv *block_rsv,
u64 num_bytes)
@@ -5398,48 +5618,21 @@ void btrfs_block_rsv_release(struct btrfs_root *root,
num_bytes);
}
-/*
- * helper to calculate size of global block reservation.
- * the desired value is sum of space used by extent tree,
- * checksum tree and root tree
- */
-static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_space_info *sinfo;
- u64 num_bytes;
- u64 meta_used;
- u64 data_used;
- int csum_size = btrfs_super_csum_size(fs_info->super_copy);
-
- sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
- spin_lock(&sinfo->lock);
- data_used = sinfo->bytes_used;
- spin_unlock(&sinfo->lock);
-
- sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
- spin_lock(&sinfo->lock);
- if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
- data_used = 0;
- meta_used = sinfo->bytes_used;
- spin_unlock(&sinfo->lock);
-
- num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
- csum_size * 2;
- num_bytes += div_u64(data_used + meta_used, 50);
-
- if (num_bytes * 3 > meta_used)
- num_bytes = div_u64(meta_used, 3);
-
- return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10);
-}
-
static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
{
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
struct btrfs_space_info *sinfo = block_rsv->space_info;
u64 num_bytes;
- num_bytes = calc_global_metadata_size(fs_info);
+ /*
+ * The global block rsv is based on the size of the extent tree, the
+ * checksum tree and the root tree. If the fs is empty we want to set
+ * it to a minimal amount for safety.
+ */
+ num_bytes = btrfs_root_used(&fs_info->extent_root->root_item) +
+ btrfs_root_used(&fs_info->csum_root->root_item) +
+ btrfs_root_used(&fs_info->tree_root->root_item);
+ num_bytes = max_t(u64, num_bytes, SZ_16M);
spin_lock(&sinfo->lock);
spin_lock(&block_rsv->lock);
@@ -5537,7 +5730,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
*/
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
{
- struct btrfs_fs_info *fs_info = trans->root->fs_info;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
if (!trans->chunk_bytes_reserved)
return;
@@ -5554,7 +5747,13 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
struct inode *inode)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
+ /*
+ * We always use trans->block_rsv here as we will have reserved space
+ * for our orphan when starting the transaction, using get_block_rsv()
+ * here will sometimes make us choose the wrong block rsv as we could be
+ * doing a reloc inode for a non refcounted root.
+ */
+ struct btrfs_block_rsv *src_rsv = trans->block_rsv;
struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
/*
@@ -5565,7 +5764,7 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
trace_btrfs_space_reservation(root->fs_info, "orphan",
btrfs_ino(inode), num_bytes, 1);
- return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
+ return btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
}
void btrfs_orphan_release_metadata(struct inode *inode)
@@ -5620,7 +5819,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
BTRFS_RESERVE_FLUSH_ALL);
if (ret == -ENOSPC && use_global_rsv)
- ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
+ ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
if (ret && *qgroup_reserved)
btrfs_qgroup_free_meta(root, *qgroup_reserved);
@@ -5730,21 +5929,26 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
u64 to_reserve = 0;
u64 csum_bytes;
unsigned nr_extents = 0;
- int extra_reserve = 0;
enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
int ret = 0;
bool delalloc_lock = true;
u64 to_free = 0;
unsigned dropped;
+ bool release_extra = false;
/* If we are a free space inode we need to not flush since we will be in
* the middle of a transaction commit. We also don't need the delalloc
* mutex since we won't race with anybody. We need this mostly to make
* lockdep shut its filthy mouth.
+ *
+ * If we have a transaction open (can happen if we call truncate_block
+ * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
*/
if (btrfs_is_free_space_inode(inode)) {
flush = BTRFS_RESERVE_NO_FLUSH;
delalloc_lock = false;
+ } else if (current->journal_info) {
+ flush = BTRFS_RESERVE_FLUSH_LIMIT;
}
if (flush != BTRFS_RESERVE_NO_FLUSH &&
@@ -5761,24 +5965,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
BTRFS_MAX_EXTENT_SIZE - 1,
BTRFS_MAX_EXTENT_SIZE);
BTRFS_I(inode)->outstanding_extents += nr_extents;
- nr_extents = 0;
+ nr_extents = 0;
if (BTRFS_I(inode)->outstanding_extents >
BTRFS_I(inode)->reserved_extents)
- nr_extents = BTRFS_I(inode)->outstanding_extents -
+ nr_extents += BTRFS_I(inode)->outstanding_extents -
BTRFS_I(inode)->reserved_extents;
- /*
- * Add an item to reserve for updating the inode when we complete the
- * delalloc io.
- */
- if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
- &BTRFS_I(inode)->runtime_flags)) {
- nr_extents++;
- extra_reserve = 1;
- }
-
- to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
+ /* We always want to reserve a slot for updating the inode. */
+ to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents + 1);
to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
csum_bytes = BTRFS_I(inode)->csum_bytes;
spin_unlock(&BTRFS_I(inode)->lock);
@@ -5790,17 +5985,17 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
goto out_fail;
}
- ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
+ ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush);
if (unlikely(ret)) {
btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
goto out_fail;
}
spin_lock(&BTRFS_I(inode)->lock);
- if (extra_reserve) {
- set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
- &BTRFS_I(inode)->runtime_flags);
- nr_extents--;
+ if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+ &BTRFS_I(inode)->runtime_flags)) {
+ to_reserve -= btrfs_calc_trans_metadata_size(root, 1);
+ release_extra = true;
}
BTRFS_I(inode)->reserved_extents += nr_extents;
spin_unlock(&BTRFS_I(inode)->lock);
@@ -5811,8 +6006,10 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
if (to_reserve)
trace_btrfs_space_reservation(root->fs_info, "delalloc",
btrfs_ino(inode), to_reserve, 1);
- block_rsv_add_bytes(block_rsv, to_reserve, 1);
-
+ if (release_extra)
+ btrfs_block_rsv_release(root, block_rsv,
+ btrfs_calc_trans_metadata_size(root,
+ 1));
return 0;
out_fail:
@@ -5904,7 +6101,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
if (dropped > 0)
to_free += btrfs_calc_trans_metadata_size(root, dropped);
- if (btrfs_test_is_dummy_root(root))
+ if (btrfs_is_testing(root->fs_info))
return;
trace_btrfs_space_reservation(root->fs_info, "delalloc",
@@ -6019,7 +6216,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
spin_lock(&cache->space_info->lock);
spin_lock(&cache->lock);
- if (btrfs_test_opt(root, SPACE_CACHE) &&
+ if (btrfs_test_opt(root->fs_info, SPACE_CACHE) &&
cache->disk_cache_state < BTRFS_DC_CLEAR)
cache->disk_cache_state = BTRFS_DC_CLEAR;
@@ -6044,6 +6241,9 @@ static int update_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
+ trace_btrfs_space_reservation(root->fs_info, "pinned",
+ cache->space_info->flags,
+ num_bytes, 1);
set_extent_dirty(info->pinned_extents,
bytenr, bytenr + num_bytes - 1,
GFP_NOFS | __GFP_NOFAIL);
@@ -6118,10 +6318,10 @@ static int pin_down_extent(struct btrfs_root *root,
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
+ trace_btrfs_space_reservation(root->fs_info, "pinned",
+ cache->space_info->flags, num_bytes, 1);
set_extent_dirty(root->fs_info->pinned_extents, bytenr,
bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
- if (reserved)
- trace_btrfs_reserved_extent_free(root, bytenr, num_bytes);
return 0;
}
@@ -6398,7 +6598,7 @@ fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
u64 *empty_cluster)
{
struct btrfs_free_cluster *ret = NULL;
- bool ssd = btrfs_test_opt(root, SSD);
+ bool ssd = btrfs_test_opt(root->fs_info, SSD);
*empty_cluster = 0;
if (btrfs_mixed_space_info(space_info))
@@ -6476,6 +6676,9 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
spin_lock(&cache->lock);
cache->pinned -= len;
space_info->bytes_pinned -= len;
+
+ trace_btrfs_space_reservation(fs_info, "pinned",
+ space_info->flags, len, 0);
space_info->max_extent_size = 0;
percpu_counter_add(&space_info->total_bytes_pinned, -len);
if (cache->ro) {
@@ -6483,17 +6686,29 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
readonly = true;
}
spin_unlock(&cache->lock);
- if (!readonly && global_rsv->space_info == space_info) {
+ if (!readonly && return_free_space &&
+ global_rsv->space_info == space_info) {
+ u64 to_add = len;
+ WARN_ON(!return_free_space);
spin_lock(&global_rsv->lock);
if (!global_rsv->full) {
- len = min(len, global_rsv->size -
- global_rsv->reserved);
- global_rsv->reserved += len;
- space_info->bytes_may_use += len;
+ to_add = min(len, global_rsv->size -
+ global_rsv->reserved);
+ global_rsv->reserved += to_add;
+ space_info->bytes_may_use += to_add;
if (global_rsv->reserved >= global_rsv->size)
global_rsv->full = 1;
+ trace_btrfs_space_reservation(fs_info,
+ "space_info",
+ space_info->flags,
+ to_add, 1);
+ len -= to_add;
}
spin_unlock(&global_rsv->lock);
+ /* Add to any tickets we may have */
+ if (len)
+ space_info_add_new_bytes(fs_info, space_info,
+ len);
}
spin_unlock(&space_info->lock);
}
@@ -6528,7 +6743,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
break;
}
- if (btrfs_test_opt(root, DISCARD))
+ if (btrfs_test_opt(root->fs_info, DISCARD))
ret = btrfs_discard_extent(root, start,
end + 1 - start, NULL);
@@ -6666,7 +6881,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
NULL, refs_to_drop,
is_data, &last_ref);
if (ret) {
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
btrfs_release_path(path);
@@ -6715,7 +6930,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
path->nodes[0]);
}
if (ret < 0) {
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
extent_slot = path->slots[0];
@@ -6726,10 +6941,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
"unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu",
bytenr, parent, root_objectid, owner_objectid,
owner_offset);
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
} else {
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -6741,7 +6956,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = convert_extent_item_v0(trans, extent_root, path,
owner_objectid, 0);
if (ret < 0) {
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -6760,7 +6975,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
btrfs_print_leaf(extent_root, path->nodes[0]);
}
if (ret < 0) {
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -6785,7 +7000,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
btrfs_err(info, "trying to drop %d refs but we only have %Lu "
"for bytenr %Lu", refs_to_drop, refs, bytenr);
ret = -EINVAL;
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
refs -= refs_to_drop;
@@ -6808,7 +7023,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
iref, refs_to_drop,
is_data, &last_ref);
if (ret) {
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
}
@@ -6831,7 +7046,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
if (ret) {
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
btrfs_release_path(path);
@@ -6839,7 +7054,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
if (is_data) {
ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
if (ret) {
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
}
@@ -6847,13 +7062,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
ret = add_to_free_space_tree(trans, root->fs_info, bytenr,
num_bytes);
if (ret) {
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
ret = update_block_group(trans, root, bytenr, num_bytes, 0);
if (ret) {
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
}
@@ -7002,7 +7217,7 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
- if (btrfs_test_is_dummy_root(root))
+ if (btrfs_is_testing(fs_info))
return 0;
add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
@@ -7637,8 +7852,7 @@ loop:
* can do more things.
*/
if (ret < 0 && ret != -ENOSPC)
- btrfs_abort_transaction(trans,
- root, ret);
+ btrfs_abort_transaction(trans, ret);
else
ret = 0;
if (!exist)
@@ -7692,8 +7906,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n",
info->flags,
info->total_bytes - info->bytes_used - info->bytes_pinned -
- info->bytes_reserved - info->bytes_readonly,
- (info->full) ? "" : "not ");
+ info->bytes_reserved - info->bytes_readonly -
+ info->bytes_may_use, (info->full) ? "" : "not ");
printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, "
"reserved=%llu, may_use=%llu, readonly=%llu\n",
info->total_bytes, info->bytes_used, info->bytes_pinned,
@@ -7747,7 +7961,7 @@ again:
if (num_bytes == min_alloc_size)
final_tried = true;
goto again;
- } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+ } else if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
struct btrfs_space_info *sinfo;
sinfo = __find_space_info(root->fs_info, flags);
@@ -7778,16 +7992,14 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
if (pin)
pin_down_extent(root, cache, start, len, 1);
else {
- if (btrfs_test_opt(root, DISCARD))
+ if (btrfs_test_opt(root->fs_info, DISCARD))
ret = btrfs_discard_extent(root, start, len, NULL);
btrfs_add_free_space(cache, start, len);
btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc);
+ trace_btrfs_reserved_extent_free(root, start, len);
}
btrfs_put_block_group(cache);
-
- trace_btrfs_reserved_extent_free(root, start, len);
-
return ret;
}
@@ -8088,7 +8300,7 @@ again:
goto again;
}
- if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+ if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
static DEFINE_RATELIMIT_STATE(_rs,
DEFAULT_RATELIMIT_INTERVAL * 10,
/*DEFAULT_RATELIMIT_BURST*/ 1);
@@ -8142,13 +8354,15 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
SKINNY_METADATA);
- if (btrfs_test_is_dummy_root(root)) {
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ if (btrfs_is_testing(root->fs_info)) {
buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
level);
if (!IS_ERR(buf))
root->alloc_bytenr += blocksize;
return buf;
}
+#endif
block_rsv = use_block_rsv(trans, root, blocksize);
if (IS_ERR(block_rsv))
@@ -8328,7 +8542,8 @@ static int record_one_subtree_extent(struct btrfs_trans_handle *trans,
delayed_refs = &trans->transaction->delayed_refs;
spin_lock(&delayed_refs->lock);
- if (btrfs_qgroup_insert_dirty_extent(delayed_refs, qrecord))
+ if (btrfs_qgroup_insert_dirty_extent(trans->fs_info,
+ delayed_refs, qrecord))
kfree(qrecord);
spin_unlock(&delayed_refs->lock);
@@ -9113,7 +9328,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
&root->root_key,
root_item);
if (ret) {
- btrfs_abort_transaction(trans, tree_root, ret);
+ btrfs_abort_transaction(trans, ret);
err = ret;
goto out_end_trans;
}
@@ -9140,7 +9355,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
ret = btrfs_del_root(trans, tree_root, &root->root_key);
if (ret) {
- btrfs_abort_transaction(trans, tree_root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -9148,7 +9363,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
ret = btrfs_find_root(tree_root, &root->root_key, path,
NULL, NULL);
if (ret < 0) {
- btrfs_abort_transaction(trans, tree_root, ret);
+ btrfs_abort_transaction(trans, ret);
err = ret;
goto out_end_trans;
} else if (ret > 0) {
@@ -9519,7 +9734,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
int full = 0;
int ret = 0;
- debug = btrfs_test_opt(root, ENOSPC_DEBUG);
+ debug = btrfs_test_opt(root->fs_info, ENOSPC_DEBUG);
block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
@@ -9675,7 +9890,22 @@ static int find_first_block_group(struct btrfs_root *root,
if (found_key.objectid >= key->objectid &&
found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
- ret = 0;
+ struct extent_map_tree *em_tree;
+ struct extent_map *em;
+
+ em_tree = &root->fs_info->mapping_tree.map_tree;
+ read_lock(&em_tree->lock);
+ em = lookup_extent_mapping(em_tree, found_key.objectid,
+ found_key.offset);
+ read_unlock(&em_tree->lock);
+ if (!em) {
+ btrfs_err(root->fs_info,
+ "logical %llu len %llu found bg but no related chunk",
+ found_key.objectid, found_key.offset);
+ ret = -ENOENT;
+ } else {
+ ret = 0;
+ }
goto out;
}
path->slots[0]++;
@@ -9791,13 +10021,15 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
space_info = list_entry(info->space_info.next,
struct btrfs_space_info,
list);
- if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
- if (WARN_ON(space_info->bytes_pinned > 0 ||
+
+ /*
+ * Do not hide this behind enospc_debug, this is actually
+ * important and indicates a real bug if this happens.
+ */
+ if (WARN_ON(space_info->bytes_pinned > 0 ||
space_info->bytes_reserved > 0 ||
- space_info->bytes_may_use > 0)) {
- dump_space_info(space_info, 0, 0);
- }
- }
+ space_info->bytes_may_use > 0))
+ dump_space_info(space_info, 0, 0);
list_del(&space_info->list);
for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
struct kobject *kobj;
@@ -9915,10 +10147,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
path->reada = READA_FORWARD;
cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
- if (btrfs_test_opt(root, SPACE_CACHE) &&
+ if (btrfs_test_opt(root->fs_info, SPACE_CACHE) &&
btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
need_clear = 1;
- if (btrfs_test_opt(root, CLEAR_CACHE))
+ if (btrfs_test_opt(root->fs_info, CLEAR_CACHE))
need_clear = 1;
while (1) {
@@ -9949,7 +10181,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
* b) Setting 'dirty flag' makes sure that we flush
* the new space cache info onto disk.
*/
- if (btrfs_test_opt(root, SPACE_CACHE))
+ if (btrfs_test_opt(root->fs_info, SPACE_CACHE))
cache->disk_cache_state = BTRFS_DC_CLEAR;
}
@@ -10005,9 +10237,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
goto error;
}
+ trace_btrfs_add_block_group(root->fs_info, cache, 0);
ret = update_space_info(info, cache->flags, found_key.offset,
btrfs_block_group_used(&cache->item),
- &space_info);
+ cache->bytes_super, &space_info);
if (ret) {
btrfs_remove_free_space_cache(cache);
spin_lock(&info->block_group_cache_lock);
@@ -10020,9 +10253,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
}
cache->space_info = space_info;
- spin_lock(&cache->space_info->lock);
- cache->space_info->bytes_readonly += cache->bytes_super;
- spin_unlock(&cache->space_info->lock);
__link_block_group(space_info, cache);
@@ -10093,11 +10323,11 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
ret = btrfs_insert_item(trans, extent_root, &key, &item,
sizeof(item));
if (ret)
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
ret = btrfs_finish_chunk_alloc(trans, extent_root,
key.objectid, key.offset);
if (ret)
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
add_block_group_free_space(trans, root->fs_info, block_group);
/* already aborted the transaction if it failed. */
next:
@@ -10114,7 +10344,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
int ret;
struct btrfs_root *extent_root;
struct btrfs_block_group_cache *cache;
-
extent_root = root->fs_info->extent_root;
btrfs_set_log_full_commit(root->fs_info, trans);
@@ -10160,7 +10389,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
* assigned to our block group, but don't update its counters just yet.
* We want our bg to be added to the rbtree with its ->space_info set.
*/
- ret = update_space_info(root->fs_info, cache->flags, 0, 0,
+ ret = update_space_info(root->fs_info, cache->flags, 0, 0, 0,
&cache->space_info);
if (ret) {
btrfs_remove_free_space_cache(cache);
@@ -10179,8 +10408,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
* Now that our block group has its ->space_info set and is inserted in
* the rbtree, update the space info's counters.
*/
+ trace_btrfs_add_block_group(root->fs_info, cache, 1);
ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
- &cache->space_info);
+ cache->bytes_super, &cache->space_info);
if (ret) {
btrfs_remove_free_space_cache(cache);
spin_lock(&root->fs_info->block_group_cache_lock);
@@ -10193,16 +10423,11 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
}
update_global_block_rsv(root->fs_info);
- spin_lock(&cache->space_info->lock);
- cache->space_info->bytes_readonly += cache->bytes_super;
- spin_unlock(&cache->space_info->lock);
-
__link_block_group(cache->space_info, cache);
list_add_tail(&cache->bg_list, &trans->new_bgs);
set_avail_alloc_bits(extent_root->fs_info, type);
-
return 0;
}
@@ -10415,7 +10640,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
spin_lock(&block_group->space_info->lock);
list_del_init(&block_group->ro_list);
- if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+ if (btrfs_test_opt(root->fs_info, ENOSPC_DEBUG)) {
WARN_ON(block_group->space_info->total_bytes
< block_group->key.offset);
WARN_ON(block_group->space_info->bytes_readonly
@@ -10683,7 +10908,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_unlock(&space_info->lock);
/* DISCARD can flip during remount */
- trimming = btrfs_test_opt(root, DISCARD);
+ trimming = btrfs_test_opt(root->fs_info, DISCARD);
/* Implicit trim during transaction commit. */
if (trimming)
@@ -10747,21 +10972,21 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
mixed = 1;
flags = BTRFS_BLOCK_GROUP_SYSTEM;
- ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+ ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
if (ret)
goto out;
if (mixed) {
flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
- ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+ ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
} else {
flags = BTRFS_BLOCK_GROUP_METADATA;
- ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+ ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
if (ret)
goto out;
flags = BTRFS_BLOCK_GROUP_DATA;
- ret = update_space_info(fs_info, flags, 0, 0, &space_info);
+ ret = update_space_info(fs_info, flags, 0, 0, 0, &space_info);
}
out:
return ret;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index cee4cb99b8ce..44fe66b53c8b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -163,13 +163,13 @@ int __init extent_io_init(void)
{
extent_state_cache = kmem_cache_create("btrfs_extent_state",
sizeof(struct extent_state), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ SLAB_MEM_SPREAD, NULL);
if (!extent_state_cache)
return -ENOMEM;
extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
sizeof(struct extent_buffer), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ SLAB_MEM_SPREAD, NULL);
if (!extent_buffer_cache)
goto free_state_cache;
@@ -2049,7 +2049,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
return -EIO;
}
bio->bi_bdev = dev->bdev;
- bio->bi_rw = WRITE_SYNC;
+ bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC);
bio_add_page(bio, page, length, pg_offset);
if (btrfsic_submit_bio_wait(bio)) {
@@ -2697,12 +2697,6 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
btrfs_bio->csum = NULL;
btrfs_bio->csum_allocated = NULL;
btrfs_bio->end_io = NULL;
-
-#ifdef CONFIG_BLK_CGROUP
- /* FIXME, put this into bio_clone_bioset */
- if (bio->bi_css)
- bio_associate_blkcg(new, bio->bi_css);
-#endif
}
return new;
}
@@ -2756,7 +2750,6 @@ static int merge_bio(struct extent_io_tree *tree, struct page *page,
if (tree->ops && tree->ops->merge_bio_hook)
ret = tree->ops->merge_bio_hook(page, offset, size, bio,
bio_flags);
- BUG_ON(ret < 0);
return ret;
}
@@ -2879,6 +2872,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
* into the tree that are removed when the IO is done (by the end_io
* handlers)
* XXX JDM: This needs looking at to ensure proper page locking
+ * return 0 on success, otherwise return error
*/
static int __do_readpage(struct extent_io_tree *tree,
struct page *page,
@@ -2900,7 +2894,7 @@ static int __do_readpage(struct extent_io_tree *tree,
sector_t sector;
struct extent_map *em;
struct block_device *bdev;
- int ret;
+ int ret = 0;
int nr = 0;
size_t pg_offset = 0;
size_t iosize;
@@ -3081,6 +3075,7 @@ static int __do_readpage(struct extent_io_tree *tree,
} else {
SetPageError(page);
unlock_extent(tree, cur, cur + iosize - 1);
+ goto out;
}
cur = cur + iosize;
pg_offset += iosize;
@@ -3091,7 +3086,7 @@ out:
SetPageUptodate(page);
unlock_page(page);
}
- return 0;
+ return ret;
}
static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
@@ -5230,14 +5225,31 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
atomic_set(&eb->io_pages, num_reads);
for (i = start_i; i < num_pages; i++) {
page = eb->pages[i];
+
if (!PageUptodate(page)) {
+ if (ret) {
+ atomic_dec(&eb->io_pages);
+ unlock_page(page);
+ continue;
+ }
+
ClearPageError(page);
err = __extent_read_full_page(tree, page,
get_extent, &bio,
mirror_num, &bio_flags,
REQ_META);
- if (err)
+ if (err) {
ret = err;
+ /*
+ * We use &bio in above __extent_read_full_page,
+ * so we ensure that if it returns error, the
+ * current page fails to add itself to bio and
+ * it's been unlocked.
+ *
+ * We must dec io_pages by ourselves.
+ */
+ atomic_dec(&eb->io_pages);
+ }
} else {
unlock_page(page);
}
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index e0715fcfb11e..26f9ac719d20 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -13,7 +13,7 @@ int __init extent_map_init(void)
{
extent_map_cache = kmem_cache_create("btrfs_extent_map",
sizeof(struct extent_map), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ SLAB_MEM_SPREAD, NULL);
if (!extent_map_cache)
return -ENOMEM;
return 0;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 62a81ee13a5f..d0d571c47d33 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -250,7 +250,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
offset + root->sectorsize - 1,
EXTENT_NODATASUM);
} else {
- btrfs_info(BTRFS_I(inode)->root->fs_info,
+ btrfs_info_rl(BTRFS_I(inode)->root->fs_info,
"no csum found for inode %llu start %llu",
btrfs_ino(inode), offset);
}
@@ -699,7 +699,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
*/
ret = btrfs_split_item(trans, root, path, &key, offset);
if (ret && ret != -EAGAIN) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 2234e88cf674..9404121fd5f7 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -132,7 +132,7 @@ static int __btrfs_add_inode_defrag(struct inode *inode,
static inline int __need_auto_defrag(struct btrfs_root *root)
{
- if (!btrfs_test_opt(root, AUTO_DEFRAG))
+ if (!btrfs_test_opt(root->fs_info, AUTO_DEFRAG))
return 0;
if (btrfs_fs_closing(root->fs_info))
@@ -950,7 +950,7 @@ delete_extent_item:
ret = btrfs_del_items(trans, root, path, del_slot,
del_nr);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
break;
}
@@ -974,7 +974,7 @@ delete_extent_item:
path->slots[0] = del_slot;
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
}
leaf = path->nodes[0];
@@ -1190,7 +1190,7 @@ again:
goto again;
}
if (ret < 0) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -1278,7 +1278,7 @@ again:
ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
if (ret < 0) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
}
@@ -1629,13 +1629,11 @@ again:
* managed to copy.
*/
if (num_sectors > dirty_sectors) {
- /*
- * we round down because we don't want to count
- * any partial blocks actually sent through the
- * IO machines
- */
- release_bytes = round_down(release_bytes - copied,
- root->sectorsize);
+
+ /* release everything except the sectors we dirtied */
+ release_bytes -= dirty_sectors <<
+ root->fs_info->sb->s_blocksize_bits;
+
if (copied > 0) {
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->outstanding_extents++;
@@ -2479,7 +2477,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
- min_size);
+ min_size, 0);
BUG_ON(ret);
trans->block_rsv = rsv;
@@ -2522,7 +2520,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
- rsv, min_size);
+ rsv, min_size, 0);
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
@@ -2977,7 +2975,7 @@ int btrfs_auto_defrag_init(void)
{
btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
sizeof(struct inode_defrag), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ SLAB_MEM_SPREAD,
NULL);
if (!btrfs_inode_defrag_cachep)
return -ENOMEM;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 69d270f6602c..d571bd2b697b 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -280,7 +280,7 @@ fail:
if (locked)
mutex_unlock(&trans->transaction->cache_write_mutex);
if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -3026,7 +3026,7 @@ int btrfs_find_space_cluster(struct btrfs_root *root,
* For metadata, allow allocates with smaller extents. For
* data, keep it dense.
*/
- if (btrfs_test_opt(root, SSD_SPREAD)) {
+ if (btrfs_test_opt(root->fs_info, SSD_SPREAD)) {
cont1_bytes = min_bytes = bytes + empty_size;
} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
cont1_bytes = bytes;
@@ -3470,7 +3470,7 @@ int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
int ret = 0;
u64 root_gen = btrfs_root_generation(&root->root_item);
- if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
return 0;
/*
@@ -3514,7 +3514,7 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
struct btrfs_io_ctl io_ctl;
bool release_metadata = true;
- if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
return 0;
memset(&io_ctl, 0, sizeof(io_ctl));
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 53dbeaf6ce94..87e7e3d3e676 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -305,7 +305,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
out:
kvfree(bitmap);
if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -454,7 +454,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
out:
kvfree(bitmap);
if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -851,7 +851,7 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
out:
btrfs_free_path(path);
if (ret)
- btrfs_abort_transaction(trans, fs_info->free_space_root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1047,7 +1047,7 @@ int add_to_free_space_tree(struct btrfs_trans_handle *trans,
out:
btrfs_free_path(path);
if (ret)
- btrfs_abort_transaction(trans, fs_info->free_space_root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1193,7 +1193,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
abort:
fs_info->creating_free_space_tree = 0;
- btrfs_abort_transaction(trans, tree_root, ret);
+ btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans, tree_root);
return ret;
}
@@ -1280,7 +1280,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
return 0;
abort:
- btrfs_abort_transaction(trans, tree_root, ret);
+ btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans, tree_root);
return ret;
}
@@ -1333,7 +1333,7 @@ out:
btrfs_free_path(path);
mutex_unlock(&block_group->free_space_lock);
if (ret)
- btrfs_abort_transaction(trans, fs_info->free_space_root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -1410,7 +1410,7 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans,
out:
btrfs_free_path(path);
if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 70107f7c9307..aa6fabaee72e 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -38,7 +38,7 @@ static int caching_kthread(void *data)
int slot;
int ret;
- if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
return 0;
path = btrfs_alloc_path();
@@ -141,7 +141,7 @@ static void start_caching(struct btrfs_root *root)
int ret;
u64 objectid;
- if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
return;
spin_lock(&root->ino_cache_lock);
@@ -185,7 +185,7 @@ static void start_caching(struct btrfs_root *root)
int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
{
- if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
return btrfs_find_free_objectid(root, objectid);
again:
@@ -211,7 +211,7 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
{
struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
- if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
return;
again:
if (root->ino_cache_state == BTRFS_CACHE_FINISHED) {
@@ -251,7 +251,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
struct rb_node *n;
u64 count;
- if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
return;
while (1) {
@@ -412,7 +412,7 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
if (btrfs_root_refs(&root->root_item) == 0)
return 0;
- if (!btrfs_test_opt(root, INODE_MAP_CACHE))
+ if (!btrfs_test_opt(root->fs_info, INODE_MAP_CACHE))
return 0;
path = btrfs_alloc_path();
@@ -458,7 +458,7 @@ again:
BTRFS_I(inode)->generation = 0;
ret = btrfs_update_inode(trans, root, inode);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_put;
}
@@ -466,7 +466,7 @@ again:
ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode);
if (ret) {
if (ret != -ENOSPC)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_put;
}
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index df731c0ebec7..b0f421f332ae 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -60,6 +60,7 @@
#include "hash.h"
#include "props.h"
#include "qgroup.h"
+#include "dedupe.h"
struct btrfs_iget_args {
struct btrfs_key *location;
@@ -105,8 +106,9 @@ static int btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
struct page *locked_page,
- u64 start, u64 end, int *page_started,
- unsigned long *nr_written, int unlock);
+ u64 start, u64 end, u64 delalloc_end,
+ int *page_started, unsigned long *nr_written,
+ int unlock, struct btrfs_dedupe_hash *hash);
static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
u64 len, u64 orig_start,
u64 block_start, u64 block_len,
@@ -294,7 +296,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
start, aligned_end, NULL,
1, 1, extent_item_size, &extent_inserted);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -305,7 +307,7 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
inline_len, compressed_size,
compress_type, compressed_pages);
if (ret && ret != -ENOSPC) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
} else if (ret == -ENOSPC) {
ret = 1;
@@ -374,12 +376,12 @@ static inline int inode_need_compress(struct inode *inode)
struct btrfs_root *root = BTRFS_I(inode)->root;
/* force compress */
- if (btrfs_test_opt(root, FORCE_COMPRESS))
+ if (btrfs_test_opt(root->fs_info, FORCE_COMPRESS))
return 1;
/* bad compression ratios */
if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
return 0;
- if (btrfs_test_opt(root, COMPRESS) ||
+ if (btrfs_test_opt(root->fs_info, COMPRESS) ||
BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
BTRFS_I(inode)->force_compress)
return 1;
@@ -585,9 +587,27 @@ cont:
will_compress = 0;
} else {
num_bytes = total_in;
+ *num_added += 1;
+
+ /*
+ * The async work queues will take care of doing actual
+ * allocation on disk for these compressed pages, and
+ * will submit them to the elevator.
+ */
+ add_async_extent(async_cow, start, num_bytes,
+ total_compressed, pages, nr_pages_ret,
+ compress_type);
+
+ if (start + num_bytes < end) {
+ start += num_bytes;
+ pages = NULL;
+ cond_resched();
+ goto again;
+ }
+ return;
}
}
- if (!will_compress && pages) {
+ if (pages) {
/*
* the compression code ran but failed to make things smaller,
* free any pages it allocated and our page pointer array
@@ -602,48 +622,28 @@ cont:
nr_pages_ret = 0;
/* flag the file so we don't compress in the future */
- if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
+ if (!btrfs_test_opt(root->fs_info, FORCE_COMPRESS) &&
!(BTRFS_I(inode)->force_compress)) {
BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
}
}
- if (will_compress) {
- *num_added += 1;
-
- /* the async work queues will take care of doing actual
- * allocation on disk for these compressed pages,
- * and will submit them to the elevator.
- */
- add_async_extent(async_cow, start, num_bytes,
- total_compressed, pages, nr_pages_ret,
- compress_type);
-
- if (start + num_bytes < end) {
- start += num_bytes;
- pages = NULL;
- cond_resched();
- goto again;
- }
- } else {
cleanup_and_bail_uncompressed:
- /*
- * No compression, but we still need to write the pages in
- * the file we've been given so far. redirty the locked
- * page if it corresponds to our extent and set things up
- * for the async work queue to run cow_file_range to do
- * the normal delalloc dance
- */
- if (page_offset(locked_page) >= start &&
- page_offset(locked_page) <= end) {
- __set_page_dirty_nobuffers(locked_page);
- /* unlocked later on in the async handlers */
- }
- if (redirty)
- extent_range_redirty_for_io(inode, start, end);
- add_async_extent(async_cow, start, end - start + 1,
- 0, NULL, 0, BTRFS_COMPRESS_NONE);
- *num_added += 1;
- }
+ /*
+ * No compression, but we still need to write the pages in the file
+ * we've been given so far. redirty the locked page if it corresponds
+ * to our extent and set things up for the async work queue to run
+ * cow_file_range to do the normal delalloc dance.
+ */
+ if (page_offset(locked_page) >= start &&
+ page_offset(locked_page) <= end)
+ __set_page_dirty_nobuffers(locked_page);
+ /* unlocked later on in the async handlers */
+
+ if (redirty)
+ extent_range_redirty_for_io(inode, start, end);
+ add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
+ BTRFS_COMPRESS_NONE);
+ *num_added += 1;
return;
@@ -712,7 +712,10 @@ retry:
async_extent->start,
async_extent->start +
async_extent->ram_size - 1,
- &page_started, &nr_written, 0);
+ async_extent->start +
+ async_extent->ram_size - 1,
+ &page_started, &nr_written, 0,
+ NULL);
/* JDM XXX */
@@ -925,9 +928,9 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
*/
static noinline int cow_file_range(struct inode *inode,
struct page *locked_page,
- u64 start, u64 end, int *page_started,
- unsigned long *nr_written,
- int unlock)
+ u64 start, u64 end, u64 delalloc_end,
+ int *page_started, unsigned long *nr_written,
+ int unlock, struct btrfs_dedupe_hash *hash)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 alloc_hint = 0;
@@ -1156,7 +1159,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
async_cow->start = start;
if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
- !btrfs_test_opt(root, FORCE_COMPRESS))
+ !btrfs_test_opt(root->fs_info, FORCE_COMPRESS))
cur_end = end;
else
cur_end = min(end, start + SZ_512K - 1);
@@ -1418,7 +1421,8 @@ out_check:
if (cow_start != (u64)-1) {
ret = cow_file_range(inode, locked_page,
cow_start, found_key.offset - 1,
- page_started, nr_written, 1);
+ end, page_started, nr_written, 1,
+ NULL);
if (ret) {
if (!nolock && nocow)
btrfs_end_write_no_snapshoting(root);
@@ -1501,8 +1505,8 @@ out_check:
}
if (cow_start != (u64)-1) {
- ret = cow_file_range(inode, locked_page, cow_start, end,
- page_started, nr_written, 1);
+ ret = cow_file_range(inode, locked_page, cow_start, end, end,
+ page_started, nr_written, 1, NULL);
if (ret)
goto error;
}
@@ -1561,8 +1565,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 0, nr_written);
} else if (!inode_need_compress(inode)) {
- ret = cow_file_range(inode, locked_page, start, end,
- page_started, nr_written, 1);
+ ret = cow_file_range(inode, locked_page, start, end, end,
+ page_started, nr_written, 1, NULL);
} else {
set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
&BTRFS_I(inode)->runtime_flags);
@@ -1740,7 +1744,7 @@ static void btrfs_set_bit_hook(struct inode *inode,
}
/* For sanity tests */
- if (btrfs_test_is_dummy_root(root))
+ if (btrfs_is_testing(root->fs_info))
return;
__percpu_counter_add(&root->fs_info->delalloc_bytes, len,
@@ -1799,7 +1803,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
btrfs_delalloc_release_metadata(inode, len);
/* For sanity tests. */
- if (btrfs_test_is_dummy_root(root))
+ if (btrfs_is_testing(root->fs_info))
return;
if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
@@ -1822,6 +1826,10 @@ static void btrfs_clear_bit_hook(struct inode *inode,
/*
* extent_io.c merge_bio_hook, this must check the chunk tree to make sure
* we don't create bios that span stripes or chunks
+ *
+ * return 1 if page cannot be merged to bio
+ * return 0 if page can be merged to bio
+ * return error otherwise
*/
int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
size_t size, struct bio *bio,
@@ -1840,8 +1848,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
map_length = length;
ret = btrfs_map_block(root->fs_info, bio_op(bio), logical,
&map_length, NULL, 0);
- /* Will always return 0 with map_multi == NULL */
- BUG_ON(ret < 0);
+ if (ret < 0)
+ return ret;
if (map_length < length + size)
return 1;
return 0;
@@ -2594,7 +2602,7 @@ again:
ret = btrfs_insert_empty_item(trans, root, path, &key,
sizeof(*extent));
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -2621,7 +2629,7 @@ again:
backref->root_id, backref->inum,
new->file_pos); /* start - extent_offset */
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_free_path;
}
@@ -2890,7 +2898,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
trans->block_rsv = &root->fs_info->delalloc_block_rsv;
ret = btrfs_update_inode_fallback(trans, root, inode);
if (ret) /* -ENOMEM or corruption */
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -2950,7 +2958,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
ordered_extent->file_offset, ordered_extent->len,
trans->transid);
if (ret < 0) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_unlock;
}
@@ -2960,7 +2968,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
btrfs_ordered_update_i_size(inode, 0, ordered_extent);
ret = btrfs_update_inode_fallback(trans, root, inode);
if (ret) { /* -ENOMEM or corruption */
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_unlock;
}
ret = 0;
@@ -3204,7 +3212,7 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
root->root_key.objectid);
if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
else
clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
&root->state);
@@ -3295,7 +3303,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
if (ret != -EEXIST) {
clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
&BTRFS_I(inode)->runtime_flags);
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
}
@@ -3307,7 +3315,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
root->root_key.objectid);
if (ret && ret != -EEXIST) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
}
@@ -4006,20 +4014,20 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
btrfs_info(root->fs_info,
"failed to delete reference to %.*s, inode %llu parent %llu",
name_len, name, ino, dir_ino);
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto err;
}
skip_backref:
ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto err;
}
ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
inode, dir_ino);
if (ret != 0 && ret != -ENOENT) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto err;
}
@@ -4028,7 +4036,7 @@ skip_backref:
if (ret == -ENOENT)
ret = 0;
else if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
err:
btrfs_free_path(path);
if (ret)
@@ -4142,7 +4150,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
ret = btrfs_delete_one_dir_name(trans, root, path, di);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
btrfs_release_path(path);
@@ -4152,7 +4160,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
dir_ino, &index, name, name_len);
if (ret < 0) {
if (ret != -ENOENT) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
di = btrfs_search_dir_index_item(root, path, dir_ino,
@@ -4162,7 +4170,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
ret = -ENOENT;
else
ret = PTR_ERR(di);
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -4175,7 +4183,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -4184,7 +4192,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
dir->i_mtime = dir->i_ctime = current_fs_time(dir->i_sb);
ret = btrfs_update_inode_fallback(trans, root, dir);
if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
out:
btrfs_free_path(path);
return ret;
@@ -4505,7 +4513,6 @@ search_again:
pending_del_nr);
if (err) {
btrfs_abort_transaction(trans,
- root,
err);
goto error;
}
@@ -4517,8 +4524,7 @@ search_again:
item_end,
new_size);
if (err) {
- btrfs_abort_transaction(trans,
- root, err);
+ btrfs_abort_transaction(trans, err);
goto error;
}
} else if (test_bit(BTRFS_ROOT_REF_COWS,
@@ -4582,8 +4588,7 @@ delete:
pending_del_slot,
pending_del_nr);
if (ret) {
- btrfs_abort_transaction(trans,
- root, ret);
+ btrfs_abort_transaction(trans, ret);
goto error;
}
pending_del_nr = 0;
@@ -4616,7 +4621,7 @@ out:
ret = btrfs_del_items(trans, root, path, pending_del_slot,
pending_del_nr);
if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
}
error:
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
@@ -4785,7 +4790,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans, root);
return ret;
}
@@ -4793,7 +4798,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
0, 0, len, 0, len, 0, 0, 0);
if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
else
btrfs_update_inode(trans, root, inode);
btrfs_end_transaction(trans, root);
@@ -5020,7 +5025,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
i_size_write(inode, BTRFS_I(inode)->disk_i_size);
err = btrfs_orphan_del(trans, inode);
if (err)
- btrfs_abort_transaction(trans, root, err);
+ btrfs_abort_transaction(trans, err);
btrfs_end_transaction(trans, root);
}
}
@@ -5158,11 +5163,18 @@ void btrfs_evict_inode(struct inode *inode)
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *rsv, *global_rsv;
int steal_from_global = 0;
- u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
+ u64 min_size;
int ret;
trace_btrfs_inode_evict(inode);
+ if (!root) {
+ kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
+ return;
+ }
+
+ min_size = btrfs_calc_trunc_metadata_size(root, 1);
+
evict_inode_truncate_pages(inode);
if (inode->i_nlink &&
@@ -5263,7 +5275,7 @@ void btrfs_evict_inode(struct inode *inode)
if (steal_from_global) {
if (!btrfs_check_space_for_delayed_refs(trans, root))
ret = btrfs_block_rsv_migrate(global_rsv, rsv,
- min_size);
+ min_size, 0);
else
ret = -ENOSPC;
}
@@ -6239,9 +6251,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
btrfs_inherit_iflags(inode, dir);
if (S_ISREG(mode)) {
- if (btrfs_test_opt(root, NODATASUM))
+ if (btrfs_test_opt(root->fs_info, NODATASUM))
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
- if (btrfs_test_opt(root, NODATACOW))
+ if (btrfs_test_opt(root->fs_info, NODATACOW))
BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
BTRFS_INODE_NODATASUM;
}
@@ -6319,7 +6331,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
if (ret == -EEXIST || ret == -EOVERFLOW)
goto fail_dir_item;
else if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -6330,7 +6342,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
current_fs_time(parent_inode->i_sb);
ret = btrfs_update_inode(trans, root, parent_inode);
if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
return ret;
fail_dir_item:
@@ -9116,7 +9128,7 @@ static int btrfs_truncate(struct inode *inode)
/* Migrate the slack space for the truncate to our reserve */
ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
- min_size);
+ min_size, 0);
BUG_ON(ret);
/*
@@ -9156,7 +9168,7 @@ static int btrfs_truncate(struct inode *inode)
}
ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
- rsv, min_size);
+ rsv, min_size, 0);
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
}
@@ -9177,7 +9189,6 @@ static int btrfs_truncate(struct inode *inode)
ret = btrfs_end_transaction(trans, root);
btrfs_btree_balance_dirty(root);
}
-
out:
btrfs_free_block_rsv(root, rsv);
@@ -9386,25 +9397,25 @@ int btrfs_init_cachep(void)
btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
sizeof(struct btrfs_trans_handle), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
if (!btrfs_trans_handle_cachep)
goto fail;
btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
sizeof(struct btrfs_transaction), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
if (!btrfs_transaction_cachep)
goto fail;
btrfs_path_cachep = kmem_cache_create("btrfs_path",
sizeof(struct btrfs_path), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ SLAB_MEM_SPREAD, NULL);
if (!btrfs_path_cachep)
goto fail;
btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
sizeof(struct btrfs_free_space), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+ SLAB_MEM_SPREAD, NULL);
if (!btrfs_free_space_cachep)
goto fail;
@@ -9554,7 +9565,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
ret = btrfs_update_inode(trans, root, old_inode);
}
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -9574,7 +9585,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
ret = btrfs_update_inode(trans, dest, new_inode);
}
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -9582,7 +9593,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
new_dentry->d_name.name,
new_dentry->d_name.len, 0, old_idx);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -9590,7 +9601,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
old_dentry->d_name.name,
old_dentry->d_name.len, 0, new_idx);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -9829,7 +9840,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
ret = btrfs_update_inode(trans, root, old_inode);
}
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -9853,7 +9864,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (!ret && new_inode->i_nlink == 0)
ret = btrfs_orphan_add(trans, d_inode(new_dentry));
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_fail;
}
}
@@ -9862,7 +9873,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
new_dentry->d_name.name,
new_dentry->d_name.len, 0, index);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_fail;
}
@@ -9882,7 +9893,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
old_dentry);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_fail;
}
}
@@ -10308,7 +10319,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
if (ret) {
btrfs_free_reserved_extent(root, ins.objectid,
ins.offset, 0);
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
if (own_trans)
btrfs_end_transaction(trans, root);
break;
@@ -10368,7 +10379,7 @@ next:
ret = btrfs_update_inode(trans, root, inode);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
if (own_trans)
btrfs_end_transaction(trans, root);
break;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 05173563e4a6..14ed1e9e6bc8 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -561,7 +561,7 @@ static noinline int create_subvol(struct inode *dir,
new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
if (IS_ERR(new_root)) {
ret = PTR_ERR(new_root);
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -570,7 +570,7 @@ static noinline int create_subvol(struct inode *dir,
ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid);
if (ret) {
/* We potentially lose an unused inode item here */
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -583,7 +583,7 @@ static noinline int create_subvol(struct inode *dir,
*/
ret = btrfs_set_inode_index(dir, &index);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -591,7 +591,7 @@ static noinline int create_subvol(struct inode *dir,
name, namelen, dir, &key,
BTRFS_FT_DIR, index);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -608,7 +608,7 @@ static noinline int create_subvol(struct inode *dir,
root_item->uuid, BTRFS_UUID_KEY_SUBVOL,
objectid);
if (ret)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
fail:
kfree(root_item);
@@ -1948,8 +1948,7 @@ static noinline int key_in_sk(struct btrfs_key *key,
return 1;
}
-static noinline int copy_to_sk(struct btrfs_root *root,
- struct btrfs_path *path,
+static noinline int copy_to_sk(struct btrfs_path *path,
struct btrfs_key *key,
struct btrfs_ioctl_search_key *sk,
size_t *buf_size,
@@ -2120,7 +2119,7 @@ static noinline int search_ioctl(struct inode *inode,
ret = 0;
goto err;
}
- ret = copy_to_sk(root, path, &key, sk, buf_size, ubuf,
+ ret = copy_to_sk(path, &key, sk, buf_size, ubuf,
&sk_offset, &num_found);
btrfs_release_path(path);
if (ret)
@@ -2406,7 +2405,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
* rmdir(2).
*/
err = -EPERM;
- if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
+ if (!btrfs_test_opt(root->fs_info, USER_SUBVOL_RM_ALLOWED))
goto out_dput;
/*
@@ -2489,7 +2488,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
dentry->d_name.len);
if (ret) {
err = ret;
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
@@ -2505,7 +2504,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
root->fs_info->tree_root,
dest->root_key.objectid);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
err = ret;
goto out_end_trans;
}
@@ -2515,7 +2514,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL,
dest->root_key.objectid);
if (ret && ret != -ENOENT) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
err = ret;
goto out_end_trans;
}
@@ -2525,7 +2524,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
dest->root_key.objectid);
if (ret && ret != -ENOENT) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
err = ret;
goto out_end_trans;
}
@@ -3292,7 +3291,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
ret = btrfs_update_inode(trans, root, inode);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans, root);
goto out;
}
@@ -3694,7 +3693,7 @@ process_slot:
if (ret) {
if (ret != -EOPNOTSUPP)
btrfs_abort_transaction(trans,
- root, ret);
+ ret);
btrfs_end_transaction(trans, root);
goto out;
}
@@ -3702,8 +3701,7 @@ process_slot:
ret = btrfs_insert_empty_item(trans, root, path,
&new_key, size);
if (ret) {
- btrfs_abort_transaction(trans, root,
- ret);
+ btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans, root);
goto out;
}
@@ -3735,7 +3733,6 @@ process_slot:
new_key.offset - datao);
if (ret) {
btrfs_abort_transaction(trans,
- root,
ret);
btrfs_end_transaction(trans,
root);
@@ -3772,7 +3769,6 @@ process_slot:
if (ret) {
if (ret != -EOPNOTSUPP)
btrfs_abort_transaction(trans,
- root,
ret);
btrfs_end_transaction(trans, root);
goto out;
@@ -3828,7 +3824,7 @@ process_slot:
last_dest_end, destoff + len, 1);
if (ret) {
if (ret != -EOPNOTSUPP)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans, root);
goto out;
}
@@ -5164,13 +5160,13 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
root->root_key.objectid);
if (ret < 0 && ret != -EEXIST) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
}
ret = btrfs_commit_transaction(trans, root);
if (ret < 0) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index aca8264f4a49..3b78d38173b3 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -1122,7 +1122,7 @@ int __init ordered_data_init(void)
{
btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
sizeof(struct btrfs_ordered_extent), 0,
- SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+ SLAB_MEM_SPREAD,
NULL);
if (!btrfs_ordered_extent_cache)
return -ENOMEM;
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 36992128c746..cf0b444ac4f3 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -350,6 +350,7 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_root *parent_root)
{
+ struct super_block *sb = root->fs_info->sb;
struct btrfs_key key;
struct inode *parent_inode, *child_inode;
int ret;
@@ -358,12 +359,11 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
key.type = BTRFS_INODE_ITEM_KEY;
key.offset = 0;
- parent_inode = btrfs_iget(parent_root->fs_info->sb, &key,
- parent_root, NULL);
+ parent_inode = btrfs_iget(sb, &key, parent_root, NULL);
if (IS_ERR(parent_inode))
return PTR_ERR(parent_inode);
- child_inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
+ child_inode = btrfs_iget(sb, &key, root, NULL);
if (IS_ERR(child_inode)) {
iput(parent_inode);
return PTR_ERR(child_inode);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 9d4c05b14f6e..93ee1c18ef9d 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -571,7 +571,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_key key;
- if (btrfs_test_is_dummy_root(quota_root))
+ if (btrfs_is_testing(quota_root->fs_info))
return 0;
path = btrfs_alloc_path();
@@ -728,7 +728,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans,
int ret;
int slot;
- if (btrfs_test_is_dummy_root(root))
+ if (btrfs_is_testing(root->fs_info))
return 0;
key.objectid = 0;
@@ -1453,9 +1453,10 @@ int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
return ret;
}
-struct btrfs_qgroup_extent_record
-*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_qgroup_extent_record *record)
+struct btrfs_qgroup_extent_record *
+btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_qgroup_extent_record *record)
{
struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node;
struct rb_node *parent_node = NULL;
@@ -1463,7 +1464,7 @@ struct btrfs_qgroup_extent_record
u64 bytenr = record->bytenr;
assert_spin_locked(&delayed_refs->lock);
- trace_btrfs_qgroup_insert_dirty_extent(record);
+ trace_btrfs_qgroup_insert_dirty_extent(fs_info, record);
while (*p) {
parent_node = *p;
@@ -1595,8 +1596,8 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq);
cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq);
- trace_qgroup_update_counters(qg->qgroupid, cur_old_count,
- cur_new_count);
+ trace_qgroup_update_counters(fs_info, qg->qgroupid,
+ cur_old_count, cur_new_count);
/* Rfer update part */
if (cur_old_count == 0 && cur_new_count > 0) {
@@ -1687,8 +1688,8 @@ btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
goto out_free;
BUG_ON(!fs_info->quota_root);
- trace_btrfs_qgroup_account_extent(bytenr, num_bytes, nr_old_roots,
- nr_new_roots);
+ trace_btrfs_qgroup_account_extent(fs_info, bytenr, num_bytes,
+ nr_old_roots, nr_new_roots);
qgroups = ulist_alloc(GFP_NOFS);
if (!qgroups) {
@@ -1759,7 +1760,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans,
record = rb_entry(node, struct btrfs_qgroup_extent_record,
node);
- trace_btrfs_qgroup_account_extents(record);
+ trace_btrfs_qgroup_account_extents(fs_info, record);
if (!ret) {
/*
@@ -2195,7 +2196,7 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
{
if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
return;
- btrfs_err(trans->root->fs_info,
+ btrfs_err(trans->fs_info,
"qgroups not uptodate in trans handle %p: list is%s empty, "
"seq is %#x.%x",
trans, list_empty(&trans->qgroup_ref_list) ? "" : " not",
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index ecb2c143ef75..710887c06aaf 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -63,9 +63,10 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
struct btrfs_delayed_extent_op;
int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
-struct btrfs_qgroup_extent_record
-*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs,
- struct btrfs_qgroup_extent_record *record);
+struct btrfs_qgroup_extent_record *
+btrfs_qgroup_insert_dirty_extent(struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_qgroup_extent_record *record);
int
btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info,
@@ -88,7 +89,7 @@ static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes)
{
btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes);
- trace_btrfs_qgroup_free_delayed_ref(ref_root, num_bytes);
+ trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes);
}
void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 0477dca154ed..b26a5aea41b4 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -235,12 +235,12 @@ static void backref_cache_cleanup(struct backref_cache *cache)
cache->last_trans = 0;
for (i = 0; i < BTRFS_MAX_LEVEL; i++)
- BUG_ON(!list_empty(&cache->pending[i]));
- BUG_ON(!list_empty(&cache->changed));
- BUG_ON(!list_empty(&cache->detached));
- BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
- BUG_ON(cache->nr_nodes);
- BUG_ON(cache->nr_edges);
+ ASSERT(list_empty(&cache->pending[i]));
+ ASSERT(list_empty(&cache->changed));
+ ASSERT(list_empty(&cache->detached));
+ ASSERT(RB_EMPTY_ROOT(&cache->rb_root));
+ ASSERT(!cache->nr_nodes);
+ ASSERT(!cache->nr_edges);
}
static struct backref_node *alloc_backref_node(struct backref_cache *cache)
@@ -1171,8 +1171,12 @@ out:
lower = list_entry(useless.next,
struct backref_node, list);
list_del_init(&lower->list);
+ if (lower == node)
+ node = NULL;
free_backref_node(cache, lower);
}
+
+ free_backref_node(cache, node);
return ERR_PTR(err);
}
ASSERT(!node || !node->detached);
@@ -1719,7 +1723,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
btrfs_header_owner(leaf),
key.objectid, key.offset);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
break;
}
@@ -1727,7 +1731,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
parent, btrfs_header_owner(leaf),
key.objectid, key.offset);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
break;
}
}
@@ -2604,25 +2608,28 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
trans->block_rsv = rc->block_rsv;
rc->reserved_bytes += num_bytes;
+
+ /*
+ * We are under a transaction here so we can only do limited flushing.
+ * If we get an enospc just kick back -EAGAIN so we know to drop the
+ * transaction and try to refill when we can flush all the things.
+ */
ret = btrfs_block_rsv_refill(root, rc->block_rsv, num_bytes,
- BTRFS_RESERVE_FLUSH_ALL);
+ BTRFS_RESERVE_FLUSH_LIMIT);
if (ret) {
- if (ret == -EAGAIN) {
- tmp = rc->extent_root->nodesize *
- RELOCATION_RESERVED_NODES;
- while (tmp <= rc->reserved_bytes)
- tmp <<= 1;
- /*
- * only one thread can access block_rsv at this point,
- * so we don't need hold lock to protect block_rsv.
- * we expand more reservation size here to allow enough
- * space for relocation and we will return earlier in
- * enospc case.
- */
- rc->block_rsv->size = tmp + rc->extent_root->nodesize *
- RELOCATION_RESERVED_NODES;
- }
- return ret;
+ tmp = rc->extent_root->nodesize * RELOCATION_RESERVED_NODES;
+ while (tmp <= rc->reserved_bytes)
+ tmp <<= 1;
+ /*
+ * only one thread can access block_rsv at this point,
+ * so we don't need hold lock to protect block_rsv.
+ * we expand more reservation size here to allow enough
+ * space for relocation and we will return eailer in
+ * enospc case.
+ */
+ rc->block_rsv->size = tmp + rc->extent_root->nodesize *
+ RELOCATION_RESERVED_NODES;
+ return -EAGAIN;
}
return 0;
@@ -3871,6 +3878,7 @@ static noinline_for_stack
int prepare_to_relocate(struct reloc_control *rc)
{
struct btrfs_trans_handle *trans;
+ int ret;
rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
BTRFS_BLOCK_RSV_TEMP);
@@ -3885,6 +3893,11 @@ int prepare_to_relocate(struct reloc_control *rc)
rc->reserved_bytes = 0;
rc->block_rsv->size = rc->extent_root->nodesize *
RELOCATION_RESERVED_NODES;
+ ret = btrfs_block_rsv_refill(rc->extent_root,
+ rc->block_rsv, rc->block_rsv->size,
+ BTRFS_RESERVE_FLUSH_ALL);
+ if (ret)
+ return ret;
rc->create_reloc_tree = 1;
set_reloc_control(rc);
@@ -4643,7 +4656,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
if (rc->merge_reloc_tree) {
ret = btrfs_block_rsv_migrate(&pending->block_rsv,
rc->block_rsv,
- rc->nodes_relocated);
+ rc->nodes_relocated, 1);
if (ret)
return ret;
}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index f1c30861d062..7fd7e1830cfe 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -150,7 +150,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
ret = btrfs_search_slot(trans, root, key, path, 0, 1);
if (ret < 0) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -176,20 +176,20 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
ret = btrfs_search_slot(trans, root, key, path,
-1, 1);
if (ret < 0) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
ret = btrfs_del_item(trans, root, path);
if (ret < 0) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
btrfs_release_path(path);
ret = btrfs_insert_empty_item(trans, root, path,
key, sizeof(*item));
if (ret < 0) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
l = path->nodes[0];
@@ -448,7 +448,7 @@ again:
ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
sizeof(*ref) + name_len);
if (ret) {
- btrfs_abort_transaction(trans, tree_root, ret);
+ btrfs_abort_transaction(trans, ret);
btrfs_free_path(path);
return ret;
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index e08b6bc676e3..1d195d2b32c6 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3785,27 +3785,27 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
if (fs_info->scrub_workers_refcnt == 0) {
if (is_dev_replace)
fs_info->scrub_workers =
- btrfs_alloc_workqueue("scrub", flags,
+ btrfs_alloc_workqueue(fs_info, "scrub", flags,
1, 4);
else
fs_info->scrub_workers =
- btrfs_alloc_workqueue("scrub", flags,
+ btrfs_alloc_workqueue(fs_info, "scrub", flags,
max_active, 4);
if (!fs_info->scrub_workers)
goto fail_scrub_workers;
fs_info->scrub_wr_completion_workers =
- btrfs_alloc_workqueue("scrubwrc", flags,
+ btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
max_active, 2);
if (!fs_info->scrub_wr_completion_workers)
goto fail_scrub_wr_completion_workers;
fs_info->scrub_nocow_workers =
- btrfs_alloc_workqueue("scrubnc", flags, 1, 0);
+ btrfs_alloc_workqueue(fs_info, "scrubnc", flags, 1, 0);
if (!fs_info->scrub_nocow_workers)
goto fail_scrub_nocow_workers;
fs_info->scrub_parity_workers =
- btrfs_alloc_workqueue("scrubparity", flags,
+ btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
max_active, 2);
if (!fs_info->scrub_parity_workers)
goto fail_scrub_parity_workers;
@@ -3860,7 +3860,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
/* not supported for data w/o checksums */
- btrfs_err(fs_info,
+ btrfs_err_rl(fs_info,
"scrub: size assumption sectorsize != PAGE_SIZE "
"(%d != %lu) fails",
fs_info->chunk_root->sectorsize, PAGE_SIZE);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 60e7179ed4b7..864ce334f696 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -184,6 +184,22 @@ static const char * const logtypes[] = {
"debug",
};
+
+/*
+ * Use one ratelimit state per log level so that a flood of less important
+ * messages doesn't cause more important ones to be dropped.
+ */
+static struct ratelimit_state printk_limits[] = {
+ RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100),
+ RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100),
+};
+
void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
struct super_block *sb = fs_info->sb;
@@ -192,6 +208,7 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
va_list args;
const char *type = logtypes[4];
int kern_level;
+ struct ratelimit_state *ratelimit;
va_start(args, fmt);
@@ -202,13 +219,18 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
lvl[size] = '\0';
fmt += size;
type = logtypes[kern_level - '0'];
- } else
+ ratelimit = &printk_limits[kern_level - '0'];
+ } else {
*lvl = '\0';
+ /* Default to debug output */
+ ratelimit = &printk_limits[7];
+ }
vaf.fmt = fmt;
vaf.va = &args;
- printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf);
+ if (__ratelimit(ratelimit))
+ printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf);
va_end(args);
}
@@ -229,9 +251,11 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
*/
__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, const char *function,
+ const char *function,
unsigned int line, int errno)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+
trans->aborted = errno;
/* Nothing used. The other threads that have joined this
* transaction may be able to continue. */
@@ -239,16 +263,16 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
const char *errstr;
errstr = btrfs_decode_error(errno);
- btrfs_warn(root->fs_info,
+ btrfs_warn(fs_info,
"%s:%d: Aborting unused transaction(%s).",
function, line, errstr);
return;
}
ACCESS_ONCE(trans->transaction->aborted) = errno;
/* Wake up anybody who may be waiting on this transaction */
- wake_up(&root->fs_info->transaction_wait);
- wake_up(&root->fs_info->transaction_blocked_wait);
- __btrfs_handle_fs_error(root->fs_info, function, line, errno, NULL);
+ wake_up(&fs_info->transaction_wait);
+ wake_up(&fs_info->transaction_blocked_wait);
+ __btrfs_handle_fs_error(fs_info, function, line, errno, NULL);
}
/*
* __btrfs_panic decodes unexpected, fatal errors from the caller,
@@ -432,12 +456,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
*/
break;
case Opt_nodatasum:
- btrfs_set_and_info(root, NODATASUM,
+ btrfs_set_and_info(info, NODATASUM,
"setting nodatasum");
break;
case Opt_datasum:
- if (btrfs_test_opt(root, NODATASUM)) {
- if (btrfs_test_opt(root, NODATACOW))
+ if (btrfs_test_opt(info, NODATASUM)) {
+ if (btrfs_test_opt(info, NODATACOW))
btrfs_info(root->fs_info, "setting datasum, datacow enabled");
else
btrfs_info(root->fs_info, "setting datasum");
@@ -446,9 +470,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
btrfs_clear_opt(info->mount_opt, NODATASUM);
break;
case Opt_nodatacow:
- if (!btrfs_test_opt(root, NODATACOW)) {
- if (!btrfs_test_opt(root, COMPRESS) ||
- !btrfs_test_opt(root, FORCE_COMPRESS)) {
+ if (!btrfs_test_opt(info, NODATACOW)) {
+ if (!btrfs_test_opt(info, COMPRESS) ||
+ !btrfs_test_opt(info, FORCE_COMPRESS)) {
btrfs_info(root->fs_info,
"setting nodatacow, compression disabled");
} else {
@@ -461,7 +485,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
btrfs_set_opt(info->mount_opt, NODATASUM);
break;
case Opt_datacow:
- btrfs_clear_and_info(root, NODATACOW,
+ btrfs_clear_and_info(info, NODATACOW,
"setting datacow");
break;
case Opt_compress_force:
@@ -470,10 +494,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
/* Fallthrough */
case Opt_compress:
case Opt_compress_type:
- saved_compress_type = btrfs_test_opt(root, COMPRESS) ?
+ saved_compress_type = btrfs_test_opt(info,
+ COMPRESS) ?
info->compress_type : BTRFS_COMPRESS_NONE;
saved_compress_force =
- btrfs_test_opt(root, FORCE_COMPRESS);
+ btrfs_test_opt(info, FORCE_COMPRESS);
if (token == Opt_compress ||
token == Opt_compress_force ||
strcmp(args[0].from, "zlib") == 0) {
@@ -513,10 +538,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
*/
btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
}
- if ((btrfs_test_opt(root, COMPRESS) &&
+ if ((btrfs_test_opt(info, COMPRESS) &&
(info->compress_type != saved_compress_type ||
compress_force != saved_compress_force)) ||
- (!btrfs_test_opt(root, COMPRESS) &&
+ (!btrfs_test_opt(info, COMPRESS) &&
no_compress == 1)) {
btrfs_info(root->fs_info,
"%s %s compression",
@@ -526,25 +551,25 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
compress_force = false;
break;
case Opt_ssd:
- btrfs_set_and_info(root, SSD,
+ btrfs_set_and_info(info, SSD,
"use ssd allocation scheme");
break;
case Opt_ssd_spread:
- btrfs_set_and_info(root, SSD_SPREAD,
+ btrfs_set_and_info(info, SSD_SPREAD,
"use spread ssd allocation scheme");
btrfs_set_opt(info->mount_opt, SSD);
break;
case Opt_nossd:
- btrfs_set_and_info(root, NOSSD,
+ btrfs_set_and_info(info, NOSSD,
"not using ssd allocation scheme");
btrfs_clear_opt(info->mount_opt, SSD);
break;
case Opt_barrier:
- btrfs_clear_and_info(root, NOBARRIER,
+ btrfs_clear_and_info(info, NOBARRIER,
"turning on barriers");
break;
case Opt_nobarrier:
- btrfs_set_and_info(root, NOBARRIER,
+ btrfs_set_and_info(info, NOBARRIER,
"turning off barriers");
break;
case Opt_thread_pool:
@@ -604,24 +629,24 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
root->fs_info->sb->s_flags &= ~MS_POSIXACL;
break;
case Opt_notreelog:
- btrfs_set_and_info(root, NOTREELOG,
+ btrfs_set_and_info(info, NOTREELOG,
"disabling tree log");
break;
case Opt_treelog:
- btrfs_clear_and_info(root, NOTREELOG,
+ btrfs_clear_and_info(info, NOTREELOG,
"enabling tree log");
break;
case Opt_norecovery:
case Opt_nologreplay:
- btrfs_set_and_info(root, NOLOGREPLAY,
+ btrfs_set_and_info(info, NOLOGREPLAY,
"disabling log replay at mount time");
break;
case Opt_flushoncommit:
- btrfs_set_and_info(root, FLUSHONCOMMIT,
+ btrfs_set_and_info(info, FLUSHONCOMMIT,
"turning on flush-on-commit");
break;
case Opt_noflushoncommit:
- btrfs_clear_and_info(root, FLUSHONCOMMIT,
+ btrfs_clear_and_info(info, FLUSHONCOMMIT,
"turning off flush-on-commit");
break;
case Opt_ratio:
@@ -638,11 +663,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
}
break;
case Opt_discard:
- btrfs_set_and_info(root, DISCARD,
+ btrfs_set_and_info(info, DISCARD,
"turning on discard");
break;
case Opt_nodiscard:
- btrfs_clear_and_info(root, DISCARD,
+ btrfs_clear_and_info(info, DISCARD,
"turning off discard");
break;
case Opt_space_cache:
@@ -651,12 +676,13 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
strcmp(args[0].from, "v1") == 0) {
btrfs_clear_opt(root->fs_info->mount_opt,
FREE_SPACE_TREE);
- btrfs_set_and_info(root, SPACE_CACHE,
+ btrfs_set_and_info(info, SPACE_CACHE,
"enabling disk space caching");
} else if (strcmp(args[0].from, "v2") == 0) {
btrfs_clear_opt(root->fs_info->mount_opt,
SPACE_CACHE);
- btrfs_set_and_info(root, FREE_SPACE_TREE,
+ btrfs_set_and_info(info,
+ FREE_SPACE_TREE,
"enabling free space tree");
} else {
ret = -EINVAL;
@@ -667,12 +693,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
break;
case Opt_no_space_cache:
- if (btrfs_test_opt(root, SPACE_CACHE)) {
- btrfs_clear_and_info(root, SPACE_CACHE,
+ if (btrfs_test_opt(info, SPACE_CACHE)) {
+ btrfs_clear_and_info(info,
+ SPACE_CACHE,
"disabling disk space caching");
}
- if (btrfs_test_opt(root, FREE_SPACE_TREE)) {
- btrfs_clear_and_info(root, FREE_SPACE_TREE,
+ if (btrfs_test_opt(info, FREE_SPACE_TREE)) {
+ btrfs_clear_and_info(info,
+ FREE_SPACE_TREE,
"disabling free space tree");
}
break;
@@ -685,7 +713,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
"disabling inode map caching");
break;
case Opt_clear_cache:
- btrfs_set_and_info(root, CLEAR_CACHE,
+ btrfs_set_and_info(info, CLEAR_CACHE,
"force clearing of disk cache");
break;
case Opt_user_subvol_rm_allowed:
@@ -698,11 +726,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options,
btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG);
break;
case Opt_defrag:
- btrfs_set_and_info(root, AUTO_DEFRAG,
+ btrfs_set_and_info(info, AUTO_DEFRAG,
"enabling auto defrag");
break;
case Opt_nodefrag:
- btrfs_clear_and_info(root, AUTO_DEFRAG,
+ btrfs_clear_and_info(info, AUTO_DEFRAG,
"disabling auto defrag");
break;
case Opt_recovery:
@@ -810,22 +838,22 @@ check:
/*
* Extra check for current option against current flag
*/
- if (btrfs_test_opt(root, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) {
+ if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) {
btrfs_err(root->fs_info,
"nologreplay must be used with ro mount option");
ret = -EINVAL;
}
out:
if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) &&
- !btrfs_test_opt(root, FREE_SPACE_TREE) &&
- !btrfs_test_opt(root, CLEAR_CACHE)) {
+ !btrfs_test_opt(info, FREE_SPACE_TREE) &&
+ !btrfs_test_opt(info, CLEAR_CACHE)) {
btrfs_err(root->fs_info, "cannot disable free space tree");
ret = -EINVAL;
}
- if (!ret && btrfs_test_opt(root, SPACE_CACHE))
+ if (!ret && btrfs_test_opt(info, SPACE_CACHE))
btrfs_info(root->fs_info, "disk space caching is enabled");
- if (!ret && btrfs_test_opt(root, FREE_SPACE_TREE))
+ if (!ret && btrfs_test_opt(info, FREE_SPACE_TREE))
btrfs_info(root->fs_info, "using free space tree");
kfree(orig);
return ret;
@@ -1149,7 +1177,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root = fs_info->tree_root;
- trace_btrfs_sync_fs(wait);
+ trace_btrfs_sync_fs(fs_info, wait);
if (!wait) {
filemap_flush(fs_info->btree_inode->i_mapping);
@@ -1192,13 +1220,13 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
struct btrfs_root *root = info->tree_root;
char *compress_type;
- if (btrfs_test_opt(root, DEGRADED))
+ if (btrfs_test_opt(info, DEGRADED))
seq_puts(seq, ",degraded");
- if (btrfs_test_opt(root, NODATASUM))
+ if (btrfs_test_opt(info, NODATASUM))
seq_puts(seq, ",nodatasum");
- if (btrfs_test_opt(root, NODATACOW))
+ if (btrfs_test_opt(info, NODATACOW))
seq_puts(seq, ",nodatacow");
- if (btrfs_test_opt(root, NOBARRIER))
+ if (btrfs_test_opt(info, NOBARRIER))
seq_puts(seq, ",nobarrier");
if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
seq_printf(seq, ",max_inline=%llu", info->max_inline);
@@ -1207,56 +1235,56 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
if (info->thread_pool_size != min_t(unsigned long,
num_online_cpus() + 2, 8))
seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
- if (btrfs_test_opt(root, COMPRESS)) {
+ if (btrfs_test_opt(info, COMPRESS)) {
if (info->compress_type == BTRFS_COMPRESS_ZLIB)
compress_type = "zlib";
else
compress_type = "lzo";
- if (btrfs_test_opt(root, FORCE_COMPRESS))
+ if (btrfs_test_opt(info, FORCE_COMPRESS))
seq_printf(seq, ",compress-force=%s", compress_type);
else
seq_printf(seq, ",compress=%s", compress_type);
}
- if (btrfs_test_opt(root, NOSSD))
+ if (btrfs_test_opt(info, NOSSD))
seq_puts(seq, ",nossd");
- if (btrfs_test_opt(root, SSD_SPREAD))
+ if (btrfs_test_opt(info, SSD_SPREAD))
seq_puts(seq, ",ssd_spread");
- else if (btrfs_test_opt(root, SSD))
+ else if (btrfs_test_opt(info, SSD))
seq_puts(seq, ",ssd");
- if (btrfs_test_opt(root, NOTREELOG))
+ if (btrfs_test_opt(info, NOTREELOG))
seq_puts(seq, ",notreelog");
- if (btrfs_test_opt(root, NOLOGREPLAY))
+ if (btrfs_test_opt(info, NOLOGREPLAY))
seq_puts(seq, ",nologreplay");
- if (btrfs_test_opt(root, FLUSHONCOMMIT))
+ if (btrfs_test_opt(info, FLUSHONCOMMIT))
seq_puts(seq, ",flushoncommit");
- if (btrfs_test_opt(root, DISCARD))
+ if (btrfs_test_opt(info, DISCARD))
seq_puts(seq, ",discard");
if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
seq_puts(seq, ",noacl");
- if (btrfs_test_opt(root, SPACE_CACHE))
+ if (btrfs_test_opt(info, SPACE_CACHE))
seq_puts(seq, ",space_cache");
- else if (btrfs_test_opt(root, FREE_SPACE_TREE))
+ else if (btrfs_test_opt(info, FREE_SPACE_TREE))
seq_puts(seq, ",space_cache=v2");
else
seq_puts(seq, ",nospace_cache");
- if (btrfs_test_opt(root, RESCAN_UUID_TREE))
+ if (btrfs_test_opt(info, RESCAN_UUID_TREE))
seq_puts(seq, ",rescan_uuid_tree");
- if (btrfs_test_opt(root, CLEAR_CACHE))
+ if (btrfs_test_opt(info, CLEAR_CACHE))
seq_puts(seq, ",clear_cache");
- if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
+ if (btrfs_test_opt(info, USER_SUBVOL_RM_ALLOWED))
seq_puts(seq, ",user_subvol_rm_allowed");
- if (btrfs_test_opt(root, ENOSPC_DEBUG))
+ if (btrfs_test_opt(info, ENOSPC_DEBUG))
seq_puts(seq, ",enospc_debug");
- if (btrfs_test_opt(root, AUTO_DEFRAG))
+ if (btrfs_test_opt(info, AUTO_DEFRAG))
seq_puts(seq, ",autodefrag");
- if (btrfs_test_opt(root, INODE_MAP_CACHE))
+ if (btrfs_test_opt(info, INODE_MAP_CACHE))
seq_puts(seq, ",inode_cache");
- if (btrfs_test_opt(root, SKIP_BALANCE))
+ if (btrfs_test_opt(info, SKIP_BALANCE))
seq_puts(seq, ",skip_balance");
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
- if (btrfs_test_opt(root, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA))
+ if (btrfs_test_opt(info, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA))
seq_puts(seq, ",check_int_data");
- else if (btrfs_test_opt(root, CHECK_INTEGRITY))
+ else if (btrfs_test_opt(info, CHECK_INTEGRITY))
seq_puts(seq, ",check_int");
if (info->check_integrity_print_mask)
seq_printf(seq, ",check_int_print_mask=%d",
@@ -1265,14 +1293,14 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
if (info->metadata_ratio)
seq_printf(seq, ",metadata_ratio=%d",
info->metadata_ratio);
- if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR))
+ if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR))
seq_puts(seq, ",fatal_errors=panic");
if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
seq_printf(seq, ",commit=%d", info->commit_interval);
#ifdef CONFIG_BTRFS_DEBUG
- if (btrfs_test_opt(root, FRAGMENT_DATA))
+ if (btrfs_test_opt(info, FRAGMENT_DATA))
seq_puts(seq, ",fragment=data");
- if (btrfs_test_opt(root, FRAGMENT_METADATA))
+ if (btrfs_test_opt(info, FRAGMENT_METADATA))
seq_puts(seq, ",fragment=metadata");
#endif
seq_printf(seq, ",subvolid=%llu",
@@ -2030,9 +2058,6 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
* chunk).
*
* If metadata is exhausted, f_bavail will be 0.
- *
- * FIXME: not accurate for mixed block groups, total and free/used are ok,
- * available appears slightly larger.
*/
static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
@@ -2319,49 +2344,6 @@ static void btrfs_print_mod_info(void)
btrfs_crc32c_impl());
}
-static int btrfs_run_sanity_tests(void)
-{
- int ret, i;
- u32 sectorsize, nodesize;
- u32 test_sectorsize[] = {
- PAGE_SIZE,
- };
- ret = btrfs_init_test_fs();
- if (ret)
- return ret;
- for (i = 0; i < ARRAY_SIZE(test_sectorsize); i++) {
- sectorsize = test_sectorsize[i];
- for (nodesize = sectorsize;
- nodesize <= BTRFS_MAX_METADATA_BLOCKSIZE;
- nodesize <<= 1) {
- pr_info("BTRFS: selftest: sectorsize: %u nodesize: %u\n",
- sectorsize, nodesize);
- ret = btrfs_test_free_space_cache(sectorsize, nodesize);
- if (ret)
- goto out;
- ret = btrfs_test_extent_buffer_operations(sectorsize,
- nodesize);
- if (ret)
- goto out;
- ret = btrfs_test_extent_io(sectorsize, nodesize);
- if (ret)
- goto out;
- ret = btrfs_test_inodes(sectorsize, nodesize);
- if (ret)
- goto out;
- ret = btrfs_test_qgroups(sectorsize, nodesize);
- if (ret)
- goto out;
- ret = btrfs_test_free_space_tree(sectorsize, nodesize);
- if (ret)
- goto out;
- }
- }
-out:
- btrfs_destroy_test_fs();
- return ret;
-}
-
static int __init init_btrfs_fs(void)
{
int err;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 4879656bda3c..c6569905d3d1 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -326,6 +326,7 @@ SPACE_INFO_ATTR(bytes_used);
SPACE_INFO_ATTR(bytes_pinned);
SPACE_INFO_ATTR(bytes_reserved);
SPACE_INFO_ATTR(bytes_may_use);
+SPACE_INFO_ATTR(bytes_readonly);
SPACE_INFO_ATTR(disk_used);
SPACE_INFO_ATTR(disk_total);
BTRFS_ATTR(total_bytes_pinned, btrfs_space_info_show_total_bytes_pinned);
@@ -337,6 +338,7 @@ static struct attribute *space_info_attrs[] = {
BTRFS_ATTR_PTR(bytes_pinned),
BTRFS_ATTR_PTR(bytes_reserved),
BTRFS_ATTR_PTR(bytes_may_use),
+ BTRFS_ATTR_PTR(bytes_readonly),
BTRFS_ATTR_PTR(disk_used),
BTRFS_ATTR_PTR(disk_total),
BTRFS_ATTR_PTR(total_bytes_pinned),
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 02223f3f78f4..bf62ad919a95 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -54,7 +54,7 @@ struct inode *btrfs_new_test_inode(void)
return new_inode(test_mnt->mnt_sb);
}
-int btrfs_init_test_fs(void)
+static int btrfs_init_test_fs(void)
{
int ret;
@@ -73,7 +73,7 @@ int btrfs_init_test_fs(void)
return 0;
}
-void btrfs_destroy_test_fs(void)
+static void btrfs_destroy_test_fs(void)
{
kern_unmount(test_mnt);
unregister_filesystem(&test_type);
@@ -128,14 +128,27 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
extent_io_tree_init(&fs_info->freed_extents[0], NULL);
extent_io_tree_init(&fs_info->freed_extents[1], NULL);
fs_info->pinned_extents = &fs_info->freed_extents[0];
+ set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+
+ test_mnt->mnt_sb->s_fs_info = fs_info;
+
return fs_info;
}
-static void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
+void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
{
struct radix_tree_iter iter;
void **slot;
+ if (!fs_info)
+ return;
+
+ if (WARN_ON(!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO,
+ &fs_info->fs_state)))
+ return;
+
+ test_mnt->mnt_sb->s_fs_info = NULL;
+
spin_lock(&fs_info->buffer_lock);
radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
struct extent_buffer *eb;
@@ -167,10 +180,11 @@ void btrfs_free_dummy_root(struct btrfs_root *root)
{
if (!root)
return;
+ /* Will be freed by btrfs_free_fs_roots */
+ if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))
+ return;
if (root->node)
free_extent_buffer(root->node);
- if (root->fs_info)
- btrfs_free_dummy_fs_info(root->fs_info);
kfree(root);
}
@@ -220,3 +234,46 @@ void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans)
INIT_LIST_HEAD(&trans->qgroup_ref_list);
trans->type = __TRANS_DUMMY;
}
+
+int btrfs_run_sanity_tests(void)
+{
+ int ret, i;
+ u32 sectorsize, nodesize;
+ u32 test_sectorsize[] = {
+ PAGE_SIZE,
+ };
+ ret = btrfs_init_test_fs();
+ if (ret)
+ return ret;
+ for (i = 0; i < ARRAY_SIZE(test_sectorsize); i++) {
+ sectorsize = test_sectorsize[i];
+ for (nodesize = sectorsize;
+ nodesize <= BTRFS_MAX_METADATA_BLOCKSIZE;
+ nodesize <<= 1) {
+ pr_info("BTRFS: selftest: sectorsize: %u nodesize: %u\n",
+ sectorsize, nodesize);
+ ret = btrfs_test_free_space_cache(sectorsize, nodesize);
+ if (ret)
+ goto out;
+ ret = btrfs_test_extent_buffer_operations(sectorsize,
+ nodesize);
+ if (ret)
+ goto out;
+ ret = btrfs_test_extent_io(sectorsize, nodesize);
+ if (ret)
+ goto out;
+ ret = btrfs_test_inodes(sectorsize, nodesize);
+ if (ret)
+ goto out;
+ ret = btrfs_test_qgroups(sectorsize, nodesize);
+ if (ret)
+ goto out;
+ ret = btrfs_test_free_space_tree(sectorsize, nodesize);
+ if (ret)
+ goto out;
+ }
+ }
+out:
+ btrfs_destroy_test_fs();
+ return ret;
+}
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index 66fb6b701eb7..b17ffbe8f9f3 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -20,57 +20,29 @@
#define __BTRFS_TESTS
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int btrfs_run_sanity_tests(void);
#define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__)
struct btrfs_root;
struct btrfs_trans_handle;
-int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize);
int btrfs_test_extent_buffer_operations(u32 sectorsize, u32 nodesize);
+int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize);
int btrfs_test_extent_io(u32 sectorsize, u32 nodesize);
int btrfs_test_inodes(u32 sectorsize, u32 nodesize);
int btrfs_test_qgroups(u32 sectorsize, u32 nodesize);
int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize);
-int btrfs_init_test_fs(void);
-void btrfs_destroy_test_fs(void);
struct inode *btrfs_new_test_inode(void);
struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void);
+void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info);
void btrfs_free_dummy_root(struct btrfs_root *root);
struct btrfs_block_group_cache *
btrfs_alloc_dummy_block_group(unsigned long length, u32 sectorsize);
void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache);
void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans);
#else
-static inline int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
-{
- return 0;
-}
-static inline int btrfs_test_extent_buffer_operations(u32 sectorsize,
- u32 nodesize)
-{
- return 0;
-}
-static inline int btrfs_init_test_fs(void)
-{
- return 0;
-}
-static inline void btrfs_destroy_test_fs(void)
-{
-}
-static inline int btrfs_test_extent_io(u32 sectorsize, u32 nodesize)
-{
- return 0;
-}
-static inline int btrfs_test_inodes(u32 sectorsize, u32 nodesize)
-{
- return 0;
-}
-static inline int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
-{
- return 0;
-}
-static inline int btrfs_test_free_space_tree(u32 sectorsize, u32 nodesize)
+static inline int btrfs_run_sanity_tests(void)
{
return 0;
}
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index 4f8cbd1ec5ee..199569174637 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -24,8 +24,9 @@
static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
{
- struct btrfs_path *path;
- struct btrfs_root *root;
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_path *path = NULL;
+ struct btrfs_root *root = NULL;
struct extent_buffer *eb;
struct btrfs_item *item;
char *value = "mary had a little lamb";
@@ -40,17 +41,24 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
test_msg("Running btrfs_split_item tests\n");
- root = btrfs_alloc_dummy_root(sectorsize, nodesize);
+ fs_info = btrfs_alloc_dummy_fs_info();
+ if (!fs_info) {
+ test_msg("Could not allocate fs_info\n");
+ return -ENOMEM;
+ }
+
+ root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
if (IS_ERR(root)) {
test_msg("Could not allocate root\n");
- return PTR_ERR(root);
+ ret = PTR_ERR(root);
+ goto out;
}
path = btrfs_alloc_path();
if (!path) {
test_msg("Could not allocate path\n");
- kfree(root);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, nodesize,
@@ -219,7 +227,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
}
out:
btrfs_free_path(path);
- kfree(root);
+ btrfs_free_dummy_root(root);
+ btrfs_free_dummy_fs_info(fs_info);
return ret;
}
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index 3956bb2ff84c..3221c8dee272 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -837,6 +837,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache,
int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
{
+ struct btrfs_fs_info *fs_info;
struct btrfs_block_group_cache *cache;
struct btrfs_root *root = NULL;
int ret = -ENOMEM;
@@ -855,15 +856,17 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
return 0;
}
- root = btrfs_alloc_dummy_root(sectorsize, nodesize);
- if (IS_ERR(root)) {
- ret = PTR_ERR(root);
+ fs_info = btrfs_alloc_dummy_fs_info();
+ if (!fs_info) {
+ ret = -ENOMEM;
goto out;
}
- root->fs_info = btrfs_alloc_dummy_fs_info();
- if (!root->fs_info)
+ root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
goto out;
+ }
root->fs_info->extent_root = root;
cache->fs_info = root->fs_info;
@@ -882,6 +885,7 @@ int btrfs_test_free_space_cache(u32 sectorsize, u32 nodesize)
out:
btrfs_free_dummy_block_group(cache);
btrfs_free_dummy_root(root);
+ btrfs_free_dummy_fs_info(fs_info);
test_msg("Free space cache tests finished\n");
return ret;
}
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index aac507085ab0..7508d3b42780 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -443,23 +443,24 @@ typedef int (*test_func_t)(struct btrfs_trans_handle *,
static int run_test(test_func_t test_func, int bitmaps,
u32 sectorsize, u32 nodesize)
{
+ struct btrfs_fs_info *fs_info;
struct btrfs_root *root = NULL;
struct btrfs_block_group_cache *cache = NULL;
struct btrfs_trans_handle trans;
struct btrfs_path *path = NULL;
int ret;
- root = btrfs_alloc_dummy_root(sectorsize, nodesize);
- if (IS_ERR(root)) {
- test_msg("Couldn't allocate dummy root\n");
- ret = PTR_ERR(root);
+ fs_info = btrfs_alloc_dummy_fs_info();
+ if (!fs_info) {
+ test_msg("Couldn't allocate dummy fs info\n");
+ ret = -ENOMEM;
goto out;
}
- root->fs_info = btrfs_alloc_dummy_fs_info();
- if (!root->fs_info) {
- test_msg("Couldn't allocate dummy fs info\n");
- ret = -ENOMEM;
+ root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
+ if (IS_ERR(root)) {
+ test_msg("Couldn't allocate dummy root\n");
+ ret = PTR_ERR(root);
goto out;
}
@@ -534,6 +535,7 @@ out:
btrfs_free_path(path);
btrfs_free_dummy_block_group(cache);
btrfs_free_dummy_root(root);
+ btrfs_free_dummy_fs_info(fs_info);
return ret;
}
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 29648c0a39f1..9f72aeda9220 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -230,6 +230,7 @@ static unsigned long vacancy_only = 0;
static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
{
+ struct btrfs_fs_info *fs_info = NULL;
struct inode *inode = NULL;
struct btrfs_root *root = NULL;
struct extent_map *em = NULL;
@@ -248,19 +249,15 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
BTRFS_I(inode)->location.offset = 0;
- root = btrfs_alloc_dummy_root(sectorsize, nodesize);
- if (IS_ERR(root)) {
- test_msg("Couldn't allocate root\n");
+ fs_info = btrfs_alloc_dummy_fs_info();
+ if (!fs_info) {
+ test_msg("Couldn't allocate dummy fs info\n");
goto out;
}
- /*
- * We do this since btrfs_get_extent wants to assign em->bdev to
- * root->fs_info->fs_devices->latest_bdev.
- */
- root->fs_info = btrfs_alloc_dummy_fs_info();
- if (!root->fs_info) {
- test_msg("Couldn't allocate dummy fs info\n");
+ root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
+ if (IS_ERR(root)) {
+ test_msg("Couldn't allocate root\n");
goto out;
}
@@ -835,11 +832,13 @@ out:
free_extent_map(em);
iput(inode);
btrfs_free_dummy_root(root);
+ btrfs_free_dummy_fs_info(fs_info);
return ret;
}
static int test_hole_first(u32 sectorsize, u32 nodesize)
{
+ struct btrfs_fs_info *fs_info = NULL;
struct inode *inode = NULL;
struct btrfs_root *root = NULL;
struct extent_map *em = NULL;
@@ -855,15 +854,15 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID;
BTRFS_I(inode)->location.offset = 0;
- root = btrfs_alloc_dummy_root(sectorsize, nodesize);
- if (IS_ERR(root)) {
- test_msg("Couldn't allocate root\n");
+ fs_info = btrfs_alloc_dummy_fs_info();
+ if (!fs_info) {
+ test_msg("Couldn't allocate dummy fs info\n");
goto out;
}
- root->fs_info = btrfs_alloc_dummy_fs_info();
- if (!root->fs_info) {
- test_msg("Couldn't allocate dummy fs info\n");
+ root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
+ if (IS_ERR(root)) {
+ test_msg("Couldn't allocate root\n");
goto out;
}
@@ -934,11 +933,13 @@ out:
free_extent_map(em);
iput(inode);
btrfs_free_dummy_root(root);
+ btrfs_free_dummy_fs_info(fs_info);
return ret;
}
static int test_extent_accounting(u32 sectorsize, u32 nodesize)
{
+ struct btrfs_fs_info *fs_info = NULL;
struct inode *inode = NULL;
struct btrfs_root *root = NULL;
int ret = -ENOMEM;
@@ -949,15 +950,15 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize)
return ret;
}
- root = btrfs_alloc_dummy_root(sectorsize, nodesize);
- if (IS_ERR(root)) {
- test_msg("Couldn't allocate root\n");
+ fs_info = btrfs_alloc_dummy_fs_info();
+ if (!fs_info) {
+ test_msg("Couldn't allocate dummy fs info\n");
goto out;
}
- root->fs_info = btrfs_alloc_dummy_fs_info();
- if (!root->fs_info) {
- test_msg("Couldn't allocate dummy fs info\n");
+ root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
+ if (IS_ERR(root)) {
+ test_msg("Couldn't allocate root\n");
goto out;
}
@@ -1132,6 +1133,7 @@ out:
NULL, GFP_KERNEL);
iput(inode);
btrfs_free_dummy_root(root);
+ btrfs_free_dummy_fs_info(fs_info);
return ret;
}
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 57a12c0d680b..4407fef7c16c 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -453,22 +453,24 @@ static int test_multiple_refs(struct btrfs_root *root,
int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
{
+ struct btrfs_fs_info *fs_info = NULL;
struct btrfs_root *root;
struct btrfs_root *tmp_root;
int ret = 0;
- root = btrfs_alloc_dummy_root(sectorsize, nodesize);
- if (IS_ERR(root)) {
- test_msg("Couldn't allocate root\n");
- return PTR_ERR(root);
+ fs_info = btrfs_alloc_dummy_fs_info();
+ if (!fs_info) {
+ test_msg("Couldn't allocate dummy fs info\n");
+ return -ENOMEM;
}
- root->fs_info = btrfs_alloc_dummy_fs_info();
- if (!root->fs_info) {
- test_msg("Couldn't allocate dummy fs info\n");
- ret = -ENOMEM;
+ root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
+ if (IS_ERR(root)) {
+ test_msg("Couldn't allocate root\n");
+ ret = PTR_ERR(root);
goto out;
}
+
/* We are using this root as our extent root */
root->fs_info->extent_root = root;
@@ -495,7 +497,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
btrfs_set_header_nritems(root->node, 0);
root->alloc_bytenr += 2 * nodesize;
- tmp_root = btrfs_alloc_dummy_root(sectorsize, nodesize);
+ tmp_root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
if (IS_ERR(tmp_root)) {
test_msg("Couldn't allocate a fs root\n");
ret = PTR_ERR(tmp_root);
@@ -510,7 +512,7 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
goto out;
}
- tmp_root = btrfs_alloc_dummy_root(sectorsize, nodesize);
+ tmp_root = btrfs_alloc_dummy_root(fs_info, sectorsize, nodesize);
if (IS_ERR(tmp_root)) {
test_msg("Couldn't allocate a fs root\n");
ret = PTR_ERR(tmp_root);
@@ -531,5 +533,6 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
ret = test_multiple_refs(root, sectorsize, nodesize);
out:
btrfs_free_dummy_root(root);
+ btrfs_free_dummy_fs_info(fs_info);
return ret;
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 948aa186b353..9cca0a721961 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -561,6 +561,7 @@ again:
h->transaction = cur_trans;
h->root = root;
h->use_count = 1;
+ h->fs_info = root->fs_info;
h->type = type;
h->can_flush_pending_bgs = true;
@@ -1491,7 +1492,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto dir_item_existed;
} else if (IS_ERR(dir_item)) {
ret = PTR_ERR(dir_item);
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
btrfs_release_path(path);
@@ -1504,7 +1505,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
*/
ret = btrfs_run_delayed_items(trans, root);
if (ret) { /* Transaction aborted */
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1543,7 +1544,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
if (ret) {
btrfs_tree_unlock(old);
free_extent_buffer(old);
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1554,7 +1555,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(old);
free_extent_buffer(old);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
/* see comments in should_cow_block() */
@@ -1568,7 +1569,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_tree_unlock(tmp);
free_extent_buffer(tmp);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1580,7 +1581,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_ino(parent_inode), index,
dentry->d_name.name, dentry->d_name.len);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1588,19 +1589,19 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
if (IS_ERR(pending->snap)) {
ret = PTR_ERR(pending->snap);
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
ret = btrfs_reloc_post_snapshot(trans, pending);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1622,7 +1623,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
/* We have check then name at the beginning, so it is impossible. */
BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1632,13 +1633,13 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
current_fs_time(parent_inode->i_sb);
ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, new_uuid.b,
BTRFS_UUID_KEY_SUBVOL, objectid);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) {
@@ -1647,14 +1648,14 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
objectid);
if (ret && ret != -EEXIST) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
}
ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto fail;
}
@@ -1709,7 +1710,7 @@ static void update_super_roots(struct btrfs_root *root)
super->root = root_item->bytenr;
super->generation = root_item->generation;
super->root_level = root_item->level;
- if (btrfs_test_opt(root, SPACE_CACHE))
+ if (btrfs_test_opt(root->fs_info, SPACE_CACHE))
super->cache_generation = root_item->generation;
if (root->fs_info->update_uuid_tree_gen)
super->uuid_tree_generation = root_item->generation;
@@ -1850,7 +1851,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
WARN_ON(trans->use_count > 1);
- btrfs_abort_transaction(trans, root, err);
+ btrfs_abort_transaction(trans, err);
spin_lock(&root->fs_info->trans_lock);
@@ -1895,14 +1896,14 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans,
static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
{
- if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
+ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
return btrfs_start_delalloc_roots(fs_info, 1, -1);
return 0;
}
static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
{
- if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
+ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT))
btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index c5abee4f01ad..efb122643380 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -128,6 +128,7 @@ struct btrfs_trans_handle {
* Subvolume quota depends on this
*/
struct btrfs_root *root;
+ struct btrfs_fs_info *fs_info;
struct seq_list delayed_ref_elem;
struct list_head qgroup_ref_list;
struct list_head new_bgs;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c05f69a8ec42..d31a0c4f56be 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2757,7 +2757,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
while (1) {
int batch = atomic_read(&root->log_batch);
/* when we're on an ssd, just kick the log commit out */
- if (!btrfs_test_opt(root, SSD) &&
+ if (!btrfs_test_opt(root->fs_info, SSD) &&
test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
mutex_unlock(&root->log_mutex);
schedule_timeout_uninterruptible(1);
@@ -2788,7 +2788,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
if (ret) {
blk_finish_plug(&plug);
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
btrfs_free_logged_extents(log, log_transid);
btrfs_set_log_full_commit(root->fs_info, trans);
mutex_unlock(&root->log_mutex);
@@ -2838,7 +2838,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
btrfs_set_log_full_commit(root->fs_info, trans);
if (ret != -ENOSPC) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
mutex_unlock(&log_root_tree->log_mutex);
goto out;
}
@@ -2898,7 +2898,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
blk_finish_plug(&plug);
if (ret) {
btrfs_set_log_full_commit(root->fs_info, trans);
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
btrfs_free_logged_extents(log, log_transid);
mutex_unlock(&log_root_tree->log_mutex);
goto out_wake_log_root;
@@ -2934,7 +2934,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
if (ret) {
btrfs_set_log_full_commit(root->fs_info, trans);
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out_wake_log_root;
}
@@ -2991,7 +2991,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
ret = walk_log_tree(trans, log, &wc);
/* I don't think this can happen but just in case */
if (ret)
- btrfs_abort_transaction(trans, log, ret);
+ btrfs_abort_transaction(trans, ret);
while (1) {
ret = find_first_extent_bit(&log->dirty_log_pages,
@@ -3160,7 +3160,7 @@ out_unlock:
btrfs_set_log_full_commit(root->fs_info, trans);
ret = 0;
} else if (ret < 0)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
btrfs_end_log_trans(root);
@@ -3193,7 +3193,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
btrfs_set_log_full_commit(root->fs_info, trans);
ret = 0;
} else if (ret < 0 && ret != -ENOENT)
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
btrfs_end_log_trans(root);
return ret;
@@ -4703,6 +4703,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
ins_nr = 0;
ret = btrfs_search_forward(root, &min_key,
path, trans->transid);
+ if (ret < 0) {
+ err = ret;
+ goto out_unlock;
+ }
if (ret != 0)
break;
again:
@@ -5301,7 +5305,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
sb = inode->i_sb;
- if (btrfs_test_opt(root, NOTREELOG)) {
+ if (btrfs_test_opt(root->fs_info, NOTREELOG)) {
ret = 1;
goto end_no_trans;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0fb4a959012e..bb0addce7558 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -140,7 +140,6 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
-static void btrfs_close_one_device(struct btrfs_device *device);
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
@@ -853,6 +852,46 @@ static void free_device(struct rcu_head *head)
schedule_work(&device->rcu_work);
}
+static void btrfs_close_one_device(struct btrfs_device *device)
+{
+ struct btrfs_fs_devices *fs_devices = device->fs_devices;
+ struct btrfs_device *new_device;
+ struct rcu_string *name;
+
+ if (device->bdev)
+ fs_devices->open_devices--;
+
+ if (device->writeable &&
+ device->devid != BTRFS_DEV_REPLACE_DEVID) {
+ list_del_init(&device->dev_alloc_list);
+ fs_devices->rw_devices--;
+ }
+
+ if (device->missing)
+ fs_devices->missing_devices--;
+
+ if (device->bdev && device->writeable) {
+ sync_blockdev(device->bdev);
+ invalidate_bdev(device->bdev);
+ }
+
+ new_device = btrfs_alloc_device(NULL, &device->devid,
+ device->uuid);
+ BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
+
+ /* Safe because we are under uuid_mutex */
+ if (device->name) {
+ name = rcu_string_strdup(device->name->str, GFP_NOFS);
+ BUG_ON(!name); /* -ENOMEM */
+ rcu_assign_pointer(new_device->name, name);
+ }
+
+ list_replace_rcu(&device->dev_list, &new_device->dev_list);
+ new_device->fs_devices = device->fs_devices;
+
+ call_rcu(&device->rcu, free_device);
+}
+
static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *device, *tmp;
@@ -2399,14 +2438,14 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
ret = init_first_rw_device(trans, root, device);
unlock_chunks(root);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto error_trans;
}
}
ret = btrfs_add_device(trans, root, device);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto error_trans;
}
@@ -2415,7 +2454,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
ret = btrfs_finish_sprout(trans, root);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto error_trans;
}
@@ -2801,7 +2840,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
&dev_extent_len);
if (ret) {
mutex_unlock(&fs_devices->device_list_mutex);
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -2820,7 +2859,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
ret = btrfs_update_device(trans, map->stripes[i].dev);
if (ret) {
mutex_unlock(&fs_devices->device_list_mutex);
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
}
@@ -2829,7 +2868,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -2838,14 +2877,14 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
if (ret) {
- btrfs_abort_transaction(trans, root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
}
ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em);
if (ret) {
- btrfs_abort_transaction(trans, extent_root, ret);
+ btrfs_abort_transaction(trans, ret);
goto out;
}
@@ -2902,7 +2941,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
* chunk tree entries
*/
ret = btrfs_remove_chunk(trans, root, chunk_offset);
- btrfs_end_transaction(trans, root);
+ btrfs_end_transaction(trans, extent_root);
return ret;
}
@@ -3421,7 +3460,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
u64 size_to_free;
u64 chunk_type;
struct btrfs_chunk *chunk;
- struct btrfs_path *path;
+ struct btrfs_path *path = NULL;
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_trans_handle *trans;
@@ -3455,13 +3494,33 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
ret = btrfs_shrink_device(device, old_size - size_to_free);
if (ret == -ENOSPC)
break;
- BUG_ON(ret);
+ if (ret) {
+ /* btrfs_shrink_device never returns ret > 0 */
+ WARN_ON(ret > 0);
+ goto error;
+ }
trans = btrfs_start_transaction(dev_root, 0);
- BUG_ON(IS_ERR(trans));
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ btrfs_info_in_rcu(fs_info,
+ "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu",
+ rcu_str_deref(device->name), ret,
+ old_size, old_size - size_to_free);
+ goto error;
+ }
ret = btrfs_grow_device(trans, device, old_size);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_end_transaction(trans, dev_root);
+ /* btrfs_grow_device never returns ret > 0 */
+ WARN_ON(ret > 0);
+ btrfs_info_in_rcu(fs_info,
+ "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu",
+ rcu_str_deref(device->name), ret,
+ old_size, old_size - size_to_free);
+ goto error;
+ }
btrfs_end_transaction(trans, dev_root);
}
@@ -3885,7 +3944,7 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
}
spin_unlock(&fs_info->balance_lock);
- if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
+ if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
btrfs_info(fs_info, "force skipping balance");
return 0;
}
@@ -4240,7 +4299,7 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
BTRFS_UUID_TREE_OBJECTID);
if (IS_ERR(uuid_root)) {
ret = PTR_ERR(uuid_root);
- btrfs_abort_transaction(trans, tree_root, ret);
+ btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans, tree_root);
return ret;
}
@@ -4514,8 +4573,7 @@ static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
btrfs_set_fs_incompat(info, RAID56);
}
-#define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r) \
- - sizeof(struct btrfs_item) \
+#define BTRFS_MAX_DEVS(r) ((BTRFS_MAX_ITEM_SIZE(r) \
- sizeof(struct btrfs_chunk)) \
/ sizeof(struct btrfs_stripe) + 1)
@@ -6401,7 +6459,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
BTRFS_UUID_SIZE);
map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
uuid, NULL);
- if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
+ if (!map->stripes[i].dev &&
+ !btrfs_test_opt(root->fs_info, DEGRADED)) {
free_extent_map(em);
return -EIO;
}
@@ -6469,7 +6528,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root,
fs_devices = find_fsid(fsid);
if (!fs_devices) {
- if (!btrfs_test_opt(root, DEGRADED))
+ if (!btrfs_test_opt(root->fs_info, DEGRADED))
return ERR_PTR(-ENOENT);
fs_devices = alloc_fs_devices(fsid);
@@ -6531,7 +6590,7 @@ static int read_one_dev(struct btrfs_root *root,
device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
if (!device) {
- if (!btrfs_test_opt(root, DEGRADED))
+ if (!btrfs_test_opt(root->fs_info, DEGRADED))
return -EIO;
device = add_missing_dev(root, fs_devices, devid, dev_uuid);
@@ -6540,7 +6599,7 @@ static int read_one_dev(struct btrfs_root *root,
btrfs_warn(root->fs_info, "devid %llu uuid %pU missing",
devid, dev_uuid);
} else {
- if (!device->bdev && !btrfs_test_opt(root, DEGRADED))
+ if (!device->bdev && !btrfs_test_opt(root->fs_info, DEGRADED))
return -EIO;
if(!device->bdev && !device->missing) {
@@ -7143,38 +7202,3 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
fs_devices = fs_devices->seed;
}
}
-
-static void btrfs_close_one_device(struct btrfs_device *device)
-{
- struct btrfs_fs_devices *fs_devices = device->fs_devices;
- struct btrfs_device *new_device;
- struct rcu_string *name;
-
- if (device->bdev)
- fs_devices->open_devices--;
-
- if (device->writeable &&
- device->devid != BTRFS_DEV_REPLACE_DEVID) {
- list_del_init(&device->dev_alloc_list);
- fs_devices->rw_devices--;
- }
-
- if (device->missing)
- fs_devices->missing_devices--;
-
- new_device = btrfs_alloc_device(NULL, &device->devid,
- device->uuid);
- BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
-
- /* Safe because we are under uuid_mutex */
- if (device->name) {
- name = rcu_string_strdup(device->name->str, GFP_NOFS);
- BUG_ON(!name); /* -ENOMEM */
- rcu_assign_pointer(new_device->name, name);
- }
-
- list_replace_rcu(&device->dev_list, &new_device->dev_list);
- new_device->fs_devices = device->fs_devices;
-
- call_rcu(&device->rcu, free_device);
-}
diff --git a/fs/cachefiles/proc.c b/fs/cachefiles/proc.c
index eccd33941199..125b90f6c796 100644
--- a/fs/cachefiles/proc.c
+++ b/fs/cachefiles/proc.c
@@ -93,7 +93,6 @@ static int cachefiles_histogram_open(struct inode *inode, struct file *file)
}
static const struct file_operations cachefiles_histogram_fops = {
- .owner = THIS_MODULE,
.open = cachefiles_histogram_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 26a9d10d75e9..d5b6f959a3c3 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1730,7 +1730,8 @@ enum {
POOL_WRITE = 2,
};
-static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
+static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
+ s64 pool, struct ceph_string *pool_ns)
{
struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
@@ -1738,6 +1739,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
struct rb_node **p, *parent;
struct ceph_pool_perm *perm;
struct page **pages;
+ size_t pool_ns_len;
int err = 0, err2 = 0, have = 0;
down_read(&mdsc->pool_perm_rwsem);
@@ -1749,17 +1751,31 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
else if (pool > perm->pool)
p = &(*p)->rb_right;
else {
- have = perm->perm;
- break;
+ int ret = ceph_compare_string(pool_ns,
+ perm->pool_ns,
+ perm->pool_ns_len);
+ if (ret < 0)
+ p = &(*p)->rb_left;
+ else if (ret > 0)
+ p = &(*p)->rb_right;
+ else {
+ have = perm->perm;
+ break;
+ }
}
}
up_read(&mdsc->pool_perm_rwsem);
if (*p)
goto out;
- dout("__ceph_pool_perm_get pool %u no perm cached\n", pool);
+ if (pool_ns)
+ dout("__ceph_pool_perm_get pool %lld ns %.*s no perm cached\n",
+ pool, (int)pool_ns->len, pool_ns->str);
+ else
+ dout("__ceph_pool_perm_get pool %lld no perm cached\n", pool);
down_write(&mdsc->pool_perm_rwsem);
+ p = &mdsc->pool_perm_tree.rb_node;
parent = NULL;
while (*p) {
parent = *p;
@@ -1769,8 +1785,17 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
else if (pool > perm->pool)
p = &(*p)->rb_right;
else {
- have = perm->perm;
- break;
+ int ret = ceph_compare_string(pool_ns,
+ perm->pool_ns,
+ perm->pool_ns_len);
+ if (ret < 0)
+ p = &(*p)->rb_left;
+ else if (ret > 0)
+ p = &(*p)->rb_right;
+ else {
+ have = perm->perm;
+ break;
+ }
}
}
if (*p) {
@@ -1788,6 +1813,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
rd_req->r_flags = CEPH_OSD_FLAG_READ;
osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
rd_req->r_base_oloc.pool = pool;
+ if (pool_ns)
+ rd_req->r_base_oloc.pool_ns = ceph_get_string(pool_ns);
ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino);
err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS);
@@ -1841,7 +1868,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
goto out_unlock;
}
- perm = kmalloc(sizeof(*perm), GFP_NOFS);
+ pool_ns_len = pool_ns ? pool_ns->len : 0;
+ perm = kmalloc(sizeof(*perm) + pool_ns_len + 1, GFP_NOFS);
if (!perm) {
err = -ENOMEM;
goto out_unlock;
@@ -1849,6 +1877,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
perm->pool = pool;
perm->perm = have;
+ perm->pool_ns_len = pool_ns_len;
+ if (pool_ns_len > 0)
+ memcpy(perm->pool_ns, pool_ns->str, pool_ns_len);
+ perm->pool_ns[pool_ns_len] = 0;
+
rb_link_node(&perm->node, parent, p);
rb_insert_color(&perm->node, &mdsc->pool_perm_tree);
err = 0;
@@ -1860,43 +1893,46 @@ out_unlock:
out:
if (!err)
err = have;
- dout("__ceph_pool_perm_get pool %u result = %d\n", pool, err);
+ if (pool_ns)
+ dout("__ceph_pool_perm_get pool %lld ns %.*s result = %d\n",
+ pool, (int)pool_ns->len, pool_ns->str, err);
+ else
+ dout("__ceph_pool_perm_get pool %lld result = %d\n", pool, err);
return err;
}
int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
{
- u32 pool;
+ s64 pool;
+ struct ceph_string *pool_ns;
int ret, flags;
- /* does not support pool namespace yet */
- if (ci->i_pool_ns_len)
- return -EIO;
-
if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode),
NOPOOLPERM))
return 0;
spin_lock(&ci->i_ceph_lock);
flags = ci->i_ceph_flags;
- pool = ceph_file_layout_pg_pool(ci->i_layout);
+ pool = ci->i_layout.pool_id;
spin_unlock(&ci->i_ceph_lock);
check:
if (flags & CEPH_I_POOL_PERM) {
if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) {
- dout("ceph_pool_perm_check pool %u no read perm\n",
+ dout("ceph_pool_perm_check pool %lld no read perm\n",
pool);
return -EPERM;
}
if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) {
- dout("ceph_pool_perm_check pool %u no write perm\n",
+ dout("ceph_pool_perm_check pool %lld no write perm\n",
pool);
return -EPERM;
}
return 0;
}
- ret = __ceph_pool_perm_get(ci, pool);
+ pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
+ ret = __ceph_pool_perm_get(ci, pool, pool_ns);
+ ceph_put_string(pool_ns);
if (ret < 0)
return ret;
@@ -1907,10 +1943,11 @@ check:
flags |= CEPH_I_POOL_WR;
spin_lock(&ci->i_ceph_lock);
- if (pool == ceph_file_layout_pg_pool(ci->i_layout)) {
- ci->i_ceph_flags = flags;
+ if (pool == ci->i_layout.pool_id &&
+ pool_ns == rcu_dereference_raw(ci->i_layout.pool_ns)) {
+ ci->i_ceph_flags |= flags;
} else {
- pool = ceph_file_layout_pg_pool(ci->i_layout);
+ pool = ci->i_layout.pool_id;
flags = ci->i_ceph_flags;
}
spin_unlock(&ci->i_ceph_lock);
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 238c55b01723..5bc5d37b1217 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -71,7 +71,7 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
&ceph_fscache_fsid_object_def,
fsc, true);
if (!fsc->fscache)
- pr_err("Unable to resgister fsid: %p fscache cookie", fsc);
+ pr_err("Unable to register fsid: %p fscache cookie\n", fsc);
return 0;
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 6f60d0a3d0f9..99115cae1652 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -40,6 +40,11 @@
* cluster to release server state.
*/
+static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc);
+static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_inode_info *ci,
+ u64 oldest_flush_tid);
/*
* Generate readable cap strings for debugging output.
@@ -849,12 +854,14 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
*/
int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
{
- int want = 0;
- int mode;
- for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++)
- if (ci->i_nr_by_mode[mode])
- want |= ceph_caps_for_mode(mode);
- return want;
+ int i, bits = 0;
+ for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
+ if (ci->i_nr_by_mode[i])
+ bits |= 1 << i;
+ }
+ if (bits == 0)
+ return 0;
+ return ceph_caps_for_mode(bits >> 1);
}
/*
@@ -991,7 +998,7 @@ static int send_cap_msg(struct ceph_mds_session *session,
u32 seq, u64 flush_tid, u64 oldest_flush_tid,
u32 issue_seq, u32 mseq, u64 size, u64 max_size,
struct timespec *mtime, struct timespec *atime,
- struct timespec *ctime, u64 time_warp_seq,
+ struct timespec *ctime, u32 time_warp_seq,
kuid_t uid, kgid_t gid, umode_t mode,
u64 xattr_version,
struct ceph_buffer *xattrs_buf,
@@ -1116,8 +1123,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
struct inode *inode = &ci->vfs_inode;
u64 cap_id = cap->cap_id;
int held, revoking, dropping, keep;
- u64 seq, issue_seq, mseq, time_warp_seq, follows;
- u64 size, max_size;
+ u64 follows, size, max_size;
+ u32 seq, issue_seq, mseq, time_warp_seq;
struct timespec mtime, atime, ctime;
int wake = 0;
umode_t mode;
@@ -1215,6 +1222,22 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
return delayed;
}
+static inline int __send_flush_snap(struct inode *inode,
+ struct ceph_mds_session *session,
+ struct ceph_cap_snap *capsnap,
+ u32 mseq, u64 oldest_flush_tid)
+{
+ return send_cap_msg(session, ceph_vino(inode).ino, 0,
+ CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
+ capsnap->dirty, 0, capsnap->cap_flush.tid,
+ oldest_flush_tid, 0, mseq, capsnap->size, 0,
+ &capsnap->mtime, &capsnap->atime,
+ &capsnap->ctime, capsnap->time_warp_seq,
+ capsnap->uid, capsnap->gid, capsnap->mode,
+ capsnap->xattr_version, capsnap->xattr_blob,
+ capsnap->follows, capsnap->inline_data);
+}
+
/*
* When a snapshot is taken, clients accumulate dirty metadata on
* inodes with capabilities in ceph_cap_snaps to describe the file
@@ -1222,37 +1245,22 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
* asynchronously back to the MDS once sync writes complete and dirty
* data is written out.
*
- * Unless @kick is true, skip cap_snaps that were already sent to
- * the MDS (i.e., during this session).
- *
* Called under i_ceph_lock. Takes s_mutex as needed.
*/
-void __ceph_flush_snaps(struct ceph_inode_info *ci,
- struct ceph_mds_session **psession,
- int kick)
+static void __ceph_flush_snaps(struct ceph_inode_info *ci,
+ struct ceph_mds_session *session)
__releases(ci->i_ceph_lock)
__acquires(ci->i_ceph_lock)
{
struct inode *inode = &ci->vfs_inode;
- int mds;
+ struct ceph_mds_client *mdsc = session->s_mdsc;
struct ceph_cap_snap *capsnap;
- u32 mseq;
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
- struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
- session->s_mutex */
- u64 next_follows = 0; /* keep track of how far we've gotten through the
- i_cap_snaps list, and skip these entries next time
- around to avoid an infinite loop */
+ u64 oldest_flush_tid = 0;
+ u64 first_tid = 1, last_tid = 0;
- if (psession)
- session = *psession;
+ dout("__flush_snaps %p session %p\n", inode, session);
- dout("__flush_snaps %p\n", inode);
-retry:
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
- /* avoid an infiniute loop after retry */
- if (capsnap->follows < next_follows)
- continue;
/*
* we need to wait for sync writes to complete and for dirty
* pages to be written out.
@@ -1263,97 +1271,129 @@ retry:
/* should be removed by ceph_try_drop_cap_snap() */
BUG_ON(!capsnap->need_flush);
- /* pick mds, take s_mutex */
- if (ci->i_auth_cap == NULL) {
- dout("no auth cap (migrating?), doing nothing\n");
- goto out;
- }
-
/* only flush each capsnap once */
- if (!kick && !list_empty(&capsnap->flushing_item)) {
- dout("already flushed %p, skipping\n", capsnap);
+ if (capsnap->cap_flush.tid > 0) {
+ dout(" already flushed %p, skipping\n", capsnap);
continue;
}
- mds = ci->i_auth_cap->session->s_mds;
- mseq = ci->i_auth_cap->mseq;
+ spin_lock(&mdsc->cap_dirty_lock);
+ capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid;
+ list_add_tail(&capsnap->cap_flush.g_list,
+ &mdsc->cap_flush_list);
+ if (oldest_flush_tid == 0)
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ if (list_empty(&ci->i_flushing_item)) {
+ list_add_tail(&ci->i_flushing_item,
+ &session->s_cap_flushing);
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+
+ list_add_tail(&capsnap->cap_flush.i_list,
+ &ci->i_cap_flush_list);
- if (session && session->s_mds != mds) {
- dout("oops, wrong session %p mutex\n", session);
- if (kick)
- goto out;
+ if (first_tid == 1)
+ first_tid = capsnap->cap_flush.tid;
+ last_tid = capsnap->cap_flush.tid;
+ }
- mutex_unlock(&session->s_mutex);
- ceph_put_mds_session(session);
- session = NULL;
+ ci->i_ceph_flags &= ~CEPH_I_FLUSH_SNAPS;
+
+ while (first_tid <= last_tid) {
+ struct ceph_cap *cap = ci->i_auth_cap;
+ struct ceph_cap_flush *cf;
+ int ret;
+
+ if (!(cap && cap->session == session)) {
+ dout("__flush_snaps %p auth cap %p not mds%d, "
+ "stop\n", inode, cap, session->s_mds);
+ break;
}
- if (!session) {
- spin_unlock(&ci->i_ceph_lock);
- mutex_lock(&mdsc->mutex);
- session = __ceph_lookup_mds_session(mdsc, mds);
- mutex_unlock(&mdsc->mutex);
- if (session) {
- dout("inverting session/ino locks on %p\n",
- session);
- mutex_lock(&session->s_mutex);
+
+ ret = -ENOENT;
+ list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
+ if (cf->tid >= first_tid) {
+ ret = 0;
+ break;
}
- /*
- * if session == NULL, we raced against a cap
- * deletion or migration. retry, and we'll
- * get a better @mds value next time.
- */
- spin_lock(&ci->i_ceph_lock);
- goto retry;
}
+ if (ret < 0)
+ break;
- spin_lock(&mdsc->cap_dirty_lock);
- capsnap->flush_tid = ++mdsc->last_cap_flush_tid;
- spin_unlock(&mdsc->cap_dirty_lock);
+ first_tid = cf->tid + 1;
+ capsnap = container_of(cf, struct ceph_cap_snap, cap_flush);
atomic_inc(&capsnap->nref);
- if (list_empty(&capsnap->flushing_item))
- list_add_tail(&capsnap->flushing_item,
- &session->s_cap_snaps_flushing);
spin_unlock(&ci->i_ceph_lock);
- dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
- inode, capsnap, capsnap->follows, capsnap->flush_tid);
- send_cap_msg(session, ceph_vino(inode).ino, 0,
- CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
- capsnap->dirty, 0, capsnap->flush_tid, 0,
- 0, mseq, capsnap->size, 0,
- &capsnap->mtime, &capsnap->atime,
- &capsnap->ctime, capsnap->time_warp_seq,
- capsnap->uid, capsnap->gid, capsnap->mode,
- capsnap->xattr_version, capsnap->xattr_blob,
- capsnap->follows, capsnap->inline_data);
-
- next_follows = capsnap->follows + 1;
- ceph_put_cap_snap(capsnap);
+ dout("__flush_snaps %p capsnap %p tid %llu %s\n",
+ inode, capsnap, cf->tid, ceph_cap_string(capsnap->dirty));
+
+ ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
+ oldest_flush_tid);
+ if (ret < 0) {
+ pr_err("__flush_snaps: error sending cap flushsnap, "
+ "ino (%llx.%llx) tid %llu follows %llu\n",
+ ceph_vinop(inode), cf->tid, capsnap->follows);
+ }
+ ceph_put_cap_snap(capsnap);
spin_lock(&ci->i_ceph_lock);
- goto retry;
}
+}
- /* we flushed them all; remove this inode from the queue */
- spin_lock(&mdsc->snap_flush_lock);
- list_del_init(&ci->i_snap_flush_item);
- spin_unlock(&mdsc->snap_flush_lock);
+void ceph_flush_snaps(struct ceph_inode_info *ci,
+ struct ceph_mds_session **psession)
+{
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_session *session = *psession;
+ int mds;
+ dout("ceph_flush_snaps %p\n", inode);
+retry:
+ spin_lock(&ci->i_ceph_lock);
+ if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) {
+ dout(" no capsnap needs flush, doing nothing\n");
+ goto out;
+ }
+ if (!ci->i_auth_cap) {
+ dout(" no auth cap (migrating?), doing nothing\n");
+ goto out;
+ }
-out:
- if (psession)
- *psession = session;
- else if (session) {
+ mds = ci->i_auth_cap->session->s_mds;
+ if (session && session->s_mds != mds) {
+ dout(" oops, wrong session %p mutex\n", session);
mutex_unlock(&session->s_mutex);
ceph_put_mds_session(session);
+ session = NULL;
+ }
+ if (!session) {
+ spin_unlock(&ci->i_ceph_lock);
+ mutex_lock(&mdsc->mutex);
+ session = __ceph_lookup_mds_session(mdsc, mds);
+ mutex_unlock(&mdsc->mutex);
+ if (session) {
+ dout(" inverting session/ino locks on %p\n", session);
+ mutex_lock(&session->s_mutex);
+ }
+ goto retry;
}
-}
-static void ceph_flush_snaps(struct ceph_inode_info *ci)
-{
- spin_lock(&ci->i_ceph_lock);
- __ceph_flush_snaps(ci, NULL, 0);
+ __ceph_flush_snaps(ci, session);
+out:
spin_unlock(&ci->i_ceph_lock);
+
+ if (psession) {
+ *psession = session;
+ } else {
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ }
+ /* we flushed them all; remove this inode from the queue */
+ spin_lock(&mdsc->snap_flush_lock);
+ list_del_init(&ci->i_snap_flush_item);
+ spin_unlock(&mdsc->snap_flush_lock);
}
/*
@@ -1411,52 +1451,6 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
return dirty;
}
-static void __add_cap_flushing_to_inode(struct ceph_inode_info *ci,
- struct ceph_cap_flush *cf)
-{
- struct rb_node **p = &ci->i_cap_flush_tree.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_cap_flush *other = NULL;
-
- while (*p) {
- parent = *p;
- other = rb_entry(parent, struct ceph_cap_flush, i_node);
-
- if (cf->tid < other->tid)
- p = &(*p)->rb_left;
- else if (cf->tid > other->tid)
- p = &(*p)->rb_right;
- else
- BUG();
- }
-
- rb_link_node(&cf->i_node, parent, p);
- rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree);
-}
-
-static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc,
- struct ceph_cap_flush *cf)
-{
- struct rb_node **p = &mdsc->cap_flush_tree.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_cap_flush *other = NULL;
-
- while (*p) {
- parent = *p;
- other = rb_entry(parent, struct ceph_cap_flush, g_node);
-
- if (cf->tid < other->tid)
- p = &(*p)->rb_left;
- else if (cf->tid > other->tid)
- p = &(*p)->rb_right;
- else
- BUG();
- }
-
- rb_link_node(&cf->g_node, parent, p);
- rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree);
-}
-
struct ceph_cap_flush *ceph_alloc_cap_flush(void)
{
return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
@@ -1470,23 +1464,54 @@ void ceph_free_cap_flush(struct ceph_cap_flush *cf)
static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
{
- struct rb_node *n = rb_first(&mdsc->cap_flush_tree);
- if (n) {
+ if (!list_empty(&mdsc->cap_flush_list)) {
struct ceph_cap_flush *cf =
- rb_entry(n, struct ceph_cap_flush, g_node);
+ list_first_entry(&mdsc->cap_flush_list,
+ struct ceph_cap_flush, g_list);
return cf->tid;
}
return 0;
}
/*
+ * Remove cap_flush from the mdsc's or inode's flushing cap list.
+ * Return true if caller needs to wake up flush waiters.
+ */
+static bool __finish_cap_flush(struct ceph_mds_client *mdsc,
+ struct ceph_inode_info *ci,
+ struct ceph_cap_flush *cf)
+{
+ struct ceph_cap_flush *prev;
+ bool wake = cf->wake;
+ if (mdsc) {
+ /* are there older pending cap flushes? */
+ if (wake && cf->g_list.prev != &mdsc->cap_flush_list) {
+ prev = list_prev_entry(cf, g_list);
+ prev->wake = true;
+ wake = false;
+ }
+ list_del(&cf->g_list);
+ } else if (ci) {
+ if (wake && cf->i_list.prev != &ci->i_cap_flush_list) {
+ prev = list_prev_entry(cf, i_list);
+ prev->wake = true;
+ wake = false;
+ }
+ list_del(&cf->i_list);
+ } else {
+ BUG_ON(1);
+ }
+ return wake;
+}
+
+/*
* Add dirty inode to the flushing list. Assigned a seq number so we
* can wait for caps to flush without starving.
*
* Called under i_ceph_lock.
*/
static int __mark_caps_flushing(struct inode *inode,
- struct ceph_mds_session *session,
+ struct ceph_mds_session *session, bool wake,
u64 *flush_tid, u64 *oldest_flush_tid)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
@@ -1509,26 +1534,22 @@ static int __mark_caps_flushing(struct inode *inode,
swap(cf, ci->i_prealloc_cap_flush);
cf->caps = flushing;
+ cf->wake = wake;
spin_lock(&mdsc->cap_dirty_lock);
list_del_init(&ci->i_dirty_item);
cf->tid = ++mdsc->last_cap_flush_tid;
- __add_cap_flushing_to_mdsc(mdsc, cf);
+ list_add_tail(&cf->g_list, &mdsc->cap_flush_list);
*oldest_flush_tid = __get_oldest_flush_tid(mdsc);
if (list_empty(&ci->i_flushing_item)) {
list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
mdsc->num_cap_flushing++;
- dout(" inode %p now flushing tid %llu\n", inode, cf->tid);
- } else {
- list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
- dout(" inode %p now flushing (more) tid %llu\n",
- inode, cf->tid);
}
spin_unlock(&mdsc->cap_dirty_lock);
- __add_cap_flushing_to_inode(ci, cf);
+ list_add_tail(&cf->i_list, &ci->i_cap_flush_list);
*flush_tid = cf->tid;
return flushing;
@@ -1583,10 +1604,11 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
int mds = -1; /* keep track of how far we've gone through i_caps list
to avoid an infinite loop on retry */
struct rb_node *p;
- int tried_invalidate = 0;
- int delayed = 0, sent = 0, force_requeue = 0, num;
- int queue_invalidate = 0;
- int is_delayed = flags & CHECK_CAPS_NODELAY;
+ int delayed = 0, sent = 0, num;
+ bool is_delayed = flags & CHECK_CAPS_NODELAY;
+ bool queue_invalidate = false;
+ bool force_requeue = false;
+ bool tried_invalidate = false;
/* if we are unmounting, flush any unused caps immediately. */
if (mdsc->stopping)
@@ -1597,9 +1619,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
if (ci->i_ceph_flags & CEPH_I_FLUSH)
flags |= CHECK_CAPS_FLUSH;
- /* flush snaps first time around only */
- if (!list_empty(&ci->i_cap_snaps))
- __ceph_flush_snaps(ci, &session, 0);
goto retry_locked;
retry:
spin_lock(&ci->i_ceph_lock);
@@ -1666,17 +1685,17 @@ retry_locked:
if (revoking & (CEPH_CAP_FILE_CACHE|
CEPH_CAP_FILE_LAZYIO)) {
dout("check_caps queuing invalidate\n");
- queue_invalidate = 1;
+ queue_invalidate = true;
ci->i_rdcache_revoking = ci->i_rdcache_gen;
} else {
dout("check_caps failed to invalidate pages\n");
/* we failed to invalidate pages. check these
caps again later. */
- force_requeue = 1;
+ force_requeue = true;
__cap_set_timeouts(mdsc, ci);
}
}
- tried_invalidate = 1;
+ tried_invalidate = true;
goto retry_locked;
}
@@ -1720,10 +1739,15 @@ retry_locked:
}
}
/* flush anything dirty? */
- if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
- ci->i_dirty_caps) {
- dout("flushing dirty caps\n");
- goto ack;
+ if (cap == ci->i_auth_cap) {
+ if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) {
+ dout("flushing dirty caps\n");
+ goto ack;
+ }
+ if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) {
+ dout("flushing snap caps\n");
+ goto ack;
+ }
}
/* completed revocation? going down and there are no caps? */
@@ -1782,6 +1806,26 @@ ack:
goto retry;
}
}
+
+ /* kick flushing and flush snaps before sending normal
+ * cap message */
+ if (cap == ci->i_auth_cap &&
+ (ci->i_ceph_flags &
+ (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) {
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
+ spin_lock(&mdsc->cap_dirty_lock);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ spin_unlock(&mdsc->cap_dirty_lock);
+ __kick_flushing_caps(mdsc, session, ci,
+ oldest_flush_tid);
+ ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+ }
+ if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
+ __ceph_flush_snaps(ci, session);
+
+ goto retry_locked;
+ }
+
/* take snap_rwsem after session mutex */
if (!took_snap_rwsem) {
if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
@@ -1796,7 +1840,7 @@ ack:
}
if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
- flushing = __mark_caps_flushing(inode, session,
+ flushing = __mark_caps_flushing(inode, session, false,
&flush_tid,
&oldest_flush_tid);
} else {
@@ -1822,7 +1866,7 @@ ack:
* otherwise cancel.
*/
if (delayed && is_delayed)
- force_requeue = 1; /* __send_cap delayed release; requeue */
+ force_requeue = true; /* __send_cap delayed release; requeue */
if (!delayed && !is_delayed)
__cap_delay_cancel(mdsc, ci);
else if (!is_delayed || force_requeue)
@@ -1873,8 +1917,8 @@ retry:
if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
goto out;
- flushing = __mark_caps_flushing(inode, session, &flush_tid,
- &oldest_flush_tid);
+ flushing = __mark_caps_flushing(inode, session, true,
+ &flush_tid, &oldest_flush_tid);
/* __send_cap drops i_ceph_lock */
delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
@@ -1887,10 +1931,11 @@ retry:
spin_unlock(&ci->i_ceph_lock);
}
} else {
- struct rb_node *n = rb_last(&ci->i_cap_flush_tree);
- if (n) {
+ if (!list_empty(&ci->i_cap_flush_list)) {
struct ceph_cap_flush *cf =
- rb_entry(n, struct ceph_cap_flush, i_node);
+ list_last_entry(&ci->i_cap_flush_list,
+ struct ceph_cap_flush, i_list);
+ cf->wake = true;
flush_tid = cf->tid;
}
flushing = ci->i_flushing_caps;
@@ -1910,14 +1955,13 @@ out:
static int caps_are_flushed(struct inode *inode, u64 flush_tid)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_cap_flush *cf;
- struct rb_node *n;
int ret = 1;
spin_lock(&ci->i_ceph_lock);
- n = rb_first(&ci->i_cap_flush_tree);
- if (n) {
- cf = rb_entry(n, struct ceph_cap_flush, i_node);
+ if (!list_empty(&ci->i_cap_flush_list)) {
+ struct ceph_cap_flush * cf =
+ list_first_entry(&ci->i_cap_flush_list,
+ struct ceph_cap_flush, i_list);
if (cf->tid <= flush_tid)
ret = 0;
}
@@ -1926,53 +1970,6 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
}
/*
- * Wait on any unsafe replies for the given inode. First wait on the
- * newest request, and make that the upper bound. Then, if there are
- * more requests, keep waiting on the oldest as long as it is still older
- * than the original request.
- */
-static void sync_write_wait(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct list_head *head = &ci->i_unsafe_writes;
- struct ceph_osd_request *req;
- u64 last_tid;
-
- if (!S_ISREG(inode->i_mode))
- return;
-
- spin_lock(&ci->i_unsafe_lock);
- if (list_empty(head))
- goto out;
-
- /* set upper bound as _last_ entry in chain */
- req = list_last_entry(head, struct ceph_osd_request,
- r_unsafe_item);
- last_tid = req->r_tid;
-
- do {
- ceph_osdc_get_request(req);
- spin_unlock(&ci->i_unsafe_lock);
- dout("sync_write_wait on tid %llu (until %llu)\n",
- req->r_tid, last_tid);
- wait_for_completion(&req->r_safe_completion);
- spin_lock(&ci->i_unsafe_lock);
- ceph_osdc_put_request(req);
-
- /*
- * from here on look at first entry in chain, since we
- * only want to wait for anything older than last_tid
- */
- if (list_empty(head))
- break;
- req = list_first_entry(head, struct ceph_osd_request,
- r_unsafe_item);
- } while (req->r_tid < last_tid);
-out:
- spin_unlock(&ci->i_unsafe_lock);
-}
-
-/*
* wait for any unsafe requests to complete.
*/
static int unsafe_request_wait(struct inode *inode)
@@ -2024,7 +2021,8 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
int dirty;
dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
- sync_write_wait(inode);
+
+ ceph_sync_write_wait(inode);
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret < 0)
@@ -2087,87 +2085,74 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
return err;
}
-/*
- * After a recovering MDS goes active, we need to resend any caps
- * we were flushing.
- *
- * Caller holds session->s_mutex.
- */
-static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
-{
- struct ceph_cap_snap *capsnap;
-
- dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
- list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
- flushing_item) {
- struct ceph_inode_info *ci = capsnap->ci;
- struct inode *inode = &ci->vfs_inode;
- struct ceph_cap *cap;
-
- spin_lock(&ci->i_ceph_lock);
- cap = ci->i_auth_cap;
- if (cap && cap->session == session) {
- dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
- cap, capsnap);
- __ceph_flush_snaps(ci, &session, 1);
- } else {
- pr_err("%p auth cap %p not mds%d ???\n", inode,
- cap, session->s_mds);
- }
- spin_unlock(&ci->i_ceph_lock);
- }
-}
-
-static int __kick_flushing_caps(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session,
- struct ceph_inode_info *ci)
+static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_inode_info *ci,
+ u64 oldest_flush_tid)
+ __releases(ci->i_ceph_lock)
+ __acquires(ci->i_ceph_lock)
{
struct inode *inode = &ci->vfs_inode;
struct ceph_cap *cap;
struct ceph_cap_flush *cf;
- struct rb_node *n;
- int delayed = 0;
+ int ret;
u64 first_tid = 0;
- u64 oldest_flush_tid;
- spin_lock(&mdsc->cap_dirty_lock);
- oldest_flush_tid = __get_oldest_flush_tid(mdsc);
- spin_unlock(&mdsc->cap_dirty_lock);
+ list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
+ if (cf->tid < first_tid)
+ continue;
- while (true) {
- spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
if (!(cap && cap->session == session)) {
- pr_err("%p auth cap %p not mds%d ???\n", inode,
- cap, session->s_mds);
- spin_unlock(&ci->i_ceph_lock);
+ pr_err("%p auth cap %p not mds%d ???\n",
+ inode, cap, session->s_mds);
break;
}
- for (n = rb_first(&ci->i_cap_flush_tree); n; n = rb_next(n)) {
- cf = rb_entry(n, struct ceph_cap_flush, i_node);
- if (cf->tid >= first_tid)
- break;
- }
- if (!n) {
+ first_tid = cf->tid + 1;
+
+ if (cf->caps) {
+ dout("kick_flushing_caps %p cap %p tid %llu %s\n",
+ inode, cap, cf->tid, ceph_cap_string(cf->caps));
+ ci->i_ceph_flags |= CEPH_I_NODELAY;
+ ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+ __ceph_caps_used(ci),
+ __ceph_caps_wanted(ci),
+ cap->issued | cap->implemented,
+ cf->caps, cf->tid, oldest_flush_tid);
+ if (ret) {
+ pr_err("kick_flushing_caps: error sending "
+ "cap flush, ino (%llx.%llx) "
+ "tid %llu flushing %s\n",
+ ceph_vinop(inode), cf->tid,
+ ceph_cap_string(cf->caps));
+ }
+ } else {
+ struct ceph_cap_snap *capsnap =
+ container_of(cf, struct ceph_cap_snap,
+ cap_flush);
+ dout("kick_flushing_caps %p capsnap %p tid %llu %s\n",
+ inode, capsnap, cf->tid,
+ ceph_cap_string(capsnap->dirty));
+
+ atomic_inc(&capsnap->nref);
spin_unlock(&ci->i_ceph_lock);
- break;
- }
- cf = rb_entry(n, struct ceph_cap_flush, i_node);
+ ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
+ oldest_flush_tid);
+ if (ret < 0) {
+ pr_err("kick_flushing_caps: error sending "
+ "cap flushsnap, ino (%llx.%llx) "
+ "tid %llu follows %llu\n",
+ ceph_vinop(inode), cf->tid,
+ capsnap->follows);
+ }
- first_tid = cf->tid + 1;
+ ceph_put_cap_snap(capsnap);
+ }
- dout("kick_flushing_caps %p cap %p tid %llu %s\n", inode,
- cap, cf->tid, ceph_cap_string(cf->caps));
- delayed |= __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
- __ceph_caps_used(ci),
- __ceph_caps_wanted(ci),
- cap->issued | cap->implemented,
- cf->caps, cf->tid, oldest_flush_tid);
+ spin_lock(&ci->i_ceph_lock);
}
- return delayed;
}
void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
@@ -2175,8 +2160,14 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
{
struct ceph_inode_info *ci;
struct ceph_cap *cap;
+ u64 oldest_flush_tid;
dout("early_kick_flushing_caps mds%d\n", session->s_mds);
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ spin_unlock(&mdsc->cap_dirty_lock);
+
list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
@@ -2196,10 +2187,11 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
*/
if ((cap->issued & ci->i_flushing_caps) !=
ci->i_flushing_caps) {
- spin_unlock(&ci->i_ceph_lock);
- if (!__kick_flushing_caps(mdsc, session, ci))
- continue;
- spin_lock(&ci->i_ceph_lock);
+ ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+ __kick_flushing_caps(mdsc, session, ci,
+ oldest_flush_tid);
+ } else {
+ ci->i_ceph_flags |= CEPH_I_KICK_FLUSH;
}
spin_unlock(&ci->i_ceph_lock);
@@ -2210,50 +2202,56 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
struct ceph_inode_info *ci;
-
- kick_flushing_capsnaps(mdsc, session);
+ struct ceph_cap *cap;
+ u64 oldest_flush_tid;
dout("kick_flushing_caps mds%d\n", session->s_mds);
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ spin_unlock(&mdsc->cap_dirty_lock);
+
list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
- int delayed = __kick_flushing_caps(mdsc, session, ci);
- if (delayed) {
- spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci);
+ spin_lock(&ci->i_ceph_lock);
+ cap = ci->i_auth_cap;
+ if (!(cap && cap->session == session)) {
+ pr_err("%p auth cap %p not mds%d ???\n",
+ &ci->vfs_inode, cap, session->s_mds);
spin_unlock(&ci->i_ceph_lock);
+ continue;
+ }
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
+ ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+ __kick_flushing_caps(mdsc, session, ci,
+ oldest_flush_tid);
}
+ spin_unlock(&ci->i_ceph_lock);
}
}
static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct inode *inode)
+ __releases(ci->i_ceph_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_cap *cap;
- spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
dout("kick_flushing_inode_caps %p flushing %s\n", inode,
ceph_cap_string(ci->i_flushing_caps));
- __ceph_flush_snaps(ci, &session, 1);
-
- if (ci->i_flushing_caps) {
- int delayed;
-
+ if (!list_empty(&ci->i_cap_flush_list)) {
+ u64 oldest_flush_tid;
spin_lock(&mdsc->cap_dirty_lock);
list_move_tail(&ci->i_flushing_item,
&cap->session->s_cap_flushing);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
spin_unlock(&mdsc->cap_dirty_lock);
+ ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+ __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
spin_unlock(&ci->i_ceph_lock);
-
- delayed = __kick_flushing_caps(mdsc, session, ci);
- if (delayed) {
- spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci);
- spin_unlock(&ci->i_ceph_lock);
- }
} else {
spin_unlock(&ci->i_ceph_lock);
}
@@ -2580,16 +2578,19 @@ void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
* drop cap_snap that is not associated with any snapshot.
* we don't need to send FLUSHSNAP message for it.
*/
-static int ceph_try_drop_cap_snap(struct ceph_cap_snap *capsnap)
+static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci,
+ struct ceph_cap_snap *capsnap)
{
if (!capsnap->need_flush &&
!capsnap->writing && !capsnap->dirty_pages) {
-
dout("dropping cap_snap %p follows %llu\n",
capsnap, capsnap->follows);
+ BUG_ON(capsnap->cap_flush.tid > 0);
ceph_put_snap_context(capsnap->context);
+ if (!list_is_last(&capsnap->ci_item, &ci->i_cap_snaps))
+ ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
+
list_del(&capsnap->ci_item);
- list_del(&capsnap->flushing_item);
ceph_put_cap_snap(capsnap);
return 1;
}
@@ -2636,7 +2637,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
struct ceph_cap_snap,
ci_item);
capsnap->writing = 0;
- if (ceph_try_drop_cap_snap(capsnap))
+ if (ceph_try_drop_cap_snap(ci, capsnap))
put++;
else if (__ceph_finish_cap_snap(ci, capsnap))
flushsnaps = 1;
@@ -2661,7 +2662,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
if (last && !flushsnaps)
ceph_check_caps(ci, 0, NULL);
else if (flushsnaps)
- ceph_flush_snaps(ci);
+ ceph_flush_snaps(ci, NULL);
if (wake)
wake_up_all(&ci->i_cap_wq);
while (put-- > 0)
@@ -2679,15 +2680,19 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
struct ceph_snap_context *snapc)
{
struct inode *inode = &ci->vfs_inode;
- int last = 0;
- int complete_capsnap = 0;
- int drop_capsnap = 0;
- int found = 0;
struct ceph_cap_snap *capsnap = NULL;
+ int put = 0;
+ bool last = false;
+ bool found = false;
+ bool flush_snaps = false;
+ bool complete_capsnap = false;
spin_lock(&ci->i_ceph_lock);
ci->i_wrbuffer_ref -= nr;
- last = !ci->i_wrbuffer_ref;
+ if (ci->i_wrbuffer_ref == 0) {
+ last = true;
+ put++;
+ }
if (ci->i_head_snapc == snapc) {
ci->i_wrbuffer_ref_head -= nr;
@@ -2707,15 +2712,22 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
} else {
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
if (capsnap->context == snapc) {
- found = 1;
+ found = true;
break;
}
}
BUG_ON(!found);
capsnap->dirty_pages -= nr;
if (capsnap->dirty_pages == 0) {
- complete_capsnap = 1;
- drop_capsnap = ceph_try_drop_cap_snap(capsnap);
+ complete_capsnap = true;
+ if (!capsnap->writing) {
+ if (ceph_try_drop_cap_snap(ci, capsnap)) {
+ put++;
+ } else {
+ ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
+ flush_snaps = true;
+ }
+ }
}
dout("put_wrbuffer_cap_refs on %p cap_snap %p "
" snap %lld %d/%d -> %d/%d %s%s\n",
@@ -2730,12 +2742,12 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
if (last) {
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
- iput(inode);
- } else if (complete_capsnap) {
- ceph_flush_snaps(ci);
- wake_up_all(&ci->i_cap_wq);
+ } else if (flush_snaps) {
+ ceph_flush_snaps(ci, NULL);
}
- if (drop_capsnap)
+ if (complete_capsnap)
+ wake_up_all(&ci->i_cap_wq);
+ while (put-- > 0)
iput(inode);
}
@@ -2779,12 +2791,11 @@ static void invalidate_aliases(struct inode *inode)
*/
static void handle_cap_grant(struct ceph_mds_client *mdsc,
struct inode *inode, struct ceph_mds_caps *grant,
- u64 inline_version,
- void *inline_data, int inline_len,
+ struct ceph_string **pns, u64 inline_version,
+ void *inline_data, u32 inline_len,
struct ceph_buffer *xattr_buf,
struct ceph_mds_session *session,
- struct ceph_cap *cap, int issued,
- u32 pool_ns_len)
+ struct ceph_cap *cap, int issued)
__releases(ci->i_ceph_lock)
__releases(mdsc->snap_rwsem)
{
@@ -2895,8 +2906,18 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
/* file layout may have changed */
- ci->i_layout = grant->layout;
- ci->i_pool_ns_len = pool_ns_len;
+ s64 old_pool = ci->i_layout.pool_id;
+ struct ceph_string *old_ns;
+
+ ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout);
+ old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
+ lockdep_is_held(&ci->i_ceph_lock));
+ rcu_assign_pointer(ci->i_layout.pool_ns, *pns);
+
+ if (ci->i_layout.pool_id != old_pool || *pns != old_ns)
+ ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
+
+ *pns = old_ns;
/* size/truncate_seq? */
queue_trunc = ceph_fill_file_size(inode, issued,
@@ -2979,13 +3000,13 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
fill_inline = true;
}
- spin_unlock(&ci->i_ceph_lock);
-
if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
- kick_flushing_inode_caps(mdsc, session, inode);
- up_read(&mdsc->snap_rwsem);
if (newcaps & ~issued)
wake = true;
+ kick_flushing_inode_caps(mdsc, session, inode);
+ up_read(&mdsc->snap_rwsem);
+ } else {
+ spin_unlock(&ci->i_ceph_lock);
}
if (fill_inline)
@@ -3029,23 +3050,24 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
- struct ceph_cap_flush *cf;
- struct rb_node *n;
+ struct ceph_cap_flush *cf, *tmp_cf;
LIST_HEAD(to_remove);
unsigned seq = le32_to_cpu(m->seq);
int dirty = le32_to_cpu(m->dirty);
int cleaned = 0;
- int drop = 0;
+ bool drop = false;
+ bool wake_ci = 0;
+ bool wake_mdsc = 0;
- n = rb_first(&ci->i_cap_flush_tree);
- while (n) {
- cf = rb_entry(n, struct ceph_cap_flush, i_node);
- n = rb_next(&cf->i_node);
+ list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) {
if (cf->tid == flush_tid)
cleaned = cf->caps;
+ if (cf->caps == 0) /* capsnap */
+ continue;
if (cf->tid <= flush_tid) {
- rb_erase(&cf->i_node, &ci->i_cap_flush_tree);
- list_add_tail(&cf->list, &to_remove);
+ if (__finish_cap_flush(NULL, ci, cf))
+ wake_ci = true;
+ list_add_tail(&cf->i_list, &to_remove);
} else {
cleaned &= ~cf->caps;
if (!cleaned)
@@ -3066,31 +3088,29 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
spin_lock(&mdsc->cap_dirty_lock);
- if (!list_empty(&to_remove)) {
- list_for_each_entry(cf, &to_remove, list)
- rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
-
- n = rb_first(&mdsc->cap_flush_tree);
- cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL;
- if (!cf || cf->tid > flush_tid)
- wake_up_all(&mdsc->cap_flushing_wq);
+ list_for_each_entry(cf, &to_remove, i_list) {
+ if (__finish_cap_flush(mdsc, NULL, cf))
+ wake_mdsc = true;
}
if (ci->i_flushing_caps == 0) {
- list_del_init(&ci->i_flushing_item);
- if (!list_empty(&session->s_cap_flushing))
- dout(" mds%d still flushing cap on %p\n",
- session->s_mds,
- &list_entry(session->s_cap_flushing.next,
- struct ceph_inode_info,
- i_flushing_item)->vfs_inode);
+ if (list_empty(&ci->i_cap_flush_list)) {
+ list_del_init(&ci->i_flushing_item);
+ if (!list_empty(&session->s_cap_flushing)) {
+ dout(" mds%d still flushing cap on %p\n",
+ session->s_mds,
+ &list_first_entry(&session->s_cap_flushing,
+ struct ceph_inode_info,
+ i_flushing_item)->vfs_inode);
+ }
+ }
mdsc->num_cap_flushing--;
dout(" inode %p now !flushing\n", inode);
if (ci->i_dirty_caps == 0) {
dout(" inode %p now clean\n", inode);
BUG_ON(!list_empty(&ci->i_dirty_item));
- drop = 1;
+ drop = true;
if (ci->i_wr_ref == 0 &&
ci->i_wrbuffer_ref_head == 0) {
BUG_ON(!ci->i_head_snapc);
@@ -3102,17 +3122,21 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
}
}
spin_unlock(&mdsc->cap_dirty_lock);
- wake_up_all(&ci->i_cap_wq);
out:
spin_unlock(&ci->i_ceph_lock);
while (!list_empty(&to_remove)) {
cf = list_first_entry(&to_remove,
- struct ceph_cap_flush, list);
- list_del(&cf->list);
+ struct ceph_cap_flush, i_list);
+ list_del(&cf->i_list);
ceph_free_cap_flush(cf);
}
+
+ if (wake_ci)
+ wake_up_all(&ci->i_cap_wq);
+ if (wake_mdsc)
+ wake_up_all(&mdsc->cap_flushing_wq);
if (drop)
iput(inode);
}
@@ -3131,7 +3155,9 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
u64 follows = le64_to_cpu(m->snap_follows);
struct ceph_cap_snap *capsnap;
- int drop = 0;
+ bool flushed = false;
+ bool wake_ci = false;
+ bool wake_mdsc = false;
dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
inode, ci, session->s_mds, follows);
@@ -3139,30 +3165,47 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
spin_lock(&ci->i_ceph_lock);
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
if (capsnap->follows == follows) {
- if (capsnap->flush_tid != flush_tid) {
+ if (capsnap->cap_flush.tid != flush_tid) {
dout(" cap_snap %p follows %lld tid %lld !="
" %lld\n", capsnap, follows,
- flush_tid, capsnap->flush_tid);
+ flush_tid, capsnap->cap_flush.tid);
break;
}
- WARN_ON(capsnap->dirty_pages || capsnap->writing);
- dout(" removing %p cap_snap %p follows %lld\n",
- inode, capsnap, follows);
- ceph_put_snap_context(capsnap->context);
- list_del(&capsnap->ci_item);
- list_del(&capsnap->flushing_item);
- ceph_put_cap_snap(capsnap);
- wake_up_all(&mdsc->cap_flushing_wq);
- drop = 1;
+ flushed = true;
break;
} else {
dout(" skipping cap_snap %p follows %lld\n",
capsnap, capsnap->follows);
}
}
+ if (flushed) {
+ WARN_ON(capsnap->dirty_pages || capsnap->writing);
+ dout(" removing %p cap_snap %p follows %lld\n",
+ inode, capsnap, follows);
+ list_del(&capsnap->ci_item);
+ if (__finish_cap_flush(NULL, ci, &capsnap->cap_flush))
+ wake_ci = true;
+
+ spin_lock(&mdsc->cap_dirty_lock);
+
+ if (list_empty(&ci->i_cap_flush_list))
+ list_del_init(&ci->i_flushing_item);
+
+ if (__finish_cap_flush(mdsc, NULL, &capsnap->cap_flush))
+ wake_mdsc = true;
+
+ spin_unlock(&mdsc->cap_dirty_lock);
+ }
spin_unlock(&ci->i_ceph_lock);
- if (drop)
+ if (flushed) {
+ ceph_put_snap_context(capsnap->context);
+ ceph_put_cap_snap(capsnap);
+ if (wake_ci)
+ wake_up_all(&ci->i_cap_wq);
+ if (wake_mdsc)
+ wake_up_all(&mdsc->cap_flushing_wq);
iput(inode);
+ }
}
/*
@@ -3267,7 +3310,8 @@ retry:
tcap->implemented |= issued;
if (cap == ci->i_auth_cap)
ci->i_auth_cap = tcap;
- if (ci->i_flushing_caps && ci->i_auth_cap == tcap) {
+ if (!list_empty(&ci->i_cap_flush_list) &&
+ ci->i_auth_cap == tcap) {
spin_lock(&mdsc->cap_dirty_lock);
list_move_tail(&ci->i_flushing_item,
&tcap->session->s_cap_flushing);
@@ -3420,20 +3464,18 @@ void ceph_handle_caps(struct ceph_mds_session *session,
struct ceph_cap *cap;
struct ceph_mds_caps *h;
struct ceph_mds_cap_peer *peer = NULL;
- struct ceph_snap_realm *realm;
+ struct ceph_snap_realm *realm = NULL;
+ struct ceph_string *pool_ns = NULL;
int mds = session->s_mds;
int op, issued;
u32 seq, mseq;
struct ceph_vino vino;
- u64 cap_id;
- u64 size, max_size;
u64 tid;
u64 inline_version = 0;
void *inline_data = NULL;
u32 inline_len = 0;
void *snaptrace;
size_t snaptrace_len;
- u32 pool_ns_len = 0;
void *p, *end;
dout("handle_caps from mds%d\n", mds);
@@ -3447,11 +3489,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
op = le32_to_cpu(h->op);
vino.ino = le64_to_cpu(h->ino);
vino.snap = CEPH_NOSNAP;
- cap_id = le64_to_cpu(h->cap_id);
seq = le32_to_cpu(h->seq);
mseq = le32_to_cpu(h->migrate_seq);
- size = le64_to_cpu(h->size);
- max_size = le64_to_cpu(h->max_size);
snaptrace = h + 1;
snaptrace_len = le32_to_cpu(h->snap_trace_len);
@@ -3490,6 +3529,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
u64 flush_tid;
u32 caller_uid, caller_gid;
u32 osd_epoch_barrier;
+ u32 pool_ns_len;
/* version >= 5 */
ceph_decode_32_safe(&p, end, osd_epoch_barrier, bad);
/* version >= 6 */
@@ -3499,6 +3539,11 @@ void ceph_handle_caps(struct ceph_mds_session *session,
ceph_decode_32_safe(&p, end, caller_gid, bad);
/* version >= 8 */
ceph_decode_32_safe(&p, end, pool_ns_len, bad);
+ if (pool_ns_len > 0) {
+ ceph_decode_need(&p, end, pool_ns_len, bad);
+ pool_ns = ceph_find_or_create_string(p, pool_ns_len);
+ p += pool_ns_len;
+ }
}
/* lookup ino */
@@ -3519,7 +3564,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
cap = ceph_get_cap(mdsc, NULL);
cap->cap_ino = vino.ino;
cap->queue_release = 1;
- cap->cap_id = cap_id;
+ cap->cap_id = le64_to_cpu(h->cap_id);
cap->mseq = mseq;
cap->seq = seq;
spin_lock(&session->s_cap_lock);
@@ -3554,10 +3599,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
}
handle_cap_import(mdsc, inode, h, peer, session,
&cap, &issued);
- handle_cap_grant(mdsc, inode, h,
+ handle_cap_grant(mdsc, inode, h, &pool_ns,
inline_version, inline_data, inline_len,
- msg->middle, session, cap, issued,
- pool_ns_len);
+ msg->middle, session, cap, issued);
if (realm)
ceph_put_snap_realm(mdsc, realm);
goto done_unlocked;
@@ -3579,10 +3623,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
case CEPH_CAP_OP_GRANT:
__ceph_caps_issued(ci, &issued);
issued |= __ceph_caps_dirty(ci);
- handle_cap_grant(mdsc, inode, h,
+ handle_cap_grant(mdsc, inode, h, &pool_ns,
inline_version, inline_data, inline_len,
- msg->middle, session, cap, issued,
- pool_ns_len);
+ msg->middle, session, cap, issued);
goto done_unlocked;
case CEPH_CAP_OP_FLUSH_ACK:
@@ -3613,6 +3656,7 @@ done:
mutex_unlock(&session->s_mutex);
done_unlocked:
iput(inode);
+ ceph_put_string(pool_ns);
return;
bad:
@@ -3673,6 +3717,16 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
dout("flush_dirty_caps done\n");
}
+void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
+{
+ int i;
+ int bits = (fmode << 1) | 1;
+ for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
+ if (bits & (1 << i))
+ ci->i_nr_by_mode[i]++;
+ }
+}
+
/*
* Drop open file reference. If we were the last open file,
* we may need to release capabilities to the MDS (or schedule
@@ -3680,15 +3734,20 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
*/
void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
{
- struct inode *inode = &ci->vfs_inode;
- int last = 0;
-
+ int i, last = 0;
+ int bits = (fmode << 1) | 1;
spin_lock(&ci->i_ceph_lock);
- dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
- ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
- BUG_ON(ci->i_nr_by_mode[fmode] == 0);
- if (--ci->i_nr_by_mode[fmode] == 0)
- last++;
+ for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
+ if (bits & (1 << i)) {
+ BUG_ON(ci->i_nr_by_mode[i] == 0);
+ if (--ci->i_nr_by_mode[i] == 0)
+ last++;
+ }
+ }
+ dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n",
+ &ci->vfs_inode, fmode,
+ ci->i_nr_by_mode[0], ci->i_nr_by_mode[1],
+ ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]);
spin_unlock(&ci->i_ceph_lock);
if (last && ci->i_vino.snap == CEPH_NOSNAP)
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 6e0fedf6713b..c64a0b794d49 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -59,7 +59,7 @@ int ceph_init_dentry(struct dentry *dentry)
di->dentry = dentry;
di->lease_session = NULL;
- dentry->d_time = jiffies;
+ di->time = jiffies;
/* avoid reordering d_fsdata setup so that the check above is safe */
smp_mb();
dentry->d_fsdata = di;
@@ -1124,7 +1124,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
void ceph_invalidate_dentry_lease(struct dentry *dentry)
{
spin_lock(&dentry->d_lock);
- dentry->d_time = jiffies;
+ ceph_dentry(dentry)->time = jiffies;
ceph_dentry(dentry)->lease_shared_gen = 0;
spin_unlock(&dentry->d_lock);
}
@@ -1133,7 +1133,8 @@ void ceph_invalidate_dentry_lease(struct dentry *dentry)
* Check if dentry lease is valid. If not, delete the lease. Try to
* renew if the least is more than half up.
*/
-static int dentry_lease_is_valid(struct dentry *dentry)
+static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
+ struct inode *dir)
{
struct ceph_dentry_info *di;
struct ceph_mds_session *s;
@@ -1141,12 +1142,11 @@ static int dentry_lease_is_valid(struct dentry *dentry)
u32 gen;
unsigned long ttl;
struct ceph_mds_session *session = NULL;
- struct inode *dir = NULL;
u32 seq = 0;
spin_lock(&dentry->d_lock);
di = ceph_dentry(dentry);
- if (di->lease_session) {
+ if (di && di->lease_session) {
s = di->lease_session;
spin_lock(&s->s_gen_ttl_lock);
gen = s->s_cap_gen;
@@ -1154,17 +1154,24 @@ static int dentry_lease_is_valid(struct dentry *dentry)
spin_unlock(&s->s_gen_ttl_lock);
if (di->lease_gen == gen &&
- time_before(jiffies, dentry->d_time) &&
+ time_before(jiffies, di->time) &&
time_before(jiffies, ttl)) {
valid = 1;
if (di->lease_renew_after &&
time_after(jiffies, di->lease_renew_after)) {
- /* we should renew */
- dir = d_inode(dentry->d_parent);
- session = ceph_get_mds_session(s);
- seq = di->lease_seq;
- di->lease_renew_after = 0;
- di->lease_renew_from = jiffies;
+ /*
+ * We should renew. If we're in RCU walk mode
+ * though, we can't do that so just return
+ * -ECHILD.
+ */
+ if (flags & LOOKUP_RCU) {
+ valid = -ECHILD;
+ } else {
+ session = ceph_get_mds_session(s);
+ seq = di->lease_seq;
+ di->lease_renew_after = 0;
+ di->lease_renew_from = jiffies;
+ }
}
}
}
@@ -1207,15 +1214,19 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
struct dentry *parent;
struct inode *dir;
- if (flags & LOOKUP_RCU)
- return -ECHILD;
+ if (flags & LOOKUP_RCU) {
+ parent = ACCESS_ONCE(dentry->d_parent);
+ dir = d_inode_rcu(parent);
+ if (!dir)
+ return -ECHILD;
+ } else {
+ parent = dget_parent(dentry);
+ dir = d_inode(parent);
+ }
dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry,
dentry, d_inode(dentry), ceph_dentry(dentry)->offset);
- parent = dget_parent(dentry);
- dir = d_inode(parent);
-
/* always trust cached snapped dentries, snapdir dentry */
if (ceph_snap(dir) != CEPH_NOSNAP) {
dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
@@ -1224,12 +1235,16 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
} else if (d_really_is_positive(dentry) &&
ceph_snap(d_inode(dentry)) == CEPH_SNAPDIR) {
valid = 1;
- } else if (dentry_lease_is_valid(dentry) ||
- dir_lease_is_valid(dir, dentry)) {
- if (d_really_is_positive(dentry))
- valid = ceph_is_any_caps(d_inode(dentry));
- else
- valid = 1;
+ } else {
+ valid = dentry_lease_is_valid(dentry, flags, dir);
+ if (valid == -ECHILD)
+ return valid;
+ if (valid || dir_lease_is_valid(dir, dentry)) {
+ if (d_really_is_positive(dentry))
+ valid = ceph_is_any_caps(d_inode(dentry));
+ else
+ valid = 1;
+ }
}
if (!valid) {
@@ -1238,6 +1253,9 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
struct ceph_mds_request *req;
int op, mask, err;
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
op = ceph_snap(dir) == CEPH_SNAPDIR ?
CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
@@ -1273,7 +1291,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
ceph_dir_clear_complete(dir);
}
- dput(parent);
+ if (!(flags & LOOKUP_RCU))
+ dput(parent);
return valid;
}
@@ -1286,10 +1305,14 @@ static void ceph_d_release(struct dentry *dentry)
dout("d_release %p\n", dentry);
ceph_dentry_lru_del(dentry);
+
+ spin_lock(&dentry->d_lock);
+ dentry->d_fsdata = NULL;
+ spin_unlock(&dentry->d_lock);
+
if (di->lease_session)
ceph_put_mds_session(di->lease_session);
kmem_cache_free(ceph_dentry_cachep, di);
- dentry->d_fsdata = NULL;
}
static int ceph_snapdir_d_revalidate(struct dentry *dentry,
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 0daaf7ceedc5..0f5375d8e030 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -708,7 +708,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
}
}
- ceph_put_page_vector(osd_data->pages, num_pages, false);
+ ceph_put_page_vector(osd_data->pages, num_pages, !aio_req->write);
ceph_osdc_put_request(req);
if (rc < 0)
@@ -821,6 +821,54 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
}
}
+/*
+ * Wait on any unsafe replies for the given inode. First wait on the
+ * newest request, and make that the upper bound. Then, if there are
+ * more requests, keep waiting on the oldest as long as it is still older
+ * than the original request.
+ */
+void ceph_sync_write_wait(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct list_head *head = &ci->i_unsafe_writes;
+ struct ceph_osd_request *req;
+ u64 last_tid;
+
+ if (!S_ISREG(inode->i_mode))
+ return;
+
+ spin_lock(&ci->i_unsafe_lock);
+ if (list_empty(head))
+ goto out;
+
+ /* set upper bound as _last_ entry in chain */
+
+ req = list_last_entry(head, struct ceph_osd_request,
+ r_unsafe_item);
+ last_tid = req->r_tid;
+
+ do {
+ ceph_osdc_get_request(req);
+ spin_unlock(&ci->i_unsafe_lock);
+
+ dout("sync_write_wait on tid %llu (until %llu)\n",
+ req->r_tid, last_tid);
+ wait_for_completion(&req->r_safe_completion);
+ ceph_osdc_put_request(req);
+
+ spin_lock(&ci->i_unsafe_lock);
+ /*
+ * from here on look at first entry in chain, since we
+ * only want to wait for anything older than last_tid
+ */
+ if (list_empty(head))
+ break;
+ req = list_first_entry(head, struct ceph_osd_request,
+ r_unsafe_item);
+ } while (req->r_tid < last_tid);
+out:
+ spin_unlock(&ci->i_unsafe_lock);
+}
static ssize_t
ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
@@ -964,7 +1012,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
len = ret;
}
- ceph_put_page_vector(pages, num_pages, false);
+ ceph_put_page_vector(pages, num_pages, !write);
ceph_osdc_put_request(req);
if (ret < 0)
@@ -985,6 +1033,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
}
if (aio_req) {
+ LIST_HEAD(osd_reqs);
+
if (aio_req->num_reqs == 0) {
kfree(aio_req);
return ret;
@@ -993,8 +1043,9 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR :
CEPH_CAP_FILE_RD);
- while (!list_empty(&aio_req->osd_reqs)) {
- req = list_first_entry(&aio_req->osd_reqs,
+ list_splice(&aio_req->osd_reqs, &osd_reqs);
+ while (!list_empty(&osd_reqs)) {
+ req = list_first_entry(&osd_reqs,
struct ceph_osd_request,
r_unsafe_item);
list_del_init(&req->r_unsafe_item);
@@ -1448,16 +1499,14 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
loff_t i_size;
- int ret;
+ loff_t ret;
inode_lock(inode);
if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
- if (ret < 0) {
- offset = ret;
+ if (ret < 0)
goto out;
- }
}
i_size = i_size_read(inode);
@@ -1473,7 +1522,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
* write() or lseek() might have altered it
*/
if (offset == 0) {
- offset = file->f_pos;
+ ret = file->f_pos;
goto out;
}
offset += file->f_pos;
@@ -1493,11 +1542,11 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
break;
}
- offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+ ret = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
out:
inode_unlock(inode);
- return offset;
+ return ret;
}
static inline void ceph_zero_partial_page(
@@ -1583,9 +1632,9 @@ static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
{
int ret = 0;
struct ceph_inode_info *ci = ceph_inode(inode);
- s32 stripe_unit = ceph_file_layout_su(ci->i_layout);
- s32 stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
- s32 object_size = ceph_file_layout_object_size(ci->i_layout);
+ s32 stripe_unit = ci->i_layout.stripe_unit;
+ s32 stripe_count = ci->i_layout.stripe_count;
+ s32 object_size = ci->i_layout.object_size;
u64 object_set_size = object_size * stripe_count;
u64 nearly, t;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index f059b5997072..dd3a6dbf71eb 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -446,7 +446,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_symlink = NULL;
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
- ci->i_pool_ns_len = 0;
+ RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);
ci->i_fragtree = RB_ROOT;
mutex_init(&ci->i_fragtree_mutex);
@@ -468,7 +468,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
INIT_LIST_HEAD(&ci->i_dirty_item);
INIT_LIST_HEAD(&ci->i_flushing_item);
ci->i_prealloc_cap_flush = NULL;
- ci->i_cap_flush_tree = RB_ROOT;
+ INIT_LIST_HEAD(&ci->i_cap_flush_list);
init_waitqueue_head(&ci->i_cap_wq);
ci->i_hold_caps_min = 0;
ci->i_hold_caps_max = 0;
@@ -477,7 +477,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_head_snapc = NULL;
ci->i_snap_caps = 0;
- for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
+ for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
ci->i_nr_by_mode[i] = 0;
mutex_init(&ci->i_truncate_mutex);
@@ -570,6 +570,8 @@ void ceph_destroy_inode(struct inode *inode)
if (ci->i_xattrs.prealloc_blob)
ceph_buffer_put(ci->i_xattrs.prealloc_blob);
+ ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
+
call_rcu(&inode->i_rcu, ceph_i_callback);
}
@@ -583,6 +585,14 @@ int ceph_drop_inode(struct inode *inode)
return 1;
}
+void ceph_evict_inode(struct inode *inode)
+{
+ /* wait unsafe sync writes */
+ ceph_sync_write_wait(inode);
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
+}
+
static inline blkcnt_t calc_inode_blocks(u64 size)
{
return (size + (1<<9) - 1) >> 9;
@@ -733,6 +743,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
int issued = 0, implemented, new_issued;
struct timespec mtime, atime, ctime;
struct ceph_buffer *xattr_blob = NULL;
+ struct ceph_string *pool_ns = NULL;
struct ceph_cap *new_cap = NULL;
int err = 0;
bool wake = false;
@@ -760,6 +771,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
iinfo->xattr_len);
}
+ if (iinfo->pool_ns_len > 0)
+ pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data,
+ iinfo->pool_ns_len);
+
spin_lock(&ci->i_ceph_lock);
/*
@@ -814,10 +829,18 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if (new_version ||
(new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
- if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool)
+ s64 old_pool = ci->i_layout.pool_id;
+ struct ceph_string *old_ns;
+
+ ceph_file_layout_from_legacy(&ci->i_layout, &info->layout);
+ old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
+ lockdep_is_held(&ci->i_ceph_lock));
+ rcu_assign_pointer(ci->i_layout.pool_ns, pool_ns);
+
+ if (ci->i_layout.pool_id != old_pool || pool_ns != old_ns)
ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
- ci->i_layout = info->layout;
- ci->i_pool_ns_len = iinfo->pool_ns_len;
+
+ pool_ns = old_ns;
queue_trunc = ceph_fill_file_size(inode, issued,
le32_to_cpu(info->truncate_seq),
@@ -985,6 +1008,7 @@ out:
ceph_put_cap(mdsc, new_cap);
if (xattr_blob)
ceph_buffer_put(xattr_blob);
+ ceph_put_string(pool_ns);
return err;
}
@@ -1018,7 +1042,7 @@ static void update_dentry_lease(struct dentry *dentry,
goto out_unlock;
if (di->lease_gen == session->s_cap_gen &&
- time_before(ttl, dentry->d_time))
+ time_before(ttl, di->time))
goto out_unlock; /* we already have a newer lease. */
if (di->lease_session && di->lease_session != session)
@@ -1032,7 +1056,7 @@ static void update_dentry_lease(struct dentry *dentry,
di->lease_seq = le32_to_cpu(lease->seq);
di->lease_renew_after = half_ttl;
di->lease_renew_from = 0;
- dentry->d_time = ttl;
+ di->time = ttl;
out_unlock:
spin_unlock(&dentry->d_lock);
return;
@@ -1164,7 +1188,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
dname.name = rinfo->dname;
dname.len = rinfo->dname_len;
- dname.hash = full_name_hash(dname.name, dname.len);
+ dname.hash = full_name_hash(parent, dname.name, dname.len);
vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
retry_lookup:
@@ -1508,7 +1532,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
dname.name = rde->name;
dname.len = rde->name_len;
- dname.hash = full_name_hash(dname.name, dname.len);
+ dname.hash = full_name_hash(parent, dname.name, dname.len);
vino.ino = le64_to_cpu(rde->inode.in->ino);
vino.snap = le64_to_cpu(rde->inode.in->snapid);
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index be6b1657b1af..7d752d53353a 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -21,10 +21,10 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false);
if (!err) {
- l.stripe_unit = ceph_file_layout_su(ci->i_layout);
- l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
- l.object_size = ceph_file_layout_object_size(ci->i_layout);
- l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
+ l.stripe_unit = ci->i_layout.stripe_unit;
+ l.stripe_count = ci->i_layout.stripe_count;
+ l.object_size = ci->i_layout.object_size;
+ l.data_pool = ci->i_layout.pool_id;
l.preferred_osd = (s32)-1;
if (copy_to_user(arg, &l, sizeof(l)))
return -EFAULT;
@@ -82,19 +82,19 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
if (l.stripe_count)
nl.stripe_count = l.stripe_count;
else
- nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
+ nl.stripe_count = ci->i_layout.stripe_count;
if (l.stripe_unit)
nl.stripe_unit = l.stripe_unit;
else
- nl.stripe_unit = ceph_file_layout_su(ci->i_layout);
+ nl.stripe_unit = ci->i_layout.stripe_unit;
if (l.object_size)
nl.object_size = l.object_size;
else
- nl.object_size = ceph_file_layout_object_size(ci->i_layout);
+ nl.object_size = ci->i_layout.object_size;
if (l.data_pool)
nl.data_pool = l.data_pool;
else
- nl.data_pool = ceph_file_layout_pg_pool(ci->i_layout);
+ nl.data_pool = ci->i_layout.pool_id;
/* this is obsolete, and always -1 */
nl.preferred_osd = le64_to_cpu(-1);
@@ -183,7 +183,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
struct ceph_osd_client *osdc =
&ceph_sb_to_client(inode->i_sb)->client->osdc;
struct ceph_object_locator oloc;
- struct ceph_object_id oid;
+ CEPH_DEFINE_OID_ONSTACK(oid);
u64 len = 1, olen;
u64 tmp;
struct ceph_pg pgid;
@@ -202,8 +202,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
return -EIO;
}
dl.file_offset -= dl.object_offset;
- dl.object_size = ceph_file_layout_object_size(ci->i_layout);
- dl.block_size = ceph_file_layout_su(ci->i_layout);
+ dl.object_size = ci->i_layout.object_size;
+ dl.block_size = ci->i_layout.stripe_unit;
/* block_offset = object_offset % block_size */
tmp = dl.object_offset;
@@ -212,10 +212,13 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
ceph_ino(inode), dl.object_no);
- oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);
+ oloc.pool = ci->i_layout.pool_id;
+ oloc.pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
ceph_oid_printf(&oid, "%s", dl.object_name);
r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid);
+
+ ceph_oloc_destroy(&oloc);
if (r < 0) {
up_read(&osdc->lock);
return r;
@@ -247,9 +250,8 @@ static long ceph_ioctl_lazyio(struct file *file)
if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
spin_lock(&ci->i_ceph_lock);
- ci->i_nr_by_mode[fi->fmode]--;
fi->fmode |= CEPH_FILE_MODE_LAZY;
- ci->i_nr_by_mode[fi->fmode]++;
+ ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++;
spin_unlock(&ci->i_ceph_lock);
dout("ioctl_layzio: file %p marked lazy\n", file);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 2103b823bec0..fa59a85226b2 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -48,7 +48,7 @@
struct ceph_reconnect_state {
int nr_caps;
struct ceph_pagelist *pagelist;
- bool flock;
+ unsigned msg_version;
};
static void __wake_requests(struct ceph_mds_client *mdsc,
@@ -100,12 +100,15 @@ static int parse_reply_info_in(void **p, void *end,
} else
info->inline_version = CEPH_INLINE_NONE;
+ info->pool_ns_len = 0;
+ info->pool_ns_data = NULL;
if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
- ceph_decode_need(p, end, info->pool_ns_len, bad);
- *p += info->pool_ns_len;
- } else {
- info->pool_ns_len = 0;
+ if (info->pool_ns_len > 0) {
+ ceph_decode_need(p, end, info->pool_ns_len, bad);
+ info->pool_ns_data = *p;
+ *p += info->pool_ns_len;
+ }
}
return 0;
@@ -469,7 +472,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
s->s_cap_iterator = NULL;
INIT_LIST_HEAD(&s->s_cap_releases);
INIT_LIST_HEAD(&s->s_cap_flushing);
- INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
dout("register_session mds%d\n", mds);
if (mds >= mdsc->max_sessions) {
@@ -1145,19 +1147,17 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
invalidate = true;
- while (true) {
- struct rb_node *n = rb_first(&ci->i_cap_flush_tree);
- if (!n)
- break;
- cf = rb_entry(n, struct ceph_cap_flush, i_node);
- rb_erase(&cf->i_node, &ci->i_cap_flush_tree);
- list_add(&cf->list, &to_remove);
+ while (!list_empty(&ci->i_cap_flush_list)) {
+ cf = list_first_entry(&ci->i_cap_flush_list,
+ struct ceph_cap_flush, i_list);
+ list_del(&cf->i_list);
+ list_add(&cf->i_list, &to_remove);
}
spin_lock(&mdsc->cap_dirty_lock);
- list_for_each_entry(cf, &to_remove, list)
- rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
+ list_for_each_entry(cf, &to_remove, i_list)
+ list_del(&cf->g_list);
if (!list_empty(&ci->i_dirty_item)) {
pr_warn_ratelimited(
@@ -1181,7 +1181,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
spin_unlock(&mdsc->cap_dirty_lock);
if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
- list_add(&ci->i_prealloc_cap_flush->list, &to_remove);
+ list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove);
ci->i_prealloc_cap_flush = NULL;
}
}
@@ -1189,8 +1189,8 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
while (!list_empty(&to_remove)) {
struct ceph_cap_flush *cf;
cf = list_first_entry(&to_remove,
- struct ceph_cap_flush, list);
- list_del(&cf->list);
+ struct ceph_cap_flush, i_list);
+ list_del(&cf->i_list);
ceph_free_cap_flush(cf);
}
@@ -1212,6 +1212,8 @@ static void remove_session_caps(struct ceph_mds_session *session)
dout("remove_session_caps on %p\n", session);
iterate_session_caps(session, remove_session_caps_cb, fsc);
+ wake_up_all(&fsc->mdsc->cap_flushing_wq);
+
spin_lock(&session->s_cap_lock);
if (session->s_nr_caps > 0) {
struct inode *inode;
@@ -1478,35 +1480,21 @@ static int trim_caps(struct ceph_mds_client *mdsc,
return 0;
}
-static int check_capsnap_flush(struct ceph_inode_info *ci,
- u64 want_snap_seq)
-{
- int ret = 1;
- spin_lock(&ci->i_ceph_lock);
- if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) {
- struct ceph_cap_snap *capsnap =
- list_first_entry(&ci->i_cap_snaps,
- struct ceph_cap_snap, ci_item);
- ret = capsnap->follows >= want_snap_seq;
- }
- spin_unlock(&ci->i_ceph_lock);
- return ret;
-}
-
static int check_caps_flush(struct ceph_mds_client *mdsc,
u64 want_flush_tid)
{
- struct rb_node *n;
- struct ceph_cap_flush *cf;
int ret = 1;
spin_lock(&mdsc->cap_dirty_lock);
- n = rb_first(&mdsc->cap_flush_tree);
- cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL;
- if (cf && cf->tid <= want_flush_tid) {
- dout("check_caps_flush still flushing tid %llu <= %llu\n",
- cf->tid, want_flush_tid);
- ret = 0;
+ if (!list_empty(&mdsc->cap_flush_list)) {
+ struct ceph_cap_flush *cf =
+ list_first_entry(&mdsc->cap_flush_list,
+ struct ceph_cap_flush, g_list);
+ if (cf->tid <= want_flush_tid) {
+ dout("check_caps_flush still flushing tid "
+ "%llu <= %llu\n", cf->tid, want_flush_tid);
+ ret = 0;
+ }
}
spin_unlock(&mdsc->cap_dirty_lock);
return ret;
@@ -1518,54 +1506,9 @@ static int check_caps_flush(struct ceph_mds_client *mdsc,
* returns true if we've flushed through want_flush_tid
*/
static void wait_caps_flush(struct ceph_mds_client *mdsc,
- u64 want_flush_tid, u64 want_snap_seq)
+ u64 want_flush_tid)
{
- int mds;
-
- dout("check_caps_flush want %llu snap want %llu\n",
- want_flush_tid, want_snap_seq);
- mutex_lock(&mdsc->mutex);
- for (mds = 0; mds < mdsc->max_sessions; ) {
- struct ceph_mds_session *session = mdsc->sessions[mds];
- struct inode *inode = NULL;
-
- if (!session) {
- mds++;
- continue;
- }
- get_session(session);
- mutex_unlock(&mdsc->mutex);
-
- mutex_lock(&session->s_mutex);
- if (!list_empty(&session->s_cap_snaps_flushing)) {
- struct ceph_cap_snap *capsnap =
- list_first_entry(&session->s_cap_snaps_flushing,
- struct ceph_cap_snap,
- flushing_item);
- struct ceph_inode_info *ci = capsnap->ci;
- if (!check_capsnap_flush(ci, want_snap_seq)) {
- dout("check_cap_flush still flushing snap %p "
- "follows %lld <= %lld to mds%d\n",
- &ci->vfs_inode, capsnap->follows,
- want_snap_seq, mds);
- inode = igrab(&ci->vfs_inode);
- }
- }
- mutex_unlock(&session->s_mutex);
- ceph_put_mds_session(session);
-
- if (inode) {
- wait_event(mdsc->cap_flushing_wq,
- check_capsnap_flush(ceph_inode(inode),
- want_snap_seq));
- iput(inode);
- } else {
- mds++;
- }
-
- mutex_lock(&mdsc->mutex);
- }
- mutex_unlock(&mdsc->mutex);
+ dout("check_caps_flush want %llu\n", want_flush_tid);
wait_event(mdsc->cap_flushing_wq,
check_caps_flush(mdsc, want_flush_tid));
@@ -2163,6 +2106,11 @@ static int __do_request(struct ceph_mds_client *mdsc,
mds = __choose_mds(mdsc, req);
if (mds < 0 ||
ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
+ if (mdsc->mdsmap_err) {
+ err = mdsc->mdsmap_err;
+ dout("do_request mdsmap err %d\n", err);
+ goto finish;
+ }
dout("do_request no mds or not active, waiting for map\n");
list_add(&req->r_wait, &mdsc->waiting_for_map);
goto out;
@@ -2292,14 +2240,6 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
CEPH_CAP_PIN);
- /* deny access to directories with pool_ns layouts */
- if (req->r_inode && S_ISDIR(req->r_inode->i_mode) &&
- ceph_inode(req->r_inode)->i_pool_ns_len)
- return -EIO;
- if (req->r_locked_dir &&
- ceph_inode(req->r_locked_dir)->i_pool_ns_len)
- return -EIO;
-
/* issue */
mutex_lock(&mdsc->mutex);
__register_request(mdsc, req, dir);
@@ -2791,13 +2731,13 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
struct ceph_mds_cap_reconnect v2;
struct ceph_mds_cap_reconnect_v1 v1;
} rec;
- size_t reclen;
struct ceph_inode_info *ci;
struct ceph_reconnect_state *recon_state = arg;
struct ceph_pagelist *pagelist = recon_state->pagelist;
char *path;
int pathlen, err;
u64 pathbase;
+ u64 snap_follows;
struct dentry *dentry;
ci = cap->ci;
@@ -2820,9 +2760,6 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
path = NULL;
pathlen = 0;
}
- err = ceph_pagelist_encode_string(pagelist, path, pathlen);
- if (err)
- goto out_free;
spin_lock(&ci->i_ceph_lock);
cap->seq = 0; /* reset cap seq */
@@ -2830,14 +2767,13 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
cap->mseq = 0; /* and migrate_seq */
cap->cap_gen = cap->session->s_cap_gen;
- if (recon_state->flock) {
+ if (recon_state->msg_version >= 2) {
rec.v2.cap_id = cpu_to_le64(cap->cap_id);
rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
rec.v2.issued = cpu_to_le32(cap->issued);
rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
rec.v2.pathbase = cpu_to_le64(pathbase);
rec.v2.flock_len = 0;
- reclen = sizeof(rec.v2);
} else {
rec.v1.cap_id = cpu_to_le64(cap->cap_id);
rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
@@ -2847,13 +2783,23 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
ceph_encode_timespec(&rec.v1.atime, &inode->i_atime);
rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
rec.v1.pathbase = cpu_to_le64(pathbase);
- reclen = sizeof(rec.v1);
+ }
+
+ if (list_empty(&ci->i_cap_snaps)) {
+ snap_follows = 0;
+ } else {
+ struct ceph_cap_snap *capsnap =
+ list_first_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap, ci_item);
+ snap_follows = capsnap->follows;
}
spin_unlock(&ci->i_ceph_lock);
- if (recon_state->flock) {
+ if (recon_state->msg_version >= 2) {
int num_fcntl_locks, num_flock_locks;
struct ceph_filelock *flocks;
+ size_t struct_len, total_len = 0;
+ u8 struct_v = 0;
encode_again:
ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
@@ -2872,20 +2818,51 @@ encode_again:
goto encode_again;
goto out_free;
}
+
+ if (recon_state->msg_version >= 3) {
+ /* version, compat_version and struct_len */
+ total_len = 2 * sizeof(u8) + sizeof(u32);
+ struct_v = 2;
+ }
/*
* number of encoded locks is stable, so copy to pagelist
*/
- rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) +
- (num_fcntl_locks+num_flock_locks) *
- sizeof(struct ceph_filelock));
- err = ceph_pagelist_append(pagelist, &rec, reclen);
- if (!err)
- err = ceph_locks_to_pagelist(flocks, pagelist,
- num_fcntl_locks,
- num_flock_locks);
+ struct_len = 2 * sizeof(u32) +
+ (num_fcntl_locks + num_flock_locks) *
+ sizeof(struct ceph_filelock);
+ rec.v2.flock_len = cpu_to_le32(struct_len);
+
+ struct_len += sizeof(rec.v2);
+ struct_len += sizeof(u32) + pathlen;
+
+ if (struct_v >= 2)
+ struct_len += sizeof(u64); /* snap_follows */
+
+ total_len += struct_len;
+ err = ceph_pagelist_reserve(pagelist, total_len);
+
+ if (!err) {
+ if (recon_state->msg_version >= 3) {
+ ceph_pagelist_encode_8(pagelist, struct_v);
+ ceph_pagelist_encode_8(pagelist, 1);
+ ceph_pagelist_encode_32(pagelist, struct_len);
+ }
+ ceph_pagelist_encode_string(pagelist, path, pathlen);
+ ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
+ ceph_locks_to_pagelist(flocks, pagelist,
+ num_fcntl_locks,
+ num_flock_locks);
+ if (struct_v >= 2)
+ ceph_pagelist_encode_64(pagelist, snap_follows);
+ }
kfree(flocks);
} else {
- err = ceph_pagelist_append(pagelist, &rec, reclen);
+ size_t size = sizeof(u32) + pathlen + sizeof(rec.v1);
+ err = ceph_pagelist_reserve(pagelist, size);
+ if (!err) {
+ ceph_pagelist_encode_string(pagelist, path, pathlen);
+ ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
+ }
}
recon_state->nr_caps++;
@@ -2976,7 +2953,12 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
recon_state.nr_caps = 0;
recon_state.pagelist = pagelist;
- recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK;
+ if (session->s_con.peer_features & CEPH_FEATURE_MDSENC)
+ recon_state.msg_version = 3;
+ else if (session->s_con.peer_features & CEPH_FEATURE_FLOCK)
+ recon_state.msg_version = 2;
+ else
+ recon_state.msg_version = 1;
err = iterate_session_caps(session, encode_caps_cb, &recon_state);
if (err < 0)
goto fail;
@@ -3005,8 +2987,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
goto fail;
}
- if (recon_state.flock)
- reply->hdr.version = cpu_to_le16(2);
+ reply->hdr.version = cpu_to_le16(recon_state.msg_version);
/* raced with cap release? */
if (s_nr_caps != recon_state.nr_caps) {
@@ -3204,7 +3185,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
WARN_ON(1);
goto release; /* hrm... */
}
- dname.hash = full_name_hash(dname.name, dname.len);
+ dname.hash = full_name_hash(parent, dname.name, dname.len);
dentry = d_lookup(parent, &dname);
dput(parent);
if (!dentry)
@@ -3231,7 +3212,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
msecs_to_jiffies(le32_to_cpu(h->duration_ms));
di->lease_seq = seq;
- dentry->d_time = di->lease_renew_from + duration;
+ di->time = di->lease_renew_from + duration;
di->lease_renew_after = di->lease_renew_from +
(duration >> 1);
di->lease_renew_from = 0;
@@ -3297,47 +3278,6 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
}
/*
- * Preemptively release a lease we expect to invalidate anyway.
- * Pass @inode always, @dentry is optional.
- */
-void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
- struct dentry *dentry)
-{
- struct ceph_dentry_info *di;
- struct ceph_mds_session *session;
- u32 seq;
-
- BUG_ON(inode == NULL);
- BUG_ON(dentry == NULL);
-
- /* is dentry lease valid? */
- spin_lock(&dentry->d_lock);
- di = ceph_dentry(dentry);
- if (!di || !di->lease_session ||
- di->lease_session->s_mds < 0 ||
- di->lease_gen != di->lease_session->s_cap_gen ||
- !time_before(jiffies, dentry->d_time)) {
- dout("lease_release inode %p dentry %p -- "
- "no lease\n",
- inode, dentry);
- spin_unlock(&dentry->d_lock);
- return;
- }
-
- /* we do have a lease on this dentry; note mds and seq */
- session = ceph_get_mds_session(di->lease_session);
- seq = di->lease_seq;
- __ceph_mdsc_drop_dentry_lease(dentry);
- spin_unlock(&dentry->d_lock);
-
- dout("lease_release inode %p dentry %p to mds%d\n",
- inode, dentry, session->s_mds);
- ceph_mdsc_lease_send_msg(session, inode, dentry,
- CEPH_MDS_LEASE_RELEASE, seq);
- ceph_put_mds_session(session);
-}
-
-/*
* drop all leases (and dentry refs) in preparation for umount
*/
static void drop_leases(struct ceph_mds_client *mdsc)
@@ -3470,7 +3410,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
INIT_LIST_HEAD(&mdsc->snap_flush_list);
spin_lock_init(&mdsc->snap_flush_lock);
mdsc->last_cap_flush_tid = 1;
- mdsc->cap_flush_tree = RB_ROOT;
+ INIT_LIST_HEAD(&mdsc->cap_flush_list);
INIT_LIST_HEAD(&mdsc->cap_dirty);
INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
mdsc->num_cap_flushing = 0;
@@ -3585,7 +3525,7 @@ restart:
void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
{
- u64 want_tid, want_flush, want_snap;
+ u64 want_tid, want_flush;
if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
return;
@@ -3598,17 +3538,19 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
ceph_flush_dirty_caps(mdsc);
spin_lock(&mdsc->cap_dirty_lock);
want_flush = mdsc->last_cap_flush_tid;
+ if (!list_empty(&mdsc->cap_flush_list)) {
+ struct ceph_cap_flush *cf =
+ list_last_entry(&mdsc->cap_flush_list,
+ struct ceph_cap_flush, g_list);
+ cf->wake = true;
+ }
spin_unlock(&mdsc->cap_dirty_lock);
- down_read(&mdsc->snap_rwsem);
- want_snap = mdsc->last_snap_seq;
- up_read(&mdsc->snap_rwsem);
-
- dout("sync want tid %lld flush_seq %lld snap_seq %lld\n",
- want_tid, want_flush, want_snap);
+ dout("sync want tid %lld flush_seq %lld\n",
+ want_tid, want_flush);
wait_unsafe_requests(mdsc, want_tid);
- wait_caps_flush(mdsc, want_flush, want_snap);
+ wait_caps_flush(mdsc, want_flush);
}
/*
@@ -3729,11 +3671,86 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
dout("mdsc_destroy %p done\n", mdsc);
}
+void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
+{
+ struct ceph_fs_client *fsc = mdsc->fsc;
+ const char *mds_namespace = fsc->mount_options->mds_namespace;
+ void *p = msg->front.iov_base;
+ void *end = p + msg->front.iov_len;
+ u32 epoch;
+ u32 map_len;
+ u32 num_fs;
+ u32 mount_fscid = (u32)-1;
+ u8 struct_v, struct_cv;
+ int err = -EINVAL;
+
+ ceph_decode_need(&p, end, sizeof(u32), bad);
+ epoch = ceph_decode_32(&p);
+
+ dout("handle_fsmap epoch %u\n", epoch);
+
+ ceph_decode_need(&p, end, 2 + sizeof(u32), bad);
+ struct_v = ceph_decode_8(&p);
+ struct_cv = ceph_decode_8(&p);
+ map_len = ceph_decode_32(&p);
+
+ ceph_decode_need(&p, end, sizeof(u32) * 3, bad);
+ p += sizeof(u32) * 2; /* skip epoch and legacy_client_fscid */
+
+ num_fs = ceph_decode_32(&p);
+ while (num_fs-- > 0) {
+ void *info_p, *info_end;
+ u32 info_len;
+ u8 info_v, info_cv;
+ u32 fscid, namelen;
+
+ ceph_decode_need(&p, end, 2 + sizeof(u32), bad);
+ info_v = ceph_decode_8(&p);
+ info_cv = ceph_decode_8(&p);
+ info_len = ceph_decode_32(&p);
+ ceph_decode_need(&p, end, info_len, bad);
+ info_p = p;
+ info_end = p + info_len;
+ p = info_end;
+
+ ceph_decode_need(&info_p, info_end, sizeof(u32) * 2, bad);
+ fscid = ceph_decode_32(&info_p);
+ namelen = ceph_decode_32(&info_p);
+ ceph_decode_need(&info_p, info_end, namelen, bad);
+
+ if (mds_namespace &&
+ strlen(mds_namespace) == namelen &&
+ !strncmp(mds_namespace, (char *)info_p, namelen)) {
+ mount_fscid = fscid;
+ break;
+ }
+ }
+
+ ceph_monc_got_map(&fsc->client->monc, CEPH_SUB_FSMAP, epoch);
+ if (mount_fscid != (u32)-1) {
+ fsc->client->monc.fs_cluster_id = mount_fscid;
+ ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
+ 0, true);
+ ceph_monc_renew_subs(&fsc->client->monc);
+ } else {
+ err = -ENOENT;
+ goto err_out;
+ }
+ return;
+bad:
+ pr_err("error decoding fsmap\n");
+err_out:
+ mutex_lock(&mdsc->mutex);
+ mdsc->mdsmap_err = -ENOENT;
+ __wake_requests(mdsc, &mdsc->waiting_for_map);
+ mutex_unlock(&mdsc->mutex);
+ return;
+}
/*
* handle mds map update.
*/
-void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
+void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
{
u32 epoch;
u32 maplen;
@@ -3840,7 +3857,10 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
switch (type) {
case CEPH_MSG_MDS_MAP:
- ceph_mdsc_handle_map(mdsc, msg);
+ ceph_mdsc_handle_mdsmap(mdsc, msg);
+ break;
+ case CEPH_MSG_FS_MAP_USER:
+ ceph_mdsc_handle_fsmap(mdsc, msg);
break;
case CEPH_MSG_CLIENT_SESSION:
handle_session(s, msg);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index e7d38aac7109..6b3679737d4a 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -45,6 +45,7 @@ struct ceph_mds_reply_info_in {
u32 inline_len;
char *inline_data;
u32 pool_ns_len;
+ char *pool_ns_data;
};
struct ceph_mds_reply_dir_entry {
@@ -151,7 +152,6 @@ struct ceph_mds_session {
/* protected by mutex */
struct list_head s_cap_flushing; /* inodes w/ flushing caps */
- struct list_head s_cap_snaps_flushing;
unsigned long s_renew_requested; /* last time we sent a renew req */
u64 s_renew_seq;
@@ -275,8 +275,10 @@ struct ceph_mds_request {
struct ceph_pool_perm {
struct rb_node node;
- u32 pool;
int perm;
+ s64 pool;
+ size_t pool_ns_len;
+ char pool_ns[];
};
/*
@@ -290,6 +292,7 @@ struct ceph_mds_client {
struct completion safe_umount_waiters;
wait_queue_head_t session_close_wq;
struct list_head waiting_for_map;
+ int mdsmap_err;
struct ceph_mds_session **sessions; /* NULL for mds if no session */
atomic_t num_sessions;
@@ -321,7 +324,7 @@ struct ceph_mds_client {
spinlock_t snap_flush_lock;
u64 last_cap_flush_tid;
- struct rb_root cap_flush_tree;
+ struct list_head cap_flush_list;
struct list_head cap_dirty; /* inodes with dirty caps */
struct list_head cap_dirty_migrating; /* ...that are migration... */
int num_cap_flushing; /* # caps we are flushing */
@@ -382,10 +385,6 @@ extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
-extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
- struct inode *inode,
- struct dentry *dn);
-
extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
struct inode *dir);
@@ -420,8 +419,10 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
struct dentry *dentry, char action,
u32 seq);
-extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
- struct ceph_msg *msg);
+extern void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc,
+ struct ceph_msg *msg);
+extern void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc,
+ struct ceph_msg *msg);
extern struct ceph_mds_session *
ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target);
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 9caaa7ffc93f..9ff5219d849e 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -520,9 +520,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
ihold(inode);
atomic_set(&capsnap->nref, 1);
- capsnap->ci = ci;
INIT_LIST_HEAD(&capsnap->ci_item);
- INIT_LIST_HEAD(&capsnap->flushing_item);
capsnap->follows = old_snapc->seq;
capsnap->issued = __ceph_caps_issued(ci, NULL);
@@ -551,7 +549,6 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
ci->i_wrbuffer_ref_head = 0;
capsnap->context = old_snapc;
list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
- old_snapc = NULL;
if (used & CEPH_CAP_FILE_WR) {
dout("queue_cap_snap %p cap_snap %p snapc %p"
@@ -563,6 +560,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
__ceph_finish_cap_snap(ci, capsnap);
}
capsnap = NULL;
+ old_snapc = NULL;
update_snapc:
if (ci->i_head_snapc) {
@@ -603,6 +601,8 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
capsnap->dirty_pages);
return 0;
}
+
+ ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
inode, capsnap, capsnap->context,
capsnap->context->seq, ceph_cap_string(capsnap->dirty),
@@ -799,9 +799,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
inode = &ci->vfs_inode;
ihold(inode);
spin_unlock(&mdsc->snap_flush_lock);
- spin_lock(&ci->i_ceph_lock);
- __ceph_flush_snaps(ci, &session, 0);
- spin_unlock(&ci->i_ceph_lock);
+ ceph_flush_snaps(ci, &session);
iput(inode);
spin_lock(&mdsc->snap_flush_lock);
}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 91e02481ce06..e247f6f0feb7 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -108,7 +108,6 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
* mount options
*/
enum {
- Opt_mds_namespace,
Opt_wsize,
Opt_rsize,
Opt_rasize,
@@ -121,6 +120,7 @@ enum {
Opt_last_int,
/* int args above */
Opt_snapdirname,
+ Opt_mds_namespace,
Opt_last_string,
/* string args above */
Opt_dirstat,
@@ -144,7 +144,6 @@ enum {
};
static match_table_t fsopt_tokens = {
- {Opt_mds_namespace, "mds_namespace=%d"},
{Opt_wsize, "wsize=%d"},
{Opt_rsize, "rsize=%d"},
{Opt_rasize, "rasize=%d"},
@@ -156,6 +155,7 @@ static match_table_t fsopt_tokens = {
{Opt_congestion_kb, "write_congestion_kb=%d"},
/* int args above */
{Opt_snapdirname, "snapdirname=%s"},
+ {Opt_mds_namespace, "mds_namespace=%s"},
/* string args above */
{Opt_dirstat, "dirstat"},
{Opt_nodirstat, "nodirstat"},
@@ -212,11 +212,14 @@ static int parse_fsopt_token(char *c, void *private)
if (!fsopt->snapdir_name)
return -ENOMEM;
break;
-
- /* misc */
case Opt_mds_namespace:
- fsopt->mds_namespace = intval;
+ fsopt->mds_namespace = kstrndup(argstr[0].from,
+ argstr[0].to-argstr[0].from,
+ GFP_KERNEL);
+ if (!fsopt->mds_namespace)
+ return -ENOMEM;
break;
+ /* misc */
case Opt_wsize:
fsopt->wsize = intval;
break;
@@ -302,6 +305,7 @@ static void destroy_mount_options(struct ceph_mount_options *args)
{
dout("destroy_mount_options %p\n", args);
kfree(args->snapdir_name);
+ kfree(args->mds_namespace);
kfree(args->server_path);
kfree(args);
}
@@ -333,6 +337,9 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
if (ret)
return ret;
+ ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace);
+ if (ret)
+ return ret;
ret = strcmp_null(fsopt1->server_path, fsopt2->server_path);
if (ret)
@@ -376,7 +383,6 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
fsopt->congestion_kb = default_congestion_kb();
- fsopt->mds_namespace = CEPH_FS_CLUSTER_ID_NONE;
/*
* Distinguish the server list from the path in "dev_name".
@@ -469,8 +475,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",noacl");
#endif
- if (fsopt->mds_namespace != CEPH_FS_CLUSTER_ID_NONE)
- seq_printf(m, ",mds_namespace=%d", fsopt->mds_namespace);
+ if (fsopt->mds_namespace)
+ seq_printf(m, ",mds_namespace=%s", fsopt->mds_namespace);
if (fsopt->wsize)
seq_printf(m, ",wsize=%d", fsopt->wsize);
if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
@@ -509,9 +515,11 @@ static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
switch (type) {
case CEPH_MSG_MDS_MAP:
- ceph_mdsc_handle_map(fsc->mdsc, msg);
+ ceph_mdsc_handle_mdsmap(fsc->mdsc, msg);
+ return 0;
+ case CEPH_MSG_FS_MAP_USER:
+ ceph_mdsc_handle_fsmap(fsc->mdsc, msg);
return 0;
-
default:
return -1;
}
@@ -543,8 +551,14 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
goto fail;
}
fsc->client->extra_mon_dispatch = extra_mon_dispatch;
- fsc->client->monc.fs_cluster_id = fsopt->mds_namespace;
- ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true);
+
+ if (fsopt->mds_namespace == NULL) {
+ ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
+ 0, true);
+ } else {
+ ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_FSMAP,
+ 0, false);
+ }
fsc->mount_options = fsopt;
@@ -672,8 +686,8 @@ static int __init init_caches(void)
if (ceph_dentry_cachep == NULL)
goto bad_dentry;
- ceph_file_cachep = KMEM_CACHE(ceph_file_info,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+ ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD);
+
if (ceph_file_cachep == NULL)
goto bad_file;
@@ -731,6 +745,7 @@ static const struct super_operations ceph_super_ops = {
.destroy_inode = ceph_destroy_inode,
.write_inode = ceph_write_inode,
.drop_inode = ceph_drop_inode,
+ .evict_inode = ceph_evict_inode,
.sync_fs = ceph_sync_fs,
.put_super = ceph_put_super,
.show_options = ceph_show_options,
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 0168b49fb6ad..3e3fa9163059 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -62,7 +62,6 @@ struct ceph_mount_options {
int cap_release_safety;
int max_readdir; /* max readdir result (entires) */
int max_readdir_bytes; /* max readdir result (bytes) */
- int mds_namespace;
/*
* everything above this point can be memcmp'd; everything below
@@ -70,6 +69,7 @@ struct ceph_mount_options {
*/
char *snapdir_name; /* default ".snap" */
+ char *mds_namespace; /* default NULL */
char *server_path; /* default "/" */
};
@@ -147,6 +147,14 @@ struct ceph_cap {
#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
+struct ceph_cap_flush {
+ u64 tid;
+ int caps; /* 0 means capsnap */
+ bool wake; /* wake up flush waiters when finish ? */
+ struct list_head g_list; // global
+ struct list_head i_list; // per inode
+};
+
/*
* Snapped cap state that is pending flush to mds. When a snapshot occurs,
* we first complete any in-process sync writes and writeback any dirty
@@ -154,10 +162,11 @@ struct ceph_cap {
*/
struct ceph_cap_snap {
atomic_t nref;
- struct ceph_inode_info *ci;
- struct list_head ci_item, flushing_item;
+ struct list_head ci_item;
+
+ struct ceph_cap_flush cap_flush;
- u64 follows, flush_tid;
+ u64 follows;
int issued, dirty;
struct ceph_snap_context *context;
@@ -186,16 +195,6 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
}
}
-struct ceph_cap_flush {
- u64 tid;
- int caps;
- struct rb_node g_node; // global
- union {
- struct rb_node i_node; // inode
- struct list_head list;
- };
-};
-
/*
* The frag tree describes how a directory is fragmented, potentially across
* multiple metadata servers. It is also used to indicate points where
@@ -246,7 +245,7 @@ struct ceph_dentry_info {
unsigned long lease_renew_after, lease_renew_from;
struct list_head lru;
struct dentry *dentry;
- u64 time;
+ unsigned long time;
u64 offset;
};
@@ -287,7 +286,6 @@ struct ceph_inode_info {
struct ceph_dir_layout i_dir_layout;
struct ceph_file_layout i_layout;
- size_t i_pool_ns_len;
char *i_symlink;
/* for dirs */
@@ -311,7 +309,7 @@ struct ceph_inode_info {
* overlapping, pipelined cap flushes to the mds. we can probably
* reduce the tid to 8 bits if we're concerned about inode size. */
struct ceph_cap_flush *i_prealloc_cap_flush;
- struct rb_root i_cap_flush_tree;
+ struct list_head i_cap_flush_list;
wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
unsigned long i_hold_caps_min; /* jiffies */
unsigned long i_hold_caps_max; /* jiffies */
@@ -322,7 +320,7 @@ struct ceph_inode_info {
dirty|flushing caps */
unsigned i_snap_caps; /* cap bits for snapped files */
- int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
+ int i_nr_by_mode[CEPH_FILE_MODE_BITS]; /* open file counts */
struct mutex i_truncate_mutex;
u32 i_truncate_seq; /* last truncate to smaller size */
@@ -471,6 +469,8 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */
#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */
#define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */
+#define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */
+#define CEPH_I_FLUSH_SNAPS (1 << 10) /* need flush snapss */
static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
long long release_count,
@@ -750,6 +750,7 @@ extern const struct inode_operations ceph_file_iops;
extern struct inode *ceph_alloc_inode(struct super_block *sb);
extern void ceph_destroy_inode(struct inode *inode);
extern int ceph_drop_inode(struct inode *inode);
+extern void ceph_evict_inode(struct inode *inode);
extern struct inode *ceph_get_inode(struct super_block *sb,
struct ceph_vino vino);
@@ -890,9 +891,8 @@ extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
struct ceph_snap_context *snapc);
-extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
- struct ceph_mds_session **psession,
- int again);
+extern void ceph_flush_snaps(struct ceph_inode_info *ci,
+ struct ceph_mds_session **psession);
extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_session *session);
extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
@@ -907,10 +907,7 @@ extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
loff_t endoff, int *got, struct page **pinned_page);
/* for counting open files by mode */
-static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
-{
- ci->i_nr_by_mode[mode]++;
-}
+extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
/* addr.c */
@@ -931,6 +928,7 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
extern int ceph_release(struct inode *inode, struct file *filp);
extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
char *data, size_t len);
+extern void ceph_sync_write_wait(struct inode *inode);
/* dir.c */
extern const struct file_operations ceph_dir_fops;
extern const struct file_operations ceph_snapdir_fops;
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 4870b29df224..adc231892b0d 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -57,81 +57,88 @@ struct ceph_vxattr {
static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
{
- size_t s;
- char *p = (char *)&ci->i_layout;
-
- for (s = 0; s < sizeof(ci->i_layout); s++, p++)
- if (*p)
- return true;
- return false;
+ struct ceph_file_layout *fl = &ci->i_layout;
+ return (fl->stripe_unit > 0 || fl->stripe_count > 0 ||
+ fl->object_size > 0 || fl->pool_id >= 0 ||
+ rcu_dereference_raw(fl->pool_ns) != NULL);
}
static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
size_t size)
{
- int ret;
struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
struct ceph_osd_client *osdc = &fsc->client->osdc;
- s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
+ struct ceph_string *pool_ns;
+ s64 pool = ci->i_layout.pool_id;
const char *pool_name;
+ const char *ns_field = " pool_namespace=";
char buf[128];
+ size_t len, total_len = 0;
+ int ret;
+
+ pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
down_read(&osdc->lock);
pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
if (pool_name) {
- size_t len = strlen(pool_name);
- ret = snprintf(buf, sizeof(buf),
- "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
- (unsigned long long)ceph_file_layout_su(ci->i_layout),
- (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
- (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
- if (!size) {
- ret += len;
- } else if (ret + len > size) {
- ret = -ERANGE;
- } else {
- memcpy(val, buf, ret);
+ len = snprintf(buf, sizeof(buf),
+ "stripe_unit=%u stripe_count=%u object_size=%u pool=",
+ ci->i_layout.stripe_unit, ci->i_layout.stripe_count,
+ ci->i_layout.object_size);
+ total_len = len + strlen(pool_name);
+ } else {
+ len = snprintf(buf, sizeof(buf),
+ "stripe_unit=%u stripe_count=%u object_size=%u pool=%lld",
+ ci->i_layout.stripe_unit, ci->i_layout.stripe_count,
+ ci->i_layout.object_size, (unsigned long long)pool);
+ total_len = len;
+ }
+
+ if (pool_ns)
+ total_len += strlen(ns_field) + pool_ns->len;
+
+ if (!size) {
+ ret = total_len;
+ } else if (total_len > size) {
+ ret = -ERANGE;
+ } else {
+ memcpy(val, buf, len);
+ ret = len;
+ if (pool_name) {
+ len = strlen(pool_name);
memcpy(val + ret, pool_name, len);
ret += len;
}
- } else {
- ret = snprintf(buf, sizeof(buf),
- "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld",
- (unsigned long long)ceph_file_layout_su(ci->i_layout),
- (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
- (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
- (unsigned long long)pool);
- if (size) {
- if (ret <= size)
- memcpy(val, buf, ret);
- else
- ret = -ERANGE;
+ if (pool_ns) {
+ len = strlen(ns_field);
+ memcpy(val + ret, ns_field, len);
+ ret += len;
+ memcpy(val + ret, pool_ns->str, pool_ns->len);
+ ret += pool_ns->len;
}
}
up_read(&osdc->lock);
+ ceph_put_string(pool_ns);
return ret;
}
static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci,
char *val, size_t size)
{
- return snprintf(val, size, "%lld",
- (unsigned long long)ceph_file_layout_su(ci->i_layout));
+ return snprintf(val, size, "%u", ci->i_layout.stripe_unit);
}
static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci,
char *val, size_t size)
{
- return snprintf(val, size, "%lld",
- (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout));
+ return snprintf(val, size, "%u", ci->i_layout.stripe_count);
}
static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci,
char *val, size_t size)
{
- return snprintf(val, size, "%lld",
- (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
+ return snprintf(val, size, "%u", ci->i_layout.object_size);
}
static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
@@ -140,7 +147,7 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
int ret;
struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
struct ceph_osd_client *osdc = &fsc->client->osdc;
- s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
+ s64 pool = ci->i_layout.pool_id;
const char *pool_name;
down_read(&osdc->lock);
@@ -153,6 +160,18 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
return ret;
}
+static size_t ceph_vxattrcb_layout_pool_namespace(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ int ret = 0;
+ struct ceph_string *ns = ceph_try_get_string(ci->i_layout.pool_ns);
+ if (ns) {
+ ret = snprintf(val, size, "%.*s", (int)ns->len, ns->str);
+ ceph_put_string(ns);
+ }
+ return ret;
+}
+
/* directories */
static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
@@ -241,6 +260,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
XATTR_LAYOUT_FIELD(dir, layout, object_size),
XATTR_LAYOUT_FIELD(dir, layout, pool),
+ XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
XATTR_NAME_CEPH(dir, entries),
XATTR_NAME_CEPH(dir, files),
XATTR_NAME_CEPH(dir, subdirs),
@@ -268,6 +288,7 @@ static struct ceph_vxattr ceph_file_vxattrs[] = {
XATTR_LAYOUT_FIELD(file, layout, stripe_count),
XATTR_LAYOUT_FIELD(file, layout, object_size),
XATTR_LAYOUT_FIELD(file, layout, pool),
+ XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
{ .name = NULL, 0 } /* Required table terminator */
};
static size_t ceph_file_vxattrs_name_size; /* total size of all names */
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 788e19195991..6c58e13fed2f 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -244,7 +244,6 @@ static int cifs_debug_data_proc_open(struct inode *inode, struct file *file)
}
static const struct file_operations cifs_debug_data_proc_fops = {
- .owner = THIS_MODULE,
.open = cifs_debug_data_proc_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -361,7 +360,6 @@ static int cifs_stats_proc_open(struct inode *inode, struct file *file)
}
static const struct file_operations cifs_stats_proc_fops = {
- .owner = THIS_MODULE,
.open = cifs_stats_proc_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -447,7 +445,6 @@ static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer,
}
static const struct file_operations cifsFYI_proc_fops = {
- .owner = THIS_MODULE,
.open = cifsFYI_proc_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -479,7 +476,6 @@ static ssize_t cifs_linux_ext_proc_write(struct file *file,
}
static const struct file_operations cifs_linux_ext_proc_fops = {
- .owner = THIS_MODULE,
.open = cifs_linux_ext_proc_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -511,7 +507,6 @@ static ssize_t cifs_lookup_cache_proc_write(struct file *file,
}
static const struct file_operations cifs_lookup_cache_proc_fops = {
- .owner = THIS_MODULE,
.open = cifs_lookup_cache_proc_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -543,7 +538,6 @@ static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer,
}
static const struct file_operations traceSMB_proc_fops = {
- .owner = THIS_MODULE,
.open = traceSMB_proc_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -655,7 +649,6 @@ static ssize_t cifs_security_flags_proc_write(struct file *file,
}
static const struct file_operations cifs_security_flags_proc_fops = {
- .owner = THIS_MODULE,
.open = cifs_security_flags_proc_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h
index 3182273a3407..1418daa03d95 100644
--- a/fs/cifs/cifs_fs_sb.h
+++ b/fs/cifs/cifs_fs_sb.h
@@ -46,6 +46,9 @@
#define CIFS_MOUNT_CIFS_BACKUPUID 0x200000 /* backup intent bit for a user */
#define CIFS_MOUNT_CIFS_BACKUPGID 0x400000 /* backup intent bit for a group */
#define CIFS_MOUNT_MAP_SFM_CHR 0x800000 /* SFM/MAC mapping for illegal chars */
+#define CIFS_MOUNT_USE_PREFIX_PATH 0x1000000 /* make subpath with unaccessible
+ * root mountable
+ */
struct cifs_sb_info {
struct rb_root tlink_tree;
@@ -67,5 +70,6 @@ struct cifs_sb_info {
struct backing_dev_info bdi;
struct delayed_work prune_tlinks;
struct rcu_head rcu;
+ char *prepath;
};
#endif /* _CIFS_FS_SB_H */
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 6aeb8d4616a4..8347c90cf483 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -743,24 +743,26 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
memcpy(ses->auth_key.response + baselen, tiblob, tilen);
+ mutex_lock(&ses->server->srv_mutex);
+
rc = crypto_hmacmd5_alloc(ses->server);
if (rc) {
cifs_dbg(VFS, "could not crypto alloc hmacmd5 rc %d\n", rc);
- goto setup_ntlmv2_rsp_ret;
+ goto unlock;
}
/* calculate ntlmv2_hash */
rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp);
if (rc) {
cifs_dbg(VFS, "could not get v2 hash rc %d\n", rc);
- goto setup_ntlmv2_rsp_ret;
+ goto unlock;
}
/* calculate first part of the client response (CR1) */
rc = CalcNTLMv2_response(ses, ntlmv2_hash);
if (rc) {
cifs_dbg(VFS, "Could not calculate CR1 rc: %d\n", rc);
- goto setup_ntlmv2_rsp_ret;
+ goto unlock;
}
/* now calculate the session key for NTLMv2 */
@@ -769,13 +771,13 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
if (rc) {
cifs_dbg(VFS, "%s: Could not set NTLMV2 Hash as a key\n",
__func__);
- goto setup_ntlmv2_rsp_ret;
+ goto unlock;
}
rc = crypto_shash_init(&ses->server->secmech.sdeschmacmd5->shash);
if (rc) {
cifs_dbg(VFS, "%s: Could not init hmacmd5\n", __func__);
- goto setup_ntlmv2_rsp_ret;
+ goto unlock;
}
rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash,
@@ -783,7 +785,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
CIFS_HMAC_MD5_HASH_SIZE);
if (rc) {
cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
- goto setup_ntlmv2_rsp_ret;
+ goto unlock;
}
rc = crypto_shash_final(&ses->server->secmech.sdeschmacmd5->shash,
@@ -791,6 +793,8 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
if (rc)
cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__);
+unlock:
+ mutex_unlock(&ses->server->srv_mutex);
setup_ntlmv2_rsp_ret:
kfree(tiblob);
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 5d841f39c4b7..6bbec5e784cd 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -689,6 +689,14 @@ cifs_do_mount(struct file_system_type *fs_type,
goto out_cifs_sb;
}
+ if (volume_info->prepath) {
+ cifs_sb->prepath = kstrdup(volume_info->prepath, GFP_KERNEL);
+ if (cifs_sb->prepath == NULL) {
+ root = ERR_PTR(-ENOMEM);
+ goto out_cifs_sb;
+ }
+ }
+
cifs_setup_cifs_sb(volume_info, cifs_sb);
rc = cifs_mount(cifs_sb, volume_info);
@@ -727,7 +735,11 @@ cifs_do_mount(struct file_system_type *fs_type,
sb->s_flags |= MS_ACTIVE;
}
- root = cifs_get_root(volume_info, sb);
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH)
+ root = dget(sb->s_root);
+ else
+ root = cifs_get_root(volume_info, sb);
+
if (IS_ERR(root))
goto out_super;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 7d2b15c06090..7ae03283bd61 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1228,6 +1228,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->ops = &smb1_operations;
vol->vals = &smb1_values;
+ vol->echo_interval = SMB_ECHO_INTERVAL_DEFAULT;
+
if (!mountdata)
goto cifs_parse_mount_err;
@@ -2049,7 +2051,7 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol)
if (!match_security(server, vol))
return 0;
- if (server->echo_interval != vol->echo_interval)
+ if (server->echo_interval != vol->echo_interval * HZ)
return 0;
return 1;
@@ -3483,6 +3485,44 @@ cifs_get_volume_info(char *mount_data, const char *devname)
return volume_info;
}
+static int
+cifs_are_all_path_components_accessible(struct TCP_Server_Info *server,
+ unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ char *full_path)
+{
+ int rc;
+ char *s;
+ char sep, tmp;
+
+ sep = CIFS_DIR_SEP(cifs_sb);
+ s = full_path;
+
+ rc = server->ops->is_path_accessible(xid, tcon, cifs_sb, "");
+ while (rc == 0) {
+ /* skip separators */
+ while (*s == sep)
+ s++;
+ if (!*s)
+ break;
+ /* next separator */
+ while (*s && *s != sep)
+ s++;
+
+ /*
+ * temporarily null-terminate the path at the end of
+ * the current component
+ */
+ tmp = *s;
+ *s = 0;
+ rc = server->ops->is_path_accessible(xid, tcon, cifs_sb,
+ full_path);
+ *s = tmp;
+ }
+ return rc;
+}
+
int
cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
{
@@ -3620,6 +3660,16 @@ remote_path_check:
kfree(full_path);
goto mount_fail_check;
}
+
+ rc = cifs_are_all_path_components_accessible(server,
+ xid, tcon, cifs_sb,
+ full_path);
+ if (rc != 0) {
+ cifs_dbg(VFS, "cannot query dirs between root and final path, "
+ "enabling CIFS_MOUNT_USE_PREFIX_PATH\n");
+ cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
+ rc = 0;
+ }
kfree(full_path);
}
@@ -3889,6 +3939,7 @@ cifs_umount(struct cifs_sb_info *cifs_sb)
bdi_destroy(&cifs_sb->bdi);
kfree(cifs_sb->mountdata);
+ kfree(cifs_sb->prepath);
call_rcu(&cifs_sb->rcu, delayed_free);
}
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index fb0903fffc22..4e532536cbc6 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -84,6 +84,7 @@ build_path_from_dentry(struct dentry *direntry)
struct dentry *temp;
int namelen;
int dfsplen;
+ int pplen = 0;
char *full_path;
char dirsep;
struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb);
@@ -95,8 +96,12 @@ build_path_from_dentry(struct dentry *direntry)
dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
else
dfsplen = 0;
+
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH)
+ pplen = cifs_sb->prepath ? strlen(cifs_sb->prepath) + 1 : 0;
+
cifs_bp_rename_retry:
- namelen = dfsplen;
+ namelen = dfsplen + pplen;
seq = read_seqbegin(&rename_lock);
rcu_read_lock();
for (temp = direntry; !IS_ROOT(temp);) {
@@ -137,7 +142,7 @@ cifs_bp_rename_retry:
}
}
rcu_read_unlock();
- if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) {
+ if (namelen != dfsplen + pplen || read_seqretry(&rename_lock, seq)) {
cifs_dbg(FYI, "did not end path lookup where expected. namelen=%ddfsplen=%d\n",
namelen, dfsplen);
/* presumably this is only possible if racing with a rename
@@ -153,6 +158,17 @@ cifs_bp_rename_retry:
those safely to '/' if any are found in the middle of the prepath */
/* BB test paths to Windows with '/' in the midst of prepath */
+ if (pplen) {
+ int i;
+
+ cifs_dbg(FYI, "using cifs_sb prepath <%s>\n", cifs_sb->prepath);
+ memcpy(full_path+dfsplen+1, cifs_sb->prepath, pplen-1);
+ full_path[dfsplen] = '\\';
+ for (i = 0; i < pplen-1; i++)
+ if (full_path[dfsplen+1+i] == '/')
+ full_path[dfsplen+1+i] = CIFS_DIR_SEP(cifs_sb);
+ }
+
if (dfsplen) {
strncpy(full_path, tcon->treeName, dfsplen);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) {
@@ -229,6 +245,13 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
goto cifs_create_get_file_info;
}
+ if (S_ISDIR(newinode->i_mode)) {
+ CIFSSMBClose(xid, tcon, fid->netfid);
+ iput(newinode);
+ rc = -EISDIR;
+ goto out;
+ }
+
if (!S_ISREG(newinode->i_mode)) {
/*
* The server may allow us to open things like
@@ -399,10 +422,14 @@ cifs_create_set_dentry:
if (rc != 0) {
cifs_dbg(FYI, "Create worked, get_inode_info failed rc = %d\n",
rc);
- if (server->ops->close)
- server->ops->close(xid, tcon, fid);
- goto out;
+ goto out_err;
}
+
+ if (S_ISDIR(newinode->i_mode)) {
+ rc = -EISDIR;
+ goto out_err;
+ }
+
d_drop(direntry);
d_add(direntry, newinode);
@@ -410,6 +437,13 @@ out:
kfree(buf);
kfree(full_path);
return rc;
+
+out_err:
+ if (server->ops->close)
+ server->ops->close(xid, tcon, fid);
+ if (newinode)
+ iput(newinode);
+ goto out;
}
int
@@ -856,7 +890,7 @@ static int cifs_ci_hash(const struct dentry *dentry, struct qstr *q)
wchar_t c;
int i, charlen;
- hash = init_name_hash();
+ hash = init_name_hash(dentry);
for (i = 0; i < q->len; i += charlen) {
charlen = codepage->char2uni(&q->name[i], q->len - i, &c);
/* error out if we can't convert the character */
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 514dadb0575d..b87efd0c92d6 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1002,10 +1002,26 @@ struct inode *cifs_root_iget(struct super_block *sb)
struct inode *inode = NULL;
long rc;
struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
+ char *path = NULL;
+ int len;
+
+ if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH)
+ && cifs_sb->prepath) {
+ len = strlen(cifs_sb->prepath);
+ path = kzalloc(len + 2 /* leading sep + null */, GFP_KERNEL);
+ if (path == NULL)
+ return ERR_PTR(-ENOMEM);
+ path[0] = '/';
+ memcpy(path+1, cifs_sb->prepath, len);
+ } else {
+ path = kstrdup("", GFP_KERNEL);
+ if (path == NULL)
+ return ERR_PTR(-ENOMEM);
+ }
xid = get_xid();
if (tcon->unix_ext) {
- rc = cifs_get_inode_info_unix(&inode, "", sb, xid);
+ rc = cifs_get_inode_info_unix(&inode, path, sb, xid);
/* some servers mistakenly claim POSIX support */
if (rc != -EOPNOTSUPP)
goto iget_no_retry;
@@ -1013,7 +1029,8 @@ struct inode *cifs_root_iget(struct super_block *sb)
tcon->unix_ext = false;
}
- rc = cifs_get_inode_info(&inode, "", NULL, sb, xid, NULL);
+ convert_delimiter(path, CIFS_DIR_SEP(cifs_sb));
+ rc = cifs_get_inode_info(&inode, path, NULL, sb, xid, NULL);
iget_no_retry:
if (!inode) {
@@ -1042,6 +1059,7 @@ iget_no_retry:
}
out:
+ kfree(path);
/* can not call macro free_xid here since in a void func
* TODO: This is no longer true
*/
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 3525ed756173..d203c0329626 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -1044,6 +1044,9 @@ smb2_new_lease_key(struct cifs_fid *fid)
get_random_bytes(fid->lease_key, SMB2_LEASE_KEY_SIZE);
}
+#define SMB2_SYMLINK_STRUCT_SIZE \
+ (sizeof(struct smb2_err_rsp) - 1 + sizeof(struct smb2_symlink_err_rsp))
+
static int
smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
const char *full_path, char **target_path,
@@ -1056,7 +1059,10 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_fid fid;
struct smb2_err_rsp *err_buf = NULL;
struct smb2_symlink_err_rsp *symlink;
- unsigned int sub_len, sub_offset;
+ unsigned int sub_len;
+ unsigned int sub_offset;
+ unsigned int print_len;
+ unsigned int print_offset;
cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
@@ -1077,11 +1083,33 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
kfree(utf16_path);
return -ENOENT;
}
+
+ if (le32_to_cpu(err_buf->ByteCount) < sizeof(struct smb2_symlink_err_rsp) ||
+ get_rfc1002_length(err_buf) + 4 < SMB2_SYMLINK_STRUCT_SIZE) {
+ kfree(utf16_path);
+ return -ENOENT;
+ }
+
/* open must fail on symlink - reset rc */
rc = 0;
symlink = (struct smb2_symlink_err_rsp *)err_buf->ErrorData;
sub_len = le16_to_cpu(symlink->SubstituteNameLength);
sub_offset = le16_to_cpu(symlink->SubstituteNameOffset);
+ print_len = le16_to_cpu(symlink->PrintNameLength);
+ print_offset = le16_to_cpu(symlink->PrintNameOffset);
+
+ if (get_rfc1002_length(err_buf) + 4 <
+ SMB2_SYMLINK_STRUCT_SIZE + sub_offset + sub_len) {
+ kfree(utf16_path);
+ return -ENOENT;
+ }
+
+ if (get_rfc1002_length(err_buf) + 4 <
+ SMB2_SYMLINK_STRUCT_SIZE + print_offset + print_len) {
+ kfree(utf16_path);
+ return -ENOENT;
+ }
+
*target_path = cifs_strndup_from_utf16(
(char *)symlink->PathBuffer + sub_offset,
sub_len, true, cifs_sb->local_nls);
@@ -1515,6 +1543,8 @@ struct smb_version_operations smb20_operations = {
.rename = smb2_rename_path,
.create_hardlink = smb2_create_hardlink,
.query_symlink = smb2_query_symlink,
+ .query_mf_symlink = smb3_query_mf_symlink,
+ .create_mf_symlink = smb3_create_mf_symlink,
.open = smb2_open_file,
.set_fid = smb2_set_fid,
.close = smb2_close_file,
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index f36a4040afb8..b0b9cda41928 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -35,7 +35,6 @@ const struct inode_operations coda_ioctl_inode_operations = {
};
const struct file_operations coda_ioctl_operations = {
- .owner = THIS_MODULE,
.unlocked_ioctl = coda_pioctl,
.llseek = noop_llseek,
};
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index bbc1252a59f5..c30cf49b69d2 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -80,11 +80,11 @@ static int fill_read_buffer(struct dentry * dentry, struct configfs_buffer * buf
count = attr->show(item, buffer->page);
- buffer->needs_read_fill = 0;
BUG_ON(count > (ssize_t)SIMPLE_ATTR_SIZE);
- if (count >= 0)
+ if (count >= 0) {
+ buffer->needs_read_fill = 0;
buffer->count = count;
- else
+ } else
ret = count;
return ret;
}
diff --git a/fs/dax.c b/fs/dax.c
index 432b9e6dd63b..993dc6fe0416 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -75,13 +75,13 @@ static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
struct request_queue *q = bdev->bd_queue;
long rc = -EIO;
- dax->addr = (void __pmem *) ERR_PTR(-EIO);
+ dax->addr = ERR_PTR(-EIO);
if (blk_queue_enter(q, true) != 0)
return rc;
rc = bdev_direct_access(bdev, dax);
if (rc < 0) {
- dax->addr = (void __pmem *) ERR_PTR(rc);
+ dax->addr = ERR_PTR(rc);
blk_queue_exit(q);
return rc;
}
@@ -147,12 +147,12 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
struct buffer_head *bh)
{
loff_t pos = start, max = start, bh_max = start;
- bool hole = false, need_wmb = false;
+ bool hole = false;
struct block_device *bdev = NULL;
int rw = iov_iter_rw(iter), rc;
long map_len = 0;
struct blk_dax_ctl dax = {
- .addr = (void __pmem *) ERR_PTR(-EIO),
+ .addr = ERR_PTR(-EIO),
};
unsigned blkbits = inode->i_blkbits;
sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1)
@@ -218,7 +218,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
if (iov_iter_rw(iter) == WRITE) {
len = copy_from_iter_pmem(dax.addr, max - pos, iter);
- need_wmb = true;
} else if (!hole)
len = copy_to_iter((void __force *) dax.addr, max - pos,
iter);
@@ -235,8 +234,6 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
dax.addr += len;
}
- if (need_wmb)
- wmb_pmem();
dax_unmap_atomic(bdev, &dax);
return (pos == start) ? rc : pos - start;
@@ -788,7 +785,6 @@ int dax_writeback_mapping_range(struct address_space *mapping,
return ret;
}
}
- wmb_pmem();
return 0;
}
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
@@ -1187,7 +1183,6 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
if (dax_map_atomic(bdev, &dax) < 0)
return PTR_ERR(dax.addr);
clear_pmem(dax.addr + offset, length);
- wmb_pmem();
dax_unmap_atomic(bdev, &dax);
}
return 0;
diff --git a/fs/dcache.c b/fs/dcache.c
index d6847d7b123d..b90cf8e09d5b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -104,11 +104,9 @@ static unsigned int d_hash_shift __read_mostly;
static struct hlist_bl_head *dentry_hashtable __read_mostly;
-static inline struct hlist_bl_head *d_hash(const struct dentry *parent,
- unsigned int hash)
+static inline struct hlist_bl_head *d_hash(unsigned int hash)
{
- hash += (unsigned long) parent / L1_CACHE_BYTES;
- return dentry_hashtable + hash_32(hash, d_hash_shift);
+ return dentry_hashtable + (hash >> (32 - d_hash_shift));
}
#define IN_LOOKUP_SHIFT 10
@@ -226,10 +224,9 @@ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char
static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount)
{
- const unsigned char *cs;
/*
* Be careful about RCU walk racing with rename:
- * use ACCESS_ONCE to fetch the name pointer.
+ * use 'lockless_dereference' to fetch the name pointer.
*
* NOTE! Even if a rename will mean that the length
* was not loaded atomically, we don't care. The
@@ -243,8 +240,8 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c
* early because the data cannot match (there can
* be no NUL in the ct/tcount data)
*/
- cs = ACCESS_ONCE(dentry->d_name.name);
- smp_read_barrier_depends();
+ const unsigned char *cs = lockless_dereference(dentry->d_name.name);
+
return dentry_string_cmp(cs, ct, tcount);
}
@@ -335,44 +332,21 @@ static inline void dentry_rcuwalk_invalidate(struct dentry *dentry)
/*
* Release the dentry's inode, using the filesystem
- * d_iput() operation if defined. Dentry has no refcount
- * and is unhashed.
- */
-static void dentry_iput(struct dentry * dentry)
- __releases(dentry->d_lock)
- __releases(dentry->d_inode->i_lock)
-{
- struct inode *inode = dentry->d_inode;
- if (inode) {
- __d_clear_type_and_inode(dentry);
- hlist_del_init(&dentry->d_u.d_alias);
- spin_unlock(&dentry->d_lock);
- spin_unlock(&inode->i_lock);
- if (!inode->i_nlink)
- fsnotify_inoderemove(inode);
- if (dentry->d_op && dentry->d_op->d_iput)
- dentry->d_op->d_iput(dentry, inode);
- else
- iput(inode);
- } else {
- spin_unlock(&dentry->d_lock);
- }
-}
-
-/*
- * Release the dentry's inode, using the filesystem
- * d_iput() operation if defined. dentry remains in-use.
+ * d_iput() operation if defined.
*/
static void dentry_unlink_inode(struct dentry * dentry)
__releases(dentry->d_lock)
__releases(dentry->d_inode->i_lock)
{
struct inode *inode = dentry->d_inode;
+ bool hashed = !d_unhashed(dentry);
- raw_write_seqcount_begin(&dentry->d_seq);
+ if (hashed)
+ raw_write_seqcount_begin(&dentry->d_seq);
__d_clear_type_and_inode(dentry);
hlist_del_init(&dentry->d_u.d_alias);
- raw_write_seqcount_end(&dentry->d_seq);
+ if (hashed)
+ raw_write_seqcount_end(&dentry->d_seq);
spin_unlock(&dentry->d_lock);
spin_unlock(&inode->i_lock);
if (!inode->i_nlink)
@@ -488,7 +462,7 @@ void __d_drop(struct dentry *dentry)
if (unlikely(IS_ROOT(dentry)))
b = &dentry->d_sb->s_anon;
else
- b = d_hash(dentry->d_parent, dentry->d_name.hash);
+ b = d_hash(dentry->d_name.hash);
hlist_bl_lock(b);
__hlist_bl_del(&dentry->d_hash);
@@ -573,12 +547,10 @@ static void __dentry_kill(struct dentry *dentry)
dentry_unlist(dentry, parent);
if (parent)
spin_unlock(&parent->d_lock);
- dentry_iput(dentry);
- /*
- * dentry_iput drops the locks, at which point nobody (except
- * transient RCU lookups) can reach this dentry.
- */
- BUG_ON(dentry->d_lockref.count > 0);
+ if (dentry->d_inode)
+ dentry_unlink_inode(dentry);
+ else
+ spin_unlock(&dentry->d_lock);
this_cpu_dec(nr_dentry);
if (dentry->d_op && dentry->d_op->d_release)
dentry->d_op->d_release(dentry);
@@ -622,7 +594,6 @@ static struct dentry *dentry_kill(struct dentry *dentry)
failed:
spin_unlock(&dentry->d_lock);
- cpu_relax();
return dentry; /* try again with same dentry */
}
@@ -796,6 +767,8 @@ void dput(struct dentry *dentry)
return;
repeat:
+ might_sleep();
+
rcu_read_lock();
if (likely(fast_dput(dentry))) {
rcu_read_unlock();
@@ -829,8 +802,10 @@ repeat:
kill_it:
dentry = dentry_kill(dentry);
- if (dentry)
+ if (dentry) {
+ cond_resched();
goto repeat;
+ }
}
EXPORT_SYMBOL(dput);
@@ -1595,6 +1570,7 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
{
struct dentry *dentry;
char *dname;
+ int err;
dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL);
if (!dentry)
@@ -1653,6 +1629,16 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
INIT_LIST_HEAD(&dentry->d_child);
d_set_d_op(dentry, dentry->d_sb->s_d_op);
+ if (dentry->d_op && dentry->d_op->d_init) {
+ err = dentry->d_op->d_init(dentry);
+ if (err) {
+ if (dname_external(dentry))
+ kfree(external_name(dentry));
+ kmem_cache_free(dentry_cache, dentry);
+ return NULL;
+ }
+ }
+
this_cpu_inc(nr_dentry);
return dentry;
@@ -1716,7 +1702,7 @@ struct dentry *d_alloc_name(struct dentry *parent, const char *name)
struct qstr q;
q.name = name;
- q.hash_len = hashlen_string(name);
+ q.hash_len = hashlen_string(parent, name);
return d_alloc(parent, &q);
}
EXPORT_SYMBOL(d_alloc_name);
@@ -1729,7 +1715,6 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
DCACHE_OP_REVALIDATE |
DCACHE_OP_WEAK_REVALIDATE |
DCACHE_OP_DELETE |
- DCACHE_OP_SELECT_INODE |
DCACHE_OP_REAL));
dentry->d_op = op;
if (!op)
@@ -1746,8 +1731,6 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
dentry->d_flags |= DCACHE_OP_DELETE;
if (op->d_prune)
dentry->d_flags |= DCACHE_OP_PRUNE;
- if (op->d_select_inode)
- dentry->d_flags |= DCACHE_OP_SELECT_INODE;
if (op->d_real)
dentry->d_flags |= DCACHE_OP_REAL;
@@ -1815,7 +1798,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
raw_write_seqcount_begin(&dentry->d_seq);
__d_set_inode_and_type(dentry, inode, add_flags);
raw_write_seqcount_end(&dentry->d_seq);
- __fsnotify_d_instantiate(dentry);
+ fsnotify_update_flags(dentry);
spin_unlock(&dentry->d_lock);
}
@@ -2067,42 +2050,19 @@ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode,
}
EXPORT_SYMBOL(d_add_ci);
-/*
- * Do the slow-case of the dentry name compare.
- *
- * Unlike the dentry_cmp() function, we need to atomically
- * load the name and length information, so that the
- * filesystem can rely on them, and can use the 'name' and
- * 'len' information without worrying about walking off the
- * end of memory etc.
- *
- * Thus the read_seqcount_retry() and the "duplicate" info
- * in arguments (the low-level filesystem should not look
- * at the dentry inode or name contents directly, since
- * rename can change them while we're in RCU mode).
- */
-enum slow_d_compare {
- D_COMP_OK,
- D_COMP_NOMATCH,
- D_COMP_SEQRETRY,
-};
-static noinline enum slow_d_compare slow_dentry_cmp(
- const struct dentry *parent,
- struct dentry *dentry,
- unsigned int seq,
- const struct qstr *name)
+static inline bool d_same_name(const struct dentry *dentry,
+ const struct dentry *parent,
+ const struct qstr *name)
{
- int tlen = dentry->d_name.len;
- const char *tname = dentry->d_name.name;
-
- if (read_seqcount_retry(&dentry->d_seq, seq)) {
- cpu_relax();
- return D_COMP_SEQRETRY;
+ if (likely(!(parent->d_flags & DCACHE_OP_COMPARE))) {
+ if (dentry->d_name.len != name->len)
+ return false;
+ return dentry_cmp(dentry, name->name, name->len) == 0;
}
- if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
- return D_COMP_NOMATCH;
- return D_COMP_OK;
+ return parent->d_op->d_compare(parent, dentry,
+ dentry->d_name.len, dentry->d_name.name,
+ name) == 0;
}
/**
@@ -2140,7 +2100,7 @@ struct dentry *__d_lookup_rcu(const struct dentry *parent,
{
u64 hashlen = name->hash_len;
const unsigned char *str = name->name;
- struct hlist_bl_head *b = d_hash(parent, hashlen_hash(hashlen));
+ struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen));
struct hlist_bl_node *node;
struct dentry *dentry;
@@ -2181,6 +2141,9 @@ seqretry:
* dentry compare, we will do seqretries until it is stable,
* and if we end up with a successful lookup, we actually
* want to exit RCU lookup anyway.
+ *
+ * Note that raw_seqcount_begin still *does* smp_rmb(), so
+ * we are still guaranteed NUL-termination of ->d_name.name.
*/
seq = raw_seqcount_begin(&dentry->d_seq);
if (dentry->d_parent != parent)
@@ -2189,24 +2152,28 @@ seqretry:
continue;
if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) {
+ int tlen;
+ const char *tname;
if (dentry->d_name.hash != hashlen_hash(hashlen))
continue;
- *seqp = seq;
- switch (slow_dentry_cmp(parent, dentry, seq, name)) {
- case D_COMP_OK:
- return dentry;
- case D_COMP_NOMATCH:
- continue;
- default:
+ tlen = dentry->d_name.len;
+ tname = dentry->d_name.name;
+ /* we want a consistent (name,len) pair */
+ if (read_seqcount_retry(&dentry->d_seq, seq)) {
+ cpu_relax();
goto seqretry;
}
+ if (parent->d_op->d_compare(parent, dentry,
+ tlen, tname, name) != 0)
+ continue;
+ } else {
+ if (dentry->d_name.hash_len != hashlen)
+ continue;
+ if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0)
+ continue;
}
-
- if (dentry->d_name.hash_len != hashlen)
- continue;
*seqp = seq;
- if (!dentry_cmp(dentry, str, hashlen_len(hashlen)))
- return dentry;
+ return dentry;
}
return NULL;
}
@@ -2254,10 +2221,8 @@ EXPORT_SYMBOL(d_lookup);
*/
struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
{
- unsigned int len = name->len;
unsigned int hash = name->hash;
- const unsigned char *str = name->name;
- struct hlist_bl_head *b = d_hash(parent, hash);
+ struct hlist_bl_head *b = d_hash(hash);
struct hlist_bl_node *node;
struct dentry *found = NULL;
struct dentry *dentry;
@@ -2295,21 +2260,8 @@ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name)
if (d_unhashed(dentry))
goto next;
- /*
- * It is safe to compare names since d_move() cannot
- * change the qstr (protected by d_lock).
- */
- if (parent->d_flags & DCACHE_OP_COMPARE) {
- int tlen = dentry->d_name.len;
- const char *tname = dentry->d_name.name;
- if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
- goto next;
- } else {
- if (dentry->d_name.len != len)
- goto next;
- if (dentry_cmp(dentry, str, len))
- goto next;
- }
+ if (!d_same_name(dentry, parent, name))
+ goto next;
dentry->d_lockref.count++;
found = dentry;
@@ -2337,7 +2289,7 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
* calculate the standard hash first, as the d_op->d_hash()
* routine may choose to leave the hash value unchanged.
*/
- name->hash = full_name_hash(name->name, name->len);
+ name->hash = full_name_hash(dir, name->name, name->len);
if (dir->d_flags & DCACHE_OP_HASH) {
int err = dir->d_op->d_hash(dir, name);
if (unlikely(err < 0))
@@ -2410,7 +2362,7 @@ static void __d_rehash(struct dentry * entry, struct hlist_bl_head *b)
static void _d_rehash(struct dentry * entry)
{
- __d_rehash(entry, d_hash(entry->d_parent, entry->d_name.hash));
+ __d_rehash(entry, d_hash(entry->d_name.hash));
}
/**
@@ -2462,9 +2414,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent,
const struct qstr *name,
wait_queue_head_t *wq)
{
- unsigned int len = name->len;
unsigned int hash = name->hash;
- const unsigned char *str = name->name;
struct hlist_bl_head *b = in_lookup_hash(parent, hash);
struct hlist_bl_node *node;
struct dentry *new = d_alloc(parent, name);
@@ -2515,17 +2465,8 @@ retry:
continue;
if (dentry->d_parent != parent)
continue;
- if (parent->d_flags & DCACHE_OP_COMPARE) {
- int tlen = dentry->d_name.len;
- const char *tname = dentry->d_name.name;
- if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
- continue;
- } else {
- if (dentry->d_name.len != len)
- continue;
- if (dentry_cmp(dentry, str, len))
- continue;
- }
+ if (!d_same_name(dentry, parent, name))
+ continue;
hlist_bl_unlock(b);
/* now we can try to grab a reference */
if (!lockref_get_not_dead(&dentry->d_lockref)) {
@@ -2552,17 +2493,8 @@ retry:
goto mismatch;
if (unlikely(d_unhashed(dentry)))
goto mismatch;
- if (parent->d_flags & DCACHE_OP_COMPARE) {
- int tlen = dentry->d_name.len;
- const char *tname = dentry->d_name.name;
- if (parent->d_op->d_compare(parent, dentry, tlen, tname, name))
- goto mismatch;
- } else {
- if (unlikely(dentry->d_name.len != len))
- goto mismatch;
- if (unlikely(dentry_cmp(dentry, str, len)))
- goto mismatch;
- }
+ if (unlikely(!d_same_name(dentry, parent, name)))
+ goto mismatch;
/* OK, it *is* a hashed match; return it */
spin_unlock(&dentry->d_lock);
dput(new);
@@ -2615,7 +2547,7 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode)
raw_write_seqcount_begin(&dentry->d_seq);
__d_set_inode_and_type(dentry, inode, add_flags);
raw_write_seqcount_end(&dentry->d_seq);
- __fsnotify_d_instantiate(dentry);
+ fsnotify_update_flags(dentry);
}
_d_rehash(dentry);
if (dir)
@@ -2658,8 +2590,6 @@ EXPORT_SYMBOL(d_add);
struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
{
struct dentry *alias;
- int len = entry->d_name.len;
- const char *name = entry->d_name.name;
unsigned int hash = entry->d_name.hash;
spin_lock(&inode->i_lock);
@@ -2673,9 +2603,7 @@ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
continue;
if (alias->d_parent != entry->d_parent)
continue;
- if (alias->d_name.len != len)
- continue;
- if (dentry_cmp(alias, name, len))
+ if (!d_same_name(alias, entry->d_parent, &entry->d_name))
continue;
spin_lock(&alias->d_lock);
if (!d_unhashed(alias)) {
@@ -2874,7 +2802,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
* for the same hash queue because of how unlikely it is.
*/
__d_drop(dentry);
- __d_rehash(dentry, d_hash(target->d_parent, target->d_name.hash));
+ __d_rehash(dentry, d_hash(target->d_name.hash));
/*
* Unhash the target (d_delete() is not usable here). If exchanging
@@ -2882,8 +2810,7 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
*/
__d_drop(target);
if (exchange) {
- __d_rehash(target,
- d_hash(dentry->d_parent, dentry->d_name.hash));
+ __d_rehash(target, d_hash(dentry->d_name.hash));
}
/* Switch the names.. */
@@ -2906,8 +2833,8 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
list_move(&target->d_child, &target->d_parent->d_subdirs);
list_move(&dentry->d_child, &dentry->d_parent->d_subdirs);
if (exchange)
- fsnotify_d_move(target);
- fsnotify_d_move(dentry);
+ fsnotify_update_flags(target);
+ fsnotify_update_flags(dentry);
}
write_seqcount_end(&target->d_seq);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 4bc1f68243c1..72361baf9da7 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -621,9 +621,6 @@ void debugfs_remove(struct dentry *dentry)
return;
parent = dentry->d_parent;
- if (!parent || d_really_is_negative(parent))
- return;
-
inode_lock(d_inode(parent));
ret = __debugfs_remove(dentry, parent);
inode_unlock(d_inode(parent));
@@ -654,10 +651,6 @@ void debugfs_remove_recursive(struct dentry *dentry)
if (IS_ERR_OR_NULL(dentry))
return;
- parent = dentry->d_parent;
- if (!parent || d_really_is_negative(parent))
- return;
-
parent = dentry;
down:
inode_lock(d_inode(parent));
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 37c134a132c7..d116453b0276 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -396,6 +396,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
{
struct inode *inode;
+ s->s_iflags &= ~SB_I_NODEV;
s->s_blocksize = 1024;
s->s_blocksize_bits = 10;
s->s_magic = DEVPTS_SUPER_MAGIC;
@@ -480,7 +481,7 @@ static struct file_system_type devpts_fs_type = {
.name = "devpts",
.mount = devpts_mount,
.kill_sb = devpts_kill_sb,
- .fs_flags = FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT,
+ .fs_flags = FS_USERNS_MOUNT,
};
/*
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 9cb54a38832d..a5e607e8f056 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -65,7 +65,7 @@ static int efivarfs_d_compare(const struct dentry *parent,
static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr)
{
- unsigned long hash = init_name_hash();
+ unsigned long hash = init_name_hash(dentry);
const unsigned char *s = qstr->name;
unsigned int len = qstr->len;
@@ -98,7 +98,7 @@ static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name)
q.name = name;
q.len = strlen(name);
- err = efivarfs_d_hash(NULL, &q);
+ err = efivarfs_d_hash(parent, &q);
if (err)
return ERR_PTR(err);
diff --git a/fs/exec.c b/fs/exec.c
index 887c1c955df8..6fcfb3f7b137 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -762,6 +762,39 @@ out_unlock:
}
EXPORT_SYMBOL(setup_arg_pages);
+#else
+
+/*
+ * Transfer the program arguments and environment from the holding pages
+ * onto the stack. The provided stack pointer is adjusted accordingly.
+ */
+int transfer_args_to_stack(struct linux_binprm *bprm,
+ unsigned long *sp_location)
+{
+ unsigned long index, stop, sp;
+ int ret = 0;
+
+ stop = bprm->p >> PAGE_SHIFT;
+ sp = *sp_location;
+
+ for (index = MAX_ARG_PAGES - 1; index >= stop; index--) {
+ unsigned int offset = index == stop ? bprm->p & ~PAGE_MASK : 0;
+ char *src = kmap(bprm->page[index]) + offset;
+ sp -= PAGE_SIZE - offset;
+ if (copy_to_user((void *) sp, src, PAGE_SIZE - offset) != 0)
+ ret = -EFAULT;
+ kunmap(bprm->page[index]);
+ if (ret)
+ goto out;
+ }
+
+ *sp_location = sp;
+
+out:
+ return ret;
+}
+EXPORT_SYMBOL(transfer_args_to_stack);
+
#endif /* CONFIG_MMU */
static struct file *do_open_execat(int fd, struct filename *name, int flags)
@@ -866,7 +899,8 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size,
goto out;
}
- *buf = vmalloc(i_size);
+ if (id != READING_FIRMWARE_PREALLOC_BUFFER)
+ *buf = vmalloc(i_size);
if (!*buf) {
ret = -ENOMEM;
goto out;
@@ -897,8 +931,10 @@ int kernel_read_file(struct file *file, void **buf, loff_t *size,
out_free:
if (ret < 0) {
- vfree(*buf);
- *buf = NULL;
+ if (id != READING_FIRMWARE_PREALLOC_BUFFER) {
+ vfree(*buf);
+ *buf = NULL;
+ }
}
out:
@@ -1411,7 +1447,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm)
bprm->cred->euid = current_euid();
bprm->cred->egid = current_egid();
- if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)
+ if (!mnt_may_suid(bprm->file->f_path.mnt))
return;
if (task_no_new_privs(current))
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 11562161e24a..f418f55c2bbe 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2350,7 +2350,6 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
}
const struct file_operations ext4_seq_mb_groups_fops = {
- .owner = THIS_MODULE,
.open = ext4_mb_seq_groups_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 1420a3c614af..73bcfd41f5f2 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -359,7 +359,6 @@ static int name##_open(struct inode *inode, struct file *file) \
} \
\
static const struct file_operations ext4_seq_##name##_fops = { \
- .owner = THIS_MODULE, \
.open = name##_open, \
.read = seq_read, \
.llseek = seq_lseek, \
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index e2624275d828..d64d2a515cb2 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -245,7 +245,6 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
bio_put(bio);
return -EFAULT;
}
- bio->bi_rw = fio->op_flags;
bio_set_op_attrs(bio, fio->op, fio->op_flags);
__submit_bio(fio->sbi, bio, fio->type);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index b97c065cbe74..1b86d3f638ef 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -961,7 +961,6 @@ static int _name##_open_fs(struct inode *inode, struct file *file) \
} \
\
static const struct file_operations f2fs_seq_##_name##_fops = { \
- .owner = THIS_MODULE, \
.open = _name##_open_fs, \
.read = seq_read, \
.llseek = seq_lseek, \
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 3bcf57925dca..da04c0298fab 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -1589,7 +1589,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
/*
* GFP_KERNEL is ok here, because while we do hold the
- * supeblock lock, memory pressure can't call back into
+ * superblock lock, memory pressure can't call back into
* the filesystem, since we're only just about to mount
* it and have no inodes etc active!
*/
@@ -1726,7 +1726,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
sbi->dir_entries = bpb.fat_dir_entries;
if (sbi->dir_entries & (sbi->dir_per_block - 1)) {
if (!silent)
- fat_msg(sb, KERN_ERR, "bogus directory-entries per block"
+ fat_msg(sb, KERN_ERR, "bogus number of directory entries"
" (%u)", sbi->dir_entries);
goto out_invalid;
}
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index b7e2b33aa793..1337c0c7527d 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -154,7 +154,7 @@ static int msdos_hash(const struct dentry *dentry, struct qstr *qstr)
error = msdos_format_name(qstr->name, qstr->len, msdos_name, options);
if (!error)
- qstr->hash = full_name_hash(msdos_name, MSDOS_NAME);
+ qstr->hash = full_name_hash(dentry, msdos_name, MSDOS_NAME);
return 0;
}
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index 7092584f424a..6ccdf3f34f90 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -107,7 +107,7 @@ static unsigned int vfat_striptail_len(const struct qstr *qstr)
*/
static int vfat_hash(const struct dentry *dentry, struct qstr *qstr)
{
- qstr->hash = full_name_hash(qstr->name, vfat_striptail_len(qstr));
+ qstr->hash = full_name_hash(dentry, qstr->name, vfat_striptail_len(qstr));
return 0;
}
@@ -127,7 +127,7 @@ static int vfat_hashi(const struct dentry *dentry, struct qstr *qstr)
name = qstr->name;
len = vfat_striptail_len(qstr);
- hash = init_name_hash();
+ hash = init_name_hash(dentry);
while (len--)
hash = partial_name_hash(nls_tolower(t, *name++), hash);
qstr->hash = end_name_hash(hash);
diff --git a/fs/freevxfs/Kconfig b/fs/freevxfs/Kconfig
index 8dc1cd5c1efe..ce49df1020dd 100644
--- a/fs/freevxfs/Kconfig
+++ b/fs/freevxfs/Kconfig
@@ -5,12 +5,21 @@ config VXFS_FS
FreeVxFS is a file system driver that support the VERITAS VxFS(TM)
file system format. VERITAS VxFS(TM) is the standard file system
of SCO UnixWare (and possibly others) and optionally available
- for Sunsoft Solaris, HP-UX and many other operating systems.
- Currently only readonly access is supported.
+ for Sunsoft Solaris, HP-UX and many other operating systems. However
+ these particular OS implementations of vxfs may differ in on-disk
+ data endianess and/or superblock offset. The vxfs module has been
+ tested with SCO UnixWare and HP-UX B.10.20 (pa-risc 1.1 arch.)
+ Currently only readonly access is supported and VxFX versions
+ 2, 3 and 4. Tests were performed with HP-UX VxFS version 3.
NOTE: the file system type as used by mount(1), mount(2) and
fstab(5) is 'vxfs' as it describes the file system format, not
the actual driver.
+ There is a userspace utility for HP-UX logical volumes which makes
+ creating HP-UX logical volumes easy from HP-UX disk block device file
+ or regular file with image of the disk. See:
+ https://sourceforge.net/projects/linux-vxfs/
+
To compile this as a module, choose M here: the module will be
called freevxfs. If unsure, say N.
diff --git a/fs/freevxfs/vxfs.h b/fs/freevxfs/vxfs.h
index c8a92652612a..a41ea0ba6943 100644
--- a/fs/freevxfs/vxfs.h
+++ b/fs/freevxfs/vxfs.h
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
+ * Copyright (c) 2016 Krzysztof Blaszkowski
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -38,13 +39,6 @@
*/
#include <linux/types.h>
-
-/*
- * Data types for use with the VxFS ondisk format.
- */
-typedef int32_t vx_daddr_t;
-typedef int32_t vx_ino_t;
-
/*
* Superblock magic number (vxfs_super->vs_magic).
*/
@@ -60,6 +54,14 @@ typedef int32_t vx_ino_t;
*/
#define VXFS_NEFREE 32
+enum vxfs_byte_order {
+ VXFS_BO_LE,
+ VXFS_BO_BE,
+};
+
+typedef __u16 __bitwise __fs16;
+typedef __u32 __bitwise __fs32;
+typedef __u64 __bitwise __fs64;
/*
* VxFS superblock (disk).
@@ -71,83 +73,83 @@ struct vxfs_sb {
* Lots of this fields are no more used by version 2
* and never filesystems.
*/
- u_int32_t vs_magic; /* Magic number */
- int32_t vs_version; /* VxFS version */
- u_int32_t vs_ctime; /* create time - secs */
- u_int32_t vs_cutime; /* create time - usecs */
- int32_t __unused1; /* unused */
- int32_t __unused2; /* unused */
- vx_daddr_t vs_old_logstart; /* obsolete */
- vx_daddr_t vs_old_logend; /* obsolete */
- int32_t vs_bsize; /* block size */
- int32_t vs_size; /* number of blocks */
- int32_t vs_dsize; /* number of data blocks */
- u_int32_t vs_old_ninode; /* obsolete */
- int32_t vs_old_nau; /* obsolete */
- int32_t __unused3; /* unused */
- int32_t vs_old_defiextsize; /* obsolete */
- int32_t vs_old_ilbsize; /* obsolete */
- int32_t vs_immedlen; /* size of immediate data area */
- int32_t vs_ndaddr; /* number of direct extentes */
- vx_daddr_t vs_firstau; /* address of first AU */
- vx_daddr_t vs_emap; /* offset of extent map in AU */
- vx_daddr_t vs_imap; /* offset of inode map in AU */
- vx_daddr_t vs_iextop; /* offset of ExtOp. map in AU */
- vx_daddr_t vs_istart; /* offset of inode list in AU */
- vx_daddr_t vs_bstart; /* offset of fdblock in AU */
- vx_daddr_t vs_femap; /* aufirst + emap */
- vx_daddr_t vs_fimap; /* aufirst + imap */
- vx_daddr_t vs_fiextop; /* aufirst + iextop */
- vx_daddr_t vs_fistart; /* aufirst + istart */
- vx_daddr_t vs_fbstart; /* aufirst + bstart */
- int32_t vs_nindir; /* number of entries in indir */
- int32_t vs_aulen; /* length of AU in blocks */
- int32_t vs_auimlen; /* length of imap in blocks */
- int32_t vs_auemlen; /* length of emap in blocks */
- int32_t vs_auilen; /* length of ilist in blocks */
- int32_t vs_aupad; /* length of pad in blocks */
- int32_t vs_aublocks; /* data blocks in AU */
- int32_t vs_maxtier; /* log base 2 of aublocks */
- int32_t vs_inopb; /* number of inodes per blk */
- int32_t vs_old_inopau; /* obsolete */
- int32_t vs_old_inopilb; /* obsolete */
- int32_t vs_old_ndiripau; /* obsolete */
- int32_t vs_iaddrlen; /* size of indirect addr ext. */
- int32_t vs_bshift; /* log base 2 of bsize */
- int32_t vs_inoshift; /* log base 2 of inobp */
- int32_t vs_bmask; /* ~( bsize - 1 ) */
- int32_t vs_boffmask; /* bsize - 1 */
- int32_t vs_old_inomask; /* old_inopilb - 1 */
- int32_t vs_checksum; /* checksum of V1 data */
+ __fs32 vs_magic; /* Magic number */
+ __fs32 vs_version; /* VxFS version */
+ __fs32 vs_ctime; /* create time - secs */
+ __fs32 vs_cutime; /* create time - usecs */
+ __fs32 __unused1; /* unused */
+ __fs32 __unused2; /* unused */
+ __fs32 vs_old_logstart; /* obsolete */
+ __fs32 vs_old_logend; /* obsolete */
+ __fs32 vs_bsize; /* block size */
+ __fs32 vs_size; /* number of blocks */
+ __fs32 vs_dsize; /* number of data blocks */
+ __fs32 vs_old_ninode; /* obsolete */
+ __fs32 vs_old_nau; /* obsolete */
+ __fs32 __unused3; /* unused */
+ __fs32 vs_old_defiextsize; /* obsolete */
+ __fs32 vs_old_ilbsize; /* obsolete */
+ __fs32 vs_immedlen; /* size of immediate data area */
+ __fs32 vs_ndaddr; /* number of direct extentes */
+ __fs32 vs_firstau; /* address of first AU */
+ __fs32 vs_emap; /* offset of extent map in AU */
+ __fs32 vs_imap; /* offset of inode map in AU */
+ __fs32 vs_iextop; /* offset of ExtOp. map in AU */
+ __fs32 vs_istart; /* offset of inode list in AU */
+ __fs32 vs_bstart; /* offset of fdblock in AU */
+ __fs32 vs_femap; /* aufirst + emap */
+ __fs32 vs_fimap; /* aufirst + imap */
+ __fs32 vs_fiextop; /* aufirst + iextop */
+ __fs32 vs_fistart; /* aufirst + istart */
+ __fs32 vs_fbstart; /* aufirst + bstart */
+ __fs32 vs_nindir; /* number of entries in indir */
+ __fs32 vs_aulen; /* length of AU in blocks */
+ __fs32 vs_auimlen; /* length of imap in blocks */
+ __fs32 vs_auemlen; /* length of emap in blocks */
+ __fs32 vs_auilen; /* length of ilist in blocks */
+ __fs32 vs_aupad; /* length of pad in blocks */
+ __fs32 vs_aublocks; /* data blocks in AU */
+ __fs32 vs_maxtier; /* log base 2 of aublocks */
+ __fs32 vs_inopb; /* number of inodes per blk */
+ __fs32 vs_old_inopau; /* obsolete */
+ __fs32 vs_old_inopilb; /* obsolete */
+ __fs32 vs_old_ndiripau; /* obsolete */
+ __fs32 vs_iaddrlen; /* size of indirect addr ext. */
+ __fs32 vs_bshift; /* log base 2 of bsize */
+ __fs32 vs_inoshift; /* log base 2 of inobp */
+ __fs32 vs_bmask; /* ~( bsize - 1 ) */
+ __fs32 vs_boffmask; /* bsize - 1 */
+ __fs32 vs_old_inomask; /* old_inopilb - 1 */
+ __fs32 vs_checksum; /* checksum of V1 data */
/*
* Version 1, writable
*/
- int32_t vs_free; /* number of free blocks */
- int32_t vs_ifree; /* number of free inodes */
- int32_t vs_efree[VXFS_NEFREE]; /* number of free extents by size */
- int32_t vs_flags; /* flags ?!? */
- u_int8_t vs_mod; /* filesystem has been changed */
- u_int8_t vs_clean; /* clean FS */
- u_int16_t __unused4; /* unused */
- u_int32_t vs_firstlogid; /* mount time log ID */
- u_int32_t vs_wtime; /* last time written - sec */
- u_int32_t vs_wutime; /* last time written - usec */
- u_int8_t vs_fname[6]; /* FS name */
- u_int8_t vs_fpack[6]; /* FS pack name */
- int32_t vs_logversion; /* log format version */
- int32_t __unused5; /* unused */
+ __fs32 vs_free; /* number of free blocks */
+ __fs32 vs_ifree; /* number of free inodes */
+ __fs32 vs_efree[VXFS_NEFREE]; /* number of free extents by size */
+ __fs32 vs_flags; /* flags ?!? */
+ __u8 vs_mod; /* filesystem has been changed */
+ __u8 vs_clean; /* clean FS */
+ __fs16 __unused4; /* unused */
+ __fs32 vs_firstlogid; /* mount time log ID */
+ __fs32 vs_wtime; /* last time written - sec */
+ __fs32 vs_wutime; /* last time written - usec */
+ __u8 vs_fname[6]; /* FS name */
+ __u8 vs_fpack[6]; /* FS pack name */
+ __fs32 vs_logversion; /* log format version */
+ __u32 __unused5; /* unused */
/*
* Version 2, Read-only
*/
- vx_daddr_t vs_oltext[2]; /* OLT extent and replica */
- int32_t vs_oltsize; /* OLT extent size */
- int32_t vs_iauimlen; /* size of inode map */
- int32_t vs_iausize; /* size of IAU in blocks */
- int32_t vs_dinosize; /* size of inode in bytes */
- int32_t vs_old_dniaddr; /* indir levels per inode */
- int32_t vs_checksum2; /* checksum of V2 RO */
+ __fs32 vs_oltext[2]; /* OLT extent and replica */
+ __fs32 vs_oltsize; /* OLT extent size */
+ __fs32 vs_iauimlen; /* size of inode map */
+ __fs32 vs_iausize; /* size of IAU in blocks */
+ __fs32 vs_dinosize; /* size of inode in bytes */
+ __fs32 vs_old_dniaddr; /* indir levels per inode */
+ __fs32 vs_checksum2; /* checksum of V2 RO */
/*
* Actually much more...
@@ -168,8 +170,32 @@ struct vxfs_sb_info {
ino_t vsi_fshino; /* fileset header inode */
daddr_t vsi_oltext; /* OLT extent */
daddr_t vsi_oltsize; /* OLT size */
+ enum vxfs_byte_order byte_order;
};
+static inline u16 fs16_to_cpu(struct vxfs_sb_info *sbi, __fs16 a)
+{
+ if (sbi->byte_order == VXFS_BO_BE)
+ return be16_to_cpu((__force __be16)a);
+ else
+ return le16_to_cpu((__force __le16)a);
+}
+
+static inline u32 fs32_to_cpu(struct vxfs_sb_info *sbi, __fs32 a)
+{
+ if (sbi->byte_order == VXFS_BO_BE)
+ return be32_to_cpu((__force __be32)a);
+ else
+ return le32_to_cpu((__force __le32)a);
+}
+
+static inline u64 fs64_to_cpu(struct vxfs_sb_info *sbi, __fs64 a)
+{
+ if (sbi->byte_order == VXFS_BO_BE)
+ return be64_to_cpu((__force __be64)a);
+ else
+ return le64_to_cpu((__force __le64)a);
+}
/*
* File modes. File types above 0xf000 are vxfs internal only, they should
@@ -247,13 +273,6 @@ enum {
#define VXFS_ISIMMED(ip) VXFS_IS_ORG((ip), VXFS_ORG_IMMED)
#define VXFS_ISTYPED(ip) VXFS_IS_ORG((ip), VXFS_ORG_TYPED)
-
-/*
- * Get filesystem private data from VFS inode.
- */
-#define VXFS_INO(ip) \
- ((struct vxfs_inode_info *)(ip)->i_private)
-
/*
* Get filesystem private data from VFS superblock.
*/
diff --git a/fs/freevxfs/vxfs_bmap.c b/fs/freevxfs/vxfs_bmap.c
index f86fd3cacd5a..1fd41cf98b9f 100644
--- a/fs/freevxfs/vxfs_bmap.c
+++ b/fs/freevxfs/vxfs_bmap.c
@@ -68,8 +68,9 @@ vxfs_bmap_ext4(struct inode *ip, long bn)
{
struct super_block *sb = ip->i_sb;
struct vxfs_inode_info *vip = VXFS_INO(ip);
+ struct vxfs_sb_info *sbi = VXFS_SBI(sb);
unsigned long bsize = sb->s_blocksize;
- u32 indsize = vip->vii_ext4.ve4_indsize;
+ u32 indsize = fs32_to_cpu(sbi, vip->vii_ext4.ve4_indsize);
int i;
if (indsize > sb->s_blocksize)
@@ -77,22 +78,24 @@ vxfs_bmap_ext4(struct inode *ip, long bn)
for (i = 0; i < VXFS_NDADDR; i++) {
struct direct *d = vip->vii_ext4.ve4_direct + i;
- if (bn >= 0 && bn < d->size)
- return (bn + d->extent);
- bn -= d->size;
+ if (bn >= 0 && bn < fs32_to_cpu(sbi, d->size))
+ return (bn + fs32_to_cpu(sbi, d->extent));
+ bn -= fs32_to_cpu(sbi, d->size);
}
if ((bn / (indsize * indsize * bsize / 4)) == 0) {
struct buffer_head *buf;
daddr_t bno;
- u32 *indir;
+ __fs32 *indir;
- buf = sb_bread(sb, vip->vii_ext4.ve4_indir[0]);
+ buf = sb_bread(sb,
+ fs32_to_cpu(sbi, vip->vii_ext4.ve4_indir[0]));
if (!buf || !buffer_mapped(buf))
goto fail_buf;
- indir = (u32 *)buf->b_data;
- bno = indir[(bn/indsize) % (indsize*bn)] + (bn%indsize);
+ indir = (__fs32 *)buf->b_data;
+ bno = fs32_to_cpu(sbi, indir[(bn / indsize) % (indsize * bn)]) +
+ (bn % indsize);
brelse(buf);
return bno;
@@ -127,6 +130,7 @@ fail_buf:
static daddr_t
vxfs_bmap_indir(struct inode *ip, long indir, int size, long block)
{
+ struct vxfs_sb_info *sbi = VXFS_SBI(ip->i_sb);
struct buffer_head *bp = NULL;
daddr_t pblock = 0;
int i;
@@ -142,24 +146,27 @@ vxfs_bmap_indir(struct inode *ip, long indir, int size, long block)
typ = ((struct vxfs_typed *)bp->b_data) +
(i % VXFS_TYPED_PER_BLOCK(ip->i_sb));
- off = (typ->vt_hdr & VXFS_TYPED_OFFSETMASK);
+ off = fs64_to_cpu(sbi, typ->vt_hdr) & VXFS_TYPED_OFFSETMASK;
if (block < off) {
brelse(bp);
continue;
}
- switch ((u_int32_t)(typ->vt_hdr >> VXFS_TYPED_TYPESHIFT)) {
+ switch ((u_int32_t)(fs64_to_cpu(sbi, typ->vt_hdr) >>
+ VXFS_TYPED_TYPESHIFT)) {
case VXFS_TYPED_INDIRECT:
- pblock = vxfs_bmap_indir(ip, typ->vt_block,
- typ->vt_size, block - off);
+ pblock = vxfs_bmap_indir(ip,
+ fs32_to_cpu(sbi, typ->vt_block),
+ fs32_to_cpu(sbi, typ->vt_size),
+ block - off);
if (pblock == -2)
break;
goto out;
case VXFS_TYPED_DATA:
- if ((block - off) >= typ->vt_size)
+ if ((block - off) >= fs32_to_cpu(sbi, typ->vt_size))
break;
- pblock = (typ->vt_block + block - off);
+ pblock = fs32_to_cpu(sbi, typ->vt_block) + block - off;
goto out;
case VXFS_TYPED_INDIRECT_DEV4:
case VXFS_TYPED_DATA_DEV4: {
@@ -167,13 +174,15 @@ vxfs_bmap_indir(struct inode *ip, long indir, int size, long block)
(struct vxfs_typed_dev4 *)typ;
printk(KERN_INFO "\n\nTYPED_DEV4 detected!\n");
- printk(KERN_INFO "block: %Lu\tsize: %Ld\tdev: %d\n",
- (unsigned long long) typ4->vd4_block,
- (unsigned long long) typ4->vd4_size,
- typ4->vd4_dev);
+ printk(KERN_INFO "block: %llu\tsize: %lld\tdev: %d\n",
+ fs64_to_cpu(sbi, typ4->vd4_block),
+ fs64_to_cpu(sbi, typ4->vd4_size),
+ fs32_to_cpu(sbi, typ4->vd4_dev));
goto fail;
}
default:
+ printk(KERN_ERR "%s:%d vt_hdr %llu\n", __func__,
+ __LINE__, fs64_to_cpu(sbi, typ->vt_hdr));
BUG();
}
brelse(bp);
@@ -201,28 +210,33 @@ static daddr_t
vxfs_bmap_typed(struct inode *ip, long iblock)
{
struct vxfs_inode_info *vip = VXFS_INO(ip);
+ struct vxfs_sb_info *sbi = VXFS_SBI(ip->i_sb);
daddr_t pblock = 0;
int i;
for (i = 0; i < VXFS_NTYPED; i++) {
struct vxfs_typed *typ = vip->vii_org.typed + i;
- int64_t off = (typ->vt_hdr & VXFS_TYPED_OFFSETMASK);
+ u64 hdr = fs64_to_cpu(sbi, typ->vt_hdr);
+ int64_t off = (hdr & VXFS_TYPED_OFFSETMASK);
#ifdef DIAGNOSTIC
vxfs_typdump(typ);
#endif
if (iblock < off)
continue;
- switch ((u_int32_t)(typ->vt_hdr >> VXFS_TYPED_TYPESHIFT)) {
+ switch ((u32)(hdr >> VXFS_TYPED_TYPESHIFT)) {
case VXFS_TYPED_INDIRECT:
- pblock = vxfs_bmap_indir(ip, typ->vt_block,
- typ->vt_size, iblock - off);
+ pblock = vxfs_bmap_indir(ip,
+ fs32_to_cpu(sbi, typ->vt_block),
+ fs32_to_cpu(sbi, typ->vt_size),
+ iblock - off);
if (pblock == -2)
break;
return (pblock);
case VXFS_TYPED_DATA:
- if ((iblock - off) < typ->vt_size)
- return (typ->vt_block + iblock - off);
+ if ((iblock - off) < fs32_to_cpu(sbi, typ->vt_size))
+ return (fs32_to_cpu(sbi, typ->vt_block) +
+ iblock - off);
break;
case VXFS_TYPED_INDIRECT_DEV4:
case VXFS_TYPED_DATA_DEV4: {
@@ -230,10 +244,10 @@ vxfs_bmap_typed(struct inode *ip, long iblock)
(struct vxfs_typed_dev4 *)typ;
printk(KERN_INFO "\n\nTYPED_DEV4 detected!\n");
- printk(KERN_INFO "block: %Lu\tsize: %Ld\tdev: %d\n",
- (unsigned long long) typ4->vd4_block,
- (unsigned long long) typ4->vd4_size,
- typ4->vd4_dev);
+ printk(KERN_INFO "block: %llu\tsize: %lld\tdev: %d\n",
+ fs64_to_cpu(sbi, typ4->vd4_block),
+ fs64_to_cpu(sbi, typ4->vd4_size),
+ fs32_to_cpu(sbi, typ4->vd4_dev));
return 0;
}
default:
diff --git a/fs/freevxfs/vxfs_dir.h b/fs/freevxfs/vxfs_dir.h
index aaf1fb098639..acc5477b3f23 100644
--- a/fs/freevxfs/vxfs_dir.h
+++ b/fs/freevxfs/vxfs_dir.h
@@ -48,9 +48,9 @@
* Linux driver for now.
*/
struct vxfs_dirblk {
- u_int16_t d_free; /* free space in dirblock */
- u_int16_t d_nhash; /* no of hash chains */
- u_int16_t d_hash[1]; /* hash chain */
+ __fs16 d_free; /* free space in dirblock */
+ __fs16 d_nhash; /* no of hash chains */
+ __fs16 d_hash[1]; /* hash chain */
};
/*
@@ -63,10 +63,10 @@ struct vxfs_dirblk {
* VxFS directory entry.
*/
struct vxfs_direct {
- vx_ino_t d_ino; /* inode number */
- u_int16_t d_reclen; /* record length */
- u_int16_t d_namelen; /* d_name length */
- u_int16_t d_hashnext; /* next hash entry */
+ __fs32 d_ino; /* inode number */
+ __fs16 d_reclen; /* record length */
+ __fs16 d_namelen; /* d_name length */
+ __fs16 d_hashnext; /* next hash entry */
char d_name[VXFS_NAMELEN]; /* name */
};
@@ -87,6 +87,7 @@ struct vxfs_direct {
/*
* VXFS_DIRBLKOV is the overhead of a specific dirblock.
*/
-#define VXFS_DIRBLKOV(dbp) ((sizeof(short) * dbp->d_nhash) + 4)
+#define VXFS_DIRBLKOV(sbi, dbp) \
+ ((sizeof(short) * fs16_to_cpu(sbi, dbp->d_nhash)) + 4)
#endif /* _VXFS_DIR_H_ */
diff --git a/fs/freevxfs/vxfs_extern.h b/fs/freevxfs/vxfs_extern.h
index e3dcb4467d92..f5c428e21024 100644
--- a/fs/freevxfs/vxfs_extern.h
+++ b/fs/freevxfs/vxfs_extern.h
@@ -52,14 +52,10 @@ extern int vxfs_read_fshead(struct super_block *);
/* vxfs_inode.c */
extern const struct address_space_operations vxfs_immed_aops;
-extern struct kmem_cache *vxfs_inode_cachep;
extern void vxfs_dumpi(struct vxfs_inode_info *, ino_t);
-extern struct inode * vxfs_get_fake_inode(struct super_block *,
- struct vxfs_inode_info *);
-extern void vxfs_put_fake_inode(struct inode *);
-extern struct vxfs_inode_info * vxfs_blkiget(struct super_block *, u_long, ino_t);
-extern struct vxfs_inode_info * vxfs_stiget(struct super_block *, ino_t);
-extern struct inode * vxfs_iget(struct super_block *, ino_t);
+extern struct inode *vxfs_blkiget(struct super_block *, u_long, ino_t);
+extern struct inode *vxfs_stiget(struct super_block *, ino_t);
+extern struct inode *vxfs_iget(struct super_block *, ino_t);
extern void vxfs_evict_inode(struct inode *);
/* vxfs_lookup.c */
diff --git a/fs/freevxfs/vxfs_fshead.c b/fs/freevxfs/vxfs_fshead.c
index c9a6a94e58e9..a4610a77649e 100644
--- a/fs/freevxfs/vxfs_fshead.c
+++ b/fs/freevxfs/vxfs_fshead.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
+ * Copyright (c) 2016 Krzysztof Blaszkowski
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -108,31 +109,26 @@ vxfs_read_fshead(struct super_block *sbp)
{
struct vxfs_sb_info *infp = VXFS_SBI(sbp);
struct vxfs_fsh *pfp, *sfp;
- struct vxfs_inode_info *vip, *tip;
+ struct vxfs_inode_info *vip;
- vip = vxfs_blkiget(sbp, infp->vsi_iext, infp->vsi_fshino);
- if (!vip) {
+ infp->vsi_fship = vxfs_blkiget(sbp, infp->vsi_iext, infp->vsi_fshino);
+ if (!infp->vsi_fship) {
printk(KERN_ERR "vxfs: unable to read fsh inode\n");
return -EINVAL;
}
+
+ vip = VXFS_INO(infp->vsi_fship);
if (!VXFS_ISFSH(vip)) {
printk(KERN_ERR "vxfs: fsh list inode is of wrong type (%x)\n",
vip->vii_mode & VXFS_TYPE_MASK);
- goto out_free_fship;
+ goto out_iput_fship;
}
-
#ifdef DIAGNOSTIC
printk("vxfs: fsh inode dump:\n");
vxfs_dumpi(vip, infp->vsi_fshino);
#endif
- infp->vsi_fship = vxfs_get_fake_inode(sbp, vip);
- if (!infp->vsi_fship) {
- printk(KERN_ERR "vxfs: unable to get fsh inode\n");
- goto out_free_fship;
- }
-
sfp = vxfs_getfsh(infp->vsi_fship, 0);
if (!sfp) {
printk(KERN_ERR "vxfs: unable to get structural fsh\n");
@@ -153,14 +149,10 @@ vxfs_read_fshead(struct super_block *sbp)
vxfs_dumpfsh(pfp);
#endif
- tip = vxfs_blkiget(sbp, infp->vsi_iext, sfp->fsh_ilistino[0]);
- if (!tip)
- goto out_free_pfp;
-
- infp->vsi_stilist = vxfs_get_fake_inode(sbp, tip);
+ infp->vsi_stilist = vxfs_blkiget(sbp, infp->vsi_iext,
+ fs32_to_cpu(infp, sfp->fsh_ilistino[0]));
if (!infp->vsi_stilist) {
printk(KERN_ERR "vxfs: unable to get structural list inode\n");
- kfree(tip);
goto out_free_pfp;
}
if (!VXFS_ISILT(VXFS_INO(infp->vsi_stilist))) {
@@ -169,13 +161,9 @@ vxfs_read_fshead(struct super_block *sbp)
goto out_iput_stilist;
}
- tip = vxfs_stiget(sbp, pfp->fsh_ilistino[0]);
- if (!tip)
- goto out_iput_stilist;
- infp->vsi_ilist = vxfs_get_fake_inode(sbp, tip);
+ infp->vsi_ilist = vxfs_stiget(sbp, fs32_to_cpu(infp, pfp->fsh_ilistino[0]));
if (!infp->vsi_ilist) {
printk(KERN_ERR "vxfs: unable to get inode list inode\n");
- kfree(tip);
goto out_iput_stilist;
}
if (!VXFS_ISILT(VXFS_INO(infp->vsi_ilist))) {
@@ -184,6 +172,8 @@ vxfs_read_fshead(struct super_block *sbp)
goto out_iput_ilist;
}
+ kfree(pfp);
+ kfree(sfp);
return 0;
out_iput_ilist:
@@ -197,7 +187,4 @@ vxfs_read_fshead(struct super_block *sbp)
out_iput_fship:
iput(infp->vsi_fship);
return -EINVAL;
- out_free_fship:
- kfree(vip);
- return -EINVAL;
}
diff --git a/fs/freevxfs/vxfs_fshead.h b/fs/freevxfs/vxfs_fshead.h
index ead0d640c181..e026f0c49159 100644
--- a/fs/freevxfs/vxfs_fshead.h
+++ b/fs/freevxfs/vxfs_fshead.h
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
+ * Copyright (c) 2016 Krzysztof Blaszkowski
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -42,20 +43,20 @@
* Fileset header
*/
struct vxfs_fsh {
- u_int32_t fsh_version; /* fileset header version */
- u_int32_t fsh_fsindex; /* fileset index */
- u_int32_t fsh_time; /* modification time - sec */
- u_int32_t fsh_utime; /* modification time - usec */
- u_int32_t fsh_extop; /* extop flags */
- vx_ino_t fsh_ninodes; /* allocated inodes */
- u_int32_t fsh_nau; /* number of IAUs */
- u_int32_t fsh_old_ilesize; /* old size of ilist */
- u_int32_t fsh_dflags; /* flags */
- u_int32_t fsh_quota; /* quota limit */
- vx_ino_t fsh_maxinode; /* maximum inode number */
- vx_ino_t fsh_iauino; /* IAU inode */
- vx_ino_t fsh_ilistino[2]; /* ilist inodes */
- vx_ino_t fsh_lctino; /* link count table inode */
+ __fs32 fsh_version; /* fileset header version */
+ __fs32 fsh_fsindex; /* fileset index */
+ __fs32 fsh_time; /* modification time - sec */
+ __fs32 fsh_utime; /* modification time - usec */
+ __fs32 fsh_extop; /* extop flags */
+ __fs32 fsh_ninodes; /* allocated inodes */
+ __fs32 fsh_nau; /* number of IAUs */
+ __fs32 fsh_old_ilesize; /* old size of ilist */
+ __fs32 fsh_dflags; /* flags */
+ __fs32 fsh_quota; /* quota limit */
+ __fs32 fsh_maxinode; /* maximum inode number */
+ __fs32 fsh_iauino; /* IAU inode */
+ __fs32 fsh_ilistino[2]; /* ilist inodes */
+ __fs32 fsh_lctino; /* link count table inode */
/*
* Slightly more fields follow, but they
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 3e2ccade61ed..1f41b25ef38b 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
+ * Copyright (c) 2016 Krzysztof Blaszkowski
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -42,9 +43,6 @@
#include "vxfs_extern.h"
-struct kmem_cache *vxfs_inode_cachep;
-
-
#ifdef DIAGNOSTIC
/*
* Dump inode contents (partially).
@@ -68,6 +66,83 @@ vxfs_dumpi(struct vxfs_inode_info *vip, ino_t ino)
}
#endif
+/**
+ * vxfs_transmod - mode for a VxFS inode
+ * @vip: VxFS inode
+ *
+ * Description:
+ * vxfs_transmod returns a Linux mode_t for a given
+ * VxFS inode structure.
+ */
+static __inline__ umode_t
+vxfs_transmod(struct vxfs_inode_info *vip)
+{
+ umode_t ret = vip->vii_mode & ~VXFS_TYPE_MASK;
+
+ if (VXFS_ISFIFO(vip))
+ ret |= S_IFIFO;
+ if (VXFS_ISCHR(vip))
+ ret |= S_IFCHR;
+ if (VXFS_ISDIR(vip))
+ ret |= S_IFDIR;
+ if (VXFS_ISBLK(vip))
+ ret |= S_IFBLK;
+ if (VXFS_ISLNK(vip))
+ ret |= S_IFLNK;
+ if (VXFS_ISREG(vip))
+ ret |= S_IFREG;
+ if (VXFS_ISSOC(vip))
+ ret |= S_IFSOCK;
+
+ return (ret);
+}
+
+static inline void dip2vip_cpy(struct vxfs_sb_info *sbi,
+ struct vxfs_inode_info *vip, struct vxfs_dinode *dip)
+{
+ struct inode *inode = &vip->vfs_inode;
+
+ vip->vii_mode = fs32_to_cpu(sbi, dip->vdi_mode);
+ vip->vii_nlink = fs32_to_cpu(sbi, dip->vdi_nlink);
+ vip->vii_uid = fs32_to_cpu(sbi, dip->vdi_uid);
+ vip->vii_gid = fs32_to_cpu(sbi, dip->vdi_gid);
+ vip->vii_size = fs64_to_cpu(sbi, dip->vdi_size);
+ vip->vii_atime = fs32_to_cpu(sbi, dip->vdi_atime);
+ vip->vii_autime = fs32_to_cpu(sbi, dip->vdi_autime);
+ vip->vii_mtime = fs32_to_cpu(sbi, dip->vdi_mtime);
+ vip->vii_mutime = fs32_to_cpu(sbi, dip->vdi_mutime);
+ vip->vii_ctime = fs32_to_cpu(sbi, dip->vdi_ctime);
+ vip->vii_cutime = fs32_to_cpu(sbi, dip->vdi_cutime);
+ vip->vii_orgtype = dip->vdi_orgtype;
+
+ vip->vii_blocks = fs32_to_cpu(sbi, dip->vdi_blocks);
+ vip->vii_gen = fs32_to_cpu(sbi, dip->vdi_gen);
+
+ if (VXFS_ISDIR(vip))
+ vip->vii_dotdot = fs32_to_cpu(sbi, dip->vdi_dotdot);
+ else if (!VXFS_ISREG(vip) && !VXFS_ISLNK(vip))
+ vip->vii_rdev = fs32_to_cpu(sbi, dip->vdi_rdev);
+
+ /* don't endian swap the fields that differ by orgtype */
+ memcpy(&vip->vii_org, &dip->vdi_org, sizeof(vip->vii_org));
+
+ inode->i_mode = vxfs_transmod(vip);
+ i_uid_write(inode, (uid_t)vip->vii_uid);
+ i_gid_write(inode, (gid_t)vip->vii_gid);
+
+ set_nlink(inode, vip->vii_nlink);
+ inode->i_size = vip->vii_size;
+
+ inode->i_atime.tv_sec = vip->vii_atime;
+ inode->i_ctime.tv_sec = vip->vii_ctime;
+ inode->i_mtime.tv_sec = vip->vii_mtime;
+ inode->i_atime.tv_nsec = 0;
+ inode->i_ctime.tv_nsec = 0;
+ inode->i_mtime.tv_nsec = 0;
+
+ inode->i_blocks = vip->vii_blocks;
+ inode->i_generation = vip->vii_gen;
+}
/**
* vxfs_blkiget - find inode based on extent #
@@ -85,50 +160,55 @@ vxfs_dumpi(struct vxfs_inode_info *vip, ino_t ino)
* buffercache. This function should not be used outside the
* read_super() method, otherwise the data may be incoherent.
*/
-struct vxfs_inode_info *
+struct inode *
vxfs_blkiget(struct super_block *sbp, u_long extent, ino_t ino)
{
struct buffer_head *bp;
+ struct inode *inode;
u_long block, offset;
+ inode = new_inode(sbp);
+ if (!inode)
+ return NULL;
+ inode->i_ino = get_next_ino();
+
block = extent + ((ino * VXFS_ISIZE) / sbp->s_blocksize);
offset = ((ino % (sbp->s_blocksize / VXFS_ISIZE)) * VXFS_ISIZE);
bp = sb_bread(sbp, block);
if (bp && buffer_mapped(bp)) {
- struct vxfs_inode_info *vip;
+ struct vxfs_inode_info *vip = VXFS_INO(inode);
struct vxfs_dinode *dip;
- if (!(vip = kmem_cache_alloc(vxfs_inode_cachep, GFP_KERNEL)))
- goto fail;
dip = (struct vxfs_dinode *)(bp->b_data + offset);
- memcpy(vip, dip, sizeof(*vip));
+ dip2vip_cpy(VXFS_SBI(sbp), vip, dip);
+ vip->vfs_inode.i_mapping->a_ops = &vxfs_aops;
#ifdef DIAGNOSTIC
vxfs_dumpi(vip, ino);
#endif
brelse(bp);
- return (vip);
+ return inode;
}
-fail:
printk(KERN_WARNING "vxfs: unable to read block %ld\n", block);
brelse(bp);
+ iput(inode);
return NULL;
}
/**
* __vxfs_iget - generic find inode facility
- * @sbp: VFS superblock
- * @ino: inode number
* @ilistp: inode list
+ * @vip: VxFS inode to fill in
+ * @ino: inode number
*
* Description:
* Search the for inode number @ino in the filesystem
* described by @sbp. Use the specified inode table (@ilistp).
- * Returns the matching VxFS inode on success, else an error code.
+ * Returns the matching inode on success, else an error code.
*/
-static struct vxfs_inode_info *
-__vxfs_iget(ino_t ino, struct inode *ilistp)
+static int
+__vxfs_iget(struct inode *ilistp, struct vxfs_inode_info *vip, ino_t ino)
{
struct page *pp;
u_long offset;
@@ -137,28 +217,22 @@ __vxfs_iget(ino_t ino, struct inode *ilistp)
pp = vxfs_get_page(ilistp->i_mapping, ino * VXFS_ISIZE / PAGE_SIZE);
if (!IS_ERR(pp)) {
- struct vxfs_inode_info *vip;
struct vxfs_dinode *dip;
caddr_t kaddr = (char *)page_address(pp);
- if (!(vip = kmem_cache_alloc(vxfs_inode_cachep, GFP_KERNEL)))
- goto fail;
dip = (struct vxfs_dinode *)(kaddr + offset);
- memcpy(vip, dip, sizeof(*vip));
+ dip2vip_cpy(VXFS_SBI(ilistp->i_sb), vip, dip);
+ vip->vfs_inode.i_mapping->a_ops = &vxfs_aops;
#ifdef DIAGNOSTIC
vxfs_dumpi(vip, ino);
#endif
vxfs_put_page(pp);
- return (vip);
+ return 0;
}
- printk(KERN_WARNING "vxfs: error on page %p\n", pp);
- return ERR_CAST(pp);
-
-fail:
- printk(KERN_WARNING "vxfs: unable to read inode %ld\n", (unsigned long)ino);
- vxfs_put_page(pp);
- return ERR_PTR(-ENOMEM);
+ printk(KERN_WARNING "vxfs: error on page 0x%p for inode %ld\n",
+ pp, (unsigned long)ino);
+ return PTR_ERR(pp);
}
/**
@@ -169,116 +243,26 @@ fail:
* Description:
* Find inode @ino in the filesystem described by @sbp using
* the structural inode list.
- * Returns the matching VxFS inode on success, else a NULL pointer.
- */
-struct vxfs_inode_info *
-vxfs_stiget(struct super_block *sbp, ino_t ino)
-{
- struct vxfs_inode_info *vip;
-
- vip = __vxfs_iget(ino, VXFS_SBI(sbp)->vsi_stilist);
- return IS_ERR(vip) ? NULL : vip;
-}
-
-/**
- * vxfs_transmod - mode for a VxFS inode
- * @vip: VxFS inode
- *
- * Description:
- * vxfs_transmod returns a Linux mode_t for a given
- * VxFS inode structure.
- */
-static __inline__ umode_t
-vxfs_transmod(struct vxfs_inode_info *vip)
-{
- umode_t ret = vip->vii_mode & ~VXFS_TYPE_MASK;
-
- if (VXFS_ISFIFO(vip))
- ret |= S_IFIFO;
- if (VXFS_ISCHR(vip))
- ret |= S_IFCHR;
- if (VXFS_ISDIR(vip))
- ret |= S_IFDIR;
- if (VXFS_ISBLK(vip))
- ret |= S_IFBLK;
- if (VXFS_ISLNK(vip))
- ret |= S_IFLNK;
- if (VXFS_ISREG(vip))
- ret |= S_IFREG;
- if (VXFS_ISSOC(vip))
- ret |= S_IFSOCK;
-
- return (ret);
-}
-
-/**
- * vxfs_iinit- helper to fill inode fields
- * @ip: VFS inode
- * @vip: VxFS inode
- *
- * Description:
- * vxfs_instino is a helper function to fill in all relevant
- * fields in @ip from @vip.
- */
-static void
-vxfs_iinit(struct inode *ip, struct vxfs_inode_info *vip)
-{
-
- ip->i_mode = vxfs_transmod(vip);
- i_uid_write(ip, (uid_t)vip->vii_uid);
- i_gid_write(ip, (gid_t)vip->vii_gid);
-
- set_nlink(ip, vip->vii_nlink);
- ip->i_size = vip->vii_size;
-
- ip->i_atime.tv_sec = vip->vii_atime;
- ip->i_ctime.tv_sec = vip->vii_ctime;
- ip->i_mtime.tv_sec = vip->vii_mtime;
- ip->i_atime.tv_nsec = 0;
- ip->i_ctime.tv_nsec = 0;
- ip->i_mtime.tv_nsec = 0;
-
- ip->i_blocks = vip->vii_blocks;
- ip->i_generation = vip->vii_gen;
-
- ip->i_private = vip;
-
-}
-
-/**
- * vxfs_get_fake_inode - get fake inode structure
- * @sbp: filesystem superblock
- * @vip: fspriv inode
- *
- * Description:
- * vxfs_fake_inode gets a fake inode (not in the inode hash) for a
- * superblock, vxfs_inode pair.
- * Returns the filled VFS inode.
+ * Returns the matching inode on success, else a NULL pointer.
*/
struct inode *
-vxfs_get_fake_inode(struct super_block *sbp, struct vxfs_inode_info *vip)
+vxfs_stiget(struct super_block *sbp, ino_t ino)
{
- struct inode *ip = NULL;
-
- if ((ip = new_inode(sbp))) {
- ip->i_ino = get_next_ino();
- vxfs_iinit(ip, vip);
- ip->i_mapping->a_ops = &vxfs_aops;
+ struct inode *inode;
+ int error;
+
+ inode = new_inode(sbp);
+ if (!inode)
+ return NULL;
+ inode->i_ino = get_next_ino();
+
+ error = __vxfs_iget(VXFS_SBI(sbp)->vsi_stilist, VXFS_INO(inode), ino);
+ if (error) {
+ iput(inode);
+ return NULL;
}
- return (ip);
-}
-/**
- * vxfs_put_fake_inode - free faked inode
- * *ip: VFS inode
- *
- * Description:
- * vxfs_put_fake_inode frees all data associated with @ip.
- */
-void
-vxfs_put_fake_inode(struct inode *ip)
-{
- iput(ip);
+ return inode;
}
/**
@@ -296,6 +280,7 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
struct vxfs_inode_info *vip;
const struct address_space_operations *aops;
struct inode *ip;
+ int error;
ip = iget_locked(sbp, ino);
if (!ip)
@@ -303,14 +288,13 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
if (!(ip->i_state & I_NEW))
return ip;
- vip = __vxfs_iget(ino, VXFS_SBI(sbp)->vsi_ilist);
- if (IS_ERR(vip)) {
+ vip = VXFS_INO(ip);
+ error = __vxfs_iget(VXFS_SBI(sbp)->vsi_ilist, vip, ino);
+ if (error) {
iget_failed(ip);
- return ERR_CAST(vip);
+ return ERR_PTR(error);
}
- vxfs_iinit(ip, vip);
-
if (VXFS_ISIMMED(vip))
aops = &vxfs_immed_aops;
else
@@ -341,12 +325,6 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
return ip;
}
-static void vxfs_i_callback(struct rcu_head *head)
-{
- struct inode *inode = container_of(head, struct inode, i_rcu);
- kmem_cache_free(vxfs_inode_cachep, inode->i_private);
-}
-
/**
* vxfs_evict_inode - remove inode from main memory
* @ip: inode to discard.
@@ -360,5 +338,4 @@ vxfs_evict_inode(struct inode *ip)
{
truncate_inode_pages_final(&ip->i_data);
clear_inode(ip);
- call_rcu(&ip->i_rcu, vxfs_i_callback);
}
diff --git a/fs/freevxfs/vxfs_inode.h b/fs/freevxfs/vxfs_inode.h
index 240aeb11263f..f012abed125d 100644
--- a/fs/freevxfs/vxfs_inode.h
+++ b/fs/freevxfs/vxfs_inode.h
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
+ * Copyright (c) 2016 Krzysztof Blaszkowski
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -66,74 +67,74 @@ enum {
* Data stored immediately in the inode.
*/
struct vxfs_immed {
- u_int8_t vi_immed[VXFS_NIMMED];
+ __u8 vi_immed[VXFS_NIMMED];
};
struct vxfs_ext4 {
- u_int32_t ve4_spare; /* ?? */
- u_int32_t ve4_indsize; /* Indirect extent size */
- vx_daddr_t ve4_indir[VXFS_NIADDR]; /* Indirect extents */
+ __fs32 ve4_spare; /* ?? */
+ __fs32 ve4_indsize; /* Indirect extent size */
+ __fs32 ve4_indir[VXFS_NIADDR]; /* Indirect extents */
struct direct { /* Direct extents */
- vx_daddr_t extent; /* Extent number */
- int32_t size; /* Size of extent */
+ __fs32 extent; /* Extent number */
+ __fs32 size; /* Size of extent */
} ve4_direct[VXFS_NDADDR];
};
struct vxfs_typed {
- u_int64_t vt_hdr; /* Header, 0xTTOOOOOOOOOOOOOO; T=type,O=offs */
- vx_daddr_t vt_block; /* Extent block */
- int32_t vt_size; /* Size in blocks */
+ __fs64 vt_hdr; /* Header, 0xTTOOOOOOOOOOOOOO; T=type,O=offs */
+ __fs32 vt_block; /* Extent block */
+ __fs32 vt_size; /* Size in blocks */
};
struct vxfs_typed_dev4 {
- u_int64_t vd4_hdr; /* Header, 0xTTOOOOOOOOOOOOOO; T=type,O=offs */
- u_int64_t vd4_block; /* Extent block */
- u_int64_t vd4_size; /* Size in blocks */
- int32_t vd4_dev; /* Device ID */
- u_int32_t __pad1;
+ __fs64 vd4_hdr; /* Header, 0xTTOOOOOOOOOOOOOO; T=type,O=offs */
+ __fs64 vd4_block; /* Extent block */
+ __fs64 vd4_size; /* Size in blocks */
+ __fs32 vd4_dev; /* Device ID */
+ __u8 __pad1;
};
/*
* The inode as contained on the physical device.
*/
struct vxfs_dinode {
- int32_t vdi_mode;
- u_int32_t vdi_nlink; /* Link count */
- u_int32_t vdi_uid; /* UID */
- u_int32_t vdi_gid; /* GID */
- u_int64_t vdi_size; /* Inode size in bytes */
- u_int32_t vdi_atime; /* Last time accessed - sec */
- u_int32_t vdi_autime; /* Last time accessed - usec */
- u_int32_t vdi_mtime; /* Last modify time - sec */
- u_int32_t vdi_mutime; /* Last modify time - usec */
- u_int32_t vdi_ctime; /* Create time - sec */
- u_int32_t vdi_cutime; /* Create time - usec */
- u_int8_t vdi_aflags; /* Allocation flags */
- u_int8_t vdi_orgtype; /* Organisation type */
- u_int16_t vdi_eopflags;
- u_int32_t vdi_eopdata;
+ __fs32 vdi_mode;
+ __fs32 vdi_nlink; /* Link count */
+ __fs32 vdi_uid; /* UID */
+ __fs32 vdi_gid; /* GID */
+ __fs64 vdi_size; /* Inode size in bytes */
+ __fs32 vdi_atime; /* Last time accessed - sec */
+ __fs32 vdi_autime; /* Last time accessed - usec */
+ __fs32 vdi_mtime; /* Last modify time - sec */
+ __fs32 vdi_mutime; /* Last modify time - usec */
+ __fs32 vdi_ctime; /* Create time - sec */
+ __fs32 vdi_cutime; /* Create time - usec */
+ __u8 vdi_aflags; /* Allocation flags */
+ __u8 vdi_orgtype; /* Organisation type */
+ __fs16 vdi_eopflags;
+ __fs32 vdi_eopdata;
union {
- u_int32_t rdev;
- u_int32_t dotdot;
+ __fs32 rdev;
+ __fs32 dotdot;
struct {
- u_int32_t reserved;
- u_int32_t fixextsize;
+ __u32 reserved;
+ __fs32 fixextsize;
} i_regular;
struct {
- u_int32_t matchino;
- u_int32_t fsetindex;
+ __fs32 matchino;
+ __fs32 fsetindex;
} i_vxspec;
- u_int64_t align;
+ __u64 align;
} vdi_ftarea;
- u_int32_t vdi_blocks; /* How much blocks does inode occupy */
- u_int32_t vdi_gen; /* Inode generation */
- u_int64_t vdi_version; /* Version */
+ __fs32 vdi_blocks; /* How much blocks does inode occupy */
+ __fs32 vdi_gen; /* Inode generation */
+ __fs64 vdi_version; /* Version */
union {
struct vxfs_immed immed;
struct vxfs_ext4 ext4;
struct vxfs_typed typed[VXFS_NTYPED];
} vdi_org;
- u_int32_t vdi_iattrino;
+ __fs32 vdi_iattrino;
};
#define vdi_rdev vdi_ftarea.rdev
@@ -149,32 +150,45 @@ struct vxfs_dinode {
/*
* The inode as represented in the main memory.
- *
- * TBD: This should become a separate structure...
*/
-#define vxfs_inode_info vxfs_dinode
-
-#define vii_mode vdi_mode
-#define vii_uid vdi_uid
-#define vii_gid vdi_gid
-#define vii_nlink vdi_nlink
-#define vii_size vdi_size
-#define vii_atime vdi_atime
-#define vii_ctime vdi_ctime
-#define vii_mtime vdi_mtime
-#define vii_blocks vdi_blocks
-#define vii_org vdi_org
-#define vii_orgtype vdi_orgtype
-#define vii_gen vdi_gen
-
-#define vii_rdev vdi_ftarea.rdev
-#define vii_dotdot vdi_ftarea.dotdot
-#define vii_fixextsize vdi_ftarea.regular.fixextsize
-#define vii_matchino vdi_ftarea.vxspec.matchino
-#define vii_fsetindex vdi_ftarea.vxspec.fsetindex
-
-#define vii_immed vdi_org.immed
-#define vii_ext4 vdi_org.ext4
-#define vii_typed vdi_org.typed
+struct vxfs_inode_info {
+ struct inode vfs_inode;
+
+ __u32 vii_mode;
+ __u32 vii_nlink; /* Link count */
+ __u32 vii_uid; /* UID */
+ __u32 vii_gid; /* GID */
+ __u64 vii_size; /* Inode size in bytes */
+ __u32 vii_atime; /* Last time accessed - sec */
+ __u32 vii_autime; /* Last time accessed - usec */
+ __u32 vii_mtime; /* Last modify time - sec */
+ __u32 vii_mutime; /* Last modify time - usec */
+ __u32 vii_ctime; /* Create time - sec */
+ __u32 vii_cutime; /* Create time - usec */
+ __u8 vii_orgtype; /* Organisation type */
+ union {
+ __u32 rdev;
+ __u32 dotdot;
+ } vii_ftarea;
+ __u32 vii_blocks; /* How much blocks does inode occupy */
+ __u32 vii_gen; /* Inode generation */
+ union {
+ struct vxfs_immed immed;
+ struct vxfs_ext4 ext4;
+ struct vxfs_typed typed[VXFS_NTYPED];
+ } vii_org;
+};
+
+#define vii_rdev vii_ftarea.rdev
+#define vii_dotdot vii_ftarea.dotdot
+
+#define vii_immed vii_org.immed
+#define vii_ext4 vii_org.ext4
+#define vii_typed vii_org.typed
+
+static inline struct vxfs_inode_info *VXFS_INO(struct inode *inode)
+{
+ return container_of(inode, struct vxfs_inode_info, vfs_inode);
+}
#endif /* _VXFS_INODE_H_ */
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index 6d576b97f2c8..ce4785fd81c6 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
+ * Copyright (c) 2016 Krzysztof Blaszkowski
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -61,33 +62,6 @@ const struct file_operations vxfs_dir_operations = {
.iterate_shared = vxfs_readdir,
};
-static inline u_long
-dir_blocks(struct inode *ip)
-{
- u_long bsize = ip->i_sb->s_blocksize;
- return (ip->i_size + bsize - 1) & ~(bsize - 1);
-}
-
-/*
- * NOTE! unlike strncmp, vxfs_match returns 1 for success, 0 for failure.
- *
- * len <= VXFS_NAMELEN and de != NULL are guaranteed by caller.
- */
-static inline int
-vxfs_match(int len, const char * const name, struct vxfs_direct *de)
-{
- if (len != de->d_namelen)
- return 0;
- if (!de->d_ino)
- return 0;
- return !memcmp(name, de->d_name, len);
-}
-
-static inline struct vxfs_direct *
-vxfs_next_entry(struct vxfs_direct *de)
-{
- return ((struct vxfs_direct *)((char*)de + de->d_reclen));
-}
/**
* vxfs_find_entry - find a mathing directory entry for a dentry
@@ -106,50 +80,64 @@ vxfs_next_entry(struct vxfs_direct *de)
static struct vxfs_direct *
vxfs_find_entry(struct inode *ip, struct dentry *dp, struct page **ppp)
{
- u_long npages, page, nblocks, pblocks, block;
- u_long bsize = ip->i_sb->s_blocksize;
- const char *name = dp->d_name.name;
- int namelen = dp->d_name.len;
-
- npages = dir_pages(ip);
- nblocks = dir_blocks(ip);
- pblocks = VXFS_BLOCK_PER_PAGE(ip->i_sb);
-
- for (page = 0; page < npages; page++) {
- caddr_t kaddr;
- struct page *pp;
+ u_long bsize = ip->i_sb->s_blocksize;
+ const char *name = dp->d_name.name;
+ int namelen = dp->d_name.len;
+ loff_t limit = VXFS_DIRROUND(ip->i_size);
+ struct vxfs_direct *de_exit = NULL;
+ loff_t pos = 0;
+ struct vxfs_sb_info *sbi = VXFS_SBI(ip->i_sb);
- pp = vxfs_get_page(ip->i_mapping, page);
+ while (pos < limit) {
+ struct page *pp;
+ char *kaddr;
+ int pg_ofs = pos & ~PAGE_MASK;
+
+ pp = vxfs_get_page(ip->i_mapping, pos >> PAGE_SHIFT);
if (IS_ERR(pp))
- continue;
- kaddr = (caddr_t)page_address(pp);
-
- for (block = 0; block <= nblocks && block <= pblocks; block++) {
- caddr_t baddr, limit;
- struct vxfs_dirblk *dbp;
- struct vxfs_direct *de;
-
- baddr = kaddr + (block * bsize);
- limit = baddr + bsize - VXFS_DIRLEN(1);
-
- dbp = (struct vxfs_dirblk *)baddr;
- de = (struct vxfs_direct *)(baddr + VXFS_DIRBLKOV(dbp));
-
- for (; (caddr_t)de <= limit; de = vxfs_next_entry(de)) {
- if (!de->d_reclen)
- break;
- if (!de->d_ino)
- continue;
- if (vxfs_match(namelen, name, de)) {
- *ppp = pp;
- return (de);
- }
+ return NULL;
+ kaddr = (char *)page_address(pp);
+
+ while (pg_ofs < PAGE_SIZE && pos < limit) {
+ struct vxfs_direct *de;
+
+ if ((pos & (bsize - 1)) < 4) {
+ struct vxfs_dirblk *dbp =
+ (struct vxfs_dirblk *)
+ (kaddr + (pos & ~PAGE_MASK));
+ int overhead = VXFS_DIRBLKOV(sbi, dbp);
+
+ pos += overhead;
+ pg_ofs += overhead;
+ }
+ de = (struct vxfs_direct *)(kaddr + pg_ofs);
+
+ if (!de->d_reclen) {
+ pos += bsize - 1;
+ pos &= ~(bsize - 1);
+ break;
+ }
+
+ pg_ofs += fs16_to_cpu(sbi, de->d_reclen);
+ pos += fs16_to_cpu(sbi, de->d_reclen);
+ if (!de->d_ino)
+ continue;
+
+ if (namelen != fs16_to_cpu(sbi, de->d_namelen))
+ continue;
+ if (!memcmp(name, de->d_name, namelen)) {
+ *ppp = pp;
+ de_exit = de;
+ break;
}
}
- vxfs_put_page(pp);
+ if (!de_exit)
+ vxfs_put_page(pp);
+ else
+ break;
}
- return NULL;
+ return de_exit;
}
/**
@@ -173,7 +161,7 @@ vxfs_inode_by_name(struct inode *dip, struct dentry *dp)
de = vxfs_find_entry(dip, dp, &pp);
if (de) {
- ino = de->d_ino;
+ ino = fs32_to_cpu(VXFS_SBI(dip->i_sb), de->d_ino);
kunmap(pp);
put_page(pp);
}
@@ -233,74 +221,80 @@ vxfs_readdir(struct file *fp, struct dir_context *ctx)
struct inode *ip = file_inode(fp);
struct super_block *sbp = ip->i_sb;
u_long bsize = sbp->s_blocksize;
- u_long page, npages, block, pblocks, nblocks, offset;
- loff_t pos;
+ loff_t pos, limit;
+ struct vxfs_sb_info *sbi = VXFS_SBI(sbp);
if (ctx->pos == 0) {
if (!dir_emit_dot(fp, ctx))
- return 0;
- ctx->pos = 1;
+ goto out;
+ ctx->pos++;
}
if (ctx->pos == 1) {
if (!dir_emit(ctx, "..", 2, VXFS_INO(ip)->vii_dotdot, DT_DIR))
- return 0;
- ctx->pos = 2;
+ goto out;
+ ctx->pos++;
}
- pos = ctx->pos - 2;
-
- if (pos > VXFS_DIRROUND(ip->i_size))
- return 0;
- npages = dir_pages(ip);
- nblocks = dir_blocks(ip);
- pblocks = VXFS_BLOCK_PER_PAGE(sbp);
+ limit = VXFS_DIRROUND(ip->i_size);
+ if (ctx->pos > limit)
+ goto out;
- page = pos >> PAGE_SHIFT;
- offset = pos & ~PAGE_MASK;
- block = (u_long)(pos >> sbp->s_blocksize_bits) % pblocks;
+ pos = ctx->pos & ~3L;
- for (; page < npages; page++, block = 0) {
- char *kaddr;
- struct page *pp;
+ while (pos < limit) {
+ struct page *pp;
+ char *kaddr;
+ int pg_ofs = pos & ~PAGE_MASK;
+ int rc = 0;
- pp = vxfs_get_page(ip->i_mapping, page);
+ pp = vxfs_get_page(ip->i_mapping, pos >> PAGE_SHIFT);
if (IS_ERR(pp))
- continue;
+ return -ENOMEM;
+
kaddr = (char *)page_address(pp);
- for (; block <= nblocks && block <= pblocks; block++) {
- char *baddr, *limit;
- struct vxfs_dirblk *dbp;
- struct vxfs_direct *de;
+ while (pg_ofs < PAGE_SIZE && pos < limit) {
+ struct vxfs_direct *de;
- baddr = kaddr + (block * bsize);
- limit = baddr + bsize - VXFS_DIRLEN(1);
-
- dbp = (struct vxfs_dirblk *)baddr;
- de = (struct vxfs_direct *)
- (offset ?
- (kaddr + offset) :
- (baddr + VXFS_DIRBLKOV(dbp)));
-
- for (; (char *)de <= limit; de = vxfs_next_entry(de)) {
- if (!de->d_reclen)
- break;
- if (!de->d_ino)
- continue;
-
- offset = (char *)de - kaddr;
- ctx->pos = ((page << PAGE_SHIFT) | offset) + 2;
- if (!dir_emit(ctx, de->d_name, de->d_namelen,
- de->d_ino, DT_UNKNOWN)) {
- vxfs_put_page(pp);
- return 0;
- }
+ if ((pos & (bsize - 1)) < 4) {
+ struct vxfs_dirblk *dbp =
+ (struct vxfs_dirblk *)
+ (kaddr + (pos & ~PAGE_MASK));
+ int overhead = VXFS_DIRBLKOV(sbi, dbp);
+
+ pos += overhead;
+ pg_ofs += overhead;
+ }
+ de = (struct vxfs_direct *)(kaddr + pg_ofs);
+
+ if (!de->d_reclen) {
+ pos += bsize - 1;
+ pos &= ~(bsize - 1);
+ break;
+ }
+
+ pg_ofs += fs16_to_cpu(sbi, de->d_reclen);
+ pos += fs16_to_cpu(sbi, de->d_reclen);
+ if (!de->d_ino)
+ continue;
+
+ rc = dir_emit(ctx, de->d_name,
+ fs16_to_cpu(sbi, de->d_namelen),
+ fs32_to_cpu(sbi, de->d_ino),
+ DT_UNKNOWN);
+ if (!rc) {
+ /* the dir entry was not read, fix pos. */
+ pos -= fs16_to_cpu(sbi, de->d_reclen);
+ break;
}
- offset = 0;
}
vxfs_put_page(pp);
- offset = 0;
+ if (!rc)
+ break;
}
- ctx->pos = ((page << PAGE_SHIFT) | offset) + 2;
+
+ ctx->pos = pos | 2;
+
+out:
return 0;
}
diff --git a/fs/freevxfs/vxfs_olt.c b/fs/freevxfs/vxfs_olt.c
index 049500847903..813da6685151 100644
--- a/fs/freevxfs/vxfs_olt.c
+++ b/fs/freevxfs/vxfs_olt.c
@@ -43,14 +43,14 @@ static inline void
vxfs_get_fshead(struct vxfs_oltfshead *fshp, struct vxfs_sb_info *infp)
{
BUG_ON(infp->vsi_fshino);
- infp->vsi_fshino = fshp->olt_fsino[0];
+ infp->vsi_fshino = fs32_to_cpu(infp, fshp->olt_fsino[0]);
}
static inline void
vxfs_get_ilist(struct vxfs_oltilist *ilistp, struct vxfs_sb_info *infp)
{
BUG_ON(infp->vsi_iext);
- infp->vsi_iext = ilistp->olt_iext[0];
+ infp->vsi_iext = fs32_to_cpu(infp, ilistp->olt_iext[0]);
}
static inline u_long
@@ -81,13 +81,12 @@ vxfs_read_olt(struct super_block *sbp, u_long bsize)
struct vxfs_olt *op;
char *oaddr, *eaddr;
-
bp = sb_bread(sbp, vxfs_oblock(sbp, infp->vsi_oltext, bsize));
if (!bp || !bp->b_data)
goto fail;
op = (struct vxfs_olt *)bp->b_data;
- if (op->olt_magic != VXFS_OLT_MAGIC) {
+ if (fs32_to_cpu(infp, op->olt_magic) != VXFS_OLT_MAGIC) {
printk(KERN_NOTICE "vxfs: ivalid olt magic number\n");
goto fail;
}
@@ -102,14 +101,14 @@ vxfs_read_olt(struct super_block *sbp, u_long bsize)
goto fail;
}
- oaddr = bp->b_data + op->olt_size;
+ oaddr = bp->b_data + fs32_to_cpu(infp, op->olt_size);
eaddr = bp->b_data + (infp->vsi_oltsize * sbp->s_blocksize);
while (oaddr < eaddr) {
struct vxfs_oltcommon *ocp =
(struct vxfs_oltcommon *)oaddr;
- switch (ocp->olt_type) {
+ switch (fs32_to_cpu(infp, ocp->olt_type)) {
case VXFS_OLT_FSHEAD:
vxfs_get_fshead((struct vxfs_oltfshead *)oaddr, infp);
break;
@@ -118,11 +117,11 @@ vxfs_read_olt(struct super_block *sbp, u_long bsize)
break;
}
- oaddr += ocp->olt_size;
+ oaddr += fs32_to_cpu(infp, ocp->olt_size);
}
brelse(bp);
- return 0;
+ return (infp->vsi_fshino && infp->vsi_iext) ? 0 : -EINVAL;
fail:
brelse(bp);
diff --git a/fs/freevxfs/vxfs_olt.h b/fs/freevxfs/vxfs_olt.h
index b7b3af502615..0c0b0c9fa557 100644
--- a/fs/freevxfs/vxfs_olt.h
+++ b/fs/freevxfs/vxfs_olt.h
@@ -63,83 +63,83 @@ enum {
* the initial inode list, the fileset header or the device configuration.
*/
struct vxfs_olt {
- u_int32_t olt_magic; /* magic number */
- u_int32_t olt_size; /* size of this entry */
- u_int32_t olt_checksum; /* checksum of extent */
- u_int32_t __unused1; /* ??? */
- u_int32_t olt_mtime; /* time of last mod. (sec) */
- u_int32_t olt_mutime; /* time of last mod. (usec) */
- u_int32_t olt_totfree; /* free space in OLT extent */
- vx_daddr_t olt_extents[2]; /* addr of this extent, replica */
- u_int32_t olt_esize; /* size of this extent */
- vx_daddr_t olt_next[2]; /* addr of next extent, replica */
- u_int32_t olt_nsize; /* size of next extent */
- u_int32_t __unused2; /* align to 8 byte boundary */
+ __fs32 olt_magic; /* magic number */
+ __fs32 olt_size; /* size of this entry */
+ __fs32 olt_checksum; /* checksum of extent */
+ __u32 __unused1; /* ??? */
+ __fs32 olt_mtime; /* time of last mod. (sec) */
+ __fs32 olt_mutime; /* time of last mod. (usec) */
+ __fs32 olt_totfree; /* free space in OLT extent */
+ __fs32 olt_extents[2]; /* addr of this extent, replica */
+ __fs32 olt_esize; /* size of this extent */
+ __fs32 olt_next[2]; /* addr of next extent, replica */
+ __fs32 olt_nsize; /* size of next extent */
+ __u32 __unused2; /* align to 8 byte boundary */
};
/*
* VxFS common OLT entry (on disk).
*/
struct vxfs_oltcommon {
- u_int32_t olt_type; /* type of this record */
- u_int32_t olt_size; /* size of this record */
+ __fs32 olt_type; /* type of this record */
+ __fs32 olt_size; /* size of this record */
};
/*
* VxFS free OLT entry (on disk).
*/
struct vxfs_oltfree {
- u_int32_t olt_type; /* type of this record */
- u_int32_t olt_fsize; /* size of this free record */
+ __fs32 olt_type; /* type of this record */
+ __fs32 olt_fsize; /* size of this free record */
};
/*
* VxFS initial-inode list (on disk).
*/
struct vxfs_oltilist {
- u_int32_t olt_type; /* type of this record */
- u_int32_t olt_size; /* size of this record */
- vx_ino_t olt_iext[2]; /* initial inode list, replica */
+ __fs32 olt_type; /* type of this record */
+ __fs32 olt_size; /* size of this record */
+ __fs32 olt_iext[2]; /* initial inode list, replica */
};
/*
* Current Usage Table
*/
struct vxfs_oltcut {
- u_int32_t olt_type; /* type of this record */
- u_int32_t olt_size; /* size of this record */
- vx_ino_t olt_cutino; /* inode of current usage table */
- u_int32_t __pad; /* unused, 8 byte align */
+ __fs32 olt_type; /* type of this record */
+ __fs32 olt_size; /* size of this record */
+ __fs32 olt_cutino; /* inode of current usage table */
+ __u8 __pad; /* unused, 8 byte align */
};
/*
* Inodes containing Superblock, Intent log and OLTs
*/
struct vxfs_oltsb {
- u_int32_t olt_type; /* type of this record */
- u_int32_t olt_size; /* size of this record */
- vx_ino_t olt_sbino; /* inode of superblock file */
- u_int32_t __unused1; /* ??? */
- vx_ino_t olt_logino[2]; /* inode of log file,replica */
- vx_ino_t olt_oltino[2]; /* inode of OLT, replica */
+ __fs32 olt_type; /* type of this record */
+ __fs32 olt_size; /* size of this record */
+ __fs32 olt_sbino; /* inode of superblock file */
+ __u32 __unused1; /* ??? */
+ __fs32 olt_logino[2]; /* inode of log file,replica */
+ __fs32 olt_oltino[2]; /* inode of OLT, replica */
};
/*
* Inode containing device configuration + it's replica
*/
struct vxfs_oltdev {
- u_int32_t olt_type; /* type of this record */
- u_int32_t olt_size; /* size of this record */
- vx_ino_t olt_devino[2]; /* inode of device config files */
+ __fs32 olt_type; /* type of this record */
+ __fs32 olt_size; /* size of this record */
+ __fs32 olt_devino[2]; /* inode of device config files */
};
/*
* Fileset header
*/
struct vxfs_oltfshead {
- u_int32_t olt_type; /* type number */
- u_int32_t olt_size; /* size of this record */
- vx_ino_t olt_fsino[2]; /* inodes of fileset header */
+ __fs32 olt_type; /* type number */
+ __fs32 olt_size; /* size of this record */
+ __fs32 olt_fsino[2]; /* inodes of fileset header */
};
#endif /* _VXFS_OLT_H_ */
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 7ca8c75d50d3..455ce5b77e9b 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2001 Christoph Hellwig.
+ * Copyright (c) 2016 Krzysztof Blaszkowski
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -48,22 +49,11 @@
#include "vxfs_inode.h"
-MODULE_AUTHOR("Christoph Hellwig");
+MODULE_AUTHOR("Christoph Hellwig, Krzysztof Blaszkowski");
MODULE_DESCRIPTION("Veritas Filesystem (VxFS) driver");
MODULE_LICENSE("Dual BSD/GPL");
-
-
-static void vxfs_put_super(struct super_block *);
-static int vxfs_statfs(struct dentry *, struct kstatfs *);
-static int vxfs_remount(struct super_block *, int *, char *);
-
-static const struct super_operations vxfs_super_ops = {
- .evict_inode = vxfs_evict_inode,
- .put_super = vxfs_put_super,
- .statfs = vxfs_statfs,
- .remount_fs = vxfs_remount,
-};
+static struct kmem_cache *vxfs_inode_cachep;
/**
* vxfs_put_super - free superblock resources
@@ -79,9 +69,9 @@ vxfs_put_super(struct super_block *sbp)
{
struct vxfs_sb_info *infp = VXFS_SBI(sbp);
- vxfs_put_fake_inode(infp->vsi_fship);
- vxfs_put_fake_inode(infp->vsi_ilist);
- vxfs_put_fake_inode(infp->vsi_stilist);
+ iput(infp->vsi_fship);
+ iput(infp->vsi_ilist);
+ iput(infp->vsi_stilist);
brelse(infp->vsi_bp);
kfree(infp);
@@ -109,14 +99,15 @@ static int
vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp)
{
struct vxfs_sb_info *infp = VXFS_SBI(dentry->d_sb);
+ struct vxfs_sb *raw_sb = infp->vsi_raw;
bufp->f_type = VXFS_SUPER_MAGIC;
bufp->f_bsize = dentry->d_sb->s_blocksize;
- bufp->f_blocks = infp->vsi_raw->vs_dsize;
- bufp->f_bfree = infp->vsi_raw->vs_free;
+ bufp->f_blocks = fs32_to_cpu(infp, raw_sb->vs_dsize);
+ bufp->f_bfree = fs32_to_cpu(infp, raw_sb->vs_free);
bufp->f_bavail = 0;
bufp->f_files = 0;
- bufp->f_ffree = infp->vsi_raw->vs_ifree;
+ bufp->f_ffree = fs32_to_cpu(infp, raw_sb->vs_ifree);
bufp->f_namelen = VXFS_NAMELEN;
return 0;
@@ -129,6 +120,81 @@ static int vxfs_remount(struct super_block *sb, int *flags, char *data)
return 0;
}
+static struct inode *vxfs_alloc_inode(struct super_block *sb)
+{
+ struct vxfs_inode_info *vi;
+
+ vi = kmem_cache_alloc(vxfs_inode_cachep, GFP_KERNEL);
+ if (!vi)
+ return NULL;
+ inode_init_once(&vi->vfs_inode);
+ return &vi->vfs_inode;
+}
+
+static void vxfs_i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+
+ kmem_cache_free(vxfs_inode_cachep, VXFS_INO(inode));
+}
+
+static void vxfs_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, vxfs_i_callback);
+}
+
+static const struct super_operations vxfs_super_ops = {
+ .alloc_inode = vxfs_alloc_inode,
+ .destroy_inode = vxfs_destroy_inode,
+ .evict_inode = vxfs_evict_inode,
+ .put_super = vxfs_put_super,
+ .statfs = vxfs_statfs,
+ .remount_fs = vxfs_remount,
+};
+
+static int vxfs_try_sb_magic(struct super_block *sbp, int silent,
+ unsigned blk, __fs32 magic)
+{
+ struct buffer_head *bp;
+ struct vxfs_sb *rsbp;
+ struct vxfs_sb_info *infp = VXFS_SBI(sbp);
+ int rc = -ENOMEM;
+
+ bp = sb_bread(sbp, blk);
+ do {
+ if (!bp || !buffer_mapped(bp)) {
+ if (!silent) {
+ printk(KERN_WARNING
+ "vxfs: unable to read disk superblock at %u\n",
+ blk);
+ }
+ break;
+ }
+
+ rc = -EINVAL;
+ rsbp = (struct vxfs_sb *)bp->b_data;
+ if (rsbp->vs_magic != magic) {
+ if (!silent)
+ printk(KERN_NOTICE
+ "vxfs: WRONG superblock magic %08x at %u\n",
+ rsbp->vs_magic, blk);
+ break;
+ }
+
+ rc = 0;
+ infp->vsi_raw = rsbp;
+ infp->vsi_bp = bp;
+ } while (0);
+
+ if (rc) {
+ infp->vsi_raw = NULL;
+ infp->vsi_bp = NULL;
+ brelse(bp);
+ }
+
+ return rc;
+}
+
/**
* vxfs_read_super - read superblock into memory and initialize filesystem
* @sbp: VFS superblock (to fill)
@@ -149,10 +215,10 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
{
struct vxfs_sb_info *infp;
struct vxfs_sb *rsbp;
- struct buffer_head *bp = NULL;
u_long bsize;
struct inode *root;
int ret = -EINVAL;
+ u32 j;
sbp->s_flags |= MS_RDONLY;
@@ -168,42 +234,43 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
goto out;
}
- bp = sb_bread(sbp, 1);
- if (!bp || !buffer_mapped(bp)) {
- if (!silent) {
- printk(KERN_WARNING
- "vxfs: unable to read disk superblock\n");
- }
- goto out;
- }
+ sbp->s_op = &vxfs_super_ops;
+ sbp->s_fs_info = infp;
- rsbp = (struct vxfs_sb *)bp->b_data;
- if (rsbp->vs_magic != VXFS_SUPER_MAGIC) {
+ if (!vxfs_try_sb_magic(sbp, silent, 1,
+ (__force __fs32)cpu_to_le32(VXFS_SUPER_MAGIC))) {
+ /* Unixware, x86 */
+ infp->byte_order = VXFS_BO_LE;
+ } else if (!vxfs_try_sb_magic(sbp, silent, 8,
+ (__force __fs32)cpu_to_be32(VXFS_SUPER_MAGIC))) {
+ /* HP-UX, parisc */
+ infp->byte_order = VXFS_BO_BE;
+ } else {
if (!silent)
- printk(KERN_NOTICE "vxfs: WRONG superblock magic\n");
+ printk(KERN_NOTICE "vxfs: can't find superblock.\n");
goto out;
}
- if ((rsbp->vs_version < 2 || rsbp->vs_version > 4) && !silent) {
- printk(KERN_NOTICE "vxfs: unsupported VxFS version (%d)\n",
- rsbp->vs_version);
+ rsbp = infp->vsi_raw;
+ j = fs32_to_cpu(infp, rsbp->vs_version);
+ if ((j < 2 || j > 4) && !silent) {
+ printk(KERN_NOTICE "vxfs: unsupported VxFS version (%d)\n", j);
goto out;
}
#ifdef DIAGNOSTIC
- printk(KERN_DEBUG "vxfs: supported VxFS version (%d)\n", rsbp->vs_version);
- printk(KERN_DEBUG "vxfs: blocksize: %d\n", rsbp->vs_bsize);
+ printk(KERN_DEBUG "vxfs: supported VxFS version (%d)\n", j);
+ printk(KERN_DEBUG "vxfs: blocksize: %d\n",
+ fs32_to_cpu(infp, rsbp->vs_bsize));
#endif
- sbp->s_magic = rsbp->vs_magic;
- sbp->s_fs_info = infp;
+ sbp->s_magic = fs32_to_cpu(infp, rsbp->vs_magic);
- infp->vsi_raw = rsbp;
- infp->vsi_bp = bp;
- infp->vsi_oltext = rsbp->vs_oltext[0];
- infp->vsi_oltsize = rsbp->vs_oltsize;
+ infp->vsi_oltext = fs32_to_cpu(infp, rsbp->vs_oltext[0]);
+ infp->vsi_oltsize = fs32_to_cpu(infp, rsbp->vs_oltsize);
- if (!sb_set_blocksize(sbp, rsbp->vs_bsize)) {
+ j = fs32_to_cpu(infp, rsbp->vs_bsize);
+ if (!sb_set_blocksize(sbp, j)) {
printk(KERN_WARNING "vxfs: unable to set final block size\n");
goto out;
}
@@ -218,7 +285,6 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
goto out;
}
- sbp->s_op = &vxfs_super_ops;
root = vxfs_iget(sbp, VXFS_ROOT_INO);
if (IS_ERR(root)) {
ret = PTR_ERR(root);
@@ -233,11 +299,11 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent)
return 0;
out_free_ilist:
- vxfs_put_fake_inode(infp->vsi_fship);
- vxfs_put_fake_inode(infp->vsi_ilist);
- vxfs_put_fake_inode(infp->vsi_stilist);
+ iput(infp->vsi_fship);
+ iput(infp->vsi_ilist);
+ iput(infp->vsi_stilist);
out:
- brelse(bp);
+ brelse(infp->vsi_bp);
kfree(infp);
return ret;
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 6f9c9f6f5157..4d09d4441e3e 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1327,6 +1327,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
dirty = inode->i_state & I_DIRTY;
if (inode->i_state & I_DIRTY_TIME) {
if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
+ wbc->sync_mode == WB_SYNC_ALL ||
unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||
unlikely(time_after(jiffies,
(inode->dirtied_time_when +
@@ -1807,8 +1808,8 @@ static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
*/
static unsigned long get_nr_dirty_pages(void)
{
- return global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) +
+ return global_node_page_state(NR_FILE_DIRTY) +
+ global_node_page_state(NR_UNSTABLE_NFS) +
get_nr_dirty_inodes();
}
diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c
index 7d637e2335fd..15a3d042247e 100644
--- a/fs/fscache/histogram.c
+++ b/fs/fscache/histogram.c
@@ -99,7 +99,6 @@ static int fscache_histogram_open(struct inode *inode, struct file *file)
}
const struct file_operations fscache_histogram_fops = {
- .owner = THIS_MODULE,
.open = fscache_histogram_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 6b028b7c4250..5d5ddaa84b21 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -404,7 +404,6 @@ static int fscache_objlist_release(struct inode *inode, struct file *file)
}
const struct file_operations fscache_objlist_fops = {
- .owner = THIS_MODULE,
.open = fscache_objlist_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c
index 7cfa0aacdf6d..7ac6e839b065 100644
--- a/fs/fscache/stats.c
+++ b/fs/fscache/stats.c
@@ -295,7 +295,6 @@ static int fscache_stats_open(struct inode *inode, struct file *file)
}
const struct file_operations fscache_stats_fops = {
- .owner = THIS_MODULE,
.open = fscache_stats_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index cbece1221417..a94d2ed81ab4 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -99,19 +99,6 @@ void fuse_request_free(struct fuse_req *req)
kmem_cache_free(fuse_req_cachep, req);
}
-static void block_sigs(sigset_t *oldset)
-{
- sigset_t mask;
-
- siginitsetinv(&mask, sigmask(SIGKILL));
- sigprocmask(SIG_BLOCK, &mask, oldset);
-}
-
-static void restore_sigs(sigset_t *oldset)
-{
- sigprocmask(SIG_SETMASK, oldset, NULL);
-}
-
void __fuse_get_request(struct fuse_req *req)
{
atomic_inc(&req->count);
@@ -151,15 +138,9 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
atomic_inc(&fc->num_waiting);
if (fuse_block_alloc(fc, for_background)) {
- sigset_t oldset;
- int intr;
-
- block_sigs(&oldset);
- intr = wait_event_interruptible_exclusive(fc->blocked_waitq,
- !fuse_block_alloc(fc, for_background));
- restore_sigs(&oldset);
err = -EINTR;
- if (intr)
+ if (wait_event_killable_exclusive(fc->blocked_waitq,
+ !fuse_block_alloc(fc, for_background)))
goto out;
}
/* Matches smp_wmb() in fuse_set_initialized() */
@@ -446,14 +427,9 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
}
if (!test_bit(FR_FORCE, &req->flags)) {
- sigset_t oldset;
-
/* Only fatal signals may interrupt this */
- block_sigs(&oldset);
- err = wait_event_interruptible(req->waitq,
+ err = wait_event_killable(req->waitq,
test_bit(FR_FINISHED, &req->flags));
- restore_sigs(&oldset);
-
if (!err)
return;
@@ -1525,7 +1501,6 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
goto err;
fuse_copy_finish(cs);
buf[outarg.namelen] = 0;
- name.hash = full_name_hash(name.name, name.len);
down_read(&fc->killsb);
err = -ENOENT;
@@ -1576,7 +1551,6 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
goto err;
fuse_copy_finish(cs);
buf[outarg.namelen] = 0;
- name.hash = full_name_hash(name.name, name.len);
down_read(&fc->killsb);
err = -ENOENT;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index cca7b048c07b..5f1627725791 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -955,6 +955,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
if (!dir)
goto unlock;
+ name->hash = full_name_hash(dir, name->name, name->len);
entry = d_lookup(dir, name);
dput(dir);
if (!entry)
@@ -1204,7 +1205,7 @@ static int fuse_direntplus_link(struct file *file,
fc = get_fuse_conn(dir);
- name.hash = full_name_hash(name.name, name.len);
+ name.hash = full_name_hash(parent, name.name, name.len);
dentry = d_lookup(parent, &name);
if (!dentry) {
retry:
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 9154f8679024..f394aff59c36 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -417,6 +417,10 @@ static int fuse_flush(struct file *file, fl_owner_t id)
fuse_sync_writes(inode);
inode_unlock(inode);
+ err = filemap_check_errors(file->f_mapping);
+ if (err)
+ return err;
+
req = fuse_get_req_nofail_nopages(fc, file);
memset(&inarg, 0, sizeof(inarg));
inarg.fh = ff->fh;
@@ -462,6 +466,16 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
goto out;
fuse_sync_writes(inode);
+
+ /*
+ * Due to implementation of fuse writeback
+ * filemap_write_and_wait_range() does not catch errors.
+ * We have to do this directly after fuse_sync_writes()
+ */
+ err = filemap_check_errors(file->f_mapping);
+ if (err)
+ goto out;
+
err = sync_inode_metadata(inode, 1);
if (err)
goto out;
@@ -562,7 +576,6 @@ static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
*/
static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
{
- bool is_sync = is_sync_kiocb(io->iocb);
int left;
spin_lock(&io->lock);
@@ -572,11 +585,11 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
io->bytes = pos;
left = --io->reqs;
- if (!left && is_sync)
+ if (!left && io->blocking)
complete(io->done);
spin_unlock(&io->lock);
- if (!left && !is_sync) {
+ if (!left && !io->blocking) {
ssize_t res = fuse_get_res_by_io(io);
if (res >= 0) {
@@ -1452,7 +1465,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
list_del(&req->writepages_entry);
for (i = 0; i < req->num_pages; i++) {
dec_wb_stat(&bdi->wb, WB_WRITEBACK);
- dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP);
+ dec_node_page_state(req->pages[i], NR_WRITEBACK_TEMP);
wb_writeout_inc(&bdi->wb);
}
wake_up(&fi->page_waitq);
@@ -1642,7 +1655,7 @@ static int fuse_writepage_locked(struct page *page)
req->inode = inode;
inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
- inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
+ inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
spin_lock(&fc->lock);
list_add(&req->writepages_entry, &fi->writepages);
@@ -1756,7 +1769,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
spin_unlock(&fc->lock);
dec_wb_stat(&bdi->wb, WB_WRITEBACK);
- dec_zone_page_state(page, NR_WRITEBACK_TEMP);
+ dec_node_page_state(page, NR_WRITEBACK_TEMP);
wb_writeout_inc(&bdi->wb);
fuse_writepage_free(fc, new_req);
fuse_request_free(new_req);
@@ -1855,7 +1868,7 @@ static int fuse_writepages_fill(struct page *page,
req->page_descs[req->num_pages].length = PAGE_SIZE;
inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
- inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
+ inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
err = 0;
if (is_writeback && fuse_writepage_in_flight(req, page)) {
@@ -2850,7 +2863,6 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
size_t count = iov_iter_count(iter);
loff_t offset = iocb->ki_pos;
struct fuse_io_priv *io;
- bool is_sync = is_sync_kiocb(iocb);
pos = offset;
inode = file->f_mapping->host;
@@ -2885,17 +2897,16 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
*/
io->async = async_dio;
io->iocb = iocb;
+ io->blocking = is_sync_kiocb(iocb);
/*
- * We cannot asynchronously extend the size of a file. We have no method
- * to wait on real async I/O requests, so we must submit this request
- * synchronously.
+ * We cannot asynchronously extend the size of a file.
+ * In such case the aio will behave exactly like sync io.
*/
- if (!is_sync && (offset + count > i_size) &&
- iov_iter_rw(iter) == WRITE)
- io->async = false;
+ if ((offset + count > i_size) && iov_iter_rw(iter) == WRITE)
+ io->blocking = true;
- if (io->async && is_sync) {
+ if (io->async && io->blocking) {
/*
* Additional reference to keep io around after
* calling fuse_aio_complete()
@@ -2915,7 +2926,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
/* we have a non-extending, async request, so return */
- if (!is_sync)
+ if (!io->blocking)
return -EIOCBQUEUED;
wait_for_completion(&wait);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 929c383432b0..5db5d24f91a5 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -259,6 +259,7 @@ struct fuse_io_priv {
struct kiocb *iocb;
struct file *file;
struct completion *done;
+ bool blocking;
};
#define FUSE_IO_PRIV_SYNC(f) \
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 9961d8432ce3..9b7cb37b4ba8 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -942,7 +942,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
- FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA |
+ FUSE_FLOCK_LOCKS | FUSE_HAS_IOCTL_DIR | FUSE_AUTO_INVAL_DATA |
FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO |
FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT |
FUSE_PARALLEL_DIROPS;
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 8eed66af5b82..02a3845363f7 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -128,7 +128,7 @@ static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
- struct inode *inode = file_inode(file)->i_mapping->host;
+ struct inode *inode = mapping->host;
size_t count = iov_iter_count(iter);
ssize_t ret;
diff --git a/fs/hfs/string.c b/fs/hfs/string.c
index 85b610c3909f..ec9f164c35a5 100644
--- a/fs/hfs/string.c
+++ b/fs/hfs/string.c
@@ -59,7 +59,7 @@ int hfs_hash_dentry(const struct dentry *dentry, struct qstr *this)
if (len > HFS_NAMELEN)
len = HFS_NAMELEN;
- hash = init_name_hash();
+ hash = init_name_hash(dentry);
for (; len; len--)
hash = partial_name_hash(caseorder[*name++], hash);
this->hash = end_name_hash(hash);
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index ef9fefe364a6..19462d773fe2 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -126,7 +126,7 @@ static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
- struct inode *inode = file_inode(file)->i_mapping->host;
+ struct inode *inode = mapping->host;
size_t count = iov_iter_count(iter);
ssize_t ret;
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index e8ef121a4d8b..c13c8a240be3 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -346,7 +346,7 @@ int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str)
casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
- hash = init_name_hash();
+ hash = init_name_hash(dentry);
astr = str->name;
len = str->len;
while (len > 0) {
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 5c57654927a6..90e46cd752fe 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -959,10 +959,11 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
if (S_ISLNK(root_inode->i_mode)) {
char *name = follow_link(host_root_path);
- if (IS_ERR(name))
+ if (IS_ERR(name)) {
err = PTR_ERR(name);
- else
- err = read_name(root_inode, name);
+ goto out_put;
+ }
+ err = read_name(root_inode, name);
kfree(name);
if (err)
goto out_put;
diff --git a/fs/hpfs/dentry.c b/fs/hpfs/dentry.c
index fa27980f2229..60e6d334d79a 100644
--- a/fs/hpfs/dentry.c
+++ b/fs/hpfs/dentry.c
@@ -26,7 +26,7 @@ static int hpfs_hash_dentry(const struct dentry *dentry, struct qstr *qstr)
/*return -ENOENT;*/
x:
- hash = init_name_hash();
+ hash = init_name_hash(dentry);
for (i = 0; i < l; i++)
hash = partial_name_hash(hpfs_upcase(hpfs_sb(dentry->d_sb)->sb_cp_table,qstr->name[i]), hash);
qstr->hash = end_name_hash(hash);
diff --git a/fs/inode.c b/fs/inode.c
index e171f7b5f9e4..ad445542c285 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -345,7 +345,7 @@ EXPORT_SYMBOL(inc_nlink);
void address_space_init_once(struct address_space *mapping)
{
memset(mapping, 0, sizeof(*mapping));
- INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
+ INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC | __GFP_ACCOUNT);
spin_lock_init(&mapping->tree_lock);
init_rwsem(&mapping->i_mmap_rwsem);
INIT_LIST_HEAD(&mapping->private_list);
@@ -1619,6 +1619,13 @@ bool atime_needs_update(const struct path *path, struct inode *inode)
if (inode->i_flags & S_NOATIME)
return false;
+
+ /* Atime updates will likely cause i_uid and i_gid to be written
+ * back improprely if their true value is unknown to the vfs.
+ */
+ if (HAS_UNMAPPED_ID(inode))
+ return false;
+
if (IS_NOATIME(inode))
return false;
if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 116a333e9c77..0f56deb24ce6 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -590,6 +590,7 @@ static long ioctl_file_dedupe_range(struct file *file, void __user *arg)
goto out;
}
+ same->dest_count = count;
ret = vfs_dedupe_file_range(file, same);
if (ret)
goto out;
diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c
index 2ce5b75ee9a5..44af14b2e916 100644
--- a/fs/isofs/compress.c
+++ b/fs/isofs/compress.c
@@ -361,7 +361,6 @@ static int zisofs_readpage(struct file *file, struct page *page)
const struct address_space_operations zisofs_aops = {
.readpage = zisofs_readpage,
- /* No sync_page operation supported? */
/* No bmap operation supported */
};
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 131dedc920d8..761fade7680f 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -174,7 +174,7 @@ struct iso9660_options{
* Compute the hash for the isofs name corresponding to the dentry.
*/
static int
-isofs_hashi_common(struct qstr *qstr, int ms)
+isofs_hashi_common(const struct dentry *dentry, struct qstr *qstr, int ms)
{
const char *name;
int len;
@@ -188,7 +188,7 @@ isofs_hashi_common(struct qstr *qstr, int ms)
len--;
}
- hash = init_name_hash();
+ hash = init_name_hash(dentry);
while (len--) {
c = tolower(*name++);
hash = partial_name_hash(c, hash);
@@ -231,7 +231,7 @@ static int isofs_dentry_cmp_common(
static int
isofs_hashi(const struct dentry *dentry, struct qstr *qstr)
{
- return isofs_hashi_common(qstr, 0);
+ return isofs_hashi_common(dentry, qstr, 0);
}
static int
@@ -246,7 +246,7 @@ isofs_dentry_cmpi(const struct dentry *parent, const struct dentry *dentry,
* Compute the hash for the isofs name corresponding to the dentry.
*/
static int
-isofs_hash_common(struct qstr *qstr, int ms)
+isofs_hash_common(const struct dentry *dentry, struct qstr *qstr, int ms)
{
const char *name;
int len;
@@ -258,7 +258,7 @@ isofs_hash_common(struct qstr *qstr, int ms)
len--;
}
- qstr->hash = full_name_hash(name, len);
+ qstr->hash = full_name_hash(dentry, name, len);
return 0;
}
@@ -266,13 +266,13 @@ isofs_hash_common(struct qstr *qstr, int ms)
static int
isofs_hash_ms(const struct dentry *dentry, struct qstr *qstr)
{
- return isofs_hash_common(qstr, 1);
+ return isofs_hash_common(dentry, qstr, 1);
}
static int
isofs_hashi_ms(const struct dentry *dentry, struct qstr *qstr)
{
- return isofs_hashi_common(qstr, 1);
+ return isofs_hashi_common(dentry, qstr, 1);
}
static int
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 84c4bf3631a2..30eb33ff8189 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -81,6 +81,7 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target,
struct jffs2_full_dirent *fd = NULL, *fd_list;
uint32_t ino = 0;
struct inode *inode = NULL;
+ unsigned int nhash;
jffs2_dbg(1, "jffs2_lookup()\n");
@@ -89,11 +90,14 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target,
dir_f = JFFS2_INODE_INFO(dir_i);
+ /* The 'nhash' on the fd_list is not the same as the dentry hash */
+ nhash = full_name_hash(NULL, target->d_name.name, target->d_name.len);
+
mutex_lock(&dir_f->sem);
/* NB: The 2.2 backport will need to explicitly check for '.' and '..' here */
- for (fd_list = dir_f->dents; fd_list && fd_list->nhash <= target->d_name.hash; fd_list = fd_list->next) {
- if (fd_list->nhash == target->d_name.hash &&
+ for (fd_list = dir_f->dents; fd_list && fd_list->nhash <= nhash; fd_list = fd_list->next) {
+ if (fd_list->nhash == nhash &&
(!fd || fd_list->version > fd->version) &&
strlen(fd_list->name) == target->d_name.len &&
!strncmp(fd_list->name, target->d_name.name, target->d_name.len)) {
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index bfebbf13698c..06a71dbd4833 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -674,7 +674,7 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r
}
}
- fd->nhash = full_name_hash(fd->name, rd->nsize);
+ fd->nhash = full_name_hash(NULL, fd->name, rd->nsize);
fd->next = NULL;
fd->name[rd->nsize] = '\0';
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 9ad5ba4b299b..90431dd613b8 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -1100,7 +1100,7 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo
fd->next = NULL;
fd->version = je32_to_cpu(rd->version);
fd->ino = je32_to_cpu(rd->ino);
- fd->nhash = full_name_hash(fd->name, checkedlen);
+ fd->nhash = full_name_hash(NULL, fd->name, checkedlen);
fd->type = rd->type;
jffs2_add_fd_to_list(c, fd, &ic->scan_dents);
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index bc5385471a6e..be7c8a6a5748 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -476,7 +476,7 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras
fd->next = NULL;
fd->version = je32_to_cpu(spd->version);
fd->ino = je32_to_cpu(spd->ino);
- fd->nhash = full_name_hash(fd->name, checkedlen);
+ fd->nhash = full_name_hash(NULL, fd->name, checkedlen);
fd->type = spd->type;
jffs2_add_fd_to_list(c, fd, &ic->scan_dents);
diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c
index 7fb187ab2682..cda9a361368e 100644
--- a/fs/jffs2/write.c
+++ b/fs/jffs2/write.c
@@ -245,7 +245,7 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff
fd->version = je32_to_cpu(rd->version);
fd->ino = je32_to_cpu(rd->ino);
- fd->nhash = full_name_hash(name, namelen);
+ fd->nhash = full_name_hash(NULL, name, namelen);
fd->type = rd->type;
memcpy(fd->name, name, namelen);
fd->name[namelen]=0;
@@ -598,7 +598,7 @@ int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f,
jffs2_add_fd_to_list(c, fd, &dir_f->dents);
mutex_unlock(&dir_f->sem);
} else {
- uint32_t nhash = full_name_hash(name, namelen);
+ uint32_t nhash = full_name_hash(NULL, name, namelen);
fd = dir_f->dents;
/* We don't actually want to reserve any space, but we do
diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c
index dd824d9b0b1a..a37eb5f8cbc0 100644
--- a/fs/jfs/jfs_debug.c
+++ b/fs/jfs/jfs_debug.c
@@ -58,7 +58,6 @@ static ssize_t jfs_loglevel_proc_write(struct file *file,
}
static const struct file_operations jfs_loglevel_proc_fops = {
- .owner = THIS_MODULE,
.open = jfs_loglevel_proc_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index a74752146ec9..a21ea8b3e5fa 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -2517,7 +2517,6 @@ static int jfs_lmstats_proc_open(struct inode *inode, struct file *file)
}
const struct file_operations jfs_lmstats_proc_fops = {
- .owner = THIS_MODULE,
.open = jfs_lmstats_proc_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c
index e7fa9e513040..489aaa1403e5 100644
--- a/fs/jfs/jfs_metapage.c
+++ b/fs/jfs/jfs_metapage.c
@@ -830,7 +830,6 @@ static int jfs_mpstat_proc_open(struct inode *inode, struct file *file)
}
const struct file_operations jfs_mpstat_proc_fops = {
- .owner = THIS_MODULE,
.open = jfs_mpstat_proc_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index eddf2b6eda85..2e58978d6f45 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -3040,7 +3040,6 @@ static int jfs_txanchor_proc_open(struct inode *inode, struct file *file)
}
const struct file_operations jfs_txanchor_proc_fops = {
- .owner = THIS_MODULE,
.open = jfs_txanchor_proc_open,
.read = seq_read,
.llseek = seq_lseek,
@@ -3081,7 +3080,6 @@ static int jfs_txstats_proc_open(struct inode *inode, struct file *file)
}
const struct file_operations jfs_txstats_proc_fops = {
- .owner = THIS_MODULE,
.open = jfs_txstats_proc_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index 5ad7748860ce..5cde6d2fcfca 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -3894,7 +3894,6 @@ static int jfs_xtstat_proc_open(struct inode *inode, struct file *file)
}
const struct file_operations jfs_xtstat_proc_fops = {
- .owner = THIS_MODULE,
.open = jfs_xtstat_proc_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 539deddecbb0..04baf0dfc40c 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1564,7 +1564,7 @@ static int jfs_ci_hash(const struct dentry *dir, struct qstr *this)
unsigned long hash;
int i;
- hash = init_name_hash();
+ hash = init_name_hash(dir);
for (i=0; i < this->len; i++)
hash = partial_name_hash(tolower(this->name[i]), hash);
this->hash = end_name_hash(hash);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 8a652404eef6..e57174d43683 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -336,11 +336,11 @@ struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
*/
static unsigned int kernfs_name_hash(const char *name, const void *ns)
{
- unsigned long hash = init_name_hash();
+ unsigned long hash = init_name_hash(ns);
unsigned int len = strlen(name);
while (len--)
hash = partial_name_hash(*name++, hash);
- hash = (end_name_hash(hash) ^ hash_ptr((void *)ns, 31));
+ hash = end_name_hash(hash);
hash &= 0x7fffffffU;
/* Reserve hash numbers 0, 1 and INT_MAX for magic directory entries */
if (hash < 2)
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 63534f5f9073..b3d73ad52b22 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -152,6 +152,8 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
struct dentry *root;
info->sb = sb;
+ /* Userspace would break if executables or devices appear on sysfs */
+ sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV;
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = magic;
@@ -241,7 +243,8 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
info->root = root;
info->ns = ns;
- sb = sget(fs_type, kernfs_test_super, kernfs_set_super, flags, info);
+ sb = sget_userns(fs_type, kernfs_test_super, kernfs_set_super, flags,
+ &init_user_ns, info);
if (IS_ERR(sb) || sb->s_fs_info != info)
kfree(info);
if (IS_ERR(sb))
diff --git a/fs/lockd/procfs.c b/fs/lockd/procfs.c
index 2a0a98480e39..8f72cb237ef3 100644
--- a/fs/lockd/procfs.c
+++ b/fs/lockd/procfs.c
@@ -64,7 +64,6 @@ static const struct file_operations lockd_end_grace_operations = {
.read = nlm_end_grace_read,
.llseek = default_llseek,
.release = simple_transaction_release,
- .owner = THIS_MODULE,
};
int __init
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index 2d5336bd4efd..bcd754d216bd 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -95,7 +95,7 @@ static int beyond_eof(struct inode *inode, loff_t bix)
* of each character and pick a prime nearby, preferably a bit-sparse
* one.
*/
-static u32 hash_32(const char *s, int len, u32 seed)
+static u32 logfs_hash_32(const char *s, int len, u32 seed)
{
u32 hash = seed;
int i;
@@ -159,7 +159,7 @@ static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry)
struct qstr *name = &dentry->d_name;
struct page *page;
struct logfs_disk_dentry *dd;
- u32 hash = hash_32(name->name, name->len, 0);
+ u32 hash = logfs_hash_32(name->name, name->len, 0);
pgoff_t index;
int round;
@@ -370,7 +370,7 @@ static int logfs_write_dir(struct inode *dir, struct dentry *dentry,
{
struct page *page;
struct logfs_disk_dentry *dd;
- u32 hash = hash_32(dentry->d_name.name, dentry->d_name.len, 0);
+ u32 hash = logfs_hash_32(dentry->d_name.name, dentry->d_name.len, 0);
pgoff_t index;
int round, err;
diff --git a/fs/mpage.c b/fs/mpage.c
index 2ca1f39c8cba..7a09c55b4bd0 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -50,7 +50,7 @@ static void mpage_end_io(struct bio *bio)
bio_for_each_segment_all(bv, bio, i) {
struct page *page = bv->bv_page;
- page_endio(page, bio_data_dir(bio), bio->bi_error);
+ page_endio(page, bio_op(bio), bio->bi_error);
}
bio_put(bio);
diff --git a/fs/namei.c b/fs/namei.c
index 70580ab1445c..c386a329ab20 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -36,6 +36,7 @@
#include <linux/posix_acl.h>
#include <linux/hash.h>
#include <linux/bitops.h>
+#include <linux/init_task.h>
#include <asm/uaccess.h>
#include "internal.h"
@@ -410,6 +411,14 @@ int __inode_permission(struct inode *inode, int mask)
*/
if (IS_IMMUTABLE(inode))
return -EACCES;
+
+ /*
+ * Updating mtime will likely cause i_uid and i_gid to be
+ * written back improperly if their true value is unknown
+ * to the vfs.
+ */
+ if (HAS_UNMAPPED_ID(inode))
+ return -EACCES;
}
retval = do_inode_permission(inode, mask);
@@ -901,6 +910,7 @@ static inline int may_follow_link(struct nameidata *nd)
{
const struct inode *inode;
const struct inode *parent;
+ kuid_t puid;
if (!sysctl_protected_symlinks)
return 0;
@@ -916,7 +926,8 @@ static inline int may_follow_link(struct nameidata *nd)
return 0;
/* Allowed if parent directory and link owner match. */
- if (uid_eq(parent->i_uid, inode->i_uid))
+ puid = parent->i_uid;
+ if (uid_valid(puid) && uid_eq(puid, inode->i_uid))
return 0;
if (nd->flags & LOOKUP_RCU)
@@ -1089,6 +1100,7 @@ static int follow_automount(struct path *path, struct nameidata *nd,
bool *need_mntput)
{
struct vfsmount *mnt;
+ const struct cred *old_cred;
int err;
if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
@@ -1110,11 +1122,16 @@ static int follow_automount(struct path *path, struct nameidata *nd,
path->dentry->d_inode)
return -EISDIR;
+ if (path->dentry->d_sb->s_user_ns != &init_user_ns)
+ return -EACCES;
+
nd->total_link_count++;
if (nd->total_link_count >= 40)
return -ELOOP;
+ old_cred = override_creds(&init_cred);
mnt = path->dentry->d_op->d_automount(path);
+ revert_creds(old_cred);
if (IS_ERR(mnt)) {
/*
* The filesystem is allowed to return -EISDIR here to indicate
@@ -1449,9 +1466,8 @@ static int follow_dotdot(struct nameidata *nd)
}
/*
- * This looks up the name in dcache, possibly revalidates the old dentry and
- * allocates a new one if not found or not valid. In the need_lookup argument
- * returns whether i_op->lookup is necessary.
+ * This looks up the name in dcache and possibly revalidates the found dentry.
+ * NULL is returned if the dentry does not exist in the cache.
*/
static struct dentry *lookup_dcache(const struct qstr *name,
struct dentry *dir,
@@ -1890,9 +1906,9 @@ static inline unsigned int fold_hash(unsigned long x, unsigned long y)
* payload bytes, to match the way that hash_name() iterates until it
* finds the delimiter after the name.
*/
-unsigned int full_name_hash(const char *name, unsigned int len)
+unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
- unsigned long a, x = 0, y = 0;
+ unsigned long a, x = 0, y = (unsigned long)salt;
for (;;) {
if (!len)
@@ -1911,15 +1927,19 @@ done:
EXPORT_SYMBOL(full_name_hash);
/* Return the "hash_len" (hash and length) of a null-terminated string */
-u64 hashlen_string(const char *name)
+u64 hashlen_string(const void *salt, const char *name)
{
- unsigned long a = 0, x = 0, y = 0, adata, mask, len;
+ unsigned long a = 0, x = 0, y = (unsigned long)salt;
+ unsigned long adata, mask, len;
const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
- len = -sizeof(unsigned long);
+ len = 0;
+ goto inside;
+
do {
HASH_MIX(x, y, a);
len += sizeof(unsigned long);
+inside:
a = load_unaligned_zeropad(name+len);
} while (!has_zero(a, &adata, &constants));
@@ -1935,15 +1955,19 @@ EXPORT_SYMBOL(hashlen_string);
* Calculate the length and hash of the path component, and
* return the "hash_len" as the result.
*/
-static inline u64 hash_name(const char *name)
+static inline u64 hash_name(const void *salt, const char *name)
{
- unsigned long a = 0, b, x = 0, y = 0, adata, bdata, mask, len;
+ unsigned long a = 0, b, x = 0, y = (unsigned long)salt;
+ unsigned long adata, bdata, mask, len;
const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
- len = -sizeof(unsigned long);
+ len = 0;
+ goto inside;
+
do {
HASH_MIX(x, y, a);
len += sizeof(unsigned long);
+inside:
a = load_unaligned_zeropad(name+len);
b = a ^ REPEAT_BYTE('/');
} while (!(has_zero(a, &adata, &constants) | has_zero(b, &bdata, &constants)));
@@ -1959,9 +1983,9 @@ static inline u64 hash_name(const char *name)
#else /* !CONFIG_DCACHE_WORD_ACCESS: Slow, byte-at-a-time version */
/* Return the hash of a string of known length */
-unsigned int full_name_hash(const char *name, unsigned int len)
+unsigned int full_name_hash(const void *salt, const char *name, unsigned int len)
{
- unsigned long hash = init_name_hash();
+ unsigned long hash = init_name_hash(salt);
while (len--)
hash = partial_name_hash((unsigned char)*name++, hash);
return end_name_hash(hash);
@@ -1969,9 +1993,9 @@ unsigned int full_name_hash(const char *name, unsigned int len)
EXPORT_SYMBOL(full_name_hash);
/* Return the "hash_len" (hash and length) of a null-terminated string */
-u64 hashlen_string(const char *name)
+u64 hashlen_string(const void *salt, const char *name)
{
- unsigned long hash = init_name_hash();
+ unsigned long hash = init_name_hash(salt);
unsigned long len = 0, c;
c = (unsigned char)*name;
@@ -1988,9 +2012,9 @@ EXPORT_SYMBOL(hashlen_string);
* We know there's a real path component here of at least
* one character.
*/
-static inline u64 hash_name(const char *name)
+static inline u64 hash_name(const void *salt, const char *name)
{
- unsigned long hash = init_name_hash();
+ unsigned long hash = init_name_hash(salt);
unsigned long len = 0, c;
c = (unsigned char)*name;
@@ -2030,7 +2054,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
if (err)
return err;
- hash_len = hash_name(name);
+ hash_len = hash_name(nd->path.dentry, name);
type = LAST_NORM;
if (name[0] == '.') switch (hashlen_len(hash_len)) {
@@ -2389,33 +2413,6 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
EXPORT_SYMBOL(vfs_path_lookup);
/**
- * lookup_hash - lookup single pathname component on already hashed name
- * @name: name and hash to lookup
- * @base: base directory to lookup from
- *
- * The name must have been verified and hashed (see lookup_one_len()). Using
- * this after just full_name_hash() is unsafe.
- *
- * This function also doesn't check for search permission on base directory.
- *
- * Use lookup_one_len_unlocked() instead, unless you really know what you are
- * doing.
- *
- * Do not hold i_mutex; this helper takes i_mutex if necessary.
- */
-struct dentry *lookup_hash(const struct qstr *name, struct dentry *base)
-{
- struct dentry *ret;
-
- ret = lookup_dcache(name, base, 0);
- if (!ret)
- ret = lookup_slow(name, base, 0);
-
- return ret;
-}
-EXPORT_SYMBOL(lookup_hash);
-
-/**
* lookup_one_len - filesystem helper to lookup single pathname component
* @name: pathname component to lookup
* @base: base directory to lookup from
@@ -2436,7 +2433,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
this.name = name;
this.len = len;
- this.hash = full_name_hash(name, len);
+ this.hash = full_name_hash(base, name, len);
if (!len)
return ERR_PTR(-EACCES);
@@ -2486,10 +2483,11 @@ struct dentry *lookup_one_len_unlocked(const char *name,
struct qstr this;
unsigned int c;
int err;
+ struct dentry *ret;
this.name = name;
this.len = len;
- this.hash = full_name_hash(name, len);
+ this.hash = full_name_hash(base, name, len);
if (!len)
return ERR_PTR(-EACCES);
@@ -2517,7 +2515,10 @@ struct dentry *lookup_one_len_unlocked(const char *name,
if (err)
return ERR_PTR(err);
- return lookup_hash(&this, base);
+ ret = lookup_dcache(&this, base, 0);
+ if (!ret)
+ ret = lookup_slow(&this, base, 0);
+ return ret;
}
EXPORT_SYMBOL(lookup_one_len_unlocked);
@@ -2757,10 +2758,11 @@ EXPORT_SYMBOL(__check_sticky);
* c. have CAP_FOWNER capability
* 6. If the victim is append-only or immutable we can't do antyhing with
* links pointing to it.
- * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
- * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
- * 9. We can't remove a root or mountpoint.
- * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
+ * 7. If the victim has an unknown uid or gid we can't change the inode.
+ * 8. If we were asked to remove a directory and victim isn't one - ENOTDIR.
+ * 9. If we were asked to remove a non-directory and victim isn't one - EISDIR.
+ * 10. We can't remove a root or mountpoint.
+ * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
* nfs_async_unlink().
*/
static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
@@ -2782,7 +2784,7 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
return -EPERM;
if (check_sticky(dir, inode) || IS_APPEND(inode) ||
- IS_IMMUTABLE(inode) || IS_SWAPFILE(inode))
+ IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(inode))
return -EPERM;
if (isdir) {
if (!d_is_dir(victim))
@@ -2803,16 +2805,22 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
* 1. We can't do it if child already exists (open has special treatment for
* this case, but since we are inlined it's OK)
* 2. We can't do it if dir is read-only (done in permission())
- * 3. We should have write and exec permissions on dir
- * 4. We can't do it if dir is immutable (done in permission())
+ * 3. We can't do it if the fs can't represent the fsuid or fsgid.
+ * 4. We should have write and exec permissions on dir
+ * 5. We can't do it if dir is immutable (done in permission())
*/
static inline int may_create(struct inode *dir, struct dentry *child)
{
+ struct user_namespace *s_user_ns;
audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
if (child->d_inode)
return -EEXIST;
if (IS_DEADDIR(dir))
return -ENOENT;
+ s_user_ns = dir->i_sb->s_user_ns;
+ if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
+ !kgid_has_mapping(s_user_ns, current_fsgid()))
+ return -EOVERFLOW;
return inode_permission(dir, MAY_WRITE | MAY_EXEC);
}
@@ -2881,6 +2889,12 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
}
EXPORT_SYMBOL(vfs_create);
+bool may_open_dev(const struct path *path)
+{
+ return !(path->mnt->mnt_flags & MNT_NODEV) &&
+ !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
+}
+
static int may_open(struct path *path, int acc_mode, int flag)
{
struct dentry *dentry = path->dentry;
@@ -2899,7 +2913,7 @@ static int may_open(struct path *path, int acc_mode, int flag)
break;
case S_IFBLK:
case S_IFCHR:
- if (path->mnt->mnt_flags & MNT_NODEV)
+ if (!may_open_dev(path))
return -EACCES;
/*FALLTHRU*/
case S_IFIFO:
@@ -4151,6 +4165,13 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
*/
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
return -EPERM;
+ /*
+ * Updating the link count will likely cause i_uid and i_gid to
+ * be writen back improperly if their true value is unknown to
+ * the vfs.
+ */
+ if (HAS_UNMAPPED_ID(inode))
+ return -EPERM;
if (!dir->i_op->link)
return -EPERM;
if (S_ISDIR(inode->i_mode))
@@ -4328,7 +4349,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
* Check source == target.
* On overlayfs need to look at underlying inodes.
*/
- if (vfs_select_inode(old_dentry, 0) == vfs_select_inode(new_dentry, 0))
+ if (d_real_inode(old_dentry) == d_real_inode(new_dentry))
return 0;
error = may_delete(old_dir, old_dentry, is_dir);
diff --git a/fs/namespace.c b/fs/namespace.c
index 419f746d851d..7bb2cda3bfef 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2186,13 +2186,7 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
}
if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) &&
!(mnt_flags & MNT_NODEV)) {
- /* Was the nodev implicitly added in mount? */
- if ((mnt->mnt_ns->user_ns != &init_user_ns) &&
- !(sb->s_type->fs_flags & FS_USERNS_DEV_MOUNT)) {
- mnt_flags |= MNT_NODEV;
- } else {
- return -EPERM;
- }
+ return -EPERM;
}
if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) &&
!(mnt_flags & MNT_NOSUID)) {
@@ -2376,7 +2370,7 @@ unlock:
return err;
}
-static bool fs_fully_visible(struct file_system_type *fs_type, int *new_mnt_flags);
+static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags);
/*
* create a new mount for userspace and request it to be added into the
@@ -2386,7 +2380,6 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
int mnt_flags, const char *name, void *data)
{
struct file_system_type *type;
- struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns;
struct vfsmount *mnt;
int err;
@@ -2397,26 +2390,6 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
if (!type)
return -ENODEV;
- if (user_ns != &init_user_ns) {
- if (!(type->fs_flags & FS_USERNS_MOUNT)) {
- put_filesystem(type);
- return -EPERM;
- }
- /* Only in special cases allow devices from mounts
- * created outside the initial user namespace.
- */
- if (!(type->fs_flags & FS_USERNS_DEV_MOUNT)) {
- flags |= MS_NODEV;
- mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV;
- }
- if (type->fs_flags & FS_USERNS_VISIBLE) {
- if (!fs_fully_visible(type, &mnt_flags)) {
- put_filesystem(type);
- return -EPERM;
- }
- }
- }
-
mnt = vfs_kern_mount(type, flags, name, data);
if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
!mnt->mnt_sb->s_subtype)
@@ -2426,6 +2399,11 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
if (IS_ERR(mnt))
return PTR_ERR(mnt);
+ if (mount_too_revealing(mnt, &mnt_flags)) {
+ mntput(mnt);
+ return -EPERM;
+ }
+
err = do_add_mount(real_mount(mnt), path, mnt_flags);
if (err)
mntput(mnt);
@@ -3217,22 +3195,19 @@ bool current_chrooted(void)
return chrooted;
}
-static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
+static bool mnt_already_visible(struct mnt_namespace *ns, struct vfsmount *new,
+ int *new_mnt_flags)
{
- struct mnt_namespace *ns = current->nsproxy->mnt_ns;
int new_flags = *new_mnt_flags;
struct mount *mnt;
bool visible = false;
- if (unlikely(!ns))
- return false;
-
down_read(&namespace_sem);
list_for_each_entry(mnt, &ns->list, mnt_list) {
struct mount *child;
int mnt_flags;
- if (mnt->mnt.mnt_sb->s_type != type)
+ if (mnt->mnt.mnt_sb->s_type != new->mnt_sb->s_type)
continue;
/* This mount is not fully visible if it's root directory
@@ -3241,12 +3216,8 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root)
continue;
- /* Read the mount flags and filter out flags that
- * may safely be ignored.
- */
+ /* A local view of the mount flags */
mnt_flags = mnt->mnt.mnt_flags;
- if (mnt->mnt.mnt_sb->s_iflags & SB_I_NOEXEC)
- mnt_flags &= ~(MNT_LOCK_NOSUID | MNT_LOCK_NOEXEC);
/* Don't miss readonly hidden in the superblock flags */
if (mnt->mnt.mnt_sb->s_flags & MS_RDONLY)
@@ -3258,15 +3229,6 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
if ((mnt_flags & MNT_LOCK_READONLY) &&
!(new_flags & MNT_READONLY))
continue;
- if ((mnt_flags & MNT_LOCK_NODEV) &&
- !(new_flags & MNT_NODEV))
- continue;
- if ((mnt_flags & MNT_LOCK_NOSUID) &&
- !(new_flags & MNT_NOSUID))
- continue;
- if ((mnt_flags & MNT_LOCK_NOEXEC) &&
- !(new_flags & MNT_NOEXEC))
- continue;
if ((mnt_flags & MNT_LOCK_ATIME) &&
((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK)))
continue;
@@ -3286,9 +3248,6 @@ static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags)
}
/* Preserve the locked attributes */
*new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \
- MNT_LOCK_NODEV | \
- MNT_LOCK_NOSUID | \
- MNT_LOCK_NOEXEC | \
MNT_LOCK_ATIME);
visible = true;
goto found;
@@ -3299,6 +3258,42 @@ found:
return visible;
}
+static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags)
+{
+ const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV;
+ struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+ unsigned long s_iflags;
+
+ if (ns->user_ns == &init_user_ns)
+ return false;
+
+ /* Can this filesystem be too revealing? */
+ s_iflags = mnt->mnt_sb->s_iflags;
+ if (!(s_iflags & SB_I_USERNS_VISIBLE))
+ return false;
+
+ if ((s_iflags & required_iflags) != required_iflags) {
+ WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n",
+ required_iflags);
+ return true;
+ }
+
+ return !mnt_already_visible(ns, mnt, new_mnt_flags);
+}
+
+bool mnt_may_suid(struct vfsmount *mnt)
+{
+ /*
+ * Foreign mounts (accessed via fchdir or through /proc
+ * symlinks) are always treated as if they are nosuid. This
+ * prevents namespaces from trusting potentially unsafe
+ * suid/sgid bits, file caps, or security labels that originate
+ * in other namespaces.
+ */
+ return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) &&
+ current_in_userns(mnt->mnt_sb->s_user_ns);
+}
+
static struct ns_common *mntns_get(struct task_struct *task)
{
struct ns_common *ns = NULL;
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index bfdad003ee56..9add7ab747a5 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -139,7 +139,7 @@ ncp_hash_dentry(const struct dentry *dentry, struct qstr *this)
int i;
t = NCP_IO_TABLE(sb);
- hash = init_name_hash();
+ hash = init_name_hash(dentry);
for (i=0; i<this->len ; i++)
hash = partial_name_hash(ncp_tolower(t, this->name[i]),
hash);
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 8664417955a2..6abdda209642 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o
CFLAGS_nfstrace.o += -I$(src)
nfs-y := client.o dir.o file.o getroot.o inode.o super.o \
- direct.o pagelist.o read.o symlink.o unlink.o \
+ io.o direct.o pagelist.o read.o symlink.o unlink.o \
write.o namespace.o mount_clnt.o nfstrace.o
nfs-$(CONFIG_ROOT_NFS) += nfsroot.o
nfs-$(CONFIG_SYSCTL) += sysctl.o
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index e5b89675263e..a69ef4e9c24c 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -65,8 +65,8 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
if (!p)
return -EIO;
b->simple.nr_sigs = be32_to_cpup(p++);
- if (!b->simple.nr_sigs) {
- dprintk("no signature\n");
+ if (!b->simple.nr_sigs || b->simple.nr_sigs > PNFS_BLOCK_MAX_UUIDS) {
+ dprintk("Bad signature count: %d\n", b->simple.nr_sigs);
return -EIO;
}
@@ -89,7 +89,8 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
memcpy(&b->simple.sigs[i].sig, p,
b->simple.sigs[i].sig_len);
- b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len;
+ b->simple.len += 8 + 4 + \
+ (XDR_QUADLEN(b->simple.sigs[i].sig_len) << 2);
}
break;
case PNFS_BLOCK_VOLUME_SLICE:
@@ -104,7 +105,12 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
p = xdr_inline_decode(xdr, 4);
if (!p)
return -EIO;
+
b->concat.volumes_count = be32_to_cpup(p++);
+ if (b->concat.volumes_count > PNFS_BLOCK_MAX_DEVICES) {
+ dprintk("Too many volumes: %d\n", b->concat.volumes_count);
+ return -EIO;
+ }
p = xdr_inline_decode(xdr, b->concat.volumes_count * 4);
if (!p)
@@ -116,8 +122,13 @@ nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
p = xdr_inline_decode(xdr, 8 + 4);
if (!p)
return -EIO;
+
p = xdr_decode_hyper(p, &b->stripe.chunk_size);
b->stripe.volumes_count = be32_to_cpup(p++);
+ if (b->stripe.volumes_count > PNFS_BLOCK_MAX_DEVICES) {
+ dprintk("Too many volumes: %d\n", b->stripe.volumes_count);
+ return -EIO;
+ }
p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4);
if (!p)
@@ -224,18 +235,20 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
struct pnfs_block_volume *v = &volumes[idx];
+ struct block_device *bdev;
dev_t dev;
dev = bl_resolve_deviceid(server, v, gfp_mask);
if (!dev)
return -EIO;
- d->bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL);
- if (IS_ERR(d->bdev)) {
+ bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL);
+ if (IS_ERR(bdev)) {
printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
- MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev));
- return PTR_ERR(d->bdev);
+ MAJOR(dev), MINOR(dev), PTR_ERR(bdev));
+ return PTR_ERR(bdev);
}
+ d->bdev = bdev;
d->len = i_size_read(d->bdev->bd_inode);
@@ -287,44 +300,71 @@ bl_validate_designator(struct pnfs_block_volume *v)
}
}
+/*
+ * Try to open the udev path for the WWN. At least on Debian the udev
+ * by-id path will always point to the dm-multipath device if one exists.
+ */
+static struct block_device *
+bl_open_udev_path(struct pnfs_block_volume *v)
+{
+ struct block_device *bdev;
+ const char *devname;
+
+ devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%*phN",
+ v->scsi.designator_len, v->scsi.designator);
+ if (!devname)
+ return ERR_PTR(-ENOMEM);
+
+ bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL);
+ if (IS_ERR(bdev)) {
+ pr_warn("pNFS: failed to open device %s (%ld)\n",
+ devname, PTR_ERR(bdev));
+ }
+
+ kfree(devname);
+ return bdev;
+}
+
+/*
+ * Try to open the RH/Fedora specific dm-mpath udev path for this WWN, as the
+ * wwn- links will only point to the first discovered SCSI device there.
+ */
+static struct block_device *
+bl_open_dm_mpath_udev_path(struct pnfs_block_volume *v)
+{
+ struct block_device *bdev;
+ const char *devname;
+
+ devname = kasprintf(GFP_KERNEL,
+ "/dev/disk/by-id/dm-uuid-mpath-%d%*phN",
+ v->scsi.designator_type,
+ v->scsi.designator_len, v->scsi.designator);
+ if (!devname)
+ return ERR_PTR(-ENOMEM);
+
+ bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL);
+ kfree(devname);
+ return bdev;
+}
+
static int
bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
{
struct pnfs_block_volume *v = &volumes[idx];
+ struct block_device *bdev;
const struct pr_ops *ops;
- const char *devname;
int error;
if (!bl_validate_designator(v))
return -EINVAL;
- switch (v->scsi.designator_len) {
- case 8:
- devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%8phN",
- v->scsi.designator);
- break;
- case 12:
- devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%12phN",
- v->scsi.designator);
- break;
- case 16:
- devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%16phN",
- v->scsi.designator);
- break;
- default:
- return -EINVAL;
- }
-
- d->bdev = blkdev_get_by_path(devname, FMODE_READ, NULL);
- if (IS_ERR(d->bdev)) {
- pr_warn("pNFS: failed to open device %s (%ld)\n",
- devname, PTR_ERR(d->bdev));
- kfree(devname);
- return PTR_ERR(d->bdev);
- }
-
- kfree(devname);
+ bdev = bl_open_dm_mpath_udev_path(v);
+ if (IS_ERR(bdev))
+ bdev = bl_open_udev_path(v);
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
+ d->bdev = bdev;
d->len = i_size_read(d->bdev->bd_inode);
d->map = bl_map_simple;
@@ -352,7 +392,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
return 0;
out_blkdev_put:
- blkdev_put(d->bdev, FMODE_READ);
+ blkdev_put(d->bdev, FMODE_READ | FMODE_WRITE);
return error;
}
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 720b3ff55fa9..992bcb19c11e 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -121,6 +121,16 @@ ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
return be;
}
+static void __ext_put_deviceids(struct list_head *head)
+{
+ struct pnfs_block_extent *be, *tmp;
+
+ list_for_each_entry_safe(be, tmp, head, be_list) {
+ nfs4_put_deviceid_node(be->be_device);
+ kfree(be);
+ }
+}
+
static void
__ext_tree_insert(struct rb_root *root,
struct pnfs_block_extent *new, bool merge_ok)
@@ -163,7 +173,8 @@ free_new:
}
static int
-__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
+__ext_tree_remove(struct rb_root *root,
+ sector_t start, sector_t end, struct list_head *tmp)
{
struct pnfs_block_extent *be;
sector_t len1 = 0, len2 = 0;
@@ -223,8 +234,7 @@ __ext_tree_remove(struct rb_root *root, sector_t start, sector_t end)
struct pnfs_block_extent *next = ext_tree_next(be);
rb_erase(&be->be_node, root);
- nfs4_put_deviceid_node(be->be_device);
- kfree(be);
+ list_add_tail(&be->be_list, tmp);
be = next;
}
@@ -350,16 +360,18 @@ int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
sector_t start, sector_t end)
{
int err, err2;
+ LIST_HEAD(tmp);
spin_lock(&bl->bl_ext_lock);
- err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
+ err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
if (rw) {
- err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end);
+ err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end, &tmp);
if (!err)
err = err2;
}
spin_unlock(&bl->bl_ext_lock);
+ __ext_put_deviceids(&tmp);
return err;
}
@@ -396,12 +408,13 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
sector_t end = start + len;
struct pnfs_block_extent *be;
int err = 0;
+ LIST_HEAD(tmp);
spin_lock(&bl->bl_ext_lock);
/*
* First remove all COW extents or holes from written to range.
*/
- err = __ext_tree_remove(&bl->bl_ext_ro, start, end);
+ err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
if (err)
goto out;
@@ -459,6 +472,8 @@ ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
}
out:
spin_unlock(&bl->bl_ext_lock);
+
+ __ext_put_deviceids(&tmp);
return err;
}
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index aaa2e8d3df6f..c92a75e066a6 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -119,27 +119,30 @@ out:
* hashed by filehandle.
*/
static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
- struct nfs_fh *fh, nfs4_stateid *stateid)
+ struct nfs_fh *fh)
{
struct nfs_server *server;
+ struct nfs_inode *nfsi;
struct inode *ino;
struct pnfs_layout_hdr *lo;
+restart:
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
list_for_each_entry(lo, &server->layouts, plh_layouts) {
- if (!nfs4_stateid_match_other(&lo->plh_stateid, stateid))
+ nfsi = NFS_I(lo->plh_inode);
+ if (nfs_compare_fh(fh, &nfsi->fh))
continue;
- if (nfs_compare_fh(fh, &NFS_I(lo->plh_inode)->fh))
+ if (nfsi->layout != lo)
continue;
ino = igrab(lo->plh_inode);
if (!ino)
break;
spin_lock(&ino->i_lock);
/* Is this layout in the process of being freed? */
- if (NFS_I(ino)->layout != lo) {
+ if (nfsi->layout != lo) {
spin_unlock(&ino->i_lock);
iput(ino);
- break;
+ goto restart;
}
pnfs_get_layout_hdr(lo);
spin_unlock(&ino->i_lock);
@@ -151,13 +154,13 @@ static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp,
}
static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
- struct nfs_fh *fh, nfs4_stateid *stateid)
+ struct nfs_fh *fh)
{
struct pnfs_layout_hdr *lo;
spin_lock(&clp->cl_lock);
rcu_read_lock();
- lo = get_layout_by_fh_locked(clp, fh, stateid);
+ lo = get_layout_by_fh_locked(clp, fh);
rcu_read_unlock();
spin_unlock(&clp->cl_lock);
@@ -167,17 +170,39 @@ static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
/*
* Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing)
*/
-static bool pnfs_check_stateid_sequence(struct pnfs_layout_hdr *lo,
+static u32 pnfs_check_callback_stateid(struct pnfs_layout_hdr *lo,
const nfs4_stateid *new)
{
u32 oldseq, newseq;
- oldseq = be32_to_cpu(lo->plh_stateid.seqid);
+ /* Is the stateid still not initialised? */
+ if (!pnfs_layout_is_valid(lo))
+ return NFS4ERR_DELAY;
+
+ /* Mismatched stateid? */
+ if (!nfs4_stateid_match_other(&lo->plh_stateid, new))
+ return NFS4ERR_BAD_STATEID;
+
newseq = be32_to_cpu(new->seqid);
+ /* Are we already in a layout recall situation? */
+ if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) &&
+ lo->plh_return_seq != 0) {
+ if (newseq < lo->plh_return_seq)
+ return NFS4ERR_OLD_STATEID;
+ if (newseq > lo->plh_return_seq)
+ return NFS4ERR_DELAY;
+ goto out;
+ }
+ /* Check that the stateid matches what we think it should be. */
+ oldseq = be32_to_cpu(lo->plh_stateid.seqid);
if (newseq > oldseq + 1)
- return false;
- return true;
+ return NFS4ERR_DELAY;
+ /* Crazy server! */
+ if (newseq <= oldseq)
+ return NFS4ERR_OLD_STATEID;
+out:
+ return NFS_OK;
}
static u32 initiate_file_draining(struct nfs_client *clp,
@@ -188,7 +213,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
u32 rv = NFS4ERR_NOMATCHING_LAYOUT;
LIST_HEAD(free_me_list);
- lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid);
+ lo = get_layout_by_fh(clp, &args->cbl_fh);
if (!lo) {
trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL,
&args->cbl_stateid, -rv);
@@ -196,18 +221,15 @@ static u32 initiate_file_draining(struct nfs_client *clp,
}
ino = lo->plh_inode;
+ pnfs_layoutcommit_inode(ino, false);
+
spin_lock(&ino->i_lock);
- if (!pnfs_check_stateid_sequence(lo, &args->cbl_stateid)) {
- rv = NFS4ERR_DELAY;
+ rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid);
+ if (rv != NFS_OK)
goto unlock;
- }
pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
- spin_unlock(&ino->i_lock);
-
- pnfs_layoutcommit_inode(ino, false);
- spin_lock(&ino->i_lock);
/*
* Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return)
*/
@@ -223,11 +245,13 @@ static u32 initiate_file_draining(struct nfs_client *clp,
goto unlock;
}
+ /* Embrace your forgetfulness! */
+ rv = NFS4ERR_NOMATCHING_LAYOUT;
+
if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
&args->cbl_range);
}
- pnfs_mark_layout_returned_if_empty(lo);
unlock:
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&free_me_list);
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index d81f96aacd51..656f68f7fe53 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -925,7 +925,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
if (hdr_arg.minorversion == 0) {
cps.clp = nfs4_find_client_ident(SVC_NET(rqstp), hdr_arg.cb_ident);
if (!cps.clp || !check_gss_callback_principal(cps.clp, rqstp))
- return rpc_drop_reply;
+ goto out_invalidcred;
}
cps.minorversion = hdr_arg.minorversion;
@@ -953,6 +953,10 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
nfs_put_client(cps.clp);
dprintk("%s: done, status = %u\n", __func__, ntohl(status));
return rpc_success;
+
+out_invalidcred:
+ pr_warn_ratelimited("NFS: NFSv4 callback contains invalid cred\n");
+ return rpc_autherr_badcred;
}
/*
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 0c96528db94a..003ebce4bbc4 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -367,8 +367,6 @@ nfs_found_client(const struct nfs_client_initdata *cl_init,
*/
struct nfs_client *
nfs_get_client(const struct nfs_client_initdata *cl_init,
- const struct rpc_timeout *timeparms,
- const char *ip_addr,
rpc_authflavor_t authflavour)
{
struct nfs_client *clp, *new = NULL;
@@ -399,7 +397,7 @@ nfs_get_client(const struct nfs_client_initdata *cl_init,
&nn->nfs_client_list);
spin_unlock(&nn->nfs_client_lock);
new->cl_flags = cl_init->init_flags;
- return rpc_ops->init_client(new, timeparms, ip_addr);
+ return rpc_ops->init_client(new, cl_init);
}
spin_unlock(&nn->nfs_client_lock);
@@ -470,7 +468,7 @@ EXPORT_SYMBOL_GPL(nfs_init_timeout_values);
* Create an RPC client handle
*/
int nfs_create_rpc_client(struct nfs_client *clp,
- const struct rpc_timeout *timeparms,
+ const struct nfs_client_initdata *cl_init,
rpc_authflavor_t flavor)
{
struct rpc_clnt *clnt = NULL;
@@ -479,8 +477,9 @@ int nfs_create_rpc_client(struct nfs_client *clp,
.protocol = clp->cl_proto,
.address = (struct sockaddr *)&clp->cl_addr,
.addrsize = clp->cl_addrlen,
- .timeout = timeparms,
+ .timeout = cl_init->timeparms,
.servername = clp->cl_hostname,
+ .nodename = cl_init->nodename,
.program = &nfs_program,
.version = clp->rpc_ops->version,
.authflavor = flavor,
@@ -591,14 +590,12 @@ EXPORT_SYMBOL_GPL(nfs_init_server_rpcclient);
* nfs_init_client - Initialise an NFS2 or NFS3 client
*
* @clp: nfs_client to initialise
- * @timeparms: timeout parameters for underlying RPC transport
- * @ip_addr: IP presentation address (not used)
+ * @cl_init: Initialisation parameters
*
* Returns pointer to an NFS client, or an ERR_PTR value.
*/
struct nfs_client *nfs_init_client(struct nfs_client *clp,
- const struct rpc_timeout *timeparms,
- const char *ip_addr)
+ const struct nfs_client_initdata *cl_init)
{
int error;
@@ -612,7 +609,7 @@ struct nfs_client *nfs_init_client(struct nfs_client *clp,
* Create a client RPC handle for doing FSSTAT with UNIX auth only
* - RFC 2623, sec 2.3.2
*/
- error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX);
+ error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX);
if (error < 0)
goto error;
nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -633,6 +630,7 @@ static int nfs_init_server(struct nfs_server *server,
const struct nfs_parsed_mount_data *data,
struct nfs_subversion *nfs_mod)
{
+ struct rpc_timeout timeparms;
struct nfs_client_initdata cl_init = {
.hostname = data->nfs_server.hostname,
.addr = (const struct sockaddr *)&data->nfs_server.address,
@@ -640,8 +638,8 @@ static int nfs_init_server(struct nfs_server *server,
.nfs_mod = nfs_mod,
.proto = data->nfs_server.protocol,
.net = data->net,
+ .timeparms = &timeparms,
};
- struct rpc_timeout timeparms;
struct nfs_client *clp;
int error;
@@ -653,7 +651,7 @@ static int nfs_init_server(struct nfs_server *server,
set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
/* Allocate or find a client reference we can use */
- clp = nfs_get_client(&cl_init, &timeparms, NULL, RPC_AUTH_UNIX);
+ clp = nfs_get_client(&cl_init, RPC_AUTH_UNIX);
if (IS_ERR(clp)) {
dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp));
return PTR_ERR(clp);
@@ -1102,7 +1100,6 @@ static const struct file_operations nfs_server_list_fops = {
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_net,
- .owner = THIS_MODULE,
};
static int nfs_volume_list_open(struct inode *inode, struct file *file);
@@ -1123,7 +1120,6 @@ static const struct file_operations nfs_volume_list_fops = {
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_net,
- .owner = THIS_MODULE,
};
/*
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 19d93d0cd400..177fefb26c18 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -232,7 +232,7 @@ int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int le
* in a page cache page which kmemleak does not scan.
*/
kmemleak_not_leak(string->name);
- string->hash = full_name_hash(name, len);
+ string->hash = full_name_hash(NULL, name, len);
return 0;
}
@@ -502,7 +502,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
if (filename.len == 2 && filename.name[1] == '.')
return;
}
- filename.hash = full_name_hash(filename.name, filename.len);
+ filename.hash = full_name_hash(parent, filename.name, filename.len);
dentry = d_lookup(parent, &filename);
again:
@@ -734,7 +734,7 @@ struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
struct page *page;
for (;;) {
- page = read_cache_page(file_inode(desc->file)->i_mapping,
+ page = read_cache_page(desc->file->f_mapping,
desc->page_index, (filler_t *)nfs_readdir_filler, desc);
if (IS_ERR(page) || grab_page(page))
break;
@@ -1397,19 +1397,18 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in
if (IS_ERR(label))
goto out;
- /* Protect against concurrent sillydeletes */
trace_nfs_lookup_enter(dir, dentry, flags);
error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, label);
if (error == -ENOENT)
goto no_entry;
if (error < 0) {
res = ERR_PTR(error);
- goto out_unblock_sillyrename;
+ goto out_label;
}
inode = nfs_fhget(dentry->d_sb, fhandle, fattr, label);
res = ERR_CAST(inode);
if (IS_ERR(res))
- goto out_unblock_sillyrename;
+ goto out_label;
/* Success: notify readdir to use READDIRPLUS */
nfs_advise_use_readdirplus(dir);
@@ -1418,11 +1417,11 @@ no_entry:
res = d_splice_alias(inode, dentry);
if (res != NULL) {
if (IS_ERR(res))
- goto out_unblock_sillyrename;
+ goto out_label;
dentry = res;
}
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-out_unblock_sillyrename:
+out_label:
trace_nfs_lookup_exit(dir, dentry, flags, error);
nfs4_label_free(label);
out:
@@ -2253,21 +2252,37 @@ static struct nfs_access_entry *nfs_access_search_rbtree(struct inode *inode, st
return NULL;
}
-static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res)
+static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, struct nfs_access_entry *res, bool may_block)
{
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_access_entry *cache;
- int err = -ENOENT;
+ bool retry = true;
+ int err;
spin_lock(&inode->i_lock);
- if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
- goto out_zap;
- cache = nfs_access_search_rbtree(inode, cred);
- if (cache == NULL)
- goto out;
- if (!nfs_have_delegated_attributes(inode) &&
- !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
- goto out_stale;
+ for(;;) {
+ if (nfsi->cache_validity & NFS_INO_INVALID_ACCESS)
+ goto out_zap;
+ cache = nfs_access_search_rbtree(inode, cred);
+ err = -ENOENT;
+ if (cache == NULL)
+ goto out;
+ /* Found an entry, is our attribute cache valid? */
+ if (!nfs_attribute_cache_expired(inode) &&
+ !(nfsi->cache_validity & NFS_INO_INVALID_ATTR))
+ break;
+ err = -ECHILD;
+ if (!may_block)
+ goto out;
+ if (!retry)
+ goto out_zap;
+ spin_unlock(&inode->i_lock);
+ err = __nfs_revalidate_inode(NFS_SERVER(inode), inode);
+ if (err)
+ return err;
+ spin_lock(&inode->i_lock);
+ retry = false;
+ }
res->jiffies = cache->jiffies;
res->cred = cache->cred;
res->mask = cache->mask;
@@ -2276,12 +2291,6 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
out:
spin_unlock(&inode->i_lock);
return err;
-out_stale:
- rb_erase(&cache->rb_node, &nfsi->access_cache);
- list_del(&cache->lru);
- spin_unlock(&inode->i_lock);
- nfs_access_free_entry(cache);
- return -ENOENT;
out_zap:
spin_unlock(&inode->i_lock);
nfs_access_zap_cache(inode);
@@ -2308,13 +2317,12 @@ static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred,
cache = NULL;
if (cache == NULL)
goto out;
- if (!nfs_have_delegated_attributes(inode) &&
- !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
+ err = nfs_revalidate_inode_rcu(NFS_SERVER(inode), inode);
+ if (err)
goto out;
res->jiffies = cache->jiffies;
res->cred = cache->cred;
res->mask = cache->mask;
- err = 0;
out:
rcu_read_unlock();
return err;
@@ -2403,18 +2411,19 @@ EXPORT_SYMBOL_GPL(nfs_access_set_mask);
static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask)
{
struct nfs_access_entry cache;
+ bool may_block = (mask & MAY_NOT_BLOCK) == 0;
int status;
trace_nfs_access_enter(inode);
status = nfs_access_get_cached_rcu(inode, cred, &cache);
if (status != 0)
- status = nfs_access_get_cached(inode, cred, &cache);
+ status = nfs_access_get_cached(inode, cred, &cache, may_block);
if (status == 0)
goto out_cached;
status = -ECHILD;
- if (mask & MAY_NOT_BLOCK)
+ if (!may_block)
goto out;
/* Be clever: ask server to check for all possible rights */
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index c7326c2af2c3..72b7d13ee3c6 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -196,6 +196,12 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
WARN_ON_ONCE(verfp->committed < 0);
}
+static int nfs_direct_cmp_verf(const struct nfs_writeverf *v1,
+ const struct nfs_writeverf *v2)
+{
+ return nfs_write_verifier_cmp(&v1->verifier, &v2->verifier);
+}
+
/*
* nfs_direct_cmp_hdr_verf - compare verifier for pgio header
* @dreq - direct request possibly spanning multiple servers
@@ -215,7 +221,7 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
nfs_direct_set_hdr_verf(dreq, hdr);
return 0;
}
- return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
+ return nfs_direct_cmp_verf(verfp, &hdr->verf);
}
/*
@@ -238,15 +244,13 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
if (verfp->committed < 0)
return 1;
- return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
+ return nfs_direct_cmp_verf(verfp, &data->verf);
}
/**
* nfs_direct_IO - NFS address space operation for direct I/O
* @iocb: target I/O control block
- * @iov: array of vectors that define I/O buffer
- * @pos: offset in file to begin the operation
- * @nr_segs: size of iovec array
+ * @iter: I/O buffer
*
* The presence of this routine in the address space ops vector means
* the NFS client supports direct I/O. However, for most direct IO, we
@@ -368,22 +372,10 @@ out:
* Synchronous I/O uses a stack-allocated iocb. Thus we can't trust
* the iocb is still valid here if this is a synchronous request.
*/
-static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write)
+static void nfs_direct_complete(struct nfs_direct_req *dreq)
{
struct inode *inode = dreq->inode;
- if (dreq->iocb && write) {
- loff_t pos = dreq->iocb->ki_pos + dreq->count;
-
- spin_lock(&inode->i_lock);
- if (i_size_read(inode) < pos)
- i_size_write(inode, pos);
- spin_unlock(&inode->i_lock);
- }
-
- if (write)
- nfs_zap_mapping(inode, inode->i_mapping);
-
inode_dio_end(inode);
if (dreq->iocb) {
@@ -438,7 +430,7 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
}
out_put:
if (put_dreq(dreq))
- nfs_direct_complete(dreq, false);
+ nfs_direct_complete(dreq);
hdr->release(hdr);
}
@@ -544,7 +536,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
}
if (put_dreq(dreq))
- nfs_direct_complete(dreq, false);
+ nfs_direct_complete(dreq);
return 0;
}
@@ -585,17 +577,12 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
if (!count)
goto out;
- inode_lock(inode);
- result = nfs_sync_mapping(mapping);
- if (result)
- goto out_unlock;
-
task_io_account_read(count);
result = -ENOMEM;
dreq = nfs_direct_req_alloc();
if (dreq == NULL)
- goto out_unlock;
+ goto out;
dreq->inode = inode;
dreq->bytes_left = dreq->max_count = count;
@@ -610,10 +597,12 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
if (!is_sync_kiocb(iocb))
dreq->iocb = iocb;
+ nfs_start_io_direct(inode);
+
NFS_I(inode)->read_io += count;
result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
- inode_unlock(inode);
+ nfs_end_io_direct(inode);
if (!result) {
result = nfs_direct_wait(dreq);
@@ -621,13 +610,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
iocb->ki_pos += result;
}
- nfs_direct_req_release(dreq);
- return result;
-
out_release:
nfs_direct_req_release(dreq);
-out_unlock:
- inode_unlock(inode);
out:
return result;
}
@@ -659,6 +643,8 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
dreq->count = 0;
+ dreq->verf.committed = NFS_INVALID_STABLE_HOW;
+ nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
for (i = 0; i < dreq->mirror_count; i++)
dreq->mirrors[i].count = 0;
get_dreq(dreq);
@@ -777,7 +763,8 @@ static void nfs_direct_write_schedule_work(struct work_struct *work)
nfs_direct_write_reschedule(dreq);
break;
default:
- nfs_direct_complete(dreq, true);
+ nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping);
+ nfs_direct_complete(dreq);
}
}
@@ -993,6 +980,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
{
ssize_t result = -EINVAL;
+ size_t count;
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
@@ -1003,34 +991,24 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
file, iov_iter_count(iter), (long long) iocb->ki_pos);
- nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES,
- iov_iter_count(iter));
+ result = generic_write_checks(iocb, iter);
+ if (result <= 0)
+ return result;
+ count = result;
+ nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);
pos = iocb->ki_pos;
end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT;
- inode_lock(inode);
-
- result = nfs_sync_mapping(mapping);
- if (result)
- goto out_unlock;
-
- if (mapping->nrpages) {
- result = invalidate_inode_pages2_range(mapping,
- pos >> PAGE_SHIFT, end);
- if (result)
- goto out_unlock;
- }
-
- task_io_account_write(iov_iter_count(iter));
+ task_io_account_write(count);
result = -ENOMEM;
dreq = nfs_direct_req_alloc();
if (!dreq)
- goto out_unlock;
+ goto out;
dreq->inode = inode;
- dreq->bytes_left = dreq->max_count = iov_iter_count(iter);
+ dreq->bytes_left = dreq->max_count = count;
dreq->io_start = pos;
dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
l_ctx = nfs_get_lock_context(dreq->ctx);
@@ -1042,6 +1020,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
if (!is_sync_kiocb(iocb))
dreq->iocb = iocb;
+ nfs_start_io_direct(inode);
+
result = nfs_direct_write_schedule_iovec(dreq, iter, pos);
if (mapping->nrpages) {
@@ -1049,30 +1029,19 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
pos >> PAGE_SHIFT, end);
}
- inode_unlock(inode);
+ nfs_end_io_direct(inode);
if (!result) {
result = nfs_direct_wait(dreq);
if (result > 0) {
- struct inode *inode = mapping->host;
-
iocb->ki_pos = pos + result;
- spin_lock(&inode->i_lock);
- if (i_size_read(inode) < iocb->ki_pos)
- i_size_write(inode, iocb->ki_pos);
- spin_unlock(&inode->i_lock);
-
/* XXX: should check the generic_write_sync retval */
generic_write_sync(iocb, result);
}
}
- nfs_direct_req_release(dreq);
- return result;
-
out_release:
nfs_direct_req_release(dreq);
-out_unlock:
- inode_unlock(inode);
+out:
return result;
}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 717a8d6af52d..7d620970f2e1 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -170,12 +170,14 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to)
iocb->ki_filp,
iov_iter_count(to), (unsigned long) iocb->ki_pos);
- result = nfs_revalidate_mapping_protected(inode, iocb->ki_filp->f_mapping);
+ nfs_start_io_read(inode);
+ result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
if (!result) {
result = generic_file_read_iter(iocb, to);
if (result > 0)
nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
}
+ nfs_end_io_read(inode);
return result;
}
EXPORT_SYMBOL_GPL(nfs_file_read);
@@ -191,12 +193,14 @@ nfs_file_splice_read(struct file *filp, loff_t *ppos,
dprintk("NFS: splice_read(%pD2, %lu@%Lu)\n",
filp, (unsigned long) count, (unsigned long long) *ppos);
- res = nfs_revalidate_mapping_protected(inode, filp->f_mapping);
+ nfs_start_io_read(inode);
+ res = nfs_revalidate_mapping(inode, filp->f_mapping);
if (!res) {
res = generic_file_splice_read(filp, ppos, pipe, count, flags);
if (res > 0)
nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, res);
}
+ nfs_end_io_read(inode);
return res;
}
EXPORT_SYMBOL_GPL(nfs_file_splice_read);
@@ -272,16 +276,13 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
trace_nfs_fsync_enter(inode);
- inode_dio_wait(inode);
do {
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret != 0)
break;
- inode_lock(inode);
ret = nfs_file_fsync_commit(file, start, end, datasync);
if (!ret)
ret = pnfs_sync_inode(inode, !!datasync);
- inode_unlock(inode);
/*
* If nfs_file_fsync_commit detected a server reboot, then
* resend all dirty pages that might have been covered by
@@ -359,19 +360,6 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
file, mapping->host->i_ino, len, (long long) pos);
start:
- /*
- * Prevent starvation issues if someone is doing a consistency
- * sync-to-disk
- */
- ret = wait_on_bit_action(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
- nfs_wait_bit_killable, TASK_KILLABLE);
- if (ret)
- return ret;
- /*
- * Wait for O_DIRECT to complete
- */
- inode_dio_wait(mapping->host);
-
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
@@ -432,7 +420,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping,
return status;
NFS_I(mapping->host)->write_io += copied;
- if (nfs_ctx_key_to_expire(ctx)) {
+ if (nfs_ctx_key_to_expire(ctx, mapping->host)) {
status = nfs_wb_all(mapping->host);
if (status < 0)
return status;
@@ -470,31 +458,8 @@ static void nfs_invalidate_page(struct page *page, unsigned int offset,
*/
static int nfs_release_page(struct page *page, gfp_t gfp)
{
- struct address_space *mapping = page->mapping;
-
dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
- /* Always try to initiate a 'commit' if relevant, but only
- * wait for it if the caller allows blocking. Even then,
- * only wait 1 second and only if the 'bdi' is not congested.
- * Waiting indefinitely can cause deadlocks when the NFS
- * server is on this machine, when a new TCP connection is
- * needed and in other rare cases. There is no particular
- * need to wait extensively here. A short wait has the
- * benefit that someone else can worry about the freezer.
- */
- if (mapping) {
- struct nfs_server *nfss = NFS_SERVER(mapping->host);
- nfs_commit_inode(mapping->host, 0);
- if (gfpflags_allow_blocking(gfp) &&
- !bdi_write_congested(&nfss->backing_dev_info)) {
- wait_on_page_bit_killable_timeout(page, PG_private,
- HZ);
- if (PagePrivate(page))
- set_bdi_congested(&nfss->backing_dev_info,
- BLK_RW_ASYNC);
- }
- }
/* If PagePrivate() is set, then the page is not freeable */
if (PagePrivate(page))
return 0;
@@ -604,6 +569,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
filp, filp->f_mapping->host->i_ino,
(long long)page_offset(page));
+ sb_start_pagefault(inode->i_sb);
+
/* make sure the cache has finished storing the page */
nfs_fscache_wait_on_page_write(NFS_I(inode), page);
@@ -630,6 +597,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
out_unlock:
unlock_page(page);
out:
+ sb_end_pagefault(inode->i_sb);
return ret;
}
@@ -645,7 +613,7 @@ static int nfs_need_check_write(struct file *filp, struct inode *inode)
ctx = nfs_file_open_context(filp);
if (test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags) ||
- nfs_ctx_key_to_expire(ctx))
+ nfs_ctx_key_to_expire(ctx, inode))
return 1;
return 0;
}
@@ -656,23 +624,17 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(file);
unsigned long written = 0;
ssize_t result;
- size_t count = iov_iter_count(from);
result = nfs_key_timeout_notify(file, inode);
if (result)
return result;
- if (iocb->ki_flags & IOCB_DIRECT) {
- result = generic_write_checks(iocb, from);
- if (result <= 0)
- return result;
+ if (iocb->ki_flags & IOCB_DIRECT)
return nfs_file_direct_write(iocb, from);
- }
dprintk("NFS: write(%pD2, %zu@%Ld)\n",
- file, count, (long long) iocb->ki_pos);
+ file, iov_iter_count(from), (long long) iocb->ki_pos);
- result = -EBUSY;
if (IS_SWAPFILE(inode))
goto out_swapfile;
/*
@@ -684,28 +646,33 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
goto out;
}
- result = count;
- if (!count)
+ nfs_start_io_write(inode);
+ result = generic_write_checks(iocb, from);
+ if (result > 0) {
+ current->backing_dev_info = inode_to_bdi(inode);
+ result = generic_perform_write(file, from, iocb->ki_pos);
+ current->backing_dev_info = NULL;
+ }
+ nfs_end_io_write(inode);
+ if (result <= 0)
goto out;
- result = generic_file_write_iter(iocb, from);
- if (result > 0)
- written = result;
+ written = generic_write_sync(iocb, result);
+ iocb->ki_pos += written;
/* Return error values */
- if (result >= 0 && nfs_need_check_write(file, inode)) {
+ if (nfs_need_check_write(file, inode)) {
int err = vfs_fsync(file, 0);
if (err < 0)
result = err;
}
- if (result > 0)
- nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
+ nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
out:
return result;
out_swapfile:
printk(KERN_INFO "NFS: attempt to write to active swap file!\n");
- goto out;
+ return -EBUSY;
}
EXPORT_SYMBOL_GPL(nfs_file_write);
@@ -780,11 +747,6 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
}
static int
-is_time_granular(struct timespec *ts) {
- return ((ts->tv_sec == 0) && (ts->tv_nsec <= 1000));
-}
-
-static int
do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
{
struct inode *inode = filp->f_mapping->host;
@@ -817,12 +779,8 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
* This makes locking act as a cache coherency point.
*/
nfs_sync_mapping(filp->f_mapping);
- if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) {
- if (is_time_granular(&NFS_SERVER(inode)->time_delta))
- __nfs_revalidate_inode(NFS_SERVER(inode), inode);
- else
- nfs_zap_caches(inode);
- }
+ if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
+ nfs_zap_mapping(inode, filp->f_mapping);
out:
return status;
}
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index aa59757389dc..a3fc48ba4931 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -255,13 +255,16 @@ static int filelayout_read_done_cb(struct rpc_task *task,
static void
filelayout_set_layoutcommit(struct nfs_pgio_header *hdr)
{
+ loff_t end_offs = 0;
if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds ||
- hdr->res.verf->committed != NFS_DATA_SYNC)
+ hdr->res.verf->committed == NFS_FILE_SYNC)
return;
+ if (hdr->res.verf->committed == NFS_DATA_SYNC)
+ end_offs = hdr->mds_offset + (loff_t)hdr->res.count;
- pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
- hdr->mds_offset + hdr->res.count);
+ /* Note: if the write is unstable, don't set end_offs until commit */
+ pnfs_set_layoutcommit(hdr->inode, hdr->lseg, end_offs);
dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
(unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
}
@@ -354,6 +357,12 @@ static int filelayout_write_done_cb(struct rpc_task *task,
}
filelayout_set_layoutcommit(hdr);
+
+ /* zero out the fattr */
+ hdr->fattr.valid = 0;
+ if (task->tk_status >= 0)
+ nfs_writeback_update_inode(hdr);
+
return 0;
}
@@ -375,8 +384,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
return -EAGAIN;
}
- if (data->verf.committed == NFS_UNSTABLE)
- pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
+ pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
return 0;
}
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 0e8018bc9880..e6206eaf2bdf 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1325,15 +1325,16 @@ ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg)
* we always send layoutcommit after DS writes.
*/
static void
-ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr)
+ff_layout_set_layoutcommit(struct inode *inode,
+ struct pnfs_layout_segment *lseg,
+ loff_t end_offset)
{
- if (!ff_layout_need_layoutcommit(hdr->lseg))
+ if (!ff_layout_need_layoutcommit(lseg))
return;
- pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
- hdr->mds_offset + hdr->res.count);
- dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
- (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
+ pnfs_set_layoutcommit(inode, lseg, end_offset);
+ dprintk("%s inode %lu pls_end_pos %llu\n", __func__, inode->i_ino,
+ (unsigned long long) NFS_I(inode)->layout->plh_lwb);
}
static bool
@@ -1469,6 +1470,7 @@ static void ff_layout_read_release(void *data)
static int ff_layout_write_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
+ loff_t end_offs = 0;
int err;
trace_nfs4_pnfs_write(hdr, task->tk_status);
@@ -1494,7 +1496,10 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
if (hdr->res.verf->committed == NFS_FILE_SYNC ||
hdr->res.verf->committed == NFS_DATA_SYNC)
- ff_layout_set_layoutcommit(hdr);
+ end_offs = hdr->mds_offset + (loff_t)hdr->res.count;
+
+ /* Note: if the write is unstable, don't set end_offs until commit */
+ ff_layout_set_layoutcommit(hdr->inode, hdr->lseg, end_offs);
/* zero out fattr since we don't care DS attr at all */
hdr->fattr.valid = 0;
@@ -1530,9 +1535,7 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
return -EAGAIN;
}
- if (data->verf.committed == NFS_UNSTABLE
- && ff_layout_need_layoutcommit(data->lseg))
- pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb);
+ ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb);
return 0;
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index dda689d7a8a7..bf4ec5ecc97e 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -662,9 +662,7 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
trace_nfs_getattr_enter(inode);
/* Flush out writes to the server in order to update c/mtime. */
if (S_ISREG(inode->i_mode)) {
- inode_lock(inode);
- err = nfs_sync_inode(inode);
- inode_unlock(inode);
+ err = filemap_write_and_wait(inode->i_mapping);
if (err)
goto out;
}
@@ -879,7 +877,10 @@ void nfs_inode_attach_open_context(struct nfs_open_context *ctx)
struct nfs_inode *nfsi = NFS_I(inode);
spin_lock(&inode->i_lock);
- list_add(&ctx->list, &nfsi->open_files);
+ if (ctx->mode & FMODE_WRITE)
+ list_add(&ctx->list, &nfsi->open_files);
+ else
+ list_add_tail(&ctx->list, &nfsi->open_files);
spin_unlock(&inode->i_lock);
}
EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context);
@@ -972,6 +973,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
if (NFS_STALE(inode))
goto out;
+ /* pNFS: Attributes aren't updated until we layoutcommit */
+ if (S_ISREG(inode->i_mode)) {
+ status = pnfs_sync_inode(inode, false);
+ if (status)
+ goto out;
+ }
+
status = -ENOMEM;
fattr = nfs_alloc_fattr();
if (fattr == NULL)
@@ -1122,14 +1130,12 @@ out:
}
/**
- * __nfs_revalidate_mapping - Revalidate the pagecache
+ * nfs_revalidate_mapping - Revalidate the pagecache
* @inode - pointer to host inode
* @mapping - pointer to mapping
- * @may_lock - take inode->i_mutex?
*/
-static int __nfs_revalidate_mapping(struct inode *inode,
- struct address_space *mapping,
- bool may_lock)
+int nfs_revalidate_mapping(struct inode *inode,
+ struct address_space *mapping)
{
struct nfs_inode *nfsi = NFS_I(inode);
unsigned long *bitlock = &nfsi->flags;
@@ -1178,12 +1184,7 @@ static int __nfs_revalidate_mapping(struct inode *inode,
nfsi->cache_validity &= ~NFS_INO_INVALID_DATA;
spin_unlock(&inode->i_lock);
trace_nfs_invalidate_mapping_enter(inode);
- if (may_lock) {
- inode_lock(inode);
- ret = nfs_invalidate_mapping(inode, mapping);
- inode_unlock(inode);
- } else
- ret = nfs_invalidate_mapping(inode, mapping);
+ ret = nfs_invalidate_mapping(inode, mapping);
trace_nfs_invalidate_mapping_exit(inode, ret);
clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
@@ -1193,27 +1194,28 @@ out:
return ret;
}
-/**
- * nfs_revalidate_mapping - Revalidate the pagecache
- * @inode - pointer to host inode
- * @mapping - pointer to mapping
- */
-int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
+static bool nfs_file_has_writers(struct nfs_inode *nfsi)
{
- return __nfs_revalidate_mapping(inode, mapping, false);
+ struct inode *inode = &nfsi->vfs_inode;
+
+ assert_spin_locked(&inode->i_lock);
+
+ if (!S_ISREG(inode->i_mode))
+ return false;
+ if (list_empty(&nfsi->open_files))
+ return false;
+ /* Note: This relies on nfsi->open_files being ordered with writers
+ * being placed at the head of the list.
+ * See nfs_inode_attach_open_context()
+ */
+ return (list_first_entry(&nfsi->open_files,
+ struct nfs_open_context,
+ list)->mode & FMODE_WRITE) == FMODE_WRITE;
}
-/**
- * nfs_revalidate_mapping_protected - Revalidate the pagecache
- * @inode - pointer to host inode
- * @mapping - pointer to mapping
- *
- * Differs from nfs_revalidate_mapping() in that it grabs the inode->i_mutex
- * while invalidating the mapping.
- */
-int nfs_revalidate_mapping_protected(struct inode *inode, struct address_space *mapping)
+static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi)
{
- return __nfs_revalidate_mapping(inode, mapping, true);
+ return nfs_file_has_writers(nfsi) && nfs_file_io_is_buffered(nfsi);
}
static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
@@ -1280,22 +1282,24 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
return -EIO;
- if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
- inode->i_version != fattr->change_attr)
- invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+ if (!nfs_file_has_buffered_writers(nfsi)) {
+ /* Verify a few of the more important attributes */
+ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && inode->i_version != fattr->change_attr)
+ invalid |= NFS_INO_INVALID_ATTR | NFS_INO_REVAL_PAGECACHE;
- /* Verify a few of the more important attributes */
- if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime))
- invalid |= NFS_INO_INVALID_ATTR;
+ if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime))
+ invalid |= NFS_INO_INVALID_ATTR;
- if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
- cur_size = i_size_read(inode);
- new_isize = nfs_size_to_loff_t(fattr->size);
- if (cur_size != new_isize)
- invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+ if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec_equal(&inode->i_ctime, &fattr->ctime))
+ invalid |= NFS_INO_INVALID_ATTR;
+
+ if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
+ cur_size = i_size_read(inode);
+ new_isize = nfs_size_to_loff_t(fattr->size);
+ if (cur_size != new_isize)
+ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+ }
}
- if (nfsi->nrequests != 0)
- invalid &= ~NFS_INO_REVAL_PAGECACHE;
/* Have any file permissions changed? */
if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))
@@ -1470,28 +1474,12 @@ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct n
((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0);
}
-/*
- * Don't trust the change_attribute, mtime, ctime or size if
- * a pnfs LAYOUTCOMMIT is outstanding
- */
-static void nfs_inode_attrs_handle_layoutcommit(struct inode *inode,
- struct nfs_fattr *fattr)
-{
- if (pnfs_layoutcommit_outstanding(inode))
- fattr->valid &= ~(NFS_ATTR_FATTR_CHANGE |
- NFS_ATTR_FATTR_MTIME |
- NFS_ATTR_FATTR_CTIME |
- NFS_ATTR_FATTR_SIZE);
-}
-
static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
{
int ret;
trace_nfs_refresh_inode_enter(inode);
- nfs_inode_attrs_handle_layoutcommit(inode, fattr);
-
if (nfs_inode_attrs_need_update(inode, fattr))
ret = nfs_update_inode(inode, fattr);
else
@@ -1527,7 +1515,7 @@ EXPORT_SYMBOL_GPL(nfs_refresh_inode);
static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr)
{
- unsigned long invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+ unsigned long invalid = NFS_INO_INVALID_ATTR;
/*
* Don't revalidate the pagecache if we hold a delegation, but do
@@ -1676,6 +1664,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
unsigned long invalid = 0;
unsigned long now = jiffies;
unsigned long save_cache_validity;
+ bool have_writers = nfs_file_has_buffered_writers(nfsi);
bool cache_revalidated = true;
dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n",
@@ -1725,17 +1714,25 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
/* Do atomic weak cache consistency updates */
invalid |= nfs_wcc_update_inode(inode, fattr);
+ if (pnfs_layoutcommit_outstanding(inode)) {
+ nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATTR;
+ cache_revalidated = false;
+ }
+
/* More cache consistency checks */
if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
if (inode->i_version != fattr->change_attr) {
dprintk("NFS: change_attr change on server for file %s/%ld\n",
inode->i_sb->s_id, inode->i_ino);
- invalid |= NFS_INO_INVALID_ATTR
- | NFS_INO_INVALID_DATA
- | NFS_INO_INVALID_ACCESS
- | NFS_INO_INVALID_ACL;
- if (S_ISDIR(inode->i_mode))
- nfs_force_lookup_revalidate(inode);
+ /* Could it be a race with writeback? */
+ if (!have_writers) {
+ invalid |= NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_DATA
+ | NFS_INO_INVALID_ACCESS
+ | NFS_INO_INVALID_ACL;
+ if (S_ISDIR(inode->i_mode))
+ nfs_force_lookup_revalidate(inode);
+ }
inode->i_version = fattr->change_attr;
}
} else {
@@ -1768,9 +1765,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
if (new_isize != cur_isize) {
/* Do we perhaps have any outstanding writes, or has
* the file grown beyond our last write? */
- if ((nfsi->nrequests == 0) || new_isize > cur_isize) {
+ if (nfsi->nrequests == 0 || new_isize > cur_isize) {
i_size_write(inode, new_isize);
- invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+ if (!have_writers)
+ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
}
dprintk("NFS: isize change on server for file %s/%ld "
"(%Ld to %Ld)\n",
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 5154fa65a2f2..7ce5e023c3c3 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -66,13 +66,16 @@ struct nfs_clone_mount {
struct nfs_client_initdata {
unsigned long init_flags;
- const char *hostname;
- const struct sockaddr *addr;
+ const char *hostname; /* Hostname of the server */
+ const struct sockaddr *addr; /* Address of the server */
+ const char *nodename; /* Hostname of the client */
+ const char *ip_addr; /* IP address of the client */
size_t addrlen;
struct nfs_subversion *nfs_mod;
int proto;
u32 minorversion;
struct net *net;
+ const struct rpc_timeout *timeparms;
};
/*
@@ -147,9 +150,8 @@ extern void nfs_umount(const struct nfs_mount_request *info);
extern const struct rpc_program nfs_program;
extern void nfs_clients_init(struct net *net);
extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *);
-int nfs_create_rpc_client(struct nfs_client *, const struct rpc_timeout *, rpc_authflavor_t);
+int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t);
struct nfs_client *nfs_get_client(const struct nfs_client_initdata *,
- const struct rpc_timeout *, const char *,
rpc_authflavor_t);
int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *);
void nfs_server_insert_lists(struct nfs_server *);
@@ -184,7 +186,7 @@ extern struct nfs_server *nfs_clone_server(struct nfs_server *,
rpc_authflavor_t);
extern int nfs_wait_client_init_complete(const struct nfs_client *clp);
extern void nfs_mark_client_ready(struct nfs_client *clp, int state);
-extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
+extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
const struct sockaddr *ds_addr,
int ds_addrlen, int ds_proto,
unsigned int ds_timeo,
@@ -193,7 +195,7 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
rpc_authflavor_t au_flavor);
extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
struct inode *);
-extern struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
+extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
const struct sockaddr *ds_addr, int ds_addrlen,
int ds_proto, unsigned int ds_timeo,
unsigned int ds_retrans, rpc_authflavor_t au_flavor);
@@ -338,8 +340,7 @@ nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src)
/* proc.c */
void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
- const struct rpc_timeout *timeparms,
- const char *ip_addr);
+ const struct nfs_client_initdata *);
/* dir.c */
extern void nfs_force_use_readdirplus(struct inode *dir);
@@ -411,6 +412,19 @@ extern void __exit unregister_nfs_fs(void);
extern bool nfs_sb_active(struct super_block *sb);
extern void nfs_sb_deactive(struct super_block *sb);
+/* io.c */
+extern void nfs_start_io_read(struct inode *inode);
+extern void nfs_end_io_read(struct inode *inode);
+extern void nfs_start_io_write(struct inode *inode);
+extern void nfs_end_io_write(struct inode *inode);
+extern void nfs_start_io_direct(struct inode *inode);
+extern void nfs_end_io_direct(struct inode *inode);
+
+static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi)
+{
+ return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0;
+}
+
/* namespace.c */
#define NFS_PATH_CANONICAL 1
extern char *nfs_path(char **p, struct dentry *dentry,
@@ -496,9 +510,29 @@ void nfs_init_cinfo(struct nfs_commit_info *cinfo,
struct inode *inode,
struct nfs_direct_req *dreq);
int nfs_key_timeout_notify(struct file *filp, struct inode *inode);
-bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx);
+bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode);
void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio);
+int nfs_filemap_write_and_wait_range(struct address_space *mapping,
+ loff_t lstart, loff_t lend);
+
+#ifdef CONFIG_NFS_V4_1
+static inline
+void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
+{
+ int i;
+
+ for (i = 0; i < cinfo->nbuckets; i++)
+ cinfo->buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
+}
+#else
+static inline
+void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
+{
+}
+#endif
+
+
#ifdef CONFIG_MIGRATION
extern int nfs_migrate_page(struct address_space *,
struct page *, struct page *, enum migrate_mode);
@@ -506,6 +540,13 @@ extern int nfs_migrate_page(struct address_space *,
#define nfs_migrate_page NULL
#endif
+static inline int
+nfs_write_verifier_cmp(const struct nfs_write_verifier *v1,
+ const struct nfs_write_verifier *v2)
+{
+ return memcmp(v1->data, v2->data, sizeof(v1->data));
+}
+
/* unlink.c */
extern struct rpc_task *
nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
@@ -521,8 +562,7 @@ extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
/* nfs4proc.c */
extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
- const struct rpc_timeout *timeparms,
- const char *ip_addr);
+ const struct nfs_client_initdata *);
extern int nfs40_walk_client_list(struct nfs_client *clp,
struct nfs_client **result,
struct rpc_cred *cred);
@@ -623,7 +663,7 @@ void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo)
if (!cinfo->dreq) {
struct inode *inode = page_file_mapping(page)->host;
- inc_zone_page_state(page, NR_UNSTABLE_NFS);
+ inc_node_page_state(page, NR_UNSTABLE_NFS);
inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
}
diff --git a/fs/nfs/io.c b/fs/nfs/io.c
new file mode 100644
index 000000000000..1fc5d1ce327e
--- /dev/null
+++ b/fs/nfs/io.c
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2016 Trond Myklebust
+ *
+ * I/O and data path helper functionality.
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/rwsem.h>
+#include <linux/fs.h>
+#include <linux/nfs_fs.h>
+
+#include "internal.h"
+
+/* Call with exclusively locked inode->i_rwsem */
+static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode)
+{
+ if (test_bit(NFS_INO_ODIRECT, &nfsi->flags)) {
+ clear_bit(NFS_INO_ODIRECT, &nfsi->flags);
+ inode_dio_wait(inode);
+ }
+}
+
+/**
+ * nfs_start_io_read - declare the file is being used for buffered reads
+ * @inode - file inode
+ *
+ * Declare that a buffered read operation is about to start, and ensure
+ * that we block all direct I/O.
+ * On exit, the function ensures that the NFS_INO_ODIRECT flag is unset,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that buffered read operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas direct I/O
+ * operations need to wait to grab an exclusive lock in order to set
+ * NFS_INO_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
+ */
+void
+nfs_start_io_read(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ /* Be an optimist! */
+ down_read(&inode->i_rwsem);
+ if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0)
+ return;
+ up_read(&inode->i_rwsem);
+ /* Slow path.... */
+ down_write(&inode->i_rwsem);
+ nfs_block_o_direct(nfsi, inode);
+ downgrade_write(&inode->i_rwsem);
+}
+
+/**
+ * nfs_end_io_read - declare that the buffered read operation is done
+ * @inode - file inode
+ *
+ * Declare that a buffered read operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void
+nfs_end_io_read(struct inode *inode)
+{
+ up_read(&inode->i_rwsem);
+}
+
+/**
+ * nfs_start_io_write - declare the file is being used for buffered writes
+ * @inode - file inode
+ *
+ * Declare that a buffered read operation is about to start, and ensure
+ * that we block all direct I/O.
+ */
+void
+nfs_start_io_write(struct inode *inode)
+{
+ down_write(&inode->i_rwsem);
+ nfs_block_o_direct(NFS_I(inode), inode);
+}
+
+/**
+ * nfs_end_io_write - declare that the buffered write operation is done
+ * @inode - file inode
+ *
+ * Declare that a buffered write operation is done, and release the
+ * lock on inode->i_rwsem.
+ */
+void
+nfs_end_io_write(struct inode *inode)
+{
+ up_write(&inode->i_rwsem);
+}
+
+/* Call with exclusively locked inode->i_rwsem */
+static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode)
+{
+ if (!test_bit(NFS_INO_ODIRECT, &nfsi->flags)) {
+ set_bit(NFS_INO_ODIRECT, &nfsi->flags);
+ nfs_wb_all(inode);
+ }
+}
+
+/**
+ * nfs_end_io_direct - declare the file is being used for direct i/o
+ * @inode - file inode
+ *
+ * Declare that a direct I/O operation is about to start, and ensure
+ * that we block all buffered I/O.
+ * On exit, the function ensures that the NFS_INO_ODIRECT flag is set,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that direct I/O operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas buffered I/O
+ * operations need to wait to grab an exclusive lock in order to clear
+ * NFS_INO_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
+ */
+void
+nfs_start_io_direct(struct inode *inode)
+{
+ struct nfs_inode *nfsi = NFS_I(inode);
+ /* Be an optimist! */
+ down_read(&inode->i_rwsem);
+ if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) != 0)
+ return;
+ up_read(&inode->i_rwsem);
+ /* Slow path.... */
+ down_write(&inode->i_rwsem);
+ nfs_block_buffered(nfsi, inode);
+ downgrade_write(&inode->i_rwsem);
+}
+
+/**
+ * nfs_end_io_direct - declare that the direct i/o operation is done
+ * @inode - file inode
+ *
+ * Declare that a direct I/O operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void
+nfs_end_io_direct(struct inode *inode)
+{
+ up_read(&inode->i_rwsem);
+}
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index 9e9fa347a948..ee753547fb0a 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -76,19 +76,23 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source,
* low timeout interval so that if a connection is lost, we retry through
* the MDS.
*/
-struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
+struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
const struct sockaddr *ds_addr, int ds_addrlen,
int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
rpc_authflavor_t au_flavor)
{
+ struct rpc_timeout ds_timeout;
+ struct nfs_client *mds_clp = mds_srv->nfs_client;
struct nfs_client_initdata cl_init = {
.addr = ds_addr,
.addrlen = ds_addrlen,
+ .nodename = mds_clp->cl_rpcclient->cl_nodename,
+ .ip_addr = mds_clp->cl_ipaddr,
.nfs_mod = &nfs_v3,
.proto = ds_proto,
.net = mds_clp->cl_net,
+ .timeparms = &ds_timeout,
};
- struct rpc_timeout ds_timeout;
struct nfs_client *clp;
char buf[INET6_ADDRSTRLEN + 1];
@@ -97,10 +101,12 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
return ERR_PTR(-EINVAL);
cl_init.hostname = buf;
+ if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
+ set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+
/* Use the MDS nfs_client cl_ipaddr. */
nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
- clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
- au_flavor);
+ clp = nfs_get_client(&cl_init, au_flavor);
return clp;
}
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index aa03ed09ba06..33da841a21bb 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -113,15 +113,17 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE))
return -EOPNOTSUPP;
- nfs_wb_all(inode);
inode_lock(inode);
+ err = nfs_sync_inode(inode);
+ if (err)
+ goto out_unlock;
err = nfs42_proc_fallocate(&msg, filep, offset, len);
if (err == 0)
truncate_pagecache_range(inode, offset, (offset + len) -1);
if (err == -EOPNOTSUPP)
NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE;
-
+out_unlock:
inode_unlock(inode);
return err;
}
@@ -154,11 +156,20 @@ static ssize_t _nfs42_proc_copy(struct file *src, loff_t pos_src,
if (status)
return status;
+ status = nfs_filemap_write_and_wait_range(file_inode(src)->i_mapping,
+ pos_src, pos_src + (loff_t)count - 1);
+ if (status)
+ return status;
+
status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context,
dst_lock, FMODE_WRITE);
if (status)
return status;
+ status = nfs_sync_inode(dst_inode);
+ if (status)
+ return status;
+
status = nfs4_call_sync(server->client, server, &msg,
&args.seq_args, &res.seq_res, 0);
if (status == -ENOTSUPP)
@@ -258,7 +269,11 @@ static loff_t _nfs42_proc_llseek(struct file *filep,
if (status)
return status;
- nfs_wb_all(inode);
+ status = nfs_filemap_write_and_wait_range(inode->i_mapping,
+ offset, LLONG_MAX);
+ if (status)
+ return status;
+
status = nfs4_call_sync(server->client, server, &msg,
&args.seq_args, &res.seq_res, 0);
if (status == -ENOTSUPP)
@@ -336,8 +351,7 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
* Mark the bad layout state as invalid, then retry
* with the current stateid.
*/
- set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
- pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0);
+ pnfs_mark_layout_stateid_invalid(lo, &head);
spin_unlock(&inode->i_lock);
pnfs_free_lseg_list(&head);
} else
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 6dc6f2aea0d6..8b2605882a20 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -330,13 +330,21 @@ static int decode_write_response(struct xdr_stream *xdr,
struct nfs42_write_res *res)
{
__be32 *p;
- int stateids;
p = xdr_inline_decode(xdr, 4 + 8 + 4);
if (unlikely(!p))
goto out_overflow;
- stateids = be32_to_cpup(p++);
+ /*
+ * We never use asynchronous mode, so warn if a server returns
+ * a stateid.
+ */
+ if (unlikely(*p != 0)) {
+ pr_err_once("%s: server has set unrequested "
+ "asynchronous mode\n", __func__);
+ return -EREMOTEIO;
+ }
+ p++;
p = xdr_decode_hyper(p, &res->count);
res->verifier.committed = be32_to_cpup(p);
return decode_verifier(xdr, &res->verifier.verifier);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 768456fa1b17..4be567a54958 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -185,6 +185,7 @@ struct nfs4_state {
struct nfs4_exception {
struct nfs4_state *state;
struct inode *inode;
+ nfs4_stateid *stateid;
long timeout;
unsigned char delay : 1,
recovering : 1,
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 10410e8b5853..8d7d08d4f95f 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -349,10 +349,10 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp)
* Returns pointer to an NFS client, or an ERR_PTR value.
*/
struct nfs_client *nfs4_init_client(struct nfs_client *clp,
- const struct rpc_timeout *timeparms,
- const char *ip_addr)
+ const struct nfs_client_initdata *cl_init)
{
char buf[INET6_ADDRSTRLEN + 1];
+ const char *ip_addr = cl_init->ip_addr;
struct nfs_client *old;
int error;
@@ -370,9 +370,9 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp,
__set_bit(NFS_CS_DISCRTRY, &clp->cl_flags);
__set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags);
- error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_GSS_KRB5I);
+ error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_GSS_KRB5I);
if (error == -EINVAL)
- error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX);
+ error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX);
if (error < 0)
goto error;
@@ -793,10 +793,12 @@ static int nfs4_set_client(struct nfs_server *server,
.hostname = hostname,
.addr = addr,
.addrlen = addrlen,
+ .ip_addr = ip_addr,
.nfs_mod = &nfs_v4,
.proto = proto,
.minorversion = minorversion,
.net = net,
+ .timeparms = timeparms,
};
struct nfs_client *clp;
int error;
@@ -809,7 +811,7 @@ static int nfs4_set_client(struct nfs_server *server,
set_bit(NFS_CS_MIGRATION, &cl_init.init_flags);
/* Allocate or find a client reference we can use */
- clp = nfs_get_client(&cl_init, timeparms, ip_addr, authflavour);
+ clp = nfs_get_client(&cl_init, authflavour);
if (IS_ERR(clp)) {
error = PTR_ERR(clp);
goto error;
@@ -842,20 +844,24 @@ error:
* low timeout interval so that if a connection is lost, we retry through
* the MDS.
*/
-struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
+struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
const struct sockaddr *ds_addr, int ds_addrlen,
int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
u32 minor_version, rpc_authflavor_t au_flavor)
{
+ struct rpc_timeout ds_timeout;
+ struct nfs_client *mds_clp = mds_srv->nfs_client;
struct nfs_client_initdata cl_init = {
.addr = ds_addr,
.addrlen = ds_addrlen,
+ .nodename = mds_clp->cl_rpcclient->cl_nodename,
+ .ip_addr = mds_clp->cl_ipaddr,
.nfs_mod = &nfs_v4,
.proto = ds_proto,
.minorversion = minor_version,
.net = mds_clp->cl_net,
+ .timeparms = &ds_timeout,
};
- struct rpc_timeout ds_timeout;
struct nfs_client *clp;
char buf[INET6_ADDRSTRLEN + 1];
@@ -863,14 +869,16 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
return ERR_PTR(-EINVAL);
cl_init.hostname = buf;
+ if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
+ __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+
/*
* Set an authflavor equual to the MDS value. Use the MDS nfs_client
* cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS
* (section 13.1 RFC 5661).
*/
nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
- clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
- au_flavor);
+ clp = nfs_get_client(&cl_init, au_flavor);
dprintk("<-- %s %p\n", __func__, clp);
return clp;
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 014b0e41ace5..d085ad794884 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -66,7 +66,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
if (openflags & O_TRUNC) {
attr.ia_valid |= ATTR_SIZE;
attr.ia_size = 0;
- nfs_sync_inode(inode);
+ filemap_write_and_wait(inode->i_mapping);
}
inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, NULL);
@@ -133,21 +133,9 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
size_t count, unsigned int flags)
{
- struct inode *in_inode = file_inode(file_in);
- struct inode *out_inode = file_inode(file_out);
- int ret;
-
- if (in_inode == out_inode)
+ if (file_inode(file_in) == file_inode(file_out))
return -EINVAL;
- /* flush any pending writes */
- ret = nfs_sync_inode(in_inode);
- if (ret)
- return ret;
- ret = nfs_sync_inode(out_inode);
- if (ret)
- return ret;
-
return nfs42_proc_copy(file_in, pos_in, file_out, pos_out, count);
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ff416d0e24bc..da5c9e58e907 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -363,6 +363,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
{
struct nfs_client *clp = server->nfs_client;
struct nfs4_state *state = exception->state;
+ const nfs4_stateid *stateid = exception->stateid;
struct inode *inode = exception->inode;
int ret = errorcode;
@@ -376,9 +377,18 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
- if (inode && nfs_async_inode_return_delegation(inode,
- NULL) == 0)
- goto wait_on_recovery;
+ if (inode) {
+ int err;
+
+ err = nfs_async_inode_return_delegation(inode,
+ stateid);
+ if (err == 0)
+ goto wait_on_recovery;
+ if (stateid != NULL && stateid->type == NFS4_DELEGATION_STATEID_TYPE) {
+ exception->retry = 1;
+ break;
+ }
+ }
if (state == NULL)
break;
ret = nfs4_schedule_stateid_recovery(server, state);
@@ -427,6 +437,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
case -NFS4ERR_DELAY:
nfs_inc_server_stats(server, NFSIOS_DELAY);
case -NFS4ERR_GRACE:
+ case -NFS4ERR_LAYOUTTRYLATER:
case -NFS4ERR_RECALLCONFLICT:
exception->delay = 1;
return 0;
@@ -2669,28 +2680,17 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
return res;
}
-static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
- struct nfs_fattr *fattr, struct iattr *sattr,
- struct nfs4_state *state, struct nfs4_label *ilabel,
- struct nfs4_label *olabel)
+static int _nfs4_do_setattr(struct inode *inode,
+ struct nfs_setattrargs *arg,
+ struct nfs_setattrres *res,
+ struct rpc_cred *cred,
+ struct nfs4_state *state)
{
struct nfs_server *server = NFS_SERVER(inode);
- struct nfs_setattrargs arg = {
- .fh = NFS_FH(inode),
- .iap = sattr,
- .server = server,
- .bitmask = server->attr_bitmask,
- .label = ilabel,
- };
- struct nfs_setattrres res = {
- .fattr = fattr,
- .label = olabel,
- .server = server,
- };
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETATTR],
- .rpc_argp = &arg,
- .rpc_resp = &res,
+ .rpc_argp = arg,
+ .rpc_resp = res,
.rpc_cred = cred,
};
struct rpc_cred *delegation_cred = NULL;
@@ -2699,17 +2699,13 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
bool truncate;
int status;
- arg.bitmask = nfs4_bitmask(server, ilabel);
- if (ilabel)
- arg.bitmask = nfs4_bitmask(server, olabel);
-
- nfs_fattr_init(fattr);
+ nfs_fattr_init(res->fattr);
/* Servers should only apply open mode checks for file size changes */
- truncate = (sattr->ia_valid & ATTR_SIZE) ? true : false;
+ truncate = (arg->iap->ia_valid & ATTR_SIZE) ? true : false;
fmode = truncate ? FMODE_WRITE : FMODE_READ;
- if (nfs4_copy_delegation_stateid(inode, fmode, &arg.stateid, &delegation_cred)) {
+ if (nfs4_copy_delegation_stateid(inode, fmode, &arg->stateid, &delegation_cred)) {
/* Use that stateid */
} else if (truncate && state != NULL) {
struct nfs_lockowner lockowner = {
@@ -2719,19 +2715,19 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
if (!nfs4_valid_open_stateid(state))
return -EBADF;
if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner,
- &arg.stateid, &delegation_cred) == -EIO)
+ &arg->stateid, &delegation_cred) == -EIO)
return -EBADF;
} else
- nfs4_stateid_copy(&arg.stateid, &zero_stateid);
+ nfs4_stateid_copy(&arg->stateid, &zero_stateid);
if (delegation_cred)
msg.rpc_cred = delegation_cred;
- status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
+ status = nfs4_call_sync(server->client, server, &msg, &arg->seq_args, &res->seq_res, 1);
put_rpccred(delegation_cred);
if (status == 0 && state != NULL)
renew_lease(server, timestamp);
- trace_nfs4_setattr(inode, &arg.stateid, status);
+ trace_nfs4_setattr(inode, &arg->stateid, status);
return status;
}
@@ -2741,13 +2737,31 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
struct nfs4_label *olabel)
{
struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_setattrargs arg = {
+ .fh = NFS_FH(inode),
+ .iap = sattr,
+ .server = server,
+ .bitmask = server->attr_bitmask,
+ .label = ilabel,
+ };
+ struct nfs_setattrres res = {
+ .fattr = fattr,
+ .label = olabel,
+ .server = server,
+ };
struct nfs4_exception exception = {
.state = state,
.inode = inode,
+ .stateid = &arg.stateid,
};
int err;
+
+ arg.bitmask = nfs4_bitmask(server, ilabel);
+ if (ilabel)
+ arg.bitmask = nfs4_bitmask(server, olabel);
+
do {
- err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel);
+ err = _nfs4_do_setattr(inode, &arg, &res, cred, state);
switch (err) {
case -NFS4ERR_OPENMODE:
if (!(sattr->ia_valid & ATTR_SIZE)) {
@@ -3267,13 +3281,6 @@ static int nfs4_find_root_sec(struct nfs_server *server, struct nfs_fh *fhandle,
return status;
}
-static int nfs4_do_find_root_sec(struct nfs_server *server,
- struct nfs_fh *fhandle, struct nfs_fsinfo *info)
-{
- int mv = server->nfs_client->cl_minorversion;
- return nfs_v4_minor_ops[mv]->find_root_sec(server, fhandle, info);
-}
-
/**
* nfs4_proc_get_rootfh - get file handle for server's pseudoroot
* @server: initialized nfs_server handle
@@ -3293,7 +3300,8 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle,
status = nfs4_lookup_root(server, fhandle, info);
if (auth_probe || status == NFS4ERR_WRONGSEC)
- status = nfs4_do_find_root_sec(server, fhandle, info);
+ status = server->nfs_client->cl_mvops->find_root_sec(server,
+ fhandle, info);
if (status == 0)
status = nfs4_server_capabilities(server, fhandle);
@@ -4392,7 +4400,8 @@ static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr,
struct rpc_message *msg)
{
hdr->timestamp = jiffies;
- hdr->pgio_done_cb = nfs4_read_done_cb;
+ if (!hdr->pgio_done_cb)
+ hdr->pgio_done_cb = nfs4_read_done_cb;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ];
nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0);
}
@@ -7869,11 +7878,13 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
struct inode *inode = lgp->args.inode;
struct nfs_server *server = NFS_SERVER(inode);
struct pnfs_layout_hdr *lo;
- int status = task->tk_status;
+ int nfs4err = task->tk_status;
+ int err, status = 0;
+ LIST_HEAD(head);
dprintk("--> %s tk_status => %d\n", __func__, -task->tk_status);
- switch (status) {
+ switch (nfs4err) {
case 0:
goto out;
@@ -7905,45 +7916,42 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
status = -EOVERFLOW;
goto out;
}
- /* Fallthrough */
+ status = -EBUSY;
+ break;
case -NFS4ERR_RECALLCONFLICT:
- nfs4_handle_exception(server, -NFS4ERR_RECALLCONFLICT,
- exception);
status = -ERECALLCONFLICT;
- goto out;
+ break;
case -NFS4ERR_EXPIRED:
case -NFS4ERR_BAD_STATEID:
exception->timeout = 0;
spin_lock(&inode->i_lock);
- if (nfs4_stateid_match(&lgp->args.stateid,
+ lo = NFS_I(inode)->layout;
+ /* If the open stateid was bad, then recover it. */
+ if (!lo || test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) ||
+ nfs4_stateid_match_other(&lgp->args.stateid,
&lgp->args.ctx->state->stateid)) {
spin_unlock(&inode->i_lock);
- /* If the open stateid was bad, then recover it. */
exception->state = lgp->args.ctx->state;
break;
}
- lo = NFS_I(inode)->layout;
- if (lo && !test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) &&
- nfs4_stateid_match_other(&lgp->args.stateid, &lo->plh_stateid)) {
- LIST_HEAD(head);
-
- /*
- * Mark the bad layout state as invalid, then retry
- * with the current stateid.
- */
- set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
- pnfs_mark_matching_lsegs_invalid(lo, &head, NULL, 0);
- spin_unlock(&inode->i_lock);
- pnfs_free_lseg_list(&head);
- status = -EAGAIN;
- goto out;
- } else
- spin_unlock(&inode->i_lock);
- }
- status = nfs4_handle_exception(server, status, exception);
- if (exception->retry)
+ /*
+ * Mark the bad layout state as invalid, then retry
+ */
+ pnfs_mark_layout_stateid_invalid(lo, &head);
+ spin_unlock(&inode->i_lock);
+ pnfs_free_lseg_list(&head);
status = -EAGAIN;
+ goto out;
+ }
+
+ err = nfs4_handle_exception(server, nfs4err, exception);
+ if (!status) {
+ if (exception->retry)
+ status = -EAGAIN;
+ else
+ status = err;
+ }
out:
dprintk("<-- %s\n", __func__);
return status;
@@ -8129,8 +8137,7 @@ static void nfs4_layoutreturn_release(void *calldata)
spin_lock(&lo->plh_inode->i_lock);
pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range,
be32_to_cpu(lrp->args.stateid.seqid));
- pnfs_mark_layout_returned_if_empty(lo);
- if (lrp->res.lrs_present)
+ if (lrp->res.lrs_present && pnfs_layout_is_valid(lo))
pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
pnfs_clear_layoutreturn_waitbit(lo);
spin_unlock(&lo->plh_inode->i_lock);
@@ -8835,7 +8842,7 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
#endif
};
-ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
+static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
{
ssize_t error, error2;
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 9c150b153782..cfb8f7ce5cf6 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -1235,8 +1235,8 @@ DECLARE_EVENT_CLASS(nfs4_idmap_event,
len = 0;
__entry->error = error < 0 ? error : 0;
__entry->id = id;
- memcpy(__get_dynamic_array(name), name, len);
- ((char *)__get_dynamic_array(name))[len] = 0;
+ memcpy(__get_str(name), name, len);
+ __get_str(name)[len] = 0;
),
TP_printk(
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 661e753fe1c9..7bd3a5c09d31 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1985,9 +1985,14 @@ encode_layoutcommit(struct xdr_stream *xdr,
p = xdr_encode_hyper(p, args->lastbytewritten + 1); /* length */
*p = cpu_to_be32(0); /* reclaim */
encode_nfs4_stateid(xdr, &args->stateid);
- p = reserve_space(xdr, 20);
- *p++ = cpu_to_be32(1); /* newoffset = TRUE */
- p = xdr_encode_hyper(p, args->lastbytewritten);
+ if (args->lastbytewritten != U64_MAX) {
+ p = reserve_space(xdr, 20);
+ *p++ = cpu_to_be32(1); /* newoffset = TRUE */
+ p = xdr_encode_hyper(p, args->lastbytewritten);
+ } else {
+ p = reserve_space(xdr, 12);
+ *p++ = cpu_to_be32(0); /* newoffset = FALSE */
+ }
*p++ = cpu_to_be32(0); /* Never send time_modify_changed */
*p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 0b9e5cc9a747..2ca9167bc97d 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -37,7 +37,6 @@
{ 1 << NFS_INO_ADVISE_RDPLUS, "ADVISE_RDPLUS" }, \
{ 1 << NFS_INO_STALE, "STALE" }, \
{ 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \
- { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \
{ 1 << NFS_INO_FSCACHE, "FSCACHE" }, \
{ 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \
{ 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" })
@@ -707,9 +706,9 @@ TRACE_EVENT(nfs_sillyrename_unlink,
__entry->dev = dir->i_sb->s_dev;
__entry->dir = NFS_FILEID(dir);
__entry->error = error;
- memcpy(__get_dynamic_array(name),
+ memcpy(__get_str(name),
data->args.name.name, len);
- ((char *)__get_dynamic_array(name))[len] = 0;
+ __get_str(name)[len] = 0;
),
TP_printk(
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 0fbe734cc38c..70806cae0d36 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -259,7 +259,7 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
* is required.
* Note that caller must hold inode->i_lock.
*/
-static int
+int
pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
struct list_head *lseg_list)
{
@@ -334,14 +334,17 @@ pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode)
}
static void
-init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg)
+pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg,
+ const struct pnfs_layout_range *range,
+ const nfs4_stateid *stateid)
{
INIT_LIST_HEAD(&lseg->pls_list);
INIT_LIST_HEAD(&lseg->pls_lc_list);
atomic_set(&lseg->pls_refcount, 1);
- smp_mb();
set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
lseg->pls_layout = lo;
+ lseg->pls_range = *range;
+ lseg->pls_seq = be32_to_cpu(stateid->seqid);
}
static void pnfs_free_lseg(struct pnfs_layout_segment *lseg)
@@ -486,15 +489,6 @@ pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1,
(end2 == NFS4_MAX_UINT64 || end2 > start1);
}
-static bool
-should_free_lseg(const struct pnfs_layout_range *lseg_range,
- const struct pnfs_layout_range *recall_range)
-{
- return (recall_range->iomode == IOMODE_ANY ||
- lseg_range->iomode == recall_range->iomode) &&
- pnfs_lseg_range_intersecting(lseg_range, recall_range);
-}
-
static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
struct list_head *tmp_list)
{
@@ -533,6 +527,27 @@ static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
return (s32)(s1 - s2) > 0;
}
+static bool
+pnfs_should_free_range(const struct pnfs_layout_range *lseg_range,
+ const struct pnfs_layout_range *recall_range)
+{
+ return (recall_range->iomode == IOMODE_ANY ||
+ lseg_range->iomode == recall_range->iomode) &&
+ pnfs_lseg_range_intersecting(lseg_range, recall_range);
+}
+
+static bool
+pnfs_match_lseg_recall(const struct pnfs_layout_segment *lseg,
+ const struct pnfs_layout_range *recall_range,
+ u32 seq)
+{
+ if (seq != 0 && pnfs_seqid_is_newer(lseg->pls_seq, seq))
+ return false;
+ if (recall_range == NULL)
+ return true;
+ return pnfs_should_free_range(&lseg->pls_range, recall_range);
+}
+
/**
* pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later
* @lo: layout header containing the lsegs
@@ -562,10 +577,7 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
if (list_empty(&lo->plh_segs))
return 0;
list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
- if (!recall_range ||
- should_free_lseg(&lseg->pls_range, recall_range)) {
- if (seq && pnfs_seqid_is_newer(lseg->pls_seq, seq))
- continue;
+ if (pnfs_match_lseg_recall(lseg, recall_range, seq)) {
dprintk("%s: freeing lseg %p iomode %d seq %u"
"offset %llu length %llu\n", __func__,
lseg, lseg->pls_range.iomode, lseg->pls_seq,
@@ -761,24 +773,25 @@ void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
bool update_barrier)
{
- u32 oldseq, newseq, new_barrier;
- int empty = list_empty(&lo->plh_segs);
+ u32 oldseq, newseq, new_barrier = 0;
+ bool invalid = !pnfs_layout_is_valid(lo);
oldseq = be32_to_cpu(lo->plh_stateid.seqid);
newseq = be32_to_cpu(new->seqid);
- if (empty || pnfs_seqid_is_newer(newseq, oldseq)) {
+ if (invalid || pnfs_seqid_is_newer(newseq, oldseq)) {
nfs4_stateid_copy(&lo->plh_stateid, new);
- if (update_barrier) {
- new_barrier = be32_to_cpu(new->seqid);
- } else {
- /* Because of wraparound, we want to keep the barrier
- * "close" to the current seqids.
- */
- new_barrier = newseq - atomic_read(&lo->plh_outstanding);
- }
- if (empty || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
- lo->plh_barrier = new_barrier;
+ /*
+ * Because of wraparound, we want to keep the barrier
+ * "close" to the current seqids.
+ */
+ new_barrier = newseq - atomic_read(&lo->plh_outstanding);
}
+ if (update_barrier)
+ new_barrier = be32_to_cpu(new->seqid);
+ else if (new_barrier == 0)
+ return;
+ if (invalid || pnfs_seqid_is_newer(new_barrier, lo->plh_barrier))
+ lo->plh_barrier = new_barrier;
}
static bool
@@ -873,15 +886,37 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
}
+static void
+pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
+{
+ lo->plh_return_iomode = 0;
+ lo->plh_return_seq = 0;
+ clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+}
+
static bool
-pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
+pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
+ nfs4_stateid *stateid,
+ enum pnfs_iomode *iomode)
{
if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
return false;
- lo->plh_return_iomode = 0;
- lo->plh_return_seq = 0;
pnfs_get_layout_hdr(lo);
- clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+ if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) {
+ if (stateid != NULL) {
+ nfs4_stateid_copy(stateid, &lo->plh_stateid);
+ if (lo->plh_return_seq != 0)
+ stateid->seqid = cpu_to_be32(lo->plh_return_seq);
+ }
+ if (iomode != NULL)
+ *iomode = lo->plh_return_iomode;
+ pnfs_clear_layoutreturn_info(lo);
+ return true;
+ }
+ if (stateid != NULL)
+ nfs4_stateid_copy(stateid, &lo->plh_stateid);
+ if (iomode != NULL)
+ *iomode = IOMODE_ANY;
return true;
}
@@ -949,10 +984,7 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
enum pnfs_iomode iomode;
bool send;
- nfs4_stateid_copy(&stateid, &lo->plh_stateid);
- stateid.seqid = cpu_to_be32(lo->plh_return_seq);
- iomode = lo->plh_return_iomode;
- send = pnfs_prepare_layoutreturn(lo);
+ send = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
spin_unlock(&inode->i_lock);
if (send) {
/* Send an async layoutreturn so we dont deadlock */
@@ -989,7 +1021,6 @@ _pnfs_return_layout(struct inode *ino)
dprintk("NFS: %s no layout to return\n", __func__);
goto out;
}
- nfs4_stateid_copy(&stateid, &nfsi->layout->plh_stateid);
/* Reference matched in nfs4_layoutreturn_release */
pnfs_get_layout_hdr(lo);
empty = list_empty(&lo->plh_segs);
@@ -1012,8 +1043,7 @@ _pnfs_return_layout(struct inode *ino)
goto out_put_layout_hdr;
}
- set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
- send = pnfs_prepare_layoutreturn(lo);
+ send = pnfs_prepare_layoutreturn(lo, &stateid, NULL);
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&tmp_list);
if (send)
@@ -1080,11 +1110,10 @@ bool pnfs_roc(struct inode *ino)
goto out_noroc;
}
- nfs4_stateid_copy(&stateid, &lo->plh_stateid);
/* always send layoutreturn if being marked so */
- if (test_and_clear_bit(NFS_LAYOUT_RETURN_REQUESTED,
- &lo->plh_flags))
- layoutreturn = pnfs_prepare_layoutreturn(lo);
+ if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+ layoutreturn = pnfs_prepare_layoutreturn(lo,
+ &stateid, NULL);
list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
/* If we are sending layoutreturn, invalidate all valid lsegs */
@@ -1132,7 +1161,6 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
spin_lock(&ino->i_lock);
lo = NFS_I(ino)->layout;
- pnfs_mark_layout_returned_if_empty(lo);
if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
lo->plh_barrier = barrier;
spin_unlock(&ino->i_lock);
@@ -1505,7 +1533,7 @@ pnfs_update_layout(struct inode *ino,
struct pnfs_layout_segment *lseg = NULL;
nfs4_stateid stateid;
long timeout = 0;
- unsigned long giveup = jiffies + rpc_get_timeout(server->client);
+ unsigned long giveup = jiffies + (clp->cl_lease_time << 1);
bool first;
if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
@@ -1645,33 +1673,44 @@ lookup_again:
lseg = send_layoutget(lo, ctx, &stateid, &arg, &timeout, gfp_flags);
trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
+ atomic_dec(&lo->plh_outstanding);
if (IS_ERR(lseg)) {
switch(PTR_ERR(lseg)) {
- case -ERECALLCONFLICT:
+ case -EBUSY:
if (time_after(jiffies, giveup))
lseg = NULL;
- /* Fallthrough */
- case -EAGAIN:
- pnfs_put_layout_hdr(lo);
- if (first)
- pnfs_clear_first_layoutget(lo);
- if (lseg) {
- trace_pnfs_update_layout(ino, pos, count,
- iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
- goto lookup_again;
+ break;
+ case -ERECALLCONFLICT:
+ /* Huh? We hold no layouts, how is there a recall? */
+ if (first) {
+ lseg = NULL;
+ break;
}
+ /* Destroy the existing layout and start over */
+ if (time_after(jiffies, giveup))
+ pnfs_destroy_layout(NFS_I(ino));
/* Fallthrough */
+ case -EAGAIN:
+ break;
default:
if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
lseg = NULL;
}
+ goto out_put_layout_hdr;
+ }
+ if (lseg) {
+ if (first)
+ pnfs_clear_first_layoutget(lo);
+ trace_pnfs_update_layout(ino, pos, count,
+ iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
+ pnfs_put_layout_hdr(lo);
+ goto lookup_again;
}
} else {
pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
}
- atomic_dec(&lo->plh_outstanding);
out_put_layout_hdr:
if (first)
pnfs_clear_first_layoutget(lo);
@@ -1735,9 +1774,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
return lseg;
}
- init_lseg(lo, lseg);
- lseg->pls_range = res->range;
- lseg->pls_seq = be32_to_cpu(res->stateid.seqid);
+ pnfs_init_lseg(lo, lseg, &res->range, &res->stateid);
spin_lock(&ino->i_lock);
if (pnfs_layoutgets_blocked(lo)) {
@@ -1758,16 +1795,19 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
* inode invalid, and don't bother validating the stateid
* sequence number.
*/
- pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL, 0);
+ pnfs_mark_layout_stateid_invalid(lo, &free_me);
nfs4_stateid_copy(&lo->plh_stateid, &res->stateid);
lo->plh_barrier = be32_to_cpu(res->stateid.seqid);
}
- clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
-
pnfs_get_lseg(lseg);
pnfs_layout_insert_lseg(lo, lseg, &free_me);
+ if (!pnfs_layout_is_valid(lo)) {
+ pnfs_clear_layoutreturn_info(lo);
+ clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+ }
+
if (res->return_on_close)
set_bit(NFS_LSEG_ROC, &lseg->pls_flags);
@@ -1787,14 +1827,14 @@ static void
pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
u32 seq)
{
- if (lo->plh_return_iomode == iomode)
- return;
- if (lo->plh_return_iomode != 0)
+ if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode)
iomode = IOMODE_ANY;
lo->plh_return_iomode = iomode;
set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
- if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq))
+ if (seq != 0) {
+ WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq);
lo->plh_return_seq = seq;
+ }
}
/**
@@ -1824,7 +1864,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
assert_spin_locked(&lo->plh_inode->i_lock);
list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
- if (should_free_lseg(&lseg->pls_range, return_range)) {
+ if (pnfs_match_lseg_recall(lseg, return_range, seq)) {
dprintk("%s: marking lseg %p iomode %d "
"offset %llu length %llu\n", __func__,
lseg, lseg->pls_range.iomode,
@@ -1855,19 +1895,17 @@ void pnfs_error_mark_layout_for_return(struct inode *inode,
bool return_now = false;
spin_lock(&inode->i_lock);
- pnfs_set_plh_return_info(lo, range.iomode, lseg->pls_seq);
+ pnfs_set_plh_return_info(lo, range.iomode, 0);
/*
* mark all matching lsegs so that we are sure to have no live
* segments at hand when sending layoutreturn. See pnfs_put_lseg()
* for how it works.
*/
- if (!pnfs_mark_matching_lsegs_return(lo, &free_me,
- &range, lseg->pls_seq)) {
+ if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0)) {
nfs4_stateid stateid;
- enum pnfs_iomode iomode = lo->plh_return_iomode;
+ enum pnfs_iomode iomode;
- nfs4_stateid_copy(&stateid, &lo->plh_stateid);
- return_now = pnfs_prepare_layoutreturn(lo);
+ return_now = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
spin_unlock(&inode->i_lock);
if (return_now)
pnfs_send_layoutreturn(lo, &stateid, iomode, false);
@@ -2382,7 +2420,10 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
nfs_fattr_init(&data->fattr);
data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
data->res.fattr = &data->fattr;
- data->args.lastbytewritten = end_pos - 1;
+ if (end_pos != 0)
+ data->args.lastbytewritten = end_pos - 1;
+ else
+ data->args.lastbytewritten = U64_MAX;
data->res.server = NFS_SERVER(inode);
if (ld->prepare_layoutcommit) {
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index b21bd0bee784..31d99b2927b0 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -268,6 +268,8 @@ int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
const struct pnfs_layout_range *recall_range,
u32 seq);
+int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
+ struct list_head *lseg_list);
bool pnfs_roc(struct inode *ino);
void pnfs_roc_release(struct inode *ino);
void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
@@ -375,6 +377,11 @@ static inline bool nfs_have_layout(struct inode *inode)
return NFS_I(inode)->layout != NULL;
}
+static inline bool pnfs_layout_is_valid(const struct pnfs_layout_hdr *lo)
+{
+ return test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) == 0;
+}
+
static inline struct nfs4_deviceid_node *
nfs4_get_deviceid(struct nfs4_deviceid_node *d)
{
@@ -545,19 +552,6 @@ pnfs_calc_offset_length(u64 offset, u64 end)
return 1 + end - offset;
}
-/**
- * pnfs_mark_layout_returned_if_empty - marks the layout as returned
- * @lo: layout header
- *
- * Note: Caller must hold inode->i_lock
- */
-static inline void
-pnfs_mark_layout_returned_if_empty(struct pnfs_layout_hdr *lo)
-{
- if (list_empty(&lo->plh_segs))
- set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
-}
-
static inline void
pnfs_copy_range(struct pnfs_layout_range *dst,
const struct pnfs_layout_range *src)
@@ -629,6 +623,13 @@ pnfs_sync_inode(struct inode *inode, bool datasync)
}
static inline bool
+pnfs_layoutcommit_outstanding(struct inode *inode)
+{
+ return false;
+}
+
+
+static inline bool
pnfs_roc(struct inode *ino)
{
return false;
@@ -716,13 +717,6 @@ pnfs_use_threshold(struct nfs4_threshold **dst, struct nfs4_threshold *src,
return false;
}
-static inline bool
-pnfs_layoutcommit_outstanding(struct inode *inode)
-{
- return false;
-}
-
-
static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
{
return NULL;
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index b38e3c0dc790..f3468b57a32a 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -595,7 +595,7 @@ static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
}
static struct nfs_client *(*get_v3_ds_connect)(
- struct nfs_client *mds_clp,
+ struct nfs_server *mds_srv,
const struct sockaddr *ds_addr,
int ds_addrlen,
int ds_proto,
@@ -654,7 +654,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args,
rpc_clnt_test_and_add_xprt, NULL);
} else
- clp = get_v3_ds_connect(mds_srv->nfs_client,
+ clp = get_v3_ds_connect(mds_srv,
(struct sockaddr *)&da->da_addr,
da->da_addrlen, IPPROTO_TCP,
timeo, retrans, au_flavor);
@@ -690,7 +690,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
dprintk("%s: DS %s: trying address %s\n",
__func__, ds->ds_remotestr, da->da_remotestr);
- clp = nfs4_set_ds_client(mds_srv->nfs_client,
+ clp = nfs4_set_ds_client(mds_srv,
(struct sockaddr *)&da->da_addr,
da->da_addrlen, IPPROTO_TCP,
timeo, retrans, minor_version,
@@ -940,6 +940,13 @@ EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
int
pnfs_nfs_generic_sync(struct inode *inode, bool datasync)
{
+ int ret;
+
+ if (!pnfs_layoutcommit_outstanding(inode))
+ return 0;
+ ret = nfs_commit_inode(inode, FLUSH_SYNC);
+ if (ret < 0)
+ return ret;
if (datasync)
return 0;
return pnfs_layoutcommit_inode(inode, true);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2137e0202f25..18d446e1a82b 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1684,6 +1684,7 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args,
{
rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR;
unsigned int i;
+ int use_auth_null = false;
/*
* If the sec= mount option is used, the specified flavor or AUTH_NULL
@@ -1691,14 +1692,21 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args,
*
* AUTH_NULL has a special meaning when it's in the server list - it
* means that the server will ignore the rpc creds, so any flavor
- * can be used.
+ * can be used but still use the sec= that was specified.
*/
for (i = 0; i < count; i++) {
flavor = server_authlist[i];
- if (nfs_auth_info_match(&args->auth_info, flavor) ||
- flavor == RPC_AUTH_NULL)
+ if (nfs_auth_info_match(&args->auth_info, flavor))
goto out;
+
+ if (flavor == RPC_AUTH_NULL)
+ use_auth_null = true;
+ }
+
+ if (use_auth_null) {
+ flavor = RPC_AUTH_NULL;
+ goto out;
}
dfprintk(MOUNT,
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index e1c74d3db64d..3a6724c6eb5f 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -625,7 +625,7 @@ static int nfs_writepage_locked(struct page *page,
int err;
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
- nfs_pageio_init_write(&pgio, inode, wb_priority(wbc),
+ nfs_pageio_init_write(&pgio, inode, 0,
false, &nfs_async_write_completion_ops);
err = nfs_do_writepage(page, wbc, &pgio, launder);
nfs_pageio_complete(&pgio);
@@ -657,16 +657,9 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control *
int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
- unsigned long *bitlock = &NFS_I(inode)->flags;
struct nfs_pageio_descriptor pgio;
int err;
- /* Stop dirtying of new pages while we sync */
- err = wait_on_bit_lock_action(bitlock, NFS_INO_FLUSHING,
- nfs_wait_bit_killable, TASK_KILLABLE);
- if (err)
- goto out_err;
-
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false,
@@ -674,10 +667,6 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
nfs_pageio_complete(&pgio);
- clear_bit_unlock(NFS_INO_FLUSHING, bitlock);
- smp_mb__after_atomic();
- wake_up_bit(bitlock, NFS_INO_FLUSHING);
-
if (err < 0)
goto out_err;
err = pgio.pg_error;
@@ -898,7 +887,7 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
static void
nfs_clear_page_commit(struct page *page)
{
- dec_zone_page_state(page, NR_UNSTABLE_NFS);
+ dec_node_page_state(page, NR_UNSTABLE_NFS);
dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb,
WB_RECLAIMABLE);
}
@@ -1195,9 +1184,11 @@ nfs_key_timeout_notify(struct file *filp, struct inode *inode)
/*
* Test if the open context credential key is marked to expire soon.
*/
-bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx)
+bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode)
{
- return rpcauth_cred_key_to_expire(ctx->cred);
+ struct rpc_auth *auth = NFS_SERVER(inode)->client->cl_auth;
+
+ return rpcauth_cred_key_to_expire(auth, ctx->cred);
}
/*
@@ -1289,6 +1280,9 @@ int nfs_updatepage(struct file *file, struct page *page,
dprintk("NFS: nfs_updatepage(%pD2 %d@%lld)\n",
file, count, (long long)(page_file_offset(page) + offset));
+ if (!count)
+ goto out;
+
if (nfs_can_extend_write(file, page, inode)) {
count = max(count + offset, nfs_page_length(page));
offset = 0;
@@ -1299,7 +1293,7 @@ int nfs_updatepage(struct file *file, struct page *page,
nfs_set_pageerror(page);
else
__set_page_dirty_nobuffers(page);
-
+out:
dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n",
status, (long long)i_size_read(inode));
return status;
@@ -1800,7 +1794,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
/* Okay, COMMIT succeeded, apparently. Check the verifier
* returned by the server against all stored verfs. */
- if (!memcmp(&req->wb_verf, &data->verf.verifier, sizeof(req->wb_verf))) {
+ if (!nfs_write_verifier_cmp(&req->wb_verf, &data->verf.verifier)) {
/* We have a match */
nfs_inode_remove_request(req);
dprintk(" OK\n");
@@ -1924,6 +1918,24 @@ out_mark_dirty:
EXPORT_SYMBOL_GPL(nfs_write_inode);
/*
+ * Wrapper for filemap_write_and_wait_range()
+ *
+ * Needed for pNFS in order to ensure data becomes visible to the
+ * client.
+ */
+int nfs_filemap_write_and_wait_range(struct address_space *mapping,
+ loff_t lstart, loff_t lend)
+{
+ int ret;
+
+ ret = filemap_write_and_wait_range(mapping, lstart, lend);
+ if (ret == 0)
+ ret = pnfs_sync_inode(mapping->host, true);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(nfs_filemap_write_and_wait_range);
+
+/*
* flush the inode to disk.
*/
int nfs_wb_all(struct inode *inode)
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index c9f583d7bac8..47febcf99185 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -90,6 +90,7 @@ config NFSD_BLOCKLAYOUT
bool "NFSv4.1 server support for pNFS block layouts"
depends on NFSD_V4 && BLOCK
select NFSD_PNFS
+ select EXPORTFS_BLOCK_OPS
help
This option enables support for the exporting pNFS block layouts
in the kernel's NFS server. The pNFS block layout enables NFS
@@ -102,6 +103,7 @@ config NFSD_SCSILAYOUT
bool "NFSv4.1 server support for pNFS SCSI layouts"
depends on NFSD_V4 && BLOCK
select NFSD_PNFS
+ select EXPORTFS_BLOCK_OPS
help
This option enables support for the exporting pNFS SCSI layouts
in the kernel's NFS server. The pNFS SCSI layout enables NFS
@@ -111,6 +113,23 @@ config NFSD_SCSILAYOUT
If unsure, say N.
+config NFSD_FLEXFILELAYOUT
+ bool "NFSv4.1 server support for pNFS Flex File layouts"
+ depends on NFSD_V4
+ select NFSD_PNFS
+ help
+ This option enables support for the exporting pNFS Flex File
+ layouts in the kernel's NFS server. The pNFS Flex File layout
+ enables NFS clients to directly perform I/O to NFSv3 devices
+ accesible to both the server and the clients. See
+ draft-ietf-nfsv4-flex-files for more details.
+
+ Warning, this server implements the bare minimum functionality
+ to be a flex file server - it is for testing the client,
+ not for use in production.
+
+ If unsure, say N.
+
config NFSD_V4_SECURITY_LABEL
bool "Provide Security Label support for NFSv4 server"
depends on NFSD_V4 && SECURITY
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 3ae5f3c77e28..5f5d3a76980c 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -20,3 +20,4 @@ nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
+nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index ad2c05e80a83..5a1708441510 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -163,6 +163,7 @@ nfsd4_block_get_device_info_simple(struct super_block *sb,
static __be32
nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
+ struct svc_rqst *rqstp,
struct nfs4_client *clp,
struct nfsd4_getdeviceinfo *gdp)
{
@@ -355,6 +356,7 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb,
static __be32
nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb,
+ struct svc_rqst *rqstp,
struct nfs4_client *clp,
struct nfsd4_getdeviceinfo *gdp)
{
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 4ebaaf4b8d8a..ac6f54546fdd 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -44,7 +44,7 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
switch (b->type) {
case PNFS_BLOCK_VOLUME_SIMPLE:
- len = 4 + 4 + 8 + 4 + b->simple.sig_len;
+ len = 4 + 4 + 8 + 4 + (XDR_QUADLEN(b->simple.sig_len) << 2);
p = xdr_reserve_space(xdr, len);
if (!p)
return -ETOOSMALL;
@@ -55,7 +55,7 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len);
break;
case PNFS_BLOCK_VOLUME_SCSI:
- len = 4 + 4 + 4 + 4 + b->scsi.designator_len + 8;
+ len = 4 + 4 + 4 + 4 + (XDR_QUADLEN(b->scsi.designator_len) << 2) + 8;
p = xdr_reserve_space(xdr, len);
if (!p)
return -ETOOSMALL;
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index b4d84b579f20..43e109cc0ccc 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -706,7 +706,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
new->ex_fslocs.locations = NULL;
new->ex_fslocs.locations_count = 0;
new->ex_fslocs.migrated = 0;
- new->ex_layout_type = 0;
+ new->ex_layout_types = 0;
new->ex_uuid = NULL;
new->cd = item->cd;
}
@@ -731,7 +731,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
item->ex_fslocs.locations_count = 0;
new->ex_fslocs.migrated = item->ex_fslocs.migrated;
item->ex_fslocs.migrated = 0;
- new->ex_layout_type = item->ex_layout_type;
+ new->ex_layout_types = item->ex_layout_types;
new->ex_nflavors = item->ex_nflavors;
for (i = 0; i < MAX_SECINFO_LIST; i++) {
new->ex_flavors[i] = item->ex_flavors[i];
@@ -954,6 +954,16 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp)
rqstp->rq_cred.cr_flavor == RPC_AUTH_UNIX)
return 0;
}
+
+ /* If the compound op contains a spo_must_allowed op,
+ * it will be sent with integrity/protection which
+ * will have to be expressly allowed on mounts that
+ * don't support it
+ */
+
+ if (nfsd4_spo_must_allow(rqstp))
+ return 0;
+
return nfserr_wrongsec;
}
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index 2e315072bf3f..730f15eeb7ed 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -57,7 +57,7 @@ struct svc_export {
struct nfsd4_fs_locations ex_fslocs;
uint32_t ex_nflavors;
struct exp_flavor_info ex_flavors[MAX_SECINFO_LIST];
- enum pnfs_layouttype ex_layout_type;
+ u32 ex_layout_types;
struct nfsd4_deviceid_map *ex_devid_map;
struct cache_detail *cd;
};
diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c
new file mode 100644
index 000000000000..df880e9fa71f
--- /dev/null
+++ b/fs/nfsd/flexfilelayout.c
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com>
+ *
+ * The following implements a super-simple flex-file server
+ * where the NFSv4.1 mds is also the ds. And the storage is
+ * the same. I.e., writing to the mds via a NFSv4.1 WRITE
+ * goes to the same location as the NFSv3 WRITE.
+ */
+#include <linux/slab.h>
+
+#include <linux/nfsd/debug.h>
+
+#include <linux/sunrpc/addr.h>
+
+#include "flexfilelayoutxdr.h"
+#include "pnfs.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_PNFS
+
+static __be32
+nfsd4_ff_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
+ struct nfsd4_layoutget *args)
+{
+ struct nfsd4_layout_seg *seg = &args->lg_seg;
+ u32 device_generation = 0;
+ int error;
+ uid_t u;
+
+ struct pnfs_ff_layout *fl;
+
+ /*
+ * The super simple flex file server has 1 mirror, 1 data server,
+ * and 1 file handle. So instead of 4 allocs, do 1 for now.
+ * Zero it out for the stateid - don't want junk in there!
+ */
+ error = -ENOMEM;
+ fl = kzalloc(sizeof(*fl), GFP_KERNEL);
+ if (!fl)
+ goto out_error;
+ args->lg_content = fl;
+
+ /*
+ * Avoid layout commit, try to force the I/O to the DS,
+ * and for fun, cause all IOMODE_RW layout segments to
+ * effectively be WRITE only.
+ */
+ fl->flags = FF_FLAGS_NO_LAYOUTCOMMIT | FF_FLAGS_NO_IO_THRU_MDS |
+ FF_FLAGS_NO_READ_IO;
+
+ /* Do not allow a IOMODE_READ segment to have write pemissions */
+ if (seg->iomode == IOMODE_READ) {
+ u = from_kuid(&init_user_ns, inode->i_uid) + 1;
+ fl->uid = make_kuid(&init_user_ns, u);
+ } else
+ fl->uid = inode->i_uid;
+ fl->gid = inode->i_gid;
+
+ error = nfsd4_set_deviceid(&fl->deviceid, fhp, device_generation);
+ if (error)
+ goto out_error;
+
+ fl->fh.size = fhp->fh_handle.fh_size;
+ memcpy(fl->fh.data, &fhp->fh_handle.fh_base, fl->fh.size);
+
+ /* Give whole file layout segments */
+ seg->offset = 0;
+ seg->length = NFS4_MAX_UINT64;
+
+ dprintk("GET: 0x%llx:0x%llx %d\n", seg->offset, seg->length,
+ seg->iomode);
+ return 0;
+
+out_error:
+ seg->length = 0;
+ return nfserrno(error);
+}
+
+static __be32
+nfsd4_ff_proc_getdeviceinfo(struct super_block *sb, struct svc_rqst *rqstp,
+ struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdp)
+{
+ struct pnfs_ff_device_addr *da;
+
+ u16 port;
+ char addr[INET6_ADDRSTRLEN];
+
+ da = kzalloc(sizeof(struct pnfs_ff_device_addr), GFP_KERNEL);
+ if (!da)
+ return nfserrno(-ENOMEM);
+
+ gdp->gd_device = da;
+
+ da->version = 3;
+ da->minor_version = 0;
+
+ da->rsize = svc_max_payload(rqstp);
+ da->wsize = da->rsize;
+
+ rpc_ntop((struct sockaddr *)&rqstp->rq_daddr,
+ addr, INET6_ADDRSTRLEN);
+ if (rqstp->rq_daddr.ss_family == AF_INET) {
+ struct sockaddr_in *sin;
+
+ sin = (struct sockaddr_in *)&rqstp->rq_daddr;
+ port = ntohs(sin->sin_port);
+ snprintf(da->netaddr.netid, FF_NETID_LEN + 1, "tcp");
+ da->netaddr.netid_len = 3;
+ } else {
+ struct sockaddr_in6 *sin6;
+
+ sin6 = (struct sockaddr_in6 *)&rqstp->rq_daddr;
+ port = ntohs(sin6->sin6_port);
+ snprintf(da->netaddr.netid, FF_NETID_LEN + 1, "tcp6");
+ da->netaddr.netid_len = 4;
+ }
+
+ da->netaddr.addr_len =
+ snprintf(da->netaddr.addr, FF_ADDR_LEN + 1,
+ "%s.%hhu.%hhu", addr, port >> 8, port & 0xff);
+
+ da->tightly_coupled = false;
+
+ return 0;
+}
+
+const struct nfsd4_layout_ops ff_layout_ops = {
+ .notify_types =
+ NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE,
+ .proc_getdeviceinfo = nfsd4_ff_proc_getdeviceinfo,
+ .encode_getdeviceinfo = nfsd4_ff_encode_getdeviceinfo,
+ .proc_layoutget = nfsd4_ff_proc_layoutget,
+ .encode_layoutget = nfsd4_ff_encode_layoutget,
+};
diff --git a/fs/nfsd/flexfilelayoutxdr.c b/fs/nfsd/flexfilelayoutxdr.c
new file mode 100644
index 000000000000..5e3fd7fc1a9f
--- /dev/null
+++ b/fs/nfsd/flexfilelayoutxdr.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com>
+ */
+#include <linux/sunrpc/svc.h>
+#include <linux/nfs4.h>
+
+#include "nfsd.h"
+#include "flexfilelayoutxdr.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_PNFS
+
+struct ff_idmap {
+ char buf[11];
+ int len;
+};
+
+__be32
+nfsd4_ff_encode_layoutget(struct xdr_stream *xdr,
+ struct nfsd4_layoutget *lgp)
+{
+ struct pnfs_ff_layout *fl = lgp->lg_content;
+ int len, mirror_len, ds_len, fh_len;
+ __be32 *p;
+
+ /*
+ * Unlike nfsd4_encode_user, we know these will
+ * always be stringified.
+ */
+ struct ff_idmap uid;
+ struct ff_idmap gid;
+
+ fh_len = 4 + fl->fh.size;
+
+ uid.len = sprintf(uid.buf, "%u", from_kuid(&init_user_ns, fl->uid));
+ gid.len = sprintf(gid.buf, "%u", from_kgid(&init_user_ns, fl->gid));
+
+ /* 8 + len for recording the length, name, and padding */
+ ds_len = 20 + sizeof(stateid_opaque_t) + 4 + fh_len +
+ 8 + uid.len + 8 + gid.len;
+
+ mirror_len = 4 + ds_len;
+
+ /* The layout segment */
+ len = 20 + mirror_len;
+
+ p = xdr_reserve_space(xdr, sizeof(__be32) + len);
+ if (!p)
+ return nfserr_toosmall;
+
+ *p++ = cpu_to_be32(len);
+ p = xdr_encode_hyper(p, 0); /* stripe unit of 1 */
+
+ *p++ = cpu_to_be32(1); /* single mirror */
+ *p++ = cpu_to_be32(1); /* single data server */
+
+ p = xdr_encode_opaque_fixed(p, &fl->deviceid,
+ sizeof(struct nfsd4_deviceid));
+
+ *p++ = cpu_to_be32(1); /* efficiency */
+
+ *p++ = cpu_to_be32(fl->stateid.si_generation);
+ p = xdr_encode_opaque_fixed(p, &fl->stateid.si_opaque,
+ sizeof(stateid_opaque_t));
+
+ *p++ = cpu_to_be32(1); /* single file handle */
+ p = xdr_encode_opaque(p, fl->fh.data, fl->fh.size);
+
+ p = xdr_encode_opaque(p, uid.buf, uid.len);
+ p = xdr_encode_opaque(p, gid.buf, gid.len);
+
+ *p++ = cpu_to_be32(fl->flags);
+ *p++ = cpu_to_be32(0); /* No stats collect hint */
+
+ return 0;
+}
+
+__be32
+nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr,
+ struct nfsd4_getdeviceinfo *gdp)
+{
+ struct pnfs_ff_device_addr *da = gdp->gd_device;
+ int len;
+ int ver_len;
+ int addr_len;
+ __be32 *p;
+
+ /* len + padding for two strings */
+ addr_len = 16 + da->netaddr.netid_len + da->netaddr.addr_len;
+ ver_len = 20;
+
+ len = 4 + ver_len + 4 + addr_len;
+
+ p = xdr_reserve_space(xdr, len + sizeof(__be32));
+ if (!p)
+ return nfserr_resource;
+
+ /*
+ * Fill in the overall length and number of volumes at the beginning
+ * of the layout.
+ */
+ *p++ = cpu_to_be32(len);
+ *p++ = cpu_to_be32(1); /* 1 netaddr */
+ p = xdr_encode_opaque(p, da->netaddr.netid, da->netaddr.netid_len);
+ p = xdr_encode_opaque(p, da->netaddr.addr, da->netaddr.addr_len);
+
+ *p++ = cpu_to_be32(1); /* 1 versions */
+
+ *p++ = cpu_to_be32(da->version);
+ *p++ = cpu_to_be32(da->minor_version);
+ *p++ = cpu_to_be32(da->rsize);
+ *p++ = cpu_to_be32(da->wsize);
+ *p++ = cpu_to_be32(da->tightly_coupled);
+
+ return 0;
+}
diff --git a/fs/nfsd/flexfilelayoutxdr.h b/fs/nfsd/flexfilelayoutxdr.h
new file mode 100644
index 000000000000..467defd4e563
--- /dev/null
+++ b/fs/nfsd/flexfilelayoutxdr.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016 Tom Haynes <loghyr@primarydata.com>
+ */
+#ifndef _NFSD_FLEXFILELAYOUTXDR_H
+#define _NFSD_FLEXFILELAYOUTXDR_H 1
+
+#include <linux/inet.h>
+#include "xdr4.h"
+
+#define FF_FLAGS_NO_LAYOUTCOMMIT 1
+#define FF_FLAGS_NO_IO_THRU_MDS 2
+#define FF_FLAGS_NO_READ_IO 4
+
+struct xdr_stream;
+
+#define FF_NETID_LEN (4)
+#define FF_ADDR_LEN (INET6_ADDRSTRLEN + 8)
+struct pnfs_ff_netaddr {
+ char netid[FF_NETID_LEN + 1];
+ char addr[FF_ADDR_LEN + 1];
+ u32 netid_len;
+ u32 addr_len;
+};
+
+struct pnfs_ff_device_addr {
+ struct pnfs_ff_netaddr netaddr;
+ u32 version;
+ u32 minor_version;
+ u32 rsize;
+ u32 wsize;
+ bool tightly_coupled;
+};
+
+struct pnfs_ff_layout {
+ u32 flags;
+ u32 stats_collect_hint;
+ kuid_t uid;
+ kgid_t gid;
+ struct nfsd4_deviceid deviceid;
+ stateid_t stateid;
+ struct nfs_fh fh;
+};
+
+__be32 nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr,
+ struct nfsd4_getdeviceinfo *gdp);
+__be32 nfsd4_ff_encode_layoutget(struct xdr_stream *xdr,
+ struct nfsd4_layoutget *lgp);
+
+#endif /* _NFSD_FLEXFILELAYOUTXDR_H */
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 953c0755cb37..2be9602b0221 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -27,6 +27,9 @@ static const struct nfsd4_callback_ops nfsd4_cb_layout_ops;
static const struct lock_manager_operations nfsd4_layouts_lm_ops;
const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = {
+#ifdef CONFIG_NFSD_FLEXFILELAYOUT
+ [LAYOUT_FLEX_FILES] = &ff_layout_ops,
+#endif
#ifdef CONFIG_NFSD_BLOCKLAYOUT
[LAYOUT_BLOCK_VOLUME] = &bl_layout_ops,
#endif
@@ -122,28 +125,35 @@ nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
void nfsd4_setup_layout_type(struct svc_export *exp)
{
+#if defined(CONFIG_NFSD_BLOCKLAYOUT) || defined(CONFIG_NFSD_SCSILAYOUT)
struct super_block *sb = exp->ex_path.mnt->mnt_sb;
+#endif
if (!(exp->ex_flags & NFSEXP_PNFS))
return;
/*
- * Check if the file system supports exporting a block-like layout.
+ * If flex file is configured, use it by default. Otherwise
+ * check if the file system supports exporting a block-like layout.
* If the block device supports reservations prefer the SCSI layout,
* otherwise advertise the block layout.
*/
+#ifdef CONFIG_NFSD_FLEXFILELAYOUT
+ exp->ex_layout_types |= 1 << LAYOUT_FLEX_FILES;
+#endif
#ifdef CONFIG_NFSD_BLOCKLAYOUT
+ /* overwrite flex file layout selection if needed */
if (sb->s_export_op->get_uuid &&
sb->s_export_op->map_blocks &&
sb->s_export_op->commit_blocks)
- exp->ex_layout_type = LAYOUT_BLOCK_VOLUME;
+ exp->ex_layout_types |= 1 << LAYOUT_BLOCK_VOLUME;
#endif
#ifdef CONFIG_NFSD_SCSILAYOUT
/* overwrite block layout selection if needed */
if (sb->s_export_op->map_blocks &&
sb->s_export_op->commit_blocks &&
sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops)
- exp->ex_layout_type = LAYOUT_SCSI;
+ exp->ex_layout_types |= 1 << LAYOUT_SCSI;
#endif
}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index de1ff1d98bb1..1fb222752b2b 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -605,8 +605,7 @@ nfsd4_create(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
fh_init(&resfh, NFS4_FHSIZE);
- status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR,
- NFSD_MAY_CREATE);
+ status = fh_verify(rqstp, &cstate->current_fh, S_IFDIR, NFSD_MAY_NOP);
if (status)
return status;
@@ -1219,12 +1218,12 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
static const struct nfsd4_layout_ops *
nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type)
{
- if (!exp->ex_layout_type) {
+ if (!exp->ex_layout_types) {
dprintk("%s: export does not support pNFS\n", __func__);
return NULL;
}
- if (exp->ex_layout_type != layout_type) {
+ if (!(exp->ex_layout_types & (1 << layout_type))) {
dprintk("%s: layout type %d not supported\n",
__func__, layout_type);
return NULL;
@@ -1270,7 +1269,7 @@ nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
nfserr = nfs_ok;
if (gdp->gd_maxcount != 0) {
nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb,
- cstate->session->se_client, gdp);
+ rqstp, cstate->session->se_client, gdp);
}
gdp->gd_notify_types &= ops->notify_types;
@@ -2335,6 +2334,45 @@ static struct nfsd4_operation nfsd4_ops[] = {
},
};
+/**
+ * nfsd4_spo_must_allow - Determine if the compound op contains an
+ * operation that is allowed to be sent with machine credentials
+ *
+ * @rqstp: a pointer to the struct svc_rqst
+ *
+ * Checks to see if the compound contains a spo_must_allow op
+ * and confirms that it was sent with the proper machine creds.
+ */
+
+bool nfsd4_spo_must_allow(struct svc_rqst *rqstp)
+{
+ struct nfsd4_compoundres *resp = rqstp->rq_resp;
+ struct nfsd4_compoundargs *argp = rqstp->rq_argp;
+ struct nfsd4_op *this = &argp->ops[resp->opcnt - 1];
+ struct nfsd4_compound_state *cstate = &resp->cstate;
+ struct nfs4_op_map *allow = &cstate->clp->cl_spo_must_allow;
+ u32 opiter;
+
+ if (!cstate->minorversion)
+ return false;
+
+ if (cstate->spo_must_allowed == true)
+ return true;
+
+ opiter = resp->opcnt;
+ while (opiter < argp->opcnt) {
+ this = &argp->ops[opiter++];
+ if (test_bit(this->opnum, allow->u.longs) &&
+ cstate->clp->cl_mach_cred &&
+ nfsd4_mach_creds_match(cstate->clp, rqstp)) {
+ cstate->spo_must_allowed = true;
+ return true;
+ }
+ }
+ cstate->spo_must_allowed = false;
+ return false;
+}
+
int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op)
{
struct nfsd4_operation *opdesc;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 70d0b9b33031..8410ca275db1 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1200,27 +1200,6 @@ free_ol_stateid_reaplist(struct list_head *reaplist)
}
}
-static void release_lockowner(struct nfs4_lockowner *lo)
-{
- struct nfs4_client *clp = lo->lo_owner.so_client;
- struct nfs4_ol_stateid *stp;
- struct list_head reaplist;
-
- INIT_LIST_HEAD(&reaplist);
-
- spin_lock(&clp->cl_lock);
- unhash_lockowner_locked(lo);
- while (!list_empty(&lo->lo_owner.so_stateids)) {
- stp = list_first_entry(&lo->lo_owner.so_stateids,
- struct nfs4_ol_stateid, st_perstateowner);
- WARN_ON(!unhash_lock_stateid(stp));
- put_ol_stateid_locked(stp, &reaplist);
- }
- spin_unlock(&clp->cl_lock);
- free_ol_stateid_reaplist(&reaplist);
- nfs4_put_stateowner(&lo->lo_owner);
-}
-
static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp,
struct list_head *reaplist)
{
@@ -1972,7 +1951,7 @@ static bool svc_rqst_integrity_protected(struct svc_rqst *rqstp)
service == RPC_GSS_SVC_PRIVACY;
}
-static bool mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp)
+bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp)
{
struct svc_cred *cr = &rqstp->rq_cred;
@@ -2388,6 +2367,22 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
switch (exid->spa_how) {
case SP4_MACH_CRED:
+ exid->spo_must_enforce[0] = 0;
+ exid->spo_must_enforce[1] = (
+ 1 << (OP_BIND_CONN_TO_SESSION - 32) |
+ 1 << (OP_EXCHANGE_ID - 32) |
+ 1 << (OP_CREATE_SESSION - 32) |
+ 1 << (OP_DESTROY_SESSION - 32) |
+ 1 << (OP_DESTROY_CLIENTID - 32));
+
+ exid->spo_must_allow[0] &= (1 << (OP_CLOSE) |
+ 1 << (OP_OPEN_DOWNGRADE) |
+ 1 << (OP_LOCKU) |
+ 1 << (OP_DELEGRETURN));
+
+ exid->spo_must_allow[1] &= (
+ 1 << (OP_TEST_STATEID - 32) |
+ 1 << (OP_FREE_STATEID - 32));
if (!svc_rqst_integrity_protected(rqstp)) {
status = nfserr_inval;
goto out_nolock;
@@ -2424,7 +2419,7 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
status = nfserr_inval;
goto out;
}
- if (!mach_creds_match(conf, rqstp)) {
+ if (!nfsd4_mach_creds_match(conf, rqstp)) {
status = nfserr_wrong_cred;
goto out;
}
@@ -2473,6 +2468,8 @@ out_new:
goto out;
}
new->cl_minorversion = cstate->minorversion;
+ new->cl_spo_must_allow.u.words[0] = exid->spo_must_allow[0];
+ new->cl_spo_must_allow.u.words[1] = exid->spo_must_allow[1];
gen_clid(new, nn);
add_to_unconfirmed(new);
@@ -2676,7 +2673,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
if (conf) {
status = nfserr_wrong_cred;
- if (!mach_creds_match(conf, rqstp))
+ if (!nfsd4_mach_creds_match(conf, rqstp))
goto out_free_conn;
cs_slot = &conf->cl_cs_slot;
status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
@@ -2692,7 +2689,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
goto out_free_conn;
}
status = nfserr_wrong_cred;
- if (!mach_creds_match(unconf, rqstp))
+ if (!nfsd4_mach_creds_match(unconf, rqstp))
goto out_free_conn;
cs_slot = &unconf->cl_cs_slot;
status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
@@ -2801,7 +2798,7 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp,
if (!session)
goto out_no_session;
status = nfserr_wrong_cred;
- if (!mach_creds_match(session->se_client, rqstp))
+ if (!nfsd4_mach_creds_match(session->se_client, rqstp))
goto out;
status = nfsd4_map_bcts_dir(&bcts->dir);
if (status)
@@ -2848,7 +2845,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
if (!ses)
goto out_client_lock;
status = nfserr_wrong_cred;
- if (!mach_creds_match(ses->se_client, r))
+ if (!nfsd4_mach_creds_match(ses->se_client, r))
goto out_put_session;
status = mark_session_dead_locked(ses, 1 + ref_held_by_me);
if (status)
@@ -3087,7 +3084,7 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
status = nfserr_stale_clientid;
goto out;
}
- if (!mach_creds_match(clp, rqstp)) {
+ if (!nfsd4_mach_creds_match(clp, rqstp)) {
clp = NULL;
status = nfserr_wrong_cred;
goto out;
@@ -3112,7 +3109,7 @@ nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
* We don't take advantage of the rca_one_fs case.
* That's OK, it's optional, we can safely ignore it.
*/
- return nfs_ok;
+ return nfs_ok;
}
status = nfserr_complete_already;
@@ -5945,6 +5942,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
__be32 status;
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
struct nfs4_client *clp;
+ LIST_HEAD (reaplist);
dprintk("nfsd4_release_lockowner clientid: (%08x/%08x):\n",
clid->cl_boot, clid->cl_id);
@@ -5975,9 +5973,23 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
nfs4_get_stateowner(sop);
break;
}
+ if (!lo) {
+ spin_unlock(&clp->cl_lock);
+ return status;
+ }
+
+ unhash_lockowner_locked(lo);
+ while (!list_empty(&lo->lo_owner.so_stateids)) {
+ stp = list_first_entry(&lo->lo_owner.so_stateids,
+ struct nfs4_ol_stateid,
+ st_perstateowner);
+ WARN_ON(!unhash_lock_stateid(stp));
+ put_ol_stateid_locked(stp, &reaplist);
+ }
spin_unlock(&clp->cl_lock);
- if (lo)
- release_lockowner(lo);
+ free_ol_stateid_reaplist(&reaplist);
+ nfs4_put_stateowner(&lo->lo_owner);
+
return status;
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 9df898ba648f..0aa0236a1429 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1299,16 +1299,14 @@ nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
break;
case SP4_MACH_CRED:
/* spo_must_enforce */
- READ_BUF(4);
- dummy = be32_to_cpup(p++);
- READ_BUF(dummy * 4);
- p += dummy;
-
+ status = nfsd4_decode_bitmap(argp,
+ exid->spo_must_enforce);
+ if (status)
+ goto out;
/* spo_must_allow */
- READ_BUF(4);
- dummy = be32_to_cpup(p++);
- READ_BUF(dummy * 4);
- p += dummy;
+ status = nfsd4_decode_bitmap(argp, exid->spo_must_allow);
+ if (status)
+ goto out;
break;
case SP4_SSV:
/* ssp_ops */
@@ -2164,22 +2162,20 @@ nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp,
}
static inline __be32
-nfsd4_encode_layout_type(struct xdr_stream *xdr, enum pnfs_layouttype layout_type)
+nfsd4_encode_layout_types(struct xdr_stream *xdr, u32 layout_types)
{
- __be32 *p;
+ __be32 *p;
+ unsigned long i = hweight_long(layout_types);
- if (layout_type) {
- p = xdr_reserve_space(xdr, 8);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(1);
- *p++ = cpu_to_be32(layout_type);
- } else {
- p = xdr_reserve_space(xdr, 4);
- if (!p)
- return nfserr_resource;
- *p++ = cpu_to_be32(0);
- }
+ p = xdr_reserve_space(xdr, 4 + 4 * i);
+ if (!p)
+ return nfserr_resource;
+
+ *p++ = cpu_to_be32(i);
+
+ for (i = LAYOUT_NFSV4_1_FILES; i < LAYOUT_TYPE_MAX; ++i)
+ if (layout_types & (1 << i))
+ *p++ = cpu_to_be32(i);
return 0;
}
@@ -2754,13 +2750,13 @@ out_acl:
}
#ifdef CONFIG_NFSD_PNFS
if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) {
- status = nfsd4_encode_layout_type(xdr, exp->ex_layout_type);
+ status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types);
if (status)
goto out;
}
if (bmval2 & FATTR4_WORD2_LAYOUT_TYPES) {
- status = nfsd4_encode_layout_type(xdr, exp->ex_layout_type);
+ status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types);
if (status)
goto out;
}
@@ -3867,14 +3863,6 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
return nfserr;
}
-static const u32 nfs4_minimal_spo_must_enforce[2] = {
- [1] = 1 << (OP_BIND_CONN_TO_SESSION - 32) |
- 1 << (OP_EXCHANGE_ID - 32) |
- 1 << (OP_CREATE_SESSION - 32) |
- 1 << (OP_DESTROY_SESSION - 32) |
- 1 << (OP_DESTROY_CLIENTID - 32)
-};
-
static __be32
nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_exchange_id *exid)
@@ -3885,6 +3873,7 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
char *server_scope;
int major_id_sz;
int server_scope_sz;
+ int status = 0;
uint64_t minor_id = 0;
if (nfserr)
@@ -3913,18 +3902,20 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
case SP4_NONE:
break;
case SP4_MACH_CRED:
- /* spo_must_enforce, spo_must_allow */
- p = xdr_reserve_space(xdr, 16);
- if (!p)
- return nfserr_resource;
-
/* spo_must_enforce bitmap: */
- *p++ = cpu_to_be32(2);
- *p++ = cpu_to_be32(nfs4_minimal_spo_must_enforce[0]);
- *p++ = cpu_to_be32(nfs4_minimal_spo_must_enforce[1]);
- /* empty spo_must_allow bitmap: */
- *p++ = cpu_to_be32(0);
-
+ status = nfsd4_encode_bitmap(xdr,
+ exid->spo_must_enforce[0],
+ exid->spo_must_enforce[1],
+ exid->spo_must_enforce[2]);
+ if (status)
+ goto out;
+ /* spo_must_allow bitmap: */
+ status = nfsd4_encode_bitmap(xdr,
+ exid->spo_must_allow[0],
+ exid->spo_must_allow[1],
+ exid->spo_must_allow[2]);
+ if (status)
+ goto out;
break;
default:
WARN_ON_ONCE(1);
@@ -3951,6 +3942,8 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
/* Implementation id */
*p++ = cpu_to_be32(0); /* zero length nfs_impl_id4 array */
return 0;
+out:
+ return status;
}
static __be32
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 9690cb4dd588..65ad0165a94f 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -158,7 +158,6 @@ static const struct file_operations exports_proc_operations = {
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
- .owner = THIS_MODULE,
};
static int exports_nfsd_open(struct inode *inode, struct file *file)
@@ -171,7 +170,6 @@ static const struct file_operations exports_nfsd_operations = {
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
- .owner = THIS_MODULE,
};
static int export_features_show(struct seq_file *m, void *v)
@@ -217,7 +215,6 @@ static const struct file_operations pool_stats_operations = {
.read = seq_read,
.llseek = seq_lseek,
.release = nfsd_pool_stats_release,
- .owner = THIS_MODULE,
};
static struct file_operations reply_cache_stats_operations = {
@@ -1154,20 +1151,15 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
#endif
/* last one */ {""}
};
- struct net *net = data;
- int ret;
-
- ret = simple_fill_super(sb, 0x6e667364, nfsd_files);
- if (ret)
- return ret;
- sb->s_fs_info = get_net(net);
- return 0;
+ get_net(sb->s_fs_info);
+ return simple_fill_super(sb, 0x6e667364, nfsd_files);
}
static struct dentry *nfsd_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- return mount_ns(fs_type, flags, current->nsproxy->net_ns, nfsd_fill_super);
+ struct net *net = current->nsproxy->net_ns;
+ return mount_ns(fs_type, flags, data, net, net->user_ns, nfsd_fill_super);
}
static void nfsd_umount(struct super_block *sb)
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index cf980523898b..9446849888d5 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -124,6 +124,7 @@ void nfs4_state_shutdown_net(struct net *net);
void nfs4_reset_lease(time_t leasetime);
int nfs4_reset_recoverydir(char *recdir);
char * nfs4_recoverydir(void);
+bool nfsd4_spo_must_allow(struct svc_rqst *rqstp);
#else
static inline int nfsd4_init_slabs(void) { return 0; }
static inline void nfsd4_free_slabs(void) { }
@@ -134,6 +135,10 @@ static inline void nfs4_state_shutdown_net(struct net *net) { }
static inline void nfs4_reset_lease(time_t leasetime) { }
static inline int nfs4_reset_recoverydir(char *recdir) { return 0; }
static inline char * nfs4_recoverydir(void) {return NULL; }
+static inline bool nfsd4_spo_must_allow(struct svc_rqst *rqstp)
+{
+ return false;
+}
#endif
/*
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index a8919444c460..cfe7500d5847 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -59,14 +59,20 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry)
* the write call).
*/
static inline __be32
-nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, umode_t requested)
+nfsd_mode_check(struct svc_rqst *rqstp, struct dentry *dentry,
+ umode_t requested)
{
- mode &= S_IFMT;
+ umode_t mode = d_inode(dentry)->i_mode & S_IFMT;
if (requested == 0) /* the caller doesn't care */
return nfs_ok;
- if (mode == requested)
+ if (mode == requested) {
+ if (mode == S_IFDIR && !d_can_lookup(dentry)) {
+ WARN_ON_ONCE(1);
+ return nfserr_notdir;
+ }
return nfs_ok;
+ }
/*
* v4 has an error more specific than err_notdir which we should
* return in preference to err_notdir:
@@ -298,7 +304,7 @@ out:
* that it expects something not of the given type.
*
* @access is formed from the NFSD_MAY_* constants defined in
- * include/linux/nfsd/nfsd.h.
+ * fs/nfsd/vfs.h.
*/
__be32
fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
@@ -340,7 +346,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
if (error)
goto out;
- error = nfsd_mode_check(rqstp, d_inode(dentry)->i_mode, type);
+ error = nfsd_mode_check(rqstp, dentry, type);
if (error)
goto out;
@@ -533,7 +539,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry,
* the reference filehandle (if it is in the same export)
* or the export options.
*/
- set_version_and_fsid_type(fhp, exp, ref_fh);
+ set_version_and_fsid_type(fhp, exp, ref_fh);
if (ref_fh == fhp)
fh_put(ref_fh);
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 4cd78ef4c95c..e9214768cde9 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -251,9 +251,6 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
/* Check for NFSD_MAY_WRITE in nfsd_create if necessary */
- nfserr = nfserr_acces;
- if (!argp->len)
- goto done;
nfserr = nfserr_exist;
if (isdotent(argp->name, argp->len))
goto done;
@@ -362,8 +359,8 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp,
nfserr = 0;
if (!inode) {
/* File doesn't exist. Create it and set attrs */
- nfserr = nfsd_create(rqstp, dirfhp, argp->name, argp->len,
- attr, type, rdev, newfhp);
+ nfserr = nfsd_create_locked(rqstp, dirfhp, argp->name,
+ argp->len, attr, type, rdev, newfhp);
} else if (type == S_IFREG) {
dprintk("nfsd: existing %s, valid=%x, size=%ld\n",
argp->name, attr->ia_valid, (long) attr->ia_size);
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 79d964aa8079..41b468a6a90f 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -240,7 +240,7 @@ nfssvc_decode_diropargs(struct svc_rqst *rqstp, __be32 *p,
|| !(p = decode_filename(p, &args->name, &args->len)))
return 0;
- return xdr_argsize_check(rqstp, p);
+ return xdr_argsize_check(rqstp, p);
}
int
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
index 7d073b9b1553..0c2a716e8741 100644
--- a/fs/nfsd/pnfs.h
+++ b/fs/nfsd/pnfs.h
@@ -21,6 +21,7 @@ struct nfsd4_layout_ops {
u32 notify_types;
__be32 (*proc_getdeviceinfo)(struct super_block *sb,
+ struct svc_rqst *rqstp,
struct nfs4_client *clp,
struct nfsd4_getdeviceinfo *gdevp);
__be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr,
@@ -44,6 +45,9 @@ extern const struct nfsd4_layout_ops bl_layout_ops;
#ifdef CONFIG_NFSD_SCSILAYOUT
extern const struct nfsd4_layout_ops scsi_layout_ops;
#endif
+#ifdef CONFIG_NFSD_FLEXFILELAYOUT
+extern const struct nfsd4_layout_ops ff_layout_ops;
+#endif
__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
struct nfsd4_compound_state *cstate, stateid_t *stateid,
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 64053eadeb81..b95adf9a1595 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -345,6 +345,7 @@ struct nfs4_client {
u32 cl_exchange_flags;
/* number of rpc's in progress over an associated session: */
atomic_t cl_refcount;
+ struct nfs4_op_map cl_spo_must_allow;
/* for nfs41 callbacks */
/* We currently support a single back channel with a single slot */
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index cd90878a76aa..d97338bb6a39 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -84,7 +84,6 @@ static int nfsd_proc_open(struct inode *inode, struct file *file)
}
static const struct file_operations nfsd_proc_fops = {
- .owner = THIS_MODULE,
.open = nfsd_proc_open,
.read = seq_read,
.llseek = seq_lseek,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 6fbd81ecb410..ba944123167b 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1135,96 +1135,37 @@ nfsd_check_ignore_resizing(struct iattr *iap)
iap->ia_valid &= ~ATTR_SIZE;
}
-/*
- * Create a file (regular, directory, device, fifo); UNIX sockets
- * not yet implemented.
- * If the response fh has been verified, the parent directory should
- * already be locked. Note that the parent directory is left locked.
- *
- * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp
- */
+/* The parent directory should already be locked: */
__be32
-nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
+nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
char *fname, int flen, struct iattr *iap,
int type, dev_t rdev, struct svc_fh *resfhp)
{
- struct dentry *dentry, *dchild = NULL;
+ struct dentry *dentry, *dchild;
struct inode *dirp;
__be32 err;
__be32 err2;
int host_err;
- err = nfserr_perm;
- if (!flen)
- goto out;
- err = nfserr_exist;
- if (isdotent(fname, flen))
- goto out;
-
- err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE);
- if (err)
- goto out;
-
dentry = fhp->fh_dentry;
dirp = d_inode(dentry);
- err = nfserr_notdir;
- if (!dirp->i_op->lookup)
- goto out;
- /*
- * Check whether the response file handle has been verified yet.
- * If it has, the parent directory should already be locked.
- */
- if (!resfhp->fh_dentry) {
- host_err = fh_want_write(fhp);
- if (host_err)
- goto out_nfserr;
-
- /* called from nfsd_proc_mkdir, or possibly nfsd3_proc_create */
- fh_lock_nested(fhp, I_MUTEX_PARENT);
- dchild = lookup_one_len(fname, dentry, flen);
- host_err = PTR_ERR(dchild);
- if (IS_ERR(dchild))
- goto out_nfserr;
- err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
- if (err)
- goto out;
- } else {
- /* called from nfsd_proc_create */
- dchild = dget(resfhp->fh_dentry);
- if (!fhp->fh_locked) {
- /* not actually possible */
- printk(KERN_ERR
- "nfsd_create: parent %pd2 not locked!\n",
+ dchild = dget(resfhp->fh_dentry);
+ if (!fhp->fh_locked) {
+ WARN_ONCE(1, "nfsd_create: parent %pd2 not locked!\n",
dentry);
- err = nfserr_io;
- goto out;
- }
- }
- /*
- * Make sure the child dentry is still negative ...
- */
- err = nfserr_exist;
- if (d_really_is_positive(dchild)) {
- dprintk("nfsd_create: dentry %pd/%pd not negative!\n",
- dentry, dchild);
- goto out;
+ err = nfserr_io;
+ goto out;
}
+ err = nfsd_permission(rqstp, fhp->fh_export, dentry, NFSD_MAY_CREATE);
+ if (err)
+ goto out;
+
if (!(iap->ia_valid & ATTR_MODE))
iap->ia_mode = 0;
iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type;
- err = nfserr_inval;
- if (!S_ISREG(type) && !S_ISDIR(type) && !special_file(type)) {
- printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n",
- type);
- goto out;
- }
-
- /*
- * Get the dir op function pointer.
- */
err = 0;
host_err = 0;
switch (type) {
@@ -1242,6 +1183,10 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
case S_IFSOCK:
host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
break;
+ default:
+ printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n",
+ type);
+ host_err = -EINVAL;
}
if (host_err < 0)
goto out_nfserr;
@@ -1251,7 +1196,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
/*
* nfsd_create_setattr already committed the child. Transactional
* filesystems had a chance to commit changes for both parent and
- * child * simultaneously making the following commit_metadata a
+ * child simultaneously making the following commit_metadata a
* noop.
*/
err2 = nfserrno(commit_metadata(fhp));
@@ -1263,8 +1208,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
if (!err)
err = fh_update(resfhp);
out:
- if (dchild && !IS_ERR(dchild))
- dput(dchild);
+ dput(dchild);
return err;
out_nfserr:
@@ -1272,6 +1216,50 @@ out_nfserr:
goto out;
}
+/*
+ * Create a filesystem object (regular, directory, special).
+ * Note that the parent directory is left locked.
+ *
+ * N.B. Every call to nfsd_create needs an fh_put for _both_ fhp and resfhp
+ */
+__be32
+nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ char *fname, int flen, struct iattr *iap,
+ int type, dev_t rdev, struct svc_fh *resfhp)
+{
+ struct dentry *dentry, *dchild = NULL;
+ struct inode *dirp;
+ __be32 err;
+ int host_err;
+
+ if (isdotent(fname, flen))
+ return nfserr_exist;
+
+ err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_NOP);
+ if (err)
+ return err;
+
+ dentry = fhp->fh_dentry;
+ dirp = d_inode(dentry);
+
+ host_err = fh_want_write(fhp);
+ if (host_err)
+ return nfserrno(host_err);
+
+ fh_lock_nested(fhp, I_MUTEX_PARENT);
+ dchild = lookup_one_len(fname, dentry, flen);
+ host_err = PTR_ERR(dchild);
+ if (IS_ERR(dchild))
+ return nfserrno(host_err);
+ err = fh_compose(resfhp, fhp->fh_export, dchild, fhp);
+ if (err) {
+ dput(dchild);
+ return err;
+ }
+ return nfsd_create_locked(rqstp, fhp, fname, flen, iap, type,
+ rdev, resfhp);
+}
+
#ifdef CONFIG_NFSD_V3
/*
@@ -1304,12 +1292,6 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
dentry = fhp->fh_dentry;
dirp = d_inode(dentry);
- /* Get all the sanity checks out of the way before
- * we lock the parent. */
- err = nfserr_notdir;
- if (!dirp->i_op->lookup)
- goto out;
-
host_err = fh_want_write(fhp);
if (host_err)
goto out_nfserr;
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 2d573ec057f8..3cbb1b33777b 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -59,6 +59,9 @@ __be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *,
__be32 nfsd4_clone_file_range(struct file *, u64, struct file *,
u64, u64);
#endif /* CONFIG_NFSD_V4 */
+__be32 nfsd_create_locked(struct svc_rqst *, struct svc_fh *,
+ char *name, int len, struct iattr *attrs,
+ int type, dev_t rdev, struct svc_fh *res);
__be32 nfsd_create(struct svc_rqst *, struct svc_fh *,
char *name, int len, struct iattr *attrs,
int type, dev_t rdev, struct svc_fh *res);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index d9554813e58a..beea0c5edc51 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -59,6 +59,7 @@ struct nfsd4_compound_state {
struct nfsd4_session *session;
struct nfsd4_slot *slot;
int data_offset;
+ bool spo_must_allowed;
size_t iovlen;
u32 minorversion;
__be32 status;
@@ -403,6 +404,8 @@ struct nfsd4_exchange_id {
clientid_t clientid;
u32 seqid;
int spa_how;
+ u32 spo_must_enforce[3];
+ u32 spo_must_allow[3];
};
struct nfsd4_sequence {
@@ -654,6 +657,8 @@ set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
}
+
+bool nfsd4_mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp);
int nfs4svc_encode_voidres(struct svc_rqst *, __be32 *, void *);
int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *,
struct nfsd4_compoundargs *);
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 1a85d94f5b25..2c90e285d7c6 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -622,10 +622,10 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
lock = nilfs_mdt_bgl_lock(inode, group);
if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
- nilfs_warning(inode->i_sb, __func__,
- "entry number %llu already freed: ino=%lu",
- (unsigned long long)req->pr_entry_nr,
- (unsigned long)inode->i_ino);
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "%s (ino=%lu): entry number %llu already freed",
+ __func__, inode->i_ino,
+ (unsigned long long)req->pr_entry_nr);
else
nilfs_palloc_group_desc_add_entries(desc, lock, 1);
@@ -663,10 +663,10 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
lock = nilfs_mdt_bgl_lock(inode, group);
if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
- nilfs_warning(inode->i_sb, __func__,
- "entry number %llu already freed: ino=%lu",
- (unsigned long long)req->pr_entry_nr,
- (unsigned long)inode->i_ino);
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "%s (ino=%lu): entry number %llu already freed",
+ __func__, inode->i_ino,
+ (unsigned long long)req->pr_entry_nr);
else
nilfs_palloc_group_desc_add_entries(desc, lock, 1);
@@ -772,10 +772,10 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
do {
if (!nilfs_clear_bit_atomic(lock, group_offset,
bitmap)) {
- nilfs_warning(inode->i_sb, __func__,
- "entry number %llu already freed: ino=%lu",
- (unsigned long long)entry_nrs[j],
- (unsigned long)inode->i_ino);
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "%s (ino=%lu): entry number %llu already freed",
+ __func__, inode->i_ino,
+ (unsigned long long)entry_nrs[j]);
} else {
n++;
}
@@ -816,12 +816,11 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
for (k = 0; k < nempties; k++) {
ret = nilfs_palloc_delete_entry_block(inode,
last_nrs[k]);
- if (ret && ret != -ENOENT) {
- nilfs_warning(inode->i_sb, __func__,
- "failed to delete block of entry %llu: ino=%lu, err=%d",
- (unsigned long long)last_nrs[k],
- (unsigned long)inode->i_ino, ret);
- }
+ if (ret && ret != -ENOENT)
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "error %d deleting block that object (entry=%llu, ino=%lu) belongs to",
+ ret, (unsigned long long)last_nrs[k],
+ inode->i_ino);
}
desc_kaddr = kmap_atomic(desc_bh->b_page);
@@ -835,12 +834,10 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
if (nfree == nilfs_palloc_entries_per_group(inode)) {
ret = nilfs_palloc_delete_bitmap_block(inode, group);
- if (ret && ret != -ENOENT) {
- nilfs_warning(inode->i_sb, __func__,
- "failed to delete bitmap block of group %lu: ino=%lu, err=%d",
- group,
- (unsigned long)inode->i_ino, ret);
- }
+ if (ret && ret != -ENOENT)
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "error %d deleting bitmap block of group=%lu, ino=%lu",
+ ret, group, inode->i_ino);
}
}
return 0;
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index f2a7877e0c8c..01fb1831ca25 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -41,8 +41,8 @@ static int nilfs_bmap_convert_error(struct nilfs_bmap *bmap,
struct inode *inode = bmap->b_inode;
if (err == -EINVAL) {
- nilfs_error(inode->i_sb, fname,
- "broken bmap (inode number=%lu)", inode->i_ino);
+ __nilfs_error(inode->i_sb, fname,
+ "broken bmap (inode number=%lu)", inode->i_ino);
err = -EIO;
}
return err;
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
index b6a4c8f93ac8..2b6ffbe5997a 100644
--- a/fs/nilfs2/bmap.h
+++ b/fs/nilfs2/bmap.h
@@ -22,7 +22,7 @@
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
-#include <linux/nilfs2_fs.h>
+#include <linux/nilfs2_ondisk.h> /* nilfs_binfo, nilfs_inode, etc */
#include "alloc.h"
#include "dat.h"
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 4cca998ec7a0..d5c23da43513 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -41,7 +41,7 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
struct inode *inode = NILFS_BTNC_I(btnc);
struct buffer_head *bh;
- bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
+ bh = nilfs_grab_buffer(inode, btnc, blocknr, BIT(BH_NILFS_Node));
if (unlikely(!bh))
return NULL;
@@ -70,7 +70,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
struct page *page;
int err;
- bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
+ bh = nilfs_grab_buffer(inode, btnc, blocknr, BIT(BH_NILFS_Node));
if (unlikely(!bh))
return -ENOMEM;
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 982d1e3df3a5..2e315f9f2e51 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -339,12 +339,14 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node,
* nilfs_btree_node_broken - verify consistency of btree node
* @node: btree node block to be examined
* @size: node size (in bytes)
+ * @inode: host inode of btree
* @blocknr: block number
*
* Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned.
*/
static int nilfs_btree_node_broken(const struct nilfs_btree_node *node,
- size_t size, sector_t blocknr)
+ size_t size, struct inode *inode,
+ sector_t blocknr)
{
int level, flags, nchildren;
int ret = 0;
@@ -358,9 +360,10 @@ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node,
(flags & NILFS_BTREE_NODE_ROOT) ||
nchildren < 0 ||
nchildren > NILFS_BTREE_NODE_NCHILDREN_MAX(size))) {
- printk(KERN_CRIT "NILFS: bad btree node (blocknr=%llu): "
- "level = %d, flags = 0x%x, nchildren = %d\n",
- (unsigned long long)blocknr, level, flags, nchildren);
+ nilfs_msg(inode->i_sb, KERN_CRIT,
+ "bad btree node (ino=%lu, blocknr=%llu): level = %d, flags = 0x%x, nchildren = %d",
+ inode->i_ino, (unsigned long long)blocknr, level,
+ flags, nchildren);
ret = 1;
}
return ret;
@@ -369,12 +372,12 @@ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node,
/**
* nilfs_btree_root_broken - verify consistency of btree root node
* @node: btree root node to be examined
- * @ino: inode number
+ * @inode: host inode of btree
*
* Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned.
*/
static int nilfs_btree_root_broken(const struct nilfs_btree_node *node,
- unsigned long ino)
+ struct inode *inode)
{
int level, flags, nchildren;
int ret = 0;
@@ -387,8 +390,9 @@ static int nilfs_btree_root_broken(const struct nilfs_btree_node *node,
level >= NILFS_BTREE_LEVEL_MAX ||
nchildren < 0 ||
nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) {
- pr_crit("NILFS: bad btree root (inode number=%lu): level = %d, flags = 0x%x, nchildren = %d\n",
- ino, level, flags, nchildren);
+ nilfs_msg(inode->i_sb, KERN_CRIT,
+ "bad btree root (ino=%lu): level = %d, flags = 0x%x, nchildren = %d",
+ inode->i_ino, level, flags, nchildren);
ret = 1;
}
return ret;
@@ -396,13 +400,15 @@ static int nilfs_btree_root_broken(const struct nilfs_btree_node *node,
int nilfs_btree_broken_node_block(struct buffer_head *bh)
{
+ struct inode *inode;
int ret;
if (buffer_nilfs_checked(bh))
return 0;
+ inode = bh->b_page->mapping->host;
ret = nilfs_btree_node_broken((struct nilfs_btree_node *)bh->b_data,
- bh->b_size, bh->b_blocknr);
+ bh->b_size, inode, bh->b_blocknr);
if (likely(!ret))
set_buffer_nilfs_checked(bh);
return ret;
@@ -448,13 +454,15 @@ nilfs_btree_get_node(const struct nilfs_bmap *btree,
return node;
}
-static int
-nilfs_btree_bad_node(struct nilfs_btree_node *node, int level)
+static int nilfs_btree_bad_node(const struct nilfs_bmap *btree,
+ struct nilfs_btree_node *node, int level)
{
if (unlikely(nilfs_btree_node_get_level(node) != level)) {
dump_stack();
- printk(KERN_CRIT "NILFS: btree level mismatch: %d != %d\n",
- nilfs_btree_node_get_level(node), level);
+ nilfs_msg(btree->b_inode->i_sb, KERN_CRIT,
+ "btree level mismatch (ino=%lu): %d != %d",
+ btree->b_inode->i_ino,
+ nilfs_btree_node_get_level(node), level);
return 1;
}
return 0;
@@ -509,6 +517,9 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
out_no_wait:
if (!buffer_uptodate(bh)) {
+ nilfs_msg(btree->b_inode->i_sb, KERN_ERR,
+ "I/O error reading b-tree node block (ino=%lu, blocknr=%llu)",
+ btree->b_inode->i_ino, (unsigned long long)ptr);
brelse(bh);
return -EIO;
}
@@ -568,7 +579,7 @@ static int nilfs_btree_do_lookup(const struct nilfs_bmap *btree,
return ret;
node = nilfs_btree_get_nonroot_node(path, level);
- if (nilfs_btree_bad_node(node, level))
+ if (nilfs_btree_bad_node(btree, node, level))
return -EINVAL;
if (!found)
found = nilfs_btree_node_lookup(node, key, &index);
@@ -616,7 +627,7 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_bmap *btree,
if (ret < 0)
return ret;
node = nilfs_btree_get_nonroot_node(path, level);
- if (nilfs_btree_bad_node(node, level))
+ if (nilfs_btree_bad_node(btree, node, level))
return -EINVAL;
index = nilfs_btree_node_get_nchildren(node) - 1;
ptr = nilfs_btree_node_get_ptr(node, index, ncmax);
@@ -2072,8 +2083,10 @@ static int nilfs_btree_propagate(struct nilfs_bmap *btree,
ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0);
if (ret < 0) {
if (unlikely(ret == -ENOENT))
- printk(KERN_CRIT "%s: key = %llu, level == %d\n",
- __func__, (unsigned long long)key, level);
+ nilfs_msg(btree->b_inode->i_sb, KERN_CRIT,
+ "writing node/leaf block does not appear in b-tree (ino=%lu) at key=%llu, level=%d",
+ btree->b_inode->i_ino,
+ (unsigned long long)key, level);
goto out;
}
@@ -2110,12 +2123,11 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree,
if (level < NILFS_BTREE_LEVEL_NODE_MIN ||
level >= NILFS_BTREE_LEVEL_MAX) {
dump_stack();
- printk(KERN_WARNING
- "%s: invalid btree level: %d (key=%llu, ino=%lu, "
- "blocknr=%llu)\n",
- __func__, level, (unsigned long long)key,
- NILFS_BMAP_I(btree)->vfs_inode.i_ino,
- (unsigned long long)bh->b_blocknr);
+ nilfs_msg(btree->b_inode->i_sb, KERN_WARNING,
+ "invalid btree level: %d (key=%llu, ino=%lu, blocknr=%llu)",
+ level, (unsigned long long)key,
+ btree->b_inode->i_ino,
+ (unsigned long long)bh->b_blocknr);
return;
}
@@ -2394,8 +2406,7 @@ int nilfs_btree_init(struct nilfs_bmap *bmap)
__nilfs_btree_init(bmap);
- if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap),
- bmap->b_inode->i_ino))
+ if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap), bmap->b_inode))
ret = -EIO;
return ret;
}
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index df1a25faa83b..2184e47fa4bf 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -22,7 +22,7 @@
#include <linux/types.h>
#include <linux/buffer_head.h>
#include <linux/list.h>
-#include <linux/nilfs2_fs.h>
+#include <linux/nilfs2_ondisk.h> /* nilfs_btree_node */
#include "btnode.h"
#include "bmap.h"
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 8a3d3b65af3f..a15a1601e931 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -21,7 +21,6 @@
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/errno.h>
-#include <linux/nilfs2_fs.h>
#include "mdt.h"
#include "cpfile.h"
@@ -332,9 +331,9 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
int ret, ncps, nicps, nss, count, i;
if (unlikely(start == 0 || start > end)) {
- printk(KERN_ERR "%s: invalid range of checkpoint numbers: "
- "[%llu, %llu)\n", __func__,
- (unsigned long long)start, (unsigned long long)end);
+ nilfs_msg(cpfile->i_sb, KERN_ERR,
+ "cannot delete checkpoints: invalid range [%llu, %llu)",
+ (unsigned long long)start, (unsigned long long)end);
return -EINVAL;
}
@@ -386,9 +385,9 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
cpfile, cno);
if (ret == 0)
continue;
- printk(KERN_ERR
- "%s: cannot delete block\n",
- __func__);
+ nilfs_msg(cpfile->i_sb, KERN_ERR,
+ "error %d deleting checkpoint block",
+ ret);
break;
}
}
@@ -991,14 +990,12 @@ int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
int err;
if (cpsize > sb->s_blocksize) {
- printk(KERN_ERR
- "NILFS: too large checkpoint size: %zu bytes.\n",
- cpsize);
+ nilfs_msg(sb, KERN_ERR,
+ "too large checkpoint size: %zu bytes", cpsize);
return -EINVAL;
} else if (cpsize < NILFS_MIN_CHECKPOINT_SIZE) {
- printk(KERN_ERR
- "NILFS: too small checkpoint size: %zu bytes.\n",
- cpsize);
+ nilfs_msg(sb, KERN_ERR,
+ "too small checkpoint size: %zu bytes", cpsize);
return -EINVAL;
}
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index 0249744ae234..6eca972f9673 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -21,7 +21,8 @@
#include <linux/fs.h>
#include <linux/buffer_head.h>
-#include <linux/nilfs2_fs.h>
+#include <linux/nilfs2_api.h> /* nilfs_cpstat */
+#include <linux/nilfs2_ondisk.h> /* nilfs_inode, nilfs_checkpoint */
int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int,
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 7367610ea807..dffedb2f8817 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -349,10 +349,11 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
kaddr = kmap_atomic(entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
if (unlikely(entry->de_blocknr == cpu_to_le64(0))) {
- printk(KERN_CRIT "%s: vbn = %llu, [%llu, %llu)\n", __func__,
- (unsigned long long)vblocknr,
- (unsigned long long)le64_to_cpu(entry->de_start),
- (unsigned long long)le64_to_cpu(entry->de_end));
+ nilfs_msg(dat->i_sb, KERN_CRIT,
+ "%s: invalid vblocknr = %llu, [%llu, %llu)",
+ __func__, (unsigned long long)vblocknr,
+ (unsigned long long)le64_to_cpu(entry->de_start),
+ (unsigned long long)le64_to_cpu(entry->de_end));
kunmap_atomic(kaddr);
brelse(entry_bh);
return -EINVAL;
@@ -479,14 +480,12 @@ int nilfs_dat_read(struct super_block *sb, size_t entry_size,
int err;
if (entry_size > sb->s_blocksize) {
- printk(KERN_ERR
- "NILFS: too large DAT entry size: %zu bytes.\n",
- entry_size);
+ nilfs_msg(sb, KERN_ERR, "too large DAT entry size: %zu bytes",
+ entry_size);
return -EINVAL;
} else if (entry_size < NILFS_MIN_DAT_ENTRY_SIZE) {
- printk(KERN_ERR
- "NILFS: too small DAT entry size: %zu bytes.\n",
- entry_size);
+ nilfs_msg(sb, KERN_ERR, "too small DAT entry size: %zu bytes",
+ entry_size);
return -EINVAL;
}
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
index abbfdabcabea..57dc6cf466d0 100644
--- a/fs/nilfs2/dat.h
+++ b/fs/nilfs2/dat.h
@@ -22,6 +22,7 @@
#include <linux/types.h>
#include <linux/buffer_head.h>
#include <linux/fs.h>
+#include <linux/nilfs2_ondisk.h> /* nilfs_inode, nilfs_checkpoint */
struct nilfs_palloc_req;
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index e506f4f7120a..908ebbf0ac7e 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -42,6 +42,28 @@
#include "nilfs.h"
#include "page.h"
+static inline unsigned int nilfs_rec_len_from_disk(__le16 dlen)
+{
+ unsigned int len = le16_to_cpu(dlen);
+
+#if (PAGE_SIZE >= 65536)
+ if (len == NILFS_MAX_REC_LEN)
+ return 1 << 16;
+#endif
+ return len;
+}
+
+static inline __le16 nilfs_rec_len_to_disk(unsigned int len)
+{
+#if (PAGE_SIZE >= 65536)
+ if (len == (1 << 16))
+ return cpu_to_le16(NILFS_MAX_REC_LEN);
+
+ BUG_ON(len > (1 << 16));
+#endif
+ return cpu_to_le16(len);
+}
+
/*
* nilfs uses block-sized chunks. Arguably, sector-sized ones would be
* more robust, but we have what we have
@@ -140,10 +162,9 @@ out:
/* Too bad, we had an error */
Ebadsize:
- nilfs_error(sb, "nilfs_check_page",
+ nilfs_error(sb,
"size of directory #%lu is not a multiple of chunk size",
- dir->i_ino
- );
+ dir->i_ino);
goto fail;
Eshort:
error = "rec_len is smaller than minimal";
@@ -157,19 +178,18 @@ Enamelen:
Espan:
error = "directory entry across blocks";
bad_entry:
- nilfs_error(sb, "nilfs_check_page", "bad entry in directory #%lu: %s - "
- "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
- dir->i_ino, error, (page->index<<PAGE_SHIFT)+offs,
- (unsigned long) le64_to_cpu(p->inode),
+ nilfs_error(sb,
+ "bad entry in directory #%lu: %s - offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
+ dir->i_ino, error, (page->index << PAGE_SHIFT) + offs,
+ (unsigned long)le64_to_cpu(p->inode),
rec_len, p->name_len);
goto fail;
Eend:
p = (struct nilfs_dir_entry *)(kaddr + offs);
- nilfs_error(sb, "nilfs_check_page",
- "entry in directory #%lu spans the page boundary"
- "offset=%lu, inode=%lu",
- dir->i_ino, (page->index<<PAGE_SHIFT)+offs,
- (unsigned long) le64_to_cpu(p->inode));
+ nilfs_error(sb,
+ "entry in directory #%lu spans the page boundary offset=%lu, inode=%lu",
+ dir->i_ino, (page->index << PAGE_SHIFT) + offs,
+ (unsigned long)le64_to_cpu(p->inode));
fail:
SetPageError(page);
return false;
@@ -267,8 +287,7 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx)
struct page *page = nilfs_get_page(inode, n);
if (IS_ERR(page)) {
- nilfs_error(sb, __func__, "bad page in #%lu",
- inode->i_ino);
+ nilfs_error(sb, "bad page in #%lu", inode->i_ino);
ctx->pos += PAGE_SIZE - offset;
return -EIO;
}
@@ -278,8 +297,7 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx)
NILFS_DIR_REC_LEN(1);
for ( ; (char *)de <= limit; de = nilfs_next_entry(de)) {
if (de->rec_len == 0) {
- nilfs_error(sb, __func__,
- "zero-length directory entry");
+ nilfs_error(sb, "zero-length directory entry");
nilfs_put_page(page);
return -EIO;
}
@@ -345,7 +363,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
kaddr += nilfs_last_byte(dir, n) - reclen;
while ((char *) de <= kaddr) {
if (de->rec_len == 0) {
- nilfs_error(dir->i_sb, __func__,
+ nilfs_error(dir->i_sb,
"zero-length directory entry");
nilfs_put_page(page);
goto out;
@@ -360,7 +378,7 @@ nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
n = 0;
/* next page is past the blocks we've got */
if (unlikely(n > (dir->i_blocks >> (PAGE_SHIFT - 9)))) {
- nilfs_error(dir->i_sb, __func__,
+ nilfs_error(dir->i_sb,
"dir %lu size %lld exceeds block count %llu",
dir->i_ino, dir->i_size,
(unsigned long long)dir->i_blocks);
@@ -469,7 +487,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
goto got_it;
}
if (de->rec_len == 0) {
- nilfs_error(dir->i_sb, __func__,
+ nilfs_error(dir->i_sb,
"zero-length directory entry");
err = -EIO;
goto out_unlock;
@@ -541,7 +559,7 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
while ((char *)de < (char *)dir) {
if (de->rec_len == 0) {
- nilfs_error(inode->i_sb, __func__,
+ nilfs_error(inode->i_sb,
"zero-length directory entry");
err = -EIO;
goto out;
@@ -628,7 +646,7 @@ int nilfs_empty_dir(struct inode *inode)
while ((char *)de <= kaddr) {
if (de->rec_len == 0) {
- nilfs_error(inode->i_sb, __func__,
+ nilfs_error(inode->i_sb,
"zero-length directory entry (kaddr=%p, de=%p)",
kaddr, de);
goto not_empty;
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 251a44928405..96e3ed0d9652 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -337,14 +337,16 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap,
key = nilfs_bmap_data_get_key(bmap, *bh);
if (unlikely(key > NILFS_DIRECT_KEY_MAX)) {
- printk(KERN_CRIT "%s: invalid key: %llu\n", __func__,
- (unsigned long long)key);
+ nilfs_msg(bmap->b_inode->i_sb, KERN_CRIT,
+ "%s (ino=%lu): invalid key: %llu", __func__,
+ bmap->b_inode->i_ino, (unsigned long long)key);
return -EINVAL;
}
ptr = nilfs_direct_get_ptr(bmap, key);
if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) {
- printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__,
- (unsigned long long)ptr);
+ nilfs_msg(bmap->b_inode->i_sb, KERN_CRIT,
+ "%s (ino=%lu): invalid pointer: %llu", __func__,
+ bmap->b_inode->i_ino, (unsigned long long)ptr);
return -EINVAL;
}
diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h
index 3015a6e78724..cfe85e848bba 100644
--- a/fs/nilfs2/direct.h
+++ b/fs/nilfs2/direct.h
@@ -24,16 +24,6 @@
#include "bmap.h"
-/**
- * struct nilfs_direct_node - direct node
- * @dn_flags: flags
- * @dn_pad: padding
- */
-struct nilfs_direct_node {
- __u8 dn_flags;
- __u8 pad[7];
-};
-
#define NILFS_DIRECT_NBLOCKS (NILFS_BMAP_SIZE / sizeof(__le64) - 1)
#define NILFS_DIRECT_KEY_MIN 0
#define NILFS_DIRECT_KEY_MAX (NILFS_DIRECT_NBLOCKS - 1)
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index e9148f94d696..853a831dcde0 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -148,8 +148,15 @@ int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
{
wait_on_buffer(bh);
- if (!buffer_uptodate(bh))
+ if (!buffer_uptodate(bh)) {
+ struct inode *inode = bh->b_page->mapping->host;
+
+ nilfs_msg(inode->i_sb, KERN_ERR,
+ "I/O error reading %s block for GC (ino=%lu, vblocknr=%llu)",
+ buffer_nilfs_node(bh) ? "node" : "data",
+ inode->i_ino, (unsigned long long)bh->b_blocknr);
return -EIO;
+ }
if (buffer_dirty(bh))
return -EEXIST;
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 1d2b1805327a..b8fa45c20c63 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -145,15 +145,14 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
int err;
if (unlikely(!NILFS_VALID_INODE(sb, ino))) {
- nilfs_error(sb, __func__, "bad inode number: %lu",
- (unsigned long) ino);
+ nilfs_error(sb, "bad inode number: %lu", (unsigned long)ino);
return -EINVAL;
}
err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh);
if (unlikely(err))
- nilfs_warning(sb, __func__, "unable to read inode: %lu",
- (unsigned long) ino);
+ nilfs_msg(sb, KERN_WARNING, "error %d reading inode: ino=%lu",
+ err, (unsigned long)ino);
return err;
}
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index 23ad2f091e76..188b94fe0ec5 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -23,7 +23,6 @@
#include <linux/fs.h>
#include <linux/buffer_head.h>
-#include <linux/nilfs2_fs.h>
#include "mdt.h"
#include "alloc.h"
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index a0ebdb17e912..af04f553d7c9 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -112,13 +112,10 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
* However, the page having this block must
* be locked in this case.
*/
- printk(KERN_WARNING
- "nilfs_get_block: a race condition "
- "while inserting a data block. "
- "(inode number=%lu, file block "
- "offset=%llu)\n",
- inode->i_ino,
- (unsigned long long)blkoff);
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "%s (ino=%lu): a race condition while inserting a data block at offset=%llu",
+ __func__, inode->i_ino,
+ (unsigned long long)blkoff);
err = 0;
}
nilfs_transaction_abort(inode->i_sb);
@@ -359,7 +356,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
root = NILFS_I(dir)->i_root;
ii = NILFS_I(inode);
- ii->i_state = 1 << NILFS_I_NEW;
+ ii->i_state = BIT(NILFS_I_NEW);
ii->i_root = root;
err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh);
@@ -558,7 +555,7 @@ static int nilfs_iget_set(struct inode *inode, void *opaque)
inode->i_ino = args->ino;
if (args->for_gc) {
- NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE;
+ NILFS_I(inode)->i_state = BIT(NILFS_I_GCINODE);
NILFS_I(inode)->i_cno = args->cno;
NILFS_I(inode)->i_root = NULL;
} else {
@@ -726,9 +723,9 @@ repeat:
goto repeat;
failed:
- nilfs_warning(ii->vfs_inode.i_sb, __func__,
- "failed to truncate bmap (ino=%lu, err=%d)",
- ii->vfs_inode.i_ino, ret);
+ nilfs_msg(ii->vfs_inode.i_sb, KERN_WARNING,
+ "error %d truncating bmap (ino=%lu)", ret,
+ ii->vfs_inode.i_ino);
}
void nilfs_truncate(struct inode *inode)
@@ -939,9 +936,9 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty)
* This will happen when somebody is freeing
* this inode.
*/
- nilfs_warning(inode->i_sb, __func__,
- "cannot get inode (ino=%lu)",
- inode->i_ino);
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "cannot set file dirty (ino=%lu): the file is being freed",
+ inode->i_ino);
spin_unlock(&nilfs->ns_inode_lock);
return -EINVAL; /*
* NILFS_I_DIRTY may remain for
@@ -962,8 +959,9 @@ int __nilfs_mark_inode_dirty(struct inode *inode, int flags)
err = nilfs_load_inode_block(inode, &ibh);
if (unlikely(err)) {
- nilfs_warning(inode->i_sb, __func__,
- "failed to reget inode block.");
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "cannot mark inode dirty (ino=%lu): error %d loading inode block",
+ inode->i_ino, err);
return err;
}
nilfs_update_inode(inode, ibh, flags);
@@ -989,8 +987,8 @@ void nilfs_dirty_inode(struct inode *inode, int flags)
struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
if (is_bad_inode(inode)) {
- nilfs_warning(inode->i_sb, __func__,
- "tried to mark bad_inode dirty. ignored.");
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "tried to mark bad_inode dirty. ignored.");
dump_stack();
return;
}
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 358b57e2cdf9..f1d7989459fd 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -25,7 +25,6 @@
#include <linux/compat.h> /* compat_ptr() */
#include <linux/mount.h> /* mnt_want_write_file(), mnt_drop_write_file() */
#include <linux/buffer_head.h>
-#include <linux/nilfs2_fs.h>
#include "nilfs.h"
#include "segment.h"
#include "bmap.h"
@@ -584,27 +583,25 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode,
if (unlikely(ret < 0)) {
if (ret == -ENOENT)
- printk(KERN_CRIT
- "%s: invalid virtual block address (%s): "
- "ino=%llu, cno=%llu, offset=%llu, "
- "blocknr=%llu, vblocknr=%llu\n",
- __func__, vdesc->vd_flags ? "node" : "data",
- (unsigned long long)vdesc->vd_ino,
- (unsigned long long)vdesc->vd_cno,
- (unsigned long long)vdesc->vd_offset,
- (unsigned long long)vdesc->vd_blocknr,
- (unsigned long long)vdesc->vd_vblocknr);
+ nilfs_msg(inode->i_sb, KERN_CRIT,
+ "%s: invalid virtual block address (%s): ino=%llu, cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu",
+ __func__, vdesc->vd_flags ? "node" : "data",
+ (unsigned long long)vdesc->vd_ino,
+ (unsigned long long)vdesc->vd_cno,
+ (unsigned long long)vdesc->vd_offset,
+ (unsigned long long)vdesc->vd_blocknr,
+ (unsigned long long)vdesc->vd_vblocknr);
return ret;
}
if (unlikely(!list_empty(&bh->b_assoc_buffers))) {
- printk(KERN_CRIT "%s: conflicting %s buffer: ino=%llu, "
- "cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu\n",
- __func__, vdesc->vd_flags ? "node" : "data",
- (unsigned long long)vdesc->vd_ino,
- (unsigned long long)vdesc->vd_cno,
- (unsigned long long)vdesc->vd_offset,
- (unsigned long long)vdesc->vd_blocknr,
- (unsigned long long)vdesc->vd_vblocknr);
+ nilfs_msg(inode->i_sb, KERN_CRIT,
+ "%s: conflicting %s buffer: ino=%llu, cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu",
+ __func__, vdesc->vd_flags ? "node" : "data",
+ (unsigned long long)vdesc->vd_ino,
+ (unsigned long long)vdesc->vd_cno,
+ (unsigned long long)vdesc->vd_offset,
+ (unsigned long long)vdesc->vd_blocknr,
+ (unsigned long long)vdesc->vd_vblocknr);
brelse(bh);
return -EEXIST;
}
@@ -854,8 +851,8 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
return 0;
failed:
- printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n",
- msg, ret);
+ nilfs_msg(nilfs->ns_sb, KERN_ERR, "error %d preparing GC: %s", ret,
+ msg);
return ret;
}
@@ -963,10 +960,11 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
}
ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]);
- if (ret < 0)
- printk(KERN_ERR "NILFS: GC failed during preparation: "
- "cannot read source blocks: err=%d\n", ret);
- else {
+ if (ret < 0) {
+ nilfs_msg(inode->i_sb, KERN_ERR,
+ "error %d preparing GC: cannot read source blocks",
+ ret);
+ } else {
if (nilfs_sb_need_update(nilfs))
set_nilfs_discontinued(nilfs);
ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 0d7b71fbeff8..d56d3a5bea88 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -207,8 +207,12 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
out_no_wait:
err = -EIO;
- if (!buffer_uptodate(first_bh))
+ if (!buffer_uptodate(first_bh)) {
+ nilfs_msg(inode->i_sb, KERN_ERR,
+ "I/O error reading meta-data file (ino=%lu, block-offset=%lu)",
+ inode->i_ino, block);
goto failed_bh;
+ }
out:
*out_bh = first_bh;
return 0;
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 1ec8ae5995a5..dbcf1dc93a51 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -283,9 +283,9 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
goto out;
if (!inode->i_nlink) {
- nilfs_warning(inode->i_sb, __func__,
- "deleting nonexistent file (%lu), %d",
- inode->i_ino, inode->i_nlink);
+ nilfs_msg(inode->i_sb, KERN_WARNING,
+ "deleting nonexistent file (ino=%lu), %d",
+ inode->i_ino, inode->i_nlink);
set_nlink(inode, 1);
}
err = nilfs_delete_entry(de, page);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index b1d48bc0532d..33f8c8fc96e8 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -23,7 +23,8 @@
#include <linux/buffer_head.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
-#include <linux/nilfs2_fs.h>
+#include <linux/nilfs2_api.h>
+#include <linux/nilfs2_ondisk.h>
#include "the_nilfs.h"
#include "bmap.h"
@@ -119,20 +120,19 @@ enum {
/*
* Macros to check inode numbers
*/
-#define NILFS_MDT_INO_BITS \
- ((unsigned int)(1 << NILFS_DAT_INO | 1 << NILFS_CPFILE_INO | \
- 1 << NILFS_SUFILE_INO | 1 << NILFS_IFILE_INO | \
- 1 << NILFS_ATIME_INO | 1 << NILFS_SKETCH_INO))
+#define NILFS_MDT_INO_BITS \
+ (BIT(NILFS_DAT_INO) | BIT(NILFS_CPFILE_INO) | \
+ BIT(NILFS_SUFILE_INO) | BIT(NILFS_IFILE_INO) | \
+ BIT(NILFS_ATIME_INO) | BIT(NILFS_SKETCH_INO))
-#define NILFS_SYS_INO_BITS \
- ((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS)
+#define NILFS_SYS_INO_BITS (BIT(NILFS_ROOT_INO) | NILFS_MDT_INO_BITS)
#define NILFS_FIRST_INO(sb) (((struct the_nilfs *)sb->s_fs_info)->ns_first_ino)
#define NILFS_MDT_INODE(sb, ino) \
- ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino))))
+ ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & BIT(ino)))
#define NILFS_VALID_INODE(sb, ino) \
- ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & (1 << (ino))))
+ ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & BIT(ino)))
/**
* struct nilfs_transaction_info: context information for synchronization
@@ -299,10 +299,36 @@ static inline int nilfs_mark_inode_dirty_sync(struct inode *inode)
/* super.c */
extern struct inode *nilfs_alloc_inode(struct super_block *);
extern void nilfs_destroy_inode(struct inode *);
+
extern __printf(3, 4)
-void nilfs_error(struct super_block *, const char *, const char *, ...);
+void __nilfs_msg(struct super_block *sb, const char *level,
+ const char *fmt, ...);
extern __printf(3, 4)
-void nilfs_warning(struct super_block *, const char *, const char *, ...);
+void __nilfs_error(struct super_block *sb, const char *function,
+ const char *fmt, ...);
+
+#ifdef CONFIG_PRINTK
+
+#define nilfs_msg(sb, level, fmt, ...) \
+ __nilfs_msg(sb, level, fmt, ##__VA_ARGS__)
+#define nilfs_error(sb, fmt, ...) \
+ __nilfs_error(sb, __func__, fmt, ##__VA_ARGS__)
+
+#else
+
+#define nilfs_msg(sb, level, fmt, ...) \
+ do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ (void)(sb); \
+ } while (0)
+#define nilfs_error(sb, fmt, ...) \
+ do { \
+ no_printk(fmt, ##__VA_ARGS__); \
+ __nilfs_error(sb, "", " "); \
+ } while (0)
+
+#endif /* CONFIG_PRINTK */
+
extern struct nilfs_super_block *
nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
extern int nilfs_store_magic_and_option(struct super_block *,
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index d97ba5f11b77..f11a3ad2df0c 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -30,9 +30,9 @@
#include "mdt.h"
-#define NILFS_BUFFER_INHERENT_BITS \
- ((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \
- (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Checked))
+#define NILFS_BUFFER_INHERENT_BITS \
+ (BIT(BH_Uptodate) | BIT(BH_Mapped) | BIT(BH_NILFS_Node) | \
+ BIT(BH_NILFS_Volatile) | BIT(BH_NILFS_Checked))
static struct buffer_head *
__nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
@@ -85,9 +85,9 @@ void nilfs_forget_buffer(struct buffer_head *bh)
{
struct page *page = bh->b_page;
const unsigned long clear_bits =
- (1 << BH_Uptodate | 1 << BH_Dirty | 1 << BH_Mapped |
- 1 << BH_Async_Write | 1 << BH_NILFS_Volatile |
- 1 << BH_NILFS_Checked | 1 << BH_NILFS_Redirected);
+ (BIT(BH_Uptodate) | BIT(BH_Dirty) | BIT(BH_Mapped) |
+ BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) |
+ BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected));
lock_buffer(bh);
set_mask_bits(&bh->b_state, clear_bits, 0);
@@ -124,17 +124,17 @@ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh)
dbh->b_bdev = sbh->b_bdev;
bh = dbh;
- bits = sbh->b_state & ((1UL << BH_Uptodate) | (1UL << BH_Mapped));
+ bits = sbh->b_state & (BIT(BH_Uptodate) | BIT(BH_Mapped));
while ((bh = bh->b_this_page) != dbh) {
lock_buffer(bh);
bits &= bh->b_state;
unlock_buffer(bh);
}
- if (bits & (1UL << BH_Uptodate))
+ if (bits & BIT(BH_Uptodate))
SetPageUptodate(dpage);
else
ClearPageUptodate(dpage);
- if (bits & (1UL << BH_Mapped))
+ if (bits & BIT(BH_Mapped))
SetPageMappedToDisk(dpage);
else
ClearPageMappedToDisk(dpage);
@@ -215,7 +215,7 @@ static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty)
create_empty_buffers(dst, sbh->b_size, 0);
if (copy_dirty)
- mask |= (1UL << BH_Dirty);
+ mask |= BIT(BH_Dirty);
dbh = dbufs = page_buffers(dst);
do {
@@ -403,11 +403,10 @@ void nilfs_clear_dirty_page(struct page *page, bool silent)
BUG_ON(!PageLocked(page));
- if (!silent) {
- nilfs_warning(sb, __func__,
- "discard page: offset %lld, ino %lu",
- page_offset(page), inode->i_ino);
- }
+ if (!silent)
+ nilfs_msg(sb, KERN_WARNING,
+ "discard dirty page: offset=%lld, ino=%lu",
+ page_offset(page), inode->i_ino);
ClearPageUptodate(page);
ClearPageMappedToDisk(page);
@@ -415,18 +414,18 @@ void nilfs_clear_dirty_page(struct page *page, bool silent)
if (page_has_buffers(page)) {
struct buffer_head *bh, *head;
const unsigned long clear_bits =
- (1 << BH_Uptodate | 1 << BH_Dirty | 1 << BH_Mapped |
- 1 << BH_Async_Write | 1 << BH_NILFS_Volatile |
- 1 << BH_NILFS_Checked | 1 << BH_NILFS_Redirected);
+ (BIT(BH_Uptodate) | BIT(BH_Dirty) | BIT(BH_Mapped) |
+ BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) |
+ BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected));
bh = head = page_buffers(page);
do {
lock_buffer(bh);
- if (!silent) {
- nilfs_warning(sb, __func__,
- "discard block %llu, size %zu",
- (u64)bh->b_blocknr, bh->b_size);
- }
+ if (!silent)
+ nilfs_msg(sb, KERN_WARNING,
+ "discard dirty block: blocknr=%llu, size=%zu",
+ (u64)bh->b_blocknr, bh->b_size);
+
set_mask_bits(&bh->b_state, clear_bits, 0);
unlock_buffer(bh);
} while (bh = bh->b_this_page, bh != head);
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index d893dc912b62..5139efed1888 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -54,38 +54,37 @@ struct nilfs_recovery_block {
};
-static int nilfs_warn_segment_error(int err)
+static int nilfs_warn_segment_error(struct super_block *sb, int err)
{
+ const char *msg = NULL;
+
switch (err) {
case NILFS_SEG_FAIL_IO:
- printk(KERN_WARNING
- "NILFS warning: I/O error on loading last segment\n");
+ nilfs_msg(sb, KERN_ERR, "I/O error reading segment");
return -EIO;
case NILFS_SEG_FAIL_MAGIC:
- printk(KERN_WARNING
- "NILFS warning: Segment magic number invalid\n");
+ msg = "Magic number mismatch";
break;
case NILFS_SEG_FAIL_SEQ:
- printk(KERN_WARNING
- "NILFS warning: Sequence number mismatch\n");
+ msg = "Sequence number mismatch";
break;
case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT:
- printk(KERN_WARNING
- "NILFS warning: Checksum error in super root\n");
+ msg = "Checksum error in super root";
break;
case NILFS_SEG_FAIL_CHECKSUM_FULL:
- printk(KERN_WARNING
- "NILFS warning: Checksum error in segment payload\n");
+ msg = "Checksum error in segment payload";
break;
case NILFS_SEG_FAIL_CONSISTENCY:
- printk(KERN_WARNING
- "NILFS warning: Inconsistent segment\n");
+ msg = "Inconsistency found";
break;
case NILFS_SEG_NO_SUPER_ROOT:
- printk(KERN_WARNING
- "NILFS warning: No super root in the last segment\n");
+ msg = "No super root in the last segment";
break;
+ default:
+ nilfs_msg(sb, KERN_ERR, "unrecognized segment error %d", err);
+ return -EINVAL;
}
+ nilfs_msg(sb, KERN_WARNING, "invalid segment: %s", msg);
return -EINVAL;
}
@@ -178,7 +177,7 @@ int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block,
brelse(bh_sr);
failed:
- return nilfs_warn_segment_error(ret);
+ return nilfs_warn_segment_error(nilfs->ns_sb, ret);
}
/**
@@ -553,11 +552,10 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
put_page(page);
failed_inode:
- printk(KERN_WARNING
- "NILFS warning: error recovering data block "
- "(err=%d, ino=%lu, block-offset=%llu)\n",
- err, (unsigned long)rb->ino,
- (unsigned long long)rb->blkoff);
+ nilfs_msg(sb, KERN_WARNING,
+ "error %d recovering data block (ino=%lu, block-offset=%llu)",
+ err, (unsigned long)rb->ino,
+ (unsigned long long)rb->blkoff);
if (!err2)
err2 = err;
next:
@@ -680,8 +678,8 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
}
if (nsalvaged_blocks) {
- printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n",
- sb->s_id, nsalvaged_blocks);
+ nilfs_msg(sb, KERN_INFO, "salvaged %lu blocks",
+ nsalvaged_blocks);
ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE;
}
out:
@@ -692,10 +690,9 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
confused:
err = -EINVAL;
failed:
- printk(KERN_ERR
- "NILFS (device %s): Error roll-forwarding "
- "(err=%d, pseg block=%llu). ",
- sb->s_id, err, (unsigned long long)pseg_start);
+ nilfs_msg(sb, KERN_ERR,
+ "error %d roll-forwarding partial segment at blocknr = %llu",
+ err, (unsigned long long)pseg_start);
goto out;
}
@@ -715,9 +712,8 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
set_buffer_dirty(bh);
err = sync_dirty_buffer(bh);
if (unlikely(err))
- printk(KERN_WARNING
- "NILFS warning: buffer sync write failed during "
- "post-cleaning of recovery.\n");
+ nilfs_msg(nilfs->ns_sb, KERN_WARNING,
+ "buffer sync write failed during post-cleaning of recovery.");
brelse(bh);
}
@@ -752,8 +748,8 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
err = nilfs_attach_checkpoint(sb, ri->ri_cno, true, &root);
if (unlikely(err)) {
- printk(KERN_ERR
- "NILFS: error loading the latest checkpoint.\n");
+ nilfs_msg(sb, KERN_ERR,
+ "error %d loading the latest checkpoint", err);
return err;
}
@@ -764,8 +760,9 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) {
err = nilfs_prepare_segment_for_recovery(nilfs, sb, ri);
if (unlikely(err)) {
- printk(KERN_ERR "NILFS: Error preparing segments for "
- "recovery.\n");
+ nilfs_msg(sb, KERN_ERR,
+ "error %d preparing segment for recovery",
+ err);
goto failed;
}
@@ -778,8 +775,9 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
nilfs_detach_log_writer(sb);
if (unlikely(err)) {
- printk(KERN_ERR "NILFS: Oops! recovery failed. "
- "(err=%d)\n", err);
+ nilfs_msg(sb, KERN_ERR,
+ "error %d writing segment for recovery",
+ err);
goto failed;
}
@@ -961,5 +959,5 @@ int nilfs_search_super_root(struct the_nilfs *nilfs,
failed:
brelse(bh_sum);
nilfs_dispose_segment_list(&segments);
- return (ret < 0) ? ret : nilfs_warn_segment_error(ret);
+ return ret < 0 ? ret : nilfs_warn_segment_error(nilfs->ns_sb, ret);
}
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index a962d7d83447..6f87b2ac1aeb 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -514,7 +514,11 @@ static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf)
} while (--segbuf->sb_nbio > 0);
if (unlikely(atomic_read(&segbuf->sb_err) > 0)) {
- printk(KERN_ERR "NILFS: IO error writing segment\n");
+ nilfs_msg(segbuf->sb_super, KERN_ERR,
+ "I/O error writing log (start-blocknr=%llu, block-count=%lu) in segment %llu",
+ (unsigned long long)segbuf->sb_pseg_start,
+ segbuf->sb_sum.nblocks,
+ (unsigned long long)segbuf->sb_segnum);
err = -EIO;
}
return err;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index e78b68a81aec..bedcae2c28e6 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -150,7 +150,8 @@ static void nilfs_dispose_list(struct the_nilfs *, struct list_head *, int);
#define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a)
#define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a)
-static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
+static int nilfs_prepare_segment_lock(struct super_block *sb,
+ struct nilfs_transaction_info *ti)
{
struct nilfs_transaction_info *cur_ti = current->journal_info;
void *save = NULL;
@@ -164,8 +165,7 @@ static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
* it is saved and will be restored on
* nilfs_transaction_commit().
*/
- printk(KERN_WARNING
- "NILFS warning: journal info from a different FS\n");
+ nilfs_msg(sb, KERN_WARNING, "journal info from a different FS");
save = current->journal_info;
}
if (!ti) {
@@ -215,7 +215,7 @@ int nilfs_transaction_begin(struct super_block *sb,
int vacancy_check)
{
struct the_nilfs *nilfs;
- int ret = nilfs_prepare_segment_lock(ti);
+ int ret = nilfs_prepare_segment_lock(sb, ti);
struct nilfs_transaction_info *trace_ti;
if (unlikely(ret < 0))
@@ -373,7 +373,7 @@ static void nilfs_transaction_lock(struct super_block *sb,
nilfs_segctor_do_immediate_flush(sci);
up_write(&nilfs->ns_segctor_sem);
- yield();
+ cond_resched();
}
if (gcflag)
ti->ti_flags |= NILFS_TI_GC;
@@ -1858,11 +1858,11 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
*/
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- const unsigned long set_bits = (1 << BH_Uptodate);
+ const unsigned long set_bits = BIT(BH_Uptodate);
const unsigned long clear_bits =
- (1 << BH_Dirty | 1 << BH_Async_Write |
- 1 << BH_Delay | 1 << BH_NILFS_Volatile |
- 1 << BH_NILFS_Redirected);
+ (BIT(BH_Dirty) | BIT(BH_Async_Write) |
+ BIT(BH_Delay) | BIT(BH_NILFS_Volatile) |
+ BIT(BH_NILFS_Redirected));
set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh == segbuf->sb_super_root) {
@@ -1951,8 +1951,9 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci,
err = nilfs_ifile_get_inode_block(
ifile, ii->vfs_inode.i_ino, &ibh);
if (unlikely(err)) {
- nilfs_warning(sci->sc_super, __func__,
- "failed to get inode block.");
+ nilfs_msg(sci->sc_super, KERN_WARNING,
+ "log writer: error %d getting inode block (ino=%lu)",
+ err, ii->vfs_inode.i_ino);
return err;
}
mark_buffer_dirty(ibh);
@@ -2131,10 +2132,10 @@ static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci)
static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn)
{
spin_lock(&sci->sc_state_lock);
- if (!(sci->sc_flush_request & (1 << bn))) {
+ if (!(sci->sc_flush_request & BIT(bn))) {
unsigned long prev_req = sci->sc_flush_request;
- sci->sc_flush_request |= (1 << bn);
+ sci->sc_flush_request |= BIT(bn);
if (!prev_req)
wake_up(&sci->sc_wait_daemon);
}
@@ -2318,7 +2319,7 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
}
#define FLUSH_FILE_BIT (0x1) /* data file only */
-#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */
+#define FLUSH_DAT_BIT BIT(NILFS_DAT_INO) /* DAT only */
/**
* nilfs_segctor_accept - record accepted sequence count of log-write requests
@@ -2458,8 +2459,7 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
if (likely(!err))
break;
- nilfs_warning(sb, __func__,
- "segment construction failed. (err=%d)", err);
+ nilfs_msg(sb, KERN_WARNING, "error %d cleaning segments", err);
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(sci->sc_interval);
}
@@ -2467,9 +2467,9 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs,
sci->sc_nfreesegs);
if (ret) {
- printk(KERN_WARNING
- "NILFS warning: error %d on discard request, "
- "turning discards off for the device\n", ret);
+ nilfs_msg(sb, KERN_WARNING,
+ "error %d on discard request, turning discards off for the device",
+ ret);
nilfs_clear_opt(nilfs, DISCARD);
}
}
@@ -2551,10 +2551,9 @@ static int nilfs_segctor_thread(void *arg)
/* start sync. */
sci->sc_task = current;
wake_up(&sci->sc_wait_task); /* for nilfs_segctor_start_thread() */
- printk(KERN_INFO
- "segctord starting. Construction interval = %lu seconds, "
- "CP frequency < %lu seconds\n",
- sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ);
+ nilfs_msg(sci->sc_super, KERN_INFO,
+ "segctord starting. Construction interval = %lu seconds, CP frequency < %lu seconds",
+ sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ);
spin_lock(&sci->sc_state_lock);
loop:
@@ -2628,8 +2627,8 @@ static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci)
if (IS_ERR(t)) {
int err = PTR_ERR(t);
- printk(KERN_ERR "NILFS: error %d creating segctord thread\n",
- err);
+ nilfs_msg(sci->sc_super, KERN_ERR,
+ "error %d creating segctord thread", err);
return err;
}
wait_event(sci->sc_wait_task, sci->sc_task != NULL);
@@ -2739,14 +2738,14 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
nilfs_segctor_write_out(sci);
if (!list_empty(&sci->sc_dirty_files)) {
- nilfs_warning(sci->sc_super, __func__,
- "dirty file(s) after the final construction");
+ nilfs_msg(sci->sc_super, KERN_WARNING,
+ "disposed unprocessed dirty file(s) when stopping log writer");
nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1);
}
if (!list_empty(&sci->sc_iput_queue)) {
- nilfs_warning(sci->sc_super, __func__,
- "iput queue is not empty");
+ nilfs_msg(sci->sc_super, KERN_WARNING,
+ "disposed unprocessed inode(s) in iput queue when stopping log writer");
nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1);
}
@@ -2822,8 +2821,8 @@ void nilfs_detach_log_writer(struct super_block *sb)
spin_lock(&nilfs->ns_inode_lock);
if (!list_empty(&nilfs->ns_dirty_files)) {
list_splice_init(&nilfs->ns_dirty_files, &garbage_list);
- nilfs_warning(sb, __func__,
- "Hit dirty file after stopped log writer");
+ nilfs_msg(sb, KERN_WARNING,
+ "disposed unprocessed dirty file(s) when detaching log writer");
}
spin_unlock(&nilfs->ns_inode_lock);
up_write(&nilfs->ns_segctor_sem);
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 6565c10b7b76..1060949d7dd2 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -23,7 +23,6 @@
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/workqueue.h>
-#include <linux/nilfs2_fs.h>
#include "nilfs.h"
struct nilfs_root;
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 1963595a1580..1541a1e9221a 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -22,7 +22,6 @@
#include <linux/string.h>
#include <linux/buffer_head.h>
#include <linux/errno.h>
-#include <linux/nilfs2_fs.h>
#include "mdt.h"
#include "sufile.h"
@@ -181,9 +180,9 @@ int nilfs_sufile_updatev(struct inode *sufile, __u64 *segnumv, size_t nsegs,
down_write(&NILFS_MDT(sufile)->mi_sem);
for (seg = segnumv; seg < segnumv + nsegs; seg++) {
if (unlikely(*seg >= nilfs_sufile_get_nsegments(sufile))) {
- printk(KERN_WARNING
- "%s: invalid segment number: %llu\n", __func__,
- (unsigned long long)*seg);
+ nilfs_msg(sufile->i_sb, KERN_WARNING,
+ "%s: invalid segment number: %llu",
+ __func__, (unsigned long long)*seg);
nerr++;
}
}
@@ -240,8 +239,9 @@ int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create,
int ret;
if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) {
- printk(KERN_WARNING "%s: invalid segment number: %llu\n",
- __func__, (unsigned long long)segnum);
+ nilfs_msg(sufile->i_sb, KERN_WARNING,
+ "%s: invalid segment number: %llu",
+ __func__, (unsigned long long)segnum);
return -EINVAL;
}
down_write(&NILFS_MDT(sufile)->mi_sem);
@@ -419,8 +419,9 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum,
kaddr = kmap_atomic(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
if (unlikely(!nilfs_segment_usage_clean(su))) {
- printk(KERN_WARNING "%s: segment %llu must be clean\n",
- __func__, (unsigned long long)segnum);
+ nilfs_msg(sufile->i_sb, KERN_WARNING,
+ "%s: segment %llu must be clean", __func__,
+ (unsigned long long)segnum);
kunmap_atomic(kaddr);
return;
}
@@ -444,7 +445,7 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
kaddr = kmap_atomic(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
- if (su->su_flags == cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY) &&
+ if (su->su_flags == cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY)) &&
su->su_nblocks == cpu_to_le32(0)) {
kunmap_atomic(kaddr);
return;
@@ -455,7 +456,7 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
/* make the segment garbage */
su->su_lastmod = cpu_to_le64(0);
su->su_nblocks = cpu_to_le32(0);
- su->su_flags = cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY);
+ su->su_flags = cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY));
kunmap_atomic(kaddr);
nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1);
@@ -476,8 +477,9 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
kaddr = kmap_atomic(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
if (nilfs_segment_usage_clean(su)) {
- printk(KERN_WARNING "%s: segment %llu is already clean\n",
- __func__, (unsigned long long)segnum);
+ nilfs_msg(sufile->i_sb, KERN_WARNING,
+ "%s: segment %llu is already clean",
+ __func__, (unsigned long long)segnum);
kunmap_atomic(kaddr);
return;
}
@@ -692,7 +694,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile,
su2 = su;
for (j = 0; j < n; j++, su = (void *)su + susz) {
if ((le32_to_cpu(su->su_flags) &
- ~(1UL << NILFS_SEGMENT_USAGE_ERROR)) ||
+ ~BIT(NILFS_SEGMENT_USAGE_ERROR)) ||
nilfs_segment_is_active(nilfs, segnum + j)) {
ret = -EBUSY;
kunmap_atomic(kaddr);
@@ -859,10 +861,10 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
si->sui_lastmod = le64_to_cpu(su->su_lastmod);
si->sui_nblocks = le32_to_cpu(su->su_nblocks);
si->sui_flags = le32_to_cpu(su->su_flags) &
- ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
+ ~BIT(NILFS_SEGMENT_USAGE_ACTIVE);
if (nilfs_segment_is_active(nilfs, segnum + j))
si->sui_flags |=
- (1UL << NILFS_SEGMENT_USAGE_ACTIVE);
+ BIT(NILFS_SEGMENT_USAGE_ACTIVE);
}
kunmap_atomic(kaddr);
brelse(su_bh);
@@ -950,7 +952,7 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf,
* disk.
*/
sup->sup_sui.sui_flags &=
- ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
+ ~BIT(NILFS_SEGMENT_USAGE_ACTIVE);
cleansi = nilfs_suinfo_clean(&sup->sup_sui);
cleansu = nilfs_segment_usage_clean(su);
@@ -1175,14 +1177,12 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize,
int err;
if (susize > sb->s_blocksize) {
- printk(KERN_ERR
- "NILFS: too large segment usage size: %zu bytes.\n",
- susize);
+ nilfs_msg(sb, KERN_ERR,
+ "too large segment usage size: %zu bytes", susize);
return -EINVAL;
} else if (susize < NILFS_MIN_SEGMENT_USAGE_SIZE) {
- printk(KERN_ERR
- "NILFS: too small segment usage size: %zu bytes.\n",
- susize);
+ nilfs_msg(sb, KERN_ERR,
+ "too small segment usage size: %zu bytes", susize);
return -EINVAL;
}
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
index 46e89872294c..158a9190c8ec 100644
--- a/fs/nilfs2/sufile.h
+++ b/fs/nilfs2/sufile.h
@@ -21,7 +21,6 @@
#include <linux/fs.h>
#include <linux/buffer_head.h>
-#include <linux/nilfs2_fs.h>
#include "mdt.h"
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 666107a18a22..c95d369e90aa 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -71,6 +71,22 @@ struct kmem_cache *nilfs_btree_path_cache;
static int nilfs_setup_super(struct super_block *sb, int is_mount);
static int nilfs_remount(struct super_block *sb, int *flags, char *data);
+void __nilfs_msg(struct super_block *sb, const char *level, const char *fmt,
+ ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+ if (sb)
+ printk("%sNILFS (%s): %pV\n", level, sb->s_id, &vaf);
+ else
+ printk("%sNILFS: %pV\n", level, &vaf);
+ va_end(args);
+}
+
static void nilfs_set_error(struct super_block *sb)
{
struct the_nilfs *nilfs = sb->s_fs_info;
@@ -91,19 +107,20 @@ static void nilfs_set_error(struct super_block *sb)
}
/**
- * nilfs_error() - report failure condition on a filesystem
+ * __nilfs_error() - report failure condition on a filesystem
+ *
+ * __nilfs_error() sets an ERROR_FS flag on the superblock as well as
+ * reporting an error message. This function should be called when
+ * NILFS detects incoherences or defects of meta data on disk.
*
- * nilfs_error() sets an ERROR_FS flag on the superblock as well as
- * reporting an error message. It should be called when NILFS detects
- * incoherences or defects of meta data on disk. As for sustainable
- * errors such as a single-shot I/O error, nilfs_warning() or the printk()
- * function should be used instead.
+ * This implements the body of nilfs_error() macro. Normally,
+ * nilfs_error() should be used. As for sustainable errors such as a
+ * single-shot I/O error, nilfs_msg() should be used instead.
*
- * The segment constructor must not call this function because it can
- * kill itself.
+ * Callers should not add a trailing newline since this will do it.
*/
-void nilfs_error(struct super_block *sb, const char *function,
- const char *fmt, ...)
+void __nilfs_error(struct super_block *sb, const char *function,
+ const char *fmt, ...)
{
struct the_nilfs *nilfs = sb->s_fs_info;
struct va_format vaf;
@@ -133,24 +150,6 @@ void nilfs_error(struct super_block *sb, const char *function,
sb->s_id);
}
-void nilfs_warning(struct super_block *sb, const char *function,
- const char *fmt, ...)
-{
- struct va_format vaf;
- va_list args;
-
- va_start(args, fmt);
-
- vaf.fmt = fmt;
- vaf.va = &args;
-
- printk(KERN_WARNING "NILFS warning (device %s): %s: %pV\n",
- sb->s_id, function, &vaf);
-
- va_end(args);
-}
-
-
struct inode *nilfs_alloc_inode(struct super_block *sb)
{
struct nilfs_inode_info *ii;
@@ -196,8 +195,8 @@ static int nilfs_sync_super(struct super_block *sb, int flag)
}
if (unlikely(err)) {
- printk(KERN_ERR
- "NILFS: unable to write superblock (err=%d)\n", err);
+ nilfs_msg(sb, KERN_ERR, "unable to write superblock: err=%d",
+ err);
if (err == -EIO && nilfs->ns_sbh[1]) {
/*
* sbp[0] points to newer log than sbp[1],
@@ -267,8 +266,7 @@ struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb,
sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) {
memcpy(sbp[0], sbp[1], nilfs->ns_sbsize);
} else {
- printk(KERN_CRIT "NILFS: superblock broke on dev %s\n",
- sb->s_id);
+ nilfs_msg(sb, KERN_CRIT, "superblock broke");
return NULL;
}
} else if (sbp[1] &&
@@ -378,9 +376,9 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off)
offset = sb2off & (nilfs->ns_blocksize - 1);
nsbh = sb_getblk(sb, newblocknr);
if (!nsbh) {
- printk(KERN_WARNING
- "NILFS warning: unable to move secondary superblock "
- "to block %llu\n", (unsigned long long)newblocknr);
+ nilfs_msg(sb, KERN_WARNING,
+ "unable to move secondary superblock to block %llu",
+ (unsigned long long)newblocknr);
ret = -EIO;
goto out;
}
@@ -543,10 +541,9 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
up_read(&nilfs->ns_segctor_sem);
if (unlikely(err)) {
if (err == -ENOENT || err == -EINVAL) {
- printk(KERN_ERR
- "NILFS: Invalid checkpoint "
- "(checkpoint number=%llu)\n",
- (unsigned long long)cno);
+ nilfs_msg(sb, KERN_ERR,
+ "Invalid checkpoint (checkpoint number=%llu)",
+ (unsigned long long)cno);
err = -EINVAL;
}
goto failed;
@@ -642,9 +639,8 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
err = nilfs_ifile_count_free_inodes(root->ifile,
&nmaxinodes, &nfreeinodes);
if (unlikely(err)) {
- printk(KERN_WARNING
- "NILFS warning: fail to count free inodes: err %d.\n",
- err);
+ nilfs_msg(sb, KERN_WARNING,
+ "failed to count free inodes: err=%d", err);
if (err == -ERANGE) {
/*
* If nilfs_palloc_count_max_entries() returns
@@ -776,9 +772,9 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
break;
case Opt_snapshot:
if (is_remount) {
- printk(KERN_ERR
- "NILFS: \"%s\" option is invalid "
- "for remount.\n", p);
+ nilfs_msg(sb, KERN_ERR,
+ "\"%s\" option is invalid for remount",
+ p);
return 0;
}
break;
@@ -792,8 +788,8 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
nilfs_clear_opt(nilfs, DISCARD);
break;
default:
- printk(KERN_ERR
- "NILFS: Unrecognized mount option \"%s\"\n", p);
+ nilfs_msg(sb, KERN_ERR,
+ "unrecognized mount option \"%s\"", p);
return 0;
}
}
@@ -829,12 +825,10 @@ static int nilfs_setup_super(struct super_block *sb, int is_mount)
mnt_count = le16_to_cpu(sbp[0]->s_mnt_count);
if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
- printk(KERN_WARNING
- "NILFS warning: mounting fs with errors\n");
+ nilfs_msg(sb, KERN_WARNING, "mounting fs with errors");
#if 0
} else if (max_mnt_count >= 0 && mnt_count >= max_mnt_count) {
- printk(KERN_WARNING
- "NILFS warning: maximal mount count reached\n");
+ nilfs_msg(sb, KERN_WARNING, "maximal mount count reached");
#endif
}
if (!max_mnt_count)
@@ -897,17 +891,17 @@ int nilfs_check_feature_compatibility(struct super_block *sb,
features = le64_to_cpu(sbp->s_feature_incompat) &
~NILFS_FEATURE_INCOMPAT_SUPP;
if (features) {
- printk(KERN_ERR "NILFS: couldn't mount because of unsupported "
- "optional features (%llx)\n",
- (unsigned long long)features);
+ nilfs_msg(sb, KERN_ERR,
+ "couldn't mount because of unsupported optional features (%llx)",
+ (unsigned long long)features);
return -EINVAL;
}
features = le64_to_cpu(sbp->s_feature_compat_ro) &
~NILFS_FEATURE_COMPAT_RO_SUPP;
if (!(sb->s_flags & MS_RDONLY) && features) {
- printk(KERN_ERR "NILFS: couldn't mount RDWR because of "
- "unsupported optional features (%llx)\n",
- (unsigned long long)features);
+ nilfs_msg(sb, KERN_ERR,
+ "couldn't mount RDWR because of unsupported optional features (%llx)",
+ (unsigned long long)features);
return -EINVAL;
}
return 0;
@@ -923,13 +917,13 @@ static int nilfs_get_root_dentry(struct super_block *sb,
inode = nilfs_iget(sb, root, NILFS_ROOT_INO);
if (IS_ERR(inode)) {
- printk(KERN_ERR "NILFS: get root inode failed\n");
ret = PTR_ERR(inode);
+ nilfs_msg(sb, KERN_ERR, "error %d getting root inode", ret);
goto out;
}
if (!S_ISDIR(inode->i_mode) || !inode->i_blocks || !inode->i_size) {
iput(inode);
- printk(KERN_ERR "NILFS: corrupt root inode.\n");
+ nilfs_msg(sb, KERN_ERR, "corrupt root inode");
ret = -EINVAL;
goto out;
}
@@ -957,7 +951,7 @@ static int nilfs_get_root_dentry(struct super_block *sb,
return ret;
failed_dentry:
- printk(KERN_ERR "NILFS: get root dentry failed\n");
+ nilfs_msg(sb, KERN_ERR, "error %d getting root dentry", ret);
goto out;
}
@@ -977,18 +971,18 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
ret = (ret == -ENOENT) ? -EINVAL : ret;
goto out;
} else if (!ret) {
- printk(KERN_ERR "NILFS: The specified checkpoint is "
- "not a snapshot (checkpoint number=%llu).\n",
- (unsigned long long)cno);
+ nilfs_msg(s, KERN_ERR,
+ "The specified checkpoint is not a snapshot (checkpoint number=%llu)",
+ (unsigned long long)cno);
ret = -EINVAL;
goto out;
}
ret = nilfs_attach_checkpoint(s, cno, false, &root);
if (ret) {
- printk(KERN_ERR "NILFS: error loading snapshot "
- "(checkpoint number=%llu).\n",
- (unsigned long long)cno);
+ nilfs_msg(s, KERN_ERR,
+ "error %d while loading snapshot (checkpoint number=%llu)",
+ ret, (unsigned long long)cno);
goto out;
}
ret = nilfs_get_root_dentry(s, root, root_dentry);
@@ -1058,7 +1052,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
__u64 cno;
int err;
- nilfs = alloc_nilfs(sb->s_bdev);
+ nilfs = alloc_nilfs(sb);
if (!nilfs)
return -ENOMEM;
@@ -1083,8 +1077,9 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
cno = nilfs_last_cno(nilfs);
err = nilfs_attach_checkpoint(sb, cno, true, &fsroot);
if (err) {
- printk(KERN_ERR "NILFS: error loading last checkpoint "
- "(checkpoint number=%llu).\n", (unsigned long long)cno);
+ nilfs_msg(sb, KERN_ERR,
+ "error %d while loading last checkpoint (checkpoint number=%llu)",
+ err, (unsigned long long)cno);
goto failed_unload;
}
@@ -1144,9 +1139,8 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
err = -EINVAL;
if (!nilfs_valid_fs(nilfs)) {
- printk(KERN_WARNING "NILFS (device %s): couldn't "
- "remount because the filesystem is in an "
- "incomplete recovery state.\n", sb->s_id);
+ nilfs_msg(sb, KERN_WARNING,
+ "couldn't remount because the filesystem is in an incomplete recovery state");
goto restore_opts;
}
@@ -1178,10 +1172,9 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
~NILFS_FEATURE_COMPAT_RO_SUPP;
up_read(&nilfs->ns_sem);
if (features) {
- printk(KERN_WARNING "NILFS (device %s): couldn't "
- "remount RDWR because of unsupported optional "
- "features (%llx)\n",
- sb->s_id, (unsigned long long)features);
+ nilfs_msg(sb, KERN_WARNING,
+ "couldn't remount RDWR because of unsupported optional features (%llx)",
+ (unsigned long long)features);
err = -EROFS;
goto restore_opts;
}
@@ -1212,6 +1205,38 @@ struct nilfs_super_data {
int flags;
};
+static int nilfs_parse_snapshot_option(const char *option,
+ const substring_t *arg,
+ struct nilfs_super_data *sd)
+{
+ unsigned long long val;
+ const char *msg = NULL;
+ int err;
+
+ if (!(sd->flags & MS_RDONLY)) {
+ msg = "read-only option is not specified";
+ goto parse_error;
+ }
+
+ err = kstrtoull(arg->from, 0, &val);
+ if (err) {
+ if (err == -ERANGE)
+ msg = "too large checkpoint number";
+ else
+ msg = "malformed argument";
+ goto parse_error;
+ } else if (val == 0) {
+ msg = "invalid checkpoint number 0";
+ goto parse_error;
+ }
+ sd->cno = val;
+ return 0;
+
+parse_error:
+ nilfs_msg(NULL, KERN_ERR, "invalid option \"%s\": %s", option, msg);
+ return 1;
+}
+
/**
* nilfs_identify - pre-read mount options needed to identify mount instance
* @data: mount options
@@ -1228,24 +1253,9 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd)
p = strsep(&options, ",");
if (p != NULL && *p) {
token = match_token(p, tokens, args);
- if (token == Opt_snapshot) {
- if (!(sd->flags & MS_RDONLY)) {
- ret++;
- } else {
- sd->cno = simple_strtoull(args[0].from,
- NULL, 0);
- /*
- * No need to see the end pointer;
- * match_token() has done syntax
- * checking.
- */
- if (sd->cno == 0)
- ret++;
- }
- }
- if (ret)
- printk(KERN_ERR
- "NILFS: invalid mount option: %s\n", p);
+ if (token == Opt_snapshot)
+ ret = nilfs_parse_snapshot_option(p, &args[0],
+ sd);
}
if (!options)
break;
@@ -1326,10 +1336,10 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
} else if (!sd.cno) {
if (nilfs_tree_is_busy(s->s_root)) {
if ((flags ^ s->s_flags) & MS_RDONLY) {
- printk(KERN_ERR "NILFS: the device already "
- "has a %s mount.\n",
- (s->s_flags & MS_RDONLY) ?
- "read-only" : "read/write");
+ nilfs_msg(s, KERN_ERR,
+ "the device already has a %s mount.",
+ (s->s_flags & MS_RDONLY) ?
+ "read-only" : "read/write");
err = -EBUSY;
goto failed_super;
}
diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c
index 8ffa42b704d8..490303e3d517 100644
--- a/fs/nilfs2/sysfs.c
+++ b/fs/nilfs2/sysfs.c
@@ -272,8 +272,8 @@ nilfs_checkpoints_checkpoints_number_show(struct nilfs_checkpoints_attr *attr,
err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat);
up_read(&nilfs->ns_segctor_sem);
if (err < 0) {
- printk(KERN_ERR "NILFS: unable to get checkpoint stat: err=%d\n",
- err);
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "unable to get checkpoint stat: err=%d", err);
return err;
}
@@ -295,8 +295,8 @@ nilfs_checkpoints_snapshots_number_show(struct nilfs_checkpoints_attr *attr,
err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat);
up_read(&nilfs->ns_segctor_sem);
if (err < 0) {
- printk(KERN_ERR "NILFS: unable to get checkpoint stat: err=%d\n",
- err);
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "unable to get checkpoint stat: err=%d", err);
return err;
}
@@ -326,9 +326,9 @@ nilfs_checkpoints_next_checkpoint_show(struct nilfs_checkpoints_attr *attr,
{
__u64 cno;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
cno = nilfs->ns_cno;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return snprintf(buf, PAGE_SIZE, "%llu\n", cno);
}
@@ -414,8 +414,8 @@ nilfs_segments_dirty_segments_show(struct nilfs_segments_attr *attr,
err = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat);
up_read(&nilfs->ns_segctor_sem);
if (err < 0) {
- printk(KERN_ERR "NILFS: unable to get segment stat: err=%d\n",
- err);
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "unable to get segment stat: err=%d", err);
return err;
}
@@ -511,9 +511,9 @@ nilfs_segctor_current_seg_sequence_show(struct nilfs_segctor_attr *attr,
{
u64 seg_seq;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
seg_seq = nilfs->ns_seg_seq;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return snprintf(buf, PAGE_SIZE, "%llu\n", seg_seq);
}
@@ -525,9 +525,9 @@ nilfs_segctor_current_last_full_seg_show(struct nilfs_segctor_attr *attr,
{
__u64 segnum;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
segnum = nilfs->ns_segnum;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return snprintf(buf, PAGE_SIZE, "%llu\n", segnum);
}
@@ -539,9 +539,9 @@ nilfs_segctor_next_full_seg_show(struct nilfs_segctor_attr *attr,
{
__u64 nextnum;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
nextnum = nilfs->ns_nextnum;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return snprintf(buf, PAGE_SIZE, "%llu\n", nextnum);
}
@@ -553,9 +553,9 @@ nilfs_segctor_next_pseg_offset_show(struct nilfs_segctor_attr *attr,
{
unsigned long pseg_offset;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
pseg_offset = nilfs->ns_pseg_offset;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return snprintf(buf, PAGE_SIZE, "%lu\n", pseg_offset);
}
@@ -567,9 +567,9 @@ nilfs_segctor_next_checkpoint_show(struct nilfs_segctor_attr *attr,
{
__u64 cno;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
cno = nilfs->ns_cno;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return snprintf(buf, PAGE_SIZE, "%llu\n", cno);
}
@@ -581,9 +581,9 @@ nilfs_segctor_last_seg_write_time_show(struct nilfs_segctor_attr *attr,
{
time_t ctime;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
ctime = nilfs->ns_ctime;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return NILFS_SHOW_TIME(ctime, buf);
}
@@ -595,9 +595,9 @@ nilfs_segctor_last_seg_write_time_secs_show(struct nilfs_segctor_attr *attr,
{
time_t ctime;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
ctime = nilfs->ns_ctime;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)ctime);
}
@@ -609,9 +609,9 @@ nilfs_segctor_last_nongc_write_time_show(struct nilfs_segctor_attr *attr,
{
time_t nongc_ctime;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
nongc_ctime = nilfs->ns_nongc_ctime;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return NILFS_SHOW_TIME(nongc_ctime, buf);
}
@@ -623,9 +623,9 @@ nilfs_segctor_last_nongc_write_time_secs_show(struct nilfs_segctor_attr *attr,
{
time_t nongc_ctime;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
nongc_ctime = nilfs->ns_nongc_ctime;
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return snprintf(buf, PAGE_SIZE, "%llu\n",
(unsigned long long)nongc_ctime);
@@ -638,9 +638,9 @@ nilfs_segctor_dirty_data_blocks_count_show(struct nilfs_segctor_attr *attr,
{
u32 ndirtyblks;
- down_read(&nilfs->ns_sem);
+ down_read(&nilfs->ns_segctor_sem);
ndirtyblks = atomic_read(&nilfs->ns_ndirtyblks);
- up_read(&nilfs->ns_sem);
+ up_read(&nilfs->ns_segctor_sem);
return snprintf(buf, PAGE_SIZE, "%u\n", ndirtyblks);
}
@@ -789,14 +789,15 @@ nilfs_superblock_sb_update_frequency_store(struct nilfs_superblock_attr *attr,
err = kstrtouint(skip_spaces(buf), 0, &val);
if (err) {
- printk(KERN_ERR "NILFS: unable to convert string: err=%d\n",
- err);
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "unable to convert string: err=%d", err);
return err;
}
if (val < NILFS_SB_FREQ) {
val = NILFS_SB_FREQ;
- printk(KERN_WARNING "NILFS: superblock update frequency cannot be lesser than 10 seconds\n");
+ nilfs_msg(nilfs->ns_sb, KERN_WARNING,
+ "superblock update frequency cannot be lesser than 10 seconds");
}
down_write(&nilfs->ns_sem);
@@ -999,7 +1000,8 @@ int nilfs_sysfs_create_device_group(struct super_block *sb)
nilfs->ns_dev_subgroups = kzalloc(devgrp_size, GFP_KERNEL);
if (unlikely(!nilfs->ns_dev_subgroups)) {
err = -ENOMEM;
- printk(KERN_ERR "NILFS: unable to allocate memory for device group\n");
+ nilfs_msg(sb, KERN_ERR,
+ "unable to allocate memory for device group");
goto failed_create_device_group;
}
@@ -1109,15 +1111,15 @@ int __init nilfs_sysfs_init(void)
nilfs_kset = kset_create_and_add(NILFS_ROOT_GROUP_NAME, NULL, fs_kobj);
if (!nilfs_kset) {
err = -ENOMEM;
- printk(KERN_ERR "NILFS: unable to create sysfs entry: err %d\n",
- err);
+ nilfs_msg(NULL, KERN_ERR,
+ "unable to create sysfs entry: err=%d", err);
goto failed_sysfs_init;
}
err = sysfs_create_group(&nilfs_kset->kobj, &nilfs_feature_attr_group);
if (unlikely(err)) {
- printk(KERN_ERR "NILFS: unable to create feature group: err %d\n",
- err);
+ nilfs_msg(NULL, KERN_ERR,
+ "unable to create feature group: err=%d", err);
goto cleanup_sysfs_init;
}
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index e9fd241b9a0a..2dd75bf619ad 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -56,12 +56,12 @@ void nilfs_set_last_segment(struct the_nilfs *nilfs,
/**
* alloc_nilfs - allocate a nilfs object
- * @bdev: block device to which the_nilfs is related
+ * @sb: super block instance
*
* Return Value: On success, pointer to the_nilfs is returned.
* On error, NULL is returned.
*/
-struct the_nilfs *alloc_nilfs(struct block_device *bdev)
+struct the_nilfs *alloc_nilfs(struct super_block *sb)
{
struct the_nilfs *nilfs;
@@ -69,7 +69,8 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
if (!nilfs)
return NULL;
- nilfs->ns_bdev = bdev;
+ nilfs->ns_sb = sb;
+ nilfs->ns_bdev = sb->s_bdev;
atomic_set(&nilfs->ns_ndirtyblks, 0);
init_rwsem(&nilfs->ns_sem);
mutex_init(&nilfs->ns_snapshot_mount_mutex);
@@ -191,7 +192,10 @@ static int nilfs_store_log_cursor(struct the_nilfs *nilfs,
nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg);
nilfs->ns_cno = nilfs->ns_last_cno + 1;
if (nilfs->ns_segnum >= nilfs->ns_nsegments) {
- printk(KERN_ERR "NILFS invalid last segment number.\n");
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "pointed segment number is out of range: segnum=%llu, nsegments=%lu",
+ (unsigned long long)nilfs->ns_segnum,
+ nilfs->ns_nsegments);
ret = -EINVAL;
}
return ret;
@@ -215,12 +219,12 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
int err;
if (!valid_fs) {
- printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
+ nilfs_msg(sb, KERN_WARNING, "mounting unchecked fs");
if (s_flags & MS_RDONLY) {
- printk(KERN_INFO "NILFS: INFO: recovery "
- "required for readonly filesystem.\n");
- printk(KERN_INFO "NILFS: write access will "
- "be enabled during recovery.\n");
+ nilfs_msg(sb, KERN_INFO,
+ "recovery required for readonly filesystem");
+ nilfs_msg(sb, KERN_INFO,
+ "write access will be enabled during recovery");
}
}
@@ -235,13 +239,12 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
goto scan_error;
if (!nilfs_valid_sb(sbp[1])) {
- printk(KERN_WARNING
- "NILFS warning: unable to fall back to spare"
- "super block\n");
+ nilfs_msg(sb, KERN_WARNING,
+ "unable to fall back to spare super block");
goto scan_error;
}
- printk(KERN_INFO
- "NILFS: try rollback from an earlier position\n");
+ nilfs_msg(sb, KERN_INFO,
+ "trying rollback from an earlier position");
/*
* restore super block with its spare and reconfigure
@@ -254,10 +257,9 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
/* verify consistency between two super blocks */
blocksize = BLOCK_SIZE << le32_to_cpu(sbp[0]->s_log_block_size);
if (blocksize != nilfs->ns_blocksize) {
- printk(KERN_WARNING
- "NILFS warning: blocksize differs between "
- "two super blocks (%d != %d)\n",
- blocksize, nilfs->ns_blocksize);
+ nilfs_msg(sb, KERN_WARNING,
+ "blocksize differs between two super blocks (%d != %d)",
+ blocksize, nilfs->ns_blocksize);
goto scan_error;
}
@@ -276,7 +278,8 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
err = nilfs_load_super_root(nilfs, sb, ri.ri_super_root);
if (unlikely(err)) {
- printk(KERN_ERR "NILFS: error loading super root.\n");
+ nilfs_msg(sb, KERN_ERR, "error %d while loading super root",
+ err);
goto failed;
}
@@ -287,30 +290,29 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
__u64 features;
if (nilfs_test_opt(nilfs, NORECOVERY)) {
- printk(KERN_INFO "NILFS: norecovery option specified. "
- "skipping roll-forward recovery\n");
+ nilfs_msg(sb, KERN_INFO,
+ "norecovery option specified, skipping roll-forward recovery");
goto skip_recovery;
}
features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) &
~NILFS_FEATURE_COMPAT_RO_SUPP;
if (features) {
- printk(KERN_ERR "NILFS: couldn't proceed with "
- "recovery because of unsupported optional "
- "features (%llx)\n",
- (unsigned long long)features);
+ nilfs_msg(sb, KERN_ERR,
+ "couldn't proceed with recovery because of unsupported optional features (%llx)",
+ (unsigned long long)features);
err = -EROFS;
goto failed_unload;
}
if (really_read_only) {
- printk(KERN_ERR "NILFS: write access "
- "unavailable, cannot proceed.\n");
+ nilfs_msg(sb, KERN_ERR,
+ "write access unavailable, cannot proceed");
err = -EROFS;
goto failed_unload;
}
sb->s_flags &= ~MS_RDONLY;
} else if (nilfs_test_opt(nilfs, NORECOVERY)) {
- printk(KERN_ERR "NILFS: recovery cancelled because norecovery "
- "option was specified for a read/write mount\n");
+ nilfs_msg(sb, KERN_ERR,
+ "recovery cancelled because norecovery option was specified for a read/write mount");
err = -EINVAL;
goto failed_unload;
}
@@ -325,11 +327,12 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
up_write(&nilfs->ns_sem);
if (err) {
- printk(KERN_ERR "NILFS: failed to update super block. "
- "recovery unfinished.\n");
+ nilfs_msg(sb, KERN_ERR,
+ "error %d updating super block. recovery unfinished.",
+ err);
goto failed_unload;
}
- printk(KERN_INFO "NILFS: recovery complete.\n");
+ nilfs_msg(sb, KERN_INFO, "recovery complete");
skip_recovery:
nilfs_clear_recovery_info(&ri);
@@ -337,7 +340,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
return 0;
scan_error:
- printk(KERN_ERR "NILFS: error searching super root.\n");
+ nilfs_msg(sb, KERN_ERR, "error %d while searching super root", err);
goto failed;
failed_unload:
@@ -384,12 +387,11 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
struct nilfs_super_block *sbp)
{
if (le32_to_cpu(sbp->s_rev_level) < NILFS_MIN_SUPP_REV) {
- printk(KERN_ERR "NILFS: unsupported revision "
- "(superblock rev.=%d.%d, current rev.=%d.%d). "
- "Please check the version of mkfs.nilfs.\n",
- le32_to_cpu(sbp->s_rev_level),
- le16_to_cpu(sbp->s_minor_rev_level),
- NILFS_CURRENT_REV, NILFS_MINOR_REV);
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "unsupported revision (superblock rev.=%d.%d, current rev.=%d.%d). Please check the version of mkfs.nilfs(2).",
+ le32_to_cpu(sbp->s_rev_level),
+ le16_to_cpu(sbp->s_minor_rev_level),
+ NILFS_CURRENT_REV, NILFS_MINOR_REV);
return -EINVAL;
}
nilfs->ns_sbsize = le16_to_cpu(sbp->s_bytes);
@@ -398,12 +400,14 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size);
if (nilfs->ns_inode_size > nilfs->ns_blocksize) {
- printk(KERN_ERR "NILFS: too large inode size: %d bytes.\n",
- nilfs->ns_inode_size);
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "too large inode size: %d bytes",
+ nilfs->ns_inode_size);
return -EINVAL;
} else if (nilfs->ns_inode_size < NILFS_MIN_INODE_SIZE) {
- printk(KERN_ERR "NILFS: too small inode size: %d bytes.\n",
- nilfs->ns_inode_size);
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "too small inode size: %d bytes",
+ nilfs->ns_inode_size);
return -EINVAL;
}
@@ -411,7 +415,9 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
- printk(KERN_ERR "NILFS: too short segment.\n");
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "too short segment: %lu blocks",
+ nilfs->ns_blocks_per_segment);
return -EINVAL;
}
@@ -420,7 +426,9 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
le32_to_cpu(sbp->s_r_segments_percentage);
if (nilfs->ns_r_segments_percentage < 1 ||
nilfs->ns_r_segments_percentage > 99) {
- printk(KERN_ERR "NILFS: invalid reserved segments percentage.\n");
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "invalid reserved segments percentage: %lu",
+ nilfs->ns_r_segments_percentage);
return -EINVAL;
}
@@ -504,16 +512,16 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
if (!sbp[0]) {
if (!sbp[1]) {
- printk(KERN_ERR "NILFS: unable to read superblock\n");
+ nilfs_msg(sb, KERN_ERR, "unable to read superblock");
return -EIO;
}
- printk(KERN_WARNING
- "NILFS warning: unable to read primary superblock "
- "(blocksize = %d)\n", blocksize);
+ nilfs_msg(sb, KERN_WARNING,
+ "unable to read primary superblock (blocksize = %d)",
+ blocksize);
} else if (!sbp[1]) {
- printk(KERN_WARNING
- "NILFS warning: unable to read secondary superblock "
- "(blocksize = %d)\n", blocksize);
+ nilfs_msg(sb, KERN_WARNING,
+ "unable to read secondary superblock (blocksize = %d)",
+ blocksize);
}
/*
@@ -535,14 +543,14 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
}
if (!valid[swp]) {
nilfs_release_super_block(nilfs);
- printk(KERN_ERR "NILFS: Can't find nilfs on dev %s.\n",
- sb->s_id);
+ nilfs_msg(sb, KERN_ERR, "couldn't find nilfs on the device");
return -EINVAL;
}
if (!valid[!swp])
- printk(KERN_WARNING "NILFS warning: broken superblock. "
- "using spare superblock (blocksize = %d).\n", blocksize);
+ nilfs_msg(sb, KERN_WARNING,
+ "broken superblock, retrying with spare superblock (blocksize = %d)",
+ blocksize);
if (swp)
nilfs_swap_super_block(nilfs);
@@ -576,7 +584,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE);
if (!blocksize) {
- printk(KERN_ERR "NILFS: unable to set blocksize\n");
+ nilfs_msg(sb, KERN_ERR, "unable to set blocksize");
err = -EINVAL;
goto out;
}
@@ -595,8 +603,9 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
if (blocksize < NILFS_MIN_BLOCK_SIZE ||
blocksize > NILFS_MAX_BLOCK_SIZE) {
- printk(KERN_ERR "NILFS: couldn't mount because of unsupported "
- "filesystem blocksize %d\n", blocksize);
+ nilfs_msg(sb, KERN_ERR,
+ "couldn't mount because of unsupported filesystem blocksize %d",
+ blocksize);
err = -EINVAL;
goto failed_sbh;
}
@@ -604,10 +613,9 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
int hw_blocksize = bdev_logical_block_size(sb->s_bdev);
if (blocksize < hw_blocksize) {
- printk(KERN_ERR
- "NILFS: blocksize %d too small for device "
- "(sector-size = %d).\n",
- blocksize, hw_blocksize);
+ nilfs_msg(sb, KERN_ERR,
+ "blocksize %d too small for device (sector-size = %d)",
+ blocksize, hw_blocksize);
err = -EINVAL;
goto failed_sbh;
}
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 79369fd6b13b..b305c6f033e7 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -43,6 +43,7 @@ enum {
* struct the_nilfs - struct to supervise multiple nilfs mount points
* @ns_flags: flags
* @ns_flushed_device: flag indicating if all volatile data was flushed
+ * @ns_sb: back pointer to super block instance
* @ns_bdev: block device
* @ns_sem: semaphore for shared states
* @ns_snapshot_mount_mutex: mutex to protect snapshot mounts
@@ -102,6 +103,7 @@ struct the_nilfs {
unsigned long ns_flags;
int ns_flushed_device;
+ struct super_block *ns_sb;
struct block_device *ns_bdev;
struct rw_semaphore ns_sem;
struct mutex ns_snapshot_mount_mutex;
@@ -120,11 +122,8 @@ struct the_nilfs {
unsigned int ns_sb_update_freq;
/*
- * Following fields are dedicated to a writable FS-instance.
- * Except for the period seeking checkpoint, code outside the segment
- * constructor must lock a segment semaphore while accessing these
- * fields.
- * The writable FS-instance is sole during a lifetime of the_nilfs.
+ * The following fields are updated by a writable FS-instance.
+ * These fields are protected by ns_segctor_sem outside load_nilfs().
*/
u64 ns_seg_seq;
__u64 ns_segnum;
@@ -281,7 +280,7 @@ static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs)
}
void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
-struct the_nilfs *alloc_nilfs(struct block_device *bdev);
+struct the_nilfs *alloc_nilfs(struct super_block *sb);
void destroy_nilfs(struct the_nilfs *nilfs);
int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data);
int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb);
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index f40972d6df90..e01287c964a8 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1854,7 +1854,7 @@ int ntfs_read_inode_mount(struct inode *vi)
/* Need this to sanity check attribute list references to $MFT. */
vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
- /* Provides readpage() and sync_page() for map_mft_record(). */
+ /* Provides readpage() for map_mft_record(). */
vi->i_mapping->a_ops = &ntfs_mst_aops;
ctx = ntfs_attr_get_search_ctx(ni, m);
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
index 443abecf01b7..358258364616 100644
--- a/fs/ntfs/namei.c
+++ b/fs/ntfs/namei.c
@@ -253,7 +253,7 @@ handle_name:
err = (signed)nls_name.len;
goto err_out;
}
- nls_name.hash = full_name_hash(nls_name.name, nls_name.len);
+ nls_name.hash = full_name_hash(dent, nls_name.name, nls_name.len);
dent = d_add_ci(dent, dent_inode, &nls_name);
kfree(nls_name.name);
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 460c0cedab3a..7dabbc31060e 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6106,6 +6106,43 @@ void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
}
}
+/*
+ * Try to flush truncate logs if we can free enough clusters from it.
+ * As for return value, "< 0" means error, "0" no space and "1" means
+ * we have freed enough spaces and let the caller try to allocate again.
+ */
+int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
+ unsigned int needed)
+{
+ tid_t target;
+ int ret = 0;
+ unsigned int truncated_clusters;
+
+ inode_lock(osb->osb_tl_inode);
+ truncated_clusters = osb->truncated_clusters;
+ inode_unlock(osb->osb_tl_inode);
+
+ /*
+ * Check whether we can succeed in allocating if we free
+ * the truncate log.
+ */
+ if (truncated_clusters < needed)
+ goto out;
+
+ ret = ocfs2_flush_truncate_log(osb);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
+ jbd2_log_wait_commit(osb->journal->j_journal, target);
+ ret = 1;
+ }
+out:
+ return ret;
+}
+
static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
int slot_num,
struct inode **tl_inode,
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index f3dc1b0dfffc..4a5152ec88a3 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -188,6 +188,8 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
u64 start_blk,
unsigned int num_clusters);
int __ocfs2_flush_truncate_log(struct ocfs2_super *osb);
+int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
+ unsigned int needed);
/*
* Process local structure which describes the block unlinks done
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index e97a37179614..98d36548153d 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1645,43 +1645,6 @@ static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
return ret;
}
-/*
- * Try to flush truncate logs if we can free enough clusters from it.
- * As for return value, "< 0" means error, "0" no space and "1" means
- * we have freed enough spaces and let the caller try to allocate again.
- */
-static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
- unsigned int needed)
-{
- tid_t target;
- int ret = 0;
- unsigned int truncated_clusters;
-
- inode_lock(osb->osb_tl_inode);
- truncated_clusters = osb->truncated_clusters;
- inode_unlock(osb->osb_tl_inode);
-
- /*
- * Check whether we can succeed in allocating if we free
- * the truncate log.
- */
- if (truncated_clusters < needed)
- goto out;
-
- ret = ocfs2_flush_truncate_log(osb);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
- if (jbd2_journal_start_commit(osb->journal->j_journal, &target)) {
- jbd2_log_wait_commit(osb->journal->j_journal, target);
- ret = 1;
- }
-out:
- return ret;
-}
-
int ocfs2_write_begin_nolock(struct address_space *mapping,
loff_t pos, unsigned len, ocfs2_write_type_t type,
struct page **pagep, void **fsdata,
@@ -2426,7 +2389,7 @@ static int ocfs2_dio_end_io(struct kiocb *iocb,
static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
- struct inode *inode = file_inode(file)->i_mapping->host;
+ struct inode *inode = file->f_mapping->host;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
get_block_t *get_block;
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 004f2cbe8f71..e9f3705c4c9f 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -47,7 +47,7 @@
#define DLM_HASH_BUCKETS (DLM_HASH_PAGES * DLM_BUCKETS_PER_PAGE)
/* Intended to make it easier for us to switch out hash functions */
-#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
+#define dlm_lockid_hash(_n, _l) full_name_hash(NULL, _n, _l)
enum dlm_mle_type {
DLM_MLE_BLOCK = 0,
@@ -1004,6 +1004,8 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
int dlm_do_master_requery(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
u8 nodenum, u8 *real_master);
+void __dlm_do_purge_lockres(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res);
int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res,
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 13719d3f35f8..6ea06f8a7d29 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2276,9 +2276,12 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
dlm->name, namelen, lockname, res->owner, r);
dlm_print_one_lock_resource(res);
- BUG();
- }
- return ret ? ret : r;
+ if (r == -ENOMEM)
+ BUG();
+ } else
+ ret = r;
+
+ return ret;
}
int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
@@ -2416,48 +2419,26 @@ int dlm_deref_lockres_done_handler(struct o2net_msg *msg, u32 len, void *data,
}
spin_lock(&res->spinlock);
- BUG_ON(!(res->state & DLM_LOCK_RES_DROPPING_REF));
- if (!list_empty(&res->purge)) {
- mlog(0, "%s: Removing res %.*s from purgelist\n",
- dlm->name, res->lockname.len, res->lockname.name);
- list_del_init(&res->purge);
- dlm_lockres_put(res);
- dlm->purge_count--;
- }
-
- if (!__dlm_lockres_unused(res)) {
- mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
- dlm->name, res->lockname.len, res->lockname.name);
- __dlm_print_one_lock_resource(res);
- BUG();
- }
-
- __dlm_unhash_lockres(dlm, res);
-
- spin_lock(&dlm->track_lock);
- if (!list_empty(&res->tracking))
- list_del_init(&res->tracking);
- else {
- mlog(ML_ERROR, "%s: Resource %.*s not on the Tracking list\n",
- dlm->name, res->lockname.len, res->lockname.name);
- __dlm_print_one_lock_resource(res);
+ if (!(res->state & DLM_LOCK_RES_DROPPING_REF)) {
+ spin_unlock(&res->spinlock);
+ spin_unlock(&dlm->spinlock);
+ mlog(ML_NOTICE, "%s:%.*s: node %u sends deref done "
+ "but it is already derefed!\n", dlm->name,
+ res->lockname.len, res->lockname.name, node);
+ ret = 0;
+ goto done;
}
- spin_unlock(&dlm->track_lock);
- /* lockres is not in the hash now. drop the flag and wake up
- * any processes waiting in dlm_get_lock_resource.
- */
- res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+ __dlm_do_purge_lockres(dlm, res);
spin_unlock(&res->spinlock);
wake_up(&res->wq);
- dlm_lockres_put(res);
-
spin_unlock(&dlm->spinlock);
ret = 0;
-
done:
+ if (res)
+ dlm_lockres_put(res);
dlm_put(dlm);
return ret;
}
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index f6b313898763..dd5cb8bcefd1 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2343,6 +2343,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
struct dlm_lock_resource *res;
int i;
struct hlist_head *bucket;
+ struct hlist_node *tmp;
struct dlm_lock *lock;
@@ -2365,7 +2366,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
*/
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
bucket = dlm_lockres_hash(dlm, i);
- hlist_for_each_entry(res, bucket, hash_node) {
+ hlist_for_each_entry_safe(res, tmp, bucket, hash_node) {
/* always prune any $RECOVERY entries for dead nodes,
* otherwise hangs can occur during later recovery */
if (dlm_is_recovery_lock(res->lockname.name,
@@ -2386,8 +2387,17 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
break;
}
}
- dlm_lockres_clear_refmap_bit(dlm, res,
- dead_node);
+
+ if ((res->owner == dead_node) &&
+ (res->state & DLM_LOCK_RES_DROPPING_REF)) {
+ dlm_lockres_get(res);
+ __dlm_do_purge_lockres(dlm, res);
+ spin_unlock(&res->spinlock);
+ wake_up(&res->wq);
+ dlm_lockres_put(res);
+ continue;
+ } else if (res->owner == dlm->node_num)
+ dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
spin_unlock(&res->spinlock);
continue;
}
@@ -2398,14 +2408,17 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
if (res->state & DLM_LOCK_RES_DROPPING_REF) {
mlog(0, "%s:%.*s: owned by "
"dead node %u, this node was "
- "dropping its ref when it died. "
- "continue, dropping the flag.\n",
+ "dropping its ref when master died. "
+ "continue, purging the lockres.\n",
dlm->name, res->lockname.len,
res->lockname.name, dead_node);
+ dlm_lockres_get(res);
+ __dlm_do_purge_lockres(dlm, res);
+ spin_unlock(&res->spinlock);
+ wake_up(&res->wq);
+ dlm_lockres_put(res);
+ continue;
}
- res->state &= ~DLM_LOCK_RES_DROPPING_REF;
- dlm_move_lockres_to_recovery_list(dlm,
- res);
} else if (res->owner == dlm->node_num) {
dlm_free_dead_locks(dlm, res, dead_node);
__dlm_lockres_calc_usage(dlm, res);
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 68d239ba0c63..838a06d4066a 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -160,6 +160,52 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
spin_unlock(&dlm->spinlock);
}
+/*
+ * Do the real purge work:
+ * unhash the lockres, and
+ * clear flag DLM_LOCK_RES_DROPPING_REF.
+ * It requires dlm and lockres spinlock to be taken.
+ */
+void __dlm_do_purge_lockres(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *res)
+{
+ assert_spin_locked(&dlm->spinlock);
+ assert_spin_locked(&res->spinlock);
+
+ if (!list_empty(&res->purge)) {
+ mlog(0, "%s: Removing res %.*s from purgelist\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ list_del_init(&res->purge);
+ dlm_lockres_put(res);
+ dlm->purge_count--;
+ }
+
+ if (!__dlm_lockres_unused(res)) {
+ mlog(ML_ERROR, "%s: res %.*s in use after deref\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ BUG();
+ }
+
+ __dlm_unhash_lockres(dlm, res);
+
+ spin_lock(&dlm->track_lock);
+ if (!list_empty(&res->tracking))
+ list_del_init(&res->tracking);
+ else {
+ mlog(ML_ERROR, "%s: Resource %.*s not on the Tracking list\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ __dlm_print_one_lock_resource(res);
+ }
+ spin_unlock(&dlm->track_lock);
+
+ /*
+ * lockres is not in the hash now. drop the flag and wake up
+ * any processes waiting in dlm_get_lock_resource.
+ */
+ res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+}
+
static void dlm_purge_lockres(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res)
{
@@ -175,6 +221,13 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
res->lockname.len, res->lockname.name, master);
if (!master) {
+ if (res->state & DLM_LOCK_RES_DROPPING_REF) {
+ mlog(ML_NOTICE, "%s: res %.*s already in DLM_LOCK_RES_DROPPING_REF state\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ spin_unlock(&res->spinlock);
+ return;
+ }
+
res->state |= DLM_LOCK_RES_DROPPING_REF;
/* drop spinlock... retake below */
spin_unlock(&res->spinlock);
@@ -203,8 +256,8 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
dlm->purge_count--;
}
- if (!master && ret != 0) {
- mlog(0, "%s: deref %.*s in progress or master goes down\n",
+ if (!master && ret == DLM_DEREF_RESPONSE_INPROG) {
+ mlog(0, "%s: deref %.*s in progress\n",
dlm->name, res->lockname.len, res->lockname.name);
spin_unlock(&res->spinlock);
return;
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index ab6a6cdcf91c..87e577a49b0d 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -483,7 +483,7 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
struct ocfs2_global_disk_dqblk dqblk;
s64 spacechange, inodechange;
- time_t olditime, oldbtime;
+ time64_t olditime, oldbtime;
err = sb->s_op->quota_read(sb, type, (char *)&dqblk,
sizeof(struct ocfs2_global_disk_dqblk),
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index ced70c8139f7..c9e828ec3c8e 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -1007,10 +1007,17 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
lc->oc_type = NO_CONTROLD;
rc = dlm_new_lockspace(conn->cc_name, conn->cc_cluster_name,
- DLM_LSFL_FS, DLM_LVB_LEN,
+ DLM_LSFL_FS | DLM_LSFL_NEWEXCL, DLM_LVB_LEN,
&ocfs2_ls_ops, conn, &ops_rv, &fsdlm);
- if (rc)
+ if (rc) {
+ if (rc == -EEXIST || rc == -EPROTO)
+ printk(KERN_ERR "ocfs2: Unable to create the "
+ "lockspace %s (%d), because a ocfs2-tools "
+ "program is running on this file system "
+ "with the same name lockspace\n",
+ conn->cc_name, rc);
goto out;
+ }
if (ops_rv == -EOPNOTSUPP) {
lc->oc_type = WITH_CONTROLD;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 2f19aeec5482..ea47120a85ff 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1164,7 +1164,8 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
int flags,
struct ocfs2_alloc_context **ac)
{
- int status;
+ int status, ret = 0;
+ int retried = 0;
*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
if (!(*ac)) {
@@ -1189,7 +1190,24 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
}
if (status == -ENOSPC) {
+retry:
status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
+ /* Retry if there is sufficient space cached in truncate log */
+ if (status == -ENOSPC && !retried) {
+ retried = 1;
+ ocfs2_inode_unlock((*ac)->ac_inode, 1);
+ inode_unlock((*ac)->ac_inode);
+
+ ret = ocfs2_try_to_free_truncate_log(osb, bits_wanted);
+ if (ret == 1)
+ goto retry;
+
+ if (ret < 0)
+ mlog_errno(ret);
+
+ inode_lock((*ac)->ac_inode);
+ ocfs2_inode_lock((*ac)->ac_inode, NULL, 1);
+ }
if (status < 0) {
if (status != -ENOSPC)
mlog_errno(status);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d2053853951e..5bb44f7a78ee 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7344,7 +7344,7 @@ const struct xattr_handler ocfs2_xattr_trusted_handler = {
* 'user' attributes support
*/
static int ocfs2_xattr_user_get(const struct xattr_handler *handler,
- struct dentry *unusde, struct inode *inode,
+ struct dentry *unused, struct inode *inode,
const char *name, void *buffer, size_t size)
{
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
diff --git a/fs/open.c b/fs/open.c
index 93ae3cdee4ab..bf66cf1a9f5c 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -840,13 +840,13 @@ EXPORT_SYMBOL(file_path);
int vfs_open(const struct path *path, struct file *file,
const struct cred *cred)
{
- struct inode *inode = vfs_select_inode(path->dentry, file->f_flags);
+ struct dentry *dentry = d_real(path->dentry, NULL, file->f_flags);
- if (IS_ERR(inode))
- return PTR_ERR(inode);
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
file->f_path = *path;
- return do_dentry_open(file, inode, NULL, cred);
+ return do_dentry_open(file, d_backing_inode(dentry), NULL, cred);
}
struct file *dentry_open(const struct path *path, int flags,
diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c
index 5dfc4f3cfe68..00235bf644dc 100644
--- a/fs/orangefs/dcache.c
+++ b/fs/orangefs/dcache.c
@@ -73,6 +73,7 @@ static int orangefs_revalidate_lookup(struct dentry *dentry)
}
}
+ dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
ret = 1;
out_release_op:
op_release(new_op);
@@ -94,6 +95,9 @@ static int orangefs_d_revalidate(struct dentry *dentry, unsigned int flags)
{
int ret;
+ if (time_before(jiffies, dentry->d_time))
+ return 1;
+
if (flags & LOOKUP_RCU)
return -ECHILD;
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 8f2fa94cc4f6..28a0557a69be 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -262,7 +262,7 @@ int orangefs_getattr(struct vfsmount *mnt,
"orangefs_getattr: called on %s\n",
dentry->d_name.name);
- ret = orangefs_inode_getattr(inode, 0, 1);
+ ret = orangefs_inode_getattr(inode, 0, 0);
if (ret == 0) {
generic_fillattr(inode, kstat);
@@ -291,7 +291,7 @@ int orangefs_permission(struct inode *inode, int mask)
}
/* ORANGEDS2 implementation of VFS inode operations for files */
-struct inode_operations orangefs_file_inode_operations = {
+const struct inode_operations orangefs_file_inode_operations = {
.get_acl = orangefs_get_acl,
.set_acl = orangefs_set_acl,
.setattr = orangefs_setattr,
@@ -384,7 +384,7 @@ struct inode *orangefs_iget(struct super_block *sb, struct orangefs_object_kref
if (!inode || !(inode->i_state & I_NEW))
return inode;
- error = orangefs_inode_getattr(inode, 1, 0);
+ error = orangefs_inode_getattr(inode, 1, 1);
if (error) {
iget_failed(inode);
return ERR_PTR(error);
@@ -429,7 +429,7 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
orangefs_set_inode(inode, ref);
inode->i_ino = hash; /* needed for stat etc */
- error = orangefs_inode_getattr(inode, 1, 0);
+ error = orangefs_inode_getattr(inode, 1, 1);
if (error)
goto out_iput;
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 5a60c508af4e..62c525936ee8 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -72,6 +72,8 @@ static int orangefs_create(struct inode *dir,
d_instantiate(dentry, inode);
unlock_new_inode(inode);
+ dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+ ORANGEFS_I(inode)->getattr_time = jiffies - 1;
gossip_debug(GOSSIP_NAME_DEBUG,
"%s: dentry instantiated for %s\n",
@@ -181,6 +183,8 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
goto out;
}
+ dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+
inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn);
if (IS_ERR(inode)) {
gossip_debug(GOSSIP_NAME_DEBUG,
@@ -189,6 +193,8 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
goto out;
}
+ ORANGEFS_I(inode)->getattr_time = jiffies - 1;
+
gossip_debug(GOSSIP_NAME_DEBUG,
"%s:%s:%d "
"Found good inode [%lu] with count [%d]\n",
@@ -316,6 +322,8 @@ static int orangefs_symlink(struct inode *dir,
d_instantiate(dentry, inode);
unlock_new_inode(inode);
+ dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+ ORANGEFS_I(inode)->getattr_time = jiffies - 1;
gossip_debug(GOSSIP_NAME_DEBUG,
"Inode (Symlink) %pU -> %s\n",
@@ -378,6 +386,8 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
d_instantiate(dentry, inode);
unlock_new_inode(inode);
+ dentry->d_time = jiffies + dcache_timeout_msecs*HZ/1000;
+ ORANGEFS_I(inode)->getattr_time = jiffies - 1;
gossip_debug(GOSSIP_NAME_DEBUG,
"Inode (Directory) %pU -> %s\n",
@@ -405,12 +415,10 @@ static int orangefs_rename(struct inode *old_dir,
int ret;
gossip_debug(GOSSIP_NAME_DEBUG,
- "orangefs_rename: called (%s/%s => %s/%s) ct=%d\n",
- old_dentry->d_parent->d_name.name,
- old_dentry->d_name.name,
- new_dentry->d_parent->d_name.name,
- new_dentry->d_name.name,
- d_count(new_dentry));
+ "orangefs_rename: called (%pd2 => %pd2) ct=%d\n",
+ old_dentry, new_dentry, d_count(new_dentry));
+
+ ORANGEFS_I(new_dentry->d_parent->d_inode)->getattr_time = jiffies - 1;
new_op = op_alloc(ORANGEFS_VFS_OP_RENAME);
if (!new_op)
@@ -442,7 +450,7 @@ static int orangefs_rename(struct inode *old_dir,
}
/* ORANGEFS implementation of VFS inode operations for directories */
-struct inode_operations orangefs_dir_inode_operations = {
+const struct inode_operations orangefs_dir_inode_operations = {
.lookup = orangefs_lookup,
.get_acl = orangefs_get_acl,
.set_acl = orangefs_set_acl,
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index c1181e5529af..633c07a6e3d8 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -246,6 +246,8 @@ struct orangefs_inode_s {
* with this object
*/
unsigned long pinode_flags;
+
+ unsigned long getattr_time;
};
#define P_ATIME_FLAG 0
@@ -527,7 +529,7 @@ int orangefs_inode_setxattr(struct inode *inode,
size_t size,
int flags);
-int orangefs_inode_getattr(struct inode *inode, int new, int size);
+int orangefs_inode_getattr(struct inode *inode, int new, int bypass);
int orangefs_inode_check_changed(struct inode *inode);
@@ -546,6 +548,8 @@ extern struct mutex request_mutex;
extern int debug;
extern int op_timeout_secs;
extern int slot_timeout_secs;
+extern int dcache_timeout_msecs;
+extern int getattr_timeout_msecs;
extern struct list_head orangefs_superblocks;
extern spinlock_t orangefs_superblocks_lock;
extern struct list_head orangefs_request_list;
@@ -557,10 +561,10 @@ extern int hash_table_size;
extern const struct address_space_operations orangefs_address_operations;
extern struct backing_dev_info orangefs_backing_dev_info;
-extern struct inode_operations orangefs_file_inode_operations;
+extern const struct inode_operations orangefs_file_inode_operations;
extern const struct file_operations orangefs_file_operations;
-extern struct inode_operations orangefs_symlink_inode_operations;
-extern struct inode_operations orangefs_dir_inode_operations;
+extern const struct inode_operations orangefs_symlink_inode_operations;
+extern const struct inode_operations orangefs_dir_inode_operations;
extern const struct file_operations orangefs_dir_operations;
extern const struct dentry_operations orangefs_dentry_operations;
extern const struct file_operations orangefs_devreq_file_operations;
diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c
index 6f072a8c0de1..e9fd5755c05f 100644
--- a/fs/orangefs/orangefs-mod.c
+++ b/fs/orangefs/orangefs-mod.c
@@ -47,6 +47,8 @@ struct client_debug_mask client_debug_mask = { NULL, 0, 0 };
unsigned int kernel_mask_set_mod_init; /* implicitly false */
int op_timeout_secs = ORANGEFS_DEFAULT_OP_TIMEOUT_SECS;
int slot_timeout_secs = ORANGEFS_DEFAULT_SLOT_TIMEOUT_SECS;
+int dcache_timeout_msecs = 50;
+int getattr_timeout_msecs = 50;
MODULE_LICENSE("GPL");
MODULE_AUTHOR("ORANGEFS Development Team");
diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c
index 5c03113e3ad2..375708c2db87 100644
--- a/fs/orangefs/orangefs-sysfs.c
+++ b/fs/orangefs/orangefs-sysfs.c
@@ -61,10 +61,21 @@
* Slots are requested and waited for,
* the wait times out after slot_timeout_secs.
*
+ * What: /sys/fs/orangefs/dcache_timeout_msecs
+ * Date: Jul 2016
+ * Contact: Martin Brandenburg <martin@omnibond.com>
+ * Description:
+ * Time lookup is valid in milliseconds.
+ *
+ * What: /sys/fs/orangefs/getattr_timeout_msecs
+ * Date: Jul 2016
+ * Contact: Martin Brandenburg <martin@omnibond.com>
+ * Description:
+ * Time getattr is valid in milliseconds.
*
* What: /sys/fs/orangefs/acache/...
* Date: Jun 2015
- * Contact: Mike Marshall <hubcap@omnibond.com>
+ * Contact: Martin Brandenburg <martin@omnibond.com>
* Description:
* Attribute cache configurable settings.
*
@@ -117,6 +128,8 @@ struct orangefs_obj {
int perf_history_size;
int perf_time_interval_secs;
int slot_timeout_secs;
+ int dcache_timeout_msecs;
+ int getattr_timeout_msecs;
};
struct acache_orangefs_obj {
@@ -658,6 +671,20 @@ static ssize_t sysfs_int_show(char *kobj_id, char *buf, void *attr)
"%d\n",
slot_timeout_secs);
goto out;
+ } else if (!strcmp(orangefs_attr->attr.name,
+ "dcache_timeout_msecs")) {
+ rc = scnprintf(buf,
+ PAGE_SIZE,
+ "%d\n",
+ dcache_timeout_msecs);
+ goto out;
+ } else if (!strcmp(orangefs_attr->attr.name,
+ "getattr_timeout_msecs")) {
+ rc = scnprintf(buf,
+ PAGE_SIZE,
+ "%d\n",
+ getattr_timeout_msecs);
+ goto out;
} else {
goto out;
}
@@ -734,6 +761,12 @@ static ssize_t int_store(struct orangefs_obj *orangefs_obj,
} else if (!strcmp(attr->attr.name, "slot_timeout_secs")) {
rc = kstrtoint(buf, 0, &slot_timeout_secs);
goto out;
+ } else if (!strcmp(attr->attr.name, "dcache_timeout_msecs")) {
+ rc = kstrtoint(buf, 0, &dcache_timeout_msecs);
+ goto out;
+ } else if (!strcmp(attr->attr.name, "getattr_timeout_msecs")) {
+ rc = kstrtoint(buf, 0, &getattr_timeout_msecs);
+ goto out;
} else {
goto out;
}
@@ -1361,6 +1394,12 @@ static struct orangefs_attribute op_timeout_secs_attribute =
static struct orangefs_attribute slot_timeout_secs_attribute =
__ATTR(slot_timeout_secs, 0664, int_orangefs_show, int_store);
+static struct orangefs_attribute dcache_timeout_msecs_attribute =
+ __ATTR(dcache_timeout_msecs, 0664, int_orangefs_show, int_store);
+
+static struct orangefs_attribute getattr_timeout_msecs_attribute =
+ __ATTR(getattr_timeout_msecs, 0664, int_orangefs_show, int_store);
+
static struct orangefs_attribute perf_counter_reset_attribute =
__ATTR(perf_counter_reset,
0664,
@@ -1382,6 +1421,8 @@ static struct orangefs_attribute perf_time_interval_secs_attribute =
static struct attribute *orangefs_default_attrs[] = {
&op_timeout_secs_attribute.attr,
&slot_timeout_secs_attribute.attr,
+ &dcache_timeout_msecs_attribute.attr,
+ &getattr_timeout_msecs_attribute.attr,
&perf_counter_reset_attribute.attr,
&perf_history_size_attribute.attr,
&perf_time_interval_secs_attribute.attr,
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index c5fbc62357c6..d13c7291fd05 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -251,7 +251,7 @@ static int orangefs_inode_is_stale(struct inode *inode, int new,
return 0;
}
-int orangefs_inode_getattr(struct inode *inode, int new, int size)
+int orangefs_inode_getattr(struct inode *inode, int new, int bypass)
{
struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
struct orangefs_kernel_op_s *new_op;
@@ -261,12 +261,16 @@ int orangefs_inode_getattr(struct inode *inode, int new, int size)
gossip_debug(GOSSIP_UTILS_DEBUG, "%s: called on inode %pU\n", __func__,
get_khandle_from_ino(inode));
+ if (!new && !bypass) {
+ if (time_before(jiffies, orangefs_inode->getattr_time))
+ return 0;
+ }
+
new_op = op_alloc(ORANGEFS_VFS_OP_GETATTR);
if (!new_op)
return -ENOMEM;
new_op->upcall.req.getattr.refn = orangefs_inode->refn;
- new_op->upcall.req.getattr.mask = size ?
- ORANGEFS_ATTR_SYS_ALL_NOHINT : ORANGEFS_ATTR_SYS_ALL_NOHINT_NOSIZE;
+ new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_ALL_NOHINT;
ret = service_operation(new_op, __func__,
get_interruptible_flag(inode));
@@ -287,20 +291,18 @@ int orangefs_inode_getattr(struct inode *inode, int new, int size)
case S_IFREG:
inode->i_flags = orangefs_inode_flags(&new_op->
downcall.resp.getattr.attributes);
- if (size) {
- inode_size = (loff_t)new_op->
- downcall.resp.getattr.attributes.size;
- rounded_up_size =
- (inode_size + (4096 - (inode_size % 4096)));
- inode->i_size = inode_size;
- orangefs_inode->blksize =
- new_op->downcall.resp.getattr.attributes.blksize;
- spin_lock(&inode->i_lock);
- inode->i_bytes = inode_size;
- inode->i_blocks =
- (unsigned long)(rounded_up_size / 512);
- spin_unlock(&inode->i_lock);
- }
+ inode_size = (loff_t)new_op->
+ downcall.resp.getattr.attributes.size;
+ rounded_up_size =
+ (inode_size + (4096 - (inode_size % 4096)));
+ inode->i_size = inode_size;
+ orangefs_inode->blksize =
+ new_op->downcall.resp.getattr.attributes.blksize;
+ spin_lock(&inode->i_lock);
+ inode->i_bytes = inode_size;
+ inode->i_blocks =
+ (unsigned long)(rounded_up_size / 512);
+ spin_unlock(&inode->i_lock);
break;
case S_IFDIR:
inode->i_size = PAGE_SIZE;
@@ -345,6 +347,7 @@ int orangefs_inode_getattr(struct inode *inode, int new, int size)
inode->i_mode = type | (is_root_handle(inode) ? S_ISVTX : 0) |
orangefs_inode_perms(&new_op->downcall.resp.getattr.attributes);
+ orangefs_inode->getattr_time = jiffies + getattr_timeout_msecs*HZ/1000;
ret = 0;
out:
op_release(new_op);
@@ -418,6 +421,7 @@ int orangefs_inode_setattr(struct inode *inode, struct iattr *iattr)
ClearMtimeFlag(orangefs_inode);
ClearCtimeFlag(orangefs_inode);
ClearModeFlag(orangefs_inode);
+ orangefs_inode->getattr_time = jiffies - 1;
}
return ret;
diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h
index 1efc6f8a5224..3d7418c728f5 100644
--- a/fs/orangefs/protocol.h
+++ b/fs/orangefs/protocol.h
@@ -207,14 +207,6 @@ typedef __s64 ORANGEFS_offset;
ORANGEFS_ATTR_SYS_DIRENT_COUNT | \
ORANGEFS_ATTR_SYS_BLKSIZE)
-#define ORANGEFS_ATTR_SYS_ALL_NOHINT_NOSIZE \
- (ORANGEFS_ATTR_SYS_COMMON_ALL | \
- ORANGEFS_ATTR_SYS_LNK_TARGET | \
- ORANGEFS_ATTR_SYS_DFILE_COUNT | \
- ORANGEFS_ATTR_SYS_MIRROR_COPIES_COUNT | \
- ORANGEFS_ATTR_SYS_DIRENT_COUNT | \
- ORANGEFS_ATTR_SYS_BLKSIZE)
-
#define ORANGEFS_XATTR_REPLACE 0x2
#define ORANGEFS_XATTR_CREATE 0x1
#define ORANGEFS_MAX_SERVER_ADDR_LEN 256
diff --git a/fs/orangefs/symlink.c b/fs/orangefs/symlink.c
index 6418dd638680..8fecf823f5ba 100644
--- a/fs/orangefs/symlink.c
+++ b/fs/orangefs/symlink.c
@@ -8,7 +8,7 @@
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"
-struct inode_operations orangefs_symlink_inode_operations = {
+const struct inode_operations orangefs_symlink_inode_operations = {
.readlink = generic_readlink,
.get_link = simple_get_link,
.setattr = orangefs_setattr,
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 80aa6f1eb336..54e5d6681786 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -292,6 +292,7 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
goto out_cleanup;
ovl_dentry_update(dentry, newdentry);
+ ovl_inode_update(d_inode(dentry), d_inode(newdentry));
newdentry = NULL;
/*
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 5c9d2d80ff70..12bcd07b9e32 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -138,9 +138,12 @@ static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
int err;
enum ovl_path_type type;
struct path realpath;
+ const struct cred *old_cred;
type = ovl_path_real(dentry, &realpath);
+ old_cred = ovl_override_creds(dentry->d_sb);
err = vfs_getattr(&realpath, stat);
+ revert_creds(old_cred);
if (err)
return err;
@@ -158,6 +161,22 @@ static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
return 0;
}
+/* Common operations required to be done after creation of file on upper */
+static void ovl_instantiate(struct dentry *dentry, struct inode *inode,
+ struct dentry *newdentry, bool hardlink)
+{
+ ovl_dentry_version_inc(dentry->d_parent);
+ ovl_dentry_update(dentry, newdentry);
+ if (!hardlink) {
+ ovl_inode_update(inode, d_inode(newdentry));
+ ovl_copyattr(newdentry->d_inode, inode);
+ } else {
+ WARN_ON(ovl_inode_real(inode, NULL) != d_inode(newdentry));
+ inc_nlink(inode);
+ }
+ d_instantiate(dentry, inode);
+}
+
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
struct kstat *stat, const char *link,
struct dentry *hardlink)
@@ -177,10 +196,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
if (err)
goto out_dput;
- ovl_dentry_version_inc(dentry->d_parent);
- ovl_dentry_update(dentry, newdentry);
- ovl_copyattr(newdentry->d_inode, inode);
- d_instantiate(dentry, inode);
+ ovl_instantiate(dentry, inode, newdentry, !!hardlink);
newdentry = NULL;
out_dput:
dput(newdentry);
@@ -291,23 +307,29 @@ static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
{
int err;
struct dentry *ret = NULL;
+ enum ovl_path_type type = ovl_path_type(dentry);
LIST_HEAD(list);
err = ovl_check_empty_dir(dentry, &list);
- if (err)
+ if (err) {
ret = ERR_PTR(err);
- else {
- /*
- * If no upperdentry then skip clearing whiteouts.
- *
- * Can race with copy-up, since we don't hold the upperdir
- * mutex. Doesn't matter, since copy-up can't create a
- * non-empty directory from an empty one.
- */
- if (ovl_dentry_upper(dentry))
- ret = ovl_clear_empty(dentry, &list);
+ goto out_free;
}
+ /*
+ * When removing an empty opaque directory, then it makes no sense to
+ * replace it with an exact replica of itself.
+ *
+ * If no upperdentry then skip clearing whiteouts.
+ *
+ * Can race with copy-up, since we don't hold the upperdir mutex.
+ * Doesn't matter, since copy-up can't create a non-empty directory
+ * from an empty one.
+ */
+ if (OVL_TYPE_UPPER(type) && OVL_TYPE_MERGE(type))
+ ret = ovl_clear_empty(dentry, &list);
+
+out_free:
ovl_cache_free(&list);
return ret;
@@ -347,7 +369,23 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
if (err)
goto out_dput2;
- if (S_ISDIR(stat->mode)) {
+ /*
+ * mode could have been mutilated due to umask (e.g. sgid directory)
+ */
+ if (!hardlink &&
+ !S_ISLNK(stat->mode) && newdentry->d_inode->i_mode != stat->mode) {
+ struct iattr attr = {
+ .ia_valid = ATTR_MODE,
+ .ia_mode = stat->mode,
+ };
+ inode_lock(newdentry->d_inode);
+ err = notify_change(newdentry, &attr, NULL);
+ inode_unlock(newdentry->d_inode);
+ if (err)
+ goto out_cleanup;
+ }
+
+ if (!hardlink && S_ISDIR(stat->mode)) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_cleanup;
@@ -363,10 +401,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
if (err)
goto out_cleanup;
}
- ovl_dentry_version_inc(dentry->d_parent);
- ovl_dentry_update(dentry, newdentry);
- ovl_copyattr(newdentry->d_inode, inode);
- d_instantiate(dentry, inode);
+ ovl_instantiate(dentry, inode, newdentry, !!hardlink);
newdentry = NULL;
out_dput2:
dput(upper);
@@ -382,52 +417,42 @@ out_cleanup:
goto out_dput2;
}
-static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
- const char *link, struct dentry *hardlink)
+static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
+ struct kstat *stat, const char *link,
+ struct dentry *hardlink)
{
int err;
- struct inode *inode;
- struct kstat stat = {
- .mode = mode,
- .rdev = rdev,
- };
-
- err = -ENOMEM;
- inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
- if (!inode)
- goto out;
+ const struct cred *old_cred;
+ struct cred *override_cred;
err = ovl_copy_up(dentry->d_parent);
if (err)
- goto out_iput;
-
- if (!ovl_dentry_is_opaque(dentry)) {
- err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
- } else {
- const struct cred *old_cred;
- struct cred *override_cred;
-
- old_cred = ovl_override_creds(dentry->d_sb);
-
- err = -ENOMEM;
- override_cred = prepare_creds();
- if (override_cred) {
- override_cred->fsuid = old_cred->fsuid;
- override_cred->fsgid = old_cred->fsgid;
- put_cred(override_creds(override_cred));
- put_cred(override_cred);
+ return err;
- err = ovl_create_over_whiteout(dentry, inode, &stat,
- link, hardlink);
- }
- revert_creds(old_cred);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ err = -ENOMEM;
+ override_cred = prepare_creds();
+ if (override_cred) {
+ override_cred->fsuid = inode->i_uid;
+ override_cred->fsgid = inode->i_gid;
+ put_cred(override_creds(override_cred));
+ put_cred(override_cred);
+
+ if (!ovl_dentry_is_opaque(dentry))
+ err = ovl_create_upper(dentry, inode, stat, link,
+ hardlink);
+ else
+ err = ovl_create_over_whiteout(dentry, inode, stat,
+ link, hardlink);
}
+ revert_creds(old_cred);
+ if (!err) {
+ struct inode *realinode = d_inode(ovl_dentry_upper(dentry));
- if (!err)
- inode = NULL;
-out_iput:
- iput(inode);
-out:
+ WARN_ON(inode->i_mode != realinode->i_mode);
+ WARN_ON(!uid_eq(inode->i_uid, realinode->i_uid));
+ WARN_ON(!gid_eq(inode->i_gid, realinode->i_gid));
+ }
return err;
}
@@ -435,13 +460,30 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
const char *link)
{
int err;
+ struct inode *inode;
+ struct kstat stat = {
+ .rdev = rdev,
+ };
err = ovl_want_write(dentry);
- if (!err) {
- err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
- ovl_drop_write(dentry);
- }
+ if (err)
+ goto out;
+
+ err = -ENOMEM;
+ inode = ovl_new_inode(dentry->d_sb, mode);
+ if (!inode)
+ goto out_drop_write;
+
+ inode_init_owner(inode, dentry->d_parent->d_inode, mode);
+ stat.mode = inode->i_mode;
+ err = ovl_create_or_link(dentry, inode, &stat, link, NULL);
+ if (err)
+ iput(inode);
+
+out_drop_write:
+ ovl_drop_write(dentry);
+out:
return err;
}
@@ -476,7 +518,7 @@ static int ovl_link(struct dentry *old, struct inode *newdir,
struct dentry *new)
{
int err;
- struct dentry *upper;
+ struct inode *inode;
err = ovl_want_write(old);
if (err)
@@ -486,8 +528,12 @@ static int ovl_link(struct dentry *old, struct inode *newdir,
if (err)
goto out_drop_write;
- upper = ovl_dentry_upper(old);
- err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);
+ inode = d_inode(old);
+ ihold(inode);
+
+ err = ovl_create_or_link(new, inode, NULL, NULL, ovl_dentry_upper(old));
+ if (err)
+ iput(inode);
out_drop_write:
ovl_drop_write(old);
@@ -511,24 +557,10 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
return -EROFS;
if (is_dir) {
- if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
- opaquedir = ovl_check_empty_and_clear(dentry);
- err = PTR_ERR(opaquedir);
- if (IS_ERR(opaquedir))
- goto out;
- } else {
- LIST_HEAD(list);
-
- /*
- * When removing an empty opaque directory, then it
- * makes no sense to replace it with an exact replica of
- * itself. But emptiness still needs to be checked.
- */
- err = ovl_check_empty_dir(dentry, &list);
- ovl_cache_free(&list);
- if (err)
- goto out;
- }
+ opaquedir = ovl_check_empty_and_clear(dentry);
+ err = PTR_ERR(opaquedir);
+ if (IS_ERR(opaquedir))
+ goto out;
}
err = ovl_lock_rename_workdir(workdir, upperdir);
@@ -633,6 +665,8 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
enum ovl_path_type type;
int err;
+ const struct cred *old_cred;
+
err = ovl_check_sticky(dentry);
if (err)
@@ -647,14 +681,18 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir)
goto out_drop_write;
type = ovl_path_type(dentry);
- if (OVL_TYPE_PURE_UPPER(type)) {
- err = ovl_remove_upper(dentry, is_dir);
- } else {
- const struct cred *old_cred = ovl_override_creds(dentry->d_sb);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ if (OVL_TYPE_PURE_UPPER(type))
+ err = ovl_remove_upper(dentry, is_dir);
+ else
err = ovl_remove_and_whiteout(dentry, is_dir);
-
- revert_creds(old_cred);
+ revert_creds(old_cred);
+ if (!err) {
+ if (is_dir)
+ clear_nlink(dentry->d_inode);
+ else
+ drop_nlink(dentry->d_inode);
}
out_drop_write:
ovl_drop_write(dentry);
@@ -760,8 +798,7 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
- if (old_opaque || new_opaque)
- old_cred = ovl_override_creds(old->d_sb);
+ old_cred = ovl_override_creds(old->d_sb);
if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
opaquedir = ovl_check_empty_and_clear(new);
@@ -891,8 +928,7 @@ out_dput_old:
out_unlock:
unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
- if (old_opaque || new_opaque)
- revert_creds(old_cred);
+ revert_creds(old_cred);
out_drop_write:
ovl_drop_write(old);
out:
@@ -913,8 +949,10 @@ const struct inode_operations ovl_dir_inode_operations = {
.mknod = ovl_mknod,
.permission = ovl_permission,
.getattr = ovl_dir_getattr,
- .setxattr = ovl_setxattr,
+ .setxattr = generic_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
+ .get_acl = ovl_get_acl,
+ .update_time = ovl_update_time,
};
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index d1cdc60dd68f..1b885c156028 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -41,6 +41,7 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
{
int err;
struct dentry *upperdentry;
+ const struct cred *old_cred;
/*
* Check for permissions before trying to copy-up. This is redundant
@@ -84,7 +85,9 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
attr->ia_valid &= ~ATTR_MODE;
inode_lock(upperdentry->d_inode);
+ old_cred = ovl_override_creds(dentry->d_sb);
err = notify_change(upperdentry, attr, NULL);
+ revert_creds(old_cred);
if (!err)
ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
inode_unlock(upperdentry->d_inode);
@@ -102,96 +105,46 @@ static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct path realpath;
+ const struct cred *old_cred;
+ int err;
ovl_path_real(dentry, &realpath);
- return vfs_getattr(&realpath, stat);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ err = vfs_getattr(&realpath, stat);
+ revert_creds(old_cred);
+ return err;
}
int ovl_permission(struct inode *inode, int mask)
{
- struct ovl_entry *oe;
- struct dentry *alias = NULL;
- struct inode *realinode;
- struct dentry *realdentry;
bool is_upper;
+ struct inode *realinode = ovl_inode_real(inode, &is_upper);
+ const struct cred *old_cred;
int err;
- if (S_ISDIR(inode->i_mode)) {
- oe = inode->i_private;
- } else if (mask & MAY_NOT_BLOCK) {
- return -ECHILD;
- } else {
- /*
- * For non-directories find an alias and get the info
- * from there.
- */
- alias = d_find_any_alias(inode);
- if (WARN_ON(!alias))
- return -ENOENT;
-
- oe = alias->d_fsdata;
- }
-
- realdentry = ovl_entry_real(oe, &is_upper);
-
- if (ovl_is_default_permissions(inode)) {
- struct kstat stat;
- struct path realpath = { .dentry = realdentry };
-
- if (mask & MAY_NOT_BLOCK)
- return -ECHILD;
-
- realpath.mnt = ovl_entry_mnt_real(oe, inode, is_upper);
-
- err = vfs_getattr(&realpath, &stat);
- if (err)
- goto out_dput;
-
- err = -ESTALE;
- if ((stat.mode ^ inode->i_mode) & S_IFMT)
- goto out_dput;
-
- inode->i_mode = stat.mode;
- inode->i_uid = stat.uid;
- inode->i_gid = stat.gid;
-
- err = generic_permission(inode, mask);
- goto out_dput;
- }
-
/* Careful in RCU walk mode */
- realinode = ACCESS_ONCE(realdentry->d_inode);
if (!realinode) {
WARN_ON(!(mask & MAY_NOT_BLOCK));
- err = -ENOENT;
- goto out_dput;
+ return -ECHILD;
}
- if (mask & MAY_WRITE) {
- umode_t mode = realinode->i_mode;
-
- /*
- * Writes will always be redirected to upper layer, so
- * ignore lower layer being read-only.
- *
- * If the overlay itself is read-only then proceed
- * with the permission check, don't return EROFS.
- * This will only happen if this is the lower layer of
- * another overlayfs.
- *
- * If upper fs becomes read-only after the overlay was
- * constructed return EROFS to prevent modification of
- * upper layer.
- */
- err = -EROFS;
- if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
- (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
- goto out_dput;
+ /*
+ * Check overlay inode with the creds of task and underlying inode
+ * with creds of mounter
+ */
+ err = generic_permission(inode, mask);
+ if (err)
+ return err;
+
+ old_cred = ovl_override_creds(inode->i_sb);
+ if (!is_upper && !special_file(realinode->i_mode) && mask & MAY_WRITE) {
+ mask &= ~(MAY_WRITE | MAY_APPEND);
+ /* Make sure mounter can read file for copy up later */
+ mask |= MAY_READ;
}
+ err = inode_permission(realinode, mask);
+ revert_creds(old_cred);
- err = __inode_permission(realinode, mask);
-out_dput:
- dput(alias);
return err;
}
@@ -201,6 +154,8 @@ static const char *ovl_get_link(struct dentry *dentry,
{
struct dentry *realdentry;
struct inode *realinode;
+ const struct cred *old_cred;
+ const char *p;
if (!dentry)
return ERR_PTR(-ECHILD);
@@ -211,13 +166,18 @@ static const char *ovl_get_link(struct dentry *dentry,
if (WARN_ON(!realinode->i_op->get_link))
return ERR_PTR(-EPERM);
- return realinode->i_op->get_link(realdentry, realinode, done);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ p = realinode->i_op->get_link(realdentry, realinode, done);
+ revert_creds(old_cred);
+ return p;
}
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
struct path realpath;
struct inode *realinode;
+ const struct cred *old_cred;
+ int err;
ovl_path_real(dentry, &realpath);
realinode = realpath.dentry->d_inode;
@@ -225,15 +185,17 @@ static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
if (!realinode->i_op->readlink)
return -EINVAL;
- touch_atime(&realpath);
-
- return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ err = realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
+ revert_creds(old_cred);
+ return err;
}
-
static bool ovl_is_private_xattr(const char *name)
{
- return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
+#define OVL_XATTR_PRE_NAME OVL_XATTR_PREFIX "."
+ return strncmp(name, OVL_XATTR_PRE_NAME,
+ sizeof(OVL_XATTR_PRE_NAME) - 1) == 0;
}
int ovl_setxattr(struct dentry *dentry, struct inode *inode,
@@ -242,21 +204,20 @@ int ovl_setxattr(struct dentry *dentry, struct inode *inode,
{
int err;
struct dentry *upperdentry;
+ const struct cred *old_cred;
err = ovl_want_write(dentry);
if (err)
goto out;
- err = -EPERM;
- if (ovl_is_private_xattr(name))
- goto out_drop_write;
-
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
upperdentry = ovl_dentry_upper(dentry);
+ old_cred = ovl_override_creds(dentry->d_sb);
err = vfs_setxattr(upperdentry, name, value, size, flags);
+ revert_creds(old_cred);
out_drop_write:
ovl_drop_write(dentry);
@@ -268,11 +229,16 @@ ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode,
const char *name, void *value, size_t size)
{
struct dentry *realdentry = ovl_dentry_real(dentry);
+ ssize_t res;
+ const struct cred *old_cred;
if (ovl_is_private_xattr(name))
return -ENODATA;
- return vfs_getxattr(realdentry, name, value, size);
+ old_cred = ovl_override_creds(dentry->d_sb);
+ res = vfs_getxattr(realdentry, name, value, size);
+ revert_creds(old_cred);
+ return res;
}
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
@@ -280,8 +246,11 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
struct dentry *realdentry = ovl_dentry_real(dentry);
ssize_t res;
int off;
+ const struct cred *old_cred;
+ old_cred = ovl_override_creds(dentry->d_sb);
res = vfs_listxattr(realdentry, list, size);
+ revert_creds(old_cred);
if (res <= 0 || size == 0)
return res;
@@ -308,6 +277,7 @@ int ovl_removexattr(struct dentry *dentry, const char *name)
int err;
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
+ const struct cred *old_cred;
err = ovl_want_write(dentry);
if (err)
@@ -329,13 +299,34 @@ int ovl_removexattr(struct dentry *dentry, const char *name)
ovl_path_upper(dentry, &realpath);
}
+ old_cred = ovl_override_creds(dentry->d_sb);
err = vfs_removexattr(realpath.dentry, name);
+ revert_creds(old_cred);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
+struct posix_acl *ovl_get_acl(struct inode *inode, int type)
+{
+ struct inode *realinode = ovl_inode_real(inode, NULL);
+ const struct cred *old_cred;
+ struct posix_acl *acl;
+
+ if (!IS_POSIXACL(realinode))
+ return NULL;
+
+ if (!realinode->i_op->get_acl)
+ return NULL;
+
+ old_cred = ovl_override_creds(inode->i_sb);
+ acl = realinode->i_op->get_acl(realinode, type);
+ revert_creds(old_cred);
+
+ return acl;
+}
+
static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
struct dentry *realdentry)
{
@@ -351,46 +342,60 @@ static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
return true;
}
-struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags)
+int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags)
{
- int err;
+ int err = 0;
struct path realpath;
enum ovl_path_type type;
- if (d_is_dir(dentry))
- return d_backing_inode(dentry);
-
type = ovl_path_real(dentry, &realpath);
if (ovl_open_need_copy_up(file_flags, type, realpath.dentry)) {
err = ovl_want_write(dentry);
- if (err)
- return ERR_PTR(err);
+ if (!err) {
+ if (file_flags & O_TRUNC)
+ err = ovl_copy_up_truncate(dentry);
+ else
+ err = ovl_copy_up(dentry);
+ ovl_drop_write(dentry);
+ }
+ }
- if (file_flags & O_TRUNC)
- err = ovl_copy_up_truncate(dentry);
- else
- err = ovl_copy_up(dentry);
- ovl_drop_write(dentry);
- if (err)
- return ERR_PTR(err);
+ return err;
+}
- ovl_path_upper(dentry, &realpath);
+int ovl_update_time(struct inode *inode, struct timespec *ts, int flags)
+{
+ struct dentry *alias;
+ struct path upperpath;
+
+ if (!(flags & S_ATIME))
+ return 0;
+
+ alias = d_find_any_alias(inode);
+ if (!alias)
+ return 0;
+
+ ovl_path_upper(alias, &upperpath);
+ if (upperpath.dentry) {
+ touch_atime(&upperpath);
+ inode->i_atime = d_inode(upperpath.dentry)->i_atime;
}
- if (realpath.dentry->d_flags & DCACHE_OP_SELECT_INODE)
- return realpath.dentry->d_op->d_select_inode(realpath.dentry, file_flags);
+ dput(alias);
- return d_backing_inode(realpath.dentry);
+ return 0;
}
static const struct inode_operations ovl_file_inode_operations = {
.setattr = ovl_setattr,
.permission = ovl_permission,
.getattr = ovl_getattr,
- .setxattr = ovl_setxattr,
+ .setxattr = generic_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
+ .get_acl = ovl_get_acl,
+ .update_time = ovl_update_time,
};
static const struct inode_operations ovl_symlink_inode_operations = {
@@ -398,29 +403,22 @@ static const struct inode_operations ovl_symlink_inode_operations = {
.get_link = ovl_get_link,
.readlink = ovl_readlink,
.getattr = ovl_getattr,
- .setxattr = ovl_setxattr,
+ .setxattr = generic_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
+ .update_time = ovl_update_time,
};
-struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
- struct ovl_entry *oe)
+static void ovl_fill_inode(struct inode *inode, umode_t mode)
{
- struct inode *inode;
-
- inode = new_inode(sb);
- if (!inode)
- return NULL;
-
inode->i_ino = get_next_ino();
inode->i_mode = mode;
- inode->i_flags |= S_NOATIME | S_NOCMTIME;
+ inode->i_flags |= S_NOCMTIME;
mode &= S_IFMT;
switch (mode) {
case S_IFDIR:
- inode->i_private = oe;
inode->i_op = &ovl_dir_inode_operations;
inode->i_fop = &ovl_dir_operations;
break;
@@ -429,6 +427,10 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
inode->i_op = &ovl_symlink_inode_operations;
break;
+ default:
+ WARN(1, "illegal file type: %i\n", mode);
+ /* Fall through */
+
case S_IFREG:
case S_IFSOCK:
case S_IFBLK:
@@ -436,11 +438,42 @@ struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
case S_IFIFO:
inode->i_op = &ovl_file_inode_operations;
break;
+ }
+}
- default:
- WARN(1, "illegal file type: %i\n", mode);
- iput(inode);
- inode = NULL;
+struct inode *ovl_new_inode(struct super_block *sb, umode_t mode)
+{
+ struct inode *inode;
+
+ inode = new_inode(sb);
+ if (inode)
+ ovl_fill_inode(inode, mode);
+
+ return inode;
+}
+
+static int ovl_inode_test(struct inode *inode, void *data)
+{
+ return ovl_inode_real(inode, NULL) == data;
+}
+
+static int ovl_inode_set(struct inode *inode, void *data)
+{
+ inode->i_private = (void *) (((unsigned long) data) | OVL_ISUPPER_MASK);
+ return 0;
+}
+
+struct inode *ovl_get_inode(struct super_block *sb, struct inode *realinode)
+
+{
+ struct inode *inode;
+
+ inode = iget5_locked(sb, (unsigned long) realinode,
+ ovl_inode_test, ovl_inode_set, realinode);
+ if (inode && inode->i_state & I_NEW) {
+ ovl_fill_inode(inode, realinode->i_mode);
+ set_nlink(inode, realinode->i_nlink);
+ unlock_new_inode(inode);
}
return inode;
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index cfbca53590d0..e4f5c9536bfe 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -23,9 +23,11 @@ enum ovl_path_type {
#define OVL_TYPE_MERGE_OR_LOWER(type) \
(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
-#define OVL_XATTR_PRE_NAME "trusted.overlay."
-#define OVL_XATTR_PRE_LEN 16
-#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME"opaque"
+
+#define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay"
+#define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX ".opaque"
+
+#define OVL_ISUPPER_MASK 1UL
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
@@ -131,6 +133,16 @@ static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
return err;
}
+static inline struct inode *ovl_inode_real(struct inode *inode, bool *is_upper)
+{
+ unsigned long x = (unsigned long) READ_ONCE(inode->i_private);
+
+ if (is_upper)
+ *is_upper = x & OVL_ISUPPER_MASK;
+
+ return (struct inode *) (x & ~OVL_ISUPPER_MASK);
+}
+
enum ovl_path_type ovl_path_type(struct dentry *dentry);
u64 ovl_dentry_version_get(struct dentry *dentry);
void ovl_dentry_version_inc(struct dentry *dentry);
@@ -141,11 +153,9 @@ int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
-struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
struct vfsmount *ovl_entry_mnt_real(struct ovl_entry *oe, struct inode *inode,
bool is_upper);
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
-bool ovl_is_default_permissions(struct inode *inode);
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
struct dentry *ovl_workdir(struct dentry *dentry);
int ovl_want_write(struct dentry *dentry);
@@ -155,6 +165,7 @@ void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
bool ovl_is_whiteout(struct dentry *dentry);
const struct cred *ovl_override_creds(struct super_block *sb);
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
+void ovl_inode_update(struct inode *inode, struct inode *upperinode);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
struct file *ovl_path_open(struct path *path, int flags);
@@ -179,15 +190,20 @@ ssize_t ovl_getxattr(struct dentry *dentry, struct inode *inode,
const char *name, void *value, size_t size);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
int ovl_removexattr(struct dentry *dentry, const char *name);
-struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags);
+struct posix_acl *ovl_get_acl(struct inode *inode, int type);
+int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags);
+int ovl_update_time(struct inode *inode, struct timespec *ts, int flags);
-struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
- struct ovl_entry *oe);
+struct inode *ovl_new_inode(struct super_block *sb, umode_t mode);
+struct inode *ovl_get_inode(struct super_block *sb, struct inode *realinode);
static inline void ovl_copyattr(struct inode *from, struct inode *to)
{
to->i_uid = from->i_uid;
to->i_gid = from->i_gid;
to->i_mode = from->i_mode;
+ to->i_atime = from->i_atime;
+ to->i_mtime = from->i_mtime;
+ to->i_ctime = from->i_ctime;
}
/* dir.c */
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 9a7693d5f8ff..4036132842b5 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -16,10 +16,10 @@
#include <linux/slab.h>
#include <linux/parser.h>
#include <linux/module.h>
-#include <linux/pagemap.h>
#include <linux/sched.h>
#include <linux/statfs.h>
#include <linux/seq_file.h>
+#include <linux/posix_acl_xattr.h>
#include "overlayfs.h"
MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
@@ -145,18 +145,11 @@ struct dentry *ovl_dentry_real(struct dentry *dentry)
return realdentry;
}
-struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper)
+static void ovl_inode_init(struct inode *inode, struct inode *realinode,
+ bool is_upper)
{
- struct dentry *realdentry;
-
- realdentry = ovl_upperdentry_dereference(oe);
- if (realdentry) {
- *is_upper = true;
- } else {
- realdentry = __ovl_dentry_lower(oe);
- *is_upper = false;
- }
- return realdentry;
+ WRITE_ONCE(inode->i_private, (unsigned long) realinode |
+ (is_upper ? OVL_ISUPPER_MASK : 0));
}
struct vfsmount *ovl_entry_mnt_real(struct ovl_entry *oe, struct inode *inode,
@@ -178,13 +171,6 @@ struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry)
return oe->cache;
}
-bool ovl_is_default_permissions(struct inode *inode)
-{
- struct ovl_fs *ofs = inode->i_sb->s_fs_info;
-
- return ofs->config.default_permissions;
-}
-
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache)
{
struct ovl_entry *oe = dentry->d_fsdata;
@@ -235,7 +221,6 @@ void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry)
WARN_ON(!inode_is_locked(upperdentry->d_parent->d_inode));
WARN_ON(oe->__upperdentry);
- BUG_ON(!upperdentry->d_inode);
/*
* Make sure upperdentry is consistent before making it visible to
* ovl_upperdentry_dereference().
@@ -244,6 +229,16 @@ void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry)
oe->__upperdentry = upperdentry;
}
+void ovl_inode_update(struct inode *inode, struct inode *upperinode)
+{
+ WARN_ON(!upperinode);
+ WARN_ON(!inode_unhashed(inode));
+ WRITE_ONCE(inode->i_private,
+ (unsigned long) upperinode | OVL_ISUPPER_MASK);
+ if (!S_ISDIR(upperinode->i_mode))
+ __insert_inode_hash(inode, (unsigned long) upperinode);
+}
+
void ovl_dentry_version_inc(struct dentry *dentry)
{
struct ovl_entry *oe = dentry->d_fsdata;
@@ -304,7 +299,9 @@ static void ovl_dentry_release(struct dentry *dentry)
}
}
-static struct dentry *ovl_d_real(struct dentry *dentry, struct inode *inode)
+static struct dentry *ovl_d_real(struct dentry *dentry,
+ const struct inode *inode,
+ unsigned int open_flags)
{
struct dentry *real;
@@ -314,6 +311,16 @@ static struct dentry *ovl_d_real(struct dentry *dentry, struct inode *inode)
goto bug;
}
+ if (d_is_negative(dentry))
+ return dentry;
+
+ if (open_flags) {
+ int err = ovl_open_maybe_copy_up(dentry, open_flags);
+
+ if (err)
+ return ERR_PTR(err);
+ }
+
real = ovl_dentry_upper(dentry);
if (real && (!inode || inode == d_inode(real)))
return real;
@@ -326,11 +333,9 @@ static struct dentry *ovl_d_real(struct dentry *dentry, struct inode *inode)
return real;
/* Handle recursion */
- if (real->d_flags & DCACHE_OP_REAL)
- return real->d_op->d_real(real, inode);
-
+ return d_real(real, inode, open_flags);
bug:
- WARN(1, "ovl_d_real(%pd4, %s:%lu\n): real dentry not found\n", dentry,
+ WARN(1, "ovl_d_real(%pd4, %s:%lu): real dentry not found\n", dentry,
inode ? inode->i_sb->s_id : "NULL", inode ? inode->i_ino : 0);
return dentry;
}
@@ -378,13 +383,11 @@ static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags)
static const struct dentry_operations ovl_dentry_operations = {
.d_release = ovl_dentry_release,
- .d_select_inode = ovl_d_select_inode,
.d_real = ovl_d_real,
};
static const struct dentry_operations ovl_reval_dentry_operations = {
.d_release = ovl_dentry_release,
- .d_select_inode = ovl_d_select_inode,
.d_real = ovl_d_real,
.d_revalidate = ovl_dentry_revalidate,
.d_weak_revalidate = ovl_dentry_weak_revalidate,
@@ -404,7 +407,8 @@ static struct ovl_entry *ovl_alloc_entry(unsigned int numlower)
static bool ovl_dentry_remote(struct dentry *dentry)
{
return dentry->d_flags &
- (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE);
+ (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE |
+ DCACHE_OP_REAL);
}
static bool ovl_dentry_weird(struct dentry *dentry)
@@ -415,12 +419,16 @@ static bool ovl_dentry_weird(struct dentry *dentry)
DCACHE_OP_COMPARE);
}
-static inline struct dentry *ovl_lookup_real(struct dentry *dir,
- struct qstr *name)
+static inline struct dentry *ovl_lookup_real(struct super_block *ovl_sb,
+ struct dentry *dir,
+ const struct qstr *name)
{
+ const struct cred *old_cred;
struct dentry *dentry;
- dentry = lookup_hash(name, dir);
+ old_cred = ovl_override_creds(ovl_sb);
+ dentry = lookup_one_len_unlocked(name->name, dir, name->len);
+ revert_creds(old_cred);
if (IS_ERR(dentry)) {
if (PTR_ERR(dentry) == -ENOENT)
@@ -473,7 +481,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
upperdir = ovl_upperdentry_dereference(poe);
if (upperdir) {
- this = ovl_lookup_real(upperdir, &dentry->d_name);
+ this = ovl_lookup_real(dentry->d_sb, upperdir, &dentry->d_name);
err = PTR_ERR(this);
if (IS_ERR(this))
goto out;
@@ -506,7 +514,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
bool opaque = false;
struct path lowerpath = poe->lowerstack[i];
- this = ovl_lookup_real(lowerpath.dentry, &dentry->d_name);
+ this = ovl_lookup_real(dentry->d_sb,
+ lowerpath.dentry, &dentry->d_name);
err = PTR_ERR(this);
if (IS_ERR(this)) {
/*
@@ -561,12 +570,19 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
if (upperdentry || ctr) {
struct dentry *realdentry;
+ struct inode *realinode;
realdentry = upperdentry ? upperdentry : stack[0].dentry;
+ realinode = d_inode(realdentry);
err = -ENOMEM;
- inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode,
- oe);
+ if (upperdentry && !d_is_dir(upperdentry)) {
+ inode = ovl_get_inode(dentry->d_sb, realinode);
+ } else {
+ inode = ovl_new_inode(dentry->d_sb, realinode->i_mode);
+ if (inode)
+ ovl_inode_init(inode, realinode, !!upperdentry);
+ }
if (!inode)
goto out_free_oe;
ovl_copyattr(realdentry->d_inode, inode);
@@ -595,7 +611,7 @@ out:
struct file *ovl_path_open(struct path *path, int flags)
{
- return dentry_open(path, flags, current_cred());
+ return dentry_open(path, flags | O_NOATIME, current_cred());
}
static void ovl_put_super(struct super_block *sb)
@@ -678,6 +694,7 @@ static const struct super_operations ovl_super_operations = {
.statfs = ovl_statfs,
.show_options = ovl_show_options,
.remount_fs = ovl_remount,
+ .drop_inode = generic_delete_inode,
};
enum {
@@ -950,11 +967,102 @@ static unsigned int ovl_split_lowerdirs(char *str)
return ctr;
}
+static int ovl_posix_acl_xattr_set(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ struct dentry *workdir = ovl_workdir(dentry);
+ struct inode *realinode = ovl_inode_real(inode, NULL);
+ struct posix_acl *acl = NULL;
+ int err;
+
+ /* Check that everything is OK before copy-up */
+ if (value) {
+ acl = posix_acl_from_xattr(&init_user_ns, value, size);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ }
+ err = -EOPNOTSUPP;
+ if (!IS_POSIXACL(d_inode(workdir)))
+ goto out_acl_release;
+ if (!realinode->i_op->set_acl)
+ goto out_acl_release;
+ if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) {
+ err = acl ? -EACCES : 0;
+ goto out_acl_release;
+ }
+ err = -EPERM;
+ if (!inode_owner_or_capable(inode))
+ goto out_acl_release;
+
+ posix_acl_release(acl);
+
+ return ovl_setxattr(dentry, inode, handler->name, value, size, flags);
+
+out_acl_release:
+ posix_acl_release(acl);
+ return err;
+}
+
+static int ovl_other_xattr_set(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ return ovl_setxattr(dentry, inode, name, value, size, flags);
+}
+
+static int ovl_own_xattr_set(const struct xattr_handler *handler,
+ struct dentry *dentry, struct inode *inode,
+ const char *name, const void *value,
+ size_t size, int flags)
+{
+ return -EPERM;
+}
+
+static const struct xattr_handler ovl_posix_acl_access_xattr_handler = {
+ .name = XATTR_NAME_POSIX_ACL_ACCESS,
+ .flags = ACL_TYPE_ACCESS,
+ .set = ovl_posix_acl_xattr_set,
+};
+
+static const struct xattr_handler ovl_posix_acl_default_xattr_handler = {
+ .name = XATTR_NAME_POSIX_ACL_DEFAULT,
+ .flags = ACL_TYPE_DEFAULT,
+ .set = ovl_posix_acl_xattr_set,
+};
+
+static const struct xattr_handler ovl_own_xattr_handler = {
+ .prefix = OVL_XATTR_PREFIX,
+ .set = ovl_own_xattr_set,
+};
+
+static const struct xattr_handler ovl_other_xattr_handler = {
+ .prefix = "", /* catch all */
+ .set = ovl_other_xattr_set,
+};
+
+static const struct xattr_handler *ovl_xattr_handlers[] = {
+ &ovl_posix_acl_access_xattr_handler,
+ &ovl_posix_acl_default_xattr_handler,
+ &ovl_own_xattr_handler,
+ &ovl_other_xattr_handler,
+ NULL
+};
+
+static const struct xattr_handler *ovl_xattr_noacl_handlers[] = {
+ &ovl_own_xattr_handler,
+ &ovl_other_xattr_handler,
+ NULL,
+};
+
static int ovl_fill_super(struct super_block *sb, void *data, int silent)
{
struct path upperpath = { NULL, NULL };
struct path workpath = { NULL, NULL };
struct dentry *root_dentry;
+ struct inode *realinode;
struct ovl_entry *oe;
struct ovl_fs *ufs;
struct path *stack = NULL;
@@ -1061,6 +1169,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
pr_err("overlayfs: failed to clone upperpath\n");
goto out_put_lowerpath;
}
+ /* Don't inherit atime flags */
+ ufs->upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME);
+
+ sb->s_time_gran = ufs->upper_mnt->mnt_sb->s_time_gran;
ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry);
err = PTR_ERR(ufs->workdir);
@@ -1108,7 +1220,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
* Make lower_mnt R/O. That way fchmod/fchown on lower file
* will fail instead of modifying lower fs.
*/
- mnt->mnt_flags |= MNT_READONLY;
+ mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME;
ufs->lower_mnt[ufs->numlower] = mnt;
ufs->numlower++;
@@ -1132,7 +1244,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
if (!oe)
goto out_put_cred;
- root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, oe));
+ root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR));
if (!root_dentry)
goto out_free_oe;
@@ -1151,13 +1263,19 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
root_dentry->d_fsdata = oe;
- ovl_copyattr(ovl_dentry_real(root_dentry)->d_inode,
- root_dentry->d_inode);
+ realinode = d_inode(ovl_dentry_real(root_dentry));
+ ovl_inode_init(d_inode(root_dentry), realinode, !!upperpath.dentry);
+ ovl_copyattr(realinode, d_inode(root_dentry));
sb->s_magic = OVERLAYFS_SUPER_MAGIC;
sb->s_op = &ovl_super_operations;
+ if (IS_ENABLED(CONFIG_FS_POSIX_ACL))
+ sb->s_xattr = ovl_xattr_handlers;
+ else
+ sb->s_xattr = ovl_xattr_noacl_handlers;
sb->s_root = root_dentry;
sb->s_fs_info = ufs;
+ sb->s_flags |= MS_POSIXACL;
return 0;
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index edc452c2a563..59d47ab0791a 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -205,7 +205,7 @@ posix_acl_clone(const struct posix_acl *acl, gfp_t flags)
* Check if an acl is valid. Returns 0 if it is, or -E... otherwise.
*/
int
-posix_acl_valid(const struct posix_acl *acl)
+posix_acl_valid(struct user_namespace *user_ns, const struct posix_acl *acl)
{
const struct posix_acl_entry *pa, *pe;
int state = ACL_USER_OBJ;
@@ -225,7 +225,7 @@ posix_acl_valid(const struct posix_acl *acl)
case ACL_USER:
if (state != ACL_USER)
return -EINVAL;
- if (!uid_valid(pa->e_uid))
+ if (!kuid_has_mapping(user_ns, pa->e_uid))
return -EINVAL;
needs_mask = 1;
break;
@@ -240,7 +240,7 @@ posix_acl_valid(const struct posix_acl *acl)
case ACL_GROUP:
if (state != ACL_GROUP)
return -EINVAL;
- if (!gid_valid(pa->e_gid))
+ if (!kgid_has_mapping(user_ns, pa->e_gid))
return -EINVAL;
needs_mask = 1;
break;
@@ -834,7 +834,7 @@ set_posix_acl(struct inode *inode, int type, struct posix_acl *acl)
return -EPERM;
if (acl) {
- int ret = posix_acl_valid(acl);
+ int ret = posix_acl_valid(inode->i_sb->s_user_ns, acl);
if (ret)
return ret;
}
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 7151ea428041..12c6922c913c 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -4,6 +4,7 @@
obj-y += proc.o
+CFLAGS_task_mmu.o += $(call cc-option,-Wno-override-init,)
proc-y := nommu.o task_nommu.o
proc-$(CONFIG_MMU) := task_mmu.o
diff --git a/fs/proc/base.c b/fs/proc/base.c
index a11eb7196ec8..54e270262979 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -579,11 +579,8 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns,
unsigned long totalpages = totalram_pages + total_swap_pages;
unsigned long points = 0;
- read_lock(&tasklist_lock);
- if (pid_alive(task))
- points = oom_badness(task, NULL, NULL, totalpages) *
- 1000 / totalpages;
- read_unlock(&tasklist_lock);
+ points = oom_badness(task, NULL, NULL, totalpages) *
+ 1000 / totalpages;
seq_printf(m, "%lu\n", points);
return 0;
@@ -1024,23 +1021,107 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
char buffer[PROC_NUMBUF];
int oom_adj = OOM_ADJUST_MIN;
size_t len;
- unsigned long flags;
if (!task)
return -ESRCH;
- if (lock_task_sighand(task, &flags)) {
- if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
- oom_adj = OOM_ADJUST_MAX;
- else
- oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
- OOM_SCORE_ADJ_MAX;
- unlock_task_sighand(task, &flags);
- }
+ if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX)
+ oom_adj = OOM_ADJUST_MAX;
+ else
+ oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) /
+ OOM_SCORE_ADJ_MAX;
put_task_struct(task);
len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
return simple_read_from_buffer(buf, count, ppos, buffer, len);
}
+static int __set_oom_adj(struct file *file, int oom_adj, bool legacy)
+{
+ static DEFINE_MUTEX(oom_adj_mutex);
+ struct mm_struct *mm = NULL;
+ struct task_struct *task;
+ int err = 0;
+
+ task = get_proc_task(file_inode(file));
+ if (!task)
+ return -ESRCH;
+
+ mutex_lock(&oom_adj_mutex);
+ if (legacy) {
+ if (oom_adj < task->signal->oom_score_adj &&
+ !capable(CAP_SYS_RESOURCE)) {
+ err = -EACCES;
+ goto err_unlock;
+ }
+ /*
+ * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
+ * /proc/pid/oom_score_adj instead.
+ */
+ pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
+ current->comm, task_pid_nr(current), task_pid_nr(task),
+ task_pid_nr(task));
+ } else {
+ if ((short)oom_adj < task->signal->oom_score_adj_min &&
+ !capable(CAP_SYS_RESOURCE)) {
+ err = -EACCES;
+ goto err_unlock;
+ }
+ }
+
+ /*
+ * Make sure we will check other processes sharing the mm if this is
+ * not vfrok which wants its own oom_score_adj.
+ * pin the mm so it doesn't go away and get reused after task_unlock
+ */
+ if (!task->vfork_done) {
+ struct task_struct *p = find_lock_task_mm(task);
+
+ if (p) {
+ if (atomic_read(&p->mm->mm_users) > 1) {
+ mm = p->mm;
+ atomic_inc(&mm->mm_count);
+ }
+ task_unlock(p);
+ }
+ }
+
+ task->signal->oom_score_adj = oom_adj;
+ if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
+ task->signal->oom_score_adj_min = (short)oom_adj;
+ trace_oom_score_adj_update(task);
+
+ if (mm) {
+ struct task_struct *p;
+
+ rcu_read_lock();
+ for_each_process(p) {
+ if (same_thread_group(task, p))
+ continue;
+
+ /* do not touch kernel threads or the global init */
+ if (p->flags & PF_KTHREAD || is_global_init(p))
+ continue;
+
+ task_lock(p);
+ if (!p->vfork_done && process_shares_mm(p, mm)) {
+ pr_info("updating oom_score_adj for %d (%s) from %d to %d because it shares mm with %d (%s). Report if this is unexpected.\n",
+ task_pid_nr(p), p->comm,
+ p->signal->oom_score_adj, oom_adj,
+ task_pid_nr(task), task->comm);
+ p->signal->oom_score_adj = oom_adj;
+ if (!legacy && has_capability_noaudit(current, CAP_SYS_RESOURCE))
+ p->signal->oom_score_adj_min = (short)oom_adj;
+ }
+ task_unlock(p);
+ }
+ rcu_read_unlock();
+ mmdrop(mm);
+ }
+err_unlock:
+ mutex_unlock(&oom_adj_mutex);
+ put_task_struct(task);
+ return err;
+}
+
/*
* /proc/pid/oom_adj exists solely for backwards compatibility with previous
* kernels. The effective policy is defined by oom_score_adj, which has a
@@ -1054,10 +1135,8 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
static ssize_t oom_adj_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- struct task_struct *task;
char buffer[PROC_NUMBUF];
int oom_adj;
- unsigned long flags;
int err;
memset(buffer, 0, sizeof(buffer));
@@ -1077,23 +1156,6 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf,
goto out;
}
- task = get_proc_task(file_inode(file));
- if (!task) {
- err = -ESRCH;
- goto out;
- }
-
- task_lock(task);
- if (!task->mm) {
- err = -EINVAL;
- goto err_task_lock;
- }
-
- if (!lock_task_sighand(task, &flags)) {
- err = -ESRCH;
- goto err_task_lock;
- }
-
/*
* Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
* value is always attainable.
@@ -1103,27 +1165,7 @@ static ssize_t oom_adj_write(struct file *file, const char __user *buf,
else
oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
- if (oom_adj < task->signal->oom_score_adj &&
- !capable(CAP_SYS_RESOURCE)) {
- err = -EACCES;
- goto err_sighand;
- }
-
- /*
- * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
- * /proc/pid/oom_score_adj instead.
- */
- pr_warn_once("%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
- current->comm, task_pid_nr(current), task_pid_nr(task),
- task_pid_nr(task));
-
- task->signal->oom_score_adj = oom_adj;
- trace_oom_score_adj_update(task);
-err_sighand:
- unlock_task_sighand(task, &flags);
-err_task_lock:
- task_unlock(task);
- put_task_struct(task);
+ err = __set_oom_adj(file, oom_adj, true);
out:
return err < 0 ? err : count;
}
@@ -1140,15 +1182,11 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
struct task_struct *task = get_proc_task(file_inode(file));
char buffer[PROC_NUMBUF];
short oom_score_adj = OOM_SCORE_ADJ_MIN;
- unsigned long flags;
size_t len;
if (!task)
return -ESRCH;
- if (lock_task_sighand(task, &flags)) {
- oom_score_adj = task->signal->oom_score_adj;
- unlock_task_sighand(task, &flags);
- }
+ oom_score_adj = task->signal->oom_score_adj;
put_task_struct(task);
len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
return simple_read_from_buffer(buf, count, ppos, buffer, len);
@@ -1157,9 +1195,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- struct task_struct *task;
char buffer[PROC_NUMBUF];
- unsigned long flags;
int oom_score_adj;
int err;
@@ -1180,39 +1216,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
goto out;
}
- task = get_proc_task(file_inode(file));
- if (!task) {
- err = -ESRCH;
- goto out;
- }
-
- task_lock(task);
- if (!task->mm) {
- err = -EINVAL;
- goto err_task_lock;
- }
-
- if (!lock_task_sighand(task, &flags)) {
- err = -ESRCH;
- goto err_task_lock;
- }
-
- if ((short)oom_score_adj < task->signal->oom_score_adj_min &&
- !capable(CAP_SYS_RESOURCE)) {
- err = -EACCES;
- goto err_sighand;
- }
-
- task->signal->oom_score_adj = (short)oom_score_adj;
- if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
- task->signal->oom_score_adj_min = (short)oom_score_adj;
- trace_oom_score_adj_update(task);
-
-err_sighand:
- unlock_task_sighand(task, &flags);
-err_task_lock:
- task_unlock(task);
- put_task_struct(task);
+ err = __set_oom_adj(file, oom_score_adj, false);
out:
return err < 0 ? err : count;
}
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 42305ddcbaa0..c1b72388e571 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -457,17 +457,30 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
return inode;
}
-int proc_fill_super(struct super_block *s)
+int proc_fill_super(struct super_block *s, void *data, int silent)
{
+ struct pid_namespace *ns = get_pid_ns(s->s_fs_info);
struct inode *root_inode;
int ret;
+ if (!proc_parse_options(data, ns))
+ return -EINVAL;
+
+ /* User space would break if executables or devices appear on proc */
+ s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV;
s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC;
s->s_blocksize = 1024;
s->s_blocksize_bits = 10;
s->s_magic = PROC_SUPER_MAGIC;
s->s_op = &proc_sops;
s->s_time_gran = 1;
+
+ /*
+ * procfs isn't actually a stacking filesystem; however, there is
+ * too much magic going on inside it to permit stacking things on
+ * top of it
+ */
+ s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
pde_get(&proc_root);
root_inode = proc_get_inode(s, &proc_root);
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index aa2781095bd1..7931c558c192 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -212,7 +212,7 @@ extern const struct inode_operations proc_pid_link_inode_operations;
extern void proc_init_inodecache(void);
extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *);
-extern int proc_fill_super(struct super_block *);
+extern int proc_fill_super(struct super_block *, void *data, int flags);
extern void proc_entry_rundown(struct proc_dir_entry *);
/*
@@ -268,6 +268,7 @@ static inline void proc_tty_init(void) {}
* root.c
*/
extern struct proc_dir_entry proc_root;
+extern int proc_parse_options(char *options, struct pid_namespace *pid);
extern void proc_self_init(void);
extern int proc_remount(struct super_block *, int *, char *);
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index cf301a9ef512..09e18fdf61e5 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -40,7 +40,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
si_swapinfo(&i);
committed = percpu_counter_read_positive(&vm_committed_as);
- cached = global_page_state(NR_FILE_PAGES) -
+ cached = global_node_page_state(NR_FILE_PAGES) -
total_swapcache_pages() - i.bufferram;
if (cached < 0)
cached = 0;
@@ -138,23 +138,23 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
#endif
K(i.totalswap),
K(i.freeswap),
- K(global_page_state(NR_FILE_DIRTY)),
- K(global_page_state(NR_WRITEBACK)),
- K(global_page_state(NR_ANON_PAGES)),
- K(global_page_state(NR_FILE_MAPPED)),
+ K(global_node_page_state(NR_FILE_DIRTY)),
+ K(global_node_page_state(NR_WRITEBACK)),
+ K(global_node_page_state(NR_ANON_MAPPED)),
+ K(global_node_page_state(NR_FILE_MAPPED)),
K(i.sharedram),
K(global_page_state(NR_SLAB_RECLAIMABLE) +
global_page_state(NR_SLAB_UNRECLAIMABLE)),
K(global_page_state(NR_SLAB_RECLAIMABLE)),
K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
- global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,
+ global_page_state(NR_KERNEL_STACK_KB),
K(global_page_state(NR_PAGETABLE)),
#ifdef CONFIG_QUICKLIST
K(quicklist_total_size()),
#endif
- K(global_page_state(NR_UNSTABLE_NFS)),
+ K(global_node_page_state(NR_UNSTABLE_NFS)),
K(global_page_state(NR_BOUNCE)),
- K(global_page_state(NR_WRITEBACK_TEMP)),
+ K(global_node_page_state(NR_WRITEBACK_TEMP)),
K(vm_commit_limit()),
K(committed),
(unsigned long)VMALLOC_TOTAL >> 10,
@@ -164,9 +164,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
, atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- , K(global_page_state(NR_ANON_THPS) * HPAGE_PMD_NR)
- , K(global_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR)
- , K(global_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR)
+ , K(global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR)
+ , K(global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR)
+ , K(global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR)
#endif
#ifdef CONFIG_CMA
, K(totalcma_pages)
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 5e57c3e46e1d..b59db94d2ff4 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -623,7 +623,7 @@ static bool proc_sys_fill_cache(struct file *file,
qname.name = table->procname;
qname.len = strlen(table->procname);
- qname.hash = full_name_hash(qname.name, qname.len);
+ qname.hash = full_name_hash(dir, qname.name, qname.len);
child = d_lookup(dir, &qname);
if (!child) {
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 06702783bf40..8d3e484055a6 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -23,21 +23,6 @@
#include "internal.h"
-static int proc_test_super(struct super_block *sb, void *data)
-{
- return sb->s_fs_info == data;
-}
-
-static int proc_set_super(struct super_block *sb, void *data)
-{
- int err = set_anon_super(sb, NULL);
- if (!err) {
- struct pid_namespace *ns = (struct pid_namespace *)data;
- sb->s_fs_info = get_pid_ns(ns);
- }
- return err;
-}
-
enum {
Opt_gid, Opt_hidepid, Opt_err,
};
@@ -48,7 +33,7 @@ static const match_table_t tokens = {
{Opt_err, NULL},
};
-static int proc_parse_options(char *options, struct pid_namespace *pid)
+int proc_parse_options(char *options, struct pid_namespace *pid)
{
char *p;
substring_t args[MAX_OPT_ARGS];
@@ -100,52 +85,16 @@ int proc_remount(struct super_block *sb, int *flags, char *data)
static struct dentry *proc_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
- int err;
- struct super_block *sb;
struct pid_namespace *ns;
- char *options;
if (flags & MS_KERNMOUNT) {
- ns = (struct pid_namespace *)data;
- options = NULL;
+ ns = data;
+ data = NULL;
} else {
ns = task_active_pid_ns(current);
- options = data;
-
- /* Does the mounter have privilege over the pid namespace? */
- if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN))
- return ERR_PTR(-EPERM);
- }
-
- sb = sget(fs_type, proc_test_super, proc_set_super, flags, ns);
- if (IS_ERR(sb))
- return ERR_CAST(sb);
-
- /*
- * procfs isn't actually a stacking filesystem; however, there is
- * too much magic going on inside it to permit stacking things on
- * top of it
- */
- sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
-
- if (!proc_parse_options(options, ns)) {
- deactivate_locked_super(sb);
- return ERR_PTR(-EINVAL);
- }
-
- if (!sb->s_root) {
- err = proc_fill_super(sb);
- if (err) {
- deactivate_locked_super(sb);
- return ERR_PTR(err);
- }
-
- sb->s_flags |= MS_ACTIVE;
- /* User space would break if executables appear on proc */
- sb->s_iflags |= SB_I_NOEXEC;
}
- return dget(sb->s_root);
+ return mount_ns(fs_type, flags, data, ns, ns->user_ns, proc_fill_super);
}
static void proc_kill_sb(struct super_block *sb)
@@ -165,7 +114,7 @@ static struct file_system_type proc_fs_type = {
.name = "proc",
.mount = proc_mount,
.kill_sb = proc_kill_sb,
- .fs_flags = FS_USERNS_VISIBLE | FS_USERNS_MOUNT,
+ .fs_flags = FS_USERNS_MOUNT,
};
void __init proc_root_init(void)
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 510413eb25b8..7907e456ac4f 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -80,19 +80,17 @@ static u64 get_iowait_time(int cpu)
static int show_stat(struct seq_file *p, void *v)
{
int i, j;
- unsigned long jif;
u64 user, nice, system, idle, iowait, irq, softirq, steal;
u64 guest, guest_nice;
u64 sum = 0;
u64 sum_softirq = 0;
unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
- struct timespec boottime;
+ struct timespec64 boottime;
user = nice = system = idle = iowait =
irq = softirq = steal = 0;
guest = guest_nice = 0;
- getboottime(&boottime);
- jif = boottime.tv_sec;
+ getboottime64(&boottime);
for_each_possible_cpu(i) {
user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
@@ -163,12 +161,12 @@ static int show_stat(struct seq_file *p, void *v)
seq_printf(p,
"\nctxt %llu\n"
- "btime %lu\n"
+ "btime %llu\n"
"processes %lu\n"
"procs_running %lu\n"
"procs_blocked %lu\n",
nr_context_switches(),
- (unsigned long)jif,
+ (unsigned long long)boottime.tv_sec,
total_forks,
nr_running(),
nr_iowait());
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 47516a794011..7a034d62cf8c 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -486,30 +486,21 @@ static int ramoops_parse_dt(struct platform_device *pdev,
struct ramoops_platform_data *pdata)
{
struct device_node *of_node = pdev->dev.of_node;
- struct device_node *mem_region;
- struct resource res;
+ struct resource *res;
u32 value;
int ret;
dev_dbg(&pdev->dev, "using Device Tree\n");
- mem_region = of_parse_phandle(of_node, "memory-region", 0);
- if (!mem_region) {
- dev_err(&pdev->dev, "no memory-region phandle\n");
- return -ENODEV;
- }
-
- ret = of_address_to_resource(mem_region, 0, &res);
- of_node_put(mem_region);
- if (ret) {
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ if (!res) {
dev_err(&pdev->dev,
- "failed to translate memory-region to resource: %d\n",
- ret);
- return ret;
+ "failed to locate DT /reserved-memory resource\n");
+ return -EINVAL;
}
- pdata->mem_size = resource_size(&res);
- pdata->mem_address = res.start;
+ pdata->mem_size = resource_size(res);
+ pdata->mem_address = res->start;
pdata->mem_type = of_property_read_bool(of_node, "unbuffered");
pdata->dump_oops = !of_property_read_bool(of_node, "no-dump-oops");
@@ -652,11 +643,11 @@ fail_buf:
kfree(cxt->pstore.buf);
fail_clear:
cxt->pstore.bufsize = 0;
- kfree(cxt->mprz);
+ persistent_ram_free(cxt->mprz);
fail_init_mprz:
- kfree(cxt->fprz);
+ persistent_ram_free(cxt->fprz);
fail_init_fprz:
- kfree(cxt->cprz);
+ persistent_ram_free(cxt->cprz);
fail_init_cprz:
ramoops_free_przs(cxt);
fail_out:
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index ff21980d0119..1bfac28b7e7d 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -841,6 +841,9 @@ struct dquot *dqget(struct super_block *sb, struct kqid qid)
unsigned int hashent = hashfn(sb, qid);
struct dquot *dquot, *empty = NULL;
+ if (!qid_has_mapping(sb->s_user_ns, qid))
+ return ERR_PTR(-EINVAL);
+
if (!sb_has_quota_active(sb, qid.type))
return ERR_PTR(-ESRCH);
we_slept:
@@ -1133,7 +1136,7 @@ static void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
else
dquot->dq_dqb.dqb_curinodes = 0;
if (dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit)
- dquot->dq_dqb.dqb_itime = (time_t) 0;
+ dquot->dq_dqb.dqb_itime = (time64_t) 0;
clear_bit(DQ_INODES_B, &dquot->dq_flags);
}
@@ -1145,7 +1148,7 @@ static void dquot_decr_space(struct dquot *dquot, qsize_t number)
else
dquot->dq_dqb.dqb_curspace = 0;
if (dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit)
- dquot->dq_dqb.dqb_btime = (time_t) 0;
+ dquot->dq_dqb.dqb_btime = (time64_t) 0;
clear_bit(DQ_BLKS_B, &dquot->dq_flags);
}
@@ -1292,7 +1295,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes,
if (dquot->dq_dqb.dqb_isoftlimit &&
newinodes > dquot->dq_dqb.dqb_isoftlimit &&
dquot->dq_dqb.dqb_itime &&
- get_seconds() >= dquot->dq_dqb.dqb_itime &&
+ ktime_get_real_seconds() >= dquot->dq_dqb.dqb_itime &&
!ignore_hardlimit(dquot)) {
prepare_warning(warn, dquot, QUOTA_NL_ISOFTLONGWARN);
return -EDQUOT;
@@ -1302,7 +1305,7 @@ static int check_idq(struct dquot *dquot, qsize_t inodes,
newinodes > dquot->dq_dqb.dqb_isoftlimit &&
dquot->dq_dqb.dqb_itime == 0) {
prepare_warning(warn, dquot, QUOTA_NL_ISOFTWARN);
- dquot->dq_dqb.dqb_itime = get_seconds() +
+ dquot->dq_dqb.dqb_itime = ktime_get_real_seconds() +
sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type].dqi_igrace;
}
@@ -1334,7 +1337,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc,
if (dquot->dq_dqb.dqb_bsoftlimit &&
tspace > dquot->dq_dqb.dqb_bsoftlimit &&
dquot->dq_dqb.dqb_btime &&
- get_seconds() >= dquot->dq_dqb.dqb_btime &&
+ ktime_get_real_seconds() >= dquot->dq_dqb.dqb_btime &&
!ignore_hardlimit(dquot)) {
if (!prealloc)
prepare_warning(warn, dquot, QUOTA_NL_BSOFTLONGWARN);
@@ -1346,7 +1349,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc,
dquot->dq_dqb.dqb_btime == 0) {
if (!prealloc) {
prepare_warning(warn, dquot, QUOTA_NL_BSOFTWARN);
- dquot->dq_dqb.dqb_btime = get_seconds() +
+ dquot->dq_dqb.dqb_btime = ktime_get_real_seconds() +
sb_dqopt(sb)->info[dquot->dq_id.type].dqi_bgrace;
}
else
@@ -2268,6 +2271,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
error = -EINVAL;
goto out_fmt;
}
+ /* Filesystems outside of init_user_ns not yet supported */
+ if (sb->s_user_ns != &init_user_ns) {
+ error = -EINVAL;
+ goto out_fmt;
+ }
/* Usage always has to be set... */
if (!(flags & DQUOT_USAGE_ENABLED)) {
error = -EINVAL;
@@ -2695,7 +2703,7 @@ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di)
clear_bit(DQ_BLKS_B, &dquot->dq_flags);
} else if (!(di->d_fieldmask & QC_SPC_TIMER))
/* Set grace only if user hasn't provided his own... */
- dm->dqb_btime = get_seconds() + dqi->dqi_bgrace;
+ dm->dqb_btime = ktime_get_real_seconds() + dqi->dqi_bgrace;
}
if (check_ilim) {
if (!dm->dqb_isoftlimit ||
@@ -2704,7 +2712,7 @@ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di)
clear_bit(DQ_INODES_B, &dquot->dq_flags);
} else if (!(di->d_fieldmask & QC_INO_TIMER))
/* Set grace only if user hasn't provided his own... */
- dm->dqb_itime = get_seconds() + dqi->dqi_igrace;
+ dm->dqb_itime = ktime_get_real_seconds() + dqi->dqi_igrace;
}
if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit ||
dm->dqb_isoftlimit)
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 0f10ee9892ce..35df08ee9c97 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -211,7 +211,7 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id,
if (!sb->s_qcop->get_dqblk)
return -ENOSYS;
qid = make_kqid(current_user_ns(), type, id);
- if (!qid_valid(qid))
+ if (!qid_has_mapping(sb->s_user_ns, qid))
return -EINVAL;
ret = sb->s_qcop->get_dqblk(sb, qid, &fdq);
if (ret)
@@ -237,7 +237,7 @@ static int quota_getnextquota(struct super_block *sb, int type, qid_t id,
if (!sb->s_qcop->get_nextdqblk)
return -ENOSYS;
qid = make_kqid(current_user_ns(), type, id);
- if (!qid_valid(qid))
+ if (!qid_has_mapping(sb->s_user_ns, qid))
return -EINVAL;
ret = sb->s_qcop->get_nextdqblk(sb, &qid, &fdq);
if (ret)
@@ -288,7 +288,7 @@ static int quota_setquota(struct super_block *sb, int type, qid_t id,
if (!sb->s_qcop->set_dqblk)
return -ENOSYS;
qid = make_kqid(current_user_ns(), type, id);
- if (!qid_valid(qid))
+ if (!qid_has_mapping(sb->s_user_ns, qid))
return -EINVAL;
copy_from_if_dqblk(&fdq, &idq);
return sb->s_qcop->set_dqblk(sb, qid, &fdq);
@@ -581,10 +581,10 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id,
if (!sb->s_qcop->set_dqblk)
return -ENOSYS;
qid = make_kqid(current_user_ns(), type, id);
- if (!qid_valid(qid))
+ if (!qid_has_mapping(sb->s_user_ns, qid))
return -EINVAL;
/* Are we actually setting timer / warning limits for all users? */
- if (from_kqid(&init_user_ns, qid) == 0 &&
+ if (from_kqid(sb->s_user_ns, qid) == 0 &&
fdq.d_fieldmask & (FS_DQ_WARNS_MASK | FS_DQ_TIMER_MASK)) {
struct qc_info qinfo;
int ret;
@@ -642,7 +642,7 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
if (!sb->s_qcop->get_dqblk)
return -ENOSYS;
qid = make_kqid(current_user_ns(), type, id);
- if (!qid_valid(qid))
+ if (!qid_has_mapping(sb->s_user_ns, qid))
return -EINVAL;
ret = sb->s_qcop->get_dqblk(sb, qid, &qdq);
if (ret)
@@ -669,7 +669,7 @@ static int quota_getnextxquota(struct super_block *sb, int type, qid_t id,
if (!sb->s_qcop->get_nextdqblk)
return -ENOSYS;
qid = make_kqid(current_user_ns(), type, id);
- if (!qid_valid(qid))
+ if (!qid_has_mapping(sb->s_user_ns, qid))
return -EINVAL;
ret = sb->s_qcop->get_nextdqblk(sb, &qid, &qdq);
if (ret)
diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c
index b751eea32e20..5db6f45b3fed 100644
--- a/fs/reiserfs/ibalance.c
+++ b/fs/reiserfs/ibalance.c
@@ -1153,8 +1153,9 @@ int balance_internal(struct tree_balance *tb,
insert_ptr);
}
- memcpy(new_insert_key_addr, &new_insert_key, KEY_SIZE);
insert_ptr[0] = new_insert_ptr;
+ if (new_insert_ptr)
+ memcpy(new_insert_key_addr, &new_insert_key, KEY_SIZE);
return order;
}
diff --git a/fs/super.c b/fs/super.c
index 5806ffd45563..c2ff475c1711 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -33,6 +33,7 @@
#include <linux/cleancache.h>
#include <linux/fsnotify.h>
#include <linux/lockdep.h>
+#include <linux/user_namespace.h>
#include "internal.h"
@@ -165,6 +166,7 @@ static void destroy_super(struct super_block *s)
list_lru_destroy(&s->s_inode_lru);
security_sb_free(s);
WARN_ON(!list_empty(&s->s_mounts));
+ put_user_ns(s->s_user_ns);
kfree(s->s_subtype);
kfree(s->s_options);
call_rcu(&s->rcu, destroy_super_rcu);
@@ -174,11 +176,13 @@ static void destroy_super(struct super_block *s)
* alloc_super - create new superblock
* @type: filesystem type superblock should belong to
* @flags: the mount flags
+ * @user_ns: User namespace for the super_block
*
* Allocates and initializes a new &struct super_block. alloc_super()
* returns a pointer new superblock or %NULL if allocation had failed.
*/
-static struct super_block *alloc_super(struct file_system_type *type, int flags)
+static struct super_block *alloc_super(struct file_system_type *type, int flags,
+ struct user_namespace *user_ns)
{
struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER);
static const struct super_operations default_op;
@@ -188,6 +192,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
return NULL;
INIT_LIST_HEAD(&s->s_mounts);
+ s->s_user_ns = get_user_ns(user_ns);
if (security_sb_alloc(s))
goto fail;
@@ -201,6 +206,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
init_waitqueue_head(&s->s_writers.wait_unfrozen);
s->s_bdi = &noop_backing_dev_info;
s->s_flags = flags;
+ if (s->s_user_ns != &init_user_ns)
+ s->s_iflags |= SB_I_NODEV;
INIT_HLIST_NODE(&s->s_instances);
INIT_HLIST_BL_HEAD(&s->s_anon);
mutex_init(&s->s_sync_lock);
@@ -445,29 +452,42 @@ void generic_shutdown_super(struct super_block *sb)
EXPORT_SYMBOL(generic_shutdown_super);
/**
- * sget - find or create a superblock
+ * sget_userns - find or create a superblock
* @type: filesystem type superblock should belong to
* @test: comparison callback
* @set: setup callback
* @flags: mount flags
+ * @user_ns: User namespace for the super_block
* @data: argument to each of them
*/
-struct super_block *sget(struct file_system_type *type,
+struct super_block *sget_userns(struct file_system_type *type,
int (*test)(struct super_block *,void *),
int (*set)(struct super_block *,void *),
- int flags,
+ int flags, struct user_namespace *user_ns,
void *data)
{
struct super_block *s = NULL;
struct super_block *old;
int err;
+ if (!(flags & MS_KERNMOUNT) &&
+ !(type->fs_flags & FS_USERNS_MOUNT) &&
+ !capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
retry:
spin_lock(&sb_lock);
if (test) {
hlist_for_each_entry(old, &type->fs_supers, s_instances) {
if (!test(old, data))
continue;
+ if (user_ns != old->s_user_ns) {
+ spin_unlock(&sb_lock);
+ if (s) {
+ up_write(&s->s_umount);
+ destroy_super(s);
+ }
+ return ERR_PTR(-EBUSY);
+ }
if (!grab_super(old))
goto retry;
if (s) {
@@ -480,7 +500,7 @@ retry:
}
if (!s) {
spin_unlock(&sb_lock);
- s = alloc_super(type, flags);
+ s = alloc_super(type, flags, user_ns);
if (!s)
return ERR_PTR(-ENOMEM);
goto retry;
@@ -503,6 +523,31 @@ retry:
return s;
}
+EXPORT_SYMBOL(sget_userns);
+
+/**
+ * sget - find or create a superblock
+ * @type: filesystem type superblock should belong to
+ * @test: comparison callback
+ * @set: setup callback
+ * @flags: mount flags
+ * @data: argument to each of them
+ */
+struct super_block *sget(struct file_system_type *type,
+ int (*test)(struct super_block *,void *),
+ int (*set)(struct super_block *,void *),
+ int flags,
+ void *data)
+{
+ struct user_namespace *user_ns = current_user_ns();
+
+ /* Ensure the requestor has permissions over the target filesystem */
+ if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ return sget_userns(type, test, set, flags, user_ns, data);
+}
+
EXPORT_SYMBOL(sget);
void drop_super(struct super_block *sb)
@@ -920,12 +965,20 @@ static int ns_set_super(struct super_block *sb, void *data)
return set_anon_super(sb, NULL);
}
-struct dentry *mount_ns(struct file_system_type *fs_type, int flags,
- void *data, int (*fill_super)(struct super_block *, void *, int))
+struct dentry *mount_ns(struct file_system_type *fs_type,
+ int flags, void *data, void *ns, struct user_namespace *user_ns,
+ int (*fill_super)(struct super_block *, void *, int))
{
struct super_block *sb;
- sb = sget(fs_type, ns_test_super, ns_set_super, flags, data);
+ /* Don't allow mounting unless the caller has CAP_SYS_ADMIN
+ * over the namespace.
+ */
+ if (!(flags & MS_KERNMOUNT) && !ns_capable(user_ns, CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
+ sb = sget_userns(fs_type, ns_test_super, ns_set_super, flags,
+ user_ns, ns);
if (IS_ERR(sb))
return ERR_CAST(sb);
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index f3db82071cfb..20b8f82e115b 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -41,8 +41,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
if (IS_ERR(root) || !new_sb)
kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
else if (new_sb)
- /* Userspace would break if executables appear on sysfs */
- root->d_sb->s_iflags |= SB_I_NOEXEC;
+ root->d_sb->s_iflags |= SB_I_USERNS_VISIBLE;
return root;
}
@@ -59,7 +58,7 @@ static struct file_system_type sysfs_fs_type = {
.name = "sysfs",
.mount = sysfs_mount,
.kill_sb = sysfs_kill_sb,
- .fs_flags = FS_USERNS_VISIBLE | FS_USERNS_MOUNT,
+ .fs_flags = FS_USERNS_MOUNT,
};
int __init sysfs_init(void)
diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index 90b60c03b588..a42de45ce40d 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -33,7 +33,7 @@ static int sysv_hash(const struct dentry *dentry, struct qstr *qstr)
function. */
if (qstr->len > SYSV_NAMELEN) {
qstr->len = SYSV_NAMELEN;
- qstr->hash = full_name_hash(qstr->name, qstr->len);
+ qstr->hash = full_name_hash(dentry, qstr->name, qstr->len);
}
return 0;
}
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 4a0e48f92104..ad40b64c5e2f 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -541,9 +541,6 @@ void tracefs_remove(struct dentry *dentry)
return;
parent = dentry->d_parent;
- if (!parent || !parent->d_inode)
- return;
-
inode_lock(parent->d_inode);
ret = __tracefs_remove(dentry, parent);
inode_unlock(parent->d_inode);
@@ -566,10 +563,6 @@ void tracefs_remove_recursive(struct dentry *dentry)
if (IS_ERR_OR_NULL(dentry))
return;
- parent = dentry->d_parent;
- if (!parent || !parent->d_inode)
- return;
-
parent = dentry;
down:
inode_lock(parent->d_inode);
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index 9718da86ad01..821b34816976 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -100,10 +100,6 @@ static int switch_gc_head(struct ubifs_info *c)
if (err)
return err;
- err = ubifs_wbuf_sync_nolock(wbuf);
- if (err)
- return err;
-
err = ubifs_add_bud_to_log(c, GCHD, gc_lnum, 0);
if (err)
return err;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 70349954e78b..4ec051089186 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -520,19 +520,19 @@ static int init_constants_early(struct ubifs_info *c)
c->max_write_shift = fls(c->max_write_size) - 1;
if (c->leb_size < UBIFS_MIN_LEB_SZ) {
- ubifs_err(c, "too small LEBs (%d bytes), min. is %d bytes",
- c->leb_size, UBIFS_MIN_LEB_SZ);
+ ubifs_errc(c, "too small LEBs (%d bytes), min. is %d bytes",
+ c->leb_size, UBIFS_MIN_LEB_SZ);
return -EINVAL;
}
if (c->leb_cnt < UBIFS_MIN_LEB_CNT) {
- ubifs_err(c, "too few LEBs (%d), min. is %d",
- c->leb_cnt, UBIFS_MIN_LEB_CNT);
+ ubifs_errc(c, "too few LEBs (%d), min. is %d",
+ c->leb_cnt, UBIFS_MIN_LEB_CNT);
return -EINVAL;
}
if (!is_power_of_2(c->min_io_size)) {
- ubifs_err(c, "bad min. I/O size %d", c->min_io_size);
+ ubifs_errc(c, "bad min. I/O size %d", c->min_io_size);
return -EINVAL;
}
@@ -543,8 +543,8 @@ static int init_constants_early(struct ubifs_info *c)
if (c->max_write_size < c->min_io_size ||
c->max_write_size % c->min_io_size ||
!is_power_of_2(c->max_write_size)) {
- ubifs_err(c, "bad write buffer size %d for %d min. I/O unit",
- c->max_write_size, c->min_io_size);
+ ubifs_errc(c, "bad write buffer size %d for %d min. I/O unit",
+ c->max_write_size, c->min_io_size);
return -EINVAL;
}
@@ -2108,8 +2108,9 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
*/
ubi = open_ubi(name, UBI_READONLY);
if (IS_ERR(ubi)) {
- pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d",
- current->pid, name, (int)PTR_ERR(ubi));
+ if (!(flags & MS_SILENT))
+ pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d",
+ current->pid, name, (int)PTR_ERR(ubi));
return ERR_CAST(ubi);
}
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index ddf9f6b9eee2..4617d459022a 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1783,8 +1783,8 @@ void ubifs_err(const struct ubifs_info *c, const char *fmt, ...);
__printf(2, 3)
void ubifs_warn(const struct ubifs_info *c, const char *fmt, ...);
/*
- * A variant of 'ubifs_err()' which takes the UBIFS file-sytem description
- * object as an argument.
+ * A conditional variant of 'ubifs_err()' which doesn't output anything
+ * if probing (ie. MS_SILENT set).
*/
#define ubifs_errc(c, fmt, ...) \
do { \
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index b5fc27969e9d..e237811f09ce 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -592,19 +592,19 @@ static int ubifs_xattr_set(const struct xattr_handler *handler,
return __ubifs_removexattr(inode, name);
}
-const struct xattr_handler ubifs_user_xattr_handler = {
+static const struct xattr_handler ubifs_user_xattr_handler = {
.prefix = XATTR_USER_PREFIX,
.get = ubifs_xattr_get,
.set = ubifs_xattr_set,
};
-const struct xattr_handler ubifs_trusted_xattr_handler = {
+static const struct xattr_handler ubifs_trusted_xattr_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.get = ubifs_xattr_get,
.set = ubifs_xattr_set,
};
-const struct xattr_handler ubifs_security_xattr_handler = {
+static const struct xattr_handler ubifs_security_xattr_handler = {
.prefix = XATTR_SECURITY_PREFIX,
.get = ubifs_xattr_get,
.set = ubifs_xattr_set,
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 57dcceda17d6..fa3bda1a860f 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -279,12 +279,6 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr,
de = (struct ufs_dir_entry *) kaddr;
kaddr += ufs_last_byte(dir, n) - reclen;
while ((char *) de <= kaddr) {
- if (de->d_reclen == 0) {
- ufs_error(dir->i_sb, __func__,
- "zero-length directory entry");
- ufs_put_page(page);
- goto out;
- }
if (ufs_match(sb, namelen, name, de))
goto found;
de = ufs_next_entry(sb, de);
@@ -414,11 +408,8 @@ ufs_validate_entry(struct super_block *sb, char *base,
{
struct ufs_dir_entry *de = (struct ufs_dir_entry*)(base + offset);
struct ufs_dir_entry *p = (struct ufs_dir_entry*)(base + (offset&mask));
- while ((char*)p < (char*)de) {
- if (p->d_reclen == 0)
- break;
+ while ((char*)p < (char*)de)
p = ufs_next_entry(sb, p);
- }
return (char *)p - base;
}
@@ -469,12 +460,6 @@ ufs_readdir(struct file *file, struct dir_context *ctx)
de = (struct ufs_dir_entry *)(kaddr+offset);
limit = kaddr + ufs_last_byte(inode, n) - UFS_DIR_REC_LEN(1);
for ( ;(char*)de <= limit; de = ufs_next_entry(sb, de)) {
- if (de->d_reclen == 0) {
- ufs_error(sb, __func__,
- "zero-length directory entry");
- ufs_put_page(page);
- return -EIO;
- }
if (de->d_ino) {
unsigned char d_type = DT_UNKNOWN;
diff --git a/fs/xattr.c b/fs/xattr.c
index 4beafc43daa5..c243905835ab 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -38,6 +38,13 @@ xattr_permission(struct inode *inode, const char *name, int mask)
if (mask & MAY_WRITE) {
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return -EPERM;
+ /*
+ * Updating an xattr will likely cause i_uid and i_gid
+ * to be writen back improperly if their true value is
+ * unknown to the vfs.
+ */
+ if (HAS_UNMAPPED_ID(inode))
+ return -EPERM;
}
/*
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 3542d94fddce..52c288514be1 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -121,5 +121,4 @@ xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o
-xfs-$(CONFIG_NFSD_BLOCKLAYOUT) += xfs_pnfs.o
-xfs-$(CONFIG_NFSD_SCSILAYOUT) += xfs_pnfs.o
+xfs-$(CONFIG_EXPORTFS_BLOCK_OPS) += xfs_pnfs.o
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index a1b2dd828b9d..fe1bfee35898 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -246,7 +246,7 @@ const struct export_operations xfs_export_operations = {
.fh_to_parent = xfs_fs_fh_to_parent,
.get_parent = xfs_fs_get_parent,
.commit_metadata = xfs_fs_nfs_commit_metadata,
-#ifdef CONFIG_NFSD_BLOCKLAYOUT
+#ifdef CONFIG_EXPORTFS_BLOCK_OPS
.get_uuid = xfs_fs_get_uuid,
.map_blocks = xfs_fs_map_blocks,
.commit_blocks = xfs_fs_commit_blocks,
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
index 93f74853961b..e8339f74966b 100644
--- a/fs/xfs/xfs_pnfs.h
+++ b/fs/xfs/xfs_pnfs.h
@@ -1,7 +1,7 @@
#ifndef _XFS_PNFS_H
#define _XFS_PNFS_H 1
-#if defined(CONFIG_NFSD_BLOCKLAYOUT) || defined(CONFIG_NFSD_SCSILAYOUT)
+#ifdef CONFIG_EXPORTFS_BLOCK_OPS
int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset);
int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
struct iomap *iomap, bool write, u32 *device_generation);
@@ -15,5 +15,5 @@ xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex)
{
return 0;
}
-#endif /* CONFIG_NFSD_PNFS */
+#endif /* CONFIG_EXPORTFS_BLOCK_OPS */
#endif /* _XFS_PNFS_H */
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 8686df6c7609..d266e835ecc3 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -128,7 +128,6 @@ static int xqm_proc_open(struct inode *inode, struct file *file)
}
static const struct file_operations xqm_proc_fops = {
- .owner = THIS_MODULE,
.open = xqm_proc_open,
.read = seq_read,
.llseek = seq_lseek,