summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs.c2
-rw-r--r--fs/9p/vfs_file.c2
-rw-r--r--fs/Kconfig24
-rw-r--r--fs/Kconfig.binfmt7
-rw-r--r--fs/Makefile2
-rw-r--r--fs/affs/affs.h2
-rw-r--r--fs/affs/amigaffs.c13
-rw-r--r--fs/affs/bitmap.c1
-rw-r--r--fs/affs/dir.c11
-rw-r--r--fs/affs/file.c49
-rw-r--r--fs/affs/inode.c7
-rw-r--r--fs/affs/namei.c47
-rw-r--r--fs/affs/super.c69
-rw-r--r--fs/afs/rxrpc.c14
-rw-r--r--fs/afs/volume.c2
-rw-r--r--fs/aio.c20
-rw-r--r--fs/befs/linuxvfs.c6
-rw-r--r--fs/binfmt_elf.c5
-rw-r--r--fs/binfmt_som.c299
-rw-r--r--fs/block_dev.c77
-rw-r--r--fs/btrfs/Kconfig1
-rw-r--r--fs/btrfs/backref.c41
-rw-r--r--fs/btrfs/backref.h3
-rw-r--r--fs/btrfs/btrfs_inode.h3
-rw-r--r--fs/btrfs/ctree.c55
-rw-r--r--fs/btrfs/ctree.h40
-rw-r--r--fs/btrfs/delayed-inode.c46
-rw-r--r--fs/btrfs/dev-replace.c25
-rw-r--r--fs/btrfs/disk-io.c108
-rw-r--r--fs/btrfs/disk-io.h6
-rw-r--r--fs/btrfs/extent-tree.c264
-rw-r--r--fs/btrfs/extent_io.c91
-rw-r--r--fs/btrfs/extent_io.h65
-rw-r--r--fs/btrfs/file.c3
-rw-r--r--fs/btrfs/free-space-cache.c13
-rw-r--r--fs/btrfs/inode-item.c9
-rw-r--r--fs/btrfs/inode.c166
-rw-r--r--fs/btrfs/qgroup.c3
-rw-r--r--fs/btrfs/raid56.c103
-rw-r--r--fs/btrfs/raid56.h11
-rw-r--r--fs/btrfs/reada.c19
-rw-r--r--fs/btrfs/relocation.c12
-rw-r--r--fs/btrfs/scrub.c315
-rw-r--r--fs/btrfs/send.c9
-rw-r--r--fs/btrfs/super.c20
-rw-r--r--fs/btrfs/sysfs.c10
-rw-r--r--fs/btrfs/tests/extent-buffer-tests.c2
-rw-r--r--fs/btrfs/tests/extent-io-tests.c3
-rw-r--r--fs/btrfs/tests/inode-tests.c4
-rw-r--r--fs/btrfs/tests/qgroup-tests.c23
-rw-r--r--fs/btrfs/transaction.c29
-rw-r--r--fs/btrfs/transaction.h7
-rw-r--r--fs/btrfs/tree-log.c235
-rw-r--r--fs/btrfs/volumes.c242
-rw-r--r--fs/btrfs/volumes.h18
-rw-r--r--fs/ceph/acl.c14
-rw-r--r--fs/ceph/addr.c22
-rw-r--r--fs/ceph/caps.c127
-rw-r--r--fs/ceph/dir.c33
-rw-r--r--fs/ceph/file.c39
-rw-r--r--fs/ceph/inode.c43
-rw-r--r--fs/ceph/locks.c63
-rw-r--r--fs/ceph/mds_client.c131
-rw-r--r--fs/ceph/mds_client.h2
-rw-r--r--fs/ceph/snap.c54
-rw-r--r--fs/ceph/super.c24
-rw-r--r--fs/ceph/super.h5
-rw-r--r--fs/char_dev.c24
-rw-r--r--fs/cifs/cifs_debug.c6
-rw-r--r--fs/cifs/cifsglob.h6
-rw-r--r--fs/cifs/connect.c2
-rw-r--r--fs/cifs/file.c41
-rw-r--r--fs/cifs/inode.c2
-rw-r--r--fs/cifs/ioctl.c21
-rw-r--r--fs/cifs/netmisc.c12
-rw-r--r--fs/cifs/readdir.c10
-rw-r--r--fs/cifs/smb2misc.c12
-rw-r--r--fs/cifs/smb2ops.c3
-rw-r--r--fs/cifs/smb2pdu.h2
-rw-r--r--fs/cifs/smb2transport.c2
-rw-r--r--fs/cifs/smbencrypt.c2
-rw-r--r--fs/coda/dir.c138
-rw-r--r--fs/coda/inode.c2
-rw-r--r--fs/configfs/configfs_internal.h2
-rw-r--r--fs/configfs/inode.c17
-rw-r--r--fs/configfs/mount.c11
-rw-r--r--fs/dax.c534
-rw-r--r--fs/dcache.c189
-rw-r--r--fs/debugfs/inode.c291
-rw-r--r--fs/dlm/netlink.c7
-rw-r--r--fs/drop_caches.c14
-rw-r--r--fs/ecryptfs/inode.c1
-rw-r--r--fs/ecryptfs/main.c2
-rw-r--r--fs/efivarfs/Kconfig1
-rw-r--r--fs/efivarfs/super.c2
-rw-r--r--fs/eventfd.c12
-rw-r--r--fs/eventpoll.c4
-rw-r--r--fs/exec.c10
-rw-r--r--fs/exofs/inode.c3
-rw-r--r--fs/exofs/super.c2
-rw-r--r--fs/ext2/Kconfig11
-rw-r--r--fs/ext2/Makefile1
-rw-r--r--fs/ext2/ext2.h10
-rw-r--r--fs/ext2/file.c44
-rw-r--r--fs/ext2/ialloc.c2
-rw-r--r--fs/ext2/inode.c38
-rw-r--r--fs/ext2/namei.c13
-rw-r--r--fs/ext2/super.c53
-rw-r--r--fs/ext2/xip.c91
-rw-r--r--fs/ext2/xip.h26
-rw-r--r--fs/ext3/super.c2
-rw-r--r--fs/ext4/ext4.h6
-rw-r--r--fs/ext4/extents.c4
-rw-r--r--fs/ext4/file.c270
-rw-r--r--fs/ext4/indirect.c18
-rw-r--r--fs/ext4/inode.c159
-rw-r--r--fs/ext4/namei.c10
-rw-r--r--fs/ext4/resize.c24
-rw-r--r--fs/ext4/super.c95
-rw-r--r--fs/f2fs/Kconfig10
-rw-r--r--fs/f2fs/Makefile1
-rw-r--r--fs/f2fs/acl.c6
-rw-r--r--fs/f2fs/checkpoint.c95
-rw-r--r--fs/f2fs/data.c218
-rw-r--r--fs/f2fs/debug.c59
-rw-r--r--fs/f2fs/dir.c3
-rw-r--r--fs/f2fs/f2fs.h120
-rw-r--r--fs/f2fs/file.c101
-rw-r--r--fs/f2fs/gc.c38
-rw-r--r--fs/f2fs/gc.h33
-rw-r--r--fs/f2fs/inline.c32
-rw-r--r--fs/f2fs/inode.c37
-rw-r--r--fs/f2fs/namei.c2
-rw-r--r--fs/f2fs/node.c154
-rw-r--r--fs/f2fs/node.h45
-rw-r--r--fs/f2fs/recovery.c11
-rw-r--r--fs/f2fs/segment.c194
-rw-r--r--fs/f2fs/segment.h29
-rw-r--r--fs/f2fs/super.c75
-rw-r--r--fs/f2fs/trace.c159
-rw-r--r--fs/f2fs/trace.h46
-rw-r--r--fs/fat/inode.c2
-rw-r--r--fs/fcntl.c5
-rw-r--r--fs/fs-writeback.c76
-rw-r--r--fs/fs_pin.c96
-rw-r--r--fs/fuse/dev.c51
-rw-r--r--fs/fuse/dir.c31
-rw-r--r--fs/fuse/file.c11
-rw-r--r--fs/fuse/fuse_i.h2
-rw-r--r--fs/fuse/inode.c6
-rw-r--r--fs/gfs2/acl.c2
-rw-r--r--fs/gfs2/aops.c2
-rw-r--r--fs/gfs2/dir.c3
-rw-r--r--fs/gfs2/file.c5
-rw-r--r--fs/gfs2/glock.c14
-rw-r--r--fs/gfs2/inode.c3
-rw-r--r--fs/gfs2/ops_fstype.c1
-rw-r--r--fs/gfs2/quota.c60
-rw-r--r--fs/gfs2/recovery.c2
-rw-r--r--fs/gfs2/super.c2
-rw-r--r--fs/gfs2/sys.c2
-rw-r--r--fs/hugetlbfs/inode.c13
-rw-r--r--fs/inode.c138
-rw-r--r--fs/internal.h9
-rw-r--r--fs/ioctl.c5
-rw-r--r--fs/isofs/rock.c3
-rw-r--r--fs/isofs/util.c18
-rw-r--r--fs/jffs2/compr_rubin.c5
-rw-r--r--fs/jffs2/scan.c5
-rw-r--r--fs/jfs/endian24.h49
-rw-r--r--fs/jfs/file.c2
-rw-r--r--fs/jfs/jfs_dtree.c4
-rw-r--r--fs/jfs/jfs_types.h55
-rw-r--r--fs/jfs/jfs_xtree.h25
-rw-r--r--fs/jfs/super.c3
-rw-r--r--fs/kernfs/dir.c36
-rw-r--r--fs/kernfs/file.c4
-rw-r--r--fs/kernfs/inode.c13
-rw-r--r--fs/kernfs/kernfs-internal.h1
-rw-r--r--fs/kernfs/mount.c1
-rw-r--r--fs/libfs.c2
-rw-r--r--fs/lockd/mon.c13
-rw-r--r--fs/lockd/svc.c8
-rw-r--r--fs/lockd/svclock.c4
-rw-r--r--fs/lockd/svcsubs.c26
-rw-r--r--fs/lockd/xdr.c8
-rw-r--r--fs/locks.c588
-rw-r--r--fs/mount.h4
-rw-r--r--fs/namei.c143
-rw-r--r--fs/namespace.c50
-rw-r--r--fs/ncpfs/dir.c98
-rw-r--r--fs/ncpfs/inode.c3
-rw-r--r--fs/ncpfs/ncp_fs_i.h1
-rw-r--r--fs/ncpfs/ncplib_kernel.h30
-rw-r--r--fs/nfs/Kconfig5
-rw-r--r--fs/nfs/Makefile3
-rw-r--r--fs/nfs/blocklayout/blocklayout.c2
-rw-r--r--fs/nfs/callback.c8
-rw-r--r--fs/nfs/callback_proc.c2
-rw-r--r--fs/nfs/callback_xdr.c8
-rw-r--r--fs/nfs/delegation.c45
-rw-r--r--fs/nfs/direct.c120
-rw-r--r--fs/nfs/file.c1
-rw-r--r--fs/nfs/filelayout/filelayout.c366
-rw-r--r--fs/nfs/filelayout/filelayout.h40
-rw-r--r--fs/nfs/filelayout/filelayoutdev.c469
-rw-r--r--fs/nfs/flexfilelayout/Makefile5
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c1533
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h155
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayoutdev.c552
-rw-r--r--fs/nfs/idmap.c3
-rw-r--r--fs/nfs/inode.c14
-rw-r--r--fs/nfs/internal.h69
-rw-r--r--fs/nfs/nfs2xdr.c10
-rw-r--r--fs/nfs/nfs3_fs.h2
-rw-r--r--fs/nfs/nfs3client.c41
-rw-r--r--fs/nfs/nfs3proc.c9
-rw-r--r--fs/nfs/nfs3super.c2
-rw-r--r--fs/nfs/nfs3xdr.c3
-rw-r--r--fs/nfs/nfs4_fs.h9
-rw-r--r--fs/nfs/nfs4client.c51
-rw-r--r--fs/nfs/nfs4proc.c411
-rw-r--r--fs/nfs/nfs4session.c2
-rw-r--r--fs/nfs/nfs4session.h6
-rw-r--r--fs/nfs/nfs4state.c101
-rw-r--r--fs/nfs/nfs4super.c4
-rw-r--r--fs/nfs/nfs4xdr.c145
-rw-r--r--fs/nfs/nfsroot.c4
-rw-r--r--fs/nfs/objlayout/objio_osd.c5
-rw-r--r--fs/nfs/pagelist.c300
-rw-r--r--fs/nfs/pnfs.c471
-rw-r--r--fs/nfs/pnfs.h139
-rw-r--r--fs/nfs/pnfs_nfs.c870
-rw-r--r--fs/nfs/read.c33
-rw-r--r--fs/nfs/super.c33
-rw-r--r--fs/nfs/write.c111
-rw-r--r--fs/nfsd/Kconfig10
-rw-r--r--fs/nfsd/Makefile8
-rw-r--r--fs/nfsd/blocklayout.c189
-rw-r--r--fs/nfsd/blocklayoutxdr.c157
-rw-r--r--fs/nfsd/blocklayoutxdr.h62
-rw-r--r--fs/nfsd/export.c8
-rw-r--r--fs/nfsd/export.h2
-rw-r--r--fs/nfsd/nfs4callback.c99
-rw-r--r--fs/nfsd/nfs4layouts.c721
-rw-r--r--fs/nfsd/nfs4proc.c310
-rw-r--r--fs/nfsd/nfs4state.c97
-rw-r--r--fs/nfsd/nfs4xdr.c362
-rw-r--r--fs/nfsd/nfsctl.c9
-rw-r--r--fs/nfsd/nfsd.h16
-rw-r--r--fs/nfsd/nfsfh.h18
-rw-r--r--fs/nfsd/nfssvc.c1
-rw-r--r--fs/nfsd/pnfs.h86
-rw-r--r--fs/nfsd/state.h43
-rw-r--r--fs/nfsd/trace.c5
-rw-r--r--fs/nfsd/trace.h54
-rw-r--r--fs/nfsd/xdr4.h59
-rw-r--r--fs/nfsd/xdr4cb.h7
-rw-r--r--fs/nilfs2/file.c1
-rw-r--r--fs/nilfs2/gcinode.c1
-rw-r--r--fs/nilfs2/mdt.c6
-rw-r--r--fs/nilfs2/nilfs.h2
-rw-r--r--fs/nilfs2/page.c4
-rw-r--r--fs/nilfs2/page.h3
-rw-r--r--fs/nilfs2/segment.c44
-rw-r--r--fs/nilfs2/segment.h5
-rw-r--r--fs/nilfs2/super.c6
-rw-r--r--fs/notify/Kconfig1
-rw-r--r--fs/notify/fanotify/fanotify.c2
-rw-r--r--fs/notify/fanotify/fanotify_user.c45
-rw-r--r--fs/ntfs/file.c3
-rw-r--r--fs/ocfs2/acl.c14
-rw-r--r--fs/ocfs2/alloc.c18
-rw-r--r--fs/ocfs2/aops.c242
-rw-r--r--fs/ocfs2/cluster/tcp.c3
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h12
-rw-r--r--fs/ocfs2/dir.c10
-rw-r--r--fs/ocfs2/dlm/dlmast.c6
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c4
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c14
-rw-r--r--fs/ocfs2/dlm/dlmdomain.h1
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c12
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c14
-rw-r--r--fs/ocfs2/dlmglue.c3
-rw-r--r--fs/ocfs2/file.c80
-rw-r--r--fs/ocfs2/file.h9
-rw-r--r--fs/ocfs2/inode.c2
-rw-r--r--fs/ocfs2/inode.h2
-rw-r--r--fs/ocfs2/journal.c111
-rw-r--r--fs/ocfs2/journal.h5
-rw-r--r--fs/ocfs2/mmap.c1
-rw-r--r--fs/ocfs2/namei.c327
-rw-r--r--fs/ocfs2/namei.h8
-rw-r--r--fs/ocfs2/ocfs2.h25
-rw-r--r--fs/ocfs2/ocfs2_fs.h14
-rw-r--r--fs/ocfs2/quota.h1
-rw-r--r--fs/ocfs2/quota_local.c20
-rw-r--r--fs/ocfs2/refcounttree.c2
-rw-r--r--fs/ocfs2/reservations.c2
-rw-r--r--fs/ocfs2/super.c51
-rw-r--r--fs/ocfs2/xattr.c10
-rw-r--r--fs/open.c15
-rw-r--r--fs/proc/array.c44
-rw-r--r--fs/proc/generic.c27
-rw-r--r--fs/proc/inode.c2
-rw-r--r--fs/proc/page.c16
-rw-r--r--fs/proc/task_mmu.c250
-rw-r--r--fs/proc/vmcore.c8
-rw-r--r--fs/proc_namespace.c1
-rw-r--r--fs/pstore/Kconfig10
-rw-r--r--fs/pstore/Makefile2
-rw-r--r--fs/pstore/inode.c26
-rw-r--r--fs/pstore/internal.h6
-rw-r--r--fs/pstore/platform.c5
-rw-r--r--fs/pstore/pmsg.c114
-rw-r--r--fs/pstore/ram.c53
-rw-r--r--fs/quota/Kconfig1
-rw-r--r--fs/quota/dquot.c186
-rw-r--r--fs/quota/quota.c214
-rw-r--r--fs/quota/quota_v1.c4
-rw-r--r--fs/quota/quota_v2.c16
-rw-r--r--fs/ramfs/file-nommu.c7
-rw-r--r--fs/ramfs/inode.c21
-rw-r--r--fs/read_write.c48
-rw-r--r--fs/reiserfs/inode.c2
-rw-r--r--fs/romfs/mmap-nommu.c10
-rw-r--r--fs/romfs/super.c3
-rw-r--r--fs/select.c2
-rw-r--r--fs/seq_file.c32
-rw-r--r--fs/splice.c23
-rw-r--r--fs/super.c63
-rw-r--r--fs/sync.c8
-rw-r--r--fs/sysfs/file.c2
-rw-r--r--fs/sysfs/group.c2
-rw-r--r--fs/ubifs/debug.c4
-rw-r--r--fs/ubifs/dir.c18
-rw-r--r--fs/ubifs/file.c5
-rw-r--r--fs/ubifs/replay.c19
-rw-r--r--fs/ubifs/super.c6
-rw-r--r--fs/ubifs/ubifs.h4
-rw-r--r--fs/ubifs/xattr.c112
-rw-r--r--fs/udf/Kconfig10
-rw-r--r--fs/udf/dir.c31
-rw-r--r--fs/udf/file.c2
-rw-r--r--fs/udf/inode.c42
-rw-r--r--fs/udf/namei.c17
-rw-r--r--fs/udf/super.c5
-rw-r--r--fs/udf/symlink.c57
-rw-r--r--fs/udf/udfdecl.h3
-rw-r--r--fs/udf/unicode.c28
-rw-r--r--fs/ufs/super.c8
-rw-r--r--fs/xfs/xfs_buf.c13
-rw-r--r--fs/xfs/xfs_file.c3
-rw-r--r--fs/xfs/xfs_qm.c12
-rw-r--r--fs/xfs/xfs_qm.h4
-rw-r--r--fs/xfs/xfs_qm_syscalls.c230
-rw-r--r--fs/xfs/xfs_quotaops.c67
-rw-r--r--fs/xfs/xfs_super.c7
358 files changed, 14179 insertions, 6077 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 6894b085f0ee..620d93489539 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -335,7 +335,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
}
init_rwsem(&v9ses->rename_sem);
- rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY);
+ rc = bdi_setup_and_register(&v9ses->bdi, "9p");
if (rc) {
kfree(v9ses->aname);
kfree(v9ses->uname);
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 5594505e6e73..b40133796b87 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -831,7 +831,6 @@ static const struct vm_operations_struct v9fs_file_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = v9fs_vm_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
};
static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
@@ -839,7 +838,6 @@ static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = v9fs_vm_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
};
diff --git a/fs/Kconfig b/fs/Kconfig
index 664991afe0c0..ec35851e5b71 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -13,13 +13,6 @@ if BLOCK
source "fs/ext2/Kconfig"
source "fs/ext3/Kconfig"
source "fs/ext4/Kconfig"
-
-config FS_XIP
-# execute in place
- bool
- depends on EXT2_FS_XIP
- default y
-
source "fs/jbd/Kconfig"
source "fs/jbd2/Kconfig"
@@ -40,6 +33,21 @@ source "fs/ocfs2/Kconfig"
source "fs/btrfs/Kconfig"
source "fs/nilfs2/Kconfig"
+config FS_DAX
+ bool "Direct Access (DAX) support"
+ depends on MMU
+ depends on !(ARM || MIPS || SPARC)
+ help
+ Direct Access (DAX) can be used on memory-backed block devices.
+ If the block device supports DAX and the filesystem supports DAX,
+ then you can avoid using the pagecache to buffer I/Os. Turning
+ on this option will compile in support for DAX; you will need to
+ mount the filesystem using the -o dax option.
+
+ If you do not have a block device that is capable of using this,
+ or if unsure, say N. Saying Y will increase the size of the kernel
+ by about 5kB.
+
endif # BLOCK
# Posix ACL utility routines
@@ -165,6 +173,7 @@ config HUGETLB_PAGE
def_bool HUGETLBFS
source "fs/configfs/Kconfig"
+source "fs/efivarfs/Kconfig"
endmenu
@@ -209,7 +218,6 @@ source "fs/sysv/Kconfig"
source "fs/ufs/Kconfig"
source "fs/exofs/Kconfig"
source "fs/f2fs/Kconfig"
-source "fs/efivarfs/Kconfig"
endif # MISC_FILESYSTEMS
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index c055d56ec63d..270c48148f79 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -149,13 +149,6 @@ config BINFMT_EM86
later load the module when you want to use a Linux/Intel binary. The
module will be called binfmt_em86. If unsure, say Y.
-config BINFMT_SOM
- tristate "Kernel support for SOM binaries"
- depends on PARISC && HPUX
- help
- SOM is a binary executable format inherited from HP/UX. Say
- Y here to be able to load and execute SOM binaries directly.
-
config BINFMT_MISC
tristate "Kernel support for MISC binaries"
---help---
diff --git a/fs/Makefile b/fs/Makefile
index bedff48e8fdc..a88ac4838c9e 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_SIGNALFD) += signalfd.o
obj-$(CONFIG_TIMERFD) += timerfd.o
obj-$(CONFIG_EVENTFD) += eventfd.o
obj-$(CONFIG_AIO) += aio.o
+obj-$(CONFIG_FS_DAX) += dax.o
obj-$(CONFIG_FILE_LOCKING) += locks.o
obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
@@ -37,7 +38,6 @@ obj-$(CONFIG_BINFMT_SCRIPT) += binfmt_script.o
obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o
obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o
obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o
-obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o
obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o
obj-$(CONFIG_FS_MBCACHE) += mbcache.o
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index ff44ff3ff015..c8764bd7497d 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -30,6 +30,8 @@
#define AFFS_AC_SIZE (AFFS_CACHE_SIZE/sizeof(struct affs_ext_key)/2)
#define AFFS_AC_MASK (AFFS_AC_SIZE-1)
+#define AFFSNAMEMAX 30U
+
struct affs_ext_key {
u32 ext; /* idx of the extended block */
u32 key; /* block number */
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index c852f2fa1710..388da1ea815d 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -30,7 +30,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh)
ino = bh->b_blocknr;
offset = affs_hash_name(sb, AFFS_TAIL(sb, bh)->name + 1, AFFS_TAIL(sb, bh)->name[0]);
- pr_debug("%s(dir=%u, ino=%d)\n", __func__, (u32)dir->i_ino, ino);
+ pr_debug("%s(dir=%lu, ino=%d)\n", __func__, dir->i_ino, ino);
dir_bh = affs_bread(sb, dir->i_ino);
if (!dir_bh)
@@ -80,8 +80,8 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh)
sb = dir->i_sb;
rem_ino = rem_bh->b_blocknr;
offset = affs_hash_name(sb, AFFS_TAIL(sb, rem_bh)->name+1, AFFS_TAIL(sb, rem_bh)->name[0]);
- pr_debug("%s(dir=%d, ino=%d, hashval=%d)\n",
- __func__, (u32)dir->i_ino, rem_ino, offset);
+ pr_debug("%s(dir=%lu, ino=%d, hashval=%d)\n", __func__, dir->i_ino,
+ rem_ino, offset);
bh = affs_bread(sb, dir->i_ino);
if (!bh)
@@ -483,11 +483,10 @@ affs_check_name(const unsigned char *name, int len, bool notruncate)
{
int i;
- if (len > 30) {
+ if (len > AFFSNAMEMAX) {
if (notruncate)
return -ENAMETOOLONG;
- else
- len = 30;
+ len = AFFSNAMEMAX;
}
for (i = 0; i < len; i++) {
if (name[i] < ' ' || name[i] == ':'
@@ -508,7 +507,7 @@ affs_check_name(const unsigned char *name, int len, bool notruncate)
int
affs_copy_name(unsigned char *bstr, struct dentry *dentry)
{
- int len = min(dentry->d_name.len, 30u);
+ u32 len = min(dentry->d_name.len, AFFSNAMEMAX);
*bstr++ = len;
memcpy(bstr, dentry->d_name.name, len);
diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index c8de51185c23..675148950fed 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -99,7 +99,6 @@ err_bh_read:
err_range:
affs_error(sb, "affs_free_block","Block %u outside partition", block);
- return;
}
/*
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index 59f07bec92a6..ac4f318aafba 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -54,8 +54,7 @@ affs_readdir(struct file *file, struct dir_context *ctx)
u32 ino;
int error = 0;
- pr_debug("%s(ino=%lu,f_pos=%lx)\n",
- __func__, inode->i_ino, (unsigned long)ctx->pos);
+ pr_debug("%s(ino=%lu,f_pos=%llx)\n", __func__, inode->i_ino, ctx->pos);
if (ctx->pos < 2) {
file->private_data = (void *)0;
@@ -115,11 +114,11 @@ inside:
break;
}
- namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30);
+ namelen = min(AFFS_TAIL(sb, fh_bh)->name[0],
+ (u8)AFFSNAMEMAX);
name = AFFS_TAIL(sb, fh_bh)->name + 1;
- pr_debug("readdir(): dir_emit(\"%.*s\", "
- "ino=%u), hash=%d, f_pos=%x\n",
- namelen, name, ino, hash_pos, (u32)ctx->pos);
+ pr_debug("readdir(): dir_emit(\"%.*s\", ino=%u), hash=%d, f_pos=%llx\n",
+ namelen, name, ino, hash_pos, ctx->pos);
if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN))
goto done;
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 8faa6593ca6d..d2468bf95669 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -180,8 +180,7 @@ affs_get_extblock_slow(struct inode *inode, u32 ext)
ext_key = be32_to_cpu(AFFS_TAIL(sb, bh)->extension);
if (ext < AFFS_I(inode)->i_extcnt)
goto read_ext;
- if (ext > AFFS_I(inode)->i_extcnt)
- BUG();
+ BUG_ON(ext > AFFS_I(inode)->i_extcnt);
bh = affs_alloc_extblock(inode, bh, ext);
if (IS_ERR(bh))
return bh;
@@ -198,8 +197,7 @@ affs_get_extblock_slow(struct inode *inode, u32 ext)
struct buffer_head *prev_bh;
/* allocate a new extended block */
- if (ext > AFFS_I(inode)->i_extcnt)
- BUG();
+ BUG_ON(ext > AFFS_I(inode)->i_extcnt);
/* get previous extended block */
prev_bh = affs_get_extblock(inode, ext - 1);
@@ -299,8 +297,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul
struct buffer_head *ext_bh;
u32 ext;
- pr_debug("%s(%u, %lu)\n",
- __func__, (u32)inode->i_ino, (unsigned long)block);
+ pr_debug("%s(%lu, %llu)\n", __func__, inode->i_ino,
+ (unsigned long long)block);
BUG_ON(block > (sector_t)0x7fffffffUL);
@@ -330,8 +328,9 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul
/* store new block */
if (bh_result->b_blocknr)
- affs_warning(sb, "get_block", "block already set (%lx)",
- (unsigned long)bh_result->b_blocknr);
+ affs_warning(sb, "get_block",
+ "block already set (%llx)",
+ (unsigned long long)bh_result->b_blocknr);
AFFS_BLOCK(sb, ext_bh, block) = cpu_to_be32(blocknr);
AFFS_HEAD(ext_bh)->block_count = cpu_to_be32(block + 1);
affs_adjust_checksum(ext_bh, blocknr - bh_result->b_blocknr + 1);
@@ -353,8 +352,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul
return 0;
err_big:
- affs_error(inode->i_sb, "get_block", "strange block request %d",
- (int)block);
+ affs_error(inode->i_sb, "get_block", "strange block request %llu",
+ (unsigned long long)block);
return -EIO;
err_ext:
// unlock cache
@@ -399,6 +398,13 @@ affs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
size_t count = iov_iter_count(iter);
ssize_t ret;
+ if (rw == WRITE) {
+ loff_t size = offset + count;
+
+ if (AFFS_I(inode)->mmu_private < size)
+ return 0;
+ }
+
ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, affs_get_block);
if (ret < 0 && (rw & WRITE))
affs_write_failed(mapping, offset + count);
@@ -503,7 +509,7 @@ affs_do_readpage_ofs(struct page *page, unsigned to)
u32 bidx, boff, bsize;
u32 tmp;
- pr_debug("%s(%u, %ld, 0, %d)\n", __func__, (u32)inode->i_ino,
+ pr_debug("%s(%lu, %ld, 0, %d)\n", __func__, inode->i_ino,
page->index, to);
BUG_ON(to > PAGE_CACHE_SIZE);
kmap(page);
@@ -539,7 +545,7 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize)
u32 size, bsize;
u32 tmp;
- pr_debug("%s(%u, %d)\n", __func__, (u32)inode->i_ino, newsize);
+ pr_debug("%s(%lu, %d)\n", __func__, inode->i_ino, newsize);
bsize = AFFS_SB(sb)->s_data_blksize;
bh = NULL;
size = AFFS_I(inode)->mmu_private;
@@ -608,7 +614,7 @@ affs_readpage_ofs(struct file *file, struct page *page)
u32 to;
int err;
- pr_debug("%s(%u, %ld)\n", __func__, (u32)inode->i_ino, page->index);
+ pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, page->index);
to = PAGE_CACHE_SIZE;
if (((page->index + 1) << PAGE_CACHE_SHIFT) > inode->i_size) {
to = inode->i_size & ~PAGE_CACHE_MASK;
@@ -631,8 +637,8 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
pgoff_t index;
int err = 0;
- pr_debug("%s(%u, %llu, %llu)\n", __func__, (u32)inode->i_ino,
- (unsigned long long)pos, (unsigned long long)pos + len);
+ pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos,
+ pos + len);
if (pos > AFFS_I(inode)->mmu_private) {
/* XXX: this probably leaves a too-big i_size in case of
* failure. Should really be updating i_size at write_end time
@@ -681,9 +687,8 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
* due to write_begin.
*/
- pr_debug("%s(%u, %llu, %llu)\n",
- __func__, (u32)inode->i_ino, (unsigned long long)pos,
- (unsigned long long)pos + len);
+ pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos,
+ pos + len);
bsize = AFFS_SB(sb)->s_data_blksize;
data = page_address(page);
@@ -831,8 +836,8 @@ affs_truncate(struct inode *inode)
struct buffer_head *ext_bh;
int i;
- pr_debug("truncate(inode=%d, oldsize=%u, newsize=%u)\n",
- (u32)inode->i_ino, (u32)AFFS_I(inode)->mmu_private, (u32)inode->i_size);
+ pr_debug("truncate(inode=%lu, oldsize=%llu, newsize=%llu)\n",
+ inode->i_ino, AFFS_I(inode)->mmu_private, inode->i_size);
last_blk = 0;
ext = 0;
@@ -863,7 +868,7 @@ affs_truncate(struct inode *inode)
if (IS_ERR(ext_bh)) {
affs_warning(sb, "truncate",
"unexpected read error for ext block %u (%ld)",
- (unsigned int)ext, PTR_ERR(ext_bh));
+ ext, PTR_ERR(ext_bh));
return;
}
if (AFFS_I(inode)->i_lc) {
@@ -911,7 +916,7 @@ affs_truncate(struct inode *inode)
if (IS_ERR(bh)) {
affs_warning(sb, "truncate",
"unexpected read error for last block %u (%ld)",
- (unsigned int)ext, PTR_ERR(bh));
+ ext, PTR_ERR(bh));
return;
}
tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index d0609a282e1d..6f34510449e8 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -13,8 +13,6 @@
#include <linux/gfp.h>
#include "affs.h"
-extern const struct inode_operations affs_symlink_inode_operations;
-
struct inode *affs_iget(struct super_block *sb, unsigned long ino)
{
struct affs_sb_info *sbi = AFFS_SB(sb);
@@ -348,9 +346,8 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
u32 block = 0;
int retval;
- pr_debug("%s(dir=%u, inode=%u, \"%pd\", type=%d)\n",
- __func__, (u32)dir->i_ino,
- (u32)inode->i_ino, dentry, type);
+ pr_debug("%s(dir=%lu, inode=%lu, \"%pd\", type=%d)\n", __func__,
+ dir->i_ino, inode->i_ino, dentry, type);
retval = -EIO;
bh = affs_bread(sb, inode->i_ino);
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index bbc38530e924..ffb7bd82c2a5 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -64,15 +64,16 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate)
{
const u8 *name = qstr->name;
unsigned long hash;
- int i;
+ int retval;
+ u32 len;
- i = affs_check_name(qstr->name, qstr->len, notruncate);
- if (i)
- return i;
+ retval = affs_check_name(qstr->name, qstr->len, notruncate);
+ if (retval)
+ return retval;
hash = init_name_hash();
- i = min(qstr->len, 30u);
- for (; i > 0; name++, i--)
+ len = min(qstr->len, AFFSNAMEMAX);
+ for (; len > 0; name++, len--)
hash = partial_name_hash(toupper(*name), hash);
qstr->hash = end_name_hash(hash);
@@ -114,10 +115,10 @@ static inline int __affs_compare_dentry(unsigned int len,
* If the names are longer than the allowed 30 chars,
* the excess is ignored, so their length may differ.
*/
- if (len >= 30) {
- if (name->len < 30)
+ if (len >= AFFSNAMEMAX) {
+ if (name->len < AFFSNAMEMAX)
return 1;
- len = 30;
+ len = AFFSNAMEMAX;
} else if (len != name->len)
return 1;
@@ -156,10 +157,10 @@ affs_match(struct dentry *dentry, const u8 *name2, toupper_t toupper)
const u8 *name = dentry->d_name.name;
int len = dentry->d_name.len;
- if (len >= 30) {
- if (*name2 < 30)
+ if (len >= AFFSNAMEMAX) {
+ if (*name2 < AFFSNAMEMAX)
return 0;
- len = 30;
+ len = AFFSNAMEMAX;
} else if (len != *name2)
return 0;
@@ -173,9 +174,9 @@ int
affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len)
{
toupper_t toupper = affs_get_toupper(sb);
- int hash;
+ u32 hash;
- hash = len = min(len, 30u);
+ hash = len = min(len, AFFSNAMEMAX);
for (; len > 0; len--)
hash = (hash * 13 + toupper(*name++)) & 0x7ff;
@@ -248,9 +249,8 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
int
affs_unlink(struct inode *dir, struct dentry *dentry)
{
- pr_debug("%s(dir=%d, %lu \"%pd\")\n",
- __func__, (u32)dir->i_ino, dentry->d_inode->i_ino,
- dentry);
+ pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino,
+ dentry->d_inode->i_ino, dentry);
return affs_remove_header(dentry);
}
@@ -317,9 +317,8 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
int
affs_rmdir(struct inode *dir, struct dentry *dentry)
{
- pr_debug("%s(dir=%u, %lu \"%pd\")\n",
- __func__, (u32)dir->i_ino, dentry->d_inode->i_ino,
- dentry);
+ pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino,
+ dentry->d_inode->i_ino, dentry);
return affs_remove_header(dentry);
}
@@ -404,8 +403,7 @@ affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
struct inode *inode = old_dentry->d_inode;
- pr_debug("%s(%u, %u, \"%pd\")\n",
- __func__, (u32)inode->i_ino, (u32)dir->i_ino,
+ pr_debug("%s(%lu, %lu, \"%pd\")\n", __func__, inode->i_ino, dir->i_ino,
dentry);
return affs_add_entry(dir, inode, dentry, ST_LINKFILE);
@@ -419,9 +417,8 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct buffer_head *bh = NULL;
int retval;
- pr_debug("%s(old=%u,\"%pd\" to new=%u,\"%pd\")\n",
- __func__, (u32)old_dir->i_ino, old_dentry,
- (u32)new_dir->i_ino, new_dentry);
+ pr_debug("%s(old=%lu,\"%pd\" to new=%lu,\"%pd\")\n", __func__,
+ old_dir->i_ino, old_dentry, new_dir->i_ino, new_dentry);
retval = affs_check_name(new_dentry->d_name.name,
new_dentry->d_name.len,
diff --git a/fs/affs/super.c b/fs/affs/super.c
index f754ab68a840..4cf0e9113fb6 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -432,39 +432,39 @@ got_root:
sb->s_flags |= MS_RDONLY;
}
switch (chksum) {
- case MUFS_FS:
- case MUFS_INTLFFS:
- case MUFS_DCFFS:
- sbi->s_flags |= SF_MUFS;
- /* fall thru */
- case FS_INTLFFS:
- case FS_DCFFS:
- sbi->s_flags |= SF_INTL;
- break;
- case MUFS_FFS:
- sbi->s_flags |= SF_MUFS;
- break;
- case FS_FFS:
- break;
- case MUFS_OFS:
- sbi->s_flags |= SF_MUFS;
- /* fall thru */
- case FS_OFS:
- sbi->s_flags |= SF_OFS;
- sb->s_flags |= MS_NOEXEC;
- break;
- case MUFS_DCOFS:
- case MUFS_INTLOFS:
- sbi->s_flags |= SF_MUFS;
- case FS_DCOFS:
- case FS_INTLOFS:
- sbi->s_flags |= SF_INTL | SF_OFS;
- sb->s_flags |= MS_NOEXEC;
- break;
- default:
- pr_err("Unknown filesystem on device %s: %08X\n",
- sb->s_id, chksum);
- return -EINVAL;
+ case MUFS_FS:
+ case MUFS_INTLFFS:
+ case MUFS_DCFFS:
+ sbi->s_flags |= SF_MUFS;
+ /* fall thru */
+ case FS_INTLFFS:
+ case FS_DCFFS:
+ sbi->s_flags |= SF_INTL;
+ break;
+ case MUFS_FFS:
+ sbi->s_flags |= SF_MUFS;
+ break;
+ case FS_FFS:
+ break;
+ case MUFS_OFS:
+ sbi->s_flags |= SF_MUFS;
+ /* fall thru */
+ case FS_OFS:
+ sbi->s_flags |= SF_OFS;
+ sb->s_flags |= MS_NOEXEC;
+ break;
+ case MUFS_DCOFS:
+ case MUFS_INTLOFS:
+ sbi->s_flags |= SF_MUFS;
+ case FS_DCOFS:
+ case FS_INTLOFS:
+ sbi->s_flags |= SF_INTL | SF_OFS;
+ sb->s_flags |= MS_NOEXEC;
+ break;
+ default:
+ pr_err("Unknown filesystem on device %s: %08X\n",
+ sb->s_id, chksum);
+ return -EINVAL;
}
if (mount_flags & SF_VERBOSE) {
@@ -584,7 +584,7 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bavail = free;
buf->f_fsid.val[0] = (u32)id;
buf->f_fsid.val[1] = (u32)(id >> 32);
- buf->f_namelen = 30;
+ buf->f_namelen = AFFSNAMEMAX;
return 0;
}
@@ -602,6 +602,7 @@ static void affs_kill_sb(struct super_block *sb)
affs_free_bitmap(sb);
affs_brelse(sbi->s_root_bh);
kfree(sbi->s_prefix);
+ mutex_destroy(&sbi->s_bmlock);
kfree(sbi);
}
}
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 06e14bfb3496..dbc732e9a5c0 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -306,8 +306,8 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg,
_debug("- range %u-%u%s",
offset, to, msg->msg_flags ? " [more]" : "");
- iov_iter_init(&msg->msg_iter, WRITE,
- (struct iovec *) iov, 1, to - offset);
+ iov_iter_kvec(&msg->msg_iter, WRITE | ITER_KVEC,
+ iov, 1, to - offset);
/* have to change the state *before* sending the last
* packet as RxRPC might give us the reply before it
@@ -384,7 +384,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
msg.msg_name = NULL;
msg.msg_namelen = 0;
- iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *)iov, 1,
+ iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1,
call->request_size);
msg.msg_control = NULL;
msg.msg_controllen = 0;
@@ -770,7 +770,7 @@ static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb,
void afs_send_empty_reply(struct afs_call *call)
{
struct msghdr msg;
- struct iovec iov[1];
+ struct kvec iov[1];
_enter("");
@@ -778,7 +778,7 @@ void afs_send_empty_reply(struct afs_call *call)
iov[0].iov_len = 0;
msg.msg_name = NULL;
msg.msg_namelen = 0;
- iov_iter_init(&msg.msg_iter, WRITE, iov, 0, 0); /* WTF? */
+ iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 0, 0); /* WTF? */
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
@@ -805,7 +805,7 @@ void afs_send_empty_reply(struct afs_call *call)
void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
{
struct msghdr msg;
- struct iovec iov[1];
+ struct kvec iov[1];
int n;
_enter("");
@@ -814,7 +814,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
iov[0].iov_len = len;
msg.msg_name = NULL;
msg.msg_namelen = 0;
- iov_iter_init(&msg.msg_iter, WRITE, iov, 1, len);
+ iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1, len);
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 2b607257820c..d142a2449e65 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -106,7 +106,7 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
volume->cell = params->cell;
volume->vid = vlocation->vldb.vid[params->type];
- ret = bdi_setup_and_register(&volume->bdi, "afs", BDI_CAP_MAP_COPY);
+ ret = bdi_setup_and_register(&volume->bdi, "afs");
if (ret)
goto error_bdi;
diff --git a/fs/aio.c b/fs/aio.c
index 1b7893ecc296..118a2e0088d8 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -165,15 +165,6 @@ static struct vfsmount *aio_mnt;
static const struct file_operations aio_ring_fops;
static const struct address_space_operations aio_ctx_aops;
-/* Backing dev info for aio fs.
- * -no dirty page accounting or writeback happens
- */
-static struct backing_dev_info aio_fs_backing_dev_info = {
- .name = "aiofs",
- .state = 0,
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_COPY,
-};
-
static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
{
struct qstr this = QSTR_INIT("[aio]", 5);
@@ -185,7 +176,6 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
inode->i_mapping->a_ops = &aio_ctx_aops;
inode->i_mapping->private_data = ctx;
- inode->i_mapping->backing_dev_info = &aio_fs_backing_dev_info;
inode->i_size = PAGE_SIZE * nr_pages;
path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this);
@@ -230,9 +220,6 @@ static int __init aio_setup(void)
if (IS_ERR(aio_mnt))
panic("Failed to create aio fs mount.");
- if (bdi_init(&aio_fs_backing_dev_info))
- panic("Failed to init aio fs backing dev info.");
-
kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
@@ -1140,6 +1127,13 @@ static long aio_read_events_ring(struct kioctx *ctx,
long ret = 0;
int copy_ret;
+ /*
+ * The mutex can block and wake us up and that will cause
+ * wait_event_interruptible_hrtimeout() to schedule without sleeping
+ * and repeat. This should be rare enough that it doesn't cause
+ * peformance issues. See the comment in read_events() for more detail.
+ */
+ sched_annotate_sleep();
mutex_lock(&ctx->ring_lock);
/* Access to ->ring_pages here is protected by ctx->ring_lock. */
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index edf47774b03d..e089f1985fca 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -274,9 +274,9 @@ more:
static struct inode *
befs_alloc_inode(struct super_block *sb)
{
- struct befs_inode_info *bi;
- bi = (struct befs_inode_info *)kmem_cache_alloc(befs_inode_cachep,
- GFP_KERNEL);
+ struct befs_inode_info *bi;
+
+ bi = kmem_cache_alloc(befs_inode_cachep, GFP_KERNEL);
if (!bi)
return NULL;
return &bi->vfs_inode;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 02b16910f4c9..995986b8e36b 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -645,11 +645,12 @@ out:
static unsigned long randomize_stack_top(unsigned long stack_top)
{
- unsigned int random_variable = 0;
+ unsigned long random_variable = 0;
if ((current->flags & PF_RANDOMIZE) &&
!(current->personality & ADDR_NO_RANDOMIZE)) {
- random_variable = get_random_int() & STACK_RND_MASK;
+ random_variable = (unsigned long) get_random_int();
+ random_variable &= STACK_RND_MASK;
random_variable <<= PAGE_SHIFT;
}
#ifdef CONFIG_STACK_GROWSUP
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
deleted file mode 100644
index 4e00ed68d4a6..000000000000
--- a/fs/binfmt_som.c
+++ /dev/null
@@ -1,299 +0,0 @@
-/*
- * linux/fs/binfmt_som.c
- *
- * These are the functions used to load SOM format executables as used
- * by HP-UX.
- *
- * Copyright 1999 Matthew Wilcox <willy@bofh.ai>
- * based on binfmt_elf which is
- * Copyright 1993, 1994: Eric Youngdale (ericy@cais.com).
- */
-
-#include <linux/module.h>
-
-#include <linux/fs.h>
-#include <linux/stat.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/binfmts.h>
-#include <linux/som.h>
-#include <linux/string.h>
-#include <linux/file.h>
-#include <linux/fcntl.h>
-#include <linux/ptrace.h>
-#include <linux/slab.h>
-#include <linux/shm.h>
-#include <linux/personality.h>
-#include <linux/init.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-
-
-#include <linux/elf.h>
-
-static int load_som_binary(struct linux_binprm * bprm);
-static int load_som_library(struct file *);
-
-/*
- * If we don't support core dumping, then supply a NULL so we
- * don't even try.
- */
-#if 0
-static int som_core_dump(struct coredump_params *cprm);
-#else
-#define som_core_dump NULL
-#endif
-
-#define SOM_PAGESTART(_v) ((_v) & ~(unsigned long)(SOM_PAGESIZE-1))
-#define SOM_PAGEOFFSET(_v) ((_v) & (SOM_PAGESIZE-1))
-#define SOM_PAGEALIGN(_v) (((_v) + SOM_PAGESIZE - 1) & ~(SOM_PAGESIZE - 1))
-
-static struct linux_binfmt som_format = {
- .module = THIS_MODULE,
- .load_binary = load_som_binary,
- .load_shlib = load_som_library,
- .core_dump = som_core_dump,
- .min_coredump = SOM_PAGESIZE
-};
-
-/*
- * create_som_tables() parses the env- and arg-strings in new user
- * memory and creates the pointer tables from them, and puts their
- * addresses on the "stack", returning the new stack pointer value.
- */
-static void create_som_tables(struct linux_binprm *bprm)
-{
- char **argv, **envp;
- int argc = bprm->argc;
- int envc = bprm->envc;
- unsigned long p;
- unsigned long *sp;
-
- /* Word-align the stack pointer */
- sp = (unsigned long *)((bprm->p + 3) & ~3);
-
- envp = (char **) sp;
- sp += envc + 1;
- argv = (char **) sp;
- sp += argc + 1;
-
- __put_user((unsigned long) envp,++sp);
- __put_user((unsigned long) argv,++sp);
-
- __put_user(argc, ++sp);
-
- bprm->p = (unsigned long) sp;
-
- p = current->mm->arg_start;
- while (argc-- > 0) {
- __put_user((char *)p,argv++);
- p += strlen_user((char *)p);
- }
- __put_user(NULL, argv);
- current->mm->arg_end = current->mm->env_start = p;
- while (envc-- > 0) {
- __put_user((char *)p,envp++);
- p += strlen_user((char *)p);
- }
- __put_user(NULL, envp);
- current->mm->env_end = p;
-}
-
-static int check_som_header(struct som_hdr *som_ex)
-{
- int *buf = (int *)som_ex;
- int i, ck;
-
- if (som_ex->system_id != SOM_SID_PARISC_1_0 &&
- som_ex->system_id != SOM_SID_PARISC_1_1 &&
- som_ex->system_id != SOM_SID_PARISC_2_0)
- return -ENOEXEC;
-
- if (som_ex->a_magic != SOM_EXEC_NONSHARE &&
- som_ex->a_magic != SOM_EXEC_SHARE &&
- som_ex->a_magic != SOM_EXEC_DEMAND)
- return -ENOEXEC;
-
- if (som_ex->version_id != SOM_ID_OLD &&
- som_ex->version_id != SOM_ID_NEW)
- return -ENOEXEC;
-
- ck = 0;
- for (i=0; i<32; i++)
- ck ^= buf[i];
- if (ck != 0)
- return -ENOEXEC;
-
- return 0;
-}
-
-static int map_som_binary(struct file *file,
- const struct som_exec_auxhdr *hpuxhdr)
-{
- unsigned long code_start, code_size, data_start, data_size;
- unsigned long bss_start, som_brk;
- int retval;
- int prot = PROT_READ | PROT_EXEC;
- int flags = MAP_FIXED|MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECUTABLE;
-
- mm_segment_t old_fs = get_fs();
- set_fs(get_ds());
-
- code_start = SOM_PAGESTART(hpuxhdr->exec_tmem);
- code_size = SOM_PAGEALIGN(hpuxhdr->exec_tsize);
- current->mm->start_code = code_start;
- current->mm->end_code = code_start + code_size;
- retval = vm_mmap(file, code_start, code_size, prot,
- flags, SOM_PAGESTART(hpuxhdr->exec_tfile));
- if (retval < 0 && retval > -1024)
- goto out;
-
- data_start = SOM_PAGESTART(hpuxhdr->exec_dmem);
- data_size = SOM_PAGEALIGN(hpuxhdr->exec_dsize);
- current->mm->start_data = data_start;
- current->mm->end_data = bss_start = data_start + data_size;
- retval = vm_mmap(file, data_start, data_size,
- prot | PROT_WRITE, flags,
- SOM_PAGESTART(hpuxhdr->exec_dfile));
- if (retval < 0 && retval > -1024)
- goto out;
-
- som_brk = bss_start + SOM_PAGEALIGN(hpuxhdr->exec_bsize);
- current->mm->start_brk = current->mm->brk = som_brk;
- retval = vm_mmap(NULL, bss_start, som_brk - bss_start,
- prot | PROT_WRITE, MAP_FIXED | MAP_PRIVATE, 0);
- if (retval > 0 || retval < -1024)
- retval = 0;
-out:
- set_fs(old_fs);
- return retval;
-}
-
-
-/*
- * These are the functions used to load SOM executables and shared
- * libraries. There is no binary dependent code anywhere else.
- */
-
-static int
-load_som_binary(struct linux_binprm * bprm)
-{
- int retval;
- unsigned int size;
- unsigned long som_entry;
- struct som_hdr *som_ex;
- struct som_exec_auxhdr *hpuxhdr;
- struct pt_regs *regs = current_pt_regs();
-
- /* Get the exec-header */
- som_ex = (struct som_hdr *) bprm->buf;
-
- retval = check_som_header(som_ex);
- if (retval != 0)
- goto out;
-
- /* Now read in the auxiliary header information */
-
- retval = -ENOMEM;
- size = som_ex->aux_header_size;
- if (size > SOM_PAGESIZE)
- goto out;
- hpuxhdr = kmalloc(size, GFP_KERNEL);
- if (!hpuxhdr)
- goto out;
-
- retval = kernel_read(bprm->file, som_ex->aux_header_location,
- (char *) hpuxhdr, size);
- if (retval != size) {
- if (retval >= 0)
- retval = -EIO;
- goto out_free;
- }
-
- /* Flush all traces of the currently running executable */
- retval = flush_old_exec(bprm);
- if (retval)
- goto out_free;
-
- /* OK, This is the point of no return */
- current->personality = PER_HPUX;
- setup_new_exec(bprm);
-
- /* Set the task size for HP-UX processes such that
- * the gateway page is outside the address space.
- * This can be fixed later, but for now, this is much
- * easier.
- */
-
- current->thread.task_size = 0xc0000000;
-
- /* Set map base to allow enough room for hp-ux heap growth */
-
- current->thread.map_base = 0x80000000;
-
- retval = map_som_binary(bprm->file, hpuxhdr);
- if (retval < 0)
- goto out_free;
-
- som_entry = hpuxhdr->exec_entry;
- kfree(hpuxhdr);
-
- set_binfmt(&som_format);
- install_exec_creds(bprm);
- setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
-
- create_som_tables(bprm);
-
- current->mm->start_stack = bprm->p;
-
-#if 0
- printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk);
- printk("(end_code) %08lx\n" , (unsigned long) current->mm->end_code);
- printk("(start_code) %08lx\n" , (unsigned long) current->mm->start_code);
- printk("(end_data) %08lx\n" , (unsigned long) current->mm->end_data);
- printk("(start_stack) %08lx\n" , (unsigned long) current->mm->start_stack);
- printk("(brk) %08lx\n" , (unsigned long) current->mm->brk);
-#endif
-
- map_hpux_gateway_page(current,current->mm);
-
- start_thread_som(regs, som_entry, bprm->p);
- return 0;
-
- /* error cleanup */
-out_free:
- kfree(hpuxhdr);
-out:
- return retval;
-}
-
-static int load_som_library(struct file *f)
-{
-/* No lib support in SOM yet. gizza chance.. */
- return -ENOEXEC;
-}
- /* Install the SOM loader.
- * N.B. We *rely* on the table being the right size with the
- * right number of free slots...
- */
-
-static int __init init_som_binfmt(void)
-{
- register_binfmt(&som_format);
- return 0;
-}
-
-static void __exit exit_som_binfmt(void)
-{
- /* Remove the SOM loader. */
- unregister_binfmt(&som_format);
-}
-
-core_initcall(init_som_binfmt);
-module_exit(exit_som_binfmt);
-
-MODULE_LICENSE("GPL");
diff --git a/fs/block_dev.c b/fs/block_dev.c
index b48c41bf0f86..975266be67d3 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -49,23 +49,15 @@ inline struct block_device *I_BDEV(struct inode *inode)
}
EXPORT_SYMBOL(I_BDEV);
-/*
- * Move the inode from its current bdi to a new bdi. Make sure the inode
- * is clean before moving so that it doesn't linger on the old bdi.
- */
-static void bdev_inode_switch_bdi(struct inode *inode,
- struct backing_dev_info *dst)
+static void bdev_write_inode(struct inode *inode)
{
- while (true) {
- spin_lock(&inode->i_lock);
- if (!(inode->i_state & I_DIRTY)) {
- inode->i_data.backing_dev_info = dst;
- spin_unlock(&inode->i_lock);
- return;
- }
+ spin_lock(&inode->i_lock);
+ while (inode->i_state & I_DIRTY) {
spin_unlock(&inode->i_lock);
WARN_ON_ONCE(write_inode_now(inode, true));
+ spin_lock(&inode->i_lock);
}
+ spin_unlock(&inode->i_lock);
}
/* Kill _all_ buffers and pagecache , dirty or not.. */
@@ -429,6 +421,46 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL_GPL(bdev_write_page);
+/**
+ * bdev_direct_access() - Get the address for directly-accessibly memory
+ * @bdev: The device containing the memory
+ * @sector: The offset within the device
+ * @addr: Where to put the address of the memory
+ * @pfn: The Page Frame Number for the memory
+ * @size: The number of bytes requested
+ *
+ * If a block device is made up of directly addressable memory, this function
+ * will tell the caller the PFN and the address of the memory. The address
+ * may be directly dereferenced within the kernel without the need to call
+ * ioremap(), kmap() or similar. The PFN is suitable for inserting into
+ * page tables.
+ *
+ * Return: negative errno if an error occurs, otherwise the number of bytes
+ * accessible at this address.
+ */
+long bdev_direct_access(struct block_device *bdev, sector_t sector,
+ void **addr, unsigned long *pfn, long size)
+{
+ long avail;
+ const struct block_device_operations *ops = bdev->bd_disk->fops;
+
+ if (size < 0)
+ return size;
+ if (!ops->direct_access)
+ return -EOPNOTSUPP;
+ if ((sector + DIV_ROUND_UP(size, 512)) >
+ part_nr_sects_read(bdev->bd_part))
+ return -ERANGE;
+ sector += get_start_sect(bdev);
+ if (sector % (PAGE_SIZE / 512))
+ return -EINVAL;
+ avail = ops->direct_access(bdev, sector, addr, pfn, size);
+ if (!avail)
+ return -ERANGE;
+ return min(avail, size);
+}
+EXPORT_SYMBOL_GPL(bdev_direct_access);
+
/*
* pseudo-fs
*/
@@ -584,7 +616,6 @@ struct block_device *bdget(dev_t dev)
inode->i_bdev = bdev;
inode->i_data.a_ops = &def_blk_aops;
mapping_set_gfp_mask(&inode->i_data, GFP_USER);
- inode->i_data.backing_dev_info = &default_backing_dev_info;
spin_lock(&bdev_lock);
list_add(&bdev->bd_list, &all_bdevs);
spin_unlock(&bdev_lock);
@@ -1145,8 +1176,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_queue = disk->queue;
bdev->bd_contains = bdev;
if (!partno) {
- struct backing_dev_info *bdi;
-
ret = -ENXIO;
bdev->bd_part = disk_get_part(disk, partno);
if (!bdev->bd_part)
@@ -1172,11 +1201,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
}
}
- if (!ret) {
+ if (!ret)
bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
- bdi = blk_get_backing_dev_info(bdev);
- bdev_inode_switch_bdi(bdev->bd_inode, bdi);
- }
/*
* If the device is invalidated, rescan partition
@@ -1203,8 +1229,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
if (ret)
goto out_clear;
bdev->bd_contains = whole;
- bdev_inode_switch_bdi(bdev->bd_inode,
- whole->bd_inode->i_data.backing_dev_info);
bdev->bd_part = disk_get_part(disk, partno);
if (!(disk->flags & GENHD_FL_UP) ||
!bdev->bd_part || !bdev->bd_part->nr_sects) {
@@ -1244,7 +1268,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
bdev->bd_disk = NULL;
bdev->bd_part = NULL;
bdev->bd_queue = NULL;
- bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
if (bdev != bdev->bd_contains)
__blkdev_put(bdev->bd_contains, mode, 1);
bdev->bd_contains = NULL;
@@ -1464,11 +1487,11 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
WARN_ON_ONCE(bdev->bd_holders);
sync_blockdev(bdev);
kill_bdev(bdev);
- /* ->release can cause the old bdi to disappear,
- * so must switch it out first
+ /*
+ * ->release can cause the queue to disappear, so flush all
+ * dirty data before.
*/
- bdev_inode_switch_bdi(bdev->bd_inode,
- &default_backing_dev_info);
+ bdev_write_inode(bdev->bd_inode);
}
if (bdev->bd_contains == bdev) {
if (disk->fops->release)
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index a66768ebc8d1..80e9c18ea64f 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -8,6 +8,7 @@ config BTRFS_FS
select LZO_DECOMPRESS
select RAID6_PQ
select XOR_BLOCKS
+ select SRCU
help
Btrfs is a general purpose copy-on-write filesystem with extents,
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 2d3e32ebfd15..f55721ff9385 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1246,25 +1246,6 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans,
return ret;
}
-/*
- * this makes the path point to (inum INODE_ITEM ioff)
- */
-int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
- struct btrfs_path *path)
-{
- struct btrfs_key key;
- return btrfs_find_item(fs_root, path, inum, ioff,
- BTRFS_INODE_ITEM_KEY, &key);
-}
-
-static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
- struct btrfs_path *path,
- struct btrfs_key *found_key)
-{
- return btrfs_find_item(fs_root, path, inum, ioff,
- BTRFS_INODE_REF_KEY, found_key);
-}
-
int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
u64 start_off, struct btrfs_path *path,
struct btrfs_inode_extref **ret_extref,
@@ -1374,7 +1355,8 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
}
- ret = inode_ref_info(parent, 0, fs_root, path, &found_key);
+ ret = btrfs_find_item(fs_root, path, parent, 0,
+ BTRFS_INODE_REF_KEY, &found_key);
if (ret > 0)
ret = -ENOENT;
if (ret)
@@ -1552,7 +1534,6 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
{
int ret;
int type;
- struct btrfs_tree_block_info *info;
struct btrfs_extent_inline_ref *eiref;
if (*ptr == (unsigned long)-1)
@@ -1573,9 +1554,17 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
}
/* we can treat both ref types equally here */
- info = (struct btrfs_tree_block_info *)(ei + 1);
*out_root = btrfs_extent_inline_ref_offset(eb, eiref);
- *out_level = btrfs_tree_block_level(eb, info);
+
+ if (key->type == BTRFS_EXTENT_ITEM_KEY) {
+ struct btrfs_tree_block_info *info;
+
+ info = (struct btrfs_tree_block_info *)(ei + 1);
+ *out_level = btrfs_tree_block_level(eb, info);
+ } else {
+ ASSERT(key->type == BTRFS_METADATA_ITEM_KEY);
+ *out_level = (u8)key->offset;
+ }
if (ret == 1)
*ptr = (unsigned long)-1;
@@ -1720,8 +1709,10 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
struct btrfs_key found_key;
while (!ret) {
- ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
- &found_key);
+ ret = btrfs_find_item(fs_root, path, inum,
+ parent ? parent + 1 : 0, BTRFS_INODE_REF_KEY,
+ &found_key);
+
if (ret < 0)
break;
if (ret) {
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 2a1ac6bfc724..9c41fbac3009 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -32,9 +32,6 @@ struct inode_fs_paths {
typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
void *ctx);
-int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
- struct btrfs_path *path);
-
int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
struct btrfs_path *path, struct btrfs_key *found_key,
u64 *flags);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 4aadadcfab20..de5e4f2adfea 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -185,6 +185,9 @@ struct btrfs_inode {
struct btrfs_delayed_node *delayed_node;
+ /* File creation time. */
+ struct timespec i_otime;
+
struct inode vfs_inode;
};
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 14a72ed14ef7..993642199326 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -213,11 +213,19 @@ static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
*/
static void add_root_to_dirty_list(struct btrfs_root *root)
{
+ if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
+ !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
+ return;
+
spin_lock(&root->fs_info->trans_lock);
- if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) &&
- list_empty(&root->dirty_list)) {
- list_add(&root->dirty_list,
- &root->fs_info->dirty_cowonly_roots);
+ if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
+ /* Want the extent tree to be the last on the list */
+ if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ list_move_tail(&root->dirty_list,
+ &root->fs_info->dirty_cowonly_roots);
+ else
+ list_move(&root->dirty_list,
+ &root->fs_info->dirty_cowonly_roots);
}
spin_unlock(&root->fs_info->trans_lock);
}
@@ -1363,8 +1371,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
BUG_ON(tm->slot != 0);
- eb_rewin = alloc_dummy_extent_buffer(eb->start,
- fs_info->tree_root->nodesize);
+ eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
if (!eb_rewin) {
btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
@@ -1444,7 +1451,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
} else if (old_root) {
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
- eb = alloc_dummy_extent_buffer(logical, root->nodesize);
+ eb = alloc_dummy_extent_buffer(root->fs_info, logical);
} else {
btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
eb = btrfs_clone_extent_buffer(eb_root);
@@ -2282,7 +2289,7 @@ static void reada_for_search(struct btrfs_root *root,
if ((search <= target && target - search <= 65536) ||
(search > target && search - target <= 65536)) {
gen = btrfs_node_ptr_generation(node, nr);
- readahead_tree_block(root, search, blocksize);
+ readahead_tree_block(root, search);
nread += blocksize;
}
nscan++;
@@ -2301,7 +2308,6 @@ static noinline void reada_for_balance(struct btrfs_root *root,
u64 gen;
u64 block1 = 0;
u64 block2 = 0;
- int blocksize;
parent = path->nodes[level + 1];
if (!parent)
@@ -2309,7 +2315,6 @@ static noinline void reada_for_balance(struct btrfs_root *root,
nritems = btrfs_header_nritems(parent);
slot = path->slots[level + 1];
- blocksize = root->nodesize;
if (slot > 0) {
block1 = btrfs_node_blockptr(parent, slot - 1);
@@ -2334,9 +2339,9 @@ static noinline void reada_for_balance(struct btrfs_root *root,
}
if (block1)
- readahead_tree_block(root, block1, blocksize);
+ readahead_tree_block(root, block1);
if (block2)
- readahead_tree_block(root, block2, blocksize);
+ readahead_tree_block(root, block2);
}
@@ -2609,32 +2614,24 @@ static int key_search(struct extent_buffer *b, struct btrfs_key *key,
return 0;
}
-int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *found_path,
+int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
u64 iobjectid, u64 ioff, u8 key_type,
struct btrfs_key *found_key)
{
int ret;
struct btrfs_key key;
struct extent_buffer *eb;
- struct btrfs_path *path;
+
+ ASSERT(path);
+ ASSERT(found_key);
key.type = key_type;
key.objectid = iobjectid;
key.offset = ioff;
- if (found_path == NULL) {
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- } else
- path = found_path;
-
ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
- if ((ret < 0) || (found_key == NULL)) {
- if (path != found_path)
- btrfs_free_path(path);
+ if (ret < 0)
return ret;
- }
eb = path->nodes[0];
if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
@@ -3383,7 +3380,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
add_root_to_dirty_list(root);
extent_buffer_get(c);
path->nodes[level] = c;
- path->locks[level] = BTRFS_WRITE_LOCK;
+ path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
path->slots[level] = 0;
return 0;
}
@@ -4356,13 +4353,15 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
path->search_for_split = 1;
ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
path->search_for_split = 0;
+ if (ret > 0)
+ ret = -EAGAIN;
if (ret < 0)
goto err;
ret = -EAGAIN;
leaf = path->nodes[0];
- /* if our item isn't there or got smaller, return now */
- if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
+ /* if our item isn't there, return now */
+ if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
goto err;
/* the leaf has changed, it now has room. return now */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7e607416755a..84c3b00f3de8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -198,6 +198,8 @@ static int btrfs_csum_sizes[] = { 4, 0 };
#define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024)
+#define BTRFS_MAX_EXTENT_SIZE (128 * 1024 * 1024)
+
/*
* The key defines the order in the tree, and so it also defines (optimal)
* block layout.
@@ -1020,6 +1022,9 @@ enum btrfs_raid_types {
BTRFS_BLOCK_GROUP_RAID6 | \
BTRFS_BLOCK_GROUP_DUP | \
BTRFS_BLOCK_GROUP_RAID10)
+#define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 | \
+ BTRFS_BLOCK_GROUP_RAID6)
+
/*
* We need a bit for restriper to be able to tell when chunks of type
* SINGLE are available. This "extended" profile format is used in
@@ -1171,6 +1176,7 @@ struct btrfs_space_info {
struct percpu_counter total_bytes_pinned;
struct list_head list;
+ /* Protected by the spinlock 'lock'. */
struct list_head ro_bgs;
struct rw_semaphore groups_sem;
@@ -1238,7 +1244,6 @@ enum btrfs_disk_cache_state {
BTRFS_DC_ERROR = 1,
BTRFS_DC_CLEAR = 2,
BTRFS_DC_SETUP = 3,
- BTRFS_DC_NEED_WRITE = 4,
};
struct btrfs_caching_control {
@@ -1276,7 +1281,6 @@ struct btrfs_block_group_cache {
unsigned long full_stripe_len;
unsigned int ro:1;
- unsigned int dirty:1;
unsigned int iref:1;
unsigned int has_caching_ctl:1;
unsigned int removed:1;
@@ -1314,6 +1318,9 @@ struct btrfs_block_group_cache {
struct list_head ro_list;
atomic_t trimming;
+
+ /* For dirty block groups */
+ struct list_head dirty_list;
};
/* delayed seq elem */
@@ -1740,6 +1747,7 @@ struct btrfs_fs_info {
spinlock_t unused_bgs_lock;
struct list_head unused_bgs;
+ struct mutex unused_bg_unpin_mutex;
/* For btrfs to record security options */
struct security_mnt_opts security_opts;
@@ -1775,6 +1783,7 @@ struct btrfs_subvolume_writers {
#define BTRFS_ROOT_DEFRAG_RUNNING 6
#define BTRFS_ROOT_FORCE_COW 7
#define BTRFS_ROOT_MULTI_LOG_TASKS 8
+#define BTRFS_ROOT_DIRTY 9
/*
* in ram representation of the tree. extent_root is used for all allocations
@@ -1793,8 +1802,6 @@ struct btrfs_root {
struct btrfs_fs_info *fs_info;
struct extent_io_tree dirty_log_pages;
- struct kobject root_kobj;
- struct completion kobj_unregister;
struct mutex objectid_mutex;
spinlock_t accounting_lock;
@@ -2464,31 +2471,6 @@ BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32);
BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32);
BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64);
BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64);
-
-static inline struct btrfs_timespec *
-btrfs_inode_atime(struct btrfs_inode_item *inode_item)
-{
- unsigned long ptr = (unsigned long)inode_item;
- ptr += offsetof(struct btrfs_inode_item, atime);
- return (struct btrfs_timespec *)ptr;
-}
-
-static inline struct btrfs_timespec *
-btrfs_inode_mtime(struct btrfs_inode_item *inode_item)
-{
- unsigned long ptr = (unsigned long)inode_item;
- ptr += offsetof(struct btrfs_inode_item, mtime);
- return (struct btrfs_timespec *)ptr;
-}
-
-static inline struct btrfs_timespec *
-btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
-{
- unsigned long ptr = (unsigned long)inode_item;
- ptr += offsetof(struct btrfs_inode_item, ctime);
- return (struct btrfs_timespec *)ptr;
-}
-
BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 054577bddaf2..82f0c7c95474 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1755,27 +1755,31 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
btrfs_set_stack_inode_block_group(inode_item, 0);
- btrfs_set_stack_timespec_sec(btrfs_inode_atime(inode_item),
+ btrfs_set_stack_timespec_sec(&inode_item->atime,
inode->i_atime.tv_sec);
- btrfs_set_stack_timespec_nsec(btrfs_inode_atime(inode_item),
+ btrfs_set_stack_timespec_nsec(&inode_item->atime,
inode->i_atime.tv_nsec);
- btrfs_set_stack_timespec_sec(btrfs_inode_mtime(inode_item),
+ btrfs_set_stack_timespec_sec(&inode_item->mtime,
inode->i_mtime.tv_sec);
- btrfs_set_stack_timespec_nsec(btrfs_inode_mtime(inode_item),
+ btrfs_set_stack_timespec_nsec(&inode_item->mtime,
inode->i_mtime.tv_nsec);
- btrfs_set_stack_timespec_sec(btrfs_inode_ctime(inode_item),
+ btrfs_set_stack_timespec_sec(&inode_item->ctime,
inode->i_ctime.tv_sec);
- btrfs_set_stack_timespec_nsec(btrfs_inode_ctime(inode_item),
+ btrfs_set_stack_timespec_nsec(&inode_item->ctime,
inode->i_ctime.tv_nsec);
+
+ btrfs_set_stack_timespec_sec(&inode_item->otime,
+ BTRFS_I(inode)->i_otime.tv_sec);
+ btrfs_set_stack_timespec_nsec(&inode_item->otime,
+ BTRFS_I(inode)->i_otime.tv_nsec);
}
int btrfs_fill_inode(struct inode *inode, u32 *rdev)
{
struct btrfs_delayed_node *delayed_node;
struct btrfs_inode_item *inode_item;
- struct btrfs_timespec *tspec;
delayed_node = btrfs_get_delayed_node(inode);
if (!delayed_node)
@@ -1802,17 +1806,19 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
*rdev = btrfs_stack_inode_rdev(inode_item);
BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
- tspec = btrfs_inode_atime(inode_item);
- inode->i_atime.tv_sec = btrfs_stack_timespec_sec(tspec);
- inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(tspec);
+ inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
+ inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);
+
+ inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime);
+ inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime);
- tspec = btrfs_inode_mtime(inode_item);
- inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(tspec);
- inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(tspec);
+ inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(&inode_item->ctime);
+ inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->ctime);
- tspec = btrfs_inode_ctime(inode_item);
- inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(tspec);
- inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(tspec);
+ BTRFS_I(inode)->i_otime.tv_sec =
+ btrfs_stack_timespec_sec(&inode_item->otime);
+ BTRFS_I(inode)->i_otime.tv_nsec =
+ btrfs_stack_timespec_nsec(&inode_item->otime);
inode->i_generation = BTRFS_I(inode)->generation;
BTRFS_I(inode)->index_cnt = (u64)-1;
@@ -1857,6 +1863,14 @@ int btrfs_delayed_delete_inode_ref(struct inode *inode)
{
struct btrfs_delayed_node *delayed_node;
+ /*
+ * we don't do delayed inode updates during log recovery because it
+ * leads to enospc problems. This means we also can't do
+ * delayed inode refs
+ */
+ if (BTRFS_I(inode)->root->fs_info->log_root_recovering)
+ return -EAGAIN;
+
delayed_node = btrfs_get_or_create_delayed_node(inode);
if (IS_ERR(delayed_node))
return PTR_ERR(delayed_node);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index ca6a3a3b6b6c..5ec03d999c37 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -440,18 +440,9 @@ leave:
*/
static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
{
- s64 writers;
- DEFINE_WAIT(wait);
-
set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
- do {
- prepare_to_wait(&fs_info->replace_wait, &wait,
- TASK_UNINTERRUPTIBLE);
- writers = percpu_counter_sum(&fs_info->bio_counter);
- if (writers)
- schedule();
- finish_wait(&fs_info->replace_wait, &wait);
- } while (writers);
+ wait_event(fs_info->replace_wait, !percpu_counter_sum(
+ &fs_info->bio_counter));
}
/*
@@ -932,15 +923,15 @@ void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
{
- DEFINE_WAIT(wait);
-again:
- percpu_counter_inc(&fs_info->bio_counter);
- if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) {
+ while (1) {
+ percpu_counter_inc(&fs_info->bio_counter);
+ if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
+ &fs_info->fs_state)))
+ break;
+
btrfs_bio_counter_dec(fs_info);
wait_event(fs_info->replace_wait,
!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
&fs_info->fs_state));
- goto again;
}
-
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8c63419a7f70..f79f38542a73 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -318,7 +318,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
memcpy(&found, result, csum_size);
read_extent_buffer(buf, &val, 0, csum_size);
- printk_ratelimited(KERN_INFO
+ printk_ratelimited(KERN_WARNING
"BTRFS: %s checksum verify failed on %llu wanted %X found %X "
"level %d\n",
root->fs_info->sb->s_id, buf->start,
@@ -367,7 +367,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
ret = 0;
goto out;
}
- printk_ratelimited(KERN_INFO "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
+ printk_ratelimited(KERN_ERR
+ "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
eb->fs_info->sb->s_id, eb->start,
parent_transid, btrfs_header_generation(eb));
ret = 1;
@@ -633,21 +634,21 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
found_start = btrfs_header_bytenr(eb);
if (found_start != eb->start) {
- printk_ratelimited(KERN_INFO "BTRFS (device %s): bad tree block start "
+ printk_ratelimited(KERN_ERR "BTRFS (device %s): bad tree block start "
"%llu %llu\n",
eb->fs_info->sb->s_id, found_start, eb->start);
ret = -EIO;
goto err;
}
if (check_tree_block_fsid(root, eb)) {
- printk_ratelimited(KERN_INFO "BTRFS (device %s): bad fsid on block %llu\n",
+ printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n",
eb->fs_info->sb->s_id, eb->start);
ret = -EIO;
goto err;
}
found_level = btrfs_header_level(eb);
if (found_level >= BTRFS_MAX_LEVEL) {
- btrfs_info(root->fs_info, "bad tree block level %d",
+ btrfs_err(root->fs_info, "bad tree block level %d",
(int)btrfs_header_level(eb));
ret = -EIO;
goto err;
@@ -1073,12 +1074,12 @@ static const struct address_space_operations btree_aops = {
.set_page_dirty = btree_set_page_dirty,
};
-void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
+void readahead_tree_block(struct btrfs_root *root, u64 bytenr)
{
struct extent_buffer *buf = NULL;
struct inode *btree_inode = root->fs_info->btree_inode;
- buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ buf = btrfs_find_create_tree_block(root, bytenr);
if (!buf)
return;
read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
@@ -1086,7 +1087,7 @@ void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
free_extent_buffer(buf);
}
-int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
int mirror_num, struct extent_buffer **eb)
{
struct extent_buffer *buf = NULL;
@@ -1094,7 +1095,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
int ret;
- buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ buf = btrfs_find_create_tree_block(root, bytenr);
if (!buf)
return 0;
@@ -1125,12 +1126,11 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
}
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
- u64 bytenr, u32 blocksize)
+ u64 bytenr)
{
if (btrfs_test_is_dummy_root(root))
- return alloc_test_extent_buffer(root->fs_info, bytenr,
- blocksize);
- return alloc_extent_buffer(root->fs_info, bytenr, blocksize);
+ return alloc_test_extent_buffer(root->fs_info, bytenr);
+ return alloc_extent_buffer(root->fs_info, bytenr);
}
@@ -1152,7 +1152,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
struct extent_buffer *buf = NULL;
int ret;
- buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize);
+ buf = btrfs_find_create_tree_block(root, bytenr);
if (!buf)
return NULL;
@@ -1275,12 +1275,10 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
memset(&root->root_key, 0, sizeof(root->root_key));
memset(&root->root_item, 0, sizeof(root->root_item));
memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
- memset(&root->root_kobj, 0, sizeof(root->root_kobj));
if (fs_info)
root->defrag_trans_start = fs_info->generation;
else
root->defrag_trans_start = 0;
- init_completion(&root->kobj_unregister);
root->root_key.objectid = objectid;
root->anon_dev = 0;
@@ -1630,6 +1628,8 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
bool check_ref)
{
struct btrfs_root *root;
+ struct btrfs_path *path;
+ struct btrfs_key key;
int ret;
if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
@@ -1669,8 +1669,17 @@ again:
if (ret)
goto fail;
- ret = btrfs_find_item(fs_info->tree_root, NULL, BTRFS_ORPHAN_OBJECTID,
- location->objectid, BTRFS_ORPHAN_ITEM_KEY, NULL);
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto fail;
+ }
+ key.objectid = BTRFS_ORPHAN_OBJECTID;
+ key.type = BTRFS_ORPHAN_ITEM_KEY;
+ key.offset = location->objectid;
+
+ ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
+ btrfs_free_path(path);
if (ret < 0)
goto fail;
if (ret == 0)
@@ -1715,12 +1724,11 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
{
int err;
- bdi->capabilities = BDI_CAP_MAP_COPY;
- err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY);
+ err = bdi_setup_and_register(bdi, "btrfs");
if (err)
return err;
- bdi->ra_pages = default_backing_dev_info.ra_pages;
+ bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
bdi->congested_fn = btrfs_congested_fn;
bdi->congested_data = info;
return 0;
@@ -2233,6 +2241,7 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->qgroup_op_lock);
spin_lock_init(&fs_info->buffer_lock);
spin_lock_init(&fs_info->unused_bgs_lock);
+ mutex_init(&fs_info->unused_bg_unpin_mutex);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
@@ -2319,7 +2328,6 @@ int open_ctree(struct super_block *sb,
*/
fs_info->btree_inode->i_size = OFFSET_MAX;
fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
- fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
@@ -2498,7 +2506,7 @@ int open_ctree(struct super_block *sb,
features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
- printk(KERN_ERR "BTRFS: has skinny extents\n");
+ printk(KERN_INFO "BTRFS: has skinny extents\n");
/*
* flag our filesystem as having big metadata blocks if
@@ -2522,7 +2530,7 @@ int open_ctree(struct super_block *sb,
*/
if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
(sectorsize != nodesize)) {
- printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes "
+ printk(KERN_ERR "BTRFS: unequal leaf/node/sector sizes "
"are not allowed for mixed block groups on %s\n",
sb->s_id);
goto fail_alloc;
@@ -2630,12 +2638,12 @@ int open_ctree(struct super_block *sb,
sb->s_blocksize_bits = blksize_bits(sectorsize);
if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
- printk(KERN_INFO "BTRFS: valid FS not found on %s\n", sb->s_id);
+ printk(KERN_ERR "BTRFS: valid FS not found on %s\n", sb->s_id);
goto fail_sb_buffer;
}
if (sectorsize != PAGE_SIZE) {
- printk(KERN_WARNING "BTRFS: Incompatible sector size(%lu) "
+ printk(KERN_ERR "BTRFS: incompatible sector size (%lu) "
"found on %s\n", (unsigned long)sectorsize, sb->s_id);
goto fail_sb_buffer;
}
@@ -2644,7 +2652,7 @@ int open_ctree(struct super_block *sb,
ret = btrfs_read_sys_array(tree_root);
mutex_unlock(&fs_info->chunk_mutex);
if (ret) {
- printk(KERN_WARNING "BTRFS: failed to read the system "
+ printk(KERN_ERR "BTRFS: failed to read the system "
"array on %s\n", sb->s_id);
goto fail_sb_buffer;
}
@@ -2659,7 +2667,7 @@ int open_ctree(struct super_block *sb,
generation);
if (!chunk_root->node ||
!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
- printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n",
+ printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n",
sb->s_id);
goto fail_tree_roots;
}
@@ -2671,7 +2679,7 @@ int open_ctree(struct super_block *sb,
ret = btrfs_read_chunk_tree(chunk_root);
if (ret) {
- printk(KERN_WARNING "BTRFS: failed to read chunk tree on %s\n",
+ printk(KERN_ERR "BTRFS: failed to read chunk tree on %s\n",
sb->s_id);
goto fail_tree_roots;
}
@@ -2683,7 +2691,7 @@ int open_ctree(struct super_block *sb,
btrfs_close_extra_devices(fs_info, fs_devices, 0);
if (!fs_devices->latest_bdev) {
- printk(KERN_CRIT "BTRFS: failed to read devices on %s\n",
+ printk(KERN_ERR "BTRFS: failed to read devices on %s\n",
sb->s_id);
goto fail_tree_roots;
}
@@ -2767,7 +2775,7 @@ retry_root_backup:
ret = btrfs_recover_balance(fs_info);
if (ret) {
- printk(KERN_WARNING "BTRFS: failed to recover balance\n");
+ printk(KERN_ERR "BTRFS: failed to recover balance\n");
goto fail_block_groups;
}
@@ -3862,6 +3870,21 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
btrfs_super_log_root(sb));
+ /*
+ * Check the lower bound, the alignment and other constraints are
+ * checked later.
+ */
+ if (btrfs_super_nodesize(sb) < 4096) {
+ printk(KERN_ERR "BTRFS: nodesize too small: %u < 4096\n",
+ btrfs_super_nodesize(sb));
+ ret = -EINVAL;
+ }
+ if (btrfs_super_sectorsize(sb) < 4096) {
+ printk(KERN_ERR "BTRFS: sectorsize too small: %u < 4096\n",
+ btrfs_super_sectorsize(sb));
+ ret = -EINVAL;
+ }
+
if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n",
fs_info->fsid, sb->dev_item.fsid);
@@ -3875,6 +3898,10 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
if (btrfs_super_num_devices(sb) > (1UL << 31))
printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n",
btrfs_super_num_devices(sb));
+ if (btrfs_super_num_devices(sb) == 0) {
+ printk(KERN_ERR "BTRFS: number of devices is 0\n");
+ ret = -EINVAL;
+ }
if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) {
printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n",
@@ -3883,6 +3910,25 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
}
/*
+ * Obvious sys_chunk_array corruptions, it must hold at least one key
+ * and one chunk
+ */
+ if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
+ printk(KERN_ERR "BTRFS: system chunk array too big %u > %u\n",
+ btrfs_super_sys_array_size(sb),
+ BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
+ ret = -EINVAL;
+ }
+ if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
+ + sizeof(struct btrfs_chunk)) {
+ printk(KERN_ERR "BTRFS: system chunk array too small %u < %lu\n",
+ btrfs_super_sys_array_size(sb),
+ sizeof(struct btrfs_disk_key)
+ + sizeof(struct btrfs_chunk));
+ ret = -EINVAL;
+ }
+
+ /*
* The generation is a global counter, we'll trust it more than the others
* but it's still possible that it's the one that's wrong.
*/
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 414651821fb3..27d44c0fd236 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -46,11 +46,11 @@ struct btrfs_fs_devices;
struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
u64 parent_transid);
-void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize);
-int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+void readahead_tree_block(struct btrfs_root *root, u64 bytenr);
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
int mirror_num, struct extent_buffer **eb);
struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
- u64 bytenr, u32 blocksize);
+ u64 bytenr);
void clean_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf);
int open_ctree(struct super_block *sb,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a80b97100d90..571f402d3fc4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -74,8 +74,9 @@ enum {
RESERVE_ALLOC_NO_ACCOUNT = 2,
};
-static int update_block_group(struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, int alloc);
+static int update_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, int alloc);
static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
@@ -1925,7 +1926,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
*/
ret = 0;
}
- kfree(bbio);
+ btrfs_put_bbio(bbio);
}
if (actual_bytes)
@@ -2768,7 +2769,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head;
int ret;
int run_all = count == (unsigned long)-1;
- int run_most = 0;
/* We'll clean this up in btrfs_cleanup_transaction */
if (trans->aborted)
@@ -2778,10 +2778,8 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
root = root->fs_info->tree_root;
delayed_refs = &trans->transaction->delayed_refs;
- if (count == 0) {
+ if (count == 0)
count = atomic_read(&delayed_refs->num_entries) * 2;
- run_most = 1;
- }
again:
#ifdef SCRAMBLE_DELAYED_REFS
@@ -3139,9 +3137,11 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
- if (ret < 0)
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
goto fail;
- BUG_ON(ret); /* Corruption */
+ }
leaf = path->nodes[0];
bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
@@ -3149,11 +3149,9 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(path);
fail:
- if (ret) {
+ if (ret)
btrfs_abort_transaction(trans, root, ret);
- return ret;
- }
- return 0;
+ return ret;
}
@@ -3315,120 +3313,42 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_block_group_cache *cache;
- int err = 0;
+ struct btrfs_transaction *cur_trans = trans->transaction;
+ int ret = 0;
struct btrfs_path *path;
- u64 last = 0;
+
+ if (list_empty(&cur_trans->dirty_bgs))
+ return 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
-again:
- while (1) {
- cache = btrfs_lookup_first_block_group(root->fs_info, last);
- while (cache) {
- if (cache->disk_cache_state == BTRFS_DC_CLEAR)
- break;
- cache = next_block_group(root, cache);
- }
- if (!cache) {
- if (last == 0)
- break;
- last = 0;
- continue;
- }
- err = cache_save_setup(cache, trans, path);
- last = cache->key.objectid + cache->key.offset;
- btrfs_put_block_group(cache);
- }
-
- while (1) {
- if (last == 0) {
- err = btrfs_run_delayed_refs(trans, root,
- (unsigned long)-1);
- if (err) /* File system offline */
- goto out;
- }
-
- cache = btrfs_lookup_first_block_group(root->fs_info, last);
- while (cache) {
- if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
- btrfs_put_block_group(cache);
- goto again;
- }
-
- if (cache->dirty)
- break;
- cache = next_block_group(root, cache);
- }
- if (!cache) {
- if (last == 0)
- break;
- last = 0;
- continue;
- }
-
- if (cache->disk_cache_state == BTRFS_DC_SETUP)
- cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
- cache->dirty = 0;
- last = cache->key.objectid + cache->key.offset;
-
- err = write_one_cache_group(trans, root, path, cache);
- btrfs_put_block_group(cache);
- if (err) /* File system offline */
- goto out;
- }
-
- while (1) {
- /*
- * I don't think this is needed since we're just marking our
- * preallocated extent as written, but just in case it can't
- * hurt.
- */
- if (last == 0) {
- err = btrfs_run_delayed_refs(trans, root,
- (unsigned long)-1);
- if (err) /* File system offline */
- goto out;
- }
-
- cache = btrfs_lookup_first_block_group(root->fs_info, last);
- while (cache) {
- /*
- * Really this shouldn't happen, but it could if we
- * couldn't write the entire preallocated extent and
- * splitting the extent resulted in a new block.
- */
- if (cache->dirty) {
- btrfs_put_block_group(cache);
- goto again;
- }
- if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
- break;
- cache = next_block_group(root, cache);
- }
- if (!cache) {
- if (last == 0)
- break;
- last = 0;
- continue;
- }
-
- err = btrfs_write_out_cache(root, trans, cache, path);
-
- /*
- * If we didn't have an error then the cache state is still
- * NEED_WRITE, so we can set it to WRITTEN.
- */
- if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
- cache->disk_cache_state = BTRFS_DC_WRITTEN;
- last = cache->key.objectid + cache->key.offset;
+ /*
+ * We don't need the lock here since we are protected by the transaction
+ * commit. We want to do the cache_save_setup first and then run the
+ * delayed refs to make sure we have the best chance at doing this all
+ * in one shot.
+ */
+ while (!list_empty(&cur_trans->dirty_bgs)) {
+ cache = list_first_entry(&cur_trans->dirty_bgs,
+ struct btrfs_block_group_cache,
+ dirty_list);
+ list_del_init(&cache->dirty_list);
+ if (cache->disk_cache_state == BTRFS_DC_CLEAR)
+ cache_save_setup(cache, trans, path);
+ if (!ret)
+ ret = btrfs_run_delayed_refs(trans, root,
+ (unsigned long) -1);
+ if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP)
+ btrfs_write_out_cache(root, trans, cache, path);
+ if (!ret)
+ ret = write_one_cache_group(trans, root, path, cache);
btrfs_put_block_group(cache);
}
-out:
btrfs_free_path(path);
- return err;
+ return ret;
}
int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
@@ -5043,19 +4963,25 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
/**
* drop_outstanding_extent - drop an outstanding extent
* @inode: the inode we're dropping the extent for
+ * @num_bytes: the number of bytes we're relaseing.
*
* This is called when we are freeing up an outstanding extent, either called
* after an error or after an extent is written. This will return the number of
* reserved extents that need to be freed. This must be called with
* BTRFS_I(inode)->lock held.
*/
-static unsigned drop_outstanding_extent(struct inode *inode)
+static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
{
unsigned drop_inode_space = 0;
unsigned dropped_extents = 0;
+ unsigned num_extents = 0;
- BUG_ON(!BTRFS_I(inode)->outstanding_extents);
- BTRFS_I(inode)->outstanding_extents--;
+ num_extents = (unsigned)div64_u64(num_bytes +
+ BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+ ASSERT(num_extents);
+ ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents);
+ BTRFS_I(inode)->outstanding_extents -= num_extents;
if (BTRFS_I(inode)->outstanding_extents == 0 &&
test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
@@ -5226,7 +5152,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
out_fail:
spin_lock(&BTRFS_I(inode)->lock);
- dropped = drop_outstanding_extent(inode);
+ dropped = drop_outstanding_extent(inode, num_bytes);
/*
* If the inodes csum_bytes is the same as the original
* csum_bytes then we know we haven't raced with any free()ers
@@ -5305,7 +5231,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
num_bytes = ALIGN(num_bytes, root->sectorsize);
spin_lock(&BTRFS_I(inode)->lock);
- dropped = drop_outstanding_extent(inode);
+ dropped = drop_outstanding_extent(inode, num_bytes);
if (num_bytes)
to_free = calc_csum_metadata_size(inode, num_bytes, 0);
@@ -5375,8 +5301,9 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
btrfs_free_reserved_data_space(inode, num_bytes);
}
-static int update_block_group(struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, int alloc)
+static int update_block_group(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, int alloc)
{
struct btrfs_block_group_cache *cache = NULL;
struct btrfs_fs_info *info = root->fs_info;
@@ -5414,6 +5341,14 @@ static int update_block_group(struct btrfs_root *root,
if (!alloc && cache->cached == BTRFS_CACHE_NO)
cache_block_group(cache, 1);
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (list_empty(&cache->dirty_list)) {
+ list_add_tail(&cache->dirty_list,
+ &trans->transaction->dirty_bgs);
+ btrfs_get_block_group(cache);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+
byte_in_group = bytenr - cache->key.objectid;
WARN_ON(byte_in_group > cache->key.offset);
@@ -5424,7 +5359,6 @@ static int update_block_group(struct btrfs_root *root,
cache->disk_cache_state < BTRFS_DC_CLEAR)
cache->disk_cache_state = BTRFS_DC_CLEAR;
- cache->dirty = 1;
old_val = btrfs_block_group_used(&cache->item);
num_bytes = min(total, cache->key.offset - byte_in_group);
if (alloc) {
@@ -5807,10 +5741,13 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
unpin = &fs_info->freed_extents[0];
while (1) {
+ mutex_lock(&fs_info->unused_bg_unpin_mutex);
ret = find_first_extent_bit(unpin, 0, &start, &end,
EXTENT_DIRTY, NULL);
- if (ret)
+ if (ret) {
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
break;
+ }
if (btrfs_test_opt(root, DISCARD))
ret = btrfs_discard_extent(root, start,
@@ -5818,6 +5755,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
clear_extent_dirty(unpin, start, end, GFP_NOFS);
unpin_extent_range(root, start, end, true);
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
cond_resched();
}
@@ -6103,7 +6041,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
}
}
- ret = update_block_group(root, bytenr, num_bytes, 0);
+ ret = update_block_group(trans, root, bytenr, num_bytes, 0);
if (ret) {
btrfs_abort_transaction(trans, extent_root, ret);
goto out;
@@ -6205,7 +6143,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
struct extent_buffer *buf,
u64 parent, int last_ref)
{
- struct btrfs_block_group_cache *cache = NULL;
int pin = 1;
int ret;
@@ -6221,17 +6158,20 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
if (!last_ref)
return;
- cache = btrfs_lookup_block_group(root->fs_info, buf->start);
-
if (btrfs_header_generation(buf) == trans->transid) {
+ struct btrfs_block_group_cache *cache;
+
if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
ret = check_ref_cleanup(trans, root, buf->start);
if (!ret)
goto out;
}
+ cache = btrfs_lookup_block_group(root->fs_info, buf->start);
+
if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
pin_down_extent(root, cache, buf->start, buf->len, 1);
+ btrfs_put_block_group(cache);
goto out;
}
@@ -6239,6 +6179,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
btrfs_add_free_space(cache, buf->start, buf->len);
btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
+ btrfs_put_block_group(cache);
trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
pin = 0;
}
@@ -6253,7 +6194,6 @@ out:
* anymore.
*/
clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
- btrfs_put_block_group(cache);
}
/* Can return -ENOMEM */
@@ -7063,7 +7003,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- ret = update_block_group(root, ins->objectid, ins->offset, 1);
+ ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
ins->objectid, ins->offset);
@@ -7152,7 +7092,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
return ret;
}
- ret = update_block_group(root, ins->objectid, root->nodesize, 1);
+ ret = update_block_group(trans, root, ins->objectid, root->nodesize,
+ 1);
if (ret) { /* -ENOENT, logic error */
btrfs_err(fs_info, "update block group failed for %llu %llu",
ins->objectid, ins->offset);
@@ -7217,11 +7158,11 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
static struct extent_buffer *
btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- u64 bytenr, u32 blocksize, int level)
+ u64 bytenr, int level)
{
struct extent_buffer *buf;
- buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ buf = btrfs_find_create_tree_block(root, bytenr);
if (!buf)
return ERR_PTR(-ENOMEM);
btrfs_set_header_generation(buf, trans->transid);
@@ -7340,7 +7281,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
if (btrfs_test_is_dummy_root(root)) {
buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
- blocksize, level);
+ level);
if (!IS_ERR(buf))
root->alloc_bytenr += blocksize;
return buf;
@@ -7357,8 +7298,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
return ERR_PTR(ret);
}
- buf = btrfs_init_new_buffer(trans, root, ins.objectid,
- blocksize, level);
+ buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
BUG_ON(IS_ERR(buf)); /* -ENOMEM */
if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
@@ -7487,7 +7427,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
continue;
}
reada:
- readahead_tree_block(root, bytenr, blocksize);
+ readahead_tree_block(root, bytenr);
nread++;
}
wc->reada_slot = slot;
@@ -7828,7 +7768,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
next = btrfs_find_tree_block(root, bytenr);
if (!next) {
- next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ next = btrfs_find_create_tree_block(root, bytenr);
if (!next)
return -ENOMEM;
btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
@@ -8548,14 +8488,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
if (IS_ERR(trans))
return PTR_ERR(trans);
- alloc_flags = update_block_group_flags(root, cache->flags);
- if (alloc_flags != cache->flags) {
- ret = do_chunk_alloc(trans, root, alloc_flags,
- CHUNK_ALLOC_FORCE);
- if (ret < 0)
- goto out;
- }
-
ret = set_block_group_ro(cache, 0);
if (!ret)
goto out;
@@ -8566,6 +8498,11 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
goto out;
ret = set_block_group_ro(cache, 0);
out:
+ if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
+ alloc_flags = update_block_group_flags(root, cache->flags);
+ check_system_chunk(trans, root, alloc_flags);
+ }
+
btrfs_end_transaction(trans, root);
return ret;
}
@@ -9005,6 +8942,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
INIT_LIST_HEAD(&cache->cluster_list);
INIT_LIST_HEAD(&cache->bg_list);
INIT_LIST_HEAD(&cache->ro_list);
+ INIT_LIST_HEAD(&cache->dirty_list);
btrfs_init_free_space_ctl(cache);
atomic_set(&cache->trimming, 0);
@@ -9068,9 +9006,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
* b) Setting 'dirty flag' makes sure that we flush
* the new space cache info onto disk.
*/
- cache->disk_cache_state = BTRFS_DC_CLEAR;
if (btrfs_test_opt(root, SPACE_CACHE))
- cache->dirty = 1;
+ cache->disk_cache_state = BTRFS_DC_CLEAR;
}
read_extent_buffer(leaf, &cache->item,
@@ -9422,7 +9359,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* are still on the list after taking the semaphore
*/
list_del_init(&block_group->list);
- list_del_init(&block_group->ro_list);
if (list_empty(&block_group->space_info->block_groups[index])) {
kobj = block_group->space_info->block_group_kobjs[index];
block_group->space_info->block_group_kobjs[index] = NULL;
@@ -9461,9 +9397,17 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
}
}
+ spin_lock(&trans->transaction->dirty_bgs_lock);
+ if (!list_empty(&block_group->dirty_list)) {
+ list_del_init(&block_group->dirty_list);
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&trans->transaction->dirty_bgs_lock);
+
btrfs_remove_free_space_cache(block_group);
spin_lock(&block_group->space_info->lock);
+ list_del_init(&block_group->ro_list);
block_group->space_info->total_bytes -= block_group->key.offset;
block_group->space_info->bytes_readonly -= block_group->key.offset;
block_group->space_info->disk_total -= block_group->key.offset * factor;
@@ -9611,7 +9555,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* Want to do this before we do anything else so we can recover
* properly if we fail to join the transaction.
*/
- trans = btrfs_join_transaction(root);
+ /* 1 for btrfs_orphan_reserve_metadata() */
+ trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
btrfs_set_block_group_rw(root, block_group);
ret = PTR_ERR(trans);
@@ -9624,18 +9569,33 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
*/
start = block_group->key.objectid;
end = start + block_group->key.offset - 1;
+ /*
+ * Hold the unused_bg_unpin_mutex lock to avoid racing with
+ * btrfs_finish_extent_commit(). If we are at transaction N,
+ * another task might be running finish_extent_commit() for the
+ * previous transaction N - 1, and have seen a range belonging
+ * to the block group in freed_extents[] before we were able to
+ * clear the whole block group range from freed_extents[]. This
+ * means that task can lookup for the block group after we
+ * unpinned it from freed_extents[] and removed it, leading to
+ * a BUG_ON() at btrfs_unpin_extent_range().
+ */
+ mutex_lock(&fs_info->unused_bg_unpin_mutex);
ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
EXTENT_DIRTY, GFP_NOFS);
if (ret) {
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
btrfs_set_block_group_rw(root, block_group);
goto end_trans;
}
ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
EXTENT_DIRTY, GFP_NOFS);
if (ret) {
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
btrfs_set_block_group_rw(root, block_group);
goto end_trans;
}
+ mutex_unlock(&fs_info->unused_bg_unpin_mutex);
/* Reset pinned so btrfs_put_block_group doesn't complain */
block_group->pinned = 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4ebabd237153..c7233ff1d533 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -64,7 +64,7 @@ void btrfs_leak_debug_check(void)
while (!list_empty(&states)) {
state = list_entry(states.next, struct extent_state, leak_list);
- pr_err("BTRFS: state leak: start %llu end %llu state %lu in tree %d refs %d\n",
+ pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
state->start, state->end, state->state,
extent_state_in_tree(state),
atomic_read(&state->refs));
@@ -396,21 +396,21 @@ static void merge_state(struct extent_io_tree *tree,
}
static void set_state_cb(struct extent_io_tree *tree,
- struct extent_state *state, unsigned long *bits)
+ struct extent_state *state, unsigned *bits)
{
if (tree->ops && tree->ops->set_bit_hook)
tree->ops->set_bit_hook(tree->mapping->host, state, bits);
}
static void clear_state_cb(struct extent_io_tree *tree,
- struct extent_state *state, unsigned long *bits)
+ struct extent_state *state, unsigned *bits)
{
if (tree->ops && tree->ops->clear_bit_hook)
tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
}
static void set_state_bits(struct extent_io_tree *tree,
- struct extent_state *state, unsigned long *bits);
+ struct extent_state *state, unsigned *bits);
/*
* insert an extent_state struct into the tree. 'bits' are set on the
@@ -426,7 +426,7 @@ static int insert_state(struct extent_io_tree *tree,
struct extent_state *state, u64 start, u64 end,
struct rb_node ***p,
struct rb_node **parent,
- unsigned long *bits)
+ unsigned *bits)
{
struct rb_node *node;
@@ -511,10 +511,10 @@ static struct extent_state *next_state(struct extent_state *state)
*/
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
struct extent_state *state,
- unsigned long *bits, int wake)
+ unsigned *bits, int wake)
{
struct extent_state *next;
- unsigned long bits_to_clear = *bits & ~EXTENT_CTLBITS;
+ unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
@@ -570,7 +570,7 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
* This takes the tree lock, and returns 0 on success and < 0 on error.
*/
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, int wake, int delete,
+ unsigned bits, int wake, int delete,
struct extent_state **cached_state,
gfp_t mask)
{
@@ -789,9 +789,9 @@ out:
static void set_state_bits(struct extent_io_tree *tree,
struct extent_state *state,
- unsigned long *bits)
+ unsigned *bits)
{
- unsigned long bits_to_set = *bits & ~EXTENT_CTLBITS;
+ unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
set_state_cb(tree, state, bits);
if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
@@ -803,7 +803,7 @@ static void set_state_bits(struct extent_io_tree *tree,
static void cache_state_if_flags(struct extent_state *state,
struct extent_state **cached_ptr,
- const u64 flags)
+ unsigned flags)
{
if (cached_ptr && !(*cached_ptr)) {
if (!flags || (state->state & flags)) {
@@ -833,7 +833,7 @@ static void cache_state(struct extent_state *state,
static int __must_check
__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, unsigned long exclusive_bits,
+ unsigned bits, unsigned exclusive_bits,
u64 *failed_start, struct extent_state **cached_state,
gfp_t mask)
{
@@ -1034,7 +1034,7 @@ search_again:
}
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, u64 * failed_start,
+ unsigned bits, u64 * failed_start,
struct extent_state **cached_state, gfp_t mask)
{
return __set_extent_bit(tree, start, end, bits, 0, failed_start,
@@ -1060,7 +1060,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
* boundary bits like LOCK.
*/
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, unsigned long clear_bits,
+ unsigned bits, unsigned clear_bits,
struct extent_state **cached_state, gfp_t mask)
{
struct extent_state *state;
@@ -1268,14 +1268,14 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
}
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, gfp_t mask)
+ unsigned bits, gfp_t mask)
{
return set_extent_bit(tree, start, end, bits, NULL,
NULL, mask);
}
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, gfp_t mask)
+ unsigned bits, gfp_t mask)
{
return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
}
@@ -1330,10 +1330,11 @@ int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
* us if waiting is desired.
*/
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, struct extent_state **cached_state)
+ unsigned bits, struct extent_state **cached_state)
{
int err;
u64 failed_start;
+
while (1) {
err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
EXTENT_LOCKED, &failed_start,
@@ -1407,8 +1408,8 @@ int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
while (index <= end_index) {
page = find_get_page(inode->i_mapping, index);
BUG_ON(!page); /* Pages should be in the extent_io_tree */
- account_page_redirty(page);
__set_page_dirty_nobuffers(page);
+ account_page_redirty(page);
page_cache_release(page);
index++;
}
@@ -1440,7 +1441,7 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
*/
static struct extent_state *
find_first_extent_bit_state(struct extent_io_tree *tree,
- u64 start, unsigned long bits)
+ u64 start, unsigned bits)
{
struct rb_node *node;
struct extent_state *state;
@@ -1474,7 +1475,7 @@ out:
* If nothing was found, 1 is returned. If found something, return 0.
*/
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned long bits,
+ u64 *start_ret, u64 *end_ret, unsigned bits,
struct extent_state **cached_state)
{
struct extent_state *state;
@@ -1753,7 +1754,7 @@ out_failed:
int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
struct page *locked_page,
- unsigned long clear_bits,
+ unsigned clear_bits,
unsigned long page_ops)
{
struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
@@ -1810,7 +1811,7 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
*/
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end, u64 max_bytes,
- unsigned long bits, int contig)
+ unsigned bits, int contig)
{
struct rb_node *node;
struct extent_state *state;
@@ -1928,7 +1929,7 @@ out:
* range is found set.
*/
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, int filled, struct extent_state *cached)
+ unsigned bits, int filled, struct extent_state *cached)
{
struct extent_state *state = NULL;
struct rb_node *node;
@@ -2057,7 +2058,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
sector = bbio->stripes[mirror_num-1].physical >> 9;
bio->bi_iter.bi_sector = sector;
dev = bbio->stripes[mirror_num-1].dev;
- kfree(bbio);
+ btrfs_put_bbio(bbio);
if (!dev || !dev->bdev || !dev->writeable) {
bio_put(bio);
return -EIO;
@@ -2190,7 +2191,7 @@ void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end)
next = next_state(state);
- failrec = (struct io_failure_record *)state->private;
+ failrec = (struct io_failure_record *)(unsigned long)state->private;
free_extent_state(state);
kfree(failrec);
@@ -2816,8 +2817,10 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
bio_add_page(bio, page, page_size, offset) < page_size) {
ret = submit_one_bio(rw, bio, mirror_num,
prev_bio_flags);
- if (ret < 0)
+ if (ret < 0) {
+ *bio_ret = NULL;
return ret;
+ }
bio = NULL;
} else {
return 0;
@@ -3239,7 +3242,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
page,
&delalloc_start,
&delalloc_end,
- 128 * 1024 * 1024);
+ BTRFS_MAX_EXTENT_SIZE);
if (nr_delalloc == 0) {
delalloc_start = delalloc_end + 1;
continue;
@@ -4598,11 +4601,11 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
static struct extent_buffer *
__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
- unsigned long len, gfp_t mask)
+ unsigned long len)
{
struct extent_buffer *eb = NULL;
- eb = kmem_cache_zalloc(extent_buffer_cache, mask);
+ eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS);
if (eb == NULL)
return NULL;
eb->start = start;
@@ -4643,7 +4646,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
struct extent_buffer *new;
unsigned long num_pages = num_extent_pages(src->start, src->len);
- new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_NOFS);
+ new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
if (new == NULL)
return NULL;
@@ -4666,13 +4669,26 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
return new;
}
-struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start)
{
struct extent_buffer *eb;
- unsigned long num_pages = num_extent_pages(0, len);
+ unsigned long len;
+ unsigned long num_pages;
unsigned long i;
- eb = __alloc_extent_buffer(NULL, start, len, GFP_NOFS);
+ if (!fs_info) {
+ /*
+ * Called only from tests that don't always have a fs_info
+ * available, but we know that nodesize is 4096
+ */
+ len = 4096;
+ } else {
+ len = fs_info->tree_root->nodesize;
+ }
+ num_pages = num_extent_pages(0, len);
+
+ eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
return NULL;
@@ -4762,7 +4778,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start, unsigned long len)
+ u64 start)
{
struct extent_buffer *eb, *exists = NULL;
int ret;
@@ -4770,7 +4786,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
eb = find_extent_buffer(fs_info, start);
if (eb)
return eb;
- eb = alloc_dummy_extent_buffer(start, len);
+ eb = alloc_dummy_extent_buffer(fs_info, start);
if (!eb)
return NULL;
eb->fs_info = fs_info;
@@ -4808,8 +4824,9 @@ free_eb:
#endif
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start, unsigned long len)
+ u64 start)
{
+ unsigned long len = fs_info->tree_root->nodesize;
unsigned long num_pages = num_extent_pages(start, len);
unsigned long i;
unsigned long index = start >> PAGE_CACHE_SHIFT;
@@ -4824,7 +4841,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
if (eb)
return eb;
- eb = __alloc_extent_buffer(fs_info, start, len, GFP_NOFS);
+ eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
return NULL;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index ece9ce87edff..695b0ccfb755 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -4,22 +4,22 @@
#include <linux/rbtree.h>
/* bits for the extent state */
-#define EXTENT_DIRTY 1
-#define EXTENT_WRITEBACK (1 << 1)
-#define EXTENT_UPTODATE (1 << 2)
-#define EXTENT_LOCKED (1 << 3)
-#define EXTENT_NEW (1 << 4)
-#define EXTENT_DELALLOC (1 << 5)
-#define EXTENT_DEFRAG (1 << 6)
-#define EXTENT_BOUNDARY (1 << 9)
-#define EXTENT_NODATASUM (1 << 10)
-#define EXTENT_DO_ACCOUNTING (1 << 11)
-#define EXTENT_FIRST_DELALLOC (1 << 12)
-#define EXTENT_NEED_WAIT (1 << 13)
-#define EXTENT_DAMAGED (1 << 14)
-#define EXTENT_NORESERVE (1 << 15)
-#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
-#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
+#define EXTENT_DIRTY (1U << 0)
+#define EXTENT_WRITEBACK (1U << 1)
+#define EXTENT_UPTODATE (1U << 2)
+#define EXTENT_LOCKED (1U << 3)
+#define EXTENT_NEW (1U << 4)
+#define EXTENT_DELALLOC (1U << 5)
+#define EXTENT_DEFRAG (1U << 6)
+#define EXTENT_BOUNDARY (1U << 9)
+#define EXTENT_NODATASUM (1U << 10)
+#define EXTENT_DO_ACCOUNTING (1U << 11)
+#define EXTENT_FIRST_DELALLOC (1U << 12)
+#define EXTENT_NEED_WAIT (1U << 13)
+#define EXTENT_DAMAGED (1U << 14)
+#define EXTENT_NORESERVE (1U << 15)
+#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
+#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
/*
* flags for bio submission. The high bits indicate the compression
@@ -81,9 +81,9 @@ struct extent_io_ops {
int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
struct extent_state *state, int uptodate);
void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
- unsigned long *bits);
+ unsigned *bits);
void (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
- unsigned long *bits);
+ unsigned *bits);
void (*merge_extent_hook)(struct inode *inode,
struct extent_state *new,
struct extent_state *other);
@@ -108,7 +108,7 @@ struct extent_state {
/* ADD NEW ELEMENTS AFTER THIS */
wait_queue_head_t wq;
atomic_t refs;
- unsigned long state;
+ unsigned state;
/* for use by the FS */
u64 private;
@@ -188,7 +188,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
int try_release_extent_buffer(struct page *page);
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, struct extent_state **cached);
+ unsigned bits, struct extent_state **cached);
int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached, gfp_t mask);
@@ -202,21 +202,21 @@ void extent_io_exit(void);
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end,
- u64 max_bytes, unsigned long bits, int contig);
+ u64 max_bytes, unsigned bits, int contig);
void free_extent_state(struct extent_state *state);
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, int filled,
+ unsigned bits, int filled,
struct extent_state *cached_state);
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, gfp_t mask);
+ unsigned bits, gfp_t mask);
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, int wake, int delete,
+ unsigned bits, int wake, int delete,
struct extent_state **cached, gfp_t mask);
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, gfp_t mask);
+ unsigned bits, gfp_t mask);
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, u64 *failed_start,
+ unsigned bits, u64 *failed_start,
struct extent_state **cached_state, gfp_t mask);
int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask);
@@ -229,14 +229,14 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
gfp_t mask);
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned long bits, unsigned long clear_bits,
+ unsigned bits, unsigned clear_bits,
struct extent_state **cached_state, gfp_t mask);
int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask);
int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask);
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
- u64 *start_ret, u64 *end_ret, unsigned long bits,
+ u64 *start_ret, u64 *end_ret, unsigned bits,
struct extent_state **cached_state);
int extent_invalidatepage(struct extent_io_tree *tree,
struct page *page, unsigned long offset);
@@ -262,8 +262,9 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
void set_page_extent_mapped(struct page *page);
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start, unsigned long len);
-struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len);
+ u64 start);
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+ u64 start);
struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start);
@@ -322,7 +323,7 @@ int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
struct page *locked_page,
- unsigned long bits_to_clear,
+ unsigned bits_to_clear,
unsigned long page_ops);
struct bio *
btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
@@ -377,5 +378,5 @@ noinline u64 find_lock_delalloc_range(struct inode *inode,
u64 *end, u64 max_bytes);
#endif
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
- u64 start, unsigned long len);
+ u64 start);
#endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e4090259569b..b78bbbac900d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1746,7 +1746,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
mutex_lock(&inode->i_mutex);
- current->backing_dev_info = inode->i_mapping->backing_dev_info;
+ current->backing_dev_info = inode_to_bdi(inode);
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err) {
mutex_unlock(&inode->i_mutex);
@@ -2081,7 +2081,6 @@ static const struct vm_operations_struct btrfs_file_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = btrfs_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
};
static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d6c03f7f136b..a71978578fa7 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -651,15 +651,13 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
struct io_ctl io_ctl;
struct btrfs_key key;
struct btrfs_free_space *e, *n;
- struct list_head bitmaps;
+ LIST_HEAD(bitmaps);
u64 num_entries;
u64 num_bitmaps;
u64 generation;
u8 type;
int ret = 0;
- INIT_LIST_HEAD(&bitmaps);
-
/* Nothing in the space cache, goodbye */
if (!i_size_read(inode))
return 0;
@@ -1243,6 +1241,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
struct inode *inode;
int ret = 0;
+ enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN;
root = root->fs_info->tree_root;
@@ -1266,9 +1265,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
path, block_group->key.objectid);
if (ret) {
- spin_lock(&block_group->lock);
- block_group->disk_cache_state = BTRFS_DC_ERROR;
- spin_unlock(&block_group->lock);
+ dcs = BTRFS_DC_ERROR;
ret = 0;
#ifdef DEBUG
btrfs_err(root->fs_info,
@@ -1277,6 +1274,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
#endif
}
+ spin_lock(&block_group->lock);
+ block_group->disk_cache_state = dcs;
+ spin_unlock(&block_group->lock);
iput(inode);
return ret;
}
@@ -2903,7 +2903,6 @@ int btrfs_find_space_cluster(struct btrfs_root *root,
trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
min_bytes);
- INIT_LIST_HEAD(&bitmaps);
ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
bytes + empty_size,
cont1_bytes, min_bytes);
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 8ffa4783cbf4..265e03c73f4d 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -344,6 +344,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
return -ENOMEM;
path->leave_spinning = 1;
+ path->skip_release_on_error = 1;
ret = btrfs_insert_empty_item(trans, root, path, &key,
ins_len);
if (ret == -EEXIST) {
@@ -362,8 +363,12 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
ptr = (unsigned long)(ref + 1);
ret = 0;
} else if (ret < 0) {
- if (ret == -EOVERFLOW)
- ret = -EMLINK;
+ if (ret == -EOVERFLOW) {
+ if (find_name_in_backref(path, name, name_len, &ref))
+ ret = -EEXIST;
+ else
+ ret = -EMLINK;
+ }
goto out;
} else {
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e687bb0dc73a..a85c23dfcddb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1530,10 +1530,45 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
static void btrfs_split_extent_hook(struct inode *inode,
struct extent_state *orig, u64 split)
{
+ u64 size;
+
/* not delalloc, ignore it */
if (!(orig->state & EXTENT_DELALLOC))
return;
+ size = orig->end - orig->start + 1;
+ if (size > BTRFS_MAX_EXTENT_SIZE) {
+ u64 num_extents;
+ u64 new_size;
+
+ /*
+ * We need the largest size of the remaining extent to see if we
+ * need to add a new outstanding extent. Think of the following
+ * case
+ *
+ * [MEAX_EXTENT_SIZEx2 - 4k][4k]
+ *
+ * The new_size would just be 4k and we'd think we had enough
+ * outstanding extents for this if we only took one side of the
+ * split, same goes for the other direction. We need to see if
+ * the larger size still is the same amount of extents as the
+ * original size, because if it is we need to add a new
+ * outstanding extent. But if we split up and the larger size
+ * is less than the original then we are good to go since we've
+ * already accounted for the extra extent in our original
+ * accounting.
+ */
+ new_size = orig->end - split + 1;
+ if ((split - orig->start) > new_size)
+ new_size = split - orig->start;
+
+ num_extents = div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+ if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE) < num_extents)
+ return;
+ }
+
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->outstanding_extents++;
spin_unlock(&BTRFS_I(inode)->lock);
@@ -1549,10 +1584,34 @@ static void btrfs_merge_extent_hook(struct inode *inode,
struct extent_state *new,
struct extent_state *other)
{
+ u64 new_size, old_size;
+ u64 num_extents;
+
/* not delalloc, ignore it */
if (!(other->state & EXTENT_DELALLOC))
return;
+ old_size = other->end - other->start + 1;
+ new_size = old_size + (new->end - new->start + 1);
+
+ /* we're not bigger than the max, unreserve the space and go */
+ if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents--;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ return;
+ }
+
+ /*
+ * If we grew by another max_extent, just return, we want to keep that
+ * reserved amount.
+ */
+ num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+ if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE) > num_extents)
+ return;
+
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->outstanding_extents--;
spin_unlock(&BTRFS_I(inode)->lock);
@@ -1604,7 +1663,7 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root,
* have pending delalloc work to be done.
*/
static void btrfs_set_bit_hook(struct inode *inode,
- struct extent_state *state, unsigned long *bits)
+ struct extent_state *state, unsigned *bits)
{
if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
@@ -1645,9 +1704,11 @@ static void btrfs_set_bit_hook(struct inode *inode,
*/
static void btrfs_clear_bit_hook(struct inode *inode,
struct extent_state *state,
- unsigned long *bits)
+ unsigned *bits)
{
u64 len = state->end + 1 - state->start;
+ u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE -1,
+ BTRFS_MAX_EXTENT_SIZE);
spin_lock(&BTRFS_I(inode)->lock);
if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
@@ -1667,7 +1728,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
*bits &= ~EXTENT_FIRST_DELALLOC;
} else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->outstanding_extents--;
+ BTRFS_I(inode)->outstanding_extents -= num_extents;
spin_unlock(&BTRFS_I(inode)->lock);
}
@@ -2945,7 +3006,7 @@ static int __readpage_endio_check(struct inode *inode,
return 0;
zeroit:
if (__ratelimit(&_rs))
- btrfs_info(BTRFS_I(inode)->root->fs_info,
+ btrfs_warn(BTRFS_I(inode)->root->fs_info,
"csum failed ino %llu off %llu csum %u expected csum %u",
btrfs_ino(inode), start, csum, csum_expected);
memset(kaddr + pgoff, 1, len);
@@ -3407,7 +3468,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
out:
if (ret)
- btrfs_crit(root->fs_info,
+ btrfs_err(root->fs_info,
"could not do orphan cleanup %d", ret);
btrfs_free_path(path);
return ret;
@@ -3490,7 +3551,6 @@ static void btrfs_read_locked_inode(struct inode *inode)
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_inode_item *inode_item;
- struct btrfs_timespec *tspec;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key location;
unsigned long ptr;
@@ -3527,17 +3587,19 @@ static void btrfs_read_locked_inode(struct inode *inode)
i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
- tspec = btrfs_inode_atime(inode_item);
- inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
- inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
+ inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
+ inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
+
+ inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
+ inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
- tspec = btrfs_inode_mtime(inode_item);
- inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
- inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
+ inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
+ inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
- tspec = btrfs_inode_ctime(inode_item);
- inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
- inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
+ BTRFS_I(inode)->i_otime.tv_sec =
+ btrfs_timespec_sec(leaf, &inode_item->otime);
+ BTRFS_I(inode)->i_otime.tv_nsec =
+ btrfs_timespec_nsec(leaf, &inode_item->otime);
inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
@@ -3608,7 +3670,6 @@ cache_acl:
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_mapping->a_ops = &btrfs_aops;
- inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
@@ -3623,7 +3684,6 @@ cache_acl:
case S_IFLNK:
inode->i_op = &btrfs_symlink_inode_operations;
inode->i_mapping->a_ops = &btrfs_symlink_aops;
- inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
break;
default:
inode->i_op = &btrfs_special_inode_operations;
@@ -3658,21 +3718,26 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
- btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
+ btrfs_set_token_timespec_sec(leaf, &item->atime,
inode->i_atime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
+ btrfs_set_token_timespec_nsec(leaf, &item->atime,
inode->i_atime.tv_nsec, &token);
- btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
+ btrfs_set_token_timespec_sec(leaf, &item->mtime,
inode->i_mtime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
+ btrfs_set_token_timespec_nsec(leaf, &item->mtime,
inode->i_mtime.tv_nsec, &token);
- btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
+ btrfs_set_token_timespec_sec(leaf, &item->ctime,
inode->i_ctime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
+ btrfs_set_token_timespec_nsec(leaf, &item->ctime,
inode->i_ctime.tv_nsec, &token);
+ btrfs_set_token_timespec_sec(leaf, &item->otime,
+ BTRFS_I(inode)->i_otime.tv_sec, &token);
+ btrfs_set_token_timespec_nsec(leaf, &item->otime,
+ BTRFS_I(inode)->i_otime.tv_nsec, &token);
+
btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
&token);
btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
@@ -5009,6 +5074,7 @@ static int fixup_tree_root_location(struct btrfs_root *root,
struct btrfs_root *new_root;
struct btrfs_root_ref *ref;
struct extent_buffer *leaf;
+ struct btrfs_key key;
int ret;
int err = 0;
@@ -5019,9 +5085,12 @@ static int fixup_tree_root_location(struct btrfs_root *root,
}
err = -ENOENT;
- ret = btrfs_find_item(root->fs_info->tree_root, path,
- BTRFS_I(dir)->root->root_key.objectid,
- location->objectid, BTRFS_ROOT_REF_KEY, NULL);
+ key.objectid = BTRFS_I(dir)->root->root_key.objectid;
+ key.type = BTRFS_ROOT_REF_KEY;
+ key.offset = location->objectid;
+
+ ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, path,
+ 0, 0);
if (ret) {
if (ret < 0)
err = ret;
@@ -5260,7 +5329,10 @@ static struct inode *new_simple_dir(struct super_block *s,
inode->i_op = &btrfs_dir_ro_inode_operations;
inode->i_fop = &simple_dir_operations;
inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+ inode->i_mtime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime;
+ inode->i_ctime = inode->i_mtime;
+ BTRFS_I(inode)->i_otime = inode->i_mtime;
return inode;
}
@@ -5828,7 +5900,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
inode_init_owner(inode, dir, mode);
inode_set_bytes(inode, 0);
- inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+
+ inode->i_mtime = CURRENT_TIME;
+ inode->i_atime = inode->i_mtime;
+ inode->i_ctime = inode->i_mtime;
+ BTRFS_I(inode)->i_otime = inode->i_mtime;
+
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
@@ -6088,7 +6165,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
inode->i_mapping->a_ops = &btrfs_aops;
- inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
if (err)
@@ -6255,8 +6331,10 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
out_fail:
btrfs_end_transaction(trans, root);
- if (drop_on_err)
+ if (drop_on_err) {
+ inode_dec_link_count(inode);
iput(inode);
+ }
btrfs_balance_delayed_items(root);
btrfs_btree_balance_dirty(root);
return err;
@@ -7135,11 +7213,12 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
u64 start = iblock << inode->i_blkbits;
u64 lockstart, lockend;
u64 len = bh_result->b_size;
+ u64 orig_len = len;
int unlock_bits = EXTENT_LOCKED;
int ret = 0;
if (create)
- unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
+ unlock_bits |= EXTENT_DIRTY;
else
len = min_t(u64, len, root->sectorsize);
@@ -7270,14 +7349,12 @@ unlock:
if (start + len > i_size_read(inode))
i_size_write(inode, start + len);
- spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->outstanding_extents++;
- spin_unlock(&BTRFS_I(inode)->lock);
-
- ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
- lockstart + len - 1, EXTENT_DELALLOC, NULL,
- &cached_state, GFP_NOFS);
- BUG_ON(ret);
+ if (len < orig_len) {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents++;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ }
+ btrfs_free_reserved_data_space(inode, len);
}
/*
@@ -7806,8 +7883,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
}
/* async crcs make it difficult to collect full stripe writes. */
- if (btrfs_get_alloc_profile(root, 1) &
- (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
+ if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK)
async_submit = 0;
else
async_submit = 1;
@@ -8054,8 +8130,6 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
else if (ret >= 0 && (size_t)ret < count)
btrfs_delalloc_release_space(inode,
count - (size_t)ret);
- else
- btrfs_delalloc_release_metadata(inode, 0);
}
out:
if (wakeup)
@@ -8576,6 +8650,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->delayed_node = NULL;
+ ei->i_otime.tv_sec = 0;
+ ei->i_otime.tv_nsec = 0;
+
inode = &ei->vfs_inode;
extent_map_tree_init(&ei->extent_tree);
extent_io_tree_init(&ei->io_tree, &inode->i_data);
@@ -9201,7 +9278,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
inode->i_fop = &btrfs_file_operations;
inode->i_op = &btrfs_file_inode_operations;
inode->i_mapping->a_ops = &btrfs_aops;
- inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
@@ -9245,7 +9321,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
inode->i_op = &btrfs_symlink_inode_operations;
inode->i_mapping->a_ops = &btrfs_symlink_aops;
- inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
inode_set_bytes(inode, name_len);
btrfs_i_size_write(inode, name_len);
err = btrfs_update_inode(trans, root, inode);
@@ -9457,7 +9532,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
inode->i_op = &btrfs_file_inode_operations;
inode->i_mapping->a_ops = &btrfs_aops;
- inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
ret = btrfs_init_inode_security(trans, inode, dir, NULL);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 48b60dbf807f..97159a8e91d4 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1431,9 +1431,8 @@ static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
qgroup = u64_to_ptr(unode->aux);
qgroup->rfer += sign * oper->num_bytes;
qgroup->rfer_cmpr += sign * oper->num_bytes;
+ WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes);
qgroup->excl += sign * oper->num_bytes;
- if (sign < 0)
- WARN_ON(qgroup->excl < oper->num_bytes);
qgroup->excl_cmpr += sign * oper->num_bytes;
qgroup_dirty(fs_info, qgroup);
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 8ab2a17bbba8..5264858ed768 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -58,15 +58,6 @@
*/
#define RBIO_CACHE_READY_BIT 3
-/*
- * bbio and raid_map is managed by the caller, so we shouldn't free
- * them here. And besides that, all rbios with this flag should not
- * be cached, because we need raid_map to check the rbios' stripe
- * is the same or not, but it is very likely that the caller has
- * free raid_map, so don't cache those rbios.
- */
-#define RBIO_HOLD_BBIO_MAP_BIT 4
-
#define RBIO_CACHE_SIZE 1024
enum btrfs_rbio_ops {
@@ -79,13 +70,6 @@ struct btrfs_raid_bio {
struct btrfs_fs_info *fs_info;
struct btrfs_bio *bbio;
- /*
- * logical block numbers for the start of each stripe
- * The last one or two are p/q. These are sorted,
- * so raid_map[0] is the start of our full stripe
- */
- u64 *raid_map;
-
/* while we're doing rmw on a stripe
* we put it into a hash table so we can
* lock the stripe and merge more rbios
@@ -303,7 +287,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
*/
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
- u64 num = rbio->raid_map[0];
+ u64 num = rbio->bbio->raid_map[0];
/*
* we shift down quite a bit. We're using byte
@@ -606,8 +590,8 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
test_bit(RBIO_CACHE_BIT, &cur->flags))
return 0;
- if (last->raid_map[0] !=
- cur->raid_map[0])
+ if (last->bbio->raid_map[0] !=
+ cur->bbio->raid_map[0])
return 0;
/* we can't merge with different operations */
@@ -689,7 +673,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
spin_lock_irqsave(&h->lock, flags);
list_for_each_entry(cur, &h->hash_list, hash_list) {
walk++;
- if (cur->raid_map[0] == rbio->raid_map[0]) {
+ if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) {
spin_lock(&cur->bio_list_lock);
/* can we steal this cached rbio's pages? */
@@ -841,21 +825,6 @@ done_nolock:
remove_rbio_from_cache(rbio);
}
-static inline void
-__free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need)
-{
- if (need) {
- kfree(raid_map);
- kfree(bbio);
- }
-}
-
-static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio)
-{
- __free_bbio_and_raid_map(rbio->bbio, rbio->raid_map,
- !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags));
-}
-
static void __free_raid_bio(struct btrfs_raid_bio *rbio)
{
int i;
@@ -875,8 +844,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
}
}
- free_bbio_and_raid_map(rbio);
-
+ btrfs_put_bbio(rbio->bbio);
kfree(rbio);
}
@@ -985,8 +953,7 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
* this does not allocate any pages for rbio->pages.
*/
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
- struct btrfs_bio *bbio, u64 *raid_map,
- u64 stripe_len)
+ struct btrfs_bio *bbio, u64 stripe_len)
{
struct btrfs_raid_bio *rbio;
int nr_data = 0;
@@ -1007,7 +974,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
INIT_LIST_HEAD(&rbio->stripe_cache);
INIT_LIST_HEAD(&rbio->hash_list);
rbio->bbio = bbio;
- rbio->raid_map = raid_map;
rbio->fs_info = root->fs_info;
rbio->stripe_len = stripe_len;
rbio->nr_pages = num_pages;
@@ -1028,10 +994,12 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
rbio->bio_pages = p + sizeof(struct page *) * num_pages;
rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2;
- if (raid_map[real_stripes - 1] == RAID6_Q_STRIPE)
+ if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
+ nr_data = real_stripes - 1;
+ else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
nr_data = real_stripes - 2;
else
- nr_data = real_stripes - 1;
+ BUG();
rbio->nr_data = nr_data;
return rbio;
@@ -1182,7 +1150,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
spin_lock_irq(&rbio->bio_list_lock);
bio_list_for_each(bio, &rbio->bio_list) {
start = (u64)bio->bi_iter.bi_sector << 9;
- stripe_offset = start - rbio->raid_map[0];
+ stripe_offset = start - rbio->bbio->raid_map[0];
page_index = stripe_offset >> PAGE_CACHE_SHIFT;
for (i = 0; i < bio->bi_vcnt; i++) {
@@ -1402,7 +1370,7 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
logical <<= 9;
for (i = 0; i < rbio->nr_data; i++) {
- stripe_start = rbio->raid_map[i];
+ stripe_start = rbio->bbio->raid_map[i];
if (logical >= stripe_start &&
logical < stripe_start + rbio->stripe_len) {
return i;
@@ -1776,17 +1744,16 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
* our main entry point for writes from the rest of the FS.
*/
int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
- struct btrfs_bio *bbio, u64 *raid_map,
- u64 stripe_len)
+ struct btrfs_bio *bbio, u64 stripe_len)
{
struct btrfs_raid_bio *rbio;
struct btrfs_plug_cb *plug = NULL;
struct blk_plug_cb *cb;
int ret;
- rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+ rbio = alloc_rbio(root, bbio, stripe_len);
if (IS_ERR(rbio)) {
- __free_bbio_and_raid_map(bbio, raid_map, 1);
+ btrfs_put_bbio(bbio);
return PTR_ERR(rbio);
}
bio_list_add(&rbio->bio_list, bio);
@@ -1885,9 +1852,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
}
/* all raid6 handling here */
- if (rbio->raid_map[rbio->real_stripes - 1] ==
- RAID6_Q_STRIPE) {
-
+ if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
/*
* single failure, rebuild from parity raid5
* style
@@ -1922,8 +1887,9 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
* here due to a crc mismatch and we can't give them the
* data they want
*/
- if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
- if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
+ if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
+ if (rbio->bbio->raid_map[faila] ==
+ RAID5_P_STRIPE) {
err = -EIO;
goto cleanup;
}
@@ -1934,7 +1900,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
goto pstripe;
}
- if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
+ if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
raid6_datap_recov(rbio->real_stripes,
PAGE_SIZE, faila, pointers);
} else {
@@ -2001,8 +1967,7 @@ cleanup:
cleanup_io:
if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
- if (err == 0 &&
- !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
+ if (err == 0)
cache_rbio_pages(rbio);
else
clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
@@ -2156,15 +2121,16 @@ cleanup:
* of the drive.
*/
int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
- struct btrfs_bio *bbio, u64 *raid_map,
- u64 stripe_len, int mirror_num, int generic_io)
+ struct btrfs_bio *bbio, u64 stripe_len,
+ int mirror_num, int generic_io)
{
struct btrfs_raid_bio *rbio;
int ret;
- rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+ rbio = alloc_rbio(root, bbio, stripe_len);
if (IS_ERR(rbio)) {
- __free_bbio_and_raid_map(bbio, raid_map, generic_io);
+ if (generic_io)
+ btrfs_put_bbio(bbio);
return PTR_ERR(rbio);
}
@@ -2175,7 +2141,8 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
rbio->faila = find_logical_bio_stripe(rbio, bio);
if (rbio->faila == -1) {
BUG();
- __free_bbio_and_raid_map(bbio, raid_map, generic_io);
+ if (generic_io)
+ btrfs_put_bbio(bbio);
kfree(rbio);
return -EIO;
}
@@ -2184,7 +2151,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
btrfs_bio_counter_inc_noblocked(root->fs_info);
rbio->generic_bio_cnt = 1;
} else {
- set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);
+ btrfs_get_bbio(bbio);
}
/*
@@ -2240,14 +2207,14 @@ static void read_rebuild_work(struct btrfs_work *work)
struct btrfs_raid_bio *
raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
- struct btrfs_bio *bbio, u64 *raid_map,
- u64 stripe_len, struct btrfs_device *scrub_dev,
+ struct btrfs_bio *bbio, u64 stripe_len,
+ struct btrfs_device *scrub_dev,
unsigned long *dbitmap, int stripe_nsectors)
{
struct btrfs_raid_bio *rbio;
int i;
- rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
+ rbio = alloc_rbio(root, bbio, stripe_len);
if (IS_ERR(rbio))
return NULL;
bio_list_add(&rbio->bio_list, bio);
@@ -2279,10 +2246,10 @@ void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
int stripe_offset;
int index;
- ASSERT(logical >= rbio->raid_map[0]);
- ASSERT(logical + PAGE_SIZE <= rbio->raid_map[0] +
+ ASSERT(logical >= rbio->bbio->raid_map[0]);
+ ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
rbio->stripe_len * rbio->nr_data);
- stripe_offset = (int)(logical - rbio->raid_map[0]);
+ stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
index = stripe_offset >> PAGE_CACHE_SHIFT;
rbio->bio_pages[index] = page;
}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 31d4a157b5e3..2b5d7977d83b 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -43,16 +43,15 @@ struct btrfs_raid_bio;
struct btrfs_device;
int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
- struct btrfs_bio *bbio, u64 *raid_map,
- u64 stripe_len, int mirror_num, int generic_io);
+ struct btrfs_bio *bbio, u64 stripe_len,
+ int mirror_num, int generic_io);
int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
- struct btrfs_bio *bbio, u64 *raid_map,
- u64 stripe_len);
+ struct btrfs_bio *bbio, u64 stripe_len);
struct btrfs_raid_bio *
raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
- struct btrfs_bio *bbio, u64 *raid_map,
- u64 stripe_len, struct btrfs_device *scrub_dev,
+ struct btrfs_bio *bbio, u64 stripe_len,
+ struct btrfs_device *scrub_dev,
unsigned long *dbitmap, int stripe_nsectors);
void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
struct page *page, u64 logical);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index b63ae20618fb..0e7beea92b4c 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -66,7 +66,6 @@ struct reada_extctl {
struct reada_extent {
u64 logical;
struct btrfs_key top;
- u32 blocksize;
int err;
struct list_head extctl;
int refcnt;
@@ -349,7 +348,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
blocksize = root->nodesize;
re->logical = logical;
- re->blocksize = blocksize;
re->top = *top;
INIT_LIST_HEAD(&re->extctl);
spin_lock_init(&re->lock);
@@ -463,7 +461,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
spin_unlock(&fs_info->reada_lock);
btrfs_dev_replace_unlock(&fs_info->dev_replace);
- kfree(bbio);
+ btrfs_put_bbio(bbio);
return re;
error:
@@ -488,7 +486,7 @@ error:
kref_put(&zone->refcnt, reada_zone_release);
spin_unlock(&fs_info->reada_lock);
}
- kfree(bbio);
+ btrfs_put_bbio(bbio);
kfree(re);
return re_exist;
}
@@ -660,7 +658,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
int mirror_num = 0;
struct extent_buffer *eb = NULL;
u64 logical;
- u32 blocksize;
int ret;
int i;
int need_kick = 0;
@@ -694,7 +691,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
spin_unlock(&fs_info->reada_lock);
return 0;
}
- dev->reada_next = re->logical + re->blocksize;
+ dev->reada_next = re->logical + fs_info->tree_root->nodesize;
re->refcnt++;
spin_unlock(&fs_info->reada_lock);
@@ -709,7 +706,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
}
}
logical = re->logical;
- blocksize = re->blocksize;
spin_lock(&re->lock);
if (re->scheduled_for == NULL) {
@@ -724,8 +720,8 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
return 0;
atomic_inc(&dev->reada_in_flight);
- ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize,
- mirror_num, &eb);
+ ret = reada_tree_block_flagged(fs_info->extent_root, logical,
+ mirror_num, &eb);
if (ret)
__readahead_hook(fs_info->extent_root, NULL, logical, ret);
else if (eb)
@@ -851,7 +847,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
break;
printk(KERN_DEBUG
" re: logical %llu size %u empty %d for %lld",
- re->logical, re->blocksize,
+ re->logical, fs_info->tree_root->nodesize,
list_empty(&re->extctl), re->scheduled_for ?
re->scheduled_for->devid : -1);
@@ -886,7 +882,8 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
}
printk(KERN_DEBUG
"re: logical %llu size %u list empty %d for %lld",
- re->logical, re->blocksize, list_empty(&re->extctl),
+ re->logical, fs_info->tree_root->nodesize,
+ list_empty(&re->extctl),
re->scheduled_for ? re->scheduled_for->devid : -1);
for (i = 0; i < re->nzones; ++i) {
printk(KERN_CONT " zone %llu-%llu devs",
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 74257d6436ad..d83085381bcc 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2855,9 +2855,10 @@ static void update_processed_blocks(struct reloc_control *rc,
}
}
-static int tree_block_processed(u64 bytenr, u32 blocksize,
- struct reloc_control *rc)
+static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
{
+ u32 blocksize = rc->extent_root->nodesize;
+
if (test_range_bit(&rc->processed_blocks, bytenr,
bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
return 1;
@@ -2965,8 +2966,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
while (rb_node) {
block = rb_entry(rb_node, struct tree_block, rb_node);
if (!block->key_ready)
- readahead_tree_block(rc->extent_root, block->bytenr,
- block->key.objectid);
+ readahead_tree_block(rc->extent_root, block->bytenr);
rb_node = rb_next(rb_node);
}
@@ -3353,7 +3353,7 @@ static int __add_tree_block(struct reloc_control *rc,
bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
SKINNY_METADATA);
- if (tree_block_processed(bytenr, blocksize, rc))
+ if (tree_block_processed(bytenr, rc))
return 0;
if (tree_search(blocks, bytenr))
@@ -3611,7 +3611,7 @@ static int find_data_references(struct reloc_control *rc,
if (added)
goto next;
- if (!tree_block_processed(leaf->start, leaf->len, rc)) {
+ if (!tree_block_processed(leaf->start, rc)) {
block = kmalloc(sizeof(*block), GFP_NOFS);
if (!block) {
err = -ENOMEM;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f2bb13a23f86..ec57687c9a4d 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -66,7 +66,6 @@ struct scrub_ctx;
struct scrub_recover {
atomic_t refs;
struct btrfs_bio *bbio;
- u64 *raid_map;
u64 map_length;
};
@@ -80,7 +79,7 @@ struct scrub_page {
u64 logical;
u64 physical;
u64 physical_for_dev_replace;
- atomic_t ref_count;
+ atomic_t refs;
struct {
unsigned int mirror_num:8;
unsigned int have_csum:1;
@@ -113,7 +112,7 @@ struct scrub_block {
struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
int page_count;
atomic_t outstanding_pages;
- atomic_t ref_count; /* free mem on transition to zero */
+ atomic_t refs; /* free mem on transition to zero */
struct scrub_ctx *sctx;
struct scrub_parity *sparity;
struct {
@@ -142,7 +141,7 @@ struct scrub_parity {
int stripe_len;
- atomic_t ref_count;
+ atomic_t refs;
struct list_head spages;
@@ -194,6 +193,15 @@ struct scrub_ctx {
*/
struct btrfs_scrub_progress stat;
spinlock_t stat_lock;
+
+ /*
+ * Use a ref counter to avoid use-after-free issues. Scrub workers
+ * decrement bios_in_flight and workers_pending and then do a wakeup
+ * on the list_wait wait queue. We must ensure the main scrub task
+ * doesn't free the scrub context before or while the workers are
+ * doing the wakeup() call.
+ */
+ atomic_t refs;
};
struct scrub_fixup_nodatasum {
@@ -236,10 +244,7 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
-static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
- struct btrfs_fs_info *fs_info,
- struct scrub_block *original_sblock,
- u64 length, u64 logical,
+static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
struct scrub_block *sblocks_for_recheck);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
struct scrub_block *sblock, int is_metadata,
@@ -251,8 +256,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
const u8 *csum, u64 generation,
u16 csum_size);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
- struct scrub_block *sblock_good,
- int force_write);
+ struct scrub_block *sblock_good);
static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good,
int page_num, int force_write);
@@ -302,10 +306,12 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
static void copy_nocow_pages_worker(struct btrfs_work *work);
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
+static void scrub_put_ctx(struct scrub_ctx *sctx);
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
{
+ atomic_inc(&sctx->refs);
atomic_inc(&sctx->bios_in_flight);
}
@@ -313,6 +319,7 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
{
atomic_dec(&sctx->bios_in_flight);
wake_up(&sctx->list_wait);
+ scrub_put_ctx(sctx);
}
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
@@ -346,6 +353,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
{
struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+ atomic_inc(&sctx->refs);
/*
* increment scrubs_running to prevent cancel requests from
* completing as long as a worker is running. we must also
@@ -388,6 +396,7 @@ static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
atomic_dec(&sctx->workers_pending);
wake_up(&fs_info->scrub_pause_wait);
wake_up(&sctx->list_wait);
+ scrub_put_ctx(sctx);
}
static void scrub_free_csums(struct scrub_ctx *sctx)
@@ -433,6 +442,12 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
kfree(sctx);
}
+static void scrub_put_ctx(struct scrub_ctx *sctx)
+{
+ if (atomic_dec_and_test(&sctx->refs))
+ scrub_free_ctx(sctx);
+}
+
static noinline_for_stack
struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
{
@@ -457,6 +472,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
if (!sctx)
goto nomem;
+ atomic_set(&sctx->refs, 1);
sctx->is_dev_replace = is_dev_replace;
sctx->pages_per_rd_bio = pages_per_rd_bio;
sctx->curr = -1;
@@ -520,6 +536,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
struct inode_fs_paths *ipath = NULL;
struct btrfs_root *local_root;
struct btrfs_key root_key;
+ struct btrfs_key key;
root_key.objectid = root;
root_key.type = BTRFS_ROOT_ITEM_KEY;
@@ -530,7 +547,14 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
goto err;
}
- ret = inode_item_info(inum, 0, local_root, swarn->path);
+ /*
+ * this makes the path point to (inum INODE_ITEM ioff)
+ */
+ key.objectid = inum;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
if (ret) {
btrfs_release_path(swarn->path);
goto err;
@@ -848,8 +872,7 @@ static inline void scrub_get_recover(struct scrub_recover *recover)
static inline void scrub_put_recover(struct scrub_recover *recover)
{
if (atomic_dec_and_test(&recover->refs)) {
- kfree(recover->bbio);
- kfree(recover->raid_map);
+ btrfs_put_bbio(recover->bbio);
kfree(recover);
}
}
@@ -955,8 +978,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
}
/* setup the context, map the logical blocks and alloc the pages */
- ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
- logical, sblocks_for_recheck);
+ ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
if (ret) {
spin_lock(&sctx->stat_lock);
sctx->stat.read_errors++;
@@ -1030,9 +1052,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
if (!is_metadata && !have_csum) {
struct scrub_fixup_nodatasum *fixup_nodatasum;
-nodatasum_case:
WARN_ON(sctx->is_dev_replace);
+nodatasum_case:
+
/*
* !is_metadata and !have_csum, this means that the data
* might not be COW'ed, that it might be modified
@@ -1091,76 +1114,20 @@ nodatasum_case:
sblock_other->no_io_error_seen) {
if (sctx->is_dev_replace) {
scrub_write_block_to_dev_replace(sblock_other);
+ goto corrected_error;
} else {
- int force_write = is_metadata || have_csum;
-
ret = scrub_repair_block_from_good_copy(
- sblock_bad, sblock_other,
- force_write);
+ sblock_bad, sblock_other);
+ if (!ret)
+ goto corrected_error;
}
- if (0 == ret)
- goto corrected_error;
}
}
- /*
- * for dev_replace, pick good pages and write to the target device.
- */
- if (sctx->is_dev_replace) {
- success = 1;
- for (page_num = 0; page_num < sblock_bad->page_count;
- page_num++) {
- int sub_success;
-
- sub_success = 0;
- for (mirror_index = 0;
- mirror_index < BTRFS_MAX_MIRRORS &&
- sblocks_for_recheck[mirror_index].page_count > 0;
- mirror_index++) {
- struct scrub_block *sblock_other =
- sblocks_for_recheck + mirror_index;
- struct scrub_page *page_other =
- sblock_other->pagev[page_num];
-
- if (!page_other->io_error) {
- ret = scrub_write_page_to_dev_replace(
- sblock_other, page_num);
- if (ret == 0) {
- /* succeeded for this page */
- sub_success = 1;
- break;
- } else {
- btrfs_dev_replace_stats_inc(
- &sctx->dev_root->
- fs_info->dev_replace.
- num_write_errors);
- }
- }
- }
-
- if (!sub_success) {
- /*
- * did not find a mirror to fetch the page
- * from. scrub_write_page_to_dev_replace()
- * handles this case (page->io_error), by
- * filling the block with zeros before
- * submitting the write request
- */
- success = 0;
- ret = scrub_write_page_to_dev_replace(
- sblock_bad, page_num);
- if (ret)
- btrfs_dev_replace_stats_inc(
- &sctx->dev_root->fs_info->
- dev_replace.num_write_errors);
- }
- }
-
- goto out;
- }
+ if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
+ goto did_not_correct_error;
/*
- * for regular scrub, repair those pages that are errored.
* In case of I/O errors in the area that is supposed to be
* repaired, continue by picking good copies of those pages.
* Select the good pages from mirrors to rewrite bad pages from
@@ -1184,44 +1151,64 @@ nodatasum_case:
* mirror, even if other 512 byte sectors in the same PAGE_SIZE
* area are unreadable.
*/
-
- /* can only fix I/O errors from here on */
- if (sblock_bad->no_io_error_seen)
- goto did_not_correct_error;
-
success = 1;
- for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
+ for (page_num = 0; page_num < sblock_bad->page_count;
+ page_num++) {
struct scrub_page *page_bad = sblock_bad->pagev[page_num];
+ struct scrub_block *sblock_other = NULL;
- if (!page_bad->io_error)
+ /* skip no-io-error page in scrub */
+ if (!page_bad->io_error && !sctx->is_dev_replace)
continue;
- for (mirror_index = 0;
- mirror_index < BTRFS_MAX_MIRRORS &&
- sblocks_for_recheck[mirror_index].page_count > 0;
- mirror_index++) {
- struct scrub_block *sblock_other = sblocks_for_recheck +
- mirror_index;
- struct scrub_page *page_other = sblock_other->pagev[
- page_num];
-
- if (!page_other->io_error) {
- ret = scrub_repair_page_from_good_copy(
- sblock_bad, sblock_other, page_num, 0);
- if (0 == ret) {
- page_bad->io_error = 0;
- break; /* succeeded for this page */
+ /* try to find no-io-error page in mirrors */
+ if (page_bad->io_error) {
+ for (mirror_index = 0;
+ mirror_index < BTRFS_MAX_MIRRORS &&
+ sblocks_for_recheck[mirror_index].page_count > 0;
+ mirror_index++) {
+ if (!sblocks_for_recheck[mirror_index].
+ pagev[page_num]->io_error) {
+ sblock_other = sblocks_for_recheck +
+ mirror_index;
+ break;
}
}
+ if (!sblock_other)
+ success = 0;
}
- if (page_bad->io_error) {
- /* did not find a mirror to copy the page from */
- success = 0;
+ if (sctx->is_dev_replace) {
+ /*
+ * did not find a mirror to fetch the page
+ * from. scrub_write_page_to_dev_replace()
+ * handles this case (page->io_error), by
+ * filling the block with zeros before
+ * submitting the write request
+ */
+ if (!sblock_other)
+ sblock_other = sblock_bad;
+
+ if (scrub_write_page_to_dev_replace(sblock_other,
+ page_num) != 0) {
+ btrfs_dev_replace_stats_inc(
+ &sctx->dev_root->
+ fs_info->dev_replace.
+ num_write_errors);
+ success = 0;
+ }
+ } else if (sblock_other) {
+ ret = scrub_repair_page_from_good_copy(sblock_bad,
+ sblock_other,
+ page_num, 0);
+ if (0 == ret)
+ page_bad->io_error = 0;
+ else
+ success = 0;
}
}
- if (success) {
+ if (success && !sctx->is_dev_replace) {
if (is_metadata || have_csum) {
/*
* need to verify the checksum now that all
@@ -1288,19 +1275,18 @@ out:
return 0;
}
-static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map)
+static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
{
- if (raid_map) {
- if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
- return 3;
- else
- return 2;
- } else {
+ if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
+ return 2;
+ else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
+ return 3;
+ else
return (int)bbio->num_stripes;
- }
}
-static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
+static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
+ u64 *raid_map,
u64 mapped_length,
int nstripes, int mirror,
int *stripe_index,
@@ -1308,7 +1294,7 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
{
int i;
- if (raid_map) {
+ if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
/* RAID5/6 */
for (i = 0; i < nstripes; i++) {
if (raid_map[i] == RAID6_Q_STRIPE ||
@@ -1329,72 +1315,65 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
}
}
-static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
- struct btrfs_fs_info *fs_info,
- struct scrub_block *original_sblock,
- u64 length, u64 logical,
+static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
struct scrub_block *sblocks_for_recheck)
{
+ struct scrub_ctx *sctx = original_sblock->sctx;
+ struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+ u64 length = original_sblock->page_count * PAGE_SIZE;
+ u64 logical = original_sblock->pagev[0]->logical;
struct scrub_recover *recover;
struct btrfs_bio *bbio;
- u64 *raid_map;
u64 sublen;
u64 mapped_length;
u64 stripe_offset;
int stripe_index;
- int page_index;
+ int page_index = 0;
int mirror_index;
int nmirrors;
int ret;
/*
- * note: the two members ref_count and outstanding_pages
+ * note: the two members refs and outstanding_pages
* are not used (and not set) in the blocks that are used for
* the recheck procedure
*/
- page_index = 0;
while (length > 0) {
sublen = min_t(u64, length, PAGE_SIZE);
mapped_length = sublen;
bbio = NULL;
- raid_map = NULL;
/*
* with a length of PAGE_SIZE, each returned stripe
* represents one mirror
*/
ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
- &mapped_length, &bbio, 0, &raid_map);
+ &mapped_length, &bbio, 0, 1);
if (ret || !bbio || mapped_length < sublen) {
- kfree(bbio);
- kfree(raid_map);
+ btrfs_put_bbio(bbio);
return -EIO;
}
recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
if (!recover) {
- kfree(bbio);
- kfree(raid_map);
+ btrfs_put_bbio(bbio);
return -ENOMEM;
}
atomic_set(&recover->refs, 1);
recover->bbio = bbio;
- recover->raid_map = raid_map;
recover->map_length = mapped_length;
BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
- nmirrors = scrub_nr_raid_mirrors(bbio, raid_map);
+ nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
+
for (mirror_index = 0; mirror_index < nmirrors;
mirror_index++) {
struct scrub_block *sblock;
struct scrub_page *page;
- if (mirror_index >= BTRFS_MAX_MIRRORS)
- continue;
-
sblock = sblocks_for_recheck + mirror_index;
sblock->sctx = sctx;
page = kzalloc(sizeof(*page), GFP_NOFS);
@@ -1410,9 +1389,12 @@ leave_nomem:
sblock->pagev[page_index] = page;
page->logical = logical;
- scrub_stripe_index_and_offset(logical, raid_map,
+ scrub_stripe_index_and_offset(logical,
+ bbio->map_type,
+ bbio->raid_map,
mapped_length,
- bbio->num_stripes,
+ bbio->num_stripes -
+ bbio->num_tgtdevs,
mirror_index,
&stripe_index,
&stripe_offset);
@@ -1458,7 +1440,8 @@ static void scrub_bio_wait_endio(struct bio *bio, int error)
static inline int scrub_is_page_on_raid56(struct scrub_page *page)
{
- return page->recover && page->recover->raid_map;
+ return page->recover &&
+ (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
}
static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
@@ -1475,7 +1458,6 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
bio->bi_end_io = scrub_bio_wait_endio;
ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
- page->recover->raid_map,
page->recover->map_length,
page->mirror_num, 0);
if (ret)
@@ -1615,8 +1597,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
}
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
- struct scrub_block *sblock_good,
- int force_write)
+ struct scrub_block *sblock_good)
{
int page_num;
int ret = 0;
@@ -1626,8 +1607,7 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
sblock_good,
- page_num,
- force_write);
+ page_num, 1);
if (ret_sub)
ret = ret_sub;
}
@@ -2067,12 +2047,12 @@ static int scrub_checksum_super(struct scrub_block *sblock)
static void scrub_block_get(struct scrub_block *sblock)
{
- atomic_inc(&sblock->ref_count);
+ atomic_inc(&sblock->refs);
}
static void scrub_block_put(struct scrub_block *sblock)
{
- if (atomic_dec_and_test(&sblock->ref_count)) {
+ if (atomic_dec_and_test(&sblock->refs)) {
int i;
if (sblock->sparity)
@@ -2086,12 +2066,12 @@ static void scrub_block_put(struct scrub_block *sblock)
static void scrub_page_get(struct scrub_page *spage)
{
- atomic_inc(&spage->ref_count);
+ atomic_inc(&spage->refs);
}
static void scrub_page_put(struct scrub_page *spage)
{
- if (atomic_dec_and_test(&spage->ref_count)) {
+ if (atomic_dec_and_test(&spage->refs)) {
if (spage->page)
__free_page(spage->page);
kfree(spage);
@@ -2217,7 +2197,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
/* one ref inside this function, plus one for each page added to
* a bio later on */
- atomic_set(&sblock->ref_count, 1);
+ atomic_set(&sblock->refs, 1);
sblock->sctx = sctx;
sblock->no_io_error_seen = 1;
@@ -2510,7 +2490,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
/* one ref inside this function, plus one for each page added to
* a bio later on */
- atomic_set(&sblock->ref_count, 1);
+ atomic_set(&sblock->refs, 1);
sblock->sctx = sctx;
sblock->no_io_error_seen = 1;
sblock->sparity = sparity;
@@ -2607,9 +2587,9 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
flags, gen, mirror_num,
have_csum ? csum : NULL);
-skip:
if (ret)
return ret;
+skip:
len -= l;
logical += l;
physical += l;
@@ -2705,7 +2685,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
struct btrfs_raid_bio *rbio;
struct scrub_page *spage;
struct btrfs_bio *bbio = NULL;
- u64 *raid_map = NULL;
u64 length;
int ret;
@@ -2716,8 +2695,8 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
length = sparity->logic_end - sparity->logic_start + 1;
ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
sparity->logic_start,
- &length, &bbio, 0, &raid_map);
- if (ret || !bbio || !raid_map)
+ &length, &bbio, 0, 1);
+ if (ret || !bbio || !bbio->raid_map)
goto bbio_out;
bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
@@ -2729,8 +2708,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
bio->bi_end_io = scrub_parity_bio_endio;
rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio,
- raid_map, length,
- sparity->scrub_dev,
+ length, sparity->scrub_dev,
sparity->dbitmap,
sparity->nsectors);
if (!rbio)
@@ -2747,8 +2725,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
rbio_out:
bio_put(bio);
bbio_out:
- kfree(bbio);
- kfree(raid_map);
+ btrfs_put_bbio(bbio);
bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
sparity->nsectors);
spin_lock(&sctx->stat_lock);
@@ -2765,12 +2742,12 @@ static inline int scrub_calc_parity_bitmap_len(int nsectors)
static void scrub_parity_get(struct scrub_parity *sparity)
{
- atomic_inc(&sparity->ref_count);
+ atomic_inc(&sparity->refs);
}
static void scrub_parity_put(struct scrub_parity *sparity)
{
- if (!atomic_dec_and_test(&sparity->ref_count))
+ if (!atomic_dec_and_test(&sparity->refs))
return;
scrub_parity_check_and_repair(sparity);
@@ -2820,7 +2797,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
sparity->scrub_dev = sdev;
sparity->logic_start = logic_start;
sparity->logic_end = logic_end;
- atomic_set(&sparity->ref_count, 1);
+ atomic_set(&sparity->refs, 1);
INIT_LIST_HEAD(&sparity->spages);
sparity->dbitmap = sparity->bitmap;
sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
@@ -3037,8 +3014,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
increment = map->stripe_len;
mirror_num = num % map->num_stripes + 1;
- } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6)) {
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
get_raid56_logic_offset(physical, num, map, &offset, NULL);
increment = map->stripe_len * nr_data_stripes(map);
mirror_num = 1;
@@ -3053,7 +3029,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
ppath = btrfs_alloc_path();
if (!ppath) {
- btrfs_free_path(ppath);
+ btrfs_free_path(path);
return -ENOMEM;
}
@@ -3065,6 +3041,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
path->search_commit_root = 1;
path->skip_locking = 1;
+ ppath->search_commit_root = 1;
+ ppath->skip_locking = 1;
/*
* trigger the readahead for extent tree csum tree and wait for
* completion. During readahead, the scrub is officially paused
@@ -3072,8 +3050,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
*/
logical = base + offset;
physical_end = physical + nstripes * map->stripe_len;
- if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
get_raid56_logic_offset(physical_end, num,
map, &logic_end, NULL);
logic_end += base;
@@ -3119,8 +3096,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
ret = 0;
while (physical < physical_end) {
/* for raid56, we skip parity stripe */
- if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
ret = get_raid56_logic_offset(physical, num,
map, &logical, &stripe_logical);
logical += base;
@@ -3278,8 +3254,7 @@ again:
scrub_free_csums(sctx);
if (extent_logical + extent_len <
key.objectid + bytes) {
- if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
/*
* loop until we find next data stripe
* or we have finished all stripes.
@@ -3773,7 +3748,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
scrub_workers_put(fs_info);
mutex_unlock(&fs_info->scrub_lock);
- scrub_free_ctx(sctx);
+ scrub_put_ctx(sctx);
return ret;
}
@@ -3879,14 +3854,14 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
&mapped_length, &bbio, 0);
if (ret || !bbio || mapped_length < extent_len ||
!bbio->stripes[0].dev->bdev) {
- kfree(bbio);
+ btrfs_put_bbio(bbio);
return;
}
*extent_physical = bbio->stripes[0].physical;
*extent_mirror_num = bbio->mirror_num;
*extent_dev = bbio->stripes[0].dev;
- kfree(bbio);
+ btrfs_put_bbio(bbio);
}
static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 804432dbc351..fe5857223515 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -2471,12 +2471,9 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
if (ret < 0)
goto out;
TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
- TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb,
- btrfs_inode_atime(ii));
- TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb,
- btrfs_inode_mtime(ii));
- TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb,
- btrfs_inode_ctime(ii));
+ TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime);
+ TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime);
+ TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime);
/* TODO Add otime support when the otime patches get into upstream */
ret = send_cmd(sctx);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 60f7cbe815e9..05fef198ff94 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1000,10 +1000,20 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
*/
if (fs_info->pending_changes == 0)
return 0;
+ /*
+ * A non-blocking test if the fs is frozen. We must not
+ * start a new transaction here otherwise a deadlock
+ * happens. The pending operations are delayed to the
+ * next commit after thawing.
+ */
+ if (__sb_start_write(sb, SB_FREEZE_WRITE, false))
+ __sb_end_write(sb, SB_FREEZE_WRITE);
+ else
+ return 0;
trans = btrfs_start_transaction(root, 0);
- } else {
- return PTR_ERR(trans);
}
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
}
return btrfs_commit_transaction(trans, root);
}
@@ -1948,11 +1958,6 @@ static int btrfs_freeze(struct super_block *sb)
return btrfs_commit_transaction(trans, root);
}
-static int btrfs_unfreeze(struct super_block *sb)
-{
- return 0;
-}
-
static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
{
struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
@@ -2001,7 +2006,6 @@ static const struct super_operations btrfs_super_ops = {
.statfs = btrfs_statfs,
.remount_fs = btrfs_remount,
.freeze_fs = btrfs_freeze,
- .unfreeze_fs = btrfs_unfreeze,
};
static const struct file_operations btrfs_ctl_fops = {
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 92db3f648df4..94edb0a2a026 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -733,10 +733,18 @@ int btrfs_init_sysfs(void)
ret = btrfs_init_debugfs();
if (ret)
- return ret;
+ goto out1;
init_feature_attrs();
ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
+ if (ret)
+ goto out2;
+
+ return 0;
+out2:
+ debugfs_remove_recursive(btrfs_debugfs_root_dentry);
+out1:
+ kset_unregister(btrfs_kset);
return ret;
}
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index cc286ce97d1e..f51963a8f929 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -53,7 +53,7 @@ static int test_btrfs_split_item(void)
return -ENOMEM;
}
- path->nodes[0] = eb = alloc_dummy_extent_buffer(0, 4096);
+ path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, 4096);
if (!eb) {
test_msg("Could not allocate dummy buffer\n");
ret = -ENOMEM;
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 7e99c2f98dd0..9e9f2368177d 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -258,8 +258,7 @@ static int test_find_delalloc(void)
}
ret = 0;
out_bits:
- clear_extent_bits(&tmp, 0, total_dirty - 1,
- (unsigned long)-1, GFP_NOFS);
+ clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_NOFS);
out:
if (locked_page)
page_cache_release(locked_page);
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 3ae0f5b8bb80..a116b55ce788 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -255,7 +255,7 @@ static noinline int test_btrfs_get_extent(void)
goto out;
}
- root->node = alloc_dummy_extent_buffer(0, 4096);
+ root->node = alloc_dummy_extent_buffer(NULL, 4096);
if (!root->node) {
test_msg("Couldn't allocate dummy buffer\n");
goto out;
@@ -843,7 +843,7 @@ static int test_hole_first(void)
goto out;
}
- root->node = alloc_dummy_extent_buffer(0, 4096);
+ root->node = alloc_dummy_extent_buffer(NULL, 4096);
if (!root->node) {
test_msg("Couldn't allocate dummy buffer\n");
goto out;
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index ec3dcb202357..73f299ebdabb 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -404,12 +404,22 @@ int btrfs_test_qgroups(void)
ret = -ENOMEM;
goto out;
}
+ /* We are using this root as our extent root */
+ root->fs_info->extent_root = root;
+
+ /*
+ * Some of the paths we test assume we have a filled out fs_info, so we
+ * just need to add the root in there so we don't panic.
+ */
+ root->fs_info->tree_root = root;
+ root->fs_info->quota_root = root;
+ root->fs_info->quota_enabled = 1;
/*
* Can't use bytenr 0, some things freak out
* *cough*backref walking code*cough*
*/
- root->node = alloc_test_extent_buffer(root->fs_info, 4096, 4096);
+ root->node = alloc_test_extent_buffer(root->fs_info, 4096);
if (!root->node) {
test_msg("Couldn't allocate dummy buffer\n");
ret = -ENOMEM;
@@ -448,17 +458,6 @@ int btrfs_test_qgroups(void)
goto out;
}
- /* We are using this root as our extent root */
- root->fs_info->extent_root = root;
-
- /*
- * Some of the paths we test assume we have a filled out fs_info, so we
- * just need to addt he root in there so we don't panic.
- */
- root->fs_info->tree_root = root;
- root->fs_info->quota_root = root;
- root->fs_info->quota_enabled = 1;
-
test_msg("Running qgroup tests\n");
ret = test_no_shared_qgroup(root);
if (ret)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a605d4e2f2bc..7e80f32550a6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -220,6 +220,7 @@ loop:
* commit the transaction.
*/
atomic_set(&cur_trans->use_count, 2);
+ cur_trans->have_free_bgs = 0;
cur_trans->start_time = get_seconds();
cur_trans->delayed_refs.href_root = RB_ROOT;
@@ -248,6 +249,8 @@ loop:
INIT_LIST_HEAD(&cur_trans->pending_chunks);
INIT_LIST_HEAD(&cur_trans->switch_commits);
INIT_LIST_HEAD(&cur_trans->pending_ordered);
+ INIT_LIST_HEAD(&cur_trans->dirty_bgs);
+ spin_lock_init(&cur_trans->dirty_bgs_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
fs_info->btree_inode->i_mapping);
@@ -1020,6 +1023,7 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
u64 old_root_bytenr;
u64 old_root_used;
struct btrfs_root *tree_root = root->fs_info->tree_root;
+ bool extent_root = (root->objectid == BTRFS_EXTENT_TREE_OBJECTID);
old_root_used = btrfs_root_used(&root->root_item);
btrfs_write_dirty_block_groups(trans, root);
@@ -1027,7 +1031,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
while (1) {
old_root_bytenr = btrfs_root_bytenr(&root->root_item);
if (old_root_bytenr == root->node->start &&
- old_root_used == btrfs_root_used(&root->root_item))
+ old_root_used == btrfs_root_used(&root->root_item) &&
+ (!extent_root ||
+ list_empty(&trans->transaction->dirty_bgs)))
break;
btrfs_set_root_node(&root->root_item, root->node);
@@ -1038,7 +1044,15 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
return ret;
old_root_used = btrfs_root_used(&root->root_item);
- ret = btrfs_write_dirty_block_groups(trans, root);
+ if (extent_root) {
+ ret = btrfs_write_dirty_block_groups(trans, root);
+ if (ret)
+ return ret;
+ }
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ if (ret)
+ return ret;
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
if (ret)
return ret;
}
@@ -1061,10 +1075,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
struct extent_buffer *eb;
int ret;
- ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
- if (ret)
- return ret;
-
eb = btrfs_lock_root_node(fs_info->tree_root);
ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
0, &eb);
@@ -1097,6 +1107,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
next = fs_info->dirty_cowonly_roots.next;
list_del_init(next);
root = list_entry(next, struct btrfs_root, dirty_list);
+ clear_bit(BTRFS_ROOT_DIRTY, &root->state);
if (root != fs_info->extent_root)
list_add_tail(&root->dirty_list,
@@ -1983,6 +1994,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
switch_commit_roots(cur_trans, root->fs_info);
assert_qgroups_uptodate(trans);
+ ASSERT(list_empty(&cur_trans->dirty_bgs));
update_super_roots(root);
btrfs_set_super_log_root(root->fs_info->super_copy, 0);
@@ -2026,6 +2038,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_finish_extent_commit(trans, root);
+ if (cur_trans->have_free_bgs)
+ btrfs_clear_space_info_full(root->fs_info);
+
root->fs_info->last_trans_committed = cur_trans->transid;
/*
* We needn't acquire the lock here because there is no other task
@@ -2118,7 +2133,7 @@ void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info)
unsigned long prev;
unsigned long bit;
- prev = cmpxchg(&fs_info->pending_changes, 0, 0);
+ prev = xchg(&fs_info->pending_changes, 0);
if (!prev)
return;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 00ed29c4b3f9..937050a2b68e 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -47,6 +47,11 @@ struct btrfs_transaction {
atomic_t num_writers;
atomic_t use_count;
+ /*
+ * true if there is free bgs operations in this transaction
+ */
+ int have_free_bgs;
+
/* Be protected by fs_info->trans_lock when we want to change it. */
enum btrfs_trans_state state;
struct list_head list;
@@ -58,6 +63,8 @@ struct btrfs_transaction {
struct list_head pending_chunks;
struct list_head pending_ordered;
struct list_head switch_commits;
+ struct list_head dirty_bgs;
+ spinlock_t dirty_bgs_lock;
struct btrfs_delayed_ref_root delayed_refs;
int aborted;
};
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9a02da16f2be..9a37f8b39bae 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -453,11 +453,13 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
insert:
btrfs_release_path(path);
/* try to insert the key into the destination tree */
+ path->skip_release_on_error = 1;
ret = btrfs_insert_empty_item(trans, root, path,
key, item_size);
+ path->skip_release_on_error = 0;
/* make sure any existing item is the correct size */
- if (ret == -EEXIST) {
+ if (ret == -EEXIST || ret == -EOVERFLOW) {
u32 found_size;
found_size = btrfs_item_size_nr(path->nodes[0],
path->slots[0]);
@@ -488,8 +490,20 @@ insert:
src_item = (struct btrfs_inode_item *)src_ptr;
dst_item = (struct btrfs_inode_item *)dst_ptr;
- if (btrfs_inode_generation(eb, src_item) == 0)
+ if (btrfs_inode_generation(eb, src_item) == 0) {
+ struct extent_buffer *dst_eb = path->nodes[0];
+
+ if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
+ S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) {
+ struct btrfs_map_token token;
+ u64 ino_size = btrfs_inode_size(eb, src_item);
+
+ btrfs_init_map_token(&token);
+ btrfs_set_token_inode_size(dst_eb, dst_item,
+ ino_size, &token);
+ }
goto no_copy;
+ }
if (overwrite_root &&
S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
@@ -844,7 +858,7 @@ out:
static noinline int backref_in_log(struct btrfs_root *log,
struct btrfs_key *key,
u64 ref_objectid,
- char *name, int namelen)
+ const char *name, int namelen)
{
struct btrfs_path *path;
struct btrfs_inode_ref *ref;
@@ -1254,13 +1268,14 @@ out:
}
static int insert_orphan_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 offset)
+ struct btrfs_root *root, u64 ino)
{
int ret;
- ret = btrfs_find_item(root, NULL, BTRFS_ORPHAN_OBJECTID,
- offset, BTRFS_ORPHAN_ITEM_KEY, NULL);
- if (ret > 0)
- ret = btrfs_insert_orphan_item(trans, root, offset);
+
+ ret = btrfs_insert_orphan_item(trans, root, ino);
+ if (ret == -EEXIST)
+ ret = 0;
+
return ret;
}
@@ -1287,6 +1302,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
leaf = path->nodes[0];
item_size = btrfs_item_size_nr(leaf, path->slots[0]);
ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+ cur_offset = 0;
while (cur_offset < item_size) {
extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
@@ -1302,7 +1318,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
}
btrfs_release_path(path);
- if (ret < 0)
+ if (ret < 0 && ret != -ENOENT)
return ret;
return nlink;
}
@@ -1394,9 +1410,6 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
nlink = ret;
ret = count_inode_extrefs(root, inode, path);
- if (ret == -ENOENT)
- ret = 0;
-
if (ret < 0)
goto out;
@@ -1557,6 +1570,30 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
}
/*
+ * Return true if an inode reference exists in the log for the given name,
+ * inode and parent inode.
+ */
+static bool name_in_log_ref(struct btrfs_root *log_root,
+ const char *name, const int name_len,
+ const u64 dirid, const u64 ino)
+{
+ struct btrfs_key search_key;
+
+ search_key.objectid = ino;
+ search_key.type = BTRFS_INODE_REF_KEY;
+ search_key.offset = dirid;
+ if (backref_in_log(log_root, &search_key, dirid, name, name_len))
+ return true;
+
+ search_key.type = BTRFS_INODE_EXTREF_KEY;
+ search_key.offset = btrfs_extref_hash(dirid, name, name_len);
+ if (backref_in_log(log_root, &search_key, dirid, name, name_len))
+ return true;
+
+ return false;
+}
+
+/*
* take a single entry in a log directory item and replay it into
* the subvolume.
*
@@ -1666,10 +1703,17 @@ out:
return ret;
insert:
+ if (name_in_log_ref(root->log_root, name, name_len,
+ key->objectid, log_key.objectid)) {
+ /* The dentry will be added later. */
+ ret = 0;
+ update_size = false;
+ goto out;
+ }
btrfs_release_path(path);
ret = insert_one_name(trans, root, path, key->objectid, key->offset,
name, name_len, log_type, &log_key);
- if (ret && ret != -ENOENT)
+ if (ret && ret != -ENOENT && ret != -EEXIST)
goto out;
update_size = false;
ret = 0;
@@ -2164,7 +2208,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
parent = path->nodes[*level];
root_owner = btrfs_header_owner(parent);
- next = btrfs_find_create_tree_block(root, bytenr, blocksize);
+ next = btrfs_find_create_tree_block(root, bytenr);
if (!next)
return -ENOMEM;
@@ -2416,8 +2460,8 @@ static void wait_for_writer(struct btrfs_trans_handle *trans,
mutex_unlock(&root->log_mutex);
if (atomic_read(&root->log_writers))
schedule();
- mutex_lock(&root->log_mutex);
finish_wait(&root->log_writer_wait, &wait);
+ mutex_lock(&root->log_mutex);
}
}
@@ -2591,6 +2635,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
}
if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
+ blk_finish_plug(&plug);
mutex_unlock(&log_root_tree->log_mutex);
ret = root_log_ctx.log_ret;
goto out;
@@ -3218,7 +3263,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
static void fill_inode_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf,
struct btrfs_inode_item *item,
- struct inode *inode, int log_inode_only)
+ struct inode *inode, int log_inode_only,
+ u64 logged_isize)
{
struct btrfs_map_token token;
@@ -3231,7 +3277,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
* to say 'update this inode with these values'
*/
btrfs_set_token_inode_generation(leaf, item, 0, &token);
- btrfs_set_token_inode_size(leaf, item, 0, &token);
+ btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
} else {
btrfs_set_token_inode_generation(leaf, item,
BTRFS_I(inode)->generation,
@@ -3244,19 +3290,19 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
- btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
+ btrfs_set_token_timespec_sec(leaf, &item->atime,
inode->i_atime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
+ btrfs_set_token_timespec_nsec(leaf, &item->atime,
inode->i_atime.tv_nsec, &token);
- btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
+ btrfs_set_token_timespec_sec(leaf, &item->mtime,
inode->i_mtime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
+ btrfs_set_token_timespec_nsec(leaf, &item->mtime,
inode->i_mtime.tv_nsec, &token);
- btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
+ btrfs_set_token_timespec_sec(leaf, &item->ctime,
inode->i_ctime.tv_sec, &token);
- btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
+ btrfs_set_token_timespec_nsec(leaf, &item->ctime,
inode->i_ctime.tv_nsec, &token);
btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
@@ -3283,7 +3329,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
return ret;
inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_item);
- fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
+ fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0);
btrfs_release_path(path);
return 0;
}
@@ -3292,7 +3338,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
struct inode *inode,
struct btrfs_path *dst_path,
struct btrfs_path *src_path, u64 *last_extent,
- int start_slot, int nr, int inode_only)
+ int start_slot, int nr, int inode_only,
+ u64 logged_isize)
{
unsigned long src_offset;
unsigned long dst_offset;
@@ -3349,7 +3396,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
dst_path->slots[0],
struct btrfs_inode_item);
fill_inode_item(trans, dst_path->nodes[0], inode_item,
- inode, inode_only == LOG_INODE_EXISTS);
+ inode, inode_only == LOG_INODE_EXISTS,
+ logged_isize);
} else {
copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
src_offset, ins_sizes[i]);
@@ -3901,6 +3949,33 @@ process:
return ret;
}
+static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
+ struct btrfs_path *path, u64 *size_ret)
+{
+ struct btrfs_key key;
+ int ret;
+
+ key.objectid = btrfs_ino(inode);
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+
+ ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ *size_ret = i_size_read(inode);
+ } else {
+ struct btrfs_inode_item *item;
+
+ item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_inode_item);
+ *size_ret = btrfs_inode_size(path->nodes[0], item);
+ }
+
+ btrfs_release_path(path);
+ return 0;
+}
+
/* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree
* or be logged already.
@@ -3938,6 +4013,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
bool fast_search = false;
u64 ino = btrfs_ino(inode);
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+ u64 logged_isize = 0;
path = btrfs_alloc_path();
if (!path)
@@ -3965,15 +4041,22 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
max_key.type = (u8)-1;
max_key.offset = (u64)-1;
- /* Only run delayed items if we are a dir or a new file */
+ /*
+ * Only run delayed items if we are a dir or a new file.
+ * Otherwise commit the delayed inode only, which is needed in
+ * order for the log replay code to mark inodes for link count
+ * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
+ */
if (S_ISDIR(inode->i_mode) ||
- BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
+ BTRFS_I(inode)->generation > root->fs_info->last_trans_committed)
ret = btrfs_commit_inode_delayed_items(trans, inode);
- if (ret) {
- btrfs_free_path(path);
- btrfs_free_path(dst_path);
- return ret;
- }
+ else
+ ret = btrfs_commit_inode_delayed_inode(inode);
+
+ if (ret) {
+ btrfs_free_path(path);
+ btrfs_free_path(dst_path);
+ return ret;
}
mutex_lock(&BTRFS_I(inode)->log_mutex);
@@ -3987,22 +4070,56 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
if (S_ISDIR(inode->i_mode)) {
int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
- if (inode_only == LOG_INODE_EXISTS)
- max_key_type = BTRFS_XATTR_ITEM_KEY;
+ if (inode_only == LOG_INODE_EXISTS) {
+ max_key_type = BTRFS_INODE_EXTREF_KEY;
+ max_key.type = max_key_type;
+ }
ret = drop_objectid_items(trans, log, path, ino, max_key_type);
} else {
- if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags)) {
- clear_bit(BTRFS_INODE_COPY_EVERYTHING,
- &BTRFS_I(inode)->runtime_flags);
- ret = btrfs_truncate_inode_items(trans, log,
- inode, 0, 0);
- } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
- &BTRFS_I(inode)->runtime_flags) ||
+ if (inode_only == LOG_INODE_EXISTS) {
+ /*
+ * Make sure the new inode item we write to the log has
+ * the same isize as the current one (if it exists).
+ * This is necessary to prevent data loss after log
+ * replay, and also to prevent doing a wrong expanding
+ * truncate - for e.g. create file, write 4K into offset
+ * 0, fsync, write 4K into offset 4096, add hard link,
+ * fsync some other file (to sync log), power fail - if
+ * we use the inode's current i_size, after log replay
+ * we get a 8Kb file, with the last 4Kb extent as a hole
+ * (zeroes), as if an expanding truncate happened,
+ * instead of getting a file of 4Kb only.
+ */
+ err = logged_inode_size(log, inode, path,
+ &logged_isize);
+ if (err)
+ goto out_unlock;
+ }
+ if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags)) {
+ if (inode_only == LOG_INODE_EXISTS) {
+ max_key.type = BTRFS_INODE_EXTREF_KEY;
+ ret = drop_objectid_items(trans, log, path, ino,
+ max_key.type);
+ } else {
+ clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+ &BTRFS_I(inode)->runtime_flags);
+ clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+ &BTRFS_I(inode)->runtime_flags);
+ ret = btrfs_truncate_inode_items(trans, log,
+ inode, 0, 0);
+ }
+ } else if (test_bit(BTRFS_INODE_COPY_EVERYTHING,
+ &BTRFS_I(inode)->runtime_flags) ||
inode_only == LOG_INODE_EXISTS) {
- if (inode_only == LOG_INODE_ALL)
+ if (inode_only == LOG_INODE_ALL) {
+ clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+ &BTRFS_I(inode)->runtime_flags);
fast_search = true;
- max_key.type = BTRFS_XATTR_ITEM_KEY;
+ max_key.type = BTRFS_XATTR_ITEM_KEY;
+ } else {
+ max_key.type = BTRFS_INODE_EXTREF_KEY;
+ }
ret = drop_objectid_items(trans, log, path, ino,
max_key.type);
} else {
@@ -4046,7 +4163,8 @@ again:
}
ret = copy_items(trans, inode, dst_path, path, &last_extent,
- ins_start_slot, ins_nr, inode_only);
+ ins_start_slot, ins_nr, inode_only,
+ logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
@@ -4070,7 +4188,7 @@ next_slot:
if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path,
&last_extent, ins_start_slot,
- ins_nr, inode_only);
+ ins_nr, inode_only, logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
@@ -4091,7 +4209,8 @@ next_slot:
}
if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path, &last_extent,
- ins_start_slot, ins_nr, inode_only);
+ ins_start_slot, ins_nr, inode_only,
+ logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
@@ -4272,6 +4391,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
struct dentry *old_parent = NULL;
int ret = 0;
u64 last_committed = root->fs_info->last_trans_committed;
+ const struct dentry * const first_parent = parent;
+ const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
+ last_committed);
sb = inode->i_sb;
@@ -4327,7 +4449,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
goto end_trans;
}
- inode_only = LOG_INODE_EXISTS;
while (1) {
if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
break;
@@ -4336,8 +4457,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (root != BTRFS_I(inode)->root)
break;
+ /*
+ * On unlink we must make sure our immediate parent directory
+ * inode is fully logged. This is to prevent leaving dangling
+ * directory index entries and a wrong directory inode's i_size.
+ * Not doing so can result in a directory being impossible to
+ * delete after log replay (rmdir will always fail with error
+ * -ENOTEMPTY).
+ */
+ if (did_unlink && parent == first_parent)
+ inode_only = LOG_INODE_ALL;
+ else
+ inode_only = LOG_INODE_EXISTS;
+
if (BTRFS_I(inode)->generation >
- root->fs_info->last_trans_committed) {
+ root->fs_info->last_trans_committed ||
+ inode_only == LOG_INODE_ALL) {
ret = btrfs_log_inode(trans, root, inode, inode_only,
0, LLONG_MAX, ctx);
if (ret)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 50c5a8762aed..cd4d1315aaa9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1310,6 +1310,8 @@ again:
if (ret) {
btrfs_error(root->fs_info, ret,
"Failed to remove dev extent item");
+ } else {
+ trans->transaction->have_free_bgs = 1;
}
out:
btrfs_free_path(path);
@@ -4196,7 +4198,7 @@ static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
- if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)))
+ if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
return;
btrfs_set_fs_incompat(info, RAID56);
@@ -4803,10 +4805,8 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
BUG_ON(em->start > logical || em->start + em->len < logical);
map = (struct map_lookup *)em->bdev;
- if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
len = map->stripe_len * nr_data_stripes(map);
- }
free_extent_map(em);
return len;
}
@@ -4826,8 +4826,7 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
BUG_ON(em->start > logical || em->start + em->len < logical);
map = (struct map_lookup *)em->bdev;
- if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6))
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
ret = 1;
free_extent_map(em);
return ret;
@@ -4876,32 +4875,24 @@ static inline int parity_smaller(u64 a, u64 b)
}
/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
-static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
+static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
{
struct btrfs_bio_stripe s;
- int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
int i;
u64 l;
int again = 1;
- int m;
while (again) {
again = 0;
- for (i = 0; i < real_stripes - 1; i++) {
- if (parity_smaller(raid_map[i], raid_map[i+1])) {
+ for (i = 0; i < num_stripes - 1; i++) {
+ if (parity_smaller(bbio->raid_map[i],
+ bbio->raid_map[i+1])) {
s = bbio->stripes[i];
- l = raid_map[i];
+ l = bbio->raid_map[i];
bbio->stripes[i] = bbio->stripes[i+1];
- raid_map[i] = raid_map[i+1];
+ bbio->raid_map[i] = bbio->raid_map[i+1];
bbio->stripes[i+1] = s;
- raid_map[i+1] = l;
-
- if (bbio->tgtdev_map) {
- m = bbio->tgtdev_map[i];
- bbio->tgtdev_map[i] =
- bbio->tgtdev_map[i + 1];
- bbio->tgtdev_map[i + 1] = m;
- }
+ bbio->raid_map[i+1] = l;
again = 1;
}
@@ -4909,10 +4900,41 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
}
}
+static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
+{
+ struct btrfs_bio *bbio = kzalloc(
+ sizeof(struct btrfs_bio) +
+ sizeof(struct btrfs_bio_stripe) * (total_stripes) +
+ sizeof(int) * (real_stripes) +
+ sizeof(u64) * (real_stripes),
+ GFP_NOFS);
+ if (!bbio)
+ return NULL;
+
+ atomic_set(&bbio->error, 0);
+ atomic_set(&bbio->refs, 1);
+
+ return bbio;
+}
+
+void btrfs_get_bbio(struct btrfs_bio *bbio)
+{
+ WARN_ON(!atomic_read(&bbio->refs));
+ atomic_inc(&bbio->refs);
+}
+
+void btrfs_put_bbio(struct btrfs_bio *bbio)
+{
+ if (!bbio)
+ return;
+ if (atomic_dec_and_test(&bbio->refs))
+ kfree(bbio);
+}
+
static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret,
- int mirror_num, u64 **raid_map_ret)
+ int mirror_num, int need_raid_map)
{
struct extent_map *em;
struct map_lookup *map;
@@ -4925,7 +4947,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 stripe_nr_orig;
u64 stripe_nr_end;
u64 stripe_len;
- u64 *raid_map = NULL;
int stripe_index;
int i;
int ret = 0;
@@ -4976,7 +4997,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
stripe_offset = offset - stripe_offset;
/* if we're here for raid56, we need to know the stripe aligned start */
- if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
raid56_full_stripe_start = offset;
@@ -4989,8 +5010,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
if (rw & REQ_DISCARD) {
/* we don't discard raid56 yet */
- if (map->type &
- (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
ret = -EOPNOTSUPP;
goto out;
}
@@ -5000,7 +5020,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
/* For writes to RAID[56], allow a full stripeset across all disks.
For other RAID types and for RAID[56] reads, just allow a single
stripe (on a single disk). */
- if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
+ if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
(rw & REQ_WRITE)) {
max_len = stripe_len * nr_data_stripes(map) -
(offset - raid56_full_stripe_start);
@@ -5047,7 +5067,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 physical_of_found = 0;
ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
- logical, &tmp_length, &tmp_bbio, 0, NULL);
+ logical, &tmp_length, &tmp_bbio, 0, 0);
if (ret) {
WARN_ON(tmp_bbio != NULL);
goto out;
@@ -5061,7 +5081,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
* is not left of the left cursor
*/
ret = -EIO;
- kfree(tmp_bbio);
+ btrfs_put_bbio(tmp_bbio);
goto out;
}
@@ -5096,11 +5116,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
} else {
WARN_ON(1);
ret = -EIO;
- kfree(tmp_bbio);
+ btrfs_put_bbio(tmp_bbio);
goto out;
}
- kfree(tmp_bbio);
+ btrfs_put_bbio(tmp_bbio);
} else if (mirror_num > map->num_stripes) {
mirror_num = 0;
}
@@ -5166,15 +5186,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
mirror_num = stripe_index - old_stripe_index + 1;
}
- } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6)) {
- u64 tmp;
-
- if (raid_map_ret &&
+ } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
+ if (need_raid_map &&
((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
mirror_num > 1)) {
- int i, rot;
-
/* push stripe_nr back to the start of the full stripe */
stripe_nr = raid56_full_stripe_start;
do_div(stripe_nr, stripe_len * nr_data_stripes(map));
@@ -5183,32 +5198,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
num_stripes = map->num_stripes;
max_errors = nr_parity_stripes(map);
- raid_map = kmalloc_array(num_stripes, sizeof(u64),
- GFP_NOFS);
- if (!raid_map) {
- ret = -ENOMEM;
- goto out;
- }
-
- /* Work out the disk rotation on this stripe-set */
- tmp = stripe_nr;
- rot = do_div(tmp, num_stripes);
-
- /* Fill in the logical address of each stripe */
- tmp = stripe_nr * nr_data_stripes(map);
- for (i = 0; i < nr_data_stripes(map); i++)
- raid_map[(i+rot) % num_stripes] =
- em->start + (tmp + i) * map->stripe_len;
-
- raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
- if (map->type & BTRFS_BLOCK_GROUP_RAID6)
- raid_map[(i+rot+1) % num_stripes] =
- RAID6_Q_STRIPE;
-
*length = map->stripe_len;
stripe_index = 0;
stripe_offset = 0;
} else {
+ u64 tmp;
+
/*
* Mirror #0 or #1 means the original data block.
* Mirror #2 is RAID5 parity block.
@@ -5246,17 +5241,42 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
tgtdev_indexes = num_stripes;
}
- bbio = kzalloc(btrfs_bio_size(num_alloc_stripes, tgtdev_indexes),
- GFP_NOFS);
+ bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
if (!bbio) {
- kfree(raid_map);
ret = -ENOMEM;
goto out;
}
- atomic_set(&bbio->error, 0);
if (dev_replace_is_ongoing)
bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
+ /* build raid_map */
+ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
+ need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
+ mirror_num > 1)) {
+ u64 tmp;
+ int i, rot;
+
+ bbio->raid_map = (u64 *)((void *)bbio->stripes +
+ sizeof(struct btrfs_bio_stripe) *
+ num_alloc_stripes +
+ sizeof(int) * tgtdev_indexes);
+
+ /* Work out the disk rotation on this stripe-set */
+ tmp = stripe_nr;
+ rot = do_div(tmp, num_stripes);
+
+ /* Fill in the logical address of each stripe */
+ tmp = stripe_nr * nr_data_stripes(map);
+ for (i = 0; i < nr_data_stripes(map); i++)
+ bbio->raid_map[(i+rot) % num_stripes] =
+ em->start + (tmp + i) * map->stripe_len;
+
+ bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+ if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+ bbio->raid_map[(i+rot+1) % num_stripes] =
+ RAID6_Q_STRIPE;
+ }
+
if (rw & REQ_DISCARD) {
int factor = 0;
int sub_stripes = 0;
@@ -5340,6 +5360,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
max_errors = btrfs_chunk_max_errors(map);
+ if (bbio->raid_map)
+ sort_parity_stripes(bbio, num_stripes);
+
tgtdev_indexes = 0;
if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
dev_replace->tgtdev != NULL) {
@@ -5427,6 +5450,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
}
*bbio_ret = bbio;
+ bbio->map_type = map->type;
bbio->num_stripes = num_stripes;
bbio->max_errors = max_errors;
bbio->mirror_num = mirror_num;
@@ -5443,10 +5467,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
bbio->mirror_num = map->num_stripes + 1;
}
- if (raid_map) {
- sort_parity_stripes(bbio, raid_map);
- *raid_map_ret = raid_map;
- }
out:
if (dev_replace_is_ongoing)
btrfs_dev_replace_unlock(dev_replace);
@@ -5459,17 +5479,17 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
struct btrfs_bio **bbio_ret, int mirror_num)
{
return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
- mirror_num, NULL);
+ mirror_num, 0);
}
/* For Scrub/replace */
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num,
- u64 **raid_map_ret)
+ int need_raid_map)
{
return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
- mirror_num, raid_map_ret);
+ mirror_num, need_raid_map);
}
int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -5511,8 +5531,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
do_div(length, map->num_stripes / map->sub_stripes);
else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
do_div(length, map->num_stripes);
- else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6)) {
+ else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
do_div(length, nr_data_stripes(map));
rmap_len = map->stripe_len * nr_data_stripes(map);
}
@@ -5565,7 +5584,7 @@ static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int e
bio_endio_nodec(bio, err);
else
bio_endio(bio, err);
- kfree(bbio);
+ btrfs_put_bbio(bbio);
}
static void btrfs_end_bio(struct bio *bio, int err)
@@ -5808,7 +5827,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
u64 logical = (u64)bio->bi_iter.bi_sector << 9;
u64 length = 0;
u64 map_length;
- u64 *raid_map = NULL;
int ret;
int dev_nr = 0;
int total_devs = 1;
@@ -5819,7 +5837,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
btrfs_bio_counter_inc_blocked(root->fs_info);
ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
- mirror_num, &raid_map);
+ mirror_num, 1);
if (ret) {
btrfs_bio_counter_dec(root->fs_info);
return ret;
@@ -5832,15 +5850,13 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
bbio->fs_info = root->fs_info;
atomic_set(&bbio->stripes_pending, bbio->num_stripes);
- if (raid_map) {
+ if (bbio->raid_map) {
/* In this case, map_length has been set to the length of
a single stripe; not the whole write */
if (rw & WRITE) {
- ret = raid56_parity_write(root, bio, bbio,
- raid_map, map_length);
+ ret = raid56_parity_write(root, bio, bbio, map_length);
} else {
- ret = raid56_parity_recover(root, bio, bbio,
- raid_map, map_length,
+ ret = raid56_parity_recover(root, bio, bbio, map_length,
mirror_num, 1);
}
@@ -6238,17 +6254,22 @@ int btrfs_read_sys_array(struct btrfs_root *root)
struct extent_buffer *sb;
struct btrfs_disk_key *disk_key;
struct btrfs_chunk *chunk;
- u8 *ptr;
- unsigned long sb_ptr;
+ u8 *array_ptr;
+ unsigned long sb_array_offset;
int ret = 0;
u32 num_stripes;
u32 array_size;
u32 len = 0;
- u32 cur;
+ u32 cur_offset;
struct btrfs_key key;
- sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET,
- BTRFS_SUPER_INFO_SIZE);
+ ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize);
+ /*
+ * This will create extent buffer of nodesize, superblock size is
+ * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
+ * overallocate but we can keep it as-is, only the first page is used.
+ */
+ sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET);
if (!sb)
return -ENOMEM;
btrfs_set_buffer_uptodate(sb);
@@ -6271,35 +6292,56 @@ int btrfs_read_sys_array(struct btrfs_root *root)
write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
array_size = btrfs_super_sys_array_size(super_copy);
- ptr = super_copy->sys_chunk_array;
- sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
- cur = 0;
+ array_ptr = super_copy->sys_chunk_array;
+ sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
+ cur_offset = 0;
+
+ while (cur_offset < array_size) {
+ disk_key = (struct btrfs_disk_key *)array_ptr;
+ len = sizeof(*disk_key);
+ if (cur_offset + len > array_size)
+ goto out_short_read;
- while (cur < array_size) {
- disk_key = (struct btrfs_disk_key *)ptr;
btrfs_disk_key_to_cpu(&key, disk_key);
- len = sizeof(*disk_key); ptr += len;
- sb_ptr += len;
- cur += len;
+ array_ptr += len;
+ sb_array_offset += len;
+ cur_offset += len;
if (key.type == BTRFS_CHUNK_ITEM_KEY) {
- chunk = (struct btrfs_chunk *)sb_ptr;
+ chunk = (struct btrfs_chunk *)sb_array_offset;
+ /*
+ * At least one btrfs_chunk with one stripe must be
+ * present, exact stripe count check comes afterwards
+ */
+ len = btrfs_chunk_item_size(1);
+ if (cur_offset + len > array_size)
+ goto out_short_read;
+
+ num_stripes = btrfs_chunk_num_stripes(sb, chunk);
+ len = btrfs_chunk_item_size(num_stripes);
+ if (cur_offset + len > array_size)
+ goto out_short_read;
+
ret = read_one_chunk(root, &key, sb, chunk);
if (ret)
break;
- num_stripes = btrfs_chunk_num_stripes(sb, chunk);
- len = btrfs_chunk_item_size(num_stripes);
} else {
ret = -EIO;
break;
}
- ptr += len;
- sb_ptr += len;
- cur += len;
+ array_ptr += len;
+ sb_array_offset += len;
+ cur_offset += len;
}
free_extent_buffer(sb);
return ret;
+
+out_short_read:
+ printk(KERN_ERR "BTRFS: sys_array too short to read %u bytes at offset %u\n",
+ len, cur_offset);
+ free_extent_buffer(sb);
+ return -EIO;
}
int btrfs_read_chunk_tree(struct btrfs_root *root)
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index d6fe73c0f4a2..83069dec6898 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -295,8 +295,10 @@ typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
#define BTRFS_BIO_ORIG_BIO_SUBMITTED (1 << 0)
struct btrfs_bio {
+ atomic_t refs;
atomic_t stripes_pending;
struct btrfs_fs_info *fs_info;
+ u64 map_type; /* get from map_lookup->type */
bio_end_io_t *end_io;
struct bio *orig_bio;
unsigned long flags;
@@ -307,6 +309,12 @@ struct btrfs_bio {
int mirror_num;
int num_tgtdevs;
int *tgtdev_map;
+ /*
+ * logical block numbers for the start of each stripe
+ * The last one or two are p/q. These are sorted,
+ * so raid_map[0] is the start of our full stripe
+ */
+ u64 *raid_map;
struct btrfs_bio_stripe stripes[];
};
@@ -388,19 +396,15 @@ struct btrfs_balance_control {
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
u64 end, u64 *length);
-
-#define btrfs_bio_size(total_stripes, real_stripes) \
- (sizeof(struct btrfs_bio) + \
- (sizeof(struct btrfs_bio_stripe) * (total_stripes)) + \
- (sizeof(int) * (real_stripes)))
-
+void btrfs_get_bbio(struct btrfs_bio *bbio);
+void btrfs_put_bbio(struct btrfs_bio *bbio);
int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num);
int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
u64 logical, u64 *length,
struct btrfs_bio **bbio_ret, int mirror_num,
- u64 **raid_map_ret);
+ int need_raid_map);
int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
u64 chunk_start, u64 physical, u64 devid,
u64 **logical, int *naddrs, int *stripe_len);
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 5bd853ba44ff..64fa248343f6 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -40,20 +40,6 @@ static inline void ceph_set_cached_acl(struct inode *inode,
spin_unlock(&ci->i_ceph_lock);
}
-static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode,
- int type)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct posix_acl *acl = ACL_NOT_CACHED;
-
- spin_lock(&ci->i_ceph_lock);
- if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0))
- acl = get_cached_acl(inode, type);
- spin_unlock(&ci->i_ceph_lock);
-
- return acl;
-}
-
struct posix_acl *ceph_get_acl(struct inode *inode, int type)
{
int size;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index f5013d92a7e6..fd5599d32362 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -196,17 +196,22 @@ static int readpage_nounlock(struct file *filp, struct page *page)
u64 len = PAGE_CACHE_SIZE;
if (off >= i_size_read(inode)) {
- zero_user_segment(page, err, PAGE_CACHE_SIZE);
+ zero_user_segment(page, 0, PAGE_CACHE_SIZE);
SetPageUptodate(page);
return 0;
}
- /*
- * Uptodate inline data should have been added into page cache
- * while getting Fcr caps.
- */
- if (ci->i_inline_version != CEPH_INLINE_NONE)
- return -EINVAL;
+ if (ci->i_inline_version != CEPH_INLINE_NONE) {
+ /*
+ * Uptodate inline data should have been added
+ * into page cache while getting Fcr caps.
+ */
+ if (off == 0)
+ return -EINVAL;
+ zero_user_segment(page, 0, PAGE_CACHE_SIZE);
+ SetPageUptodate(page);
+ return 0;
+ }
err = ceph_readpage_from_fscache(inode, page);
if (err == 0)
@@ -1416,7 +1421,7 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
}
}
- dout("fill_inline_data %p %llx.%llx len %lu locked_page %p\n",
+ dout("fill_inline_data %p %llx.%llx len %zu locked_page %p\n",
inode, ceph_vinop(inode), len, locked_page);
if (len > 0) {
@@ -1569,7 +1574,6 @@ out:
static struct vm_operations_struct ceph_vmops = {
.fault = ceph_filemap_fault,
.page_mkwrite = ceph_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
};
int ceph_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index b93c631c6c87..8172775428a0 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -577,7 +577,6 @@ void ceph_add_cap(struct inode *inode,
struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
realmino);
if (realm) {
- ceph_get_snap_realm(mdsc, realm);
spin_lock(&realm->inodes_with_caps_lock);
ci->i_snap_realm = realm;
list_add(&ci->i_snap_realm_item,
@@ -1451,8 +1450,8 @@ static int __mark_caps_flushing(struct inode *inode,
spin_lock(&mdsc->cap_dirty_lock);
list_del_init(&ci->i_dirty_item);
- ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
if (list_empty(&ci->i_flushing_item)) {
+ ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
mdsc->num_cap_flushing++;
dout(" inode %p now flushing seq %lld\n", inode,
@@ -2073,17 +2072,16 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got)
* requested from the MDS.
*/
static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
- loff_t endoff, int *got, struct page **pinned_page,
- int *check_max, int *err)
+ loff_t endoff, int *got, int *check_max, int *err)
{
struct inode *inode = &ci->vfs_inode;
int ret = 0;
- int have, implemented, _got = 0;
+ int have, implemented;
int file_wanted;
dout("get_cap_refs %p need %s want %s\n", inode,
ceph_cap_string(need), ceph_cap_string(want));
-again:
+
spin_lock(&ci->i_ceph_lock);
/* make sure file is actually open */
@@ -2138,50 +2136,34 @@ again:
inode, ceph_cap_string(have), ceph_cap_string(not),
ceph_cap_string(revoking));
if ((revoking & not) == 0) {
- _got = need | (have & want);
- __take_cap_refs(ci, _got);
+ *got = need | (have & want);
+ __take_cap_refs(ci, *got);
ret = 1;
}
} else {
+ int session_readonly = false;
+ if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) {
+ struct ceph_mds_session *s = ci->i_auth_cap->session;
+ spin_lock(&s->s_cap_lock);
+ session_readonly = s->s_readonly;
+ spin_unlock(&s->s_cap_lock);
+ }
+ if (session_readonly) {
+ dout("get_cap_refs %p needed %s but mds%d readonly\n",
+ inode, ceph_cap_string(need), ci->i_auth_cap->mds);
+ *err = -EROFS;
+ ret = 1;
+ goto out_unlock;
+ }
+
dout("get_cap_refs %p have %s needed %s\n", inode,
ceph_cap_string(have), ceph_cap_string(need));
}
out_unlock:
spin_unlock(&ci->i_ceph_lock);
- if (ci->i_inline_version != CEPH_INLINE_NONE &&
- (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
- i_size_read(inode) > 0) {
- int ret1;
- struct page *page = find_get_page(inode->i_mapping, 0);
- if (page) {
- if (PageUptodate(page)) {
- *pinned_page = page;
- goto out;
- }
- page_cache_release(page);
- }
- /*
- * drop cap refs first because getattr while holding
- * caps refs can cause deadlock.
- */
- ceph_put_cap_refs(ci, _got);
- _got = 0;
-
- /* getattr request will bring inline data into page cache */
- ret1 = __ceph_do_getattr(inode, NULL,
- CEPH_STAT_CAP_INLINE_DATA, true);
- if (ret1 >= 0) {
- ret = 0;
- goto again;
- }
- *err = ret1;
- ret = 1;
- }
-out:
dout("get_cap_refs %p ret %d got %s\n", inode,
- ret, ceph_cap_string(_got));
- *got = _got;
+ ret, ceph_cap_string(*got));
return ret;
}
@@ -2221,22 +2203,52 @@ static void check_max_size(struct inode *inode, loff_t endoff)
int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
loff_t endoff, int *got, struct page **pinned_page)
{
- int check_max, ret, err;
+ int _got, check_max, ret, err = 0;
retry:
if (endoff > 0)
check_max_size(&ci->vfs_inode, endoff);
+ _got = 0;
check_max = 0;
- err = 0;
ret = wait_event_interruptible(ci->i_cap_wq,
- try_get_cap_refs(ci, need, want, endoff,
- got, pinned_page,
- &check_max, &err));
+ try_get_cap_refs(ci, need, want, endoff,
+ &_got, &check_max, &err));
if (err)
ret = err;
+ if (ret < 0)
+ return ret;
+
if (check_max)
goto retry;
- return ret;
+
+ if (ci->i_inline_version != CEPH_INLINE_NONE &&
+ (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
+ i_size_read(&ci->vfs_inode) > 0) {
+ struct page *page = find_get_page(ci->vfs_inode.i_mapping, 0);
+ if (page) {
+ if (PageUptodate(page)) {
+ *pinned_page = page;
+ goto out;
+ }
+ page_cache_release(page);
+ }
+ /*
+ * drop cap refs first because getattr while holding
+ * caps refs can cause deadlock.
+ */
+ ceph_put_cap_refs(ci, _got);
+ _got = 0;
+
+ /* getattr request will bring inline data into page cache */
+ ret = __ceph_do_getattr(&ci->vfs_inode, NULL,
+ CEPH_STAT_CAP_INLINE_DATA, true);
+ if (ret < 0)
+ return ret;
+ goto retry;
+ }
+out:
+ *got = _got;
+ return 0;
}
/*
@@ -2432,13 +2444,13 @@ static void invalidate_aliases(struct inode *inode)
*/
static void handle_cap_grant(struct ceph_mds_client *mdsc,
struct inode *inode, struct ceph_mds_caps *grant,
- void *snaptrace, int snaptrace_len,
u64 inline_version,
void *inline_data, int inline_len,
struct ceph_buffer *xattr_buf,
struct ceph_mds_session *session,
struct ceph_cap *cap, int issued)
__releases(ci->i_ceph_lock)
+ __releases(mdsc->snap_rwsem)
{
struct ceph_inode_info *ci = ceph_inode(inode);
int mds = session->s_mds;
@@ -2639,10 +2651,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
spin_unlock(&ci->i_ceph_lock);
if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
- down_write(&mdsc->snap_rwsem);
- ceph_update_snap_trace(mdsc, snaptrace,
- snaptrace + snaptrace_len, false);
- downgrade_write(&mdsc->snap_rwsem);
kick_flushing_inode_caps(mdsc, session, inode);
up_read(&mdsc->snap_rwsem);
if (newcaps & ~issued)
@@ -3052,6 +3060,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
struct ceph_cap *cap;
struct ceph_mds_caps *h;
struct ceph_mds_cap_peer *peer = NULL;
+ struct ceph_snap_realm *realm;
int mds = session->s_mds;
int op, issued;
u32 seq, mseq;
@@ -3153,11 +3162,23 @@ void ceph_handle_caps(struct ceph_mds_session *session,
goto done_unlocked;
case CEPH_CAP_OP_IMPORT:
+ realm = NULL;
+ if (snaptrace_len) {
+ down_write(&mdsc->snap_rwsem);
+ ceph_update_snap_trace(mdsc, snaptrace,
+ snaptrace + snaptrace_len,
+ false, &realm);
+ downgrade_write(&mdsc->snap_rwsem);
+ } else {
+ down_read(&mdsc->snap_rwsem);
+ }
handle_cap_import(mdsc, inode, h, peer, session,
&cap, &issued);
- handle_cap_grant(mdsc, inode, h, snaptrace, snaptrace_len,
+ handle_cap_grant(mdsc, inode, h,
inline_version, inline_data, inline_len,
msg->middle, session, cap, issued);
+ if (realm)
+ ceph_put_snap_realm(mdsc, realm);
goto done_unlocked;
}
@@ -3177,7 +3198,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
case CEPH_CAP_OP_GRANT:
__ceph_caps_issued(ci, &issued);
issued |= __ceph_caps_dirty(ci);
- handle_cap_grant(mdsc, inode, h, NULL, 0,
+ handle_cap_grant(mdsc, inode, h,
inline_version, inline_data, inline_len,
msg->middle, session, cap, issued);
goto done_unlocked;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index c241603764fd..0411dbb15815 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -26,8 +26,6 @@
* point by name.
*/
-const struct inode_operations ceph_dir_iops;
-const struct file_operations ceph_dir_fops;
const struct dentry_operations ceph_dentry_ops;
/*
@@ -672,13 +670,17 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
/*
* We created the item, then did a lookup, and found
* it was already linked to another inode we already
- * had in our cache (and thus got spliced). Link our
- * dentry to that inode, but don't hash it, just in
- * case the VFS wants to dereference it.
+ * had in our cache (and thus got spliced). To not
+ * confuse VFS (especially when inode is a directory),
+ * we don't link our dentry to that inode, return an
+ * error instead.
+ *
+ * This event should be rare and it happens only when
+ * we talk to old MDS. Recent MDS does not send traceless
+ * reply for request that creates new inode.
*/
- BUG_ON(!result->d_inode);
- d_instantiate(dentry, result->d_inode);
- return 0;
+ d_drop(result);
+ return -ESTALE;
}
return PTR_ERR(result);
}
@@ -1335,6 +1337,13 @@ const struct file_operations ceph_dir_fops = {
.fsync = ceph_dir_fsync,
};
+const struct file_operations ceph_snapdir_fops = {
+ .iterate = ceph_readdir,
+ .llseek = ceph_dir_llseek,
+ .open = ceph_open,
+ .release = ceph_release,
+};
+
const struct inode_operations ceph_dir_iops = {
.lookup = ceph_lookup,
.permission = ceph_permission,
@@ -1357,6 +1366,14 @@ const struct inode_operations ceph_dir_iops = {
.atomic_open = ceph_atomic_open,
};
+const struct inode_operations ceph_snapdir_iops = {
+ .lookup = ceph_lookup,
+ .permission = ceph_permission,
+ .getattr = ceph_getattr,
+ .mkdir = ceph_mkdir,
+ .rmdir = ceph_unlink,
+};
+
const struct dentry_operations ceph_dentry_ops = {
.d_revalidate = ceph_d_revalidate,
.d_release = ceph_d_release,
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ce74b394b49d..a3d774b35149 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -275,10 +275,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
err = ceph_mdsc_do_request(mdsc,
(flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
req);
+ err = ceph_handle_snapdir(req, dentry, err);
if (err)
goto out_req;
- err = ceph_handle_snapdir(req, dentry, err);
if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry);
@@ -392,13 +392,14 @@ more:
if (ret >= 0) {
int didpages;
if (was_short && (pos + ret < inode->i_size)) {
- u64 tmp = min(this_len - ret,
- inode->i_size - pos - ret);
+ int zlen = min(this_len - ret,
+ inode->i_size - pos - ret);
+ int zoff = (o_direct ? buf_align : io_align) +
+ read + ret;
dout(" zero gap %llu to %llu\n",
- pos + ret, pos + ret + tmp);
- ceph_zero_page_vector_range(page_align + read + ret,
- tmp, pages);
- ret += tmp;
+ pos + ret, pos + ret + zlen);
+ ceph_zero_page_vector_range(zoff, zlen, pages);
+ ret += zlen;
}
didpages = (page_align + ret) >> PAGE_CACHE_SHIFT;
@@ -878,28 +879,34 @@ again:
i_size = i_size_read(inode);
if (retry_op == READ_INLINE) {
- /* does not support inline data > PAGE_SIZE */
- if (i_size > PAGE_CACHE_SIZE) {
- ret = -EIO;
- } else if (iocb->ki_pos < i_size) {
+ BUG_ON(ret > 0 || read > 0);
+ if (iocb->ki_pos < i_size &&
+ iocb->ki_pos < PAGE_CACHE_SIZE) {
loff_t end = min_t(loff_t, i_size,
iocb->ki_pos + len);
+ end = min_t(loff_t, end, PAGE_CACHE_SIZE);
if (statret < end)
zero_user_segment(page, statret, end);
ret = copy_page_to_iter(page,
iocb->ki_pos & ~PAGE_MASK,
end - iocb->ki_pos, to);
iocb->ki_pos += ret;
- } else {
- ret = 0;
+ read += ret;
+ }
+ if (iocb->ki_pos < i_size && read < len) {
+ size_t zlen = min_t(size_t, len - read,
+ i_size - iocb->ki_pos);
+ ret = iov_iter_zero(zlen, to);
+ iocb->ki_pos += ret;
+ read += ret;
}
__free_pages(page, 0);
- return ret;
+ return read;
}
/* hit EOF or hole? */
if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
- ret < len) {
+ ret < len) {
dout("sync_read hit hole, ppos %lld < size %lld"
", reading more\n", iocb->ki_pos,
inode->i_size);
@@ -945,7 +952,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
mutex_lock(&inode->i_mutex);
/* We can write back this queue in page reclaim */
- current->backing_dev_info = file->f_mapping->backing_dev_info;
+ current->backing_dev_info = inode_to_bdi(inode);
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index f61a74115beb..119c43c80638 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -82,8 +82,8 @@ struct inode *ceph_get_snapdir(struct inode *parent)
inode->i_mode = parent->i_mode;
inode->i_uid = parent->i_uid;
inode->i_gid = parent->i_gid;
- inode->i_op = &ceph_dir_iops;
- inode->i_fop = &ceph_dir_fops;
+ inode->i_op = &ceph_snapdir_iops;
+ inode->i_fop = &ceph_snapdir_fops;
ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
ci->i_rbytes = 0;
return inode;
@@ -783,8 +783,6 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
}
inode->i_mapping->a_ops = &ceph_aops;
- inode->i_mapping->backing_dev_info =
- &ceph_sb_to_client(inode->i_sb)->backing_dev_info;
switch (inode->i_mode & S_IFMT) {
case S_IFIFO:
@@ -840,30 +838,31 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
ceph_vinop(inode), inode->i_mode);
}
- /* set dir completion flag? */
- if (S_ISDIR(inode->i_mode) &&
- ci->i_files == 0 && ci->i_subdirs == 0 &&
- ceph_snap(inode) == CEPH_NOSNAP &&
- (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
- (issued & CEPH_CAP_FILE_EXCL) == 0 &&
- !__ceph_dir_is_complete(ci)) {
- dout(" marking %p complete (empty)\n", inode);
- __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count),
- ci->i_ordered_count);
- }
-
/* were we issued a capability? */
if (info->cap.caps) {
if (ceph_snap(inode) == CEPH_NOSNAP) {
+ unsigned caps = le32_to_cpu(info->cap.caps);
ceph_add_cap(inode, session,
le64_to_cpu(info->cap.cap_id),
- cap_fmode,
- le32_to_cpu(info->cap.caps),
+ cap_fmode, caps,
le32_to_cpu(info->cap.wanted),
le32_to_cpu(info->cap.seq),
le32_to_cpu(info->cap.mseq),
le64_to_cpu(info->cap.realm),
info->cap.flags, &new_cap);
+
+ /* set dir completion flag? */
+ if (S_ISDIR(inode->i_mode) &&
+ ci->i_files == 0 && ci->i_subdirs == 0 &&
+ (caps & CEPH_CAP_FILE_SHARED) &&
+ (issued & CEPH_CAP_FILE_EXCL) == 0 &&
+ !__ceph_dir_is_complete(ci)) {
+ dout(" marking %p complete (empty)\n", inode);
+ __ceph_dir_set_complete(ci,
+ atomic_read(&ci->i_release_count),
+ ci->i_ordered_count);
+ }
+
wake = true;
} else {
dout(" %p got snap_caps %s\n", inode,
@@ -1448,12 +1447,14 @@ retry_lookup:
}
if (!dn->d_inode) {
- dn = splice_dentry(dn, in, NULL);
- if (IS_ERR(dn)) {
- err = PTR_ERR(dn);
+ struct dentry *realdn = splice_dentry(dn, in, NULL);
+ if (IS_ERR(realdn)) {
+ err = PTR_ERR(realdn);
+ d_drop(dn);
dn = NULL;
goto next_item;
}
+ dn = realdn;
}
di = dn->d_fsdata;
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index c35c5c614e38..4347039ecc18 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -239,23 +239,26 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
return err;
}
-/**
- * Must be called with lock_flocks() already held. Fills in the passed
- * counter variables, so you can prepare pagelist metadata before calling
- * ceph_encode_locks.
+/*
+ * Fills in the passed counter variables, so you can prepare pagelist metadata
+ * before calling ceph_encode_locks.
*/
void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
{
struct file_lock *lock;
+ struct file_lock_context *ctx;
*fcntl_count = 0;
*flock_count = 0;
- for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
- if (lock->fl_flags & FL_POSIX)
+ ctx = inode->i_flctx;
+ if (ctx) {
+ spin_lock(&ctx->flc_lock);
+ list_for_each_entry(lock, &ctx->flc_posix, fl_list)
++(*fcntl_count);
- else if (lock->fl_flags & FL_FLOCK)
+ list_for_each_entry(lock, &ctx->flc_flock, fl_list)
++(*flock_count);
+ spin_unlock(&ctx->flc_lock);
}
dout("counted %d flock locks and %d fcntl locks",
*flock_count, *fcntl_count);
@@ -271,6 +274,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
int num_fcntl_locks, int num_flock_locks)
{
struct file_lock *lock;
+ struct file_lock_context *ctx = inode->i_flctx;
int err = 0;
int seen_fcntl = 0;
int seen_flock = 0;
@@ -279,33 +283,34 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
dout("encoding %d flock and %d fcntl locks", num_flock_locks,
num_fcntl_locks);
- for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
- if (lock->fl_flags & FL_POSIX) {
- ++seen_fcntl;
- if (seen_fcntl > num_fcntl_locks) {
- err = -ENOSPC;
- goto fail;
- }
- err = lock_to_ceph_filelock(lock, &flocks[l]);
- if (err)
- goto fail;
- ++l;
+ if (!ctx)
+ return 0;
+
+ spin_lock(&ctx->flc_lock);
+ list_for_each_entry(lock, &ctx->flc_flock, fl_list) {
+ ++seen_fcntl;
+ if (seen_fcntl > num_fcntl_locks) {
+ err = -ENOSPC;
+ goto fail;
}
+ err = lock_to_ceph_filelock(lock, &flocks[l]);
+ if (err)
+ goto fail;
+ ++l;
}
- for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) {
- if (lock->fl_flags & FL_FLOCK) {
- ++seen_flock;
- if (seen_flock > num_flock_locks) {
- err = -ENOSPC;
- goto fail;
- }
- err = lock_to_ceph_filelock(lock, &flocks[l]);
- if (err)
- goto fail;
- ++l;
+ list_for_each_entry(lock, &ctx->flc_flock, fl_list) {
+ ++seen_flock;
+ if (seen_flock > num_flock_locks) {
+ err = -ENOSPC;
+ goto fail;
}
+ err = lock_to_ceph_filelock(lock, &flocks[l]);
+ if (err)
+ goto fail;
+ ++l;
}
fail:
+ spin_unlock(&ctx->flc_lock);
return err;
}
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d2171f4a6980..71c073f38e54 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -480,6 +480,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
mdsc->max_sessions = newmax;
}
mdsc->sessions[mds] = s;
+ atomic_inc(&mdsc->num_sessions);
atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds,
@@ -503,6 +504,7 @@ static void __unregister_session(struct ceph_mds_client *mdsc,
mdsc->sessions[s->s_mds] = NULL;
ceph_con_close(&s->s_con);
ceph_put_mds_session(s);
+ atomic_dec(&mdsc->num_sessions);
}
/*
@@ -842,8 +844,9 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
struct ceph_options *opt = mdsc->fsc->client->options;
void *p;
- const char* metadata[3][2] = {
+ const char* metadata[][2] = {
{"hostname", utsname()->nodename},
+ {"kernel_version", utsname()->release},
{"entity_id", opt->name ? opt->name : ""},
{NULL, NULL}
};
@@ -1464,19 +1467,33 @@ out_unlocked:
return err;
}
+static int check_cap_flush(struct inode *inode, u64 want_flush_seq)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int ret;
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_flushing_caps)
+ ret = ci->i_cap_flush_seq >= want_flush_seq;
+ else
+ ret = 1;
+ spin_unlock(&ci->i_ceph_lock);
+ return ret;
+}
+
/*
* flush all dirty inode data to disk.
*
* returns true if we've flushed through want_flush_seq
*/
-static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
+static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
{
- int mds, ret = 1;
+ int mds;
dout("check_cap_flush want %lld\n", want_flush_seq);
mutex_lock(&mdsc->mutex);
- for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
+ for (mds = 0; mds < mdsc->max_sessions; mds++) {
struct ceph_mds_session *session = mdsc->sessions[mds];
+ struct inode *inode = NULL;
if (!session)
continue;
@@ -1489,29 +1506,29 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
list_entry(session->s_cap_flushing.next,
struct ceph_inode_info,
i_flushing_item);
- struct inode *inode = &ci->vfs_inode;
- spin_lock(&ci->i_ceph_lock);
- if (ci->i_cap_flush_seq <= want_flush_seq) {
+ if (!check_cap_flush(&ci->vfs_inode, want_flush_seq)) {
dout("check_cap_flush still flushing %p "
- "seq %lld <= %lld to mds%d\n", inode,
- ci->i_cap_flush_seq, want_flush_seq,
- session->s_mds);
- ret = 0;
+ "seq %lld <= %lld to mds%d\n",
+ &ci->vfs_inode, ci->i_cap_flush_seq,
+ want_flush_seq, session->s_mds);
+ inode = igrab(&ci->vfs_inode);
}
- spin_unlock(&ci->i_ceph_lock);
}
mutex_unlock(&session->s_mutex);
ceph_put_mds_session(session);
- if (!ret)
- return ret;
+ if (inode) {
+ wait_event(mdsc->cap_flushing_wq,
+ check_cap_flush(inode, want_flush_seq));
+ iput(inode);
+ }
+
mutex_lock(&mdsc->mutex);
}
mutex_unlock(&mdsc->mutex);
dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
- return ret;
}
/*
@@ -1923,7 +1940,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
head->num_releases = cpu_to_le16(releases);
/* time stamp */
- ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp));
+ {
+ struct ceph_timespec ts;
+ ceph_encode_timespec(&ts, &req->r_stamp);
+ ceph_encode_copy(&p, &ts, sizeof(ts));
+ }
BUG_ON(p > end);
msg->front.iov_len = p - msg->front.iov_base;
@@ -2012,7 +2033,11 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
/* time stamp */
p = msg->front.iov_base + req->r_request_release_offset;
- ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp));
+ {
+ struct ceph_timespec ts;
+ ceph_encode_timespec(&ts, &req->r_stamp);
+ ceph_encode_copy(&p, &ts, sizeof(ts));
+ }
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
@@ -2159,6 +2184,8 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
p = rb_next(p);
if (req->r_got_unsafe)
continue;
+ if (req->r_attempts > 0)
+ continue; /* only new requests */
if (req->r_session &&
req->r_session->s_mds == mds) {
dout(" kicking tid %llu\n", req->r_tid);
@@ -2286,6 +2313,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
struct ceph_mds_request *req;
struct ceph_mds_reply_head *head = msg->front.iov_base;
struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */
+ struct ceph_snap_realm *realm;
u64 tid;
int err, result;
int mds = session->s_mds;
@@ -2401,11 +2429,13 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
}
/* snap trace */
+ realm = NULL;
if (rinfo->snapblob_len) {
down_write(&mdsc->snap_rwsem);
ceph_update_snap_trace(mdsc, rinfo->snapblob,
- rinfo->snapblob + rinfo->snapblob_len,
- le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP);
+ rinfo->snapblob + rinfo->snapblob_len,
+ le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP,
+ &realm);
downgrade_write(&mdsc->snap_rwsem);
} else {
down_read(&mdsc->snap_rwsem);
@@ -2423,6 +2453,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
mutex_unlock(&req->r_fill_mutex);
up_read(&mdsc->snap_rwsem);
+ if (realm)
+ ceph_put_snap_realm(mdsc, realm);
out_err:
mutex_lock(&mdsc->mutex);
if (!req->r_aborted) {
@@ -2487,6 +2519,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
BUG_ON(req->r_err);
BUG_ON(req->r_got_result);
+ req->r_attempts = 0;
req->r_num_fwd = fwd_seq;
req->r_resend_mds = next_mds;
put_request_session(req);
@@ -2580,6 +2613,14 @@ static void handle_session(struct ceph_mds_session *session,
send_flushmsg_ack(mdsc, session, seq);
break;
+ case CEPH_SESSION_FORCE_RO:
+ dout("force_session_readonly %p\n", session);
+ spin_lock(&session->s_cap_lock);
+ session->s_readonly = true;
+ spin_unlock(&session->s_cap_lock);
+ wake_up_session_caps(session, 0);
+ break;
+
default:
pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
WARN_ON(1);
@@ -2610,6 +2651,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
struct ceph_mds_request *req, *nreq;
+ struct rb_node *p;
int err;
dout("replay_unsafe_requests mds%d\n", session->s_mds);
@@ -2622,6 +2664,28 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
ceph_con_send(&session->s_con, req->r_request);
}
}
+
+ /*
+ * also re-send old requests when MDS enters reconnect stage. So that MDS
+ * can process completed request in clientreplay stage.
+ */
+ p = rb_first(&mdsc->request_tree);
+ while (p) {
+ req = rb_entry(p, struct ceph_mds_request, r_node);
+ p = rb_next(p);
+ if (req->r_got_unsafe)
+ continue;
+ if (req->r_attempts == 0)
+ continue; /* only old requests */
+ if (req->r_session &&
+ req->r_session->s_mds == session->s_mds) {
+ err = __prepare_send_request(mdsc, req, session->s_mds);
+ if (!err) {
+ ceph_msg_get(req->r_request);
+ ceph_con_send(&session->s_con, req->r_request);
+ }
+ }
+ }
mutex_unlock(&mdsc->mutex);
}
@@ -2700,20 +2764,16 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
struct ceph_filelock *flocks;
encode_again:
- spin_lock(&inode->i_lock);
ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
- spin_unlock(&inode->i_lock);
flocks = kmalloc((num_fcntl_locks+num_flock_locks) *
sizeof(struct ceph_filelock), GFP_NOFS);
if (!flocks) {
err = -ENOMEM;
goto out_free;
}
- spin_lock(&inode->i_lock);
err = ceph_encode_locks_to_buffer(inode, flocks,
num_fcntl_locks,
num_flock_locks);
- spin_unlock(&inode->i_lock);
if (err) {
kfree(flocks);
if (err == -ENOSPC)
@@ -2791,6 +2851,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
spin_unlock(&session->s_gen_ttl_lock);
spin_lock(&session->s_cap_lock);
+ /* don't know if session is readonly */
+ session->s_readonly = 0;
/*
* notify __ceph_remove_cap() that we are composing cap reconnect.
* If a cap get released before being added to the cap reconnect,
@@ -2937,9 +2999,6 @@ static void check_new_map(struct ceph_mds_client *mdsc,
mutex_unlock(&s->s_mutex);
s->s_state = CEPH_MDS_SESSION_RESTARTING;
}
-
- /* kick any requests waiting on the recovering mds */
- kick_requests(mdsc, i);
} else if (oldstate == newstate) {
continue; /* nothing new with this mds */
}
@@ -3299,6 +3358,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
init_waitqueue_head(&mdsc->session_close_wq);
INIT_LIST_HEAD(&mdsc->waiting_for_map);
mdsc->sessions = NULL;
+ atomic_set(&mdsc->num_sessions, 0);
mdsc->max_sessions = 0;
mdsc->stopping = 0;
init_rwsem(&mdsc->snap_rwsem);
@@ -3432,14 +3492,17 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
dout("sync\n");
mutex_lock(&mdsc->mutex);
want_tid = mdsc->last_tid;
- want_flush = mdsc->cap_flush_seq;
mutex_unlock(&mdsc->mutex);
- dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
ceph_flush_dirty_caps(mdsc);
+ spin_lock(&mdsc->cap_dirty_lock);
+ want_flush = mdsc->cap_flush_seq;
+ spin_unlock(&mdsc->cap_dirty_lock);
+
+ dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
wait_unsafe_requests(mdsc, want_tid);
- wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
+ wait_caps_flush(mdsc, want_flush);
}
/*
@@ -3447,17 +3510,9 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
*/
static bool done_closing_sessions(struct ceph_mds_client *mdsc)
{
- int i, n = 0;
-
if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
return true;
-
- mutex_lock(&mdsc->mutex);
- for (i = 0; i < mdsc->max_sessions; i++)
- if (mdsc->sessions[i])
- n++;
- mutex_unlock(&mdsc->mutex);
- return n == 0;
+ return atomic_read(&mdsc->num_sessions) == 0;
}
/*
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index e2817d00f7d9..1875b5d985c6 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -137,6 +137,7 @@ struct ceph_mds_session {
int s_nr_caps, s_trim_caps;
int s_num_cap_releases;
int s_cap_reconnect;
+ int s_readonly;
struct list_head s_cap_releases; /* waiting cap_release messages */
struct list_head s_cap_releases_done; /* ready to send */
struct ceph_cap *s_cap_iterator;
@@ -272,6 +273,7 @@ struct ceph_mds_client {
struct list_head waiting_for_map;
struct ceph_mds_session **sessions; /* NULL for mds if no session */
+ atomic_t num_sessions;
int max_sessions; /* len of s_mds_sessions */
int stopping; /* true if shutting down */
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index ce35fbd4ba5d..a97e39f09ba6 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -70,13 +70,11 @@ void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
* safe. we do need to protect against concurrent empty list
* additions, however.
*/
- if (atomic_read(&realm->nref) == 0) {
+ if (atomic_inc_return(&realm->nref) == 1) {
spin_lock(&mdsc->snap_empty_lock);
list_del_init(&realm->empty_item);
spin_unlock(&mdsc->snap_empty_lock);
}
-
- atomic_inc(&realm->nref);
}
static void __insert_snap_realm(struct rb_root *root,
@@ -116,7 +114,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
if (!realm)
return ERR_PTR(-ENOMEM);
- atomic_set(&realm->nref, 0); /* tree does not take a ref */
+ atomic_set(&realm->nref, 1); /* for caller */
realm->ino = ino;
INIT_LIST_HEAD(&realm->children);
INIT_LIST_HEAD(&realm->child_item);
@@ -134,8 +132,8 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
*
* caller must hold snap_rwsem for write.
*/
-struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
- u64 ino)
+static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc,
+ u64 ino)
{
struct rb_node *n = mdsc->snap_realms.rb_node;
struct ceph_snap_realm *r;
@@ -154,6 +152,16 @@ struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
return NULL;
}
+struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
+ u64 ino)
+{
+ struct ceph_snap_realm *r;
+ r = __lookup_snap_realm(mdsc, ino);
+ if (r)
+ ceph_get_snap_realm(mdsc, r);
+ return r;
+}
+
static void __put_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm);
@@ -273,7 +281,6 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
}
realm->parent_ino = parentino;
realm->parent = parent;
- ceph_get_snap_realm(mdsc, parent);
list_add(&realm->child_item, &parent->children);
return 1;
}
@@ -631,12 +638,14 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
* Caller must hold snap_rwsem for write.
*/
int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
- void *p, void *e, bool deletion)
+ void *p, void *e, bool deletion,
+ struct ceph_snap_realm **realm_ret)
{
struct ceph_mds_snap_realm *ri; /* encoded */
__le64 *snaps; /* encoded */
__le64 *prior_parent_snaps; /* encoded */
- struct ceph_snap_realm *realm;
+ struct ceph_snap_realm *realm = NULL;
+ struct ceph_snap_realm *first_realm = NULL;
int invalidate = 0;
int err = -ENOMEM;
LIST_HEAD(dirty_realms);
@@ -704,13 +713,18 @@ more:
dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
realm, invalidate, p, e);
- if (p < e)
- goto more;
-
/* invalidate when we reach the _end_ (root) of the trace */
- if (invalidate)
+ if (invalidate && p >= e)
rebuild_snap_realms(realm);
+ if (!first_realm)
+ first_realm = realm;
+ else
+ ceph_put_snap_realm(mdsc, realm);
+
+ if (p < e)
+ goto more;
+
/*
* queue cap snaps _after_ we've built the new snap contexts,
* so that i_head_snapc can be set appropriately.
@@ -721,12 +735,21 @@ more:
queue_realm_cap_snaps(realm);
}
+ if (realm_ret)
+ *realm_ret = first_realm;
+ else
+ ceph_put_snap_realm(mdsc, first_realm);
+
__cleanup_empty_realms(mdsc);
return 0;
bad:
err = -EINVAL;
fail:
+ if (realm && !IS_ERR(realm))
+ ceph_put_snap_realm(mdsc, realm);
+ if (first_realm)
+ ceph_put_snap_realm(mdsc, first_realm);
pr_err("update_snap_trace error %d\n", err);
return err;
}
@@ -844,7 +867,6 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
if (IS_ERR(realm))
goto out;
}
- ceph_get_snap_realm(mdsc, realm);
dout("splitting snap_realm %llx %p\n", realm->ino, realm);
for (i = 0; i < num_split_inos; i++) {
@@ -905,7 +927,7 @@ skip_inode:
/* we may have taken some of the old realm's children. */
for (i = 0; i < num_split_realms; i++) {
struct ceph_snap_realm *child =
- ceph_lookup_snap_realm(mdsc,
+ __lookup_snap_realm(mdsc,
le64_to_cpu(split_realms[i]));
if (!child)
continue;
@@ -918,7 +940,7 @@ skip_inode:
* snap, we can avoid queueing cap_snaps.
*/
ceph_update_snap_trace(mdsc, p, e,
- op == CEPH_SNAP_OP_DESTROY);
+ op == CEPH_SNAP_OP_DESTROY, NULL);
if (op == CEPH_SNAP_OP_SPLIT)
/* we took a reference when we created the realm, above */
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 50f06cddc94b..a63997b8bcff 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -40,17 +40,6 @@ static void ceph_put_super(struct super_block *s)
dout("put_super\n");
ceph_mdsc_close_sessions(fsc->mdsc);
-
- /*
- * ensure we release the bdi before put_anon_super releases
- * the device name.
- */
- if (s->s_bdi == &fsc->backing_dev_info) {
- bdi_unregister(&fsc->backing_dev_info);
- s->s_bdi = NULL;
- }
-
- return;
}
static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -425,6 +414,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",noshare");
if (opt->flags & CEPH_OPT_NOCRC)
seq_puts(m, ",nocrc");
+ if (opt->flags & CEPH_OPT_NOMSGAUTH)
+ seq_puts(m, ",nocephx_require_signatures");
+ if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0)
+ seq_puts(m, ",notcp_nodelay");
if (opt->name)
seq_printf(m, ",name=%s", opt->name);
@@ -910,7 +903,7 @@ static int ceph_register_bdi(struct super_block *sb,
>> PAGE_SHIFT;
else
fsc->backing_dev_info.ra_pages =
- default_backing_dev_info.ra_pages;
+ VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
atomic_long_inc_return(&bdi_seq));
@@ -1002,11 +995,16 @@ out_final:
static void ceph_kill_sb(struct super_block *s)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(s);
+ dev_t dev = s->s_dev;
+
dout("kill_sb %p\n", s);
+
ceph_mdsc_pre_umount(fsc->mdsc);
- kill_anon_super(s); /* will call put_super after sb is r/o */
+ generic_shutdown_super(s);
ceph_mdsc_destroy(fsc);
+
destroy_fs_client(fsc);
+ free_anon_bdev(dev);
}
static struct file_system_type ceph_fs_type = {
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index e1aa32d0759d..04c8124ed30e 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -693,7 +693,8 @@ extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm);
extern int ceph_update_snap_trace(struct ceph_mds_client *m,
- void *p, void *e, bool deletion);
+ void *p, void *e, bool deletion,
+ struct ceph_snap_realm **realm_ret);
extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg);
@@ -892,7 +893,9 @@ extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
int ceph_uninline_data(struct file *filp, struct page *locked_page);
/* dir.c */
extern const struct file_operations ceph_dir_fops;
+extern const struct file_operations ceph_snapdir_fops;
extern const struct inode_operations ceph_dir_iops;
+extern const struct inode_operations ceph_snapdir_iops;
extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
ceph_snapdir_dentry_ops;
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 67b2007f10fe..ea06a3d0364c 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -24,27 +24,6 @@
#include "internal.h"
-/*
- * capabilities for /dev/mem, /dev/kmem and similar directly mappable character
- * devices
- * - permits shared-mmap for read, write and/or exec
- * - does not permit private mmap in NOMMU mode (can't do COW)
- * - no readahead or I/O queue unplugging required
- */
-struct backing_dev_info directly_mappable_cdev_bdi = {
- .name = "char",
- .capabilities = (
-#ifdef CONFIG_MMU
- /* permit private copies of the data to be taken */
- BDI_CAP_MAP_COPY |
-#endif
- /* permit direct mmap, for read, write or exec */
- BDI_CAP_MAP_DIRECT |
- BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP |
- /* no writeback happens */
- BDI_CAP_NO_ACCT_AND_WRITEBACK),
-};
-
static struct kobj_map *cdev_map;
static DEFINE_MUTEX(chrdevs_lock);
@@ -575,8 +554,6 @@ static struct kobject *base_probe(dev_t dev, int *part, void *data)
void __init chrdev_init(void)
{
cdev_map = kobj_map_init(base_probe, &chrdevs_lock);
- if (bdi_init(&directly_mappable_cdev_bdi))
- panic("Failed to init directly mappable cdev bdi");
}
@@ -590,4 +567,3 @@ EXPORT_SYMBOL(cdev_del);
EXPORT_SYMBOL(cdev_add);
EXPORT_SYMBOL(__register_chrdev);
EXPORT_SYMBOL(__unregister_chrdev);
-EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 9c56ef776407..7febcf2475c5 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -606,9 +606,11 @@ cifs_security_flags_handle_must_flags(unsigned int *flags)
*flags = CIFSSEC_MUST_NTLMV2;
else if ((*flags & CIFSSEC_MUST_NTLM) == CIFSSEC_MUST_NTLM)
*flags = CIFSSEC_MUST_NTLM;
- else if ((*flags & CIFSSEC_MUST_LANMAN) == CIFSSEC_MUST_LANMAN)
+ else if (CIFSSEC_MUST_LANMAN &&
+ (*flags & CIFSSEC_MUST_LANMAN) == CIFSSEC_MUST_LANMAN)
*flags = CIFSSEC_MUST_LANMAN;
- else if ((*flags & CIFSSEC_MUST_PLNTXT) == CIFSSEC_MUST_PLNTXT)
+ else if (CIFSSEC_MUST_PLNTXT &&
+ (*flags & CIFSSEC_MUST_PLNTXT) == CIFSSEC_MUST_PLNTXT)
*flags = CIFSSEC_MUST_PLNTXT;
*flags |= signflags;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 6e139111fdb2..22b289a3b1c4 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -661,16 +661,16 @@ set_credits(struct TCP_Server_Info *server, const int val)
server->ops->set_credits(server, val);
}
-static inline __u64
+static inline __le64
get_next_mid64(struct TCP_Server_Info *server)
{
- return server->ops->get_next_mid(server);
+ return cpu_to_le64(server->ops->get_next_mid(server));
}
static inline __le16
get_next_mid(struct TCP_Server_Info *server)
{
- __u16 mid = get_next_mid64(server);
+ __u16 mid = server->ops->get_next_mid(server);
/*
* The value in the SMB header should be little endian for easy
* on-the-wire decoding.
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2a772da16b83..d3aa999ab785 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3446,7 +3446,7 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
int referral_walks_count = 0;
#endif
- rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY);
+ rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs");
if (rc)
return rc;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 96b7e9b7706d..a94b3e673182 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -366,6 +366,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
struct cifsLockInfo *li, *tmp;
struct cifs_fid fid;
struct cifs_pending_open open;
+ bool oplock_break_cancelled;
spin_lock(&cifs_file_list_lock);
if (--cifs_file->count > 0) {
@@ -397,7 +398,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
}
spin_unlock(&cifs_file_list_lock);
- cancel_work_sync(&cifs_file->oplock_break);
+ oplock_break_cancelled = cancel_work_sync(&cifs_file->oplock_break);
if (!tcon->need_reconnect && !cifs_file->invalidHandle) {
struct TCP_Server_Info *server = tcon->ses->server;
@@ -409,6 +410,9 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
_free_xid(xid);
}
+ if (oplock_break_cancelled)
+ cifs_done_oplock_break(cifsi);
+
cifs_del_pending_open(&open);
/*
@@ -1109,11 +1113,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
return rc;
}
-/* copied from fs/locks.c with a name change */
-#define cifs_for_each_lock(inode, lockp) \
- for (lockp = &inode->i_flock; *lockp != NULL; \
- lockp = &(*lockp)->fl_next)
-
struct lock_to_push {
struct list_head llist;
__u64 offset;
@@ -1128,8 +1127,9 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
{
struct inode *inode = cfile->dentry->d_inode;
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
- struct file_lock *flock, **before;
- unsigned int count = 0, i = 0;
+ struct file_lock *flock;
+ struct file_lock_context *flctx = inode->i_flctx;
+ unsigned int count = 0, i;
int rc = 0, xid, type;
struct list_head locks_to_send, *el;
struct lock_to_push *lck, *tmp;
@@ -1137,12 +1137,14 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
xid = get_xid();
- spin_lock(&inode->i_lock);
- cifs_for_each_lock(inode, before) {
- if ((*before)->fl_flags & FL_POSIX)
- count++;
+ if (!flctx)
+ goto out;
+
+ spin_lock(&flctx->flc_lock);
+ list_for_each(el, &flctx->flc_posix) {
+ count++;
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&flctx->flc_lock);
INIT_LIST_HEAD(&locks_to_send);
@@ -1151,7 +1153,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
* added to the list while we are holding cinode->lock_sem that
* protects locking operations of this inode.
*/
- for (; i < count; i++) {
+ for (i = 0; i < count; i++) {
lck = kmalloc(sizeof(struct lock_to_push), GFP_KERNEL);
if (!lck) {
rc = -ENOMEM;
@@ -1161,11 +1163,8 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
}
el = locks_to_send.next;
- spin_lock(&inode->i_lock);
- cifs_for_each_lock(inode, before) {
- flock = *before;
- if ((flock->fl_flags & FL_POSIX) == 0)
- continue;
+ spin_lock(&flctx->flc_lock);
+ list_for_each_entry(flock, &flctx->flc_posix, fl_list) {
if (el == &locks_to_send) {
/*
* The list ended. We don't have enough allocated
@@ -1185,9 +1184,8 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
lck->length = length;
lck->type = type;
lck->offset = flock->fl_start;
- el = el->next;
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&flctx->flc_lock);
list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) {
int stored_rc;
@@ -3244,7 +3242,6 @@ static struct vm_operations_struct cifs_file_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = cifs_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
};
int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 0c3ce464cae4..2d4f37235ed0 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -937,8 +937,6 @@ retry_iget5_locked:
inode->i_flags |= S_NOATIME | S_NOCMTIME;
if (inode->i_state & I_NEW) {
inode->i_ino = hash;
- if (S_ISREG(inode->i_mode))
- inode->i_data.backing_dev_info = sb->s_bdi;
#ifdef CONFIG_CIFS_FSCACHE
/* initialize per-inode cache cookie pointer */
CIFS_I(inode)->fscache = NULL;
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 45cb59bcc791..8b7898b7670f 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -86,21 +86,16 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
}
src_inode = file_inode(src_file.file);
+ rc = -EINVAL;
+ if (S_ISDIR(src_inode->i_mode))
+ goto out_fput;
/*
* Note: cifs case is easier than btrfs since server responsible for
* checks for proper open modes and file type and if it wants
* server could even support copy of range where source = target
*/
-
- /* so we do not deadlock racing two ioctls on same files */
- if (target_inode < src_inode) {
- mutex_lock_nested(&target_inode->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD);
- } else {
- mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT);
- mutex_lock_nested(&target_inode->i_mutex, I_MUTEX_CHILD);
- }
+ lock_two_nondirectories(target_inode, src_inode);
/* determine range to clone */
rc = -EINVAL;
@@ -124,13 +119,7 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
out_unlock:
/* although unlocking in the reverse order from locking is not
strictly necessary here it is a little cleaner to be consistent */
- if (target_inode < src_inode) {
- mutex_unlock(&src_inode->i_mutex);
- mutex_unlock(&target_inode->i_mutex);
- } else {
- mutex_unlock(&target_inode->i_mutex);
- mutex_unlock(&src_inode->i_mutex);
- }
+ unlock_two_nondirectories(src_inode, target_inode);
out_fput:
fdput(src_file);
out_drop_write:
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index b333ff60781d..abae6dd2c6b9 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -926,6 +926,7 @@ cifs_NTtimeToUnix(__le64 ntutc)
/* Subtract the NTFS time offset, then convert to 1s intervals. */
s64 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET;
+ u64 abs_t;
/*
* Unfortunately can not use normal 64 bit division on 32 bit arch, but
@@ -933,13 +934,14 @@ cifs_NTtimeToUnix(__le64 ntutc)
* to special case them
*/
if (t < 0) {
- t = -t;
- ts.tv_nsec = (long)(do_div(t, 10000000) * 100);
+ abs_t = -t;
+ ts.tv_nsec = (long)(do_div(abs_t, 10000000) * 100);
ts.tv_nsec = -ts.tv_nsec;
- ts.tv_sec = -t;
+ ts.tv_sec = -abs_t;
} else {
- ts.tv_nsec = (long)do_div(t, 10000000) * 100;
- ts.tv_sec = t;
+ abs_t = t;
+ ts.tv_nsec = (long)do_div(abs_t, 10000000) * 100;
+ ts.tv_sec = abs_t;
}
return ts;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 8eaf20a80649..c295338e0a98 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -69,7 +69,8 @@ static inline void dump_cifs_file_struct(struct file *file, char *label)
* Attempt to preload the dcache with the results from the FIND_FIRST/NEXT
*
* Find the dentry that matches "name". If there isn't one, create one. If it's
- * a negative dentry or the uniqueid changed, then drop it and recreate it.
+ * a negative dentry or the uniqueid or filetype(mode) changed,
+ * then drop it and recreate it.
*/
static void
cifs_prime_dcache(struct dentry *parent, struct qstr *name,
@@ -97,8 +98,11 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM))
fattr->cf_uniqueid = CIFS_I(inode)->uniqueid;
- /* update inode in place if i_ino didn't change */
- if (CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) {
+ /* update inode in place
+ * if both i_ino and i_mode didn't change */
+ if (CIFS_I(inode)->uniqueid == fattr->cf_uniqueid &&
+ (inode->i_mode & S_IFMT) ==
+ (fattr->cf_mode & S_IFMT)) {
cifs_fattr_to_inode(inode, fattr);
goto out;
}
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index f1cefc9763ed..689f035915cf 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -32,12 +32,14 @@
static int
check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid)
{
+ __u64 wire_mid = le64_to_cpu(hdr->MessageId);
+
/*
* Make sure that this really is an SMB, that it is a response,
* and that the message ids match.
*/
if ((*(__le32 *)hdr->ProtocolId == SMB2_PROTO_NUMBER) &&
- (mid == hdr->MessageId)) {
+ (mid == wire_mid)) {
if (hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR)
return 0;
else {
@@ -51,11 +53,11 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid)
if (*(__le32 *)hdr->ProtocolId != SMB2_PROTO_NUMBER)
cifs_dbg(VFS, "Bad protocol string signature header %x\n",
*(unsigned int *) hdr->ProtocolId);
- if (mid != hdr->MessageId)
+ if (mid != wire_mid)
cifs_dbg(VFS, "Mids do not match: %llu and %llu\n",
- mid, hdr->MessageId);
+ mid, wire_mid);
}
- cifs_dbg(VFS, "Bad SMB detected. The Mid=%llu\n", hdr->MessageId);
+ cifs_dbg(VFS, "Bad SMB detected. The Mid=%llu\n", wire_mid);
return 1;
}
@@ -95,7 +97,7 @@ smb2_check_message(char *buf, unsigned int length)
{
struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
struct smb2_pdu *pdu = (struct smb2_pdu *)hdr;
- __u64 mid = hdr->MessageId;
+ __u64 mid = le64_to_cpu(hdr->MessageId);
__u32 len = get_rfc1002_length(buf);
__u32 clc_len; /* calculated length */
int command;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 93fd0586f9ec..96b5d40a2ece 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -176,10 +176,11 @@ smb2_find_mid(struct TCP_Server_Info *server, char *buf)
{
struct mid_q_entry *mid;
struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
+ __u64 wire_mid = le64_to_cpu(hdr->MessageId);
spin_lock(&GlobalMid_Lock);
list_for_each_entry(mid, &server->pending_mid_q, qhead) {
- if ((mid->mid == hdr->MessageId) &&
+ if ((mid->mid == wire_mid) &&
(mid->mid_state == MID_REQUEST_SUBMITTED) &&
(mid->command == hdr->Command)) {
spin_unlock(&GlobalMid_Lock);
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index ce858477002a..70867d54fb8b 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -110,7 +110,7 @@ struct smb2_hdr {
__le16 CreditRequest; /* CreditResponse */
__le32 Flags;
__le32 NextCommand;
- __u64 MessageId; /* opaque - so can stay little endian */
+ __le64 MessageId;
__le32 ProcessId;
__u32 TreeId; /* opaque - so do not make little endian */
__u64 SessionId; /* opaque - so do not make little endian */
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 5111e7272db6..d4c5b6f109a7 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -490,7 +490,7 @@ smb2_mid_entry_alloc(const struct smb2_hdr *smb_buffer,
return temp;
else {
memset(temp, 0, sizeof(struct mid_q_entry));
- temp->mid = smb_buffer->MessageId; /* always LE */
+ temp->mid = le64_to_cpu(smb_buffer->MessageId);
temp->pid = current->pid;
temp->command = smb_buffer->Command; /* Always LE */
temp->when_alloc = jiffies;
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 6c1566366a66..a4232ec4f2ba 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -221,7 +221,7 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16,
}
rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__le16));
- memset(wpwd, 0, 129 * sizeof(__le16));
+ memzero_explicit(wpwd, sizeof(wpwd));
return rc;
}
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 86c893884eb9..281ee011bb6a 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -28,29 +28,6 @@
#include "coda_int.h"
-/* dir inode-ops */
-static int coda_create(struct inode *dir, struct dentry *new, umode_t mode, bool excl);
-static struct dentry *coda_lookup(struct inode *dir, struct dentry *target, unsigned int flags);
-static int coda_link(struct dentry *old_dentry, struct inode *dir_inode,
- struct dentry *entry);
-static int coda_unlink(struct inode *dir_inode, struct dentry *entry);
-static int coda_symlink(struct inode *dir_inode, struct dentry *entry,
- const char *symname);
-static int coda_mkdir(struct inode *dir_inode, struct dentry *entry, umode_t mode);
-static int coda_rmdir(struct inode *dir_inode, struct dentry *entry);
-static int coda_rename(struct inode *old_inode, struct dentry *old_dentry,
- struct inode *new_inode, struct dentry *new_dentry);
-
-/* dir file-ops */
-static int coda_readdir(struct file *file, struct dir_context *ctx);
-
-/* dentry ops */
-static int coda_dentry_revalidate(struct dentry *de, unsigned int flags);
-static int coda_dentry_delete(const struct dentry *);
-
-/* support routines */
-static int coda_venus_readdir(struct file *, struct dir_context *);
-
/* same as fs/bad_inode.c */
static int coda_return_EIO(void)
{
@@ -58,38 +35,6 @@ static int coda_return_EIO(void)
}
#define CODA_EIO_ERROR ((void *) (coda_return_EIO))
-const struct dentry_operations coda_dentry_operations =
-{
- .d_revalidate = coda_dentry_revalidate,
- .d_delete = coda_dentry_delete,
-};
-
-const struct inode_operations coda_dir_inode_operations =
-{
- .create = coda_create,
- .lookup = coda_lookup,
- .link = coda_link,
- .unlink = coda_unlink,
- .symlink = coda_symlink,
- .mkdir = coda_mkdir,
- .rmdir = coda_rmdir,
- .mknod = CODA_EIO_ERROR,
- .rename = coda_rename,
- .permission = coda_permission,
- .getattr = coda_getattr,
- .setattr = coda_setattr,
-};
-
-const struct file_operations coda_dir_operations = {
- .llseek = generic_file_llseek,
- .read = generic_read_dir,
- .iterate = coda_readdir,
- .open = coda_open,
- .release = coda_release,
- .fsync = coda_fsync,
-};
-
-
/* inode operations for directories */
/* access routines: lookup, readlink, permission */
static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsigned int flags)
@@ -374,33 +319,6 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
return error;
}
-
-/* file operations for directories */
-static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
-{
- struct coda_file_info *cfi;
- struct file *host_file;
- int ret;
-
- cfi = CODA_FTOC(coda_file);
- BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
- host_file = cfi->cfi_container;
-
- if (host_file->f_op->iterate) {
- struct inode *host_inode = file_inode(host_file);
- mutex_lock(&host_inode->i_mutex);
- ret = -ENOENT;
- if (!IS_DEADDIR(host_inode)) {
- ret = host_file->f_op->iterate(host_file, ctx);
- file_accessed(host_file);
- }
- mutex_unlock(&host_inode->i_mutex);
- return ret;
- }
- /* Venus: we must read Venus dirents from a file */
- return coda_venus_readdir(coda_file, ctx);
-}
-
static inline unsigned int CDT2DT(unsigned char cdt)
{
unsigned int dt;
@@ -495,6 +413,33 @@ out:
return 0;
}
+/* file operations for directories */
+static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
+{
+ struct coda_file_info *cfi;
+ struct file *host_file;
+ int ret;
+
+ cfi = CODA_FTOC(coda_file);
+ BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
+ host_file = cfi->cfi_container;
+
+ if (host_file->f_op->iterate) {
+ struct inode *host_inode = file_inode(host_file);
+
+ mutex_lock(&host_inode->i_mutex);
+ ret = -ENOENT;
+ if (!IS_DEADDIR(host_inode)) {
+ ret = host_file->f_op->iterate(host_file, ctx);
+ file_accessed(host_file);
+ }
+ mutex_unlock(&host_inode->i_mutex);
+ return ret;
+ }
+ /* Venus: we must read Venus dirents from a file */
+ return coda_venus_readdir(coda_file, ctx);
+}
+
/* called when a cache lookup succeeds */
static int coda_dentry_revalidate(struct dentry *de, unsigned int flags)
{
@@ -603,3 +548,32 @@ int coda_revalidate_inode(struct inode *inode)
}
return 0;
}
+
+const struct dentry_operations coda_dentry_operations = {
+ .d_revalidate = coda_dentry_revalidate,
+ .d_delete = coda_dentry_delete,
+};
+
+const struct inode_operations coda_dir_inode_operations = {
+ .create = coda_create,
+ .lookup = coda_lookup,
+ .link = coda_link,
+ .unlink = coda_unlink,
+ .symlink = coda_symlink,
+ .mkdir = coda_mkdir,
+ .rmdir = coda_rmdir,
+ .mknod = CODA_EIO_ERROR,
+ .rename = coda_rename,
+ .permission = coda_permission,
+ .getattr = coda_getattr,
+ .setattr = coda_setattr,
+};
+
+const struct file_operations coda_dir_operations = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+ .iterate = coda_readdir,
+ .open = coda_open,
+ .release = coda_release,
+ .fsync = coda_fsync,
+};
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index b945410bfcd5..82ec68b59208 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -183,7 +183,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
goto unlock_out;
}
- error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY);
+ error = bdi_setup_and_register(&vc->bdi, "coda");
if (error)
goto unlock_out;
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index bd4a3c167091..a315677e44d3 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -70,8 +70,6 @@ extern int configfs_is_root(struct config_item *item);
extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, struct super_block *);
extern int configfs_create(struct dentry *, umode_t mode, int (*init)(struct inode *));
-extern int configfs_inode_init(void);
-extern void configfs_inode_exit(void);
extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
extern int configfs_make_dirent(struct configfs_dirent *,
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 5946ad98053f..65af86147154 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -50,12 +50,6 @@ static const struct address_space_operations configfs_aops = {
.write_end = simple_write_end,
};
-static struct backing_dev_info configfs_backing_dev_info = {
- .name = "configfs",
- .ra_pages = 0, /* No readahead */
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
-
static const struct inode_operations configfs_inode_operations ={
.setattr = configfs_setattr,
};
@@ -137,7 +131,6 @@ struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd,
if (inode) {
inode->i_ino = get_next_ino();
inode->i_mapping->a_ops = &configfs_aops;
- inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
inode->i_op = &configfs_inode_operations;
if (sd->s_iattr) {
@@ -283,13 +276,3 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
}
mutex_unlock(&dir->d_inode->i_mutex);
}
-
-int __init configfs_inode_init(void)
-{
- return bdi_init(&configfs_backing_dev_info);
-}
-
-void configfs_inode_exit(void)
-{
- bdi_destroy(&configfs_backing_dev_info);
-}
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index f6c285833390..da94e41bdbf6 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -145,19 +145,13 @@ static int __init configfs_init(void)
if (!config_kobj)
goto out2;
- err = configfs_inode_init();
- if (err)
- goto out3;
-
err = register_filesystem(&configfs_fs_type);
if (err)
- goto out4;
+ goto out3;
return 0;
-out4:
- pr_err("Unable to register filesystem!\n");
- configfs_inode_exit();
out3:
+ pr_err("Unable to register filesystem!\n");
kobject_put(config_kobj);
out2:
kmem_cache_destroy(configfs_dir_cachep);
@@ -172,7 +166,6 @@ static void __exit configfs_exit(void)
kobject_put(config_kobj);
kmem_cache_destroy(configfs_dir_cachep);
configfs_dir_cachep = NULL;
- configfs_inode_exit();
}
MODULE_AUTHOR("Oracle");
diff --git a/fs/dax.c b/fs/dax.c
new file mode 100644
index 000000000000..ed1619ec6537
--- /dev/null
+++ b/fs/dax.c
@@ -0,0 +1,534 @@
+/*
+ * fs/dax.c - Direct Access filesystem code
+ * Copyright (c) 2013-2014 Intel Corporation
+ * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
+ * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/atomic.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/fs.h>
+#include <linux/genhd.h>
+#include <linux/highmem.h>
+#include <linux/memcontrol.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/uio.h>
+#include <linux/vmstat.h>
+
+int dax_clear_blocks(struct inode *inode, sector_t block, long size)
+{
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ sector_t sector = block << (inode->i_blkbits - 9);
+
+ might_sleep();
+ do {
+ void *addr;
+ unsigned long pfn;
+ long count;
+
+ count = bdev_direct_access(bdev, sector, &addr, &pfn, size);
+ if (count < 0)
+ return count;
+ BUG_ON(size < count);
+ while (count > 0) {
+ unsigned pgsz = PAGE_SIZE - offset_in_page(addr);
+ if (pgsz > count)
+ pgsz = count;
+ if (pgsz < PAGE_SIZE)
+ memset(addr, 0, pgsz);
+ else
+ clear_page(addr);
+ addr += pgsz;
+ size -= pgsz;
+ count -= pgsz;
+ BUG_ON(pgsz & 511);
+ sector += pgsz / 512;
+ cond_resched();
+ }
+ } while (size);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dax_clear_blocks);
+
+static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits)
+{
+ unsigned long pfn;
+ sector_t sector = bh->b_blocknr << (blkbits - 9);
+ return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
+}
+
+static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos,
+ loff_t end)
+{
+ loff_t final = end - pos + first; /* The final byte of the buffer */
+
+ if (first > 0)
+ memset(addr, 0, first);
+ if (final < size)
+ memset(addr + final, 0, size - final);
+}
+
+static bool buffer_written(struct buffer_head *bh)
+{
+ return buffer_mapped(bh) && !buffer_unwritten(bh);
+}
+
+/*
+ * When ext4 encounters a hole, it returns without modifying the buffer_head
+ * which means that we can't trust b_size. To cope with this, we set b_state
+ * to 0 before calling get_block and, if any bit is set, we know we can trust
+ * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is
+ * and would save us time calling get_block repeatedly.
+ */
+static bool buffer_size_valid(struct buffer_head *bh)
+{
+ return bh->b_state != 0;
+}
+
+static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter,
+ loff_t start, loff_t end, get_block_t get_block,
+ struct buffer_head *bh)
+{
+ ssize_t retval = 0;
+ loff_t pos = start;
+ loff_t max = start;
+ loff_t bh_max = start;
+ void *addr;
+ bool hole = false;
+
+ if (rw != WRITE)
+ end = min(end, i_size_read(inode));
+
+ while (pos < end) {
+ unsigned len;
+ if (pos == max) {
+ unsigned blkbits = inode->i_blkbits;
+ sector_t block = pos >> blkbits;
+ unsigned first = pos - (block << blkbits);
+ long size;
+
+ if (pos == bh_max) {
+ bh->b_size = PAGE_ALIGN(end - pos);
+ bh->b_state = 0;
+ retval = get_block(inode, block, bh,
+ rw == WRITE);
+ if (retval)
+ break;
+ if (!buffer_size_valid(bh))
+ bh->b_size = 1 << blkbits;
+ bh_max = pos - first + bh->b_size;
+ } else {
+ unsigned done = bh->b_size -
+ (bh_max - (pos - first));
+ bh->b_blocknr += done >> blkbits;
+ bh->b_size -= done;
+ }
+
+ hole = (rw != WRITE) && !buffer_written(bh);
+ if (hole) {
+ addr = NULL;
+ size = bh->b_size - first;
+ } else {
+ retval = dax_get_addr(bh, &addr, blkbits);
+ if (retval < 0)
+ break;
+ if (buffer_unwritten(bh) || buffer_new(bh))
+ dax_new_buf(addr, retval, first, pos,
+ end);
+ addr += first;
+ size = retval - first;
+ }
+ max = min(pos + size, end);
+ }
+
+ if (rw == WRITE)
+ len = copy_from_iter(addr, max - pos, iter);
+ else if (!hole)
+ len = copy_to_iter(addr, max - pos, iter);
+ else
+ len = iov_iter_zero(max - pos, iter);
+
+ if (!len)
+ break;
+
+ pos += len;
+ addr += len;
+ }
+
+ return (pos == start) ? retval : pos - start;
+}
+
+/**
+ * dax_do_io - Perform I/O to a DAX file
+ * @rw: READ to read or WRITE to write
+ * @iocb: The control block for this I/O
+ * @inode: The file which the I/O is directed at
+ * @iter: The addresses to do I/O from or to
+ * @pos: The file offset where the I/O starts
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ * @end_io: A filesystem callback for I/O completion
+ * @flags: See below
+ *
+ * This function uses the same locking scheme as do_blockdev_direct_IO:
+ * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
+ * caller for writes. For reads, we take and release the i_mutex ourselves.
+ * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
+ * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
+ * is in progress.
+ */
+ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
+ struct iov_iter *iter, loff_t pos,
+ get_block_t get_block, dio_iodone_t end_io, int flags)
+{
+ struct buffer_head bh;
+ ssize_t retval = -EINVAL;
+ loff_t end = pos + iov_iter_count(iter);
+
+ memset(&bh, 0, sizeof(bh));
+
+ if ((flags & DIO_LOCKING) && (rw == READ)) {
+ struct address_space *mapping = inode->i_mapping;
+ mutex_lock(&inode->i_mutex);
+ retval = filemap_write_and_wait_range(mapping, pos, end - 1);
+ if (retval) {
+ mutex_unlock(&inode->i_mutex);
+ goto out;
+ }
+ }
+
+ /* Protects against truncate */
+ atomic_inc(&inode->i_dio_count);
+
+ retval = dax_io(rw, inode, iter, pos, end, get_block, &bh);
+
+ if ((flags & DIO_LOCKING) && (rw == READ))
+ mutex_unlock(&inode->i_mutex);
+
+ if ((retval > 0) && end_io)
+ end_io(iocb, pos, retval, bh.b_private);
+
+ inode_dio_done(inode);
+ out:
+ return retval;
+}
+EXPORT_SYMBOL_GPL(dax_do_io);
+
+/*
+ * The user has performed a load from a hole in the file. Allocating
+ * a new page in the file would cause excessive storage usage for
+ * workloads with sparse files. We allocate a page cache page instead.
+ * We'll kick it out of the page cache if it's ever written to,
+ * otherwise it will simply fall out of the page cache under memory
+ * pressure without ever having been dirtied.
+ */
+static int dax_load_hole(struct address_space *mapping, struct page *page,
+ struct vm_fault *vmf)
+{
+ unsigned long size;
+ struct inode *inode = mapping->host;
+ if (!page)
+ page = find_or_create_page(mapping, vmf->pgoff,
+ GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ return VM_FAULT_OOM;
+ /* Recheck i_size under page lock to avoid truncate race */
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (vmf->pgoff >= size) {
+ unlock_page(page);
+ page_cache_release(page);
+ return VM_FAULT_SIGBUS;
+ }
+
+ vmf->page = page;
+ return VM_FAULT_LOCKED;
+}
+
+static int copy_user_bh(struct page *to, struct buffer_head *bh,
+ unsigned blkbits, unsigned long vaddr)
+{
+ void *vfrom, *vto;
+ if (dax_get_addr(bh, &vfrom, blkbits) < 0)
+ return -EIO;
+ vto = kmap_atomic(to);
+ copy_user_page(vto, vfrom, vaddr, to);
+ kunmap_atomic(vto);
+ return 0;
+}
+
+static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
+ struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct address_space *mapping = inode->i_mapping;
+ sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
+ unsigned long vaddr = (unsigned long)vmf->virtual_address;
+ void *addr;
+ unsigned long pfn;
+ pgoff_t size;
+ int error;
+
+ i_mmap_lock_read(mapping);
+
+ /*
+ * Check truncate didn't happen while we were allocating a block.
+ * If it did, this block may or may not be still allocated to the
+ * file. We can't tell the filesystem to free it because we can't
+ * take i_mutex here. In the worst case, the file still has blocks
+ * allocated past the end of the file.
+ */
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (unlikely(vmf->pgoff >= size)) {
+ error = -EIO;
+ goto out;
+ }
+
+ error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size);
+ if (error < 0)
+ goto out;
+ if (error < PAGE_SIZE) {
+ error = -EIO;
+ goto out;
+ }
+
+ if (buffer_unwritten(bh) || buffer_new(bh))
+ clear_page(addr);
+
+ error = vm_insert_mixed(vma, vaddr, pfn);
+
+ out:
+ i_mmap_unlock_read(mapping);
+
+ if (bh->b_end_io)
+ bh->b_end_io(bh, 1);
+
+ return error;
+}
+
+static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+ get_block_t get_block)
+{
+ struct file *file = vma->vm_file;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ struct page *page;
+ struct buffer_head bh;
+ unsigned long vaddr = (unsigned long)vmf->virtual_address;
+ unsigned blkbits = inode->i_blkbits;
+ sector_t block;
+ pgoff_t size;
+ int error;
+ int major = 0;
+
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (vmf->pgoff >= size)
+ return VM_FAULT_SIGBUS;
+
+ memset(&bh, 0, sizeof(bh));
+ block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
+ bh.b_size = PAGE_SIZE;
+
+ repeat:
+ page = find_get_page(mapping, vmf->pgoff);
+ if (page) {
+ if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
+ page_cache_release(page);
+ return VM_FAULT_RETRY;
+ }
+ if (unlikely(page->mapping != mapping)) {
+ unlock_page(page);
+ page_cache_release(page);
+ goto repeat;
+ }
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (unlikely(vmf->pgoff >= size)) {
+ /*
+ * We have a struct page covering a hole in the file
+ * from a read fault and we've raced with a truncate
+ */
+ error = -EIO;
+ goto unlock_page;
+ }
+ }
+
+ error = get_block(inode, block, &bh, 0);
+ if (!error && (bh.b_size < PAGE_SIZE))
+ error = -EIO; /* fs corruption? */
+ if (error)
+ goto unlock_page;
+
+ if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
+ if (vmf->flags & FAULT_FLAG_WRITE) {
+ error = get_block(inode, block, &bh, 1);
+ count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+ major = VM_FAULT_MAJOR;
+ if (!error && (bh.b_size < PAGE_SIZE))
+ error = -EIO;
+ if (error)
+ goto unlock_page;
+ } else {
+ return dax_load_hole(mapping, page, vmf);
+ }
+ }
+
+ if (vmf->cow_page) {
+ struct page *new_page = vmf->cow_page;
+ if (buffer_written(&bh))
+ error = copy_user_bh(new_page, &bh, blkbits, vaddr);
+ else
+ clear_user_highpage(new_page, vaddr);
+ if (error)
+ goto unlock_page;
+ vmf->page = page;
+ if (!page) {
+ i_mmap_lock_read(mapping);
+ /* Check we didn't race with truncate */
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >>
+ PAGE_SHIFT;
+ if (vmf->pgoff >= size) {
+ i_mmap_unlock_read(mapping);
+ error = -EIO;
+ goto out;
+ }
+ }
+ return VM_FAULT_LOCKED;
+ }
+
+ /* Check we didn't race with a read fault installing a new page */
+ if (!page && major)
+ page = find_lock_page(mapping, vmf->pgoff);
+
+ if (page) {
+ unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
+ PAGE_CACHE_SIZE, 0);
+ delete_from_page_cache(page);
+ unlock_page(page);
+ page_cache_release(page);
+ }
+
+ error = dax_insert_mapping(inode, &bh, vma, vmf);
+
+ out:
+ if (error == -ENOMEM)
+ return VM_FAULT_OOM | major;
+ /* -EBUSY is fine, somebody else faulted on the same PTE */
+ if ((error < 0) && (error != -EBUSY))
+ return VM_FAULT_SIGBUS | major;
+ return VM_FAULT_NOPAGE | major;
+
+ unlock_page:
+ if (page) {
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ goto out;
+}
+
+/**
+ * dax_fault - handle a page fault on a DAX file
+ * @vma: The virtual memory area where the fault occurred
+ * @vmf: The description of the fault
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * When a page fault occurs, filesystems may call this helper in their
+ * fault handler for DAX files.
+ */
+int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
+ get_block_t get_block)
+{
+ int result;
+ struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+
+ if (vmf->flags & FAULT_FLAG_WRITE) {
+ sb_start_pagefault(sb);
+ file_update_time(vma->vm_file);
+ }
+ result = do_dax_fault(vma, vmf, get_block);
+ if (vmf->flags & FAULT_FLAG_WRITE)
+ sb_end_pagefault(sb);
+
+ return result;
+}
+EXPORT_SYMBOL_GPL(dax_fault);
+
+/**
+ * dax_zero_page_range - zero a range within a page of a DAX file
+ * @inode: The file being truncated
+ * @from: The file offset that is being truncated to
+ * @length: The number of bytes to zero
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * This function can be called by a filesystem when it is zeroing part of a
+ * page in a DAX file. This is intended for hole-punch operations. If
+ * you are truncating a file, the helper function dax_truncate_page() may be
+ * more convenient.
+ *
+ * We work in terms of PAGE_CACHE_SIZE here for commonality with
+ * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
+ * took care of disposing of the unnecessary blocks. Even if the filesystem
+ * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
+ * since the file might be mmapped.
+ */
+int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
+ get_block_t get_block)
+{
+ struct buffer_head bh;
+ pgoff_t index = from >> PAGE_CACHE_SHIFT;
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ int err;
+
+ /* Block boundary? Nothing to do */
+ if (!length)
+ return 0;
+ BUG_ON((offset + length) > PAGE_CACHE_SIZE);
+
+ memset(&bh, 0, sizeof(bh));
+ bh.b_size = PAGE_CACHE_SIZE;
+ err = get_block(inode, index, &bh, 0);
+ if (err < 0)
+ return err;
+ if (buffer_written(&bh)) {
+ void *addr;
+ err = dax_get_addr(&bh, &addr, inode->i_blkbits);
+ if (err < 0)
+ return err;
+ memset(addr + offset, 0, length);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dax_zero_page_range);
+
+/**
+ * dax_truncate_page - handle a partial page being truncated in a DAX file
+ * @inode: The file being truncated
+ * @from: The file offset that is being truncated to
+ * @get_block: The filesystem method used to translate file offsets to blocks
+ *
+ * Similar to block_truncate_page(), this function can be called by a
+ * filesystem when it is truncating a DAX file to handle the partial page.
+ *
+ * We work in terms of PAGE_CACHE_SIZE here for commonality with
+ * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
+ * took care of disposing of the unnecessary blocks. Even if the filesystem
+ * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
+ * since the file might be mmapped.
+ */
+int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
+{
+ unsigned length = PAGE_CACHE_ALIGN(from) - from;
+ return dax_zero_page_range(inode, from, length, get_block);
+}
+EXPORT_SYMBOL_GPL(dax_truncate_page);
diff --git a/fs/dcache.c b/fs/dcache.c
index e368d4f412f9..dc400fd29f4d 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -38,6 +38,8 @@
#include <linux/prefetch.h>
#include <linux/ratelimit.h>
#include <linux/list_lru.h>
+#include <linux/kasan.h>
+
#include "internal.h"
#include "mount.h"
@@ -400,19 +402,20 @@ static void d_shrink_add(struct dentry *dentry, struct list_head *list)
* LRU lists entirely, while shrink_move moves it to the indicated
* private list.
*/
-static void d_lru_isolate(struct dentry *dentry)
+static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry)
{
D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
dentry->d_flags &= ~DCACHE_LRU_LIST;
this_cpu_dec(nr_dentry_unused);
- list_del_init(&dentry->d_lru);
+ list_lru_isolate(lru, &dentry->d_lru);
}
-static void d_lru_shrink_move(struct dentry *dentry, struct list_head *list)
+static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
+ struct list_head *list)
{
D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
dentry->d_flags |= DCACHE_SHRINK_LIST;
- list_move_tail(&dentry->d_lru, list);
+ list_lru_isolate_move(lru, &dentry->d_lru, list);
}
/*
@@ -508,7 +511,7 @@ static void __dentry_kill(struct dentry *dentry)
* dentry_iput drops the locks, at which point nobody (except
* transient RCU lookups) can reach this dentry.
*/
- BUG_ON((int)dentry->d_lockref.count > 0);
+ BUG_ON(dentry->d_lockref.count > 0);
this_cpu_dec(nr_dentry);
if (dentry->d_op && dentry->d_op->d_release)
dentry->d_op->d_release(dentry);
@@ -561,7 +564,7 @@ static inline struct dentry *lock_parent(struct dentry *dentry)
struct dentry *parent = dentry->d_parent;
if (IS_ROOT(dentry))
return NULL;
- if (unlikely((int)dentry->d_lockref.count < 0))
+ if (unlikely(dentry->d_lockref.count < 0))
return NULL;
if (likely(spin_trylock(&parent->d_lock)))
return parent;
@@ -590,6 +593,110 @@ again:
return parent;
}
+/*
+ * Try to do a lockless dput(), and return whether that was successful.
+ *
+ * If unsuccessful, we return false, having already taken the dentry lock.
+ *
+ * The caller needs to hold the RCU read lock, so that the dentry is
+ * guaranteed to stay around even if the refcount goes down to zero!
+ */
+static inline bool fast_dput(struct dentry *dentry)
+{
+ int ret;
+ unsigned int d_flags;
+
+ /*
+ * If we have a d_op->d_delete() operation, we sould not
+ * let the dentry count go to zero, so use "put__or_lock".
+ */
+ if (unlikely(dentry->d_flags & DCACHE_OP_DELETE))
+ return lockref_put_or_lock(&dentry->d_lockref);
+
+ /*
+ * .. otherwise, we can try to just decrement the
+ * lockref optimistically.
+ */
+ ret = lockref_put_return(&dentry->d_lockref);
+
+ /*
+ * If the lockref_put_return() failed due to the lock being held
+ * by somebody else, the fast path has failed. We will need to
+ * get the lock, and then check the count again.
+ */
+ if (unlikely(ret < 0)) {
+ spin_lock(&dentry->d_lock);
+ if (dentry->d_lockref.count > 1) {
+ dentry->d_lockref.count--;
+ spin_unlock(&dentry->d_lock);
+ return 1;
+ }
+ return 0;
+ }
+
+ /*
+ * If we weren't the last ref, we're done.
+ */
+ if (ret)
+ return 1;
+
+ /*
+ * Careful, careful. The reference count went down
+ * to zero, but we don't hold the dentry lock, so
+ * somebody else could get it again, and do another
+ * dput(), and we need to not race with that.
+ *
+ * However, there is a very special and common case
+ * where we don't care, because there is nothing to
+ * do: the dentry is still hashed, it does not have
+ * a 'delete' op, and it's referenced and already on
+ * the LRU list.
+ *
+ * NOTE! Since we aren't locked, these values are
+ * not "stable". However, it is sufficient that at
+ * some point after we dropped the reference the
+ * dentry was hashed and the flags had the proper
+ * value. Other dentry users may have re-gotten
+ * a reference to the dentry and change that, but
+ * our work is done - we can leave the dentry
+ * around with a zero refcount.
+ */
+ smp_rmb();
+ d_flags = ACCESS_ONCE(dentry->d_flags);
+ d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST;
+
+ /* Nothing to do? Dropping the reference was all we needed? */
+ if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry))
+ return 1;
+
+ /*
+ * Not the fast normal case? Get the lock. We've already decremented
+ * the refcount, but we'll need to re-check the situation after
+ * getting the lock.
+ */
+ spin_lock(&dentry->d_lock);
+
+ /*
+ * Did somebody else grab a reference to it in the meantime, and
+ * we're no longer the last user after all? Alternatively, somebody
+ * else could have killed it and marked it dead. Either way, we
+ * don't need to do anything else.
+ */
+ if (dentry->d_lockref.count) {
+ spin_unlock(&dentry->d_lock);
+ return 1;
+ }
+
+ /*
+ * Re-get the reference we optimistically dropped. We hold the
+ * lock, and we just tested that it was zero, so we can just
+ * set it to 1.
+ */
+ dentry->d_lockref.count = 1;
+ return 0;
+}
+
+
/*
* This is dput
*
@@ -622,8 +729,14 @@ void dput(struct dentry *dentry)
return;
repeat:
- if (lockref_put_or_lock(&dentry->d_lockref))
+ rcu_read_lock();
+ if (likely(fast_dput(dentry))) {
+ rcu_read_unlock();
return;
+ }
+
+ /* Slow case: now with the dentry lock held */
+ rcu_read_unlock();
/* Unreachable? Get rid of it */
if (unlikely(d_unhashed(dentry)))
@@ -810,7 +923,7 @@ static void shrink_dentry_list(struct list_head *list)
* We found an inuse dentry which was not removed from
* the LRU because of laziness during lookup. Do not free it.
*/
- if ((int)dentry->d_lockref.count > 0) {
+ if (dentry->d_lockref.count > 0) {
spin_unlock(&dentry->d_lock);
if (parent)
spin_unlock(&parent->d_lock);
@@ -869,8 +982,8 @@ static void shrink_dentry_list(struct list_head *list)
}
}
-static enum lru_status
-dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
+static enum lru_status dentry_lru_isolate(struct list_head *item,
+ struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
struct list_head *freeable = arg;
struct dentry *dentry = container_of(item, struct dentry, d_lru);
@@ -890,7 +1003,7 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
* another pass through the LRU.
*/
if (dentry->d_lockref.count) {
- d_lru_isolate(dentry);
+ d_lru_isolate(lru, dentry);
spin_unlock(&dentry->d_lock);
return LRU_REMOVED;
}
@@ -921,7 +1034,7 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
return LRU_ROTATE;
}
- d_lru_shrink_move(dentry, freeable);
+ d_lru_shrink_move(lru, dentry, freeable);
spin_unlock(&dentry->d_lock);
return LRU_REMOVED;
@@ -930,30 +1043,28 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
/**
* prune_dcache_sb - shrink the dcache
* @sb: superblock
- * @nr_to_scan : number of entries to try to free
- * @nid: which node to scan for freeable entities
+ * @sc: shrink control, passed to list_lru_shrink_walk()
*
- * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is
- * done when we need more memory an called from the superblock shrinker
+ * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This
+ * is done when we need more memory and called from the superblock shrinker
* function.
*
* This function may fail to free any resources if all the dentries are in
* use.
*/
-long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
- int nid)
+long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc)
{
LIST_HEAD(dispose);
long freed;
- freed = list_lru_walk_node(&sb->s_dentry_lru, nid, dentry_lru_isolate,
- &dispose, &nr_to_scan);
+ freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc,
+ dentry_lru_isolate, &dispose);
shrink_dentry_list(&dispose);
return freed;
}
static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
- spinlock_t *lru_lock, void *arg)
+ struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
struct list_head *freeable = arg;
struct dentry *dentry = container_of(item, struct dentry, d_lru);
@@ -966,7 +1077,7 @@ static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
if (!spin_trylock(&dentry->d_lock))
return LRU_SKIP;
- d_lru_shrink_move(dentry, freeable);
+ d_lru_shrink_move(lru, dentry, freeable);
spin_unlock(&dentry->d_lock);
return LRU_REMOVED;
@@ -1430,6 +1541,9 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
}
atomic_set(&p->u.count, 1);
dname = p->name;
+ if (IS_ENABLED(CONFIG_DCACHE_WORD_ACCESS))
+ kasan_unpoison_shadow(dname,
+ round_up(name->len + 1, sizeof(unsigned long)));
} else {
dname = dentry->d_iname;
}
@@ -2187,37 +2301,6 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
}
EXPORT_SYMBOL(d_hash_and_lookup);
-/**
- * d_validate - verify dentry provided from insecure source (deprecated)
- * @dentry: The dentry alleged to be valid child of @dparent
- * @dparent: The parent dentry (known to be valid)
- *
- * An insecure source has sent us a dentry, here we verify it and dget() it.
- * This is used by ncpfs in its readdir implementation.
- * Zero is returned in the dentry is invalid.
- *
- * This function is slow for big directories, and deprecated, do not use it.
- */
-int d_validate(struct dentry *dentry, struct dentry *dparent)
-{
- struct dentry *child;
-
- spin_lock(&dparent->d_lock);
- list_for_each_entry(child, &dparent->d_subdirs, d_child) {
- if (dentry == child) {
- spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- __dget_dlock(dentry);
- spin_unlock(&dentry->d_lock);
- spin_unlock(&dparent->d_lock);
- return 1;
- }
- }
- spin_unlock(&dparent->d_lock);
-
- return 0;
-}
-EXPORT_SYMBOL(d_validate);
-
/*
* When a file is deleted, we have two options:
* - turn this dentry into a negative dentry
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 05f2960ed7c3..45b18a5e225c 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -34,93 +34,16 @@ static struct vfsmount *debugfs_mount;
static int debugfs_mount_count;
static bool debugfs_registered;
-static struct inode *debugfs_get_inode(struct super_block *sb, umode_t mode, dev_t dev,
- void *data, const struct file_operations *fops)
-
+static struct inode *debugfs_get_inode(struct super_block *sb)
{
struct inode *inode = new_inode(sb);
-
if (inode) {
inode->i_ino = get_next_ino();
- inode->i_mode = mode;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
- switch (mode & S_IFMT) {
- default:
- init_special_inode(inode, mode, dev);
- break;
- case S_IFREG:
- inode->i_fop = fops ? fops : &debugfs_file_operations;
- inode->i_private = data;
- break;
- case S_IFLNK:
- inode->i_op = &debugfs_link_operations;
- inode->i_private = data;
- break;
- case S_IFDIR:
- inode->i_op = &simple_dir_inode_operations;
- inode->i_fop = &simple_dir_operations;
-
- /* directory inodes start off with i_nlink == 2
- * (for "." entry) */
- inc_nlink(inode);
- break;
- }
}
return inode;
}
-/* SMP-safe */
-static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
- umode_t mode, dev_t dev, void *data,
- const struct file_operations *fops)
-{
- struct inode *inode;
- int error = -EPERM;
-
- if (dentry->d_inode)
- return -EEXIST;
-
- inode = debugfs_get_inode(dir->i_sb, mode, dev, data, fops);
- if (inode) {
- d_instantiate(dentry, inode);
- dget(dentry);
- error = 0;
- }
- return error;
-}
-
-static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
-{
- int res;
-
- mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR;
- res = debugfs_mknod(dir, dentry, mode, 0, NULL, NULL);
- if (!res) {
- inc_nlink(dir);
- fsnotify_mkdir(dir, dentry);
- }
- return res;
-}
-
-static int debugfs_link(struct inode *dir, struct dentry *dentry, umode_t mode,
- void *data)
-{
- mode = (mode & S_IALLUGO) | S_IFLNK;
- return debugfs_mknod(dir, dentry, mode, 0, data, NULL);
-}
-
-static int debugfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- void *data, const struct file_operations *fops)
-{
- int res;
-
- mode = (mode & S_IALLUGO) | S_IFREG;
- res = debugfs_mknod(dir, dentry, mode, 0, data, fops);
- if (!res)
- fsnotify_create(dir, dentry);
- return res;
-}
-
static inline int debugfs_positive(struct dentry *dentry)
{
return dentry->d_inode && !d_unhashed(dentry);
@@ -252,6 +175,18 @@ static const struct super_operations debugfs_super_operations = {
.show_options = debugfs_show_options,
};
+static struct vfsmount *debugfs_automount(struct path *path)
+{
+ struct vfsmount *(*f)(void *);
+ f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata;
+ return f(path->dentry->d_inode->i_private);
+}
+
+static const struct dentry_operations debugfs_dops = {
+ .d_delete = always_delete_dentry,
+ .d_automount = debugfs_automount,
+};
+
static int debug_fill_super(struct super_block *sb, void *data, int silent)
{
static struct tree_descr debug_files[] = {{""}};
@@ -276,6 +211,7 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent)
goto fail;
sb->s_op = &debugfs_super_operations;
+ sb->s_d_op = &debugfs_dops;
debugfs_apply_options(sb);
@@ -302,11 +238,9 @@ static struct file_system_type debug_fs_type = {
};
MODULE_ALIAS_FS("debugfs");
-static struct dentry *__create_file(const char *name, umode_t mode,
- struct dentry *parent, void *data,
- const struct file_operations *fops)
+static struct dentry *start_creating(const char *name, struct dentry *parent)
{
- struct dentry *dentry = NULL;
+ struct dentry *dentry;
int error;
pr_debug("debugfs: creating file '%s'\n",name);
@@ -314,7 +248,7 @@ static struct dentry *__create_file(const char *name, umode_t mode,
error = simple_pin_fs(&debug_fs_type, &debugfs_mount,
&debugfs_mount_count);
if (error)
- goto exit;
+ return ERR_PTR(error);
/* If the parent is not specified, we create it in the root.
* We need the root dentry to do this, which is in the super
@@ -326,31 +260,26 @@ static struct dentry *__create_file(const char *name, umode_t mode,
mutex_lock(&parent->d_inode->i_mutex);
dentry = lookup_one_len(name, parent, strlen(name));
- if (!IS_ERR(dentry)) {
- switch (mode & S_IFMT) {
- case S_IFDIR:
- error = debugfs_mkdir(parent->d_inode, dentry, mode);
-
- break;
- case S_IFLNK:
- error = debugfs_link(parent->d_inode, dentry, mode,
- data);
- break;
- default:
- error = debugfs_create(parent->d_inode, dentry, mode,
- data, fops);
- break;
- }
+ if (!IS_ERR(dentry) && dentry->d_inode) {
dput(dentry);
- } else
- error = PTR_ERR(dentry);
- mutex_unlock(&parent->d_inode->i_mutex);
-
- if (error) {
- dentry = NULL;
- simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+ dentry = ERR_PTR(-EEXIST);
}
-exit:
+ if (IS_ERR(dentry))
+ mutex_unlock(&parent->d_inode->i_mutex);
+ return dentry;
+}
+
+static struct dentry *failed_creating(struct dentry *dentry)
+{
+ mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
+ dput(dentry);
+ simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+ return NULL;
+}
+
+static struct dentry *end_creating(struct dentry *dentry)
+{
+ mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
return dentry;
}
@@ -384,19 +313,71 @@ struct dentry *debugfs_create_file(const char *name, umode_t mode,
struct dentry *parent, void *data,
const struct file_operations *fops)
{
- switch (mode & S_IFMT) {
- case S_IFREG:
- case 0:
- break;
- default:
- BUG();
- }
+ struct dentry *dentry;
+ struct inode *inode;
+
+ if (!(mode & S_IFMT))
+ mode |= S_IFREG;
+ BUG_ON(!S_ISREG(mode));
+ dentry = start_creating(name, parent);
+
+ if (IS_ERR(dentry))
+ return NULL;
- return __create_file(name, mode, parent, data, fops);
+ inode = debugfs_get_inode(dentry->d_sb);
+ if (unlikely(!inode))
+ return failed_creating(dentry);
+
+ inode->i_mode = mode;
+ inode->i_fop = fops ? fops : &debugfs_file_operations;
+ inode->i_private = data;
+ d_instantiate(dentry, inode);
+ fsnotify_create(dentry->d_parent->d_inode, dentry);
+ return end_creating(dentry);
}
EXPORT_SYMBOL_GPL(debugfs_create_file);
/**
+ * debugfs_create_file_size - create a file in the debugfs filesystem
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have.
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ * directory dentry if set. If this parameter is NULL, then the
+ * file will be created in the root of the debugfs filesystem.
+ * @data: a pointer to something that the caller will want to get to later
+ * on. The inode.i_private pointer will point to this value on
+ * the open() call.
+ * @fops: a pointer to a struct file_operations that should be used for
+ * this file.
+ * @file_size: initial file size
+ *
+ * This is the basic "create a file" function for debugfs. It allows for a
+ * wide range of flexibility in creating a file, or a directory (if you want
+ * to create a directory, the debugfs_create_dir() function is
+ * recommended to be used instead.)
+ *
+ * This function will return a pointer to a dentry if it succeeds. This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.) If an error occurs, %NULL will be returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.
+ */
+struct dentry *debugfs_create_file_size(const char *name, umode_t mode,
+ struct dentry *parent, void *data,
+ const struct file_operations *fops,
+ loff_t file_size)
+{
+ struct dentry *de = debugfs_create_file(name, mode, parent, data, fops);
+
+ if (de)
+ de->d_inode->i_size = file_size;
+ return de;
+}
+EXPORT_SYMBOL_GPL(debugfs_create_file_size);
+
+/**
* debugfs_create_dir - create a directory in the debugfs filesystem
* @name: a pointer to a string containing the name of the directory to
* create.
@@ -416,12 +397,65 @@ EXPORT_SYMBOL_GPL(debugfs_create_file);
*/
struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
{
- return __create_file(name, S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
- parent, NULL, NULL);
+ struct dentry *dentry = start_creating(name, parent);
+ struct inode *inode;
+
+ if (IS_ERR(dentry))
+ return NULL;
+
+ inode = debugfs_get_inode(dentry->d_sb);
+ if (unlikely(!inode))
+ return failed_creating(dentry);
+
+ inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+ inode->i_op = &simple_dir_inode_operations;
+ inode->i_fop = &simple_dir_operations;
+
+ /* directory inodes start off with i_nlink == 2 (for "." entry) */
+ inc_nlink(inode);
+ d_instantiate(dentry, inode);
+ inc_nlink(dentry->d_parent->d_inode);
+ fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
+ return end_creating(dentry);
}
EXPORT_SYMBOL_GPL(debugfs_create_dir);
/**
+ * debugfs_create_automount - create automount point in the debugfs filesystem
+ * @name: a pointer to a string containing the name of the file to create.
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ * directory dentry if set. If this parameter is NULL, then the
+ * file will be created in the root of the debugfs filesystem.
+ * @f: function to be called when pathname resolution steps on that one.
+ * @data: opaque argument to pass to f().
+ *
+ * @f should return what ->d_automount() would.
+ */
+struct dentry *debugfs_create_automount(const char *name,
+ struct dentry *parent,
+ struct vfsmount *(*f)(void *),
+ void *data)
+{
+ struct dentry *dentry = start_creating(name, parent);
+ struct inode *inode;
+
+ if (IS_ERR(dentry))
+ return NULL;
+
+ inode = debugfs_get_inode(dentry->d_sb);
+ if (unlikely(!inode))
+ return failed_creating(dentry);
+
+ inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+ inode->i_flags |= S_AUTOMOUNT;
+ inode->i_private = data;
+ dentry->d_fsdata = (void *)f;
+ d_instantiate(dentry, inode);
+ return end_creating(dentry);
+}
+EXPORT_SYMBOL(debugfs_create_automount);
+
+/**
* debugfs_create_symlink- create a symbolic link in the debugfs filesystem
* @name: a pointer to a string containing the name of the symbolic link to
* create.
@@ -447,17 +481,28 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir);
struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
const char *target)
{
- struct dentry *result;
- char *link;
-
- link = kstrdup(target, GFP_KERNEL);
+ struct dentry *dentry;
+ struct inode *inode;
+ char *link = kstrdup(target, GFP_KERNEL);
if (!link)
return NULL;
- result = __create_file(name, S_IFLNK | S_IRWXUGO, parent, link, NULL);
- if (!result)
+ dentry = start_creating(name, parent);
+ if (IS_ERR(dentry)) {
kfree(link);
- return result;
+ return NULL;
+ }
+
+ inode = debugfs_get_inode(dentry->d_sb);
+ if (unlikely(!inode)) {
+ kfree(link);
+ return failed_creating(dentry);
+ }
+ inode->i_mode = S_IFLNK | S_IRWXUGO;
+ inode->i_op = &debugfs_link_operations;
+ inode->i_private = link;
+ d_instantiate(dentry, inode);
+ return end_creating(dentry);
}
EXPORT_SYMBOL_GPL(debugfs_create_symlink);
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index e7cfbaf8d0e2..1e6e227134d7 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -56,13 +56,8 @@ static int send_data(struct sk_buff *skb)
{
struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
void *data = genlmsg_data(genlhdr);
- int rv;
- rv = genlmsg_end(skb, data);
- if (rv < 0) {
- nlmsg_free(skb);
- return rv;
- }
+ genlmsg_end(skb, data);
return genlmsg_unicast(&init_net, skb, listener_nlportid);
}
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 2bc2c87f35e7..5718cb9f7273 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -37,20 +37,6 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
iput(toput_inode);
}
-static void drop_slab(void)
-{
- int nr_objects;
-
- do {
- int nid;
-
- nr_objects = 0;
- for_each_online_node(nid)
- nr_objects += shrink_node_slabs(GFP_KERNEL, nid,
- 1000, 1000);
- } while (nr_objects > 10);
-}
-
int drop_caches_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 1686dc2da9fd..34b36a504059 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -67,7 +67,6 @@ static int ecryptfs_inode_set(struct inode *inode, void *opaque)
inode->i_ino = lower_inode->i_ino;
inode->i_version++;
inode->i_mapping->a_ops = &ecryptfs_aops;
- inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
if (S_ISLNK(inode->i_mode))
inode->i_op = &ecryptfs_symlink_iops;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index d9eb84bda559..1895d60f4122 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -520,7 +520,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
goto out;
}
- rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
+ rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs");
if (rc)
goto out1;
diff --git a/fs/efivarfs/Kconfig b/fs/efivarfs/Kconfig
index 367bbb10c543..c2499ef174a2 100644
--- a/fs/efivarfs/Kconfig
+++ b/fs/efivarfs/Kconfig
@@ -1,6 +1,7 @@
config EFIVAR_FS
tristate "EFI Variable filesystem"
depends on EFI
+ default m
help
efivarfs is a replacement filesystem for the old EFI
variable support via sysfs, as it doesn't suffer from the
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 6dad1176ec52..ddbce42548c9 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -140,7 +140,7 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
name[len] = '-';
- efi_guid_unparse(&entry->var.VendorGuid, name + len + 1);
+ efi_guid_to_str(&entry->var.VendorGuid, name + len + 1);
name[len + EFI_VARIABLE_GUID_LEN+1] = '\0';
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 4b0a226024fa..8d0c0df01854 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -118,18 +118,18 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait)
{
struct eventfd_ctx *ctx = file->private_data;
unsigned int events = 0;
- unsigned long flags;
+ u64 count;
poll_wait(file, &ctx->wqh, wait);
+ smp_rmb();
+ count = ctx->count;
- spin_lock_irqsave(&ctx->wqh.lock, flags);
- if (ctx->count > 0)
+ if (count > 0)
events |= POLLIN;
- if (ctx->count == ULLONG_MAX)
+ if (count == ULLONG_MAX)
events |= POLLERR;
- if (ULLONG_MAX - 1 > ctx->count)
+ if (ULLONG_MAX - 1 > count)
events |= POLLOUT;
- spin_unlock_irqrestore(&ctx->wqh.lock, flags);
return events;
}
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index d77f94491352..1e009cad8d5c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1639,9 +1639,9 @@ fetch_events:
spin_lock_irqsave(&ep->lock, flags);
}
- __remove_wait_queue(&ep->wq, &wait);
- set_current_state(TASK_RUNNING);
+ __remove_wait_queue(&ep->wq, &wait);
+ __set_current_state(TASK_RUNNING);
}
check_events:
/* Is it worth to try to dig for events ? */
diff --git a/fs/exec.c b/fs/exec.c
index ad8798e26be9..c7f9b733406d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -794,8 +794,14 @@ exit:
struct file *open_exec(const char *name)
{
- struct filename tmp = { .name = name };
- return do_open_execat(AT_FDCWD, &tmp, 0);
+ struct filename *filename = getname_kernel(name);
+ struct file *f = ERR_CAST(filename);
+
+ if (!IS_ERR(filename)) {
+ f = do_open_execat(AT_FDCWD, filename, 0);
+ putname(filename);
+ }
+ return f;
}
EXPORT_SYMBOL(open_exec);
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index f1d3d4eb8c4f..a198e94813fe 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -985,7 +985,6 @@ const struct address_space_operations exofs_aops = {
.direct_IO = exofs_direct_IO,
/* With these NULL has special meaning or default is not exported */
- .get_xip_mem = NULL,
.migratepage = NULL,
.launder_page = NULL,
.is_partially_uptodate = NULL,
@@ -1214,7 +1213,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
}
- inode->i_mapping->backing_dev_info = sb->s_bdi;
if (S_ISREG(inode->i_mode)) {
inode->i_op = &exofs_file_inode_operations;
inode->i_fop = &exofs_file_operations;
@@ -1314,7 +1312,6 @@ struct inode *exofs_new_inode(struct inode *dir, umode_t mode)
set_obj_2bcreated(oi);
- inode->i_mapping->backing_dev_info = sb->s_bdi;
inode_init_owner(inode, dir, mode);
inode->i_ino = sbi->s_nextid++;
inode->i_blkbits = EXOFS_BLKSHIFT;
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 95965503afcb..fcc2e565f540 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -836,7 +836,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
goto free_sbi;
}
- ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY);
+ ret = bdi_setup_and_register(&sbi->bdi, "exofs");
if (ret) {
EXOFS_DBGMSG("Failed to bdi_setup_and_register\n");
dput(sb->s_root);
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index 14a6780fd034..c634874e12d9 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -42,14 +42,3 @@ config EXT2_FS_SECURITY
If you are not using a security module that requires using
extended attributes for file security labels, say N.
-
-config EXT2_FS_XIP
- bool "Ext2 execute in place support"
- depends on EXT2_FS && MMU
- help
- Execute in place can be used on memory-backed block devices. If you
- enable this option, you can select to mount block devices which are
- capable of this feature without using the page cache.
-
- If you do not use a block device that is capable of using this,
- or if unsure, say N.
diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile
index f42af45cfd88..445b0e996a12 100644
--- a/fs/ext2/Makefile
+++ b/fs/ext2/Makefile
@@ -10,4 +10,3 @@ ext2-y := balloc.o dir.o file.o ialloc.o inode.o \
ext2-$(CONFIG_EXT2_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o
ext2-$(CONFIG_EXT2_FS_POSIX_ACL) += acl.o
ext2-$(CONFIG_EXT2_FS_SECURITY) += xattr_security.o
-ext2-$(CONFIG_EXT2_FS_XIP) += xip.o
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index e4279ead4a05..678f9ab08c48 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -380,10 +380,15 @@ struct ext2_inode {
#define EXT2_MOUNT_NO_UID32 0x000200 /* Disable 32-bit UIDs */
#define EXT2_MOUNT_XATTR_USER 0x004000 /* Extended user attributes */
#define EXT2_MOUNT_POSIX_ACL 0x008000 /* POSIX Access Control Lists */
-#define EXT2_MOUNT_XIP 0x010000 /* Execute in place */
+#define EXT2_MOUNT_XIP 0x010000 /* Obsolete, use DAX */
#define EXT2_MOUNT_USRQUOTA 0x020000 /* user quota */
#define EXT2_MOUNT_GRPQUOTA 0x040000 /* group quota */
#define EXT2_MOUNT_RESERVATION 0x080000 /* Preallocation */
+#ifdef CONFIG_FS_DAX
+#define EXT2_MOUNT_DAX 0x100000 /* Direct Access */
+#else
+#define EXT2_MOUNT_DAX 0
+#endif
#define clear_opt(o, opt) o &= ~EXT2_MOUNT_##opt
@@ -788,11 +793,10 @@ extern int ext2_fsync(struct file *file, loff_t start, loff_t end,
int datasync);
extern const struct inode_operations ext2_file_inode_operations;
extern const struct file_operations ext2_file_operations;
-extern const struct file_operations ext2_xip_file_operations;
+extern const struct file_operations ext2_dax_file_operations;
/* inode.c */
extern const struct address_space_operations ext2_aops;
-extern const struct address_space_operations ext2_aops_xip;
extern const struct address_space_operations ext2_nobh_aops;
/* namei.c */
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 7c87b22a7228..e31701713516 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -25,6 +25,36 @@
#include "xattr.h"
#include "acl.h"
+#ifdef CONFIG_FS_DAX
+static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ return dax_fault(vma, vmf, ext2_get_block);
+}
+
+static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ return dax_mkwrite(vma, vmf, ext2_get_block);
+}
+
+static const struct vm_operations_struct ext2_dax_vm_ops = {
+ .fault = ext2_dax_fault,
+ .page_mkwrite = ext2_dax_mkwrite,
+};
+
+static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ if (!IS_DAX(file_inode(file)))
+ return generic_file_mmap(file, vma);
+
+ file_accessed(file);
+ vma->vm_ops = &ext2_dax_vm_ops;
+ vma->vm_flags |= VM_MIXEDMAP;
+ return 0;
+}
+#else
+#define ext2_file_mmap generic_file_mmap
+#endif
+
/*
* Called when filp is released. This happens when all file descriptors
* for a single struct file are closed. Note that different open() calls
@@ -70,7 +100,7 @@ const struct file_operations ext2_file_operations = {
#ifdef CONFIG_COMPAT
.compat_ioctl = ext2_compat_ioctl,
#endif
- .mmap = generic_file_mmap,
+ .mmap = ext2_file_mmap,
.open = dquot_file_open,
.release = ext2_release_file,
.fsync = ext2_fsync,
@@ -78,16 +108,18 @@ const struct file_operations ext2_file_operations = {
.splice_write = iter_file_splice_write,
};
-#ifdef CONFIG_EXT2_FS_XIP
-const struct file_operations ext2_xip_file_operations = {
+#ifdef CONFIG_FS_DAX
+const struct file_operations ext2_dax_file_operations = {
.llseek = generic_file_llseek,
- .read = xip_file_read,
- .write = xip_file_write,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = generic_file_write_iter,
.unlocked_ioctl = ext2_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = ext2_compat_ioctl,
#endif
- .mmap = xip_file_mmap,
+ .mmap = ext2_file_mmap,
.open = dquot_file_open,
.release = ext2_release_file,
.fsync = ext2_fsync,
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 7d66fb0e4cca..6c14bb8322fa 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -170,7 +170,7 @@ static void ext2_preread_inode(struct inode *inode)
struct ext2_group_desc * gdp;
struct backing_dev_info *bdi;
- bdi = inode->i_mapping->backing_dev_info;
+ bdi = inode_to_bdi(inode);
if (bdi_read_congested(bdi))
return;
if (bdi_write_congested(bdi))
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 36d35c36311d..6434bc000125 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -34,7 +34,6 @@
#include <linux/aio.h>
#include "ext2.h"
#include "acl.h"
-#include "xip.h"
#include "xattr.h"
static int __ext2_write_inode(struct inode *inode, int do_sync);
@@ -731,12 +730,14 @@ static int ext2_get_blocks(struct inode *inode,
goto cleanup;
}
- if (ext2_use_xip(inode->i_sb)) {
+ if (IS_DAX(inode)) {
/*
- * we need to clear the block
+ * block must be initialised before we put it in the tree
+ * so that it's not found by another thread before it's
+ * initialised
*/
- err = ext2_clear_xip_target (inode,
- le32_to_cpu(chain[depth-1].key));
+ err = dax_clear_blocks(inode, le32_to_cpu(chain[depth-1].key),
+ 1 << inode->i_blkbits);
if (err) {
mutex_unlock(&ei->truncate_mutex);
goto cleanup;
@@ -859,7 +860,12 @@ ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
size_t count = iov_iter_count(iter);
ssize_t ret;
- ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext2_get_block);
+ if (IS_DAX(inode))
+ ret = dax_do_io(rw, iocb, inode, iter, offset, ext2_get_block,
+ NULL, DIO_LOCKING);
+ else
+ ret = blockdev_direct_IO(rw, iocb, inode, iter, offset,
+ ext2_get_block);
if (ret < 0 && (rw & WRITE))
ext2_write_failed(mapping, offset + count);
return ret;
@@ -885,11 +891,6 @@ const struct address_space_operations ext2_aops = {
.error_remove_page = generic_error_remove_page,
};
-const struct address_space_operations ext2_aops_xip = {
- .bmap = ext2_bmap,
- .get_xip_mem = ext2_get_xip_mem,
-};
-
const struct address_space_operations ext2_nobh_aops = {
.readpage = ext2_readpage,
.readpages = ext2_readpages,
@@ -1201,8 +1202,8 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
inode_dio_wait(inode);
- if (mapping_is_xip(inode->i_mapping))
- error = xip_truncate_page(inode->i_mapping, newsize);
+ if (IS_DAX(inode))
+ error = dax_truncate_page(inode, newsize, ext2_get_block);
else if (test_opt(inode->i_sb, NOBH))
error = nobh_truncate_page(inode->i_mapping,
newsize, ext2_get_block);
@@ -1273,7 +1274,8 @@ void ext2_set_inode_flags(struct inode *inode)
{
unsigned int flags = EXT2_I(inode)->i_flags;
- inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+ inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
+ S_DIRSYNC | S_DAX);
if (flags & EXT2_SYNC_FL)
inode->i_flags |= S_SYNC;
if (flags & EXT2_APPEND_FL)
@@ -1284,6 +1286,8 @@ void ext2_set_inode_flags(struct inode *inode)
inode->i_flags |= S_NOATIME;
if (flags & EXT2_DIRSYNC_FL)
inode->i_flags |= S_DIRSYNC;
+ if (test_opt(inode->i_sb, DAX))
+ inode->i_flags |= S_DAX;
}
/* Propagate flags from i_flags to EXT2_I(inode)->i_flags */
@@ -1384,9 +1388,9 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
if (S_ISREG(inode->i_mode)) {
inode->i_op = &ext2_file_inode_operations;
- if (ext2_use_xip(inode->i_sb)) {
- inode->i_mapping->a_ops = &ext2_aops_xip;
- inode->i_fop = &ext2_xip_file_operations;
+ if (test_opt(inode->i_sb, DAX)) {
+ inode->i_mapping->a_ops = &ext2_aops;
+ inode->i_fop = &ext2_dax_file_operations;
} else if (test_opt(inode->i_sb, NOBH)) {
inode->i_mapping->a_ops = &ext2_nobh_aops;
inode->i_fop = &ext2_file_operations;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index c268d0af1db9..148f6e3789ea 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -35,7 +35,6 @@
#include "ext2.h"
#include "xattr.h"
#include "acl.h"
-#include "xip.h"
static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode)
{
@@ -105,9 +104,9 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode
return PTR_ERR(inode);
inode->i_op = &ext2_file_inode_operations;
- if (ext2_use_xip(inode->i_sb)) {
- inode->i_mapping->a_ops = &ext2_aops_xip;
- inode->i_fop = &ext2_xip_file_operations;
+ if (test_opt(inode->i_sb, DAX)) {
+ inode->i_mapping->a_ops = &ext2_aops;
+ inode->i_fop = &ext2_dax_file_operations;
} else if (test_opt(inode->i_sb, NOBH)) {
inode->i_mapping->a_ops = &ext2_nobh_aops;
inode->i_fop = &ext2_file_operations;
@@ -126,9 +125,9 @@ static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
return PTR_ERR(inode);
inode->i_op = &ext2_file_inode_operations;
- if (ext2_use_xip(inode->i_sb)) {
- inode->i_mapping->a_ops = &ext2_aops_xip;
- inode->i_fop = &ext2_xip_file_operations;
+ if (test_opt(inode->i_sb, DAX)) {
+ inode->i_mapping->a_ops = &ext2_aops;
+ inode->i_fop = &ext2_dax_file_operations;
} else if (test_opt(inode->i_sb, NOBH)) {
inode->i_mapping->a_ops = &ext2_nobh_aops;
inode->i_fop = &ext2_file_operations;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index ae55fddc26a9..d0e746e96511 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -35,7 +35,6 @@
#include "ext2.h"
#include "xattr.h"
#include "acl.h"
-#include "xip.h"
static void ext2_sync_super(struct super_block *sb,
struct ext2_super_block *es, int wait);
@@ -292,9 +291,11 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",grpquota");
#endif
-#if defined(CONFIG_EXT2_FS_XIP)
+#ifdef CONFIG_FS_DAX
if (sbi->s_mount_opt & EXT2_MOUNT_XIP)
seq_puts(seq, ",xip");
+ if (sbi->s_mount_opt & EXT2_MOUNT_DAX)
+ seq_puts(seq, ",dax");
#endif
if (!test_opt(sb, RESERVATION))
@@ -403,7 +404,7 @@ enum {
Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic,
Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug,
Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr,
- Opt_acl, Opt_noacl, Opt_xip, Opt_ignore, Opt_err, Opt_quota,
+ Opt_acl, Opt_noacl, Opt_xip, Opt_dax, Opt_ignore, Opt_err, Opt_quota,
Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation
};
@@ -432,6 +433,7 @@ static const match_table_t tokens = {
{Opt_acl, "acl"},
{Opt_noacl, "noacl"},
{Opt_xip, "xip"},
+ {Opt_dax, "dax"},
{Opt_grpquota, "grpquota"},
{Opt_ignore, "noquota"},
{Opt_quota, "quota"},
@@ -559,10 +561,14 @@ static int parse_options(char *options, struct super_block *sb)
break;
#endif
case Opt_xip:
-#ifdef CONFIG_EXT2_FS_XIP
- set_opt (sbi->s_mount_opt, XIP);
+ ext2_msg(sb, KERN_INFO, "use dax instead of xip");
+ set_opt(sbi->s_mount_opt, XIP);
+ /* Fall through */
+ case Opt_dax:
+#ifdef CONFIG_FS_DAX
+ set_opt(sbi->s_mount_opt, DAX);
#else
- ext2_msg(sb, KERN_INFO, "xip option not supported");
+ ext2_msg(sb, KERN_INFO, "dax option not supported");
#endif
break;
@@ -877,9 +883,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ?
MS_POSIXACL : 0);
- ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset
- EXT2_MOUNT_XIP if not */
-
if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV &&
(EXT2_HAS_COMPAT_FEATURE(sb, ~0U) ||
EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
@@ -909,11 +912,17 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
- if (ext2_use_xip(sb) && blocksize != PAGE_SIZE) {
- if (!silent)
+ if (sbi->s_mount_opt & EXT2_MOUNT_DAX) {
+ if (blocksize != PAGE_SIZE) {
ext2_msg(sb, KERN_ERR,
- "error: unsupported blocksize for xip");
- goto failed_mount;
+ "error: unsupported blocksize for dax");
+ goto failed_mount;
+ }
+ if (!sb->s_bdev->bd_disk->fops->direct_access) {
+ ext2_msg(sb, KERN_ERR,
+ "error: device does not support dax");
+ goto failed_mount;
+ }
}
/* If the blocksize doesn't match, re-read the thing.. */
@@ -1259,7 +1268,6 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
{
struct ext2_sb_info * sbi = EXT2_SB(sb);
struct ext2_super_block * es;
- unsigned long old_mount_opt = sbi->s_mount_opt;
struct ext2_mount_options old_opts;
unsigned long old_sb_flags;
int err;
@@ -1284,22 +1292,11 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
- ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset
- EXT2_MOUNT_XIP if not */
-
- if ((ext2_use_xip(sb)) && (sb->s_blocksize != PAGE_SIZE)) {
- ext2_msg(sb, KERN_WARNING,
- "warning: unsupported blocksize for xip");
- err = -EINVAL;
- goto restore_opts;
- }
-
es = sbi->s_es;
- if ((sbi->s_mount_opt ^ old_mount_opt) & EXT2_MOUNT_XIP) {
+ if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT2_MOUNT_DAX) {
ext2_msg(sb, KERN_WARNING, "warning: refusing change of "
- "xip flag with busy inodes while remounting");
- sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
- sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
+ "dax flag with busy inodes while remounting");
+ sbi->s_mount_opt ^= EXT2_MOUNT_DAX;
}
if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
spin_unlock(&sbi->s_lock);
diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
deleted file mode 100644
index e98171a11cfe..000000000000
--- a/fs/ext2/xip.c
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * linux/fs/ext2/xip.c
- *
- * Copyright (C) 2005 IBM Corporation
- * Author: Carsten Otte (cotte@de.ibm.com)
- */
-
-#include <linux/mm.h>
-#include <linux/fs.h>
-#include <linux/genhd.h>
-#include <linux/buffer_head.h>
-#include <linux/blkdev.h>
-#include "ext2.h"
-#include "xip.h"
-
-static inline int
-__inode_direct_access(struct inode *inode, sector_t block,
- void **kaddr, unsigned long *pfn)
-{
- struct block_device *bdev = inode->i_sb->s_bdev;
- const struct block_device_operations *ops = bdev->bd_disk->fops;
- sector_t sector;
-
- sector = block * (PAGE_SIZE / 512); /* ext2 block to bdev sector */
-
- BUG_ON(!ops->direct_access);
- return ops->direct_access(bdev, sector, kaddr, pfn);
-}
-
-static inline int
-__ext2_get_block(struct inode *inode, pgoff_t pgoff, int create,
- sector_t *result)
-{
- struct buffer_head tmp;
- int rc;
-
- memset(&tmp, 0, sizeof(struct buffer_head));
- tmp.b_size = 1 << inode->i_blkbits;
- rc = ext2_get_block(inode, pgoff, &tmp, create);
- *result = tmp.b_blocknr;
-
- /* did we get a sparse block (hole in the file)? */
- if (!tmp.b_blocknr && !rc) {
- BUG_ON(create);
- rc = -ENODATA;
- }
-
- return rc;
-}
-
-int
-ext2_clear_xip_target(struct inode *inode, sector_t block)
-{
- void *kaddr;
- unsigned long pfn;
- int rc;
-
- rc = __inode_direct_access(inode, block, &kaddr, &pfn);
- if (!rc)
- clear_page(kaddr);
- return rc;
-}
-
-void ext2_xip_verify_sb(struct super_block *sb)
-{
- struct ext2_sb_info *sbi = EXT2_SB(sb);
-
- if ((sbi->s_mount_opt & EXT2_MOUNT_XIP) &&
- !sb->s_bdev->bd_disk->fops->direct_access) {
- sbi->s_mount_opt &= (~EXT2_MOUNT_XIP);
- ext2_msg(sb, KERN_WARNING,
- "warning: ignoring xip option - "
- "not supported by bdev");
- }
-}
-
-int ext2_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create,
- void **kmem, unsigned long *pfn)
-{
- int rc;
- sector_t block;
-
- /* first, retrieve the sector number */
- rc = __ext2_get_block(mapping->host, pgoff, create, &block);
- if (rc)
- return rc;
-
- /* retrieve address of the target data */
- rc = __inode_direct_access(mapping->host, block, kmem, pfn);
- return rc;
-}
diff --git a/fs/ext2/xip.h b/fs/ext2/xip.h
deleted file mode 100644
index 18b34d2f31b3..000000000000
--- a/fs/ext2/xip.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * linux/fs/ext2/xip.h
- *
- * Copyright (C) 2005 IBM Corporation
- * Author: Carsten Otte (cotte@de.ibm.com)
- */
-
-#ifdef CONFIG_EXT2_FS_XIP
-extern void ext2_xip_verify_sb (struct super_block *);
-extern int ext2_clear_xip_target (struct inode *, sector_t);
-
-static inline int ext2_use_xip (struct super_block *sb)
-{
- struct ext2_sb_info *sbi = EXT2_SB(sb);
- return (sbi->s_mount_opt & EXT2_MOUNT_XIP);
-}
-int ext2_get_xip_mem(struct address_space *, pgoff_t, int,
- void **, unsigned long *);
-#define mapping_is_xip(map) unlikely(map->a_ops->get_xip_mem)
-#else
-#define mapping_is_xip(map) 0
-#define ext2_xip_verify_sb(sb) do { } while (0)
-#define ext2_use_xip(sb) 0
-#define ext2_clear_xip_target(inode, chain) 0
-#define ext2_get_xip_mem NULL
-#endif
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 9b4e7d750d4f..d4dbf3c259b3 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -466,6 +466,8 @@ static void ext3_put_super (struct super_block * sb)
}
sb->s_fs_info = NULL;
kfree(sbi->s_blockgroup_lock);
+ mutex_destroy(&sbi->s_orphan_lock);
+ mutex_destroy(&sbi->s_resize_lock);
kfree(sbi);
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a75fba67bb1f..982d934fd9ac 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -965,6 +965,11 @@ struct ext4_inode_info {
#define EXT4_MOUNT_ERRORS_MASK 0x00070
#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
+#ifdef CONFIG_FS_DAX
+#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */
+#else
+#define EXT4_MOUNT_DAX 0
+#endif
#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */
@@ -2578,6 +2583,7 @@ extern const struct file_operations ext4_dir_operations;
/* file.c */
extern const struct inode_operations ext4_file_inode_operations;
extern const struct file_operations ext4_file_operations;
+extern const struct file_operations ext4_dax_file_operations;
extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
/* inline.c */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e5d3eadf47b1..bed43081720f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5166,8 +5166,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
/* fallback to generic here if not in extents fmt */
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- return __generic_block_fiemap(inode, fieinfo, start, len,
- ext4_get_block);
+ return generic_block_fiemap(inode, fieinfo, start, len,
+ ext4_get_block);
if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
return -EBADR;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 513c12cf444c..33a09da16c9c 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -95,7 +95,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(iocb->ki_filp);
struct mutex *aio_mutex = NULL;
struct blk_plug plug;
- int o_direct = file->f_flags & O_DIRECT;
+ int o_direct = io_is_direct(file);
int overwrite = 0;
size_t length = iov_iter_count(from);
ssize_t ret;
@@ -191,17 +191,41 @@ errout:
return ret;
}
+#ifdef CONFIG_FS_DAX
+static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ return dax_fault(vma, vmf, ext4_get_block);
+ /* Is this the right get_block? */
+}
+
+static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ return dax_mkwrite(vma, vmf, ext4_get_block);
+}
+
+static const struct vm_operations_struct ext4_dax_vm_ops = {
+ .fault = ext4_dax_fault,
+ .page_mkwrite = ext4_dax_mkwrite,
+};
+#else
+#define ext4_dax_vm_ops ext4_file_vm_ops
+#endif
+
static const struct vm_operations_struct ext4_file_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = ext4_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
};
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
file_accessed(file);
- vma->vm_ops = &ext4_file_vm_ops;
+ if (IS_DAX(file_inode(file))) {
+ vma->vm_ops = &ext4_dax_vm_ops;
+ vma->vm_flags |= VM_MIXEDMAP;
+ } else {
+ vma->vm_ops = &ext4_file_vm_ops;
+ }
return 0;
}
@@ -273,19 +297,24 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
* we determine this extent as a data or a hole according to whether the
* page cache has data or not.
*/
-static int ext4_find_unwritten_pgoff(struct inode *inode, int whence,
- loff_t endoff, loff_t *offset)
+static int ext4_find_unwritten_pgoff(struct inode *inode,
+ int whence,
+ struct ext4_map_blocks *map,
+ loff_t *offset)
{
struct pagevec pvec;
+ unsigned int blkbits;
pgoff_t index;
pgoff_t end;
+ loff_t endoff;
loff_t startoff;
loff_t lastoff;
int found = 0;
+ blkbits = inode->i_sb->s_blocksize_bits;
startoff = *offset;
lastoff = startoff;
-
+ endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
index = startoff >> PAGE_CACHE_SHIFT;
end = endoff >> PAGE_CACHE_SHIFT;
@@ -403,144 +432,147 @@ out:
static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
{
struct inode *inode = file->f_mapping->host;
- struct fiemap_extent_info fie;
- struct fiemap_extent ext[2];
- loff_t next;
- int i, ret = 0;
+ struct ext4_map_blocks map;
+ struct extent_status es;
+ ext4_lblk_t start, last, end;
+ loff_t dataoff, isize;
+ int blkbits;
+ int ret = 0;
mutex_lock(&inode->i_mutex);
- if (offset >= inode->i_size) {
+
+ isize = i_size_read(inode);
+ if (offset >= isize) {
mutex_unlock(&inode->i_mutex);
return -ENXIO;
}
- fie.fi_flags = 0;
- fie.fi_extents_max = 2;
- fie.fi_extents_start = (struct fiemap_extent __user *) &ext;
- while (1) {
- mm_segment_t old_fs = get_fs();
-
- fie.fi_extents_mapped = 0;
- memset(ext, 0, sizeof(*ext) * fie.fi_extents_max);
-
- set_fs(get_ds());
- ret = ext4_fiemap(inode, &fie, offset, maxsize - offset);
- set_fs(old_fs);
- if (ret)
+
+ blkbits = inode->i_sb->s_blocksize_bits;
+ start = offset >> blkbits;
+ last = start;
+ end = isize >> blkbits;
+ dataoff = offset;
+
+ do {
+ map.m_lblk = last;
+ map.m_len = end - last + 1;
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
+ if (last != start)
+ dataoff = (loff_t)last << blkbits;
break;
+ }
- /* No extents found, EOF */
- if (!fie.fi_extents_mapped) {
- ret = -ENXIO;
+ /*
+ * If there is a delay extent at this offset,
+ * it will be as a data.
+ */
+ ext4_es_find_delayed_extent_range(inode, last, last, &es);
+ if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
+ if (last != start)
+ dataoff = (loff_t)last << blkbits;
break;
}
- for (i = 0; i < fie.fi_extents_mapped; i++) {
- next = (loff_t)(ext[i].fe_length + ext[i].fe_logical);
- if (offset < (loff_t)ext[i].fe_logical)
- offset = (loff_t)ext[i].fe_logical;
- /*
- * If extent is not unwritten, then it contains valid
- * data, mapped or delayed.
- */
- if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN))
- goto out;
+ /*
+ * If there is a unwritten extent at this offset,
+ * it will be as a data or a hole according to page
+ * cache that has data or not.
+ */
+ if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+ int unwritten;
+ unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
+ &map, &dataoff);
+ if (unwritten)
+ break;
+ }
- /*
- * If there is a unwritten extent at this offset,
- * it will be as a data or a hole according to page
- * cache that has data or not.
- */
- if (ext4_find_unwritten_pgoff(inode, SEEK_DATA,
- next, &offset))
- goto out;
+ last++;
+ dataoff = (loff_t)last << blkbits;
+ } while (last <= end);
- if (ext[i].fe_flags & FIEMAP_EXTENT_LAST) {
- ret = -ENXIO;
- goto out;
- }
- offset = next;
- }
- }
- if (offset > inode->i_size)
- offset = inode->i_size;
-out:
mutex_unlock(&inode->i_mutex);
- if (ret)
- return ret;
- return vfs_setpos(file, offset, maxsize);
+ if (dataoff > isize)
+ return -ENXIO;
+
+ return vfs_setpos(file, dataoff, maxsize);
}
/*
- * ext4_seek_hole() retrieves the offset for SEEK_HOLE
+ * ext4_seek_hole() retrieves the offset for SEEK_HOLE.
*/
static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
{
struct inode *inode = file->f_mapping->host;
- struct fiemap_extent_info fie;
- struct fiemap_extent ext[2];
- loff_t next;
- int i, ret = 0;
+ struct ext4_map_blocks map;
+ struct extent_status es;
+ ext4_lblk_t start, last, end;
+ loff_t holeoff, isize;
+ int blkbits;
+ int ret = 0;
mutex_lock(&inode->i_mutex);
- if (offset >= inode->i_size) {
+
+ isize = i_size_read(inode);
+ if (offset >= isize) {
mutex_unlock(&inode->i_mutex);
return -ENXIO;
}
- fie.fi_flags = 0;
- fie.fi_extents_max = 2;
- fie.fi_extents_start = (struct fiemap_extent __user *)&ext;
- while (1) {
- mm_segment_t old_fs = get_fs();
+ blkbits = inode->i_sb->s_blocksize_bits;
+ start = offset >> blkbits;
+ last = start;
+ end = isize >> blkbits;
+ holeoff = offset;
- fie.fi_extents_mapped = 0;
- memset(ext, 0, sizeof(*ext));
-
- set_fs(get_ds());
- ret = ext4_fiemap(inode, &fie, offset, maxsize - offset);
- set_fs(old_fs);
- if (ret)
- break;
+ do {
+ map.m_lblk = last;
+ map.m_len = end - last + 1;
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
+ last += ret;
+ holeoff = (loff_t)last << blkbits;
+ continue;
+ }
- /* No extents found */
- if (!fie.fi_extents_mapped)
- break;
+ /*
+ * If there is a delay extent at this offset,
+ * we will skip this extent.
+ */
+ ext4_es_find_delayed_extent_range(inode, last, last, &es);
+ if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
+ last = es.es_lblk + es.es_len;
+ holeoff = (loff_t)last << blkbits;
+ continue;
+ }
- for (i = 0; i < fie.fi_extents_mapped; i++) {
- next = (loff_t)(ext[i].fe_logical + ext[i].fe_length);
- /*
- * If extent is not unwritten, then it contains valid
- * data, mapped or delayed.
- */
- if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN)) {
- if (offset < (loff_t)ext[i].fe_logical)
- goto out;
- offset = next;
+ /*
+ * If there is a unwritten extent at this offset,
+ * it will be as a data or a hole according to page
+ * cache that has data or not.
+ */
+ if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+ int unwritten;
+ unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
+ &map, &holeoff);
+ if (!unwritten) {
+ last += ret;
+ holeoff = (loff_t)last << blkbits;
continue;
}
- /*
- * If there is a unwritten extent at this offset,
- * it will be as a data or a hole according to page
- * cache that has data or not.
- */
- if (ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
- next, &offset))
- goto out;
-
- offset = next;
- if (ext[i].fe_flags & FIEMAP_EXTENT_LAST)
- goto out;
}
- }
- if (offset > inode->i_size)
- offset = inode->i_size;
-out:
+
+ /* find a hole */
+ break;
+ } while (last <= end);
+
mutex_unlock(&inode->i_mutex);
- if (ret)
- return ret;
- return vfs_setpos(file, offset, maxsize);
+ if (holeoff > isize)
+ holeoff = isize;
+
+ return vfs_setpos(file, holeoff, maxsize);
}
/*
@@ -592,6 +624,26 @@ const struct file_operations ext4_file_operations = {
.fallocate = ext4_fallocate,
};
+#ifdef CONFIG_FS_DAX
+const struct file_operations ext4_dax_file_operations = {
+ .llseek = ext4_llseek,
+ .read = new_sync_read,
+ .write = new_sync_write,
+ .read_iter = generic_file_read_iter,
+ .write_iter = ext4_file_write_iter,
+ .unlocked_ioctl = ext4_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ext4_compat_ioctl,
+#endif
+ .mmap = ext4_file_mmap,
+ .open = ext4_file_open,
+ .release = ext4_release_file,
+ .fsync = ext4_sync_file,
+ /* Splice not yet supported with DAX */
+ .fallocate = ext4_fallocate,
+};
+#endif
+
const struct inode_operations ext4_file_inode_operations = {
.setattr = ext4_setattr,
.getattr = ext4_getattr,
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 36b369697a13..6b9878a24182 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -689,14 +689,22 @@ retry:
inode_dio_done(inode);
goto locked;
}
- ret = __blockdev_direct_IO(rw, iocb, inode,
- inode->i_sb->s_bdev, iter, offset,
- ext4_get_block, NULL, NULL, 0);
+ if (IS_DAX(inode))
+ ret = dax_do_io(rw, iocb, inode, iter, offset,
+ ext4_get_block, NULL, 0);
+ else
+ ret = __blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iter, offset,
+ ext4_get_block, NULL, NULL, 0);
inode_dio_done(inode);
} else {
locked:
- ret = blockdev_direct_IO(rw, iocb, inode, iter,
- offset, ext4_get_block);
+ if (IS_DAX(inode))
+ ret = dax_do_io(rw, iocb, inode, iter, offset,
+ ext4_get_block, NULL, DIO_LOCKING);
+ else
+ ret = blockdev_direct_IO(rw, iocb, inode, iter,
+ offset, ext4_get_block);
if (unlikely((rw & WRITE) && ret < 0)) {
loff_t isize = i_size_read(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5653fa42930b..85404f15e53a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -657,6 +657,18 @@ has_zeroout:
return retval;
}
+static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
+{
+ struct inode *inode = bh->b_assoc_map->host;
+ /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
+ loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
+ int err;
+ if (!uptodate)
+ return;
+ WARN_ON(!buffer_unwritten(bh));
+ err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
+}
+
/* Maximum number of blocks we map for direct IO at once. */
#define DIO_MAX_BLOCKS 4096
@@ -694,6 +706,11 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
map_bh(bh, inode->i_sb, map.m_pblk);
bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+ if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
+ bh->b_assoc_map = inode->i_mapping;
+ bh->b_private = (void *)(unsigned long)iblock;
+ bh->b_end_io = ext4_end_io_unwritten;
+ }
if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
set_buffer_defer_completion(bh);
bh->b_size = inode->i_sb->s_blocksize * map.m_len;
@@ -3010,13 +3027,14 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
get_block_func = ext4_get_block_write;
dio_flags = DIO_LOCKING;
}
- ret = __blockdev_direct_IO(rw, iocb, inode,
- inode->i_sb->s_bdev, iter,
- offset,
- get_block_func,
- ext4_end_io_dio,
- NULL,
- dio_flags);
+ if (IS_DAX(inode))
+ ret = dax_do_io(rw, iocb, inode, iter, offset, get_block_func,
+ ext4_end_io_dio, dio_flags);
+ else
+ ret = __blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iter, offset,
+ get_block_func,
+ ext4_end_io_dio, NULL, dio_flags);
/*
* Put our reference to io_end. This can free the io_end structure e.g.
@@ -3180,19 +3198,12 @@ void ext4_set_aops(struct inode *inode)
inode->i_mapping->a_ops = &ext4_aops;
}
-/*
- * ext4_block_zero_page_range() zeros out a mapping of length 'length'
- * starting from file offset 'from'. The range to be zero'd must
- * be contained with in one block. If the specified range exceeds
- * the end of the block it will be shortened to end of the block
- * that cooresponds to 'from'
- */
-static int ext4_block_zero_page_range(handle_t *handle,
+static int __ext4_block_zero_page_range(handle_t *handle,
struct address_space *mapping, loff_t from, loff_t length)
{
ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
unsigned offset = from & (PAGE_CACHE_SIZE-1);
- unsigned blocksize, max, pos;
+ unsigned blocksize, pos;
ext4_lblk_t iblock;
struct inode *inode = mapping->host;
struct buffer_head *bh;
@@ -3205,14 +3216,6 @@ static int ext4_block_zero_page_range(handle_t *handle,
return -ENOMEM;
blocksize = inode->i_sb->s_blocksize;
- max = blocksize - (offset & (blocksize - 1));
-
- /*
- * correct length if it does not fall between
- * 'from' and the end of the block
- */
- if (length > max || length < 0)
- length = max;
iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
@@ -3278,6 +3281,33 @@ unlock:
}
/*
+ * ext4_block_zero_page_range() zeros out a mapping of length 'length'
+ * starting from file offset 'from'. The range to be zero'd must
+ * be contained with in one block. If the specified range exceeds
+ * the end of the block it will be shortened to end of the block
+ * that cooresponds to 'from'
+ */
+static int ext4_block_zero_page_range(handle_t *handle,
+ struct address_space *mapping, loff_t from, loff_t length)
+{
+ struct inode *inode = mapping->host;
+ unsigned offset = from & (PAGE_CACHE_SIZE-1);
+ unsigned blocksize = inode->i_sb->s_blocksize;
+ unsigned max = blocksize - (offset & (blocksize - 1));
+
+ /*
+ * correct length if it does not fall between
+ * 'from' and the end of the block
+ */
+ if (length > max || length < 0)
+ length = max;
+
+ if (IS_DAX(inode))
+ return dax_zero_page_range(inode, from, length, ext4_get_block);
+ return __ext4_block_zero_page_range(handle, mapping, from, length);
+}
+
+/*
* ext4_block_truncate_page() zeroes out a mapping from file offset `from'
* up to the end of the block which corresponds to `from'.
* This required during truncate. We need to physically zero the tail end
@@ -3798,8 +3828,10 @@ void ext4_set_inode_flags(struct inode *inode)
new_fl |= S_NOATIME;
if (flags & EXT4_DIRSYNC_FL)
new_fl |= S_DIRSYNC;
+ if (test_opt(inode->i_sb, DAX))
+ new_fl |= S_DAX;
inode_set_flags(inode, new_fl,
- S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+ S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
}
/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
@@ -4052,7 +4084,10 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
if (S_ISREG(inode->i_mode)) {
inode->i_op = &ext4_file_inode_operations;
- inode->i_fop = &ext4_file_operations;
+ if (test_opt(inode->i_sb, DAX))
+ inode->i_fop = &ext4_dax_file_operations;
+ else
+ inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
} else if (S_ISDIR(inode->i_mode)) {
inode->i_op = &ext4_dir_inode_operations;
@@ -4139,6 +4174,65 @@ static int ext4_inode_blocks_set(handle_t *handle,
return 0;
}
+struct other_inode {
+ unsigned long orig_ino;
+ struct ext4_inode *raw_inode;
+};
+
+static int other_inode_match(struct inode * inode, unsigned long ino,
+ void *data)
+{
+ struct other_inode *oi = (struct other_inode *) data;
+
+ if ((inode->i_ino != ino) ||
+ (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
+ I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
+ ((inode->i_state & I_DIRTY_TIME) == 0))
+ return 0;
+ spin_lock(&inode->i_lock);
+ if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
+ I_DIRTY_SYNC | I_DIRTY_DATASYNC)) == 0) &&
+ (inode->i_state & I_DIRTY_TIME)) {
+ struct ext4_inode_info *ei = EXT4_I(inode);
+
+ inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED);
+ spin_unlock(&inode->i_lock);
+
+ spin_lock(&ei->i_raw_lock);
+ EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode);
+ EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode);
+ EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode);
+ ext4_inode_csum_set(inode, oi->raw_inode, ei);
+ spin_unlock(&ei->i_raw_lock);
+ trace_ext4_other_inode_update_time(inode, oi->orig_ino);
+ return -1;
+ }
+ spin_unlock(&inode->i_lock);
+ return -1;
+}
+
+/*
+ * Opportunistically update the other time fields for other inodes in
+ * the same inode table block.
+ */
+static void ext4_update_other_inodes_time(struct super_block *sb,
+ unsigned long orig_ino, char *buf)
+{
+ struct other_inode oi;
+ unsigned long ino;
+ int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
+ int inode_size = EXT4_INODE_SIZE(sb);
+
+ oi.orig_ino = orig_ino;
+ ino = orig_ino & ~(inodes_per_block - 1);
+ for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
+ if (ino == orig_ino)
+ continue;
+ oi.raw_inode = (struct ext4_inode *) buf;
+ (void) find_inode_nowait(sb, ino, other_inode_match, &oi);
+ }
+}
+
/*
* Post the struct inode info into an on-disk inode location in the
* buffer-cache. This gobbles the caller's reference to the
@@ -4248,10 +4342,11 @@ static int ext4_do_update_inode(handle_t *handle,
cpu_to_le16(ei->i_extra_isize);
}
}
-
ext4_inode_csum_set(inode, raw_inode, ei);
-
spin_unlock(&ei->i_raw_lock);
+ if (inode->i_sb->s_flags & MS_LAZYTIME)
+ ext4_update_other_inodes_time(inode->i_sb, inode->i_ino,
+ bh->b_data);
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
rc = ext4_handle_dirty_metadata(handle, NULL, bh);
@@ -4534,7 +4629,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
* Truncate pagecache after we've waited for commit
* in data=journal mode to make pages freeable.
*/
- truncate_pagecache(inode, inode->i_size);
+ truncate_pagecache(inode, inode->i_size);
}
/*
* We want to call ext4_truncate() even if attr->ia_size ==
@@ -4840,11 +4935,17 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
* If the inode is marked synchronous, we don't honour that here - doing
* so would cause a commit on atime updates, which we don't bother doing.
* We handle synchronous inodes at the highest possible level.
+ *
+ * If only the I_DIRTY_TIME flag is set, we can skip everything. If
+ * I_DIRTY_TIME and I_DIRTY_SYNC is set, the only inode fields we need
+ * to copy into the on-disk inode structure are the timestamp files.
*/
void ext4_dirty_inode(struct inode *inode, int flags)
{
handle_t *handle;
+ if (flags == I_DIRTY_TIME)
+ return;
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle))
goto out;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2291923dae4e..28fe71a2904c 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2235,7 +2235,10 @@ retry:
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
inode->i_op = &ext4_file_inode_operations;
- inode->i_fop = &ext4_file_operations;
+ if (test_opt(inode->i_sb, DAX))
+ inode->i_fop = &ext4_dax_file_operations;
+ else
+ inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
err = ext4_add_nondir(handle, dentry, inode);
if (!err && IS_DIRSYNC(dir))
@@ -2299,7 +2302,10 @@ retry:
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
inode->i_op = &ext4_file_inode_operations;
- inode->i_fop = &ext4_file_operations;
+ if (test_opt(inode->i_sb, DAX))
+ inode->i_fop = &ext4_dax_file_operations;
+ else
+ inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
d_tmpfile(dentry, inode);
err = ext4_orphan_add(handle, inode);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index bf76f405a5f9..8a8ec6293b19 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -24,6 +24,18 @@ int ext4_resize_begin(struct super_block *sb)
return -EPERM;
/*
+ * If we are not using the primary superblock/GDT copy don't resize,
+ * because the user tools have no way of handling this. Probably a
+ * bad time to do it anyways.
+ */
+ if (EXT4_SB(sb)->s_sbh->b_blocknr !=
+ le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
+ ext4_warning(sb, "won't resize using backup superblock at %llu",
+ (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
+ return -EPERM;
+ }
+
+ /*
* We are not allowed to do online-resizing on a filesystem mounted
* with error, because it can destroy the filesystem easily.
*/
@@ -758,18 +770,6 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
"EXT4-fs: ext4_add_new_gdb: adding group block %lu\n",
gdb_num);
- /*
- * If we are not using the primary superblock/GDT copy don't resize,
- * because the user tools have no way of handling this. Probably a
- * bad time to do it anyways.
- */
- if (EXT4_SB(sb)->s_sbh->b_blocknr !=
- le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
- ext4_warning(sb, "won't resize using backup superblock at %llu",
- (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
- return -EPERM;
- }
-
gdb_bh = sb_bread(sb, gdblock);
if (!gdb_bh)
return -EIO;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 43c92b1685cb..1adac6868e6f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -334,7 +334,7 @@ static void save_error_info(struct super_block *sb, const char *func,
static int block_device_ejected(struct super_block *sb)
{
struct inode *bd_inode = sb->s_bdev->bd_inode;
- struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(bd_inode);
return bdi->dev == NULL;
}
@@ -1046,10 +1046,7 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot);
static int ext4_write_info(struct super_block *sb, int type);
static int ext4_quota_on(struct super_block *sb, int type, int format_id,
struct path *path);
-static int ext4_quota_on_sysfile(struct super_block *sb, int type,
- int format_id);
static int ext4_quota_off(struct super_block *sb, int type);
-static int ext4_quota_off_sysfile(struct super_block *sb, int type);
static int ext4_quota_on_mount(struct super_block *sb, int type);
static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
size_t len, loff_t off);
@@ -1084,16 +1081,6 @@ static const struct quotactl_ops ext4_qctl_operations = {
.get_dqblk = dquot_get_dqblk,
.set_dqblk = dquot_set_dqblk
};
-
-static const struct quotactl_ops ext4_qctl_sysfile_operations = {
- .quota_on_meta = ext4_quota_on_sysfile,
- .quota_off = ext4_quota_off_sysfile,
- .quota_sync = dquot_quota_sync,
- .get_info = dquot_get_dqinfo,
- .set_info = dquot_set_dqinfo,
- .get_dqblk = dquot_get_dqblk,
- .set_dqblk = dquot_set_dqblk
-};
#endif
static const struct super_operations ext4_sops = {
@@ -1137,8 +1124,9 @@ enum {
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
- Opt_usrquota, Opt_grpquota, Opt_i_version,
+ Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax,
Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
+ Opt_lazytime, Opt_nolazytime,
Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
Opt_inode_readahead_blks, Opt_journal_ioprio,
Opt_dioread_nolock, Opt_dioread_lock,
@@ -1200,8 +1188,11 @@ static const match_table_t tokens = {
{Opt_barrier, "barrier"},
{Opt_nobarrier, "nobarrier"},
{Opt_i_version, "i_version"},
+ {Opt_dax, "dax"},
{Opt_stripe, "stripe=%u"},
{Opt_delalloc, "delalloc"},
+ {Opt_lazytime, "lazytime"},
+ {Opt_nolazytime, "nolazytime"},
{Opt_nodelalloc, "nodelalloc"},
{Opt_removed, "mblk_io_submit"},
{Opt_removed, "nomblk_io_submit"},
@@ -1384,6 +1375,7 @@ static const struct mount_opts {
{Opt_min_batch_time, 0, MOPT_GTE0},
{Opt_inode_readahead_blks, 0, MOPT_GTE0},
{Opt_init_itable, 0, MOPT_GTE0},
+ {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET},
{Opt_stripe, 0, MOPT_GTE0},
{Opt_resuid, 0, MOPT_GTE0},
{Opt_resgid, 0, MOPT_GTE0},
@@ -1459,6 +1451,12 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
case Opt_i_version:
sb->s_flags |= MS_I_VERSION;
return 1;
+ case Opt_lazytime:
+ sb->s_flags |= MS_LAZYTIME;
+ return 1;
+ case Opt_nolazytime:
+ sb->s_flags &= ~MS_LAZYTIME;
+ return 1;
}
for (m = ext4_mount_opts; m->token != Opt_err; m++)
@@ -1620,6 +1618,11 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
}
sbi->s_jquota_fmt = m->mount_opt;
#endif
+#ifndef CONFIG_FS_DAX
+ } else if (token == Opt_dax) {
+ ext4_msg(sb, KERN_INFO, "dax option not supported");
+ return -1;
+#endif
} else {
if (!args->from)
arg = 1;
@@ -3482,7 +3485,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
- ext4_warning(sb, KERN_INFO "metadata_csum and uninit_bg are "
+ ext4_warning(sb, "metadata_csum and uninit_bg are "
"redundant flags; please run fsck.");
/* Check for a known checksum algorithm */
@@ -3602,6 +3605,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
"both data=journal and dioread_nolock");
goto failed_mount;
}
+ if (test_opt(sb, DAX)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and dax");
+ goto failed_mount;
+ }
if (test_opt(sb, DELALLOC))
clear_opt(sb, DELALLOC);
}
@@ -3665,6 +3673,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
+ if (sbi->s_mount_opt & EXT4_MOUNT_DAX) {
+ if (blocksize != PAGE_SIZE) {
+ ext4_msg(sb, KERN_ERR,
+ "error: unsupported blocksize for dax");
+ goto failed_mount;
+ }
+ if (!sb->s_bdev->bd_disk->fops->direct_access) {
+ ext4_msg(sb, KERN_ERR,
+ "error: device does not support dax");
+ goto failed_mount;
+ }
+ }
+
if (sb->s_blocksize != blocksize) {
/* Validate the filesystem blocksize */
if (!sb_set_blocksize(sb, blocksize)) {
@@ -3935,7 +3956,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_QUOTA
sb->dq_op = &ext4_quota_operations;
if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
- sb->s_qcop = &ext4_qctl_sysfile_operations;
+ sb->s_qcop = &dquot_quotactl_sysfile_ops;
else
sb->s_qcop = &ext4_qctl_operations;
sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
@@ -4882,6 +4903,18 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
err = -EINVAL;
goto restore_opts;
}
+ if (test_opt(sb, DAX)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and dax");
+ err = -EINVAL;
+ goto restore_opts;
+ }
+ }
+
+ if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {
+ ext4_msg(sb, KERN_WARNING, "warning: refusing change of "
+ "dax flag with busy inodes while remounting");
+ sbi->s_mount_opt ^= EXT4_MOUNT_DAX;
}
if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
@@ -5020,6 +5053,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
}
#endif
+ *flags = (*flags & ~MS_LAZYTIME) | (sb->s_flags & MS_LAZYTIME);
ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
kfree(orig_data);
return 0;
@@ -5288,21 +5322,6 @@ static int ext4_enable_quotas(struct super_block *sb)
return 0;
}
-/*
- * quota_on function that is used when QUOTA feature is set.
- */
-static int ext4_quota_on_sysfile(struct super_block *sb, int type,
- int format_id)
-{
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
- return -EINVAL;
-
- /*
- * USAGE was enabled at mount time. Only need to enable LIMITS now.
- */
- return ext4_quota_enable(sb, type, format_id, DQUOT_LIMITS_ENABLED);
-}
-
static int ext4_quota_off(struct super_block *sb, int type)
{
struct inode *inode = sb_dqopt(sb)->files[type];
@@ -5329,18 +5348,6 @@ out:
return dquot_quota_off(sb, type);
}
-/*
- * quota_off function that is used when QUOTA feature is set.
- */
-static int ext4_quota_off_sysfile(struct super_block *sb, int type)
-{
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
- return -EINVAL;
-
- /* Disable only the limits. */
- return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
-}
-
/* Read data from quotafile - avoid pagecache and such because we cannot afford
* acquiring the locks... As quota files are never truncated and quota code
* itself serializes the operations (and no one else should touch the files)
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 736a348509f7..94e2d2ffabe1 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -71,3 +71,13 @@ config F2FS_CHECK_FS
Enables BUG_ONs which check the filesystem consistency in runtime.
If you want to improve the performance, say N.
+
+config F2FS_IO_TRACE
+ bool "F2FS IO tracer"
+ depends on F2FS_FS
+ depends on FUNCTION_TRACER
+ help
+ F2FS IO trace is based on a function trace, which gathers process
+ information and block IO patterns in the filesystem level.
+
+ If unsure, say N.
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index 2e35da12d292..d92397731db8 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -5,3 +5,4 @@ f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o
f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
+f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 1ccb26bc2a0b..742202779bd5 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -62,7 +62,7 @@ static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size)
if (count == 0)
return NULL;
- acl = posix_acl_alloc(count, GFP_KERNEL);
+ acl = posix_acl_alloc(count, GFP_NOFS);
if (!acl)
return ERR_PTR(-ENOMEM);
@@ -116,7 +116,7 @@ static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size)
int i;
f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count *
- sizeof(struct f2fs_acl_entry), GFP_KERNEL);
+ sizeof(struct f2fs_acl_entry), GFP_NOFS);
if (!f2fs_acl)
return ERR_PTR(-ENOMEM);
@@ -396,7 +396,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
posix_acl_release(default_acl);
}
if (acl) {
- if (error)
+ if (!error)
error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl,
ipage);
posix_acl_release(acl);
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index e6c271fefaca..7f794b72b3b7 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -20,10 +20,11 @@
#include "f2fs.h"
#include "node.h"
#include "segment.h"
+#include "trace.h"
#include <trace/events/f2fs.h>
static struct kmem_cache *ino_entry_slab;
-static struct kmem_cache *inode_entry_slab;
+struct kmem_cache *inode_entry_slab;
/*
* We guarantee no failure on the returned page.
@@ -50,6 +51,11 @@ struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
{
struct address_space *mapping = META_MAPPING(sbi);
struct page *page;
+ struct f2fs_io_info fio = {
+ .type = META,
+ .rw = READ_SYNC | REQ_META | REQ_PRIO,
+ .blk_addr = index,
+ };
repeat:
page = grab_cache_page(mapping, index);
if (!page) {
@@ -59,8 +65,7 @@ repeat:
if (PageUptodate(page))
goto out;
- if (f2fs_submit_page_bio(sbi, page, index,
- READ_SYNC | REQ_META | REQ_PRIO))
+ if (f2fs_submit_page_bio(sbi, page, &fio))
goto repeat;
lock_page(page);
@@ -112,14 +117,12 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type
block_t prev_blk_addr = 0;
struct page *page;
block_t blkno = start;
-
struct f2fs_io_info fio = {
.type = META,
.rw = READ_SYNC | REQ_META | REQ_PRIO
};
for (; nrpages-- > 0; blkno++) {
- block_t blk_addr;
if (!is_valid_blkaddr(sbi, blkno, type))
goto out;
@@ -130,27 +133,27 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type
NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid)))
blkno = 0;
/* get nat block addr */
- blk_addr = current_nat_addr(sbi,
+ fio.blk_addr = current_nat_addr(sbi,
blkno * NAT_ENTRY_PER_BLOCK);
break;
case META_SIT:
/* get sit block addr */
- blk_addr = current_sit_addr(sbi,
+ fio.blk_addr = current_sit_addr(sbi,
blkno * SIT_ENTRY_PER_BLOCK);
- if (blkno != start && prev_blk_addr + 1 != blk_addr)
+ if (blkno != start && prev_blk_addr + 1 != fio.blk_addr)
goto out;
- prev_blk_addr = blk_addr;
+ prev_blk_addr = fio.blk_addr;
break;
case META_SSA:
case META_CP:
case META_POR:
- blk_addr = blkno;
+ fio.blk_addr = blkno;
break;
default:
BUG();
}
- page = grab_cache_page(META_MAPPING(sbi), blk_addr);
+ page = grab_cache_page(META_MAPPING(sbi), fio.blk_addr);
if (!page)
continue;
if (PageUptodate(page)) {
@@ -158,7 +161,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type
continue;
}
- f2fs_submit_page_mbio(sbi, page, blk_addr, &fio);
+ f2fs_submit_page_mbio(sbi, page, &fio);
f2fs_put_page(page, 0);
}
out:
@@ -187,7 +190,7 @@ static int f2fs_write_meta_page(struct page *page,
trace_f2fs_writepage(page, META);
- if (unlikely(sbi->por_doing))
+ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
goto redirty_out;
if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0))
goto redirty_out;
@@ -299,6 +302,8 @@ static int f2fs_set_meta_page_dirty(struct page *page)
if (!PageDirty(page)) {
__set_page_dirty_nobuffers(page);
inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
+ SetPagePrivate(page);
+ f2fs_trace_pid(page);
return 1;
}
return 0;
@@ -308,6 +313,8 @@ const struct address_space_operations f2fs_meta_aops = {
.writepage = f2fs_write_meta_page,
.writepages = f2fs_write_meta_pages,
.set_page_dirty = f2fs_set_meta_page_dirty,
+ .invalidatepage = f2fs_invalidate_page,
+ .releasepage = f2fs_release_page,
};
static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
@@ -462,7 +469,7 @@ void recover_orphan_inodes(struct f2fs_sb_info *sbi)
if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
return;
- sbi->por_doing = true;
+ set_sbi_flag(sbi, SBI_POR_DOING);
start_blk = __start_cp_addr(sbi) + 1 +
le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
@@ -483,7 +490,7 @@ void recover_orphan_inodes(struct f2fs_sb_info *sbi)
}
/* clear Orphan Flag */
clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG);
- sbi->por_doing = false;
+ clear_sbi_flag(sbi, SBI_POR_DOING);
return;
}
@@ -567,7 +574,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
if (crc_offset >= blk_size)
goto invalid_cp1;
- crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
+ crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset)));
if (!f2fs_crc_valid(crc, cp_block, crc_offset))
goto invalid_cp1;
@@ -582,7 +589,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
if (crc_offset >= blk_size)
goto invalid_cp2;
- crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
+ crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset)));
if (!f2fs_crc_valid(crc, cp_block, crc_offset))
goto invalid_cp2;
@@ -669,7 +676,7 @@ fail_no_cp:
return -EINVAL;
}
-static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
+static int __add_dirty_inode(struct inode *inode, struct inode_entry *new)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -686,7 +693,7 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
void update_dirty_page(struct inode *inode, struct page *page)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct dir_inode_entry *new;
+ struct inode_entry *new;
int ret = 0;
if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
@@ -710,12 +717,13 @@ void update_dirty_page(struct inode *inode, struct page *page)
kmem_cache_free(inode_entry_slab, new);
out:
SetPagePrivate(page);
+ f2fs_trace_pid(page);
}
void add_dirty_dir_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct dir_inode_entry *new =
+ struct inode_entry *new =
f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
int ret = 0;
@@ -733,7 +741,7 @@ void add_dirty_dir_inode(struct inode *inode)
void remove_dirty_dir_inode(struct inode *inode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct dir_inode_entry *entry;
+ struct inode_entry *entry;
if (!S_ISDIR(inode->i_mode))
return;
@@ -763,7 +771,7 @@ void remove_dirty_dir_inode(struct inode *inode)
void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
{
struct list_head *head;
- struct dir_inode_entry *entry;
+ struct inode_entry *entry;
struct inode *inode;
retry:
if (unlikely(f2fs_cp_error(sbi)))
@@ -776,7 +784,7 @@ retry:
spin_unlock(&sbi->dir_inode_lock);
return;
}
- entry = list_entry(head->next, struct dir_inode_entry, list);
+ entry = list_entry(head->next, struct inode_entry, list);
inode = igrab(entry->inode);
spin_unlock(&sbi->dir_inode_lock);
if (inode) {
@@ -922,7 +930,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
ckpt->next_free_nid = cpu_to_le32(last_nid);
/* 2 cp + n data seg summary + orphan inode blocks */
- data_sum_blocks = npages_for_summary_flush(sbi);
+ data_sum_blocks = npages_for_summary_flush(sbi, false);
if (data_sum_blocks < NR_CURSEG_DATA_TYPE)
set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
else
@@ -932,24 +940,31 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
orphan_blocks);
- if (cpc->reason == CP_UMOUNT) {
- set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+ if (__remain_node_summaries(cpc->reason))
ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+
cp_payload_blks + data_sum_blocks +
orphan_blocks + NR_CURSEG_NODE_TYPE);
- } else {
- clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+ else
ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
cp_payload_blks + data_sum_blocks +
orphan_blocks);
- }
+
+ if (cpc->reason == CP_UMOUNT)
+ set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+ else
+ clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+
+ if (cpc->reason == CP_FASTBOOT)
+ set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
+ else
+ clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
if (orphan_num)
set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
else
clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
- if (sbi->need_fsck)
+ if (is_sbi_flag_set(sbi, SBI_NEED_FSCK))
set_ckpt_flags(ckpt, CP_FSCK_FLAG);
/* update SIT/NAT bitmap */
@@ -966,15 +981,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* write out checkpoint buffer at block 0 */
cp_page = grab_meta_page(sbi, start_blk++);
kaddr = page_address(cp_page);
- memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
+ memcpy(kaddr, ckpt, F2FS_BLKSIZE);
set_page_dirty(cp_page);
f2fs_put_page(cp_page, 1);
for (i = 1; i < 1 + cp_payload_blks; i++) {
cp_page = grab_meta_page(sbi, start_blk++);
kaddr = page_address(cp_page);
- memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE,
- (1 << sbi->log_blocksize));
+ memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE, F2FS_BLKSIZE);
set_page_dirty(cp_page);
f2fs_put_page(cp_page, 1);
}
@@ -986,7 +1000,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
write_data_summaries(sbi, start_blk);
start_blk += data_sum_blocks;
- if (cpc->reason == CP_UMOUNT) {
+ if (__remain_node_summaries(cpc->reason)) {
write_node_summaries(sbi, start_blk);
start_blk += NR_CURSEG_NODE_TYPE;
}
@@ -994,7 +1008,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* writeout checkpoint block */
cp_page = grab_meta_page(sbi, start_blk);
kaddr = page_address(cp_page);
- memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
+ memcpy(kaddr, ckpt, F2FS_BLKSIZE);
set_page_dirty(cp_page);
f2fs_put_page(cp_page, 1);
@@ -1023,7 +1037,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
return;
clear_prefree_segments(sbi);
- F2FS_RESET_SB_DIRT(sbi);
+ clear_sbi_flag(sbi, SBI_IS_DIRTY);
}
/*
@@ -1038,10 +1052,13 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
mutex_lock(&sbi->cp_mutex);
- if (!sbi->s_dirty && cpc->reason != CP_DISCARD)
+ if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
+ cpc->reason != CP_DISCARD && cpc->reason != CP_UMOUNT)
goto out;
if (unlikely(f2fs_cp_error(sbi)))
goto out;
+ if (f2fs_readonly(sbi->sb))
+ goto out;
if (block_operations(sbi))
goto out;
@@ -1102,8 +1119,8 @@ int __init create_checkpoint_caches(void)
sizeof(struct ino_entry));
if (!ino_entry_slab)
return -ENOMEM;
- inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry",
- sizeof(struct dir_inode_entry));
+ inode_entry_slab = f2fs_kmem_cache_create("f2fs_inode_entry",
+ sizeof(struct inode_entry));
if (!inode_entry_slab) {
kmem_cache_destroy(ino_entry_slab);
return -ENOMEM;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7ec697b37f19..985ed023a750 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -22,6 +22,7 @@
#include "f2fs.h"
#include "node.h"
#include "segment.h"
+#include "trace.h"
#include <trace/events/f2fs.h>
static void f2fs_read_end_io(struct bio *bio, int err)
@@ -95,11 +96,9 @@ static void __submit_merged_bio(struct f2fs_bio_info *io)
return;
if (is_read_io(fio->rw))
- trace_f2fs_submit_read_bio(io->sbi->sb, fio->rw,
- fio->type, io->bio);
+ trace_f2fs_submit_read_bio(io->sbi->sb, fio, io->bio);
else
- trace_f2fs_submit_write_bio(io->sbi->sb, fio->rw,
- fio->type, io->bio);
+ trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio);
submit_bio(fio->rw, io->bio);
io->bio = NULL;
@@ -132,14 +131,15 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
* Return unlocked page.
*/
int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page,
- block_t blk_addr, int rw)
+ struct f2fs_io_info *fio)
{
struct bio *bio;
- trace_f2fs_submit_page_bio(page, blk_addr, rw);
+ trace_f2fs_submit_page_bio(page, fio);
+ f2fs_trace_ios(page, fio, 0);
/* Allocate a new bio */
- bio = __bio_alloc(sbi, blk_addr, 1, is_read_io(rw));
+ bio = __bio_alloc(sbi, fio->blk_addr, 1, is_read_io(fio->rw));
if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
bio_put(bio);
@@ -147,12 +147,12 @@ int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page,
return -EFAULT;
}
- submit_bio(rw, bio);
+ submit_bio(fio->rw, bio);
return 0;
}
void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
- block_t blk_addr, struct f2fs_io_info *fio)
+ struct f2fs_io_info *fio)
{
enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
struct f2fs_bio_info *io;
@@ -160,21 +160,21 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
io = is_read ? &sbi->read_io : &sbi->write_io[btype];
- verify_block_addr(sbi, blk_addr);
+ verify_block_addr(sbi, fio->blk_addr);
down_write(&io->io_rwsem);
if (!is_read)
inc_page_count(sbi, F2FS_WRITEBACK);
- if (io->bio && (io->last_block_in_bio != blk_addr - 1 ||
+ if (io->bio && (io->last_block_in_bio != fio->blk_addr - 1 ||
io->fio.rw != fio->rw))
__submit_merged_bio(io);
alloc_new:
if (io->bio == NULL) {
int bio_blocks = MAX_BIO_BLOCKS(sbi);
- io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read);
+ io->bio = __bio_alloc(sbi, fio->blk_addr, bio_blocks, is_read);
io->fio = *fio;
}
@@ -184,10 +184,11 @@ alloc_new:
goto alloc_new;
}
- io->last_block_in_bio = blk_addr;
+ io->last_block_in_bio = fio->blk_addr;
+ f2fs_trace_ios(page, fio, 0);
up_write(&io->io_rwsem);
- trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr);
+ trace_f2fs_submit_page_mbio(page, fio);
}
/*
@@ -196,7 +197,7 @@ alloc_new:
* ->node_page
* update block addresses in the node page
*/
-static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
+static void __set_data_blkaddr(struct dnode_of_data *dn)
{
struct f2fs_node *rn;
__le32 *addr_array;
@@ -209,7 +210,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
/* Get physical address of data block */
addr_array = blkaddr_in_node(rn);
- addr_array[ofs_in_node] = cpu_to_le32(new_addr);
+ addr_array[ofs_in_node] = cpu_to_le32(dn->data_blkaddr);
set_page_dirty(node_page);
}
@@ -224,8 +225,8 @@ int reserve_new_block(struct dnode_of_data *dn)
trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node);
- __set_data_blkaddr(dn, NEW_ADDR);
dn->data_blkaddr = NEW_ADDR;
+ __set_data_blkaddr(dn);
mark_inode_dirty(dn->inode);
sync_inode_page(dn);
return 0;
@@ -273,7 +274,7 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
unsigned int blkbits = inode->i_sb->s_blocksize_bits;
size_t count;
- clear_buffer_new(bh_result);
+ set_buffer_new(bh_result);
map_bh(bh_result, inode->i_sb,
start_blkaddr + pgofs - start_fofs);
count = end_fofs - pgofs + 1;
@@ -290,23 +291,24 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
return 0;
}
-void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
+void update_extent_cache(struct dnode_of_data *dn)
{
struct f2fs_inode_info *fi = F2FS_I(dn->inode);
pgoff_t fofs, start_fofs, end_fofs;
block_t start_blkaddr, end_blkaddr;
int need_update = true;
- f2fs_bug_on(F2FS_I_SB(dn->inode), blk_addr == NEW_ADDR);
- fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
- dn->ofs_in_node;
+ f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR);
/* Update the page address in the parent node */
- __set_data_blkaddr(dn, blk_addr);
+ __set_data_blkaddr(dn);
if (is_inode_flag_set(fi, FI_NO_EXTENT))
return;
+ fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
+ dn->ofs_in_node;
+
write_lock(&fi->ext.ext_lock);
start_fofs = fi->ext.fofs;
@@ -320,16 +322,16 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
/* Initial extent */
if (fi->ext.len == 0) {
- if (blk_addr != NULL_ADDR) {
+ if (dn->data_blkaddr != NULL_ADDR) {
fi->ext.fofs = fofs;
- fi->ext.blk_addr = blk_addr;
+ fi->ext.blk_addr = dn->data_blkaddr;
fi->ext.len = 1;
}
goto end_update;
}
/* Front merge */
- if (fofs == start_fofs - 1 && blk_addr == start_blkaddr - 1) {
+ if (fofs == start_fofs - 1 && dn->data_blkaddr == start_blkaddr - 1) {
fi->ext.fofs--;
fi->ext.blk_addr--;
fi->ext.len++;
@@ -337,7 +339,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
}
/* Back merge */
- if (fofs == end_fofs + 1 && blk_addr == end_blkaddr + 1) {
+ if (fofs == end_fofs + 1 && dn->data_blkaddr == end_blkaddr + 1) {
fi->ext.len++;
goto end_update;
}
@@ -376,6 +378,10 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
struct dnode_of_data dn;
struct page *page;
int err;
+ struct f2fs_io_info fio = {
+ .type = DATA,
+ .rw = sync ? READ_SYNC : READA,
+ };
page = find_get_page(mapping, index);
if (page && PageUptodate(page))
@@ -404,8 +410,8 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
return page;
}
- err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, dn.data_blkaddr,
- sync ? READ_SYNC : READA);
+ fio.blk_addr = dn.data_blkaddr;
+ err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio);
if (err)
return ERR_PTR(err);
@@ -430,7 +436,10 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
struct dnode_of_data dn;
struct page *page;
int err;
-
+ struct f2fs_io_info fio = {
+ .type = DATA,
+ .rw = READ_SYNC,
+ };
repeat:
page = grab_cache_page(mapping, index);
if (!page)
@@ -464,8 +473,8 @@ repeat:
return page;
}
- err = f2fs_submit_page_bio(F2FS_I_SB(inode), page,
- dn.data_blkaddr, READ_SYNC);
+ fio.blk_addr = dn.data_blkaddr;
+ err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio);
if (err)
return ERR_PTR(err);
@@ -515,8 +524,12 @@ repeat:
zero_user_segment(page, 0, PAGE_CACHE_SIZE);
SetPageUptodate(page);
} else {
- err = f2fs_submit_page_bio(F2FS_I_SB(inode), page,
- dn.data_blkaddr, READ_SYNC);
+ struct f2fs_io_info fio = {
+ .type = DATA,
+ .rw = READ_SYNC,
+ .blk_addr = dn.data_blkaddr,
+ };
+ err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio);
if (err)
goto put_err;
@@ -550,30 +563,25 @@ static int __allocate_data_block(struct dnode_of_data *dn)
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
struct f2fs_inode_info *fi = F2FS_I(dn->inode);
struct f2fs_summary sum;
- block_t new_blkaddr;
struct node_info ni;
+ int seg = CURSEG_WARM_DATA;
pgoff_t fofs;
- int type;
if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
return -EPERM;
if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
return -ENOSPC;
- __set_data_blkaddr(dn, NEW_ADDR);
- dn->data_blkaddr = NEW_ADDR;
-
get_node_info(sbi, dn->nid, &ni);
set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
- type = CURSEG_WARM_DATA;
+ if (dn->ofs_in_node == 0 && dn->inode_page == dn->node_page)
+ seg = CURSEG_DIRECT_IO;
- allocate_data_block(sbi, NULL, NULL_ADDR, &new_blkaddr, &sum, type);
+ allocate_data_block(sbi, NULL, NULL_ADDR, &dn->data_blkaddr, &sum, seg);
/* direct IO doesn't use extent cache to maximize the performance */
- set_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
- update_extent_cache(new_blkaddr, dn);
- clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
+ __set_data_blkaddr(dn);
/* update i_size */
fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
@@ -581,10 +589,59 @@ static int __allocate_data_block(struct dnode_of_data *dn)
if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT))
i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT));
- dn->data_blkaddr = new_blkaddr;
return 0;
}
+static void __allocate_data_blocks(struct inode *inode, loff_t offset,
+ size_t count)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct dnode_of_data dn;
+ u64 start = F2FS_BYTES_TO_BLK(offset);
+ u64 len = F2FS_BYTES_TO_BLK(count);
+ bool allocated;
+ u64 end_offset;
+
+ while (len) {
+ f2fs_balance_fs(sbi);
+ f2fs_lock_op(sbi);
+
+ /* When reading holes, we need its node page */
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ if (get_dnode_of_data(&dn, start, ALLOC_NODE))
+ goto out;
+
+ allocated = false;
+ end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
+
+ while (dn.ofs_in_node < end_offset && len) {
+ if (dn.data_blkaddr == NULL_ADDR) {
+ if (__allocate_data_block(&dn))
+ goto sync_out;
+ allocated = true;
+ }
+ len--;
+ start++;
+ dn.ofs_in_node++;
+ }
+
+ if (allocated)
+ sync_inode_page(&dn);
+
+ f2fs_put_dnode(&dn);
+ f2fs_unlock_op(sbi);
+ }
+ return;
+
+sync_out:
+ if (allocated)
+ sync_inode_page(&dn);
+ f2fs_put_dnode(&dn);
+out:
+ f2fs_unlock_op(sbi);
+ return;
+}
+
/*
* get_data_block() now supported readahead/bmap/rw direct_IO with mapped bh.
* If original data blocks are allocated, then give them to blockdev.
@@ -610,10 +667,8 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
if (check_extent_cache(inode, pgofs, bh_result))
goto out;
- if (create) {
- f2fs_balance_fs(F2FS_I_SB(inode));
+ if (create)
f2fs_lock_op(F2FS_I_SB(inode));
- }
/* When reading holes, we need its node page */
set_new_dnode(&dn, inode, NULL, NULL, 0);
@@ -627,12 +682,14 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
goto put_out;
if (dn.data_blkaddr != NULL_ADDR) {
+ set_buffer_new(bh_result);
map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
} else if (create) {
err = __allocate_data_block(&dn);
if (err)
goto put_out;
allocated = true;
+ set_buffer_new(bh_result);
map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
} else {
goto put_out;
@@ -745,7 +802,6 @@ static int f2fs_read_data_pages(struct file *file,
int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
{
struct inode *inode = page->mapping->host;
- block_t old_blkaddr, new_blkaddr;
struct dnode_of_data dn;
int err = 0;
@@ -754,10 +810,10 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
if (err)
return err;
- old_blkaddr = dn.data_blkaddr;
+ fio->blk_addr = dn.data_blkaddr;
/* This page is already truncated */
- if (old_blkaddr == NULL_ADDR)
+ if (fio->blk_addr == NULL_ADDR)
goto out_writepage;
set_page_writeback(page);
@@ -766,14 +822,14 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
* If current allocation needs SSR,
* it had better in-place writes for updated data.
*/
- if (unlikely(old_blkaddr != NEW_ADDR &&
+ if (unlikely(fio->blk_addr != NEW_ADDR &&
!is_cold_data(page) &&
need_inplace_update(inode))) {
- rewrite_data_page(page, old_blkaddr, fio);
+ rewrite_data_page(page, fio);
set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE);
} else {
- write_data_page(page, &dn, &new_blkaddr, fio);
- update_extent_cache(new_blkaddr, &dn);
+ write_data_page(page, &dn, fio);
+ update_extent_cache(&dn);
set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
}
out_writepage:
@@ -812,7 +868,12 @@ static int f2fs_write_data_page(struct page *page,
zero_user_segment(page, offset, PAGE_CACHE_SIZE);
write:
- if (unlikely(sbi->por_doing))
+ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
+ goto redirty_out;
+ if (f2fs_is_drop_cache(inode))
+ goto out;
+ if (f2fs_is_volatile_file(inode) && !wbc->for_reclaim &&
+ available_free_memory(sbi, BASE_CHECK))
goto redirty_out;
/* Dentry blocks are controlled by checkpoint */
@@ -826,7 +887,6 @@ write:
/* we should bypass data pages to proceed the kworkder jobs */
if (unlikely(f2fs_cp_error(sbi))) {
SetPageError(page);
- unlock_page(page);
goto out;
}
@@ -1002,8 +1062,12 @@ put_next:
if (dn.data_blkaddr == NEW_ADDR) {
zero_user_segment(page, 0, PAGE_CACHE_SIZE);
} else {
- err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
- READ_SYNC);
+ struct f2fs_io_info fio = {
+ .type = DATA,
+ .rw = READ_SYNC,
+ .blk_addr = dn.data_blkaddr,
+ };
+ err = f2fs_submit_page_bio(sbi, page, &fio);
if (err)
goto fail;
@@ -1092,6 +1156,9 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
trace_f2fs_direct_IO_enter(inode, offset, count, rw);
+ if (rw & WRITE)
+ __allocate_data_blocks(inode, offset, count);
+
err = blockdev_direct_IO(rw, iocb, inode, iter, offset, get_data_block);
if (err < 0 && (rw & WRITE))
f2fs_write_failed(mapping, offset + count);
@@ -1101,24 +1168,33 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
return err;
}
-static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
- unsigned int length)
+void f2fs_invalidate_page(struct page *page, unsigned int offset,
+ unsigned int length)
{
struct inode *inode = page->mapping->host;
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- if (offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE)
+ if (inode->i_ino >= F2FS_ROOT_INO(sbi) &&
+ (offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE))
return;
- if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
- invalidate_inmem_page(inode, page);
-
- if (PageDirty(page))
- inode_dec_dirty_pages(inode);
+ if (PageDirty(page)) {
+ if (inode->i_ino == F2FS_META_INO(sbi))
+ dec_page_count(sbi, F2FS_DIRTY_META);
+ else if (inode->i_ino == F2FS_NODE_INO(sbi))
+ dec_page_count(sbi, F2FS_DIRTY_NODES);
+ else
+ inode_dec_dirty_pages(inode);
+ }
ClearPagePrivate(page);
}
-static int f2fs_release_data_page(struct page *page, gfp_t wait)
+int f2fs_release_page(struct page *page, gfp_t wait)
{
+ /* If this is dirty page, keep PagePrivate */
+ if (PageDirty(page))
+ return 0;
+
ClearPagePrivate(page);
return 1;
}
@@ -1132,7 +1208,7 @@ static int f2fs_set_data_page_dirty(struct page *page)
SetPageUptodate(page);
- if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode)) {
+ if (f2fs_is_atomic_file(inode)) {
register_inmem_page(inode, page);
return 1;
}
@@ -1168,8 +1244,8 @@ const struct address_space_operations f2fs_dblock_aops = {
.write_begin = f2fs_write_begin,
.write_end = f2fs_write_end,
.set_page_dirty = f2fs_set_data_page_dirty,
- .invalidatepage = f2fs_invalidate_data_page,
- .releasepage = f2fs_release_data_page,
+ .invalidatepage = f2fs_invalidate_page,
+ .releasepage = f2fs_release_page,
.direct_IO = f2fs_direct_IO,
.bmap = f2fs_bmap,
};
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 91e8f699ab30..e671373cc8ab 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -40,6 +40,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->ndirty_dirs = sbi->n_dirty_dirs;
si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META);
si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES);
+ si->wb_pages = get_pages(sbi, F2FS_WRITEBACK);
si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
si->rsvd_segs = reserved_segments(sbi);
si->overp_segs = overprovision_segments(sbi);
@@ -57,7 +58,9 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->node_pages = NODE_MAPPING(sbi)->nrpages;
si->meta_pages = META_MAPPING(sbi)->nrpages;
si->nats = NM_I(sbi)->nat_cnt;
- si->sits = SIT_I(sbi)->dirty_sentries;
+ si->dirty_nats = NM_I(sbi)->dirty_nat_cnt;
+ si->sits = MAIN_SEGS(sbi);
+ si->dirty_sits = SIT_I(sbi)->dirty_sentries;
si->fnids = NM_I(sbi)->fcnt;
si->bg_gc = sbi->bg_gc;
si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
@@ -79,6 +82,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
si->segment_count[i] = sbi->segment_count[i];
si->block_count[i] = sbi->block_count[i];
}
+
+ si->inplace_count = atomic_read(&sbi->inplace_count);
}
/*
@@ -137,6 +142,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry);
si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
+ si->base_mem += SIT_VBLOCK_MAP_SIZE;
if (sbi->segs_per_sec > 1)
si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry);
si->base_mem += __bitmap_size(sbi, SIT_BITMAP);
@@ -159,20 +165,32 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
si->base_mem += sizeof(struct f2fs_nm_info);
si->base_mem += __bitmap_size(sbi, NAT_BITMAP);
+get_cache:
+ si->cache_mem = 0;
+
/* build gc */
- si->base_mem += sizeof(struct f2fs_gc_kthread);
+ if (sbi->gc_thread)
+ si->cache_mem += sizeof(struct f2fs_gc_kthread);
+
+ /* build merge flush thread */
+ if (SM_I(sbi)->cmd_control_info)
+ si->cache_mem += sizeof(struct flush_cmd_control);
-get_cache:
/* free nids */
- si->cache_mem = NM_I(sbi)->fcnt;
- si->cache_mem += NM_I(sbi)->nat_cnt;
- npages = NODE_MAPPING(sbi)->nrpages;
- si->cache_mem += npages << PAGE_CACHE_SHIFT;
- npages = META_MAPPING(sbi)->nrpages;
- si->cache_mem += npages << PAGE_CACHE_SHIFT;
- si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry);
+ si->cache_mem += NM_I(sbi)->fcnt * sizeof(struct free_nid);
+ si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry);
+ si->cache_mem += NM_I(sbi)->dirty_nat_cnt *
+ sizeof(struct nat_entry_set);
+ si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages);
+ si->cache_mem += sbi->n_dirty_dirs * sizeof(struct inode_entry);
for (i = 0; i <= UPDATE_INO; i++)
si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
+
+ si->page_mem = 0;
+ npages = NODE_MAPPING(sbi)->nrpages;
+ si->page_mem += npages << PAGE_CACHE_SHIFT;
+ npages = META_MAPPING(sbi)->nrpages;
+ si->page_mem += npages << PAGE_CACHE_SHIFT;
}
static int stat_show(struct seq_file *s, void *v)
@@ -250,16 +268,16 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, "\nExtent Hit Ratio: %d / %d\n",
si->hit_ext, si->total_ext);
seq_puts(s, "\nBalancing F2FS Async:\n");
- seq_printf(s, " - inmem: %4d\n",
- si->inmem_pages);
+ seq_printf(s, " - inmem: %4d, wb: %4d\n",
+ si->inmem_pages, si->wb_pages);
seq_printf(s, " - nodes: %4d in %4d\n",
si->ndirty_node, si->node_pages);
seq_printf(s, " - dents: %4d in dirs:%4d\n",
si->ndirty_dent, si->ndirty_dirs);
seq_printf(s, " - meta: %4d in %4d\n",
si->ndirty_meta, si->meta_pages);
- seq_printf(s, " - NATs: %9d\n - SITs: %9d\n",
- si->nats, si->sits);
+ seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n",
+ si->dirty_nats, si->nats, si->dirty_sits, si->sits);
seq_printf(s, " - free_nids: %9d\n",
si->fnids);
seq_puts(s, "\nDistribution of User Blocks:");
@@ -277,6 +295,7 @@ static int stat_show(struct seq_file *s, void *v)
for (j = 0; j < si->util_free; j++)
seq_putc(s, '-');
seq_puts(s, "]\n\n");
+ seq_printf(s, "IPU: %u blocks\n", si->inplace_count);
seq_printf(s, "SSR: %u blocks in %u segments\n",
si->block_count[SSR], si->segment_count[SSR]);
seq_printf(s, "LFS: %u blocks in %u segments\n",
@@ -289,9 +308,14 @@ static int stat_show(struct seq_file *s, void *v)
/* memory footprint */
update_mem_info(si->sbi);
- seq_printf(s, "\nMemory: %u KB = static: %u + cached: %u\n",
- (si->base_mem + si->cache_mem) >> 10,
- si->base_mem >> 10, si->cache_mem >> 10);
+ seq_printf(s, "\nMemory: %u KB\n",
+ (si->base_mem + si->cache_mem + si->page_mem) >> 10);
+ seq_printf(s, " - static: %u KB\n",
+ si->base_mem >> 10);
+ seq_printf(s, " - cached: %u KB\n",
+ si->cache_mem >> 10);
+ seq_printf(s, " - paged : %u KB\n",
+ si->page_mem >> 10);
}
mutex_unlock(&f2fs_stat_mutex);
return 0;
@@ -331,6 +355,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
atomic_set(&sbi->inline_inode, 0);
atomic_set(&sbi->inline_dir, 0);
+ atomic_set(&sbi->inplace_count, 0);
mutex_lock(&f2fs_stat_mutex);
list_add_tail(&si->stat_list, &f2fs_stat_list);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index b1a7d5737cd0..b74097a7f6d9 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -286,8 +286,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
f2fs_wait_on_page_writeback(page, type);
de->ino = cpu_to_le32(inode->i_ino);
set_de_type(de, inode);
- if (!f2fs_has_inline_dentry(dir))
- kunmap(page);
+ f2fs_dentry_kunmap(dir, page);
set_page_dirty(page);
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
mark_inode_dirty(dir);
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index ec58bb2373fc..7fa3313ab0e2 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -28,7 +28,7 @@
do { \
if (unlikely(condition)) { \
WARN_ON(1); \
- sbi->need_fsck = true; \
+ set_sbi_flag(sbi, SBI_NEED_FSCK); \
} \
} while (0)
#define f2fs_down_write(x, y) down_write(x)
@@ -100,10 +100,15 @@ enum {
enum {
CP_UMOUNT,
+ CP_FASTBOOT,
CP_SYNC,
CP_DISCARD,
};
+#define DEF_BATCHED_TRIM_SECTIONS 32
+#define BATCHED_TRIM_SEGMENTS(sbi) \
+ (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec)
+
struct cp_control {
int reason;
__u64 trim_start;
@@ -136,8 +141,14 @@ struct ino_entry {
nid_t ino; /* inode number */
};
-/* for the list of directory inodes */
-struct dir_inode_entry {
+/*
+ * for the list of directory inodes or gc inodes.
+ * NOTE: there are two slab users for this structure, if we add/modify/delete
+ * fields in structure for one of slab users, it may affect fields or size of
+ * other one, in this condition, it's better to split both of slab and related
+ * data structure.
+ */
+struct inode_entry {
struct list_head list; /* list head */
struct inode *inode; /* vfs inode pointer */
};
@@ -196,11 +207,14 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
*/
#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS
#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS
+#define F2FS_IOC_GETVERSION FS_IOC_GETVERSION
#define F2FS_IOCTL_MAGIC 0xf5
#define F2FS_IOC_START_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 1)
#define F2FS_IOC_COMMIT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 2)
#define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3)
+#define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4)
+#define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5)
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
@@ -295,7 +309,7 @@ struct f2fs_inode_info {
nid_t i_xattr_nid; /* node id that contains xattrs */
unsigned long long xattr_ver; /* cp version of xattr modification */
struct extent_info ext; /* in-memory extent cache entry */
- struct dir_inode_entry *dirty_dir; /* the pointer of dirty dir */
+ struct inode_entry *dirty_dir; /* the pointer of dirty dir */
struct radix_tree_root inmem_root; /* radix tree for inmem pages */
struct list_head inmem_pages; /* inmemory pages managed by f2fs */
@@ -398,7 +412,8 @@ enum {
CURSEG_HOT_NODE, /* direct node blocks of directory files */
CURSEG_WARM_NODE, /* direct node blocks of normal files */
CURSEG_COLD_NODE, /* indirect node blocks */
- NO_CHECK_TYPE
+ NO_CHECK_TYPE,
+ CURSEG_DIRECT_IO, /* to use for the direct IO path */
};
struct flush_cmd {
@@ -437,6 +452,9 @@ struct f2fs_sm_info {
int nr_discards; /* # of discards in the list */
int max_discards; /* max. discards to be issued */
+ /* for batched trimming */
+ unsigned int trim_sections; /* # of sections to trim */
+
struct list_head sit_entry_set; /* sit entry set list */
unsigned int ipu_policy; /* in-place-update policy */
@@ -489,6 +507,7 @@ enum page_type {
struct f2fs_io_info {
enum page_type type; /* contains DATA/NODE/META/META_FLUSH */
int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */
+ block_t blk_addr; /* block address to be written */
};
#define is_read_io(rw) (((rw) & 1) == READ)
@@ -508,13 +527,20 @@ struct inode_management {
unsigned long ino_num; /* number of entries */
};
+/* For s_flag in struct f2fs_sb_info */
+enum {
+ SBI_IS_DIRTY, /* dirty flag for checkpoint */
+ SBI_IS_CLOSE, /* specify unmounting */
+ SBI_NEED_FSCK, /* need fsck.f2fs to fix */
+ SBI_POR_DOING, /* recovery is doing or not */
+};
+
struct f2fs_sb_info {
struct super_block *sb; /* pointer to VFS super block */
struct proc_dir_entry *s_proc; /* proc entry */
struct buffer_head *raw_super_buf; /* buffer head of raw sb */
struct f2fs_super_block *raw_super; /* raw super block pointer */
- int s_dirty; /* dirty flag for checkpoint */
- bool need_fsck; /* need fsck.f2fs to fix */
+ int s_flag; /* flags for sbi */
/* for node-related operations */
struct f2fs_nm_info *nm_info; /* node manager */
@@ -534,7 +560,6 @@ struct f2fs_sb_info {
struct rw_semaphore cp_rwsem; /* blocking FS operations */
struct rw_semaphore node_write; /* locking node writes */
struct mutex writepages; /* mutex for writepages() */
- bool por_doing; /* recovery is doing or not */
wait_queue_head_t cp_wait;
struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */
@@ -589,6 +614,7 @@ struct f2fs_sb_info {
struct f2fs_stat_info *stat_info; /* FS status information */
unsigned int segment_count[2]; /* # of allocated segments */
unsigned int block_count[2]; /* # of allocated blocks */
+ atomic_t inplace_count; /* # of inplace update */
int total_hit_ext, read_hit_ext; /* extent cache hit ratio */
atomic_t inline_inode; /* # of inline_data inodes */
atomic_t inline_dir; /* # of inline_dentry inodes */
@@ -686,14 +712,19 @@ static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi)
return sbi->node_inode->i_mapping;
}
-static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi)
+static inline bool is_sbi_flag_set(struct f2fs_sb_info *sbi, unsigned int type)
{
- sbi->s_dirty = 1;
+ return sbi->s_flag & (0x01 << type);
}
-static inline void F2FS_RESET_SB_DIRT(struct f2fs_sb_info *sbi)
+static inline void set_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type)
{
- sbi->s_dirty = 0;
+ sbi->s_flag |= (0x01 << type);
+}
+
+static inline void clear_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type)
+{
+ sbi->s_flag &= ~(0x01 << type);
}
static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp)
@@ -741,6 +772,28 @@ static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
up_write(&sbi->cp_rwsem);
}
+static inline int __get_cp_reason(struct f2fs_sb_info *sbi)
+{
+ int reason = CP_SYNC;
+
+ if (test_opt(sbi, FASTBOOT))
+ reason = CP_FASTBOOT;
+ if (is_sbi_flag_set(sbi, SBI_IS_CLOSE))
+ reason = CP_UMOUNT;
+ return reason;
+}
+
+static inline bool __remain_node_summaries(int reason)
+{
+ return (reason == CP_UMOUNT || reason == CP_FASTBOOT);
+}
+
+static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi)
+{
+ return (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) ||
+ is_set_ckpt_flags(F2FS_CKPT(sbi), CP_FASTBOOT_FLAG));
+}
+
/*
* Check whether the given nid is within node id range.
*/
@@ -805,7 +858,7 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
{
atomic_inc(&sbi->nr_pages[count_type]);
- F2FS_SET_SB_DIRT(sbi);
+ set_sbi_flag(sbi, SBI_IS_DIRTY);
}
static inline void inode_inc_dirty_pages(struct inode *inode)
@@ -1113,6 +1166,7 @@ enum {
FI_NEED_IPU, /* used for ipu per file */
FI_ATOMIC_FILE, /* indicate atomic file */
FI_VOLATILE_FILE, /* indicate volatile file */
+ FI_DROP_CACHE, /* drop dirty page cache */
FI_DATA_EXIST, /* indicate data exists */
};
@@ -1220,6 +1274,11 @@ static inline bool f2fs_is_volatile_file(struct inode *inode)
return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE);
}
+static inline bool f2fs_is_drop_cache(struct inode *inode)
+{
+ return is_inode_flag_set(F2FS_I(inode), FI_DROP_CACHE);
+}
+
static inline void *inline_data_addr(struct page *page)
{
struct f2fs_inode *ri = F2FS_INODE(page);
@@ -1389,7 +1448,6 @@ void destroy_node_manager_caches(void);
* segment.c
*/
void register_inmem_page(struct inode *, struct page *);
-void invalidate_inmem_page(struct inode *, struct page *);
void commit_inmem_pages(struct inode *, bool);
void f2fs_balance_fs(struct f2fs_sb_info *);
void f2fs_balance_fs_bg(struct f2fs_sb_info *);
@@ -1401,16 +1459,16 @@ void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
void clear_prefree_segments(struct f2fs_sb_info *);
void release_discard_addrs(struct f2fs_sb_info *);
void discard_next_dnode(struct f2fs_sb_info *, block_t);
-int npages_for_summary_flush(struct f2fs_sb_info *);
+int npages_for_summary_flush(struct f2fs_sb_info *, bool);
void allocate_new_segments(struct f2fs_sb_info *);
int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *);
struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
void write_meta_page(struct f2fs_sb_info *, struct page *);
void write_node_page(struct f2fs_sb_info *, struct page *,
- struct f2fs_io_info *, unsigned int, block_t, block_t *);
-void write_data_page(struct page *, struct dnode_of_data *, block_t *,
- struct f2fs_io_info *);
-void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *);
+ unsigned int, struct f2fs_io_info *);
+void write_data_page(struct page *, struct dnode_of_data *,
+ struct f2fs_io_info *);
+void rewrite_data_page(struct page *, struct f2fs_io_info *);
void recover_data_page(struct f2fs_sb_info *, struct page *,
struct f2fs_summary *, block_t, block_t);
void allocate_data_block(struct f2fs_sb_info *, struct page *,
@@ -1457,17 +1515,20 @@ void destroy_checkpoint_caches(void);
* data.c
*/
void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int);
-int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *, block_t, int);
-void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *, block_t,
+int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *,
+ struct f2fs_io_info *);
+void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *,
struct f2fs_io_info *);
int reserve_new_block(struct dnode_of_data *);
int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
-void update_extent_cache(block_t, struct dnode_of_data *);
+void update_extent_cache(struct dnode_of_data *);
struct page *find_data_page(struct inode *, pgoff_t, bool);
struct page *get_lock_data_page(struct inode *, pgoff_t);
struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
int do_write_data_page(struct page *, struct f2fs_io_info *);
int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
+void f2fs_invalidate_page(struct page *, unsigned int, unsigned int);
+int f2fs_release_page(struct page *, gfp_t);
/*
* gc.c
@@ -1477,8 +1538,6 @@ void stop_gc_thread(struct f2fs_sb_info *);
block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *);
int f2fs_gc(struct f2fs_sb_info *);
void build_gc_manager(struct f2fs_sb_info *);
-int __init create_gc_caches(void);
-void destroy_gc_caches(void);
/*
* recovery.c
@@ -1497,9 +1556,9 @@ struct f2fs_stat_info {
int main_area_segs, main_area_sections, main_area_zones;
int hit_ext, total_ext;
int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
- int nats, sits, fnids;
+ int nats, dirty_nats, sits, dirty_sits, fnids;
int total_count, utilization;
- int bg_gc, inline_inode, inline_dir, inmem_pages;
+ int bg_gc, inline_inode, inline_dir, inmem_pages, wb_pages;
unsigned int valid_count, valid_node_count, valid_inode_count;
unsigned int bimodal, avg_vblocks;
int util_free, util_valid, util_invalid;
@@ -1514,7 +1573,8 @@ struct f2fs_stat_info {
unsigned int segment_count[2];
unsigned int block_count[2];
- unsigned base_mem, cache_mem;
+ unsigned int inplace_count;
+ unsigned base_mem, cache_mem, page_mem;
};
static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
@@ -1553,7 +1613,8 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
((sbi)->segment_count[(curseg)->alloc_type]++)
#define stat_inc_block_count(sbi, curseg) \
((sbi)->block_count[(curseg)->alloc_type]++)
-
+#define stat_inc_inplace_blocks(sbi) \
+ (atomic_inc(&(sbi)->inplace_count))
#define stat_inc_seg_count(sbi, type) \
do { \
struct f2fs_stat_info *si = F2FS_STAT(sbi); \
@@ -1599,6 +1660,7 @@ void f2fs_destroy_root_stats(void);
#define stat_dec_inline_dir(inode)
#define stat_inc_seg_type(sbi, curseg)
#define stat_inc_block_count(sbi, curseg)
+#define stat_inc_inplace_blocks(sbi)
#define stat_inc_seg_count(si, type)
#define stat_inc_tot_blk_count(si, blks)
#define stat_inc_data_blk_count(si, blks)
@@ -1619,6 +1681,7 @@ extern const struct address_space_operations f2fs_meta_aops;
extern const struct inode_operations f2fs_dir_inode_operations;
extern const struct inode_operations f2fs_symlink_inode_operations;
extern const struct inode_operations f2fs_special_inode_operations;
+extern struct kmem_cache *inode_entry_slab;
/*
* inline.c
@@ -1629,7 +1692,6 @@ int f2fs_read_inline_data(struct inode *, struct page *);
int f2fs_convert_inline_page(struct dnode_of_data *, struct page *);
int f2fs_convert_inline_inode(struct inode *);
int f2fs_write_inline_data(struct inode *, struct page *);
-void truncate_inline_data(struct page *, u64);
bool recover_inline_data(struct inode *, struct page *);
struct f2fs_dir_entry *find_in_inline_dir(struct inode *, struct qstr *,
struct page **);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 3c27e0ecb3bc..98dac27bc3f7 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -26,6 +26,7 @@
#include "segment.h"
#include "xattr.h"
#include "acl.h"
+#include "trace.h"
#include <trace/events/f2fs.h>
static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
@@ -92,7 +93,6 @@ static const struct vm_operations_struct f2fs_file_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = f2fs_vm_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
};
static int get_parent_ino(struct inode *inode, nid_t *pino)
@@ -246,6 +246,10 @@ go_write:
sync_nodes:
sync_node_pages(sbi, ino, &wbc);
+ /* if cp_error was enabled, we should avoid infinite loop */
+ if (unlikely(f2fs_cp_error(sbi)))
+ goto out;
+
if (need_inode_block_update(sbi, ino)) {
mark_inode_dirty_sync(inode);
f2fs_write_inode(inode, NULL);
@@ -265,6 +269,7 @@ flush_out:
ret = f2fs_issue_flush(sbi);
out:
trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
+ f2fs_trace_ios(NULL, NULL, 1);
return ret;
}
@@ -351,7 +356,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
/* find data/hole in dnode block */
for (; dn.ofs_in_node < end_offset;
dn.ofs_in_node++, pgofs++,
- data_ofs = pgofs << PAGE_CACHE_SHIFT) {
+ data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) {
block_t blkaddr;
blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
@@ -427,7 +432,8 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
if (blkaddr == NULL_ADDR)
continue;
- update_extent_cache(NULL_ADDR, dn);
+ dn->data_blkaddr = NULL_ADDR;
+ update_extent_cache(dn);
invalidate_blocks(sbi, blkaddr);
nr_free++;
}
@@ -484,8 +490,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
trace_f2fs_truncate_blocks_enter(inode, from);
- free_from = (pgoff_t)
- ((from + blocksize - 1) >> (sbi->log_blocksize));
+ free_from = (pgoff_t)F2FS_BYTES_TO_BLK(from + blocksize - 1);
if (lock)
f2fs_lock_op(sbi);
@@ -836,6 +841,19 @@ static long f2fs_fallocate(struct file *file, int mode,
return ret;
}
+static int f2fs_release_file(struct inode *inode, struct file *filp)
+{
+ /* some remained atomic pages should discarded */
+ if (f2fs_is_atomic_file(inode))
+ commit_inmem_pages(inode, true);
+ if (f2fs_is_volatile_file(inode)) {
+ set_inode_flag(F2FS_I(inode), FI_DROP_CACHE);
+ filemap_fdatawrite(inode->i_mapping);
+ clear_inode_flag(F2FS_I(inode), FI_DROP_CACHE);
+ }
+ return 0;
+}
+
#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL)
@@ -906,29 +924,30 @@ out:
return ret;
}
+static int f2fs_ioc_getversion(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+
+ return put_user(inode->i_generation, (int __user *)arg);
+}
+
static int f2fs_ioc_start_atomic_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
- struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
if (!inode_owner_or_capable(inode))
return -EACCES;
- f2fs_balance_fs(sbi);
+ f2fs_balance_fs(F2FS_I_SB(inode));
+
+ if (f2fs_is_atomic_file(inode))
+ return 0;
set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
return f2fs_convert_inline_inode(inode);
}
-static int f2fs_release_file(struct inode *inode, struct file *filp)
-{
- /* some remained atomic pages should discarded */
- if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
- commit_inmem_pages(inode, true);
- return 0;
-}
-
static int f2fs_ioc_commit_atomic_write(struct file *filp)
{
struct inode *inode = file_inode(filp);
@@ -949,6 +968,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
ret = f2fs_sync_file(filp, 0, LONG_MAX, 0);
mnt_drop_write_file(filp);
+ clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
return ret;
}
@@ -959,11 +979,56 @@ static int f2fs_ioc_start_volatile_write(struct file *filp)
if (!inode_owner_or_capable(inode))
return -EACCES;
+ if (f2fs_is_volatile_file(inode))
+ return 0;
+
set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
return f2fs_convert_inline_inode(inode);
}
+static int f2fs_ioc_release_volatile_write(struct file *filp)
+{
+ struct inode *inode = file_inode(filp);
+
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+
+ if (!f2fs_is_volatile_file(inode))
+ return 0;
+
+ punch_hole(inode, 0, F2FS_BLKSIZE);
+ return 0;
+}
+
+static int f2fs_ioc_abort_volatile_write(struct file *filp)
+{
+ struct inode *inode = file_inode(filp);
+ int ret;
+
+ if (!inode_owner_or_capable(inode))
+ return -EACCES;
+
+ ret = mnt_want_write_file(filp);
+ if (ret)
+ return ret;
+
+ f2fs_balance_fs(F2FS_I_SB(inode));
+
+ if (f2fs_is_atomic_file(inode)) {
+ commit_inmem_pages(inode, false);
+ clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+ }
+
+ if (f2fs_is_volatile_file(inode)) {
+ clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+ filemap_fdatawrite(inode->i_mapping);
+ set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+ }
+ mnt_drop_write_file(filp);
+ return ret;
+}
+
static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
@@ -1001,12 +1066,18 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_ioc_getflags(filp, arg);
case F2FS_IOC_SETFLAGS:
return f2fs_ioc_setflags(filp, arg);
+ case F2FS_IOC_GETVERSION:
+ return f2fs_ioc_getversion(filp, arg);
case F2FS_IOC_START_ATOMIC_WRITE:
return f2fs_ioc_start_atomic_write(filp);
case F2FS_IOC_COMMIT_ATOMIC_WRITE:
return f2fs_ioc_commit_atomic_write(filp);
case F2FS_IOC_START_VOLATILE_WRITE:
return f2fs_ioc_start_volatile_write(filp);
+ case F2FS_IOC_RELEASE_VOLATILE_WRITE:
+ return f2fs_ioc_release_volatile_write(filp);
+ case F2FS_IOC_ABORT_VOLATILE_WRITE:
+ return f2fs_ioc_abort_volatile_write(filp);
case FITRIM:
return f2fs_ioc_fitrim(filp, arg);
default:
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index eec0933a4819..76adbc3641f1 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -24,8 +24,6 @@
#include "gc.h"
#include <trace/events/f2fs.h>
-static struct kmem_cache *winode_slab;
-
static int gc_thread_func(void *data)
{
struct f2fs_sb_info *sbi = data;
@@ -46,7 +44,7 @@ static int gc_thread_func(void *data)
break;
if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
- wait_ms = increase_sleep_time(gc_th, wait_ms);
+ increase_sleep_time(gc_th, &wait_ms);
continue;
}
@@ -67,15 +65,15 @@ static int gc_thread_func(void *data)
continue;
if (!is_idle(sbi)) {
- wait_ms = increase_sleep_time(gc_th, wait_ms);
+ increase_sleep_time(gc_th, &wait_ms);
mutex_unlock(&sbi->gc_mutex);
continue;
}
if (has_enough_invalid_blocks(sbi))
- wait_ms = decrease_sleep_time(gc_th, wait_ms);
+ decrease_sleep_time(gc_th, &wait_ms);
else
- wait_ms = increase_sleep_time(gc_th, wait_ms);
+ increase_sleep_time(gc_th, &wait_ms);
stat_inc_bggc_count(sbi);
@@ -356,13 +354,10 @@ static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode)
iput(inode);
return;
}
- new_ie = f2fs_kmem_cache_alloc(winode_slab, GFP_NOFS);
+ new_ie = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
new_ie->inode = inode;
-retry:
- if (radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie)) {
- cond_resched();
- goto retry;
- }
+
+ f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie);
list_add_tail(&new_ie->list, &gc_list->ilist);
}
@@ -373,7 +368,7 @@ static void put_gc_inode(struct gc_inode_list *gc_list)
radix_tree_delete(&gc_list->iroot, ie->inode->i_ino);
iput(ie->inode);
list_del(&ie->list);
- kmem_cache_free(winode_slab, ie);
+ kmem_cache_free(inode_entry_slab, ie);
}
}
@@ -703,8 +698,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
.iroot = RADIX_TREE_INIT(GFP_NOFS),
};
- cpc.reason = test_opt(sbi, FASTBOOT) ? CP_UMOUNT : CP_SYNC;
-
+ cpc.reason = __get_cp_reason(sbi);
gc_more:
if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
goto stop;
@@ -750,17 +744,3 @@ void build_gc_manager(struct f2fs_sb_info *sbi)
{
DIRTY_I(sbi)->v_ops = &default_v_ops;
}
-
-int __init create_gc_caches(void)
-{
- winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes",
- sizeof(struct inode_entry));
- if (!winode_slab)
- return -ENOMEM;
- return 0;
-}
-
-void destroy_gc_caches(void)
-{
- kmem_cache_destroy(winode_slab);
-}
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 6ff7ad38463e..b4a65be9f7d3 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -35,11 +35,6 @@ struct f2fs_gc_kthread {
unsigned int gc_idle;
};
-struct inode_entry {
- struct list_head list;
- struct inode *inode;
-};
-
struct gc_inode_list {
struct list_head ilist;
struct radix_tree_root iroot;
@@ -69,26 +64,26 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi)
return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100;
}
-static inline long increase_sleep_time(struct f2fs_gc_kthread *gc_th, long wait)
+static inline void increase_sleep_time(struct f2fs_gc_kthread *gc_th,
+ long *wait)
{
- if (wait == gc_th->no_gc_sleep_time)
- return wait;
+ if (*wait == gc_th->no_gc_sleep_time)
+ return;
- wait += gc_th->min_sleep_time;
- if (wait > gc_th->max_sleep_time)
- wait = gc_th->max_sleep_time;
- return wait;
+ *wait += gc_th->min_sleep_time;
+ if (*wait > gc_th->max_sleep_time)
+ *wait = gc_th->max_sleep_time;
}
-static inline long decrease_sleep_time(struct f2fs_gc_kthread *gc_th, long wait)
+static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th,
+ long *wait)
{
- if (wait == gc_th->no_gc_sleep_time)
- wait = gc_th->max_sleep_time;
+ if (*wait == gc_th->no_gc_sleep_time)
+ *wait = gc_th->max_sleep_time;
- wait -= gc_th->min_sleep_time;
- if (wait <= gc_th->min_sleep_time)
- wait = gc_th->min_sleep_time;
- return wait;
+ *wait -= gc_th->min_sleep_time;
+ if (*wait <= gc_th->min_sleep_time)
+ *wait = gc_th->min_sleep_time;
}
static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index f2d3c581e776..1484c00133cd 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -50,6 +50,12 @@ void read_inline_data(struct page *page, struct page *ipage)
SetPageUptodate(page);
}
+static void truncate_inline_data(struct page *ipage)
+{
+ f2fs_wait_on_page_writeback(ipage, NODE);
+ memset(inline_data_addr(ipage), 0, MAX_INLINE_DATA);
+}
+
int f2fs_read_inline_data(struct inode *inode, struct page *page)
{
struct page *ipage;
@@ -79,7 +85,6 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
{
void *src_addr, *dst_addr;
- block_t new_blk_addr;
struct f2fs_io_info fio = {
.type = DATA,
.rw = WRITE_SYNC | REQ_PRIO,
@@ -115,9 +120,9 @@ no_update:
/* write data page to try to make data consistent */
set_page_writeback(page);
-
- write_data_page(page, dn, &new_blk_addr, &fio);
- update_extent_cache(new_blk_addr, dn);
+ fio.blk_addr = dn->data_blkaddr;
+ write_data_page(page, dn, &fio);
+ update_extent_cache(dn);
f2fs_wait_on_page_writeback(page, DATA);
if (dirty)
inode_dec_dirty_pages(dn->inode);
@@ -126,7 +131,7 @@ no_update:
set_inode_flag(F2FS_I(dn->inode), FI_APPEND_WRITE);
/* clear inline data and flag after data writeback */
- truncate_inline_data(dn->inode_page, 0);
+ truncate_inline_data(dn->inode_page);
clear_out:
stat_dec_inline_inode(dn->inode);
f2fs_clear_inline_inode(dn->inode);
@@ -199,19 +204,6 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
return 0;
}
-void truncate_inline_data(struct page *ipage, u64 from)
-{
- void *addr;
-
- if (from >= MAX_INLINE_DATA)
- return;
-
- f2fs_wait_on_page_writeback(ipage, NODE);
-
- addr = inline_data_addr(ipage);
- memset(addr + from, 0, MAX_INLINE_DATA - from);
-}
-
bool recover_inline_data(struct inode *inode, struct page *npage)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -253,7 +245,7 @@ process_inline:
if (f2fs_has_inline_data(inode)) {
ipage = get_node_page(sbi, inode->i_ino);
f2fs_bug_on(sbi, IS_ERR(ipage));
- truncate_inline_data(ipage, 0);
+ truncate_inline_data(ipage);
f2fs_clear_inline_inode(inode);
update_inode(inode, ipage);
f2fs_put_page(ipage, 1);
@@ -371,7 +363,7 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
set_page_dirty(page);
/* clear inline dir and flag after data writeback */
- truncate_inline_data(ipage, 0);
+ truncate_inline_data(ipage);
stat_dec_inline_dir(dir);
clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY);
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 196cc7843aaf..2d002e3738a7 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -67,29 +67,23 @@ static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
}
}
-static int __recover_inline_status(struct inode *inode, struct page *ipage)
+static void __recover_inline_status(struct inode *inode, struct page *ipage)
{
void *inline_data = inline_data_addr(ipage);
- struct f2fs_inode *ri;
- void *zbuf;
+ __le32 *start = inline_data;
+ __le32 *end = start + MAX_INLINE_DATA / sizeof(__le32);
- zbuf = kzalloc(MAX_INLINE_DATA, GFP_NOFS);
- if (!zbuf)
- return -ENOMEM;
+ while (start < end) {
+ if (*start++) {
+ f2fs_wait_on_page_writeback(ipage, NODE);
- if (!memcmp(zbuf, inline_data, MAX_INLINE_DATA)) {
- kfree(zbuf);
- return 0;
+ set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
+ set_raw_inline(F2FS_I(inode), F2FS_INODE(ipage));
+ set_page_dirty(ipage);
+ return;
+ }
}
- kfree(zbuf);
-
- f2fs_wait_on_page_writeback(ipage, NODE);
- set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
-
- ri = F2FS_INODE(ipage);
- set_raw_inline(F2FS_I(inode), ri);
- set_page_dirty(ipage);
- return 0;
+ return;
}
static int do_read_inode(struct inode *inode)
@@ -98,7 +92,6 @@ static int do_read_inode(struct inode *inode)
struct f2fs_inode_info *fi = F2FS_I(inode);
struct page *node_page;
struct f2fs_inode *ri;
- int err = 0;
/* Check if ino is within scope */
if (check_nid_range(sbi, inode->i_ino)) {
@@ -142,7 +135,7 @@ static int do_read_inode(struct inode *inode)
/* check data exist */
if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode))
- err = __recover_inline_status(inode, node_page);
+ __recover_inline_status(inode, node_page);
/* get rdev by using inline_info */
__get_inode_rdev(inode, ri);
@@ -152,7 +145,7 @@ static int do_read_inode(struct inode *inode)
stat_inc_inline_inode(inode);
stat_inc_inline_dir(inode);
- return err;
+ return 0;
}
struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
@@ -304,7 +297,7 @@ void f2fs_evict_inode(struct inode *inode)
nid_t xnid = F2FS_I(inode)->i_xattr_nid;
/* some remained atomic pages should discarded */
- if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
+ if (f2fs_is_atomic_file(inode))
commit_inmem_pages(inode, true);
trace_f2fs_evict_inode(inode);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 547a2deeb1ac..e79639a9787a 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -299,7 +299,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
inode->i_op = &f2fs_dir_inode_operations;
inode->i_fop = &f2fs_dir_operations;
inode->i_mapping->a_ops = &f2fs_dblock_aops;
- mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO);
set_inode_flag(F2FS_I(inode), FI_INC_LINK);
f2fs_lock_op(sbi);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index f83326ca32ef..97bd9d3db882 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -19,6 +19,7 @@
#include "f2fs.h"
#include "node.h"
#include "segment.h"
+#include "trace.h"
#include <trace/events/f2fs.h>
#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock)
@@ -57,12 +58,13 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
} else if (type == INO_ENTRIES) {
int i;
- if (sbi->sb->s_bdi->dirty_exceeded)
- return false;
for (i = 0; i <= UPDATE_INO; i++)
mem_size += (sbi->im[i].ino_num *
sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT;
res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
+ } else {
+ if (sbi->sb->s_bdi->dirty_exceeded)
+ return false;
}
return res;
}
@@ -268,7 +270,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
e = __lookup_nat_cache(nm_i, ni->nid);
if (!e) {
e = grab_nat_entry(nm_i, ni->nid);
- e->ni = *ni;
+ copy_node_info(&e->ni, ni);
f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR);
} else if (new_blkaddr == NEW_ADDR) {
/*
@@ -276,7 +278,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
* previous nat entry can be remained in nat cache.
* So, reinitialize it with new information.
*/
- e->ni = *ni;
+ copy_node_info(&e->ni, ni);
f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR);
}
@@ -346,7 +348,6 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
struct nat_entry *e;
int i;
- memset(&ne, 0, sizeof(struct f2fs_nat_entry));
ni->nid = nid;
/* Check nat cache */
@@ -361,6 +362,8 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
if (e)
return;
+ memset(&ne, 0, sizeof(struct f2fs_nat_entry));
+
/* Check current segment summary */
mutex_lock(&curseg->curseg_mutex);
i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0);
@@ -471,7 +474,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
struct page *npage[4];
- struct page *parent;
+ struct page *parent = NULL;
int offset[4];
unsigned int noffset[4];
nid_t nids[4];
@@ -488,6 +491,14 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
if (IS_ERR(npage[0]))
return PTR_ERR(npage[0]);
}
+
+ /* if inline_data is set, should not report any block indices */
+ if (f2fs_has_inline_data(dn->inode) && index) {
+ err = -EINVAL;
+ f2fs_put_page(npage[0], 1);
+ goto release_out;
+ }
+
parent = npage[0];
if (level != 0)
nids[1] = get_nid(parent, offset[0], true);
@@ -585,7 +596,7 @@ static void truncate_node(struct dnode_of_data *dn)
}
invalidate:
clear_node_page_dirty(dn->node_page);
- F2FS_SET_SB_DIRT(sbi);
+ set_sbi_flag(sbi, SBI_IS_DIRTY);
f2fs_put_page(dn->node_page, 1);
@@ -976,6 +987,10 @@ static int read_node_page(struct page *page, int rw)
{
struct f2fs_sb_info *sbi = F2FS_P_SB(page);
struct node_info ni;
+ struct f2fs_io_info fio = {
+ .type = NODE,
+ .rw = rw,
+ };
get_node_info(sbi, page->index, &ni);
@@ -987,7 +1002,8 @@ static int read_node_page(struct page *page, int rw)
if (PageUptodate(page))
return LOCKED_PAGE;
- return f2fs_submit_page_bio(sbi, page, ni.blk_addr, rw);
+ fio.blk_addr = ni.blk_addr;
+ return f2fs_submit_page_bio(sbi, page, &fio);
}
/*
@@ -1028,11 +1044,11 @@ repeat:
err = read_node_page(page, READ_SYNC);
if (err < 0)
return ERR_PTR(err);
- else if (err == LOCKED_PAGE)
- goto got_it;
+ else if (err != LOCKED_PAGE)
+ lock_page(page);
- lock_page(page);
if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) {
+ ClearPageUptodate(page);
f2fs_put_page(page, 1);
return ERR_PTR(-EIO);
}
@@ -1040,7 +1056,6 @@ repeat:
f2fs_put_page(page, 1);
goto repeat;
}
-got_it:
return page;
}
@@ -1268,7 +1283,6 @@ static int f2fs_write_node_page(struct page *page,
{
struct f2fs_sb_info *sbi = F2FS_P_SB(page);
nid_t nid;
- block_t new_addr;
struct node_info ni;
struct f2fs_io_info fio = {
.type = NODE,
@@ -1277,7 +1291,7 @@ static int f2fs_write_node_page(struct page *page,
trace_f2fs_writepage(page, NODE);
- if (unlikely(sbi->por_doing))
+ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
goto redirty_out;
if (unlikely(f2fs_cp_error(sbi)))
goto redirty_out;
@@ -1303,9 +1317,11 @@ static int f2fs_write_node_page(struct page *page,
} else {
down_read(&sbi->node_write);
}
+
set_page_writeback(page);
- write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr);
- set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page));
+ fio.blk_addr = ni.blk_addr;
+ write_node_page(sbi, page, nid, &fio);
+ set_node_addr(sbi, &ni, fio.blk_addr, is_fsync_dnode(page));
dec_page_count(sbi, F2FS_DIRTY_NODES);
up_read(&sbi->node_write);
unlock_page(page);
@@ -1355,26 +1371,12 @@ static int f2fs_set_node_page_dirty(struct page *page)
__set_page_dirty_nobuffers(page);
inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
SetPagePrivate(page);
+ f2fs_trace_pid(page);
return 1;
}
return 0;
}
-static void f2fs_invalidate_node_page(struct page *page, unsigned int offset,
- unsigned int length)
-{
- struct inode *inode = page->mapping->host;
- if (PageDirty(page))
- dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_NODES);
- ClearPagePrivate(page);
-}
-
-static int f2fs_release_node_page(struct page *page, gfp_t wait)
-{
- ClearPagePrivate(page);
- return 1;
-}
-
/*
* Structure of the f2fs node operations
*/
@@ -1382,8 +1384,8 @@ const struct address_space_operations f2fs_node_aops = {
.writepage = f2fs_write_node_page,
.writepages = f2fs_write_node_pages,
.set_page_dirty = f2fs_set_node_page_dirty,
- .invalidatepage = f2fs_invalidate_node_page,
- .releasepage = f2fs_release_node_page,
+ .invalidatepage = f2fs_invalidate_page,
+ .releasepage = f2fs_release_page,
};
static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
@@ -1726,80 +1728,41 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
return 0;
}
-/*
- * ra_sum_pages() merge contiguous pages into one bio and submit.
- * these pre-read pages are allocated in bd_inode's mapping tree.
- */
-static int ra_sum_pages(struct f2fs_sb_info *sbi, struct page **pages,
- int start, int nrpages)
-{
- struct inode *inode = sbi->sb->s_bdev->bd_inode;
- struct address_space *mapping = inode->i_mapping;
- int i, page_idx = start;
- struct f2fs_io_info fio = {
- .type = META,
- .rw = READ_SYNC | REQ_META | REQ_PRIO
- };
-
- for (i = 0; page_idx < start + nrpages; page_idx++, i++) {
- /* alloc page in bd_inode for reading node summary info */
- pages[i] = grab_cache_page(mapping, page_idx);
- if (!pages[i])
- break;
- f2fs_submit_page_mbio(sbi, pages[i], page_idx, &fio);
- }
-
- f2fs_submit_merged_bio(sbi, META, READ);
- return i;
-}
-
int restore_node_summary(struct f2fs_sb_info *sbi,
unsigned int segno, struct f2fs_summary_block *sum)
{
struct f2fs_node *rn;
struct f2fs_summary *sum_entry;
- struct inode *inode = sbi->sb->s_bdev->bd_inode;
block_t addr;
int bio_blocks = MAX_BIO_BLOCKS(sbi);
- struct page *pages[bio_blocks];
- int i, idx, last_offset, nrpages, err = 0;
+ int i, idx, last_offset, nrpages;
/* scan the node segment */
last_offset = sbi->blocks_per_seg;
addr = START_BLOCK(sbi, segno);
sum_entry = &sum->entries[0];
- for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) {
+ for (i = 0; i < last_offset; i += nrpages, addr += nrpages) {
nrpages = min(last_offset - i, bio_blocks);
/* readahead node pages */
- nrpages = ra_sum_pages(sbi, pages, addr, nrpages);
- if (!nrpages)
- return -ENOMEM;
+ ra_meta_pages(sbi, addr, nrpages, META_POR);
- for (idx = 0; idx < nrpages; idx++) {
- if (err)
- goto skip;
+ for (idx = addr; idx < addr + nrpages; idx++) {
+ struct page *page = get_meta_page(sbi, idx);
- lock_page(pages[idx]);
- if (unlikely(!PageUptodate(pages[idx]))) {
- err = -EIO;
- } else {
- rn = F2FS_NODE(pages[idx]);
- sum_entry->nid = rn->footer.nid;
- sum_entry->version = 0;
- sum_entry->ofs_in_node = 0;
- sum_entry++;
- }
- unlock_page(pages[idx]);
-skip:
- page_cache_release(pages[idx]);
+ rn = F2FS_NODE(page);
+ sum_entry->nid = rn->footer.nid;
+ sum_entry->version = 0;
+ sum_entry->ofs_in_node = 0;
+ sum_entry++;
+ f2fs_put_page(page, 1);
}
- invalidate_mapping_pages(inode->i_mapping, addr,
+ invalidate_mapping_pages(META_MAPPING(sbi), addr,
addr + nrpages);
}
- return err;
+ return 0;
}
static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
@@ -1923,7 +1886,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
struct f2fs_summary_block *sum = curseg->sum_blk;
- struct nat_entry_set *setvec[NATVEC_SIZE];
+ struct nat_entry_set *setvec[SETVEC_SIZE];
struct nat_entry_set *set, *tmp;
unsigned int found;
nid_t set_idx = 0;
@@ -1940,7 +1903,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
remove_nats_in_journal(sbi);
while ((found = __gang_lookup_nat_set(nm_i,
- set_idx, NATVEC_SIZE, setvec))) {
+ set_idx, SETVEC_SIZE, setvec))) {
unsigned idx;
set_idx = setvec[found - 1]->set + 1;
for (idx = 0; idx < found; idx++)
@@ -2020,6 +1983,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
struct f2fs_nm_info *nm_i = NM_I(sbi);
struct free_nid *i, *next_i;
struct nat_entry *natvec[NATVEC_SIZE];
+ struct nat_entry_set *setvec[SETVEC_SIZE];
nid_t nid = 0;
unsigned int found;
@@ -2044,11 +2008,27 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
while ((found = __gang_lookup_nat_cache(nm_i,
nid, NATVEC_SIZE, natvec))) {
unsigned idx;
+
nid = nat_get_nid(natvec[found - 1]) + 1;
for (idx = 0; idx < found; idx++)
__del_from_nat_cache(nm_i, natvec[idx]);
}
f2fs_bug_on(sbi, nm_i->nat_cnt);
+
+ /* destroy nat set cache */
+ nid = 0;
+ while ((found = __gang_lookup_nat_set(nm_i,
+ nid, SETVEC_SIZE, setvec))) {
+ unsigned idx;
+
+ nid = setvec[found - 1]->set + 1;
+ for (idx = 0; idx < found; idx++) {
+ /* entry_cnt is not zero, when cp_error was occurred */
+ f2fs_bug_on(sbi, !list_empty(&setvec[idx]->entry_list));
+ radix_tree_delete(&nm_i->nat_set_root, setvec[idx]->set);
+ kmem_cache_free(nat_entry_set_slab, setvec[idx]);
+ }
+ }
up_write(&nm_i->nat_tree_lock);
kfree(nm_i->nat_bitmap);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index d10b6448a671..f405bbf2435a 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -25,10 +25,19 @@
/* vector size for gang look-up from nat cache that consists of radix tree */
#define NATVEC_SIZE 64
+#define SETVEC_SIZE 32
/* return value for read_node_page */
#define LOCKED_PAGE 1
+/* For flag in struct node_info */
+enum {
+ IS_CHECKPOINTED, /* is it checkpointed before? */
+ HAS_FSYNCED_INODE, /* is the inode fsynced before? */
+ HAS_LAST_FSYNC, /* has the latest node fsync mark? */
+ IS_DIRTY, /* this nat entry is dirty? */
+};
+
/*
* For node information
*/
@@ -37,18 +46,11 @@ struct node_info {
nid_t ino; /* inode number of the node's owner */
block_t blk_addr; /* block address of the node */
unsigned char version; /* version of the node */
-};
-
-enum {
- IS_CHECKPOINTED, /* is it checkpointed before? */
- HAS_FSYNCED_INODE, /* is the inode fsynced before? */
- HAS_LAST_FSYNC, /* has the latest node fsync mark? */
- IS_DIRTY, /* this nat entry is dirty? */
+ unsigned char flag; /* for node information bits */
};
struct nat_entry {
struct list_head list; /* for clean or dirty nat list */
- unsigned char flag; /* for node information bits */
struct node_info ni; /* in-memory node information */
};
@@ -63,20 +65,30 @@ struct nat_entry {
#define inc_node_version(version) (++version)
+static inline void copy_node_info(struct node_info *dst,
+ struct node_info *src)
+{
+ dst->nid = src->nid;
+ dst->ino = src->ino;
+ dst->blk_addr = src->blk_addr;
+ dst->version = src->version;
+ /* should not copy flag here */
+}
+
static inline void set_nat_flag(struct nat_entry *ne,
unsigned int type, bool set)
{
unsigned char mask = 0x01 << type;
if (set)
- ne->flag |= mask;
+ ne->ni.flag |= mask;
else
- ne->flag &= ~mask;
+ ne->ni.flag &= ~mask;
}
static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type)
{
unsigned char mask = 0x01 << type;
- return ne->flag & mask;
+ return ne->ni.flag & mask;
}
static inline void nat_reset_flag(struct nat_entry *ne)
@@ -108,6 +120,7 @@ enum mem_type {
NAT_ENTRIES, /* indicates the cached nat entry */
DIRTY_DENTS, /* indicates dirty dentry pages */
INO_ENTRIES, /* indicates inode entries */
+ BASE_CHECK, /* check kernel status */
};
struct nat_entry_set {
@@ -200,11 +213,19 @@ static inline void fill_node_footer(struct page *page, nid_t nid,
nid_t ino, unsigned int ofs, bool reset)
{
struct f2fs_node *rn = F2FS_NODE(page);
+ unsigned int old_flag = 0;
+
if (reset)
memset(rn, 0, sizeof(*rn));
+ else
+ old_flag = le32_to_cpu(rn->footer.flag);
+
rn->footer.nid = cpu_to_le32(nid);
rn->footer.ino = cpu_to_le32(ino);
- rn->footer.flag = cpu_to_le32(ofs << OFFSET_BIT_SHIFT);
+
+ /* should remain old flag bits such as COLD_BIT_SHIFT */
+ rn->footer.flag = cpu_to_le32((ofs << OFFSET_BIT_SHIFT) |
+ (old_flag & OFFSET_BIT_MASK));
}
static inline void copy_node_footer(struct page *dst, struct page *src)
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 9160a37e1c7a..41afb9534bbd 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -346,6 +346,10 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
if (IS_INODE(page)) {
recover_inline_xattr(inode, page);
} else if (f2fs_has_xattr_block(ofs_of_node(page))) {
+ /*
+ * Deprecated; xattr blocks should be found from cold log.
+ * But, we should remain this for backward compatibility.
+ */
recover_xattr_data(inode, page, blkaddr);
goto out;
}
@@ -396,7 +400,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
/* write dummy data page */
recover_data_page(sbi, NULL, &sum, src, dest);
- update_extent_cache(dest, &dn);
+ dn.data_blkaddr = dest;
+ update_extent_cache(&dn);
recovered++;
}
dn.ofs_in_node++;
@@ -503,7 +508,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
INIT_LIST_HEAD(&inode_list);
/* step #1: find fsynced inode numbers */
- sbi->por_doing = true;
+ set_sbi_flag(sbi, SBI_POR_DOING);
/* prevent checkpoint */
mutex_lock(&sbi->cp_mutex);
@@ -536,7 +541,7 @@ out:
truncate_inode_pages_final(META_MAPPING(sbi));
}
- sbi->por_doing = false;
+ clear_sbi_flag(sbi, SBI_POR_DOING);
if (err) {
discard_next_dnode(sbi, blkaddr);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 42607a679923..daee4ab913da 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -20,6 +20,7 @@
#include "f2fs.h"
#include "segment.h"
#include "node.h"
+#include "trace.h"
#include <trace/events/f2fs.h>
#define __reverse_ffz(x) __reverse_ffs(~(x))
@@ -181,6 +182,7 @@ void register_inmem_page(struct inode *inode, struct page *page)
int err;
SetPagePrivate(page);
+ f2fs_trace_pid(page);
new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
@@ -205,23 +207,6 @@ retry:
mutex_unlock(&fi->inmem_lock);
}
-void invalidate_inmem_page(struct inode *inode, struct page *page)
-{
- struct f2fs_inode_info *fi = F2FS_I(inode);
- struct inmem_pages *cur;
-
- mutex_lock(&fi->inmem_lock);
- cur = radix_tree_lookup(&fi->inmem_root, page->index);
- if (cur) {
- radix_tree_delete(&fi->inmem_root, cur->page->index);
- f2fs_put_page(cur->page, 0);
- list_del(&cur->list);
- kmem_cache_free(inmem_entry_slab, cur);
- dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
- }
- mutex_unlock(&fi->inmem_lock);
-}
-
void commit_inmem_pages(struct inode *inode, bool abort)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -230,7 +215,7 @@ void commit_inmem_pages(struct inode *inode, bool abort)
bool submit_bio = false;
struct f2fs_io_info fio = {
.type = DATA,
- .rw = WRITE_SYNC,
+ .rw = WRITE_SYNC | REQ_PRIO,
};
/*
@@ -240,33 +225,38 @@ void commit_inmem_pages(struct inode *inode, bool abort)
* Otherwise, f2fs_gc in f2fs_balance_fs can wait forever until this
* inode becomes free by iget_locked in f2fs_iget.
*/
- if (!abort)
+ if (!abort) {
f2fs_balance_fs(sbi);
-
- f2fs_lock_op(sbi);
+ f2fs_lock_op(sbi);
+ }
mutex_lock(&fi->inmem_lock);
list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
- lock_page(cur->page);
- if (!abort && cur->page->mapping == inode->i_mapping) {
- f2fs_wait_on_page_writeback(cur->page, DATA);
- if (clear_page_dirty_for_io(cur->page))
- inode_dec_dirty_pages(inode);
- do_write_data_page(cur->page, &fio);
- submit_bio = true;
+ if (!abort) {
+ lock_page(cur->page);
+ if (cur->page->mapping == inode->i_mapping) {
+ f2fs_wait_on_page_writeback(cur->page, DATA);
+ if (clear_page_dirty_for_io(cur->page))
+ inode_dec_dirty_pages(inode);
+ do_write_data_page(cur->page, &fio);
+ submit_bio = true;
+ }
+ f2fs_put_page(cur->page, 1);
+ } else {
+ put_page(cur->page);
}
radix_tree_delete(&fi->inmem_root, cur->page->index);
- f2fs_put_page(cur->page, 1);
list_del(&cur->list);
kmem_cache_free(inmem_entry_slab, cur);
dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
}
- if (submit_bio)
- f2fs_submit_merged_bio(sbi, DATA, WRITE);
mutex_unlock(&fi->inmem_lock);
- filemap_fdatawait_range(inode->i_mapping, 0, LLONG_MAX);
- f2fs_unlock_op(sbi);
+ if (!abort) {
+ f2fs_unlock_op(sbi);
+ if (submit_bio)
+ f2fs_submit_merged_bio(sbi, DATA, WRITE);
+ }
}
/*
@@ -290,7 +280,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
/* check the # of cached NAT entries and prefree segments */
if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) ||
excess_prefree_segs(sbi) ||
- available_free_memory(sbi, INO_ENTRIES))
+ !available_free_memory(sbi, INO_ENTRIES))
f2fs_sync_fs(sbi->sb, true);
}
@@ -515,12 +505,13 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start);
unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
- unsigned long dmap[entries];
+ unsigned long *dmap = SIT_I(sbi)->tmp_map;
unsigned int start = 0, end = -1;
bool force = (cpc->reason == CP_DISCARD);
int i;
- if (!force && !test_opt(sbi, DISCARD))
+ if (!force && (!test_opt(sbi, DISCARD) ||
+ SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards))
return;
if (force && !se->valid_blocks) {
@@ -548,7 +539,8 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */
for (i = 0; i < entries; i++)
- dmap[i] = ~(cur_map[i] | ckpt_map[i]);
+ dmap[i] = force ? ~ckpt_map[i] :
+ (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i];
while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) {
start = __find_rev_next_bit(dmap, max_blocks, end + 1);
@@ -735,7 +727,7 @@ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
/*
* Calculate the number of current summary pages for writing
*/
-int npages_for_summary_flush(struct f2fs_sb_info *sbi)
+int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra)
{
int valid_sum_count = 0;
int i, sum_in_page;
@@ -743,8 +735,13 @@ int npages_for_summary_flush(struct f2fs_sb_info *sbi)
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
if (sbi->ckpt->alloc_type[i] == SSR)
valid_sum_count += sbi->blocks_per_seg;
- else
- valid_sum_count += curseg_blkoff(sbi, i);
+ else {
+ if (for_ra)
+ valid_sum_count += le16_to_cpu(
+ F2FS_CKPT(sbi)->cur_data_blkoff[i]);
+ else
+ valid_sum_count += curseg_blkoff(sbi, i);
+ }
}
sum_in_page = (PAGE_CACHE_SIZE - 2 * SUM_JOURNAL_SIZE -
@@ -803,7 +800,7 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
int go_left = 0;
int i;
- write_lock(&free_i->segmap_lock);
+ spin_lock(&free_i->segmap_lock);
if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
segno = find_next_zero_bit(free_i->free_segmap,
@@ -876,7 +873,7 @@ got_it:
f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap));
__set_inuse(sbi, segno);
*newseg = segno;
- write_unlock(&free_i->segmap_lock);
+ spin_unlock(&free_i->segmap_lock);
}
static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
@@ -927,7 +924,7 @@ static void __next_free_blkoff(struct f2fs_sb_info *sbi,
{
struct seg_entry *se = get_seg_entry(sbi, seg->segno);
int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
- unsigned long target_map[entries];
+ unsigned long *target_map = SIT_I(sbi)->tmp_map;
unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
int i, pos;
@@ -1027,18 +1024,22 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
stat_inc_seg_type(sbi, curseg);
}
+static void __allocate_new_segments(struct f2fs_sb_info *sbi, int type)
+{
+ struct curseg_info *curseg = CURSEG_I(sbi, type);
+ unsigned int old_segno;
+
+ old_segno = curseg->segno;
+ SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true);
+ locate_dirty_segment(sbi, old_segno);
+}
+
void allocate_new_segments(struct f2fs_sb_info *sbi)
{
- struct curseg_info *curseg;
- unsigned int old_curseg;
int i;
- for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
- curseg = CURSEG_I(sbi, i);
- old_curseg = curseg->segno;
- SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
- locate_dirty_segment(sbi, old_curseg);
- }
+ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++)
+ __allocate_new_segments(sbi, i);
}
static const struct segment_allocation default_salloc_ops = {
@@ -1047,8 +1048,8 @@ static const struct segment_allocation default_salloc_ops = {
int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
{
- __u64 start = range->start >> sbi->log_blocksize;
- __u64 end = start + (range->len >> sbi->log_blocksize) - 1;
+ __u64 start = F2FS_BYTES_TO_BLK(range->start);
+ __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1;
unsigned int start_segno, end_segno;
struct cp_control cpc;
@@ -1065,16 +1066,21 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 :
GET_SEGNO(sbi, end);
cpc.reason = CP_DISCARD;
- cpc.trim_start = start_segno;
- cpc.trim_end = end_segno;
- cpc.trim_minlen = range->minlen >> sbi->log_blocksize;
+ cpc.trim_minlen = F2FS_BYTES_TO_BLK(range->minlen);
/* do checkpoint to issue discard commands safely */
- mutex_lock(&sbi->gc_mutex);
- write_checkpoint(sbi, &cpc);
- mutex_unlock(&sbi->gc_mutex);
+ for (; start_segno <= end_segno; start_segno = cpc.trim_end + 1) {
+ cpc.trim_start = start_segno;
+ cpc.trim_end = min_t(unsigned int, rounddown(start_segno +
+ BATCHED_TRIM_SEGMENTS(sbi),
+ sbi->segs_per_sec) - 1, end_segno);
+
+ mutex_lock(&sbi->gc_mutex);
+ write_checkpoint(sbi, &cpc);
+ mutex_unlock(&sbi->gc_mutex);
+ }
out:
- range->len = cpc.trimmed << sbi->log_blocksize;
+ range->len = F2FS_BLK_TO_BYTES(cpc.trimmed);
return 0;
}
@@ -1151,11 +1157,18 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
{
struct sit_info *sit_i = SIT_I(sbi);
struct curseg_info *curseg;
+ bool direct_io = (type == CURSEG_DIRECT_IO);
+
+ type = direct_io ? CURSEG_WARM_DATA : type;
curseg = CURSEG_I(sbi, type);
mutex_lock(&curseg->curseg_mutex);
+ /* direct_io'ed data is aligned to the segment for better performance */
+ if (direct_io && curseg->next_blkoff)
+ __allocate_new_segments(sbi, type);
+
*new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
/*
@@ -1187,39 +1200,39 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
}
static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
- block_t old_blkaddr, block_t *new_blkaddr,
- struct f2fs_summary *sum, struct f2fs_io_info *fio)
+ struct f2fs_summary *sum,
+ struct f2fs_io_info *fio)
{
int type = __get_segment_type(page, fio->type);
- allocate_data_block(sbi, page, old_blkaddr, new_blkaddr, sum, type);
+ allocate_data_block(sbi, page, fio->blk_addr, &fio->blk_addr, sum, type);
/* writeout dirty page into bdev */
- f2fs_submit_page_mbio(sbi, page, *new_blkaddr, fio);
+ f2fs_submit_page_mbio(sbi, page, fio);
}
void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
{
struct f2fs_io_info fio = {
.type = META,
- .rw = WRITE_SYNC | REQ_META | REQ_PRIO
+ .rw = WRITE_SYNC | REQ_META | REQ_PRIO,
+ .blk_addr = page->index,
};
set_page_writeback(page);
- f2fs_submit_page_mbio(sbi, page, page->index, &fio);
+ f2fs_submit_page_mbio(sbi, page, &fio);
}
void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
- struct f2fs_io_info *fio,
- unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr)
+ unsigned int nid, struct f2fs_io_info *fio)
{
struct f2fs_summary sum;
set_summary(&sum, nid, 0, 0);
- do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, fio);
+ do_write_page(sbi, page, &sum, fio);
}
void write_data_page(struct page *page, struct dnode_of_data *dn,
- block_t *new_blkaddr, struct f2fs_io_info *fio)
+ struct f2fs_io_info *fio)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
struct f2fs_summary sum;
@@ -1228,14 +1241,14 @@ void write_data_page(struct page *page, struct dnode_of_data *dn,
f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR);
get_node_info(sbi, dn->nid, &ni);
set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
-
- do_write_page(sbi, page, dn->data_blkaddr, new_blkaddr, &sum, fio);
+ do_write_page(sbi, page, &sum, fio);
+ dn->data_blkaddr = fio->blk_addr;
}
-void rewrite_data_page(struct page *page, block_t old_blkaddr,
- struct f2fs_io_info *fio)
+void rewrite_data_page(struct page *page, struct f2fs_io_info *fio)
{
- f2fs_submit_page_mbio(F2FS_P_SB(page), page, old_blkaddr, fio);
+ stat_inc_inplace_blocks(F2FS_P_SB(page));
+ f2fs_submit_page_mbio(F2FS_P_SB(page), page, fio);
}
void recover_data_page(struct f2fs_sb_info *sbi,
@@ -1393,7 +1406,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
segno = le32_to_cpu(ckpt->cur_data_segno[type]);
blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type -
CURSEG_HOT_DATA]);
- if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG))
+ if (__exist_node_summaries(sbi))
blk_addr = sum_blk_addr(sbi, NR_CURSEG_TYPE, type);
else
blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type);
@@ -1402,7 +1415,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
CURSEG_HOT_NODE]);
blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type -
CURSEG_HOT_NODE]);
- if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG))
+ if (__exist_node_summaries(sbi))
blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE,
type - CURSEG_HOT_NODE);
else
@@ -1413,7 +1426,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
sum = (struct f2fs_summary_block *)page_address(new);
if (IS_NODESEG(type)) {
- if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) {
+ if (__exist_node_summaries(sbi)) {
struct f2fs_summary *ns = &sum->entries[0];
int i;
for (i = 0; i < sbi->blocks_per_seg; i++, ns++) {
@@ -1450,12 +1463,22 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
int err;
if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) {
+ int npages = npages_for_summary_flush(sbi, true);
+
+ if (npages >= 2)
+ ra_meta_pages(sbi, start_sum_block(sbi), npages,
+ META_CP);
+
/* restore for compacted data summary */
if (read_compacted_summaries(sbi))
return -EINVAL;
type = CURSEG_HOT_NODE;
}
+ if (__exist_node_summaries(sbi))
+ ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type),
+ NR_CURSEG_TYPE - type, META_CP);
+
for (; type <= CURSEG_COLD_NODE; type++) {
err = read_normal_summaries(sbi, type);
if (err)
@@ -1549,8 +1572,7 @@ void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
{
- if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG))
- write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
+ write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
}
int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type,
@@ -1754,7 +1776,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
se = get_seg_entry(sbi, segno);
/* add discard candidates */
- if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) {
+ if (cpc->reason != CP_DISCARD) {
cpc->trim_start = segno;
add_discard_addrs(sbi, cpc);
}
@@ -1833,6 +1855,10 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
return -ENOMEM;
}
+ sit_i->tmp_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
+ if (!sit_i->tmp_map)
+ return -ENOMEM;
+
if (sbi->segs_per_sec > 1) {
sit_i->sec_entries = vzalloc(MAIN_SECS(sbi) *
sizeof(struct sec_entry));
@@ -1897,7 +1923,7 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi));
free_i->free_segments = 0;
free_i->free_sections = 0;
- rwlock_init(&free_i->segmap_lock);
+ spin_lock_init(&free_i->segmap_lock);
return 0;
}
@@ -2110,6 +2136,8 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
sm_info->nr_discards = 0;
sm_info->max_discards = 0;
+ sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS;
+
INIT_LIST_HEAD(&sm_info->sit_entry_set);
if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) {
@@ -2212,6 +2240,8 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi)
kfree(sit_i->sentries[start].ckpt_valid_map);
}
}
+ kfree(sit_i->tmp_map);
+
vfree(sit_i->sentries);
vfree(sit_i->sec_entries);
kfree(sit_i->dirty_sentries_bitmap);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 7f327c0ba4e3..7fd35111cf62 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -189,6 +189,7 @@ struct sit_info {
char *sit_bitmap; /* SIT bitmap pointer */
unsigned int bitmap_size; /* SIT bitmap size */
+ unsigned long *tmp_map; /* bitmap for temporal use */
unsigned long *dirty_sentries_bitmap; /* bitmap for dirty sentries */
unsigned int dirty_sentries; /* # of dirty sentries */
unsigned int sents_per_block; /* # of SIT entries per block */
@@ -207,7 +208,7 @@ struct free_segmap_info {
unsigned int start_segno; /* start segment number logically */
unsigned int free_segments; /* # of free segments */
unsigned int free_sections; /* # of free sections */
- rwlock_t segmap_lock; /* free segmap lock */
+ spinlock_t segmap_lock; /* free segmap lock */
unsigned long *free_segmap; /* free segment bitmap */
unsigned long *free_secmap; /* free section bitmap */
};
@@ -318,9 +319,9 @@ static inline unsigned int find_next_inuse(struct free_segmap_info *free_i,
unsigned int max, unsigned int segno)
{
unsigned int ret;
- read_lock(&free_i->segmap_lock);
+ spin_lock(&free_i->segmap_lock);
ret = find_next_bit(free_i->free_segmap, max, segno);
- read_unlock(&free_i->segmap_lock);
+ spin_unlock(&free_i->segmap_lock);
return ret;
}
@@ -331,7 +332,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
unsigned int start_segno = secno * sbi->segs_per_sec;
unsigned int next;
- write_lock(&free_i->segmap_lock);
+ spin_lock(&free_i->segmap_lock);
clear_bit(segno, free_i->free_segmap);
free_i->free_segments++;
@@ -340,7 +341,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
clear_bit(secno, free_i->free_secmap);
free_i->free_sections++;
}
- write_unlock(&free_i->segmap_lock);
+ spin_unlock(&free_i->segmap_lock);
}
static inline void __set_inuse(struct f2fs_sb_info *sbi,
@@ -362,7 +363,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
unsigned int start_segno = secno * sbi->segs_per_sec;
unsigned int next;
- write_lock(&free_i->segmap_lock);
+ spin_lock(&free_i->segmap_lock);
if (test_and_clear_bit(segno, free_i->free_segmap)) {
free_i->free_segments++;
@@ -373,7 +374,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
free_i->free_sections++;
}
}
- write_unlock(&free_i->segmap_lock);
+ spin_unlock(&free_i->segmap_lock);
}
static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi,
@@ -381,13 +382,13 @@ static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi,
{
struct free_segmap_info *free_i = FREE_I(sbi);
unsigned int secno = segno / sbi->segs_per_sec;
- write_lock(&free_i->segmap_lock);
+ spin_lock(&free_i->segmap_lock);
if (!test_and_set_bit(segno, free_i->free_segmap)) {
free_i->free_segments--;
if (!test_and_set_bit(secno, free_i->free_secmap))
free_i->free_sections--;
}
- write_unlock(&free_i->segmap_lock);
+ spin_unlock(&free_i->segmap_lock);
}
static inline void get_sit_bitmap(struct f2fs_sb_info *sbi,
@@ -460,7 +461,7 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
- if (unlikely(sbi->por_doing))
+ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
return false;
return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs +
@@ -599,13 +600,13 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
{
if (segno > TOTAL_SEGS(sbi) - 1)
- sbi->need_fsck = true;
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
}
static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
{
if (blk_addr < SEG0_BLKADDR(sbi) || blk_addr >= MAX_BLKADDR(sbi))
- sbi->need_fsck = true;
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
}
/*
@@ -616,11 +617,11 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
{
/* check segment usage */
if (GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg)
- sbi->need_fsck = true;
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
/* check boundary of a given segment number */
if (segno > TOTAL_SEGS(sbi) - 1)
- sbi->need_fsck = true;
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
}
#endif
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index f71421d70475..f2fe666a6ea9 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -30,6 +30,7 @@
#include "segment.h"
#include "xattr.h"
#include "gc.h"
+#include "trace.h"
#define CREATE_TRACE_POINTS
#include <trace/events/f2fs.h>
@@ -41,6 +42,7 @@ static struct kset *f2fs_kset;
enum {
Opt_gc_background,
Opt_disable_roll_forward,
+ Opt_norecovery,
Opt_discard,
Opt_noheap,
Opt_user_xattr,
@@ -61,6 +63,7 @@ enum {
static match_table_t f2fs_tokens = {
{Opt_gc_background, "background_gc=%s"},
{Opt_disable_roll_forward, "disable_roll_forward"},
+ {Opt_norecovery, "norecovery"},
{Opt_discard, "discard"},
{Opt_noheap, "no_heap"},
{Opt_user_xattr, "user_xattr"},
@@ -192,6 +195,7 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time);
F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
@@ -207,6 +211,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(gc_idle),
ATTR_LIST(reclaim_segments),
ATTR_LIST(max_small_discards),
+ ATTR_LIST(batched_trim_sections),
ATTR_LIST(ipu_policy),
ATTR_LIST(min_ipu_util),
ATTR_LIST(min_fsync_blocks),
@@ -286,6 +291,12 @@ static int parse_options(struct super_block *sb, char *options)
case Opt_disable_roll_forward:
set_opt(sbi, DISABLE_ROLL_FORWARD);
break;
+ case Opt_norecovery:
+ /* this option mounts f2fs with ro */
+ set_opt(sbi, DISABLE_ROLL_FORWARD);
+ if (!f2fs_readonly(sb))
+ return -EINVAL;
+ break;
case Opt_discard:
set_opt(sbi, DISCARD);
break;
@@ -446,8 +457,13 @@ static void f2fs_put_super(struct super_block *sb)
f2fs_destroy_stats(sbi);
stop_gc_thread(sbi);
- /* We don't need to do checkpoint when it's clean */
- if (sbi->s_dirty) {
+ /*
+ * We don't need to do checkpoint when superblock is clean.
+ * But, the previous checkpoint was not done by umount, it needs to do
+ * clean checkpoint again.
+ */
+ if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) ||
+ !is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) {
struct cp_control cpc = {
.reason = CP_UMOUNT,
};
@@ -486,13 +502,15 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
if (sync) {
struct cp_control cpc;
- cpc.reason = test_opt(sbi, FASTBOOT) ? CP_UMOUNT : CP_SYNC;
+ cpc.reason = __get_cp_reason(sbi);
+
mutex_lock(&sbi->gc_mutex);
write_checkpoint(sbi, &cpc);
mutex_unlock(&sbi->gc_mutex);
} else {
f2fs_balance_fs(sbi);
}
+ f2fs_trace_ios(NULL, NULL, 1);
return 0;
}
@@ -887,7 +905,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
atomic_set(&sbi->nr_pages[i], 0);
sbi->dir_level = DEF_DIR_LEVEL;
- sbi->need_fsck = false;
+ clear_sbi_flag(sbi, SBI_NEED_FSCK);
}
/*
@@ -942,6 +960,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
struct inode *root;
long err = -EINVAL;
bool retry = true;
+ char *options = NULL;
int i;
try_onemore:
@@ -973,9 +992,15 @@ try_onemore:
set_opt(sbi, POSIX_ACL);
#endif
/* parse mount options */
- err = parse_options(sb, (char *)data);
- if (err)
+ options = kstrdup((const char *)data, GFP_KERNEL);
+ if (data && !options) {
+ err = -ENOMEM;
goto free_sb_buf;
+ }
+
+ err = parse_options(sb, options);
+ if (err)
+ goto free_options;
sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize));
sb->s_max_links = F2FS_LINK_MAX;
@@ -998,7 +1023,7 @@ try_onemore:
mutex_init(&sbi->writepages);
mutex_init(&sbi->cp_mutex);
init_rwsem(&sbi->node_write);
- sbi->por_doing = false;
+ clear_sbi_flag(sbi, SBI_POR_DOING);
spin_lock_init(&sbi->stat_lock);
init_rwsem(&sbi->read_io.io_rwsem);
@@ -1019,7 +1044,7 @@ try_onemore:
if (IS_ERR(sbi->meta_inode)) {
f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode");
err = PTR_ERR(sbi->meta_inode);
- goto free_sb_buf;
+ goto free_options;
}
err = get_valid_checkpoint(sbi);
@@ -1122,10 +1147,19 @@ try_onemore:
goto free_proc;
if (!retry)
- sbi->need_fsck = true;
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
/* recover fsynced data */
if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
+ /*
+ * mount should be failed, when device has readonly mode, and
+ * previous checkpoint was not done by clean system shutdown.
+ */
+ if (bdev_read_only(sb->s_bdev) &&
+ !is_set_ckpt_flags(sbi->ckpt, CP_UMOUNT_FLAG)) {
+ err = -EROFS;
+ goto free_kobj;
+ }
err = recover_fsync_data(sbi);
if (err) {
f2fs_msg(sb, KERN_ERR,
@@ -1144,6 +1178,7 @@ try_onemore:
if (err)
goto free_kobj;
}
+ kfree(options);
return 0;
free_kobj:
@@ -1168,6 +1203,8 @@ free_cp:
free_meta_inode:
make_bad_inode(sbi->meta_inode);
iput(sbi->meta_inode);
+free_options:
+ kfree(options);
free_sb_buf:
brelse(raw_super_buf);
free_sbi:
@@ -1188,11 +1225,18 @@ static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags,
return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super);
}
+static void kill_f2fs_super(struct super_block *sb)
+{
+ if (sb->s_root)
+ set_sbi_flag(F2FS_SB(sb), SBI_IS_CLOSE);
+ kill_block_super(sb);
+}
+
static struct file_system_type f2fs_fs_type = {
.owner = THIS_MODULE,
.name = "f2fs",
.mount = f2fs_mount,
- .kill_sb = kill_block_super,
+ .kill_sb = kill_f2fs_super,
.fs_flags = FS_REQUIRES_DEV,
};
MODULE_ALIAS_FS("f2fs");
@@ -1220,6 +1264,8 @@ static int __init init_f2fs_fs(void)
{
int err;
+ f2fs_build_trace_ios();
+
err = init_inodecache();
if (err)
goto fail;
@@ -1229,12 +1275,9 @@ static int __init init_f2fs_fs(void)
err = create_segment_manager_caches();
if (err)
goto free_node_manager_caches;
- err = create_gc_caches();
- if (err)
- goto free_segment_manager_caches;
err = create_checkpoint_caches();
if (err)
- goto free_gc_caches;
+ goto free_segment_manager_caches;
f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj);
if (!f2fs_kset) {
err = -ENOMEM;
@@ -1251,8 +1294,6 @@ free_kset:
kset_unregister(f2fs_kset);
free_checkpoint_caches:
destroy_checkpoint_caches();
-free_gc_caches:
- destroy_gc_caches();
free_segment_manager_caches:
destroy_segment_manager_caches();
free_node_manager_caches:
@@ -1269,11 +1310,11 @@ static void __exit exit_f2fs_fs(void)
f2fs_destroy_root_stats();
unregister_filesystem(&f2fs_fs_type);
destroy_checkpoint_caches();
- destroy_gc_caches();
destroy_segment_manager_caches();
destroy_node_manager_caches();
destroy_inodecache();
kset_unregister(f2fs_kset);
+ f2fs_destroy_trace_ios();
}
module_init(init_f2fs_fs)
diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c
new file mode 100644
index 000000000000..875aa8179bc1
--- /dev/null
+++ b/fs/f2fs/trace.c
@@ -0,0 +1,159 @@
+/*
+ * f2fs IO tracer
+ *
+ * Copyright (c) 2014 Motorola Mobility
+ * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/fs.h>
+#include <linux/f2fs_fs.h>
+#include <linux/sched.h>
+#include <linux/radix-tree.h>
+
+#include "f2fs.h"
+#include "trace.h"
+
+static RADIX_TREE(pids, GFP_ATOMIC);
+static spinlock_t pids_lock;
+static struct last_io_info last_io;
+
+static inline void __print_last_io(void)
+{
+ if (!last_io.len)
+ return;
+
+ trace_printk("%3x:%3x %4x %-16s %2x %5x %12x %4x\n",
+ last_io.major, last_io.minor,
+ last_io.pid, "----------------",
+ last_io.type,
+ last_io.fio.rw, last_io.fio.blk_addr,
+ last_io.len);
+ memset(&last_io, 0, sizeof(last_io));
+}
+
+static int __file_type(struct inode *inode, pid_t pid)
+{
+ if (f2fs_is_atomic_file(inode))
+ return __ATOMIC_FILE;
+ else if (f2fs_is_volatile_file(inode))
+ return __VOLATILE_FILE;
+ else if (S_ISDIR(inode->i_mode))
+ return __DIR_FILE;
+ else if (inode->i_ino == F2FS_NODE_INO(F2FS_I_SB(inode)))
+ return __NODE_FILE;
+ else if (inode->i_ino == F2FS_META_INO(F2FS_I_SB(inode)))
+ return __META_FILE;
+ else if (pid)
+ return __NORMAL_FILE;
+ else
+ return __MISC_FILE;
+}
+
+void f2fs_trace_pid(struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ pid_t pid = task_pid_nr(current);
+ void *p;
+
+ page->private = pid;
+
+ if (radix_tree_preload(GFP_NOFS))
+ return;
+
+ spin_lock(&pids_lock);
+ p = radix_tree_lookup(&pids, pid);
+ if (p == current)
+ goto out;
+ if (p)
+ radix_tree_delete(&pids, pid);
+
+ f2fs_radix_tree_insert(&pids, pid, current);
+
+ trace_printk("%3x:%3x %4x %-16s\n",
+ MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev),
+ pid, current->comm);
+out:
+ spin_unlock(&pids_lock);
+ radix_tree_preload_end();
+}
+
+void f2fs_trace_ios(struct page *page, struct f2fs_io_info *fio, int flush)
+{
+ struct inode *inode;
+ pid_t pid;
+ int major, minor;
+
+ if (flush) {
+ __print_last_io();
+ return;
+ }
+
+ inode = page->mapping->host;
+ pid = page_private(page);
+
+ major = MAJOR(inode->i_sb->s_dev);
+ minor = MINOR(inode->i_sb->s_dev);
+
+ if (last_io.major == major && last_io.minor == minor &&
+ last_io.pid == pid &&
+ last_io.type == __file_type(inode, pid) &&
+ last_io.fio.rw == fio->rw &&
+ last_io.fio.blk_addr + last_io.len == fio->blk_addr) {
+ last_io.len++;
+ return;
+ }
+
+ __print_last_io();
+
+ last_io.major = major;
+ last_io.minor = minor;
+ last_io.pid = pid;
+ last_io.type = __file_type(inode, pid);
+ last_io.fio = *fio;
+ last_io.len = 1;
+ return;
+}
+
+void f2fs_build_trace_ios(void)
+{
+ spin_lock_init(&pids_lock);
+}
+
+#define PIDVEC_SIZE 128
+static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index,
+ unsigned int max_items)
+{
+ struct radix_tree_iter iter;
+ void **slot;
+ unsigned int ret = 0;
+
+ if (unlikely(!max_items))
+ return 0;
+
+ radix_tree_for_each_slot(slot, &pids, &iter, first_index) {
+ results[ret] = iter.index;
+ if (++ret == PIDVEC_SIZE)
+ break;
+ }
+ return ret;
+}
+
+void f2fs_destroy_trace_ios(void)
+{
+ pid_t pid[PIDVEC_SIZE];
+ pid_t next_pid = 0;
+ unsigned int found;
+
+ spin_lock(&pids_lock);
+ while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) {
+ unsigned idx;
+
+ next_pid = pid[found - 1] + 1;
+ for (idx = 0; idx < found; idx++)
+ radix_tree_delete(&pids, pid[idx]);
+ }
+ spin_unlock(&pids_lock);
+}
diff --git a/fs/f2fs/trace.h b/fs/f2fs/trace.h
new file mode 100644
index 000000000000..1041dbeb52ae
--- /dev/null
+++ b/fs/f2fs/trace.h
@@ -0,0 +1,46 @@
+/*
+ * f2fs IO tracer
+ *
+ * Copyright (c) 2014 Motorola Mobility
+ * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __F2FS_TRACE_H__
+#define __F2FS_TRACE_H__
+
+#ifdef CONFIG_F2FS_IO_TRACE
+#include <trace/events/f2fs.h>
+
+enum file_type {
+ __NORMAL_FILE,
+ __DIR_FILE,
+ __NODE_FILE,
+ __META_FILE,
+ __ATOMIC_FILE,
+ __VOLATILE_FILE,
+ __MISC_FILE,
+};
+
+struct last_io_info {
+ int major, minor;
+ pid_t pid;
+ enum file_type type;
+ struct f2fs_io_info fio;
+ block_t len;
+};
+
+extern void f2fs_trace_pid(struct page *);
+extern void f2fs_trace_ios(struct page *, struct f2fs_io_info *, int);
+extern void f2fs_build_trace_ios(void);
+extern void f2fs_destroy_trace_ios(void);
+#else
+#define f2fs_trace_pid(p)
+#define f2fs_trace_ios(p, i, n)
+#define f2fs_build_trace_ios()
+#define f2fs_destroy_trace_ios()
+
+#endif
+#endif /* __F2FS_TRACE_H__ */
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 7b41a2dcdd76..497c7c5263c7 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -580,7 +580,7 @@ static void fat_set_state(struct super_block *sb,
{
struct buffer_head *bh;
struct fat_boot_sector *b;
- struct msdos_sb_info *sbi = sb->s_fs_info;
+ struct msdos_sb_info *sbi = MSDOS_SB(sb);
/* do not change any thing if mounted read only */
if ((sb->s_flags & MS_RDONLY) && !force)
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 99d440a4a6ba..ee85cd4e136a 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -740,14 +740,15 @@ static int __init fcntl_init(void)
* Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
* is defined as O_NONBLOCK on some platforms and not on others.
*/
- BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
+ BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
O_RDONLY | O_WRONLY | O_RDWR |
O_CREAT | O_EXCL | O_NOCTTY |
O_TRUNC | O_APPEND | /* O_NONBLOCK | */
__O_SYNC | O_DSYNC | FASYNC |
O_DIRECT | O_LARGEFILE | O_DIRECTORY |
O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
- __FMODE_EXEC | O_PATH | __O_TMPFILE
+ __FMODE_EXEC | O_PATH | __O_TMPFILE |
+ __FMODE_NONOTIFY
));
fasync_cache = kmem_cache_create("fasync_cache",
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 2d609a5fbfea..073657f755d4 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -66,15 +66,21 @@ int writeback_in_progress(struct backing_dev_info *bdi)
}
EXPORT_SYMBOL(writeback_in_progress);
-static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
+struct backing_dev_info *inode_to_bdi(struct inode *inode)
{
- struct super_block *sb = inode->i_sb;
+ struct super_block *sb;
- if (sb_is_blkdev_sb(sb))
- return inode->i_mapping->backing_dev_info;
+ if (!inode)
+ return &noop_backing_dev_info;
+ sb = inode->i_sb;
+#ifdef CONFIG_BLOCK
+ if (sb_is_blkdev_sb(sb))
+ return blk_get_backing_dev_info(I_BDEV(inode));
+#endif
return sb->s_bdi;
}
+EXPORT_SYMBOL_GPL(inode_to_bdi);
static inline struct inode *wb_inode(struct list_head *head)
{
@@ -247,14 +253,19 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
return ret;
}
+#define EXPIRE_DIRTY_ATIME 0x0001
+
/*
* Move expired (dirtied before work->older_than_this) dirty inodes from
* @delaying_queue to @dispatch_queue.
*/
static int move_expired_inodes(struct list_head *delaying_queue,
struct list_head *dispatch_queue,
+ int flags,
struct wb_writeback_work *work)
{
+ unsigned long *older_than_this = NULL;
+ unsigned long expire_time;
LIST_HEAD(tmp);
struct list_head *pos, *node;
struct super_block *sb = NULL;
@@ -262,13 +273,21 @@ static int move_expired_inodes(struct list_head *delaying_queue,
int do_sb_sort = 0;
int moved = 0;
+ if ((flags & EXPIRE_DIRTY_ATIME) == 0)
+ older_than_this = work->older_than_this;
+ else if ((work->reason == WB_REASON_SYNC) == 0) {
+ expire_time = jiffies - (HZ * 86400);
+ older_than_this = &expire_time;
+ }
while (!list_empty(delaying_queue)) {
inode = wb_inode(delaying_queue->prev);
- if (work->older_than_this &&
- inode_dirtied_after(inode, *work->older_than_this))
+ if (older_than_this &&
+ inode_dirtied_after(inode, *older_than_this))
break;
list_move(&inode->i_wb_list, &tmp);
moved++;
+ if (flags & EXPIRE_DIRTY_ATIME)
+ set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
if (sb_is_blkdev_sb(inode->i_sb))
continue;
if (sb && sb != inode->i_sb)
@@ -309,9 +328,12 @@ out:
static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
{
int moved;
+
assert_spin_locked(&wb->list_lock);
list_splice_init(&wb->b_more_io, &wb->b_io);
- moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
+ moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
+ moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
+ EXPIRE_DIRTY_ATIME, work);
trace_writeback_queue_io(wb, work, moved);
}
@@ -435,6 +457,8 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
* updates after data IO completion.
*/
redirty_tail(inode, wb);
+ } else if (inode->i_state & I_DIRTY_TIME) {
+ list_move(&inode->i_wb_list, &wb->b_dirty_time);
} else {
/* The inode is clean. Remove from writeback lists. */
list_del_init(&inode->i_wb_list);
@@ -481,7 +505,13 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
spin_lock(&inode->i_lock);
dirty = inode->i_state & I_DIRTY;
- inode->i_state &= ~I_DIRTY;
+ if (((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) &&
+ (inode->i_state & I_DIRTY_TIME)) ||
+ (inode->i_state & I_DIRTY_TIME_EXPIRED)) {
+ dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
+ trace_writeback_lazytime(inode);
+ }
+ inode->i_state &= ~dirty;
/*
* Paired with smp_mb() in __mark_inode_dirty(). This allows
@@ -501,8 +531,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
spin_unlock(&inode->i_lock);
+ if (dirty & I_DIRTY_TIME)
+ mark_inode_dirty_sync(inode);
/* Don't write the inode if only I_DIRTY_PAGES was set */
- if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+ if (dirty & ~I_DIRTY_PAGES) {
int err = write_inode(inode, wbc);
if (ret == 0)
ret = err;
@@ -550,7 +582,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
* make sure inode is on some writeback list and leave it there unless
* we have completely cleaned the inode.
*/
- if (!(inode->i_state & I_DIRTY) &&
+ if (!(inode->i_state & I_DIRTY_ALL) &&
(wbc->sync_mode != WB_SYNC_ALL ||
!mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
goto out;
@@ -565,7 +597,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
* If inode is clean, remove it from writeback lists. Otherwise don't
* touch it. See comment above for explanation.
*/
- if (!(inode->i_state & I_DIRTY))
+ if (!(inode->i_state & I_DIRTY_ALL))
list_del_init(&inode->i_wb_list);
spin_unlock(&wb->list_lock);
inode_sync_complete(inode);
@@ -707,7 +739,7 @@ static long writeback_sb_inodes(struct super_block *sb,
wrote += write_chunk - wbc.nr_to_write;
spin_lock(&wb->list_lock);
spin_lock(&inode->i_lock);
- if (!(inode->i_state & I_DIRTY))
+ if (!(inode->i_state & I_DIRTY_ALL))
wrote++;
requeue_inode(inode, wb, &wbc);
inode_sync_complete(inode);
@@ -1145,16 +1177,20 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
* page->mapping->host, so the page-dirtying time is recorded in the internal
* blockdev inode.
*/
+#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
void __mark_inode_dirty(struct inode *inode, int flags)
{
struct super_block *sb = inode->i_sb;
struct backing_dev_info *bdi = NULL;
+ int dirtytime;
+
+ trace_writeback_mark_inode_dirty(inode, flags);
/*
* Don't do this for I_DIRTY_PAGES - that doesn't actually
* dirty the inode itself
*/
- if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+ if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) {
trace_writeback_dirty_inode_start(inode, flags);
if (sb->s_op->dirty_inode)
@@ -1162,6 +1198,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
trace_writeback_dirty_inode(inode, flags);
}
+ if (flags & I_DIRTY_INODE)
+ flags &= ~I_DIRTY_TIME;
+ dirtytime = flags & I_DIRTY_TIME;
/*
* Paired with smp_mb() in __writeback_single_inode() for the
@@ -1169,16 +1208,21 @@ void __mark_inode_dirty(struct inode *inode, int flags)
*/
smp_mb();
- if ((inode->i_state & flags) == flags)
+ if (((inode->i_state & flags) == flags) ||
+ (dirtytime && (inode->i_state & I_DIRTY_INODE)))
return;
if (unlikely(block_dump))
block_dump___mark_inode_dirty(inode);
spin_lock(&inode->i_lock);
+ if (dirtytime && (inode->i_state & I_DIRTY_INODE))
+ goto out_unlock_inode;
if ((inode->i_state & flags) != flags) {
const int was_dirty = inode->i_state & I_DIRTY;
+ if (flags & I_DIRTY_INODE)
+ inode->i_state &= ~I_DIRTY_TIME;
inode->i_state |= flags;
/*
@@ -1225,8 +1269,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
}
inode->dirtied_when = jiffies;
- list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
+ list_move(&inode->i_wb_list, dirtytime ?
+ &bdi->wb.b_dirty_time : &bdi->wb.b_dirty);
spin_unlock(&bdi->wb.list_lock);
+ trace_writeback_dirty_inode_enqueue(inode);
if (wakeup_bdi)
bdi_wakeup_thread_delayed(bdi);
diff --git a/fs/fs_pin.c b/fs/fs_pin.c
index 9368236ca100..b06c98796afb 100644
--- a/fs/fs_pin.c
+++ b/fs/fs_pin.c
@@ -1,78 +1,102 @@
#include <linux/fs.h>
+#include <linux/sched.h>
#include <linux/slab.h>
-#include <linux/fs_pin.h>
#include "internal.h"
#include "mount.h"
-static void pin_free_rcu(struct rcu_head *head)
-{
- kfree(container_of(head, struct fs_pin, rcu));
-}
-
static DEFINE_SPINLOCK(pin_lock);
-void pin_put(struct fs_pin *p)
-{
- if (atomic_long_dec_and_test(&p->count))
- call_rcu(&p->rcu, pin_free_rcu);
-}
-
void pin_remove(struct fs_pin *pin)
{
spin_lock(&pin_lock);
hlist_del(&pin->m_list);
hlist_del(&pin->s_list);
spin_unlock(&pin_lock);
+ spin_lock_irq(&pin->wait.lock);
+ pin->done = 1;
+ wake_up_locked(&pin->wait);
+ spin_unlock_irq(&pin->wait.lock);
}
-void pin_insert(struct fs_pin *pin, struct vfsmount *m)
+void pin_insert_group(struct fs_pin *pin, struct vfsmount *m, struct hlist_head *p)
{
spin_lock(&pin_lock);
- hlist_add_head(&pin->s_list, &m->mnt_sb->s_pins);
+ if (p)
+ hlist_add_head(&pin->s_list, p);
hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins);
spin_unlock(&pin_lock);
}
+void pin_insert(struct fs_pin *pin, struct vfsmount *m)
+{
+ pin_insert_group(pin, m, &m->mnt_sb->s_pins);
+}
+
+void pin_kill(struct fs_pin *p)
+{
+ wait_queue_t wait;
+
+ if (!p) {
+ rcu_read_unlock();
+ return;
+ }
+ init_wait(&wait);
+ spin_lock_irq(&p->wait.lock);
+ if (likely(!p->done)) {
+ p->done = -1;
+ spin_unlock_irq(&p->wait.lock);
+ rcu_read_unlock();
+ p->kill(p);
+ return;
+ }
+ if (p->done > 0) {
+ spin_unlock_irq(&p->wait.lock);
+ rcu_read_unlock();
+ return;
+ }
+ __add_wait_queue(&p->wait, &wait);
+ while (1) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ spin_unlock_irq(&p->wait.lock);
+ rcu_read_unlock();
+ schedule();
+ rcu_read_lock();
+ if (likely(list_empty(&wait.task_list)))
+ break;
+ /* OK, we know p couldn't have been freed yet */
+ spin_lock_irq(&p->wait.lock);
+ if (p->done > 0) {
+ spin_unlock_irq(&p->wait.lock);
+ break;
+ }
+ }
+ rcu_read_unlock();
+}
+
void mnt_pin_kill(struct mount *m)
{
while (1) {
struct hlist_node *p;
- struct fs_pin *pin;
rcu_read_lock();
p = ACCESS_ONCE(m->mnt_pins.first);
if (!p) {
rcu_read_unlock();
break;
}
- pin = hlist_entry(p, struct fs_pin, m_list);
- if (!atomic_long_inc_not_zero(&pin->count)) {
- rcu_read_unlock();
- cpu_relax();
- continue;
- }
- rcu_read_unlock();
- pin->kill(pin);
+ pin_kill(hlist_entry(p, struct fs_pin, m_list));
}
}
-void sb_pin_kill(struct super_block *sb)
+void group_pin_kill(struct hlist_head *p)
{
while (1) {
- struct hlist_node *p;
- struct fs_pin *pin;
+ struct hlist_node *q;
rcu_read_lock();
- p = ACCESS_ONCE(sb->s_pins.first);
- if (!p) {
+ q = ACCESS_ONCE(p->first);
+ if (!q) {
rcu_read_unlock();
break;
}
- pin = hlist_entry(p, struct fs_pin, s_list);
- if (!atomic_long_inc_not_zero(&pin->count)) {
- rcu_read_unlock();
- cpu_relax();
- continue;
- }
- rcu_read_unlock();
- pin->kill(pin);
+ pin_kill(hlist_entry(q, struct fs_pin, s_list));
}
}
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ba1107977f2e..ed19a7d622fa 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -131,6 +131,13 @@ static void fuse_req_init_context(struct fuse_req *req)
req->in.h.pid = current->pid;
}
+void fuse_set_initialized(struct fuse_conn *fc)
+{
+ /* Make sure stores before this are seen on another CPU */
+ smp_wmb();
+ fc->initialized = 1;
+}
+
static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background)
{
return !fc->initialized || (for_background && fc->blocked);
@@ -155,6 +162,8 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
if (intr)
goto out;
}
+ /* Matches smp_wmb() in fuse_set_initialized() */
+ smp_rmb();
err = -ENOTCONN;
if (!fc->connected)
@@ -253,6 +262,8 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
atomic_inc(&fc->num_waiting);
wait_event(fc->blocked_waitq, fc->initialized);
+ /* Matches smp_wmb() in fuse_set_initialized() */
+ smp_rmb();
req = fuse_request_alloc(0);
if (!req)
req = get_reserved_req(fc, file);
@@ -511,6 +522,39 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
}
EXPORT_SYMBOL_GPL(fuse_request_send);
+static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args)
+{
+ if (fc->minor < 4 && args->in.h.opcode == FUSE_STATFS)
+ args->out.args[0].size = FUSE_COMPAT_STATFS_SIZE;
+
+ if (fc->minor < 9) {
+ switch (args->in.h.opcode) {
+ case FUSE_LOOKUP:
+ case FUSE_CREATE:
+ case FUSE_MKNOD:
+ case FUSE_MKDIR:
+ case FUSE_SYMLINK:
+ case FUSE_LINK:
+ args->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
+ break;
+ case FUSE_GETATTR:
+ case FUSE_SETATTR:
+ args->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
+ break;
+ }
+ }
+ if (fc->minor < 12) {
+ switch (args->in.h.opcode) {
+ case FUSE_CREATE:
+ args->in.args[0].size = sizeof(struct fuse_open_in);
+ break;
+ case FUSE_MKNOD:
+ args->in.args[0].size = FUSE_COMPAT_MKNOD_IN_SIZE;
+ break;
+ }
+ }
+}
+
ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
{
struct fuse_req *req;
@@ -520,6 +564,9 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
if (IS_ERR(req))
return PTR_ERR(req);
+ /* Needs to be done after fuse_get_req() so that fc->minor is valid */
+ fuse_adjust_compat(fc, args);
+
req->in.h.opcode = args->in.h.opcode;
req->in.h.nodeid = args->in.h.nodeid;
req->in.numargs = args->in.numargs;
@@ -2127,7 +2174,7 @@ void fuse_abort_conn(struct fuse_conn *fc)
if (fc->connected) {
fc->connected = 0;
fc->blocked = 0;
- fc->initialized = 1;
+ fuse_set_initialized(fc);
end_io_requests(fc);
end_queued_requests(fc);
end_polls(fc);
@@ -2146,7 +2193,7 @@ int fuse_dev_release(struct inode *inode, struct file *file)
spin_lock(&fc->lock);
fc->connected = 0;
fc->blocked = 0;
- fc->initialized = 1;
+ fuse_set_initialized(fc);
end_queued_requests(fc);
end_polls(fc);
wake_up_all(&fc->blocked_waitq);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 252b8a5de8b5..08e7b1a9d5d0 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -156,10 +156,7 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args,
args->in.args[0].size = name->len + 1;
args->in.args[0].value = name->name;
args->out.numargs = 1;
- if (fc->minor < 9)
- args->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
- else
- args->out.args[0].size = sizeof(struct fuse_entry_out);
+ args->out.args[0].size = sizeof(struct fuse_entry_out);
args->out.args[0].value = outarg;
}
@@ -422,16 +419,12 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
args.in.h.opcode = FUSE_CREATE;
args.in.h.nodeid = get_node_id(dir);
args.in.numargs = 2;
- args.in.args[0].size = fc->minor < 12 ? sizeof(struct fuse_open_in) :
- sizeof(inarg);
+ args.in.args[0].size = sizeof(inarg);
args.in.args[0].value = &inarg;
args.in.args[1].size = entry->d_name.len + 1;
args.in.args[1].value = entry->d_name.name;
args.out.numargs = 2;
- if (fc->minor < 9)
- args.out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
- else
- args.out.args[0].size = sizeof(outentry);
+ args.out.args[0].size = sizeof(outentry);
args.out.args[0].value = &outentry;
args.out.args[1].size = sizeof(outopen);
args.out.args[1].value = &outopen;
@@ -539,10 +532,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args,
memset(&outarg, 0, sizeof(outarg));
args->in.h.nodeid = get_node_id(dir);
args->out.numargs = 1;
- if (fc->minor < 9)
- args->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
- else
- args->out.args[0].size = sizeof(outarg);
+ args->out.args[0].size = sizeof(outarg);
args->out.args[0].value = &outarg;
err = fuse_simple_request(fc, args);
if (err)
@@ -592,8 +582,7 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode,
inarg.umask = current_umask();
args.in.h.opcode = FUSE_MKNOD;
args.in.numargs = 2;
- args.in.args[0].size = fc->minor < 12 ? FUSE_COMPAT_MKNOD_IN_SIZE :
- sizeof(inarg);
+ args.in.args[0].size = sizeof(inarg);
args.in.args[0].value = &inarg;
args.in.args[1].size = entry->d_name.len + 1;
args.in.args[1].value = entry->d_name.name;
@@ -899,10 +888,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
args.in.args[0].size = sizeof(inarg);
args.in.args[0].value = &inarg;
args.out.numargs = 1;
- if (fc->minor < 9)
- args.out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
- else
- args.out.args[0].size = sizeof(outarg);
+ args.out.args[0].size = sizeof(outarg);
args.out.args[0].value = &outarg;
err = fuse_simple_request(fc, &args);
if (!err) {
@@ -1574,10 +1560,7 @@ static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args,
args->in.args[0].size = sizeof(*inarg_p);
args->in.args[0].value = inarg_p;
args->out.numargs = 1;
- if (fc->minor < 9)
- args->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
- else
- args->out.args[0].size = sizeof(*outarg_p);
+ args->out.args[0].size = sizeof(*outarg_p);
args->out.args[0].value = outarg_p;
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 760b2c552197..c01ec3bdcfd8 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1159,7 +1159,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
mutex_lock(&inode->i_mutex);
/* We can write back this queue in page reclaim */
- current->backing_dev_info = mapping->backing_dev_info;
+ current->backing_dev_info = inode_to_bdi(inode);
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err)
@@ -1464,7 +1464,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
{
struct inode *inode = req->inode;
struct fuse_inode *fi = get_fuse_inode(inode);
- struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
int i;
list_del(&req->writepages_entry);
@@ -1658,7 +1658,7 @@ static int fuse_writepage_locked(struct page *page)
req->end = fuse_writepage_end;
req->inode = inode;
- inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);
+ inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
spin_lock(&fc->lock);
@@ -1768,7 +1768,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT ||
old_req->state == FUSE_REQ_PENDING)) {
- struct backing_dev_info *bdi = page->mapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host);
copy_highpage(old_req->pages[0], page);
spin_unlock(&fc->lock);
@@ -1872,7 +1872,7 @@ static int fuse_writepages_fill(struct page *page,
req->page_descs[req->num_pages].offset = 0;
req->page_descs[req->num_pages].length = PAGE_SIZE;
- inc_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK);
+ inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
err = 0;
@@ -2062,7 +2062,6 @@ static const struct vm_operations_struct fuse_file_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = fuse_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
};
static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index e0fc6725d1d0..1cdfb07c1376 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -906,4 +906,6 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc);
int fuse_do_setattr(struct inode *inode, struct iattr *attr,
struct file *file);
+void fuse_set_initialized(struct fuse_conn *fc);
+
#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 6749109f255d..e8799c11424b 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -308,7 +308,6 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
if (!fc->writeback_cache || !S_ISREG(attr->mode))
inode->i_flags |= S_NOCMTIME;
inode->i_generation = generation;
- inode->i_data.backing_dev_info = &fc->bdi;
fuse_init_inode(inode, attr);
unlock_new_inode(inode);
} else if ((inode->i_mode ^ attr->mode) & S_IFMT) {
@@ -424,8 +423,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
args.in.h.opcode = FUSE_STATFS;
args.in.h.nodeid = get_node_id(dentry->d_inode);
args.out.numargs = 1;
- args.out.args[0].size =
- fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg);
+ args.out.args[0].size = sizeof(outarg);
args.out.args[0].value = &outarg;
err = fuse_simple_request(fc, &args);
if (!err)
@@ -898,7 +896,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
fc->max_write = max_t(unsigned, 4096, fc->max_write);
fc->conn_init = 1;
}
- fc->initialized = 1;
+ fuse_set_initialized(fc);
wake_up_all(&fc->blocked_waitq);
}
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 3088e2a38e30..7b3143064af1 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -73,7 +73,7 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
BUG_ON(name == NULL);
- if (acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode)))
+ if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode)))
return -E2BIG;
if (type == ACL_TYPE_ACCESS) {
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 805b37fed638..4ad4f94edebe 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -289,7 +289,7 @@ continue_unlock:
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
- trace_wbc_writepage(wbc, mapping->backing_dev_info);
+ trace_wbc_writepage(wbc, inode_to_bdi(inode));
ret = __gfs2_jdata_writepage(page, wbc);
if (unlikely(ret)) {
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index c5a34f09e228..6371192961e2 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1896,7 +1896,8 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN);
if (ht == NULL)
- ht = vzalloc(size);
+ ht = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO,
+ PAGE_KERNEL);
if (!ht)
return -ENOMEM;
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 6e600abf694a..3e32bb8e2d7e 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -498,7 +498,6 @@ static const struct vm_operations_struct gfs2_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = gfs2_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
};
/**
@@ -655,7 +654,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
{
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- int sync_state = inode->i_state & I_DIRTY;
+ int sync_state = inode->i_state & I_DIRTY_ALL;
struct gfs2_inode *ip = GFS2_I(inode);
int ret = 0, ret1 = 0;
@@ -668,7 +667,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
if (!gfs2_is_jdata(ip))
sync_state &= ~I_DIRTY_PAGES;
if (datasync)
- sync_state &= ~I_DIRTY_SYNC;
+ sync_state &= ~(I_DIRTY_SYNC | I_DIRTY_TIME);
if (sync_state) {
ret = sync_inode_metadata(inode, 1);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index a23524aa3eac..f42dffba056a 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -173,19 +173,14 @@ void gfs2_glock_add_to_lru(struct gfs2_glock *gl)
spin_unlock(&lru_lock);
}
-static void __gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
+static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
{
+ spin_lock(&lru_lock);
if (!list_empty(&gl->gl_lru)) {
list_del_init(&gl->gl_lru);
atomic_dec(&lru_count);
clear_bit(GLF_LRU, &gl->gl_flags);
}
-}
-
-static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
-{
- spin_lock(&lru_lock);
- __gfs2_glock_remove_from_lru(gl);
spin_unlock(&lru_lock);
}
@@ -205,9 +200,7 @@ void gfs2_glock_put(struct gfs2_glock *gl)
lockref_mark_dead(&gl->gl_lockref);
- spin_lock(&lru_lock);
- __gfs2_glock_remove_from_lru(gl);
- spin_unlock(&lru_lock);
+ gfs2_glock_remove_from_lru(gl);
spin_unlock(&gl->gl_lockref.lock);
spin_lock_bucket(gl->gl_hash);
hlist_bl_del_rcu(&gl->gl_list);
@@ -775,7 +768,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
mapping->flags = 0;
mapping_set_gfp_mask(mapping, GFP_NOFS);
mapping->private_data = NULL;
- mapping->backing_dev_info = s->s_bdi;
mapping->writeback_index = 0;
}
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 9054002ebe70..73c72253faac 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -543,10 +543,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
}
error = gfs2_dir_add(&dip->i_inode, name, ip, da);
- if (error)
- goto fail_end_trans;
-fail_end_trans:
gfs2_trans_end(sdp);
fail_ipreserv:
gfs2_inplace_release(dip);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 8633ad328ee2..efc8e254787c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -112,7 +112,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
mapping->flags = 0;
mapping_set_gfp_mask(mapping, GFP_NOFS);
mapping->private_data = NULL;
- mapping->backing_dev_info = sb->s_bdi;
mapping->writeback_index = 0;
spin_lock_init(&sdp->sd_log_lock);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index c8b148bbdc8b..3aa17d4d1cfc 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -145,7 +145,8 @@ static void gfs2_qd_dispose(struct list_head *list)
}
-static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock, void *arg)
+static enum lru_status gfs2_qd_isolate(struct list_head *item,
+ struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
struct list_head *dispose = arg;
struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru);
@@ -155,7 +156,7 @@ static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock,
if (qd->qd_lockref.count == 0) {
lockref_mark_dead(&qd->qd_lockref);
- list_move(&qd->qd_lru, dispose);
+ list_lru_isolate_move(lru, &qd->qd_lru, dispose);
}
spin_unlock(&qd->qd_lockref.lock);
@@ -171,8 +172,8 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
if (!(sc->gfp_mask & __GFP_FS))
return SHRINK_STOP;
- freed = list_lru_walk_node(&gfs2_qd_lru, sc->nid, gfs2_qd_isolate,
- &dispose, &sc->nr_to_scan);
+ freed = list_lru_shrink_walk(&gfs2_qd_lru, sc,
+ gfs2_qd_isolate, &dispose);
gfs2_qd_dispose(&dispose);
@@ -182,7 +183,7 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- return vfs_pressure_ratio(list_lru_count_node(&gfs2_qd_lru, sc->nid));
+ return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc));
}
struct shrinker gfs2_qd_shrinker = {
@@ -667,7 +668,7 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change)
static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
s64 change, struct gfs2_quota_data *qd,
- struct fs_disk_quota *fdq)
+ struct qc_dqblk *fdq)
{
struct inode *inode = &ip->i_inode;
struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -697,16 +698,16 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
be64_add_cpu(&q.qu_value, change);
qd->qd_qb.qb_value = q.qu_value;
if (fdq) {
- if (fdq->d_fieldmask & FS_DQ_BSOFT) {
- q.qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift);
+ if (fdq->d_fieldmask & QC_SPC_SOFT) {
+ q.qu_warn = cpu_to_be64(fdq->d_spc_softlimit >> sdp->sd_sb.sb_bsize_shift);
qd->qd_qb.qb_warn = q.qu_warn;
}
- if (fdq->d_fieldmask & FS_DQ_BHARD) {
- q.qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift);
+ if (fdq->d_fieldmask & QC_SPC_HARD) {
+ q.qu_limit = cpu_to_be64(fdq->d_spc_hardlimit >> sdp->sd_sb.sb_bsize_shift);
qd->qd_qb.qb_limit = q.qu_limit;
}
- if (fdq->d_fieldmask & FS_DQ_BCOUNT) {
- q.qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift);
+ if (fdq->d_fieldmask & QC_SPACE) {
+ q.qu_value = cpu_to_be64(fdq->d_space >> sdp->sd_sb.sb_bsize_shift);
qd->qd_qb.qb_value = q.qu_value;
}
}
@@ -1497,7 +1498,7 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
}
static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid,
- struct fs_disk_quota *fdq)
+ struct qc_dqblk *fdq)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
struct gfs2_quota_lvb *qlvb;
@@ -1505,7 +1506,7 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid,
struct gfs2_holder q_gh;
int error;
- memset(fdq, 0, sizeof(struct fs_disk_quota));
+ memset(fdq, 0, sizeof(*fdq));
if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
return -ESRCH; /* Crazy XFS error code */
@@ -1522,12 +1523,9 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid,
goto out;
qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr;
- fdq->d_version = FS_DQUOT_VERSION;
- fdq->d_flags = (qid.type == USRQUOTA) ? FS_USER_QUOTA : FS_GROUP_QUOTA;
- fdq->d_id = from_kqid_munged(current_user_ns(), qid);
- fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift;
- fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift;
- fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift;
+ fdq->d_spc_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_sb.sb_bsize_shift;
+ fdq->d_spc_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_sb.sb_bsize_shift;
+ fdq->d_space = be64_to_cpu(qlvb->qb_value) << sdp->sd_sb.sb_bsize_shift;
gfs2_glock_dq_uninit(&q_gh);
out:
@@ -1536,10 +1534,10 @@ out:
}
/* GFS2 only supports a subset of the XFS fields */
-#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT)
+#define GFS2_FIELDMASK (QC_SPC_SOFT|QC_SPC_HARD|QC_SPACE)
static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
- struct fs_disk_quota *fdq)
+ struct qc_dqblk *fdq)
{
struct gfs2_sbd *sdp = sb->s_fs_info;
struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
@@ -1583,17 +1581,17 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
goto out_i;
/* If nothing has changed, this is a no-op */
- if ((fdq->d_fieldmask & FS_DQ_BSOFT) &&
- ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn)))
- fdq->d_fieldmask ^= FS_DQ_BSOFT;
+ if ((fdq->d_fieldmask & QC_SPC_SOFT) &&
+ ((fdq->d_spc_softlimit >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_warn)))
+ fdq->d_fieldmask ^= QC_SPC_SOFT;
- if ((fdq->d_fieldmask & FS_DQ_BHARD) &&
- ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit)))
- fdq->d_fieldmask ^= FS_DQ_BHARD;
+ if ((fdq->d_fieldmask & QC_SPC_HARD) &&
+ ((fdq->d_spc_hardlimit >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_limit)))
+ fdq->d_fieldmask ^= QC_SPC_HARD;
- if ((fdq->d_fieldmask & FS_DQ_BCOUNT) &&
- ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value)))
- fdq->d_fieldmask ^= FS_DQ_BCOUNT;
+ if ((fdq->d_fieldmask & QC_SPACE) &&
+ ((fdq->d_space >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_value)))
+ fdq->d_fieldmask ^= QC_SPACE;
if (fdq->d_fieldmask == 0)
goto out_i;
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 573bd3b758fa..1b645773c98e 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -439,7 +439,7 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
ls->ls_recover_jid_done = jid;
ls->ls_recover_jid_status = message;
- sprintf(env_jid, "JID=%d", jid);
+ sprintf(env_jid, "JID=%u", jid);
sprintf(env_status, "RECOVERY=%s",
message == LM_RD_SUCCESS ? "Done" : "Failed");
kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 5b327f837de7..1666382b198d 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -743,7 +743,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl);
- struct backing_dev_info *bdi = metamapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(metamapping->host);
int ret = 0;
if (wbc->sync_mode == WB_SYNC_ALL)
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 3ab566ba5696..ae8e8811f0e8 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -96,7 +96,7 @@ static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
struct super_block *sb = sdp->sd_vfs;
int frozen = (sb->s_writers.frozen == SB_UNFROZEN) ? 0 : 1;
- return snprintf(buf, PAGE_SIZE, "%u\n", frozen);
+ return snprintf(buf, PAGE_SIZE, "%d\n", frozen);
}
static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 5eba47f593f8..c274aca8e8dc 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -62,12 +62,6 @@ static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
}
-static struct backing_dev_info hugetlbfs_backing_dev_info = {
- .name = "hugetlbfs",
- .ra_pages = 0, /* No readahead */
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
-
int sysctl_hugetlb_shm_group;
enum {
@@ -498,7 +492,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
&hugetlbfs_i_mmap_rwsem_key);
inode->i_mapping->a_ops = &hugetlbfs_aops;
- inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inode->i_mapping->private_data = resv_map;
info = HUGETLBFS_I(inode);
@@ -1032,10 +1025,6 @@ static int __init init_hugetlbfs_fs(void)
return -ENOTSUPP;
}
- error = bdi_init(&hugetlbfs_backing_dev_info);
- if (error)
- return error;
-
error = -ENOMEM;
hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
sizeof(struct hugetlbfs_inode_info),
@@ -1071,7 +1060,6 @@ static int __init init_hugetlbfs_fs(void)
out:
kmem_cache_destroy(hugetlbfs_inode_cachep);
out2:
- bdi_destroy(&hugetlbfs_backing_dev_info);
return error;
}
@@ -1091,7 +1079,6 @@ static void __exit exit_hugetlbfs_fs(void)
for_each_hstate(h)
kern_unmount(hugetlbfs_vfsmount[i++]);
unregister_filesystem(&hugetlbfs_fs_type);
- bdi_destroy(&hugetlbfs_backing_dev_info);
}
module_init(init_hugetlbfs_fs)
diff --git a/fs/inode.c b/fs/inode.c
index aa149e7262ac..f00b16f45507 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -18,6 +18,7 @@
#include <linux/buffer_head.h> /* for inode_has_buffers */
#include <linux/ratelimit.h>
#include <linux/list_lru.h>
+#include <trace/events/writeback.h>
#include "internal.h"
/*
@@ -30,7 +31,7 @@
* inode_sb_list_lock protects:
* sb->s_inodes, inode->i_sb_list
* bdi->wb.list_lock protects:
- * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
+ * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list
* inode_hash_lock protects:
* inode_hashtable, inode->i_hash
*
@@ -170,20 +171,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
atomic_set(&mapping->i_mmap_writable, 0);
mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
mapping->private_data = NULL;
- mapping->backing_dev_info = &default_backing_dev_info;
mapping->writeback_index = 0;
-
- /*
- * If the block_device provides a backing_dev_info for client
- * inodes then use that. Otherwise the inode share the bdev's
- * backing_dev_info.
- */
- if (sb->s_bdev) {
- struct backing_dev_info *bdi;
-
- bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
- mapping->backing_dev_info = bdi;
- }
inode->i_private = NULL;
inode->i_mapping = mapping;
INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */
@@ -194,7 +182,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
#ifdef CONFIG_FSNOTIFY
inode->i_fsnotify_mask = 0;
#endif
-
+ inode->i_flctx = NULL;
this_cpu_inc(nr_inodes);
return 0;
@@ -237,6 +225,7 @@ void __destroy_inode(struct inode *inode)
BUG_ON(inode_has_buffers(inode));
security_inode_free(inode);
fsnotify_inode_delete(inode);
+ locks_free_lock_context(inode->i_flctx);
if (!inode->i_nlink) {
WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
atomic_long_dec(&inode->i_sb->s_remove_count);
@@ -355,7 +344,6 @@ void address_space_init_once(struct address_space *mapping)
INIT_LIST_HEAD(&mapping->private_list);
spin_lock_init(&mapping->private_lock);
mapping->i_mmap = RB_ROOT;
- INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
}
EXPORT_SYMBOL(address_space_init_once);
@@ -416,7 +404,8 @@ static void inode_lru_list_add(struct inode *inode)
*/
void inode_add_lru(struct inode *inode)
{
- if (!(inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE)) &&
+ if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC |
+ I_FREEING | I_WILL_FREE)) &&
!atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE)
inode_lru_list_add(inode);
}
@@ -647,7 +636,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
spin_unlock(&inode->i_lock);
continue;
}
- if (inode->i_state & I_DIRTY && !kill_dirty) {
+ if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
spin_unlock(&inode->i_lock);
busy = 1;
continue;
@@ -685,8 +674,8 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
* LRU does not have strict ordering. Hence we don't want to reclaim inodes
* with this flag set because they are the inodes that are out of order.
*/
-static enum lru_status
-inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
+static enum lru_status inode_lru_isolate(struct list_head *item,
+ struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
{
struct list_head *freeable = arg;
struct inode *inode = container_of(item, struct inode, i_lru);
@@ -704,7 +693,7 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
*/
if (atomic_read(&inode->i_count) ||
(inode->i_state & ~I_REFERENCED)) {
- list_del_init(&inode->i_lru);
+ list_lru_isolate(lru, &inode->i_lru);
spin_unlock(&inode->i_lock);
this_cpu_dec(nr_unused);
return LRU_REMOVED;
@@ -738,7 +727,7 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
WARN_ON(inode->i_state & I_NEW);
inode->i_state |= I_FREEING;
- list_move(&inode->i_lru, freeable);
+ list_lru_isolate_move(lru, &inode->i_lru, freeable);
spin_unlock(&inode->i_lock);
this_cpu_dec(nr_unused);
@@ -751,14 +740,13 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
* to trim from the LRU. Inodes to be freed are moved to a temporary list and
* then are freed outside inode_lock by dispose_list().
*/
-long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan,
- int nid)
+long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
{
LIST_HEAD(freeable);
long freed;
- freed = list_lru_walk_node(&sb->s_inode_lru, nid, inode_lru_isolate,
- &freeable, &nr_to_scan);
+ freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
+ inode_lru_isolate, &freeable);
dispose_list(&freeable);
return freed;
}
@@ -1282,6 +1270,56 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
}
EXPORT_SYMBOL(ilookup);
+/**
+ * find_inode_nowait - find an inode in the inode cache
+ * @sb: super block of file system to search
+ * @hashval: hash value (usually inode number) to search for
+ * @match: callback used for comparisons between inodes
+ * @data: opaque data pointer to pass to @match
+ *
+ * Search for the inode specified by @hashval and @data in the inode
+ * cache, where the helper function @match will return 0 if the inode
+ * does not match, 1 if the inode does match, and -1 if the search
+ * should be stopped. The @match function must be responsible for
+ * taking the i_lock spin_lock and checking i_state for an inode being
+ * freed or being initialized, and incrementing the reference count
+ * before returning 1. It also must not sleep, since it is called with
+ * the inode_hash_lock spinlock held.
+ *
+ * This is a even more generalized version of ilookup5() when the
+ * function must never block --- find_inode() can block in
+ * __wait_on_freeing_inode() --- or when the caller can not increment
+ * the reference count because the resulting iput() might cause an
+ * inode eviction. The tradeoff is that the @match funtion must be
+ * very carefully implemented.
+ */
+struct inode *find_inode_nowait(struct super_block *sb,
+ unsigned long hashval,
+ int (*match)(struct inode *, unsigned long,
+ void *),
+ void *data)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+ struct inode *inode, *ret_inode = NULL;
+ int mval;
+
+ spin_lock(&inode_hash_lock);
+ hlist_for_each_entry(inode, head, i_hash) {
+ if (inode->i_sb != sb)
+ continue;
+ mval = match(inode, hashval, data);
+ if (mval == 0)
+ continue;
+ if (mval == 1)
+ ret_inode = inode;
+ goto out;
+ }
+out:
+ spin_unlock(&inode_hash_lock);
+ return ret_inode;
+}
+EXPORT_SYMBOL(find_inode_nowait);
+
int insert_inode_locked(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
@@ -1432,11 +1470,20 @@ static void iput_final(struct inode *inode)
*/
void iput(struct inode *inode)
{
- if (inode) {
- BUG_ON(inode->i_state & I_CLEAR);
-
- if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock))
- iput_final(inode);
+ if (!inode)
+ return;
+ BUG_ON(inode->i_state & I_CLEAR);
+retry:
+ if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
+ if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
+ atomic_inc(&inode->i_count);
+ inode->i_state &= ~I_DIRTY_TIME;
+ spin_unlock(&inode->i_lock);
+ trace_writeback_lazytime_iput(inode);
+ mark_inode_dirty_sync(inode);
+ goto retry;
+ }
+ iput_final(inode);
}
}
EXPORT_SYMBOL(iput);
@@ -1495,14 +1542,9 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
return 0;
}
-/*
- * This does the actual work of updating an inodes time or version. Must have
- * had called mnt_want_write() before calling this.
- */
-static int update_time(struct inode *inode, struct timespec *time, int flags)
+int generic_update_time(struct inode *inode, struct timespec *time, int flags)
{
- if (inode->i_op->update_time)
- return inode->i_op->update_time(inode, time, flags);
+ int iflags = I_DIRTY_TIME;
if (flags & S_ATIME)
inode->i_atime = *time;
@@ -1512,9 +1554,27 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
inode->i_ctime = *time;
if (flags & S_MTIME)
inode->i_mtime = *time;
- mark_inode_dirty_sync(inode);
+
+ if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION))
+ iflags |= I_DIRTY_SYNC;
+ __mark_inode_dirty(inode, iflags);
return 0;
}
+EXPORT_SYMBOL(generic_update_time);
+
+/*
+ * This does the actual work of updating an inodes time or version. Must have
+ * had called mnt_want_write() before calling this.
+ */
+static int update_time(struct inode *inode, struct timespec *time, int flags)
+{
+ int (*update_time)(struct inode *, struct timespec *, int);
+
+ update_time = inode->i_op->update_time ? inode->i_op->update_time :
+ generic_update_time;
+
+ return update_time(inode, time, flags);
+}
/**
* touch_atime - update the access time
diff --git a/fs/internal.h b/fs/internal.h
index e9a61fe67575..30459dab409d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -14,6 +14,7 @@ struct file_system_type;
struct linux_binprm;
struct path;
struct mount;
+struct shrink_control;
/*
* block_dev.c
@@ -111,8 +112,7 @@ extern int open_check_o_direct(struct file *f);
* inode.c
*/
extern spinlock_t inode_sb_list_lock;
-extern long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan,
- int nid);
+extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
extern void inode_add_lru(struct inode *inode);
/*
@@ -129,8 +129,7 @@ extern int invalidate_inodes(struct super_block *, bool);
*/
extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
extern int d_set_mounted(struct dentry *dentry);
-extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
- int nid);
+extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc);
/*
* read_write.c
@@ -145,7 +144,7 @@ extern const struct file_operations pipefifo_fops;
/*
* fs_pin.c
*/
-extern void sb_pin_kill(struct super_block *sb);
+extern void group_pin_kill(struct hlist_head *p);
extern void mnt_pin_kill(struct mount *m);
/*
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 214c3c11fbc2..5d01d2638ca5 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -379,6 +379,11 @@ int __generic_block_fiemap(struct inode *inode,
past_eof = true;
}
cond_resched();
+ if (fatal_signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
} while (1);
/* If ret is 1 then we just hit the end of the extent array */
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index bb63254ed848..735d7522a3a9 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -362,6 +362,9 @@ repeat:
rs.cont_size = isonum_733(rr->u.CE.size);
break;
case SIG('E', 'R'):
+ /* Invalid length of ER tag id? */
+ if (rr->u.ER.len_id + offsetof(struct rock_ridge, u.ER.data) > rr->len)
+ goto out;
ISOFS_SB(inode->i_sb)->s_rock = 1;
printk(KERN_DEBUG "ISO 9660 Extensions: ");
{
diff --git a/fs/isofs/util.c b/fs/isofs/util.c
index 01e1ee7a998b..005a15cfd30a 100644
--- a/fs/isofs/util.c
+++ b/fs/isofs/util.c
@@ -2,6 +2,7 @@
* linux/fs/isofs/util.c
*/
+#include <linux/time.h>
#include "isofs.h"
/*
@@ -17,9 +18,9 @@
int iso_date(char * p, int flag)
{
int year, month, day, hour, minute, second, tz;
- int crtime, days, i;
+ int crtime;
- year = p[0] - 70;
+ year = p[0];
month = p[1];
day = p[2];
hour = p[3];
@@ -31,18 +32,7 @@ int iso_date(char * p, int flag)
if (year < 0) {
crtime = 0;
} else {
- int monlen[12] = {31,28,31,30,31,30,31,31,30,31,30,31};
-
- days = year * 365;
- if (year > 2)
- days += (year+1) / 4;
- for (i = 1; i < month; i++)
- days += monlen[i-1];
- if (((year+2) % 4) == 0 && month > 2)
- days++;
- days += day - 1;
- crtime = ((((days * 24) + hour) * 60 + minute) * 60)
- + second;
+ crtime = mktime64(year+1900, month, day, hour, minute, second);
/* sign extend */
if (tz & 0x80)
diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c
index 92e0644bf867..556de100ebd5 100644
--- a/fs/jffs2/compr_rubin.c
+++ b/fs/jffs2/compr_rubin.c
@@ -84,11 +84,6 @@ static inline int pullbit(struct pushpull *pp)
return bit;
}
-static inline int pulledbits(struct pushpull *pp)
-{
- return pp->ofs;
-}
-
static void init_rubin(struct rubin_state *rs, int div, int *bits)
{
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 7654e87b0428..9ad5ba4b299b 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -510,6 +510,10 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
sumlen = c->sector_size - je32_to_cpu(sm->offset);
sumptr = buf + buf_size - sumlen;
+ /* sm->offset maybe wrong but MAGIC maybe right */
+ if (sumlen > c->sector_size)
+ goto full_scan;
+
/* Now, make sure the summary itself is available */
if (sumlen > buf_size) {
/* Need to kmalloc for this. */
@@ -544,6 +548,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
}
}
+full_scan:
buf_ofs = jeb->offset;
if (!buf_size) {
diff --git a/fs/jfs/endian24.h b/fs/jfs/endian24.h
deleted file mode 100644
index fa92f7f1d0d0..000000000000
--- a/fs/jfs/endian24.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (C) International Business Machines Corp., 2001
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-#ifndef _H_ENDIAN24
-#define _H_ENDIAN24
-
-/*
- * endian24.h:
- *
- * Endian conversion for 24-byte data
- *
- */
-#define __swab24(x) \
-({ \
- __u32 __x = (x); \
- ((__u32)( \
- ((__x & (__u32)0x000000ffUL) << 16) | \
- (__x & (__u32)0x0000ff00UL) | \
- ((__x & (__u32)0x00ff0000UL) >> 16) )); \
-})
-
-#if (defined(__KERNEL__) && defined(__LITTLE_ENDIAN)) || (defined(__BYTE_ORDER) && (__BYTE_ORDER == __LITTLE_ENDIAN))
- #define __cpu_to_le24(x) ((__u32)(x))
- #define __le24_to_cpu(x) ((__u32)(x))
-#else
- #define __cpu_to_le24(x) __swab24(x)
- #define __le24_to_cpu(x) __swab24(x)
-#endif
-
-#ifdef __KERNEL__
- #define cpu_to_le24 __cpu_to_le24
- #define le24_to_cpu __le24_to_cpu
-#endif
-
-#endif /* !_H_ENDIAN24 */
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 33aa0cc1f8b8..10815f8dfd8b 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -39,7 +39,7 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
return rc;
mutex_lock(&inode->i_mutex);
- if (!(inode->i_state & I_DIRTY) ||
+ if (!(inode->i_state & I_DIRTY_ALL) ||
(datasync && !(inode->i_state & I_DIRTY_DATASYNC))) {
/* Make sure committed changes hit the disk */
jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 984c2bbf4f61..d88576e23fe4 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -1040,8 +1040,8 @@ static int dtSplitUp(tid_t tid,
pxdlist.maxnpxd = 1;
pxdlist.npxd = 0;
pxd = &pxdlist.pxd[0];
- PXDaddress(pxd, nxaddr)
- PXDlength(pxd, xlen + n);
+ PXDaddress(pxd, nxaddr);
+ PXDlength(pxd, xlen + n);
split->pxdlist = &pxdlist;
if ((rc = dtExtendPage(tid, ip, split, btstack))) {
nxaddr = addressPXD(pxd);
diff --git a/fs/jfs/jfs_types.h b/fs/jfs/jfs_types.h
index 43ea3713c083..8f602dcb51fa 100644
--- a/fs/jfs/jfs_types.h
+++ b/fs/jfs/jfs_types.h
@@ -30,8 +30,6 @@
#include <linux/types.h>
#include <linux/nls.h>
-#include "endian24.h"
-
/*
* transaction and lock id's
*
@@ -59,26 +57,42 @@ struct timestruc_t {
/*
* physical xd (pxd)
+ *
+ * The leftmost 24 bits of len_addr are the extent length.
+ * The rightmost 8 bits of len_addr are the most signficant bits of
+ * the extent address
*/
typedef struct {
- unsigned len:24;
- unsigned addr1:8;
+ __le32 len_addr;
__le32 addr2;
} pxd_t;
/* xd_t field construction */
-#define PXDlength(pxd, length32) ((pxd)->len = __cpu_to_le24(length32))
-#define PXDaddress(pxd, address64)\
-{\
- (pxd)->addr1 = ((s64)address64) >> 32;\
- (pxd)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\
+static inline void PXDlength(pxd_t *pxd, __u32 len)
+{
+ pxd->len_addr = (pxd->len_addr & cpu_to_le32(~0xffffff)) |
+ cpu_to_le32(len & 0xffffff);
+}
+
+static inline void PXDaddress(pxd_t *pxd, __u64 addr)
+{
+ pxd->len_addr = (pxd->len_addr & cpu_to_le32(0xffffff)) |
+ cpu_to_le32((addr >> 32)<<24);
+ pxd->addr2 = cpu_to_le32(addr & 0xffffffff);
}
/* xd_t field extraction */
-#define lengthPXD(pxd) __le24_to_cpu((pxd)->len)
-#define addressPXD(pxd)\
- ( ((s64)((pxd)->addr1)) << 32 | __le32_to_cpu((pxd)->addr2))
+static inline __u32 lengthPXD(pxd_t *pxd)
+{
+ return le32_to_cpu((pxd)->len_addr) & 0xffffff;
+}
+
+static inline __u64 addressPXD(pxd_t *pxd)
+{
+ __u64 n = le32_to_cpu(pxd->len_addr) & ~0xffffff;
+ return (n << 8) + le32_to_cpu(pxd->addr2);
+}
#define MAXTREEHEIGHT 8
/* pxd list */
@@ -93,12 +107,10 @@ struct pxdlist {
* data extent descriptor (dxd)
*/
typedef struct {
- unsigned flag:8; /* 1: flags */
- unsigned rsrvd:24;
+ __u8 flag; /* 1: flags */
+ __u8 rsrvd[3];
__le32 size; /* 4: size in byte */
- unsigned len:24; /* 3: length in unit of fsblksize */
- unsigned addr1:8; /* 1: address in unit of fsblksize */
- __le32 addr2; /* 4: address in unit of fsblksize */
+ pxd_t loc; /* 8: address and length in unit of fsblksize */
} dxd_t; /* - 16 - */
/* dxd_t flags */
@@ -109,12 +121,11 @@ typedef struct {
#define DXD_CORRUPT 0x08 /* Inconsistency detected */
/* dxd_t field construction
- * Conveniently, the PXD macros work for DXD
*/
-#define DXDlength PXDlength
-#define DXDaddress PXDaddress
-#define lengthDXD lengthPXD
-#define addressDXD addressPXD
+#define DXDlength(dxd, len) PXDlength(&(dxd)->loc, len)
+#define DXDaddress(dxd, addr) PXDaddress(&(dxd)->loc, addr)
+#define lengthDXD(dxd) lengthPXD(&(dxd)->loc)
+#define addressDXD(dxd) addressPXD(&(dxd)->loc)
#define DXDsize(dxd, size32) ((dxd)->size = cpu_to_le32(size32))
#define sizeDXD(dxd) le32_to_cpu((dxd)->size)
diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h
index 08c0c749b986..1e0987986d5f 100644
--- a/fs/jfs/jfs_xtree.h
+++ b/fs/jfs/jfs_xtree.h
@@ -29,13 +29,11 @@
* extent allocation descriptor (xad)
*/
typedef struct xad {
- unsigned flag:8; /* 1: flag */
- unsigned rsvrd:16; /* 2: reserved */
- unsigned off1:8; /* 1: offset in unit of fsblksize */
- __le32 off2; /* 4: offset in unit of fsblksize */
- unsigned len:24; /* 3: length in unit of fsblksize */
- unsigned addr1:8; /* 1: address in unit of fsblksize */
- __le32 addr2; /* 4: address in unit of fsblksize */
+ __u8 flag; /* 1: flag */
+ __u8 rsvrd[2]; /* 2: reserved */
+ __u8 off1; /* 1: offset in unit of fsblksize */
+ __le32 off2; /* 4: offset in unit of fsblksize */
+ pxd_t loc; /* 8: length and address in unit of fsblksize */
} xad_t; /* (16) */
#define MAXXLEN ((1 << 24) - 1)
@@ -49,19 +47,14 @@ typedef struct xad {
(xad)->off1 = ((u64)offset64) >> 32;\
(xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\
}
-#define XADaddress(xad, address64)\
-{\
- (xad)->addr1 = ((u64)address64) >> 32;\
- (xad)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\
-}
-#define XADlength(xad, length32) (xad)->len = __cpu_to_le24(length32)
+#define XADaddress(xad, address64) PXDaddress(&(xad)->loc, address64)
+#define XADlength(xad, length32) PXDlength(&(xad)->loc, length32)
/* xad_t field extraction */
#define offsetXAD(xad)\
( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2))
-#define addressXAD(xad)\
- ( ((s64)((xad)->addr1)) << 32 | __le32_to_cpu((xad)->addr2))
-#define lengthXAD(xad) __le24_to_cpu((xad)->len)
+#define addressXAD(xad) addressPXD(&(xad)->loc)
+#define lengthXAD(xad) lengthPXD(&(xad)->loc)
/* xad list */
struct xadlist {
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 16c3a9556634..5d30c56ae075 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -619,8 +619,7 @@ out_mount_failed:
iput(sbi->direct_inode);
sbi->direct_inode = NULL;
out_unload:
- if (sbi->nls_tab)
- unload_nls(sbi->nls_tab);
+ unload_nls(sbi->nls_tab);
out_kfree:
kfree(sbi);
return ret;
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 37989f02a226..6acc9648f986 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -201,10 +201,14 @@ static unsigned int kernfs_name_hash(const char *name, const void *ns)
static int kernfs_name_compare(unsigned int hash, const char *name,
const void *ns, const struct kernfs_node *kn)
{
- if (hash != kn->hash)
- return hash - kn->hash;
- if (ns != kn->ns)
- return ns - kn->ns;
+ if (hash < kn->hash)
+ return -1;
+ if (hash > kn->hash)
+ return 1;
+ if (ns < kn->ns)
+ return -1;
+ if (ns > kn->ns)
+ return 1;
return strcmp(name, kn->name);
}
@@ -407,8 +411,9 @@ void kernfs_put(struct kernfs_node *kn)
if (kernfs_type(kn) == KERNFS_LINK)
kernfs_put(kn->symlink.target_kn);
- if (!(kn->flags & KERNFS_STATIC_NAME))
- kfree(kn->name);
+
+ kfree_const(kn->name);
+
if (kn->iattr) {
if (kn->iattr->ia_secdata)
security_release_secctx(kn->iattr->ia_secdata,
@@ -502,15 +507,12 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
const char *name, umode_t mode,
unsigned flags)
{
- char *dup_name = NULL;
struct kernfs_node *kn;
int ret;
- if (!(flags & KERNFS_STATIC_NAME)) {
- name = dup_name = kstrdup(name, GFP_KERNEL);
- if (!name)
- return NULL;
- }
+ name = kstrdup_const(name, GFP_KERNEL);
+ if (!name)
+ return NULL;
kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL);
if (!kn)
@@ -534,7 +536,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
err_out2:
kmem_cache_free(kernfs_node_cache, kn);
err_out1:
- kfree(dup_name);
+ kfree_const(name);
return NULL;
}
@@ -1260,7 +1262,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
/* rename kernfs_node */
if (strcmp(kn->name, new_name) != 0) {
error = -ENOMEM;
- new_name = kstrdup(new_name, GFP_KERNEL);
+ new_name = kstrdup_const(new_name, GFP_KERNEL);
if (!new_name)
goto out;
} else {
@@ -1281,9 +1283,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
kn->ns = new_ns;
if (new_name) {
- if (!(kn->flags & KERNFS_STATIC_NAME))
- old_name = kn->name;
- kn->flags &= ~KERNFS_STATIC_NAME;
+ old_name = kn->name;
kn->name = new_name;
}
@@ -1293,7 +1293,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
kernfs_link_sibling(kn);
kernfs_put(old_parent);
- kfree(old_name);
+ kfree_const(old_name);
error = 0;
out:
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index ddc9f9612f16..b684e8a132e6 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -901,7 +901,6 @@ const struct file_operations kernfs_file_fops = {
* @ops: kernfs operations for the file
* @priv: private data for the file
* @ns: optional namespace tag of the file
- * @name_is_static: don't copy file name
* @key: lockdep key for the file's active_ref, %NULL to disable lockdep
*
* Returns the created node on success, ERR_PTR() value on error.
@@ -911,7 +910,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
umode_t mode, loff_t size,
const struct kernfs_ops *ops,
void *priv, const void *ns,
- bool name_is_static,
struct lock_class_key *key)
{
struct kernfs_node *kn;
@@ -919,8 +917,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
int rc;
flags = KERNFS_FILE;
- if (name_is_static)
- flags |= KERNFS_STATIC_NAME;
kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags);
if (!kn)
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 985217626e66..9000874a945b 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -24,12 +24,6 @@ static const struct address_space_operations kernfs_aops = {
.write_end = simple_write_end,
};
-static struct backing_dev_info kernfs_bdi = {
- .name = "kernfs",
- .ra_pages = 0, /* No readahead */
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
-
static const struct inode_operations kernfs_iops = {
.permission = kernfs_iop_permission,
.setattr = kernfs_iop_setattr,
@@ -40,12 +34,6 @@ static const struct inode_operations kernfs_iops = {
.listxattr = kernfs_iop_listxattr,
};
-void __init kernfs_inode_init(void)
-{
- if (bdi_init(&kernfs_bdi))
- panic("failed to init kernfs_bdi");
-}
-
static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
{
static DEFINE_MUTEX(iattr_mutex);
@@ -298,7 +286,6 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
kernfs_get(kn);
inode->i_private = kn;
inode->i_mapping->a_ops = &kernfs_aops;
- inode->i_mapping->backing_dev_info = &kernfs_bdi;
inode->i_op = &kernfs_iops;
set_default_inode_attr(inode, kn->mode);
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index dc84a3ef9ca2..af9fa7499919 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -88,7 +88,6 @@ int kernfs_iop_removexattr(struct dentry *dentry, const char *name);
ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
size_t size);
ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
-void kernfs_inode_init(void);
/*
* dir.c
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index f973ae9b05f1..8eaf417187f1 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -246,5 +246,4 @@ void __init kernfs_init(void)
kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
sizeof(struct kernfs_node),
0, SLAB_PANIC, NULL);
- kernfs_inode_init();
}
diff --git a/fs/libfs.c b/fs/libfs.c
index 005843ce5dbd..b2ffdb045be4 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -948,7 +948,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
mutex_lock(&inode->i_mutex);
ret = sync_mapping_buffers(inode->i_mapping);
- if (!(inode->i_state & I_DIRTY))
+ if (!(inode->i_state & I_DIRTY_ALL))
goto out;
if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
goto out;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 1cc6ec51e6b1..47a32b6d9b90 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -65,7 +65,7 @@ static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
return (struct sockaddr *)&nsm->sm_addr;
}
-static struct rpc_clnt *nsm_create(struct net *net)
+static struct rpc_clnt *nsm_create(struct net *net, const char *nodename)
{
struct sockaddr_in sin = {
.sin_family = AF_INET,
@@ -77,6 +77,7 @@ static struct rpc_clnt *nsm_create(struct net *net)
.address = (struct sockaddr *)&sin,
.addrsize = sizeof(sin),
.servername = "rpc.statd",
+ .nodename = nodename,
.program = &nsm_program,
.version = NSM_VERSION,
.authflavor = RPC_AUTH_NULL,
@@ -102,7 +103,7 @@ out:
return clnt;
}
-static struct rpc_clnt *nsm_client_get(struct net *net)
+static struct rpc_clnt *nsm_client_get(struct net *net, const char *nodename)
{
struct rpc_clnt *clnt, *new;
struct lockd_net *ln = net_generic(net, lockd_net_id);
@@ -111,7 +112,7 @@ static struct rpc_clnt *nsm_client_get(struct net *net)
if (clnt != NULL)
goto out;
- clnt = new = nsm_create(net);
+ clnt = new = nsm_create(net, nodename);
if (IS_ERR(clnt))
goto out;
@@ -190,19 +191,23 @@ int nsm_monitor(const struct nlm_host *host)
struct nsm_res res;
int status;
struct rpc_clnt *clnt;
+ const char *nodename = NULL;
dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name);
if (nsm->sm_monitored)
return 0;
+ if (host->h_rpcclnt)
+ nodename = host->h_rpcclnt->cl_nodename;
+
/*
* Choose whether to record the caller_name or IP address of
* this peer in the local rpc.statd's database.
*/
nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
- clnt = nsm_client_get(host->net);
+ clnt = nsm_client_get(host->net, nodename);
if (IS_ERR(clnt)) {
status = PTR_ERR(clnt);
dprintk("lockd: failed to create NSM upcall transport, "
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index e94c887da2d7..55505cbe11af 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -138,10 +138,6 @@ lockd(void *vrqstp)
dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n");
- if (!nlm_timeout)
- nlm_timeout = LOCKD_DFLT_TIMEO;
- nlmsvc_timeout = nlm_timeout * HZ;
-
/*
* The main request loop. We don't terminate until the last
* NFS mount or NFS daemon has gone away.
@@ -350,6 +346,10 @@ static struct svc_serv *lockd_create_svc(void)
printk(KERN_WARNING
"lockd_up: no pid, %d users??\n", nlmsvc_users);
+ if (!nlm_timeout)
+ nlm_timeout = LOCKD_DFLT_TIMEO;
+ nlmsvc_timeout = nlm_timeout * HZ;
+
serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, svc_rpcb_cleanup);
if (!serv) {
printk(KERN_WARNING "lockd_up: create service failed\n");
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 56598742dde4..5581e020644b 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -57,8 +57,8 @@ static DEFINE_SPINLOCK(nlm_blocked_lock);
static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
{
/*
- * We can get away with a static buffer because we're only
- * called with BKL held.
+ * We can get away with a static buffer because this is only called
+ * from lockd, which is single-threaded.
*/
static char buf[2*NLM_MAXCOOKIELEN+1];
unsigned int i, len = sizeof(buf);
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index d12ff4e2dbe7..665ef5a05183 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -164,12 +164,15 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
{
struct inode *inode = nlmsvc_file_inode(file);
struct file_lock *fl;
+ struct file_lock_context *flctx = inode->i_flctx;
struct nlm_host *lockhost;
+ if (!flctx || list_empty_careful(&flctx->flc_posix))
+ return 0;
again:
file->f_locks = 0;
- spin_lock(&inode->i_lock);
- for (fl = inode->i_flock; fl; fl = fl->fl_next) {
+ spin_lock(&flctx->flc_lock);
+ list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
if (fl->fl_lmops != &nlmsvc_lock_operations)
continue;
@@ -180,7 +183,7 @@ again:
if (match(lockhost, host)) {
struct file_lock lock = *fl;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&flctx->flc_lock);
lock.fl_type = F_UNLCK;
lock.fl_start = 0;
lock.fl_end = OFFSET_MAX;
@@ -192,7 +195,7 @@ again:
goto again;
}
}
- spin_unlock(&inode->i_lock);
+ spin_unlock(&flctx->flc_lock);
return 0;
}
@@ -223,18 +226,21 @@ nlm_file_inuse(struct nlm_file *file)
{
struct inode *inode = nlmsvc_file_inode(file);
struct file_lock *fl;
+ struct file_lock_context *flctx = inode->i_flctx;
if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
return 1;
- spin_lock(&inode->i_lock);
- for (fl = inode->i_flock; fl; fl = fl->fl_next) {
- if (fl->fl_lmops == &nlmsvc_lock_operations) {
- spin_unlock(&inode->i_lock);
- return 1;
+ if (flctx && !list_empty_careful(&flctx->flc_posix)) {
+ spin_lock(&flctx->flc_lock);
+ list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
+ if (fl->fl_lmops == &nlmsvc_lock_operations) {
+ spin_unlock(&flctx->flc_lock);
+ return 1;
+ }
}
+ spin_unlock(&flctx->flc_lock);
}
- spin_unlock(&inode->i_lock);
file->f_locks = 0;
return 0;
}
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 9340e7e10ef6..5b651daad518 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -95,14 +95,6 @@ nlm_decode_fh(__be32 *p, struct nfs_fh *f)
return p + XDR_QUADLEN(NFS2_FHSIZE);
}
-static inline __be32 *
-nlm_encode_fh(__be32 *p, struct nfs_fh *f)
-{
- *p++ = htonl(NFS2_FHSIZE);
- memcpy(p, f->data, NFS2_FHSIZE);
- return p + XDR_QUADLEN(NFS2_FHSIZE);
-}
-
/*
* Encode and decode owner handle
*/
diff --git a/fs/locks.c b/fs/locks.c
index 735b8d3fa78c..365c82e1b3a9 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -137,7 +137,7 @@
#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX)
#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK)
-#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG))
+#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK)
static bool lease_breaking(struct file_lock *fl)
@@ -157,14 +157,11 @@ static int target_leasetype(struct file_lock *fl)
int leases_enable = 1;
int lease_break_time = 45;
-#define for_each_lock(inode, lockp) \
- for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next)
-
/*
* The global file_lock_list is only used for displaying /proc/locks, so we
* keep a list on each CPU, with each list protected by its own spinlock via
* the file_lock_lglock. Note that alterations to the list also require that
- * the relevant i_lock is held.
+ * the relevant flc_lock is held.
*/
DEFINE_STATIC_LGLOCK(file_lock_lglock);
static DEFINE_PER_CPU(struct hlist_head, file_lock_list);
@@ -192,21 +189,68 @@ static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS);
* contrast to those that are acting as records of acquired locks).
*
* Note that when we acquire this lock in order to change the above fields,
- * we often hold the i_lock as well. In certain cases, when reading the fields
+ * we often hold the flc_lock as well. In certain cases, when reading the fields
* protected by this lock, we can skip acquiring it iff we already hold the
- * i_lock.
+ * flc_lock.
*
* In particular, adding an entry to the fl_block list requires that you hold
- * both the i_lock and the blocked_lock_lock (acquired in that order). Deleting
- * an entry from the list however only requires the file_lock_lock.
+ * both the flc_lock and the blocked_lock_lock (acquired in that order).
+ * Deleting an entry from the list however only requires the file_lock_lock.
*/
static DEFINE_SPINLOCK(blocked_lock_lock);
+static struct kmem_cache *flctx_cache __read_mostly;
static struct kmem_cache *filelock_cache __read_mostly;
+static struct file_lock_context *
+locks_get_lock_context(struct inode *inode)
+{
+ struct file_lock_context *new;
+
+ if (likely(inode->i_flctx))
+ goto out;
+
+ new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
+ if (!new)
+ goto out;
+
+ spin_lock_init(&new->flc_lock);
+ INIT_LIST_HEAD(&new->flc_flock);
+ INIT_LIST_HEAD(&new->flc_posix);
+ INIT_LIST_HEAD(&new->flc_lease);
+
+ /*
+ * Assign the pointer if it's not already assigned. If it is, then
+ * free the context we just allocated.
+ */
+ spin_lock(&inode->i_lock);
+ if (likely(!inode->i_flctx)) {
+ inode->i_flctx = new;
+ new = NULL;
+ }
+ spin_unlock(&inode->i_lock);
+
+ if (new)
+ kmem_cache_free(flctx_cache, new);
+out:
+ return inode->i_flctx;
+}
+
+void
+locks_free_lock_context(struct file_lock_context *ctx)
+{
+ if (ctx) {
+ WARN_ON_ONCE(!list_empty(&ctx->flc_flock));
+ WARN_ON_ONCE(!list_empty(&ctx->flc_posix));
+ WARN_ON_ONCE(!list_empty(&ctx->flc_lease));
+ kmem_cache_free(flctx_cache, ctx);
+ }
+}
+
static void locks_init_lock_heads(struct file_lock *fl)
{
INIT_HLIST_NODE(&fl->fl_link);
+ INIT_LIST_HEAD(&fl->fl_list);
INIT_LIST_HEAD(&fl->fl_block);
init_waitqueue_head(&fl->fl_wait);
}
@@ -243,6 +287,7 @@ EXPORT_SYMBOL_GPL(locks_release_private);
void locks_free_lock(struct file_lock *fl)
{
BUG_ON(waitqueue_active(&fl->fl_wait));
+ BUG_ON(!list_empty(&fl->fl_list));
BUG_ON(!list_empty(&fl->fl_block));
BUG_ON(!hlist_unhashed(&fl->fl_link));
@@ -257,8 +302,8 @@ locks_dispose_list(struct list_head *dispose)
struct file_lock *fl;
while (!list_empty(dispose)) {
- fl = list_first_entry(dispose, struct file_lock, fl_block);
- list_del_init(&fl->fl_block);
+ fl = list_first_entry(dispose, struct file_lock, fl_list);
+ list_del_init(&fl->fl_list);
locks_free_lock(fl);
}
}
@@ -513,7 +558,7 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
return fl1->fl_owner == fl2->fl_owner;
}
-/* Must be called with the i_lock held! */
+/* Must be called with the flc_lock held! */
static void locks_insert_global_locks(struct file_lock *fl)
{
lg_local_lock(&file_lock_lglock);
@@ -522,12 +567,12 @@ static void locks_insert_global_locks(struct file_lock *fl)
lg_local_unlock(&file_lock_lglock);
}
-/* Must be called with the i_lock held! */
+/* Must be called with the flc_lock held! */
static void locks_delete_global_locks(struct file_lock *fl)
{
/*
* Avoid taking lock if already unhashed. This is safe since this check
- * is done while holding the i_lock, and new insertions into the list
+ * is done while holding the flc_lock, and new insertions into the list
* also require that it be held.
*/
if (hlist_unhashed(&fl->fl_link))
@@ -579,10 +624,10 @@ static void locks_delete_block(struct file_lock *waiter)
* the order they blocked. The documentation doesn't require this but
* it seems like the reasonable thing to do.
*
- * Must be called with both the i_lock and blocked_lock_lock held. The fl_block
- * list itself is protected by the blocked_lock_lock, but by ensuring that the
- * i_lock is also held on insertions we can avoid taking the blocked_lock_lock
- * in some cases when we see that the fl_block list is empty.
+ * Must be called with both the flc_lock and blocked_lock_lock held. The
+ * fl_block list itself is protected by the blocked_lock_lock, but by ensuring
+ * that the flc_lock is also held on insertions we can avoid taking the
+ * blocked_lock_lock in some cases when we see that the fl_block list is empty.
*/
static void __locks_insert_block(struct file_lock *blocker,
struct file_lock *waiter)
@@ -594,7 +639,7 @@ static void __locks_insert_block(struct file_lock *blocker,
locks_insert_global_blocked(waiter);
}
-/* Must be called with i_lock held. */
+/* Must be called with flc_lock held. */
static void locks_insert_block(struct file_lock *blocker,
struct file_lock *waiter)
{
@@ -606,15 +651,15 @@ static void locks_insert_block(struct file_lock *blocker,
/*
* Wake up processes blocked waiting for blocker.
*
- * Must be called with the inode->i_lock held!
+ * Must be called with the inode->flc_lock held!
*/
static void locks_wake_up_blocks(struct file_lock *blocker)
{
/*
* Avoid taking global lock if list is empty. This is safe since new
- * blocked requests are only added to the list under the i_lock, and
- * the i_lock is always held here. Note that removal from the fl_block
- * list does not require the i_lock, so we must recheck list_empty()
+ * blocked requests are only added to the list under the flc_lock, and
+ * the flc_lock is always held here. Note that removal from the fl_block
+ * list does not require the flc_lock, so we must recheck list_empty()
* after acquiring the blocked_lock_lock.
*/
if (list_empty(&blocker->fl_block))
@@ -635,63 +680,32 @@ static void locks_wake_up_blocks(struct file_lock *blocker)
spin_unlock(&blocked_lock_lock);
}
-/* Insert file lock fl into an inode's lock list at the position indicated
- * by pos. At the same time add the lock to the global file lock list.
- *
- * Must be called with the i_lock held!
- */
-static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
+static void
+locks_insert_lock_ctx(struct file_lock *fl, struct list_head *before)
{
fl->fl_nspid = get_pid(task_tgid(current));
-
- /* insert into file's list */
- fl->fl_next = *pos;
- *pos = fl;
-
+ list_add_tail(&fl->fl_list, before);
locks_insert_global_locks(fl);
}
-/**
- * locks_delete_lock - Delete a lock and then free it.
- * @thisfl_p: pointer that points to the fl_next field of the previous
- * inode->i_flock list entry
- *
- * Unlink a lock from all lists and free the namespace reference, but don't
- * free it yet. Wake up processes that are blocked waiting for this lock and
- * notify the FS that the lock has been cleared.
- *
- * Must be called with the i_lock held!
- */
-static void locks_unlink_lock(struct file_lock **thisfl_p)
+static void
+locks_unlink_lock_ctx(struct file_lock *fl)
{
- struct file_lock *fl = *thisfl_p;
-
locks_delete_global_locks(fl);
-
- *thisfl_p = fl->fl_next;
- fl->fl_next = NULL;
-
+ list_del_init(&fl->fl_list);
if (fl->fl_nspid) {
put_pid(fl->fl_nspid);
fl->fl_nspid = NULL;
}
-
locks_wake_up_blocks(fl);
}
-/*
- * Unlink a lock from all lists and free it.
- *
- * Must be called with i_lock held!
- */
-static void locks_delete_lock(struct file_lock **thisfl_p,
- struct list_head *dispose)
+static void
+locks_delete_lock_ctx(struct file_lock *fl, struct list_head *dispose)
{
- struct file_lock *fl = *thisfl_p;
-
- locks_unlink_lock(thisfl_p);
+ locks_unlink_lock_ctx(fl);
if (dispose)
- list_add(&fl->fl_block, dispose);
+ list_add(&fl->fl_list, dispose);
else
locks_free_lock(fl);
}
@@ -746,22 +760,27 @@ void
posix_test_lock(struct file *filp, struct file_lock *fl)
{
struct file_lock *cfl;
+ struct file_lock_context *ctx;
struct inode *inode = file_inode(filp);
- spin_lock(&inode->i_lock);
- for (cfl = file_inode(filp)->i_flock; cfl; cfl = cfl->fl_next) {
- if (!IS_POSIX(cfl))
- continue;
- if (posix_locks_conflict(fl, cfl))
- break;
- }
- if (cfl) {
- locks_copy_conflock(fl, cfl);
- if (cfl->fl_nspid)
- fl->fl_pid = pid_vnr(cfl->fl_nspid);
- } else
+ ctx = inode->i_flctx;
+ if (!ctx || list_empty_careful(&ctx->flc_posix)) {
fl->fl_type = F_UNLCK;
- spin_unlock(&inode->i_lock);
+ return;
+ }
+
+ spin_lock(&ctx->flc_lock);
+ list_for_each_entry(cfl, &ctx->flc_posix, fl_list) {
+ if (posix_locks_conflict(fl, cfl)) {
+ locks_copy_conflock(fl, cfl);
+ if (cfl->fl_nspid)
+ fl->fl_pid = pid_vnr(cfl->fl_nspid);
+ goto out;
+ }
+ }
+ fl->fl_type = F_UNLCK;
+out:
+ spin_unlock(&ctx->flc_lock);
return;
}
EXPORT_SYMBOL(posix_test_lock);
@@ -845,34 +864,34 @@ static int posix_locks_deadlock(struct file_lock *caller_fl,
static int flock_lock_file(struct file *filp, struct file_lock *request)
{
struct file_lock *new_fl = NULL;
- struct file_lock **before;
- struct inode * inode = file_inode(filp);
+ struct file_lock *fl;
+ struct file_lock_context *ctx;
+ struct inode *inode = file_inode(filp);
int error = 0;
- int found = 0;
+ bool found = false;
LIST_HEAD(dispose);
+ ctx = locks_get_lock_context(inode);
+ if (!ctx)
+ return -ENOMEM;
+
if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) {
new_fl = locks_alloc_lock();
if (!new_fl)
return -ENOMEM;
}
- spin_lock(&inode->i_lock);
+ spin_lock(&ctx->flc_lock);
if (request->fl_flags & FL_ACCESS)
goto find_conflict;
- for_each_lock(inode, before) {
- struct file_lock *fl = *before;
- if (IS_POSIX(fl))
- break;
- if (IS_LEASE(fl))
- continue;
+ list_for_each_entry(fl, &ctx->flc_flock, fl_list) {
if (filp != fl->fl_file)
continue;
if (request->fl_type == fl->fl_type)
goto out;
- found = 1;
- locks_delete_lock(before, &dispose);
+ found = true;
+ locks_delete_lock_ctx(fl, &dispose);
break;
}
@@ -882,23 +901,8 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
goto out;
}
- /*
- * If a higher-priority process was blocked on the old file lock,
- * give it the opportunity to lock the file.
- */
- if (found) {
- spin_unlock(&inode->i_lock);
- cond_resched();
- spin_lock(&inode->i_lock);
- }
-
find_conflict:
- for_each_lock(inode, before) {
- struct file_lock *fl = *before;
- if (IS_POSIX(fl))
- break;
- if (IS_LEASE(fl))
- continue;
+ list_for_each_entry(fl, &ctx->flc_flock, fl_list) {
if (!flock_locks_conflict(request, fl))
continue;
error = -EAGAIN;
@@ -911,12 +915,12 @@ find_conflict:
if (request->fl_flags & FL_ACCESS)
goto out;
locks_copy_lock(new_fl, request);
- locks_insert_lock(before, new_fl);
+ locks_insert_lock_ctx(new_fl, &ctx->flc_flock);
new_fl = NULL;
error = 0;
out:
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ctx->flc_lock);
if (new_fl)
locks_free_lock(new_fl);
locks_dispose_list(&dispose);
@@ -925,16 +929,20 @@ out:
static int __posix_lock_file(struct inode *inode, struct file_lock *request, struct file_lock *conflock)
{
- struct file_lock *fl;
+ struct file_lock *fl, *tmp;
struct file_lock *new_fl = NULL;
struct file_lock *new_fl2 = NULL;
struct file_lock *left = NULL;
struct file_lock *right = NULL;
- struct file_lock **before;
+ struct file_lock_context *ctx;
int error;
bool added = false;
LIST_HEAD(dispose);
+ ctx = locks_get_lock_context(inode);
+ if (!ctx)
+ return -ENOMEM;
+
/*
* We may need two file_lock structures for this operation,
* so we get them in advance to avoid races.
@@ -948,15 +956,14 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
new_fl2 = locks_alloc_lock();
}
- spin_lock(&inode->i_lock);
+ spin_lock(&ctx->flc_lock);
/*
* New lock request. Walk all POSIX locks and look for conflicts. If
* there are any, either return error or put the request on the
* blocker's list of waiters and the global blocked_hash.
*/
if (request->fl_type != F_UNLCK) {
- for_each_lock(inode, before) {
- fl = *before;
+ list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
if (!IS_POSIX(fl))
continue;
if (!posix_locks_conflict(request, fl))
@@ -986,29 +993,25 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
if (request->fl_flags & FL_ACCESS)
goto out;
- /*
- * Find the first old lock with the same owner as the new lock.
- */
-
- before = &inode->i_flock;
-
- /* First skip locks owned by other processes. */
- while ((fl = *before) && (!IS_POSIX(fl) ||
- !posix_same_owner(request, fl))) {
- before = &fl->fl_next;
+ /* Find the first old lock with the same owner as the new lock */
+ list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
+ if (posix_same_owner(request, fl))
+ break;
}
/* Process locks with this owner. */
- while ((fl = *before) && posix_same_owner(request, fl)) {
- /* Detect adjacent or overlapping regions (if same lock type)
- */
+ list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, fl_list) {
+ if (!posix_same_owner(request, fl))
+ break;
+
+ /* Detect adjacent or overlapping regions (if same lock type) */
if (request->fl_type == fl->fl_type) {
/* In all comparisons of start vs end, use
* "start - 1" rather than "end + 1". If end
* is OFFSET_MAX, end + 1 will become negative.
*/
if (fl->fl_end < request->fl_start - 1)
- goto next_lock;
+ continue;
/* If the next lock in the list has entirely bigger
* addresses than the new one, insert the lock here.
*/
@@ -1029,18 +1032,17 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
else
request->fl_end = fl->fl_end;
if (added) {
- locks_delete_lock(before, &dispose);
+ locks_delete_lock_ctx(fl, &dispose);
continue;
}
request = fl;
added = true;
- }
- else {
+ } else {
/* Processing for different lock types is a bit
* more complex.
*/
if (fl->fl_end < request->fl_start)
- goto next_lock;
+ continue;
if (fl->fl_start > request->fl_end)
break;
if (request->fl_type == F_UNLCK)
@@ -1059,7 +1061,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
* one (This may happen several times).
*/
if (added) {
- locks_delete_lock(before, &dispose);
+ locks_delete_lock_ctx(fl, &dispose);
continue;
}
/*
@@ -1075,15 +1077,11 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
locks_copy_lock(new_fl, request);
request = new_fl;
new_fl = NULL;
- locks_delete_lock(before, &dispose);
- locks_insert_lock(before, request);
+ locks_insert_lock_ctx(request, &fl->fl_list);
+ locks_delete_lock_ctx(fl, &dispose);
added = true;
}
}
- /* Go on to next lock.
- */
- next_lock:
- before = &fl->fl_next;
}
/*
@@ -1108,7 +1106,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
goto out;
}
locks_copy_lock(new_fl, request);
- locks_insert_lock(before, new_fl);
+ locks_insert_lock_ctx(new_fl, &fl->fl_list);
+ fl = new_fl;
new_fl = NULL;
}
if (right) {
@@ -1119,7 +1118,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
left = new_fl2;
new_fl2 = NULL;
locks_copy_lock(left, right);
- locks_insert_lock(before, left);
+ locks_insert_lock_ctx(left, &fl->fl_list);
}
right->fl_start = request->fl_end + 1;
locks_wake_up_blocks(right);
@@ -1129,7 +1128,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
locks_wake_up_blocks(left);
}
out:
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ctx->flc_lock);
/*
* Free any unused locks.
*/
@@ -1199,22 +1198,29 @@ EXPORT_SYMBOL(posix_lock_file_wait);
*/
int locks_mandatory_locked(struct file *file)
{
+ int ret;
struct inode *inode = file_inode(file);
+ struct file_lock_context *ctx;
struct file_lock *fl;
+ ctx = inode->i_flctx;
+ if (!ctx || list_empty_careful(&ctx->flc_posix))
+ return 0;
+
/*
* Search the lock list for this inode for any POSIX locks.
*/
- spin_lock(&inode->i_lock);
- for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
- if (!IS_POSIX(fl))
- continue;
+ spin_lock(&ctx->flc_lock);
+ ret = 0;
+ list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
if (fl->fl_owner != current->files &&
- fl->fl_owner != file)
+ fl->fl_owner != file) {
+ ret = -EAGAIN;
break;
+ }
}
- spin_unlock(&inode->i_lock);
- return fl ? -EAGAIN : 0;
+ spin_unlock(&ctx->flc_lock);
+ return ret;
}
/**
@@ -1294,9 +1300,8 @@ static void lease_clear_pending(struct file_lock *fl, int arg)
}
/* We already had a lease on this file; just change its type */
-int lease_modify(struct file_lock **before, int arg, struct list_head *dispose)
+int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose)
{
- struct file_lock *fl = *before;
int error = assign_type(fl, arg);
if (error)
@@ -1313,7 +1318,7 @@ int lease_modify(struct file_lock **before, int arg, struct list_head *dispose)
printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync);
fl->fl_fasync = NULL;
}
- locks_delete_lock(before, dispose);
+ locks_delete_lock_ctx(fl, dispose);
}
return 0;
}
@@ -1329,25 +1334,24 @@ static bool past_time(unsigned long then)
static void time_out_leases(struct inode *inode, struct list_head *dispose)
{
- struct file_lock **before;
- struct file_lock *fl;
+ struct file_lock_context *ctx = inode->i_flctx;
+ struct file_lock *fl, *tmp;
- lockdep_assert_held(&inode->i_lock);
+ lockdep_assert_held(&ctx->flc_lock);
- before = &inode->i_flock;
- while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) {
+ list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) {
trace_time_out_leases(inode, fl);
if (past_time(fl->fl_downgrade_time))
- lease_modify(before, F_RDLCK, dispose);
+ lease_modify(fl, F_RDLCK, dispose);
if (past_time(fl->fl_break_time))
- lease_modify(before, F_UNLCK, dispose);
- if (fl == *before) /* lease_modify may have freed fl */
- before = &fl->fl_next;
+ lease_modify(fl, F_UNLCK, dispose);
}
}
static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
{
+ if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT))
+ return false;
if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE))
return false;
return locks_conflict(breaker, lease);
@@ -1356,11 +1360,12 @@ static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
static bool
any_leases_conflict(struct inode *inode, struct file_lock *breaker)
{
+ struct file_lock_context *ctx = inode->i_flctx;
struct file_lock *fl;
- lockdep_assert_held(&inode->i_lock);
+ lockdep_assert_held(&ctx->flc_lock);
- for (fl = inode->i_flock ; fl && IS_LEASE(fl); fl = fl->fl_next) {
+ list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
if (leases_conflict(fl, breaker))
return true;
}
@@ -1384,7 +1389,8 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
{
int error = 0;
struct file_lock *new_fl;
- struct file_lock *fl, **before;
+ struct file_lock_context *ctx = inode->i_flctx;
+ struct file_lock *fl;
unsigned long break_time;
int want_write = (mode & O_ACCMODE) != O_RDONLY;
LIST_HEAD(dispose);
@@ -1394,7 +1400,13 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
return PTR_ERR(new_fl);
new_fl->fl_flags = type;
- spin_lock(&inode->i_lock);
+ /* typically we will check that ctx is non-NULL before calling */
+ if (!ctx) {
+ WARN_ON_ONCE(1);
+ return error;
+ }
+
+ spin_lock(&ctx->flc_lock);
time_out_leases(inode, &dispose);
@@ -1408,9 +1420,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
break_time++; /* so that 0 means no break time */
}
- for (before = &inode->i_flock;
- ((fl = *before) != NULL) && IS_LEASE(fl);
- before = &fl->fl_next) {
+ list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
if (!leases_conflict(fl, new_fl))
continue;
if (want_write) {
@@ -1419,17 +1429,16 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
fl->fl_flags |= FL_UNLOCK_PENDING;
fl->fl_break_time = break_time;
} else {
- if (lease_breaking(inode->i_flock))
+ if (lease_breaking(fl))
continue;
fl->fl_flags |= FL_DOWNGRADE_PENDING;
fl->fl_downgrade_time = break_time;
}
if (fl->fl_lmops->lm_break(fl))
- locks_delete_lock(before, &dispose);
+ locks_delete_lock_ctx(fl, &dispose);
}
- fl = inode->i_flock;
- if (!fl || !IS_LEASE(fl))
+ if (list_empty(&ctx->flc_lease))
goto out;
if (mode & O_NONBLOCK) {
@@ -1439,18 +1448,19 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
}
restart:
- break_time = inode->i_flock->fl_break_time;
+ fl = list_first_entry(&ctx->flc_lease, struct file_lock, fl_list);
+ break_time = fl->fl_break_time;
if (break_time != 0)
break_time -= jiffies;
if (break_time == 0)
break_time++;
- locks_insert_block(inode->i_flock, new_fl);
+ locks_insert_block(fl, new_fl);
trace_break_lease_block(inode, new_fl);
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ctx->flc_lock);
locks_dispose_list(&dispose);
error = wait_event_interruptible_timeout(new_fl->fl_wait,
!new_fl->fl_next, break_time);
- spin_lock(&inode->i_lock);
+ spin_lock(&ctx->flc_lock);
trace_break_lease_unblock(inode, new_fl);
locks_delete_block(new_fl);
if (error >= 0) {
@@ -1462,12 +1472,10 @@ restart:
time_out_leases(inode, &dispose);
if (any_leases_conflict(inode, new_fl))
goto restart;
-
error = 0;
}
-
out:
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ctx->flc_lock);
locks_dispose_list(&dispose);
locks_free_lock(new_fl);
return error;
@@ -1487,14 +1495,18 @@ EXPORT_SYMBOL(__break_lease);
void lease_get_mtime(struct inode *inode, struct timespec *time)
{
bool has_lease = false;
- struct file_lock *flock;
+ struct file_lock_context *ctx = inode->i_flctx;
+ struct file_lock *fl;
- if (inode->i_flock) {
- spin_lock(&inode->i_lock);
- flock = inode->i_flock;
- if (flock && IS_LEASE(flock) && (flock->fl_type == F_WRLCK))
- has_lease = true;
- spin_unlock(&inode->i_lock);
+ if (ctx && !list_empty_careful(&ctx->flc_lease)) {
+ spin_lock(&ctx->flc_lock);
+ if (!list_empty(&ctx->flc_lease)) {
+ fl = list_first_entry(&ctx->flc_lease,
+ struct file_lock, fl_list);
+ if (fl->fl_type == F_WRLCK)
+ has_lease = true;
+ }
+ spin_unlock(&ctx->flc_lock);
}
if (has_lease)
@@ -1532,20 +1544,22 @@ int fcntl_getlease(struct file *filp)
{
struct file_lock *fl;
struct inode *inode = file_inode(filp);
+ struct file_lock_context *ctx = inode->i_flctx;
int type = F_UNLCK;
LIST_HEAD(dispose);
- spin_lock(&inode->i_lock);
- time_out_leases(file_inode(filp), &dispose);
- for (fl = file_inode(filp)->i_flock; fl && IS_LEASE(fl);
- fl = fl->fl_next) {
- if (fl->fl_file == filp) {
+ if (ctx && !list_empty_careful(&ctx->flc_lease)) {
+ spin_lock(&ctx->flc_lock);
+ time_out_leases(file_inode(filp), &dispose);
+ list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
+ if (fl->fl_file != filp)
+ continue;
type = target_leasetype(fl);
break;
}
+ spin_unlock(&ctx->flc_lock);
+ locks_dispose_list(&dispose);
}
- spin_unlock(&inode->i_lock);
- locks_dispose_list(&dispose);
return type;
}
@@ -1560,11 +1574,14 @@ int fcntl_getlease(struct file *filp)
* conflict with the lease we're trying to set.
*/
static int
-check_conflicting_open(const struct dentry *dentry, const long arg)
+check_conflicting_open(const struct dentry *dentry, const long arg, int flags)
{
int ret = 0;
struct inode *inode = dentry->d_inode;
+ if (flags & FL_LAYOUT)
+ return 0;
+
if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
return -EAGAIN;
@@ -1578,9 +1595,10 @@ check_conflicting_open(const struct dentry *dentry, const long arg)
static int
generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
{
- struct file_lock *fl, **before, **my_before = NULL, *lease;
+ struct file_lock *fl, *my_fl = NULL, *lease;
struct dentry *dentry = filp->f_path.dentry;
struct inode *inode = dentry->d_inode;
+ struct file_lock_context *ctx;
bool is_deleg = (*flp)->fl_flags & FL_DELEG;
int error;
LIST_HEAD(dispose);
@@ -1588,6 +1606,10 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
lease = *flp;
trace_generic_add_lease(inode, lease);
+ ctx = locks_get_lock_context(inode);
+ if (!ctx)
+ return -ENOMEM;
+
/*
* In the delegation case we need mutual exclusion with
* a number of operations that take the i_mutex. We trylock
@@ -1606,9 +1628,9 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
return -EINVAL;
}
- spin_lock(&inode->i_lock);
+ spin_lock(&ctx->flc_lock);
time_out_leases(inode, &dispose);
- error = check_conflicting_open(dentry, arg);
+ error = check_conflicting_open(dentry, arg, lease->fl_flags);
if (error)
goto out;
@@ -1621,13 +1643,13 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
* except for this filp.
*/
error = -EAGAIN;
- for (before = &inode->i_flock;
- ((fl = *before) != NULL) && IS_LEASE(fl);
- before = &fl->fl_next) {
- if (fl->fl_file == filp) {
- my_before = before;
+ list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
+ if (fl->fl_file == filp &&
+ fl->fl_owner == lease->fl_owner) {
+ my_fl = fl;
continue;
}
+
/*
* No exclusive leases if someone else has a lease on
* this file:
@@ -1642,9 +1664,8 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
goto out;
}
- if (my_before != NULL) {
- lease = *my_before;
- error = lease->fl_lmops->lm_change(my_before, arg, &dispose);
+ if (my_fl != NULL) {
+ error = lease->fl_lmops->lm_change(my_fl, arg, &dispose);
if (error)
goto out;
goto out_setup;
@@ -1654,7 +1675,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
if (!leases_enable)
goto out;
- locks_insert_lock(before, lease);
+ locks_insert_lock_ctx(lease, &ctx->flc_lease);
/*
* The check in break_lease() is lockless. It's possible for another
* open to race in after we did the earlier check for a conflicting
@@ -1665,46 +1686,51 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
* precedes these checks.
*/
smp_mb();
- error = check_conflicting_open(dentry, arg);
- if (error)
- goto out_unlink;
+ error = check_conflicting_open(dentry, arg, lease->fl_flags);
+ if (error) {
+ locks_unlink_lock_ctx(lease);
+ goto out;
+ }
out_setup:
if (lease->fl_lmops->lm_setup)
lease->fl_lmops->lm_setup(lease, priv);
out:
- spin_unlock(&inode->i_lock);
+ spin_unlock(&ctx->flc_lock);
locks_dispose_list(&dispose);
if (is_deleg)
mutex_unlock(&inode->i_mutex);
- if (!error && !my_before)
+ if (!error && !my_fl)
*flp = NULL;
return error;
-out_unlink:
- locks_unlink_lock(before);
- goto out;
}
-static int generic_delete_lease(struct file *filp)
+static int generic_delete_lease(struct file *filp, void *owner)
{
int error = -EAGAIN;
- struct file_lock *fl, **before;
+ struct file_lock *fl, *victim = NULL;
struct dentry *dentry = filp->f_path.dentry;
struct inode *inode = dentry->d_inode;
+ struct file_lock_context *ctx = inode->i_flctx;
LIST_HEAD(dispose);
- spin_lock(&inode->i_lock);
- time_out_leases(inode, &dispose);
- for (before = &inode->i_flock;
- ((fl = *before) != NULL) && IS_LEASE(fl);
- before = &fl->fl_next) {
- if (fl->fl_file == filp)
+ if (!ctx) {
+ trace_generic_delete_lease(inode, NULL);
+ return error;
+ }
+
+ spin_lock(&ctx->flc_lock);
+ list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
+ if (fl->fl_file == filp &&
+ fl->fl_owner == owner) {
+ victim = fl;
break;
+ }
}
trace_generic_delete_lease(inode, fl);
- if (fl)
- error = fl->fl_lmops->lm_change(before, F_UNLCK, &dispose);
- spin_unlock(&inode->i_lock);
+ if (victim)
+ error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
+ spin_unlock(&ctx->flc_lock);
locks_dispose_list(&dispose);
return error;
}
@@ -1737,13 +1763,14 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
switch (arg) {
case F_UNLCK:
- return generic_delete_lease(filp);
+ return generic_delete_lease(filp, *priv);
case F_RDLCK:
case F_WRLCK:
if (!(*flp)->fl_lmops->lm_break) {
WARN_ON_ONCE(1);
return -ENOLCK;
}
+
return generic_add_lease(filp, arg, flp, priv);
default:
return -EINVAL;
@@ -1816,7 +1843,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
{
if (arg == F_UNLCK)
- return vfs_setlease(filp, F_UNLCK, NULL, NULL);
+ return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp);
return do_fcntl_add_lease(fd, filp, arg);
}
@@ -2171,7 +2198,7 @@ again:
*/
/*
* we need that spin_lock here - it prevents reordering between
- * update of inode->i_flock and check for it done in close().
+ * update of i_flctx->flc_posix and check for it done in close().
* rcu_read_lock() wouldn't do.
*/
spin_lock(&current->files->file_lock);
@@ -2331,13 +2358,14 @@ out:
void locks_remove_posix(struct file *filp, fl_owner_t owner)
{
struct file_lock lock;
+ struct file_lock_context *ctx = file_inode(filp)->i_flctx;
/*
* If there are no locks held on this file, we don't need to call
* posix_lock_file(). Another process could be setting a lock on this
* file at the same time, but we wouldn't remove that lock anyway.
*/
- if (!file_inode(filp)->i_flock)
+ if (!ctx || list_empty(&ctx->flc_posix))
return;
lock.fl_type = F_UNLCK;
@@ -2358,67 +2386,68 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
EXPORT_SYMBOL(locks_remove_posix);
+/* The i_flctx must be valid when calling into here */
+static void
+locks_remove_flock(struct file *filp)
+{
+ struct file_lock fl = {
+ .fl_owner = filp,
+ .fl_pid = current->tgid,
+ .fl_file = filp,
+ .fl_flags = FL_FLOCK,
+ .fl_type = F_UNLCK,
+ .fl_end = OFFSET_MAX,
+ };
+ struct file_lock_context *flctx = file_inode(filp)->i_flctx;
+
+ if (list_empty(&flctx->flc_flock))
+ return;
+
+ if (filp->f_op->flock)
+ filp->f_op->flock(filp, F_SETLKW, &fl);
+ else
+ flock_lock_file(filp, &fl);
+
+ if (fl.fl_ops && fl.fl_ops->fl_release_private)
+ fl.fl_ops->fl_release_private(&fl);
+}
+
+/* The i_flctx must be valid when calling into here */
+static void
+locks_remove_lease(struct file *filp)
+{
+ struct inode *inode = file_inode(filp);
+ struct file_lock_context *ctx = inode->i_flctx;
+ struct file_lock *fl, *tmp;
+ LIST_HEAD(dispose);
+
+ if (list_empty(&ctx->flc_lease))
+ return;
+
+ spin_lock(&ctx->flc_lock);
+ list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
+ if (filp == fl->fl_file)
+ lease_modify(fl, F_UNLCK, &dispose);
+ spin_unlock(&ctx->flc_lock);
+ locks_dispose_list(&dispose);
+}
+
/*
* This function is called on the last close of an open file.
*/
void locks_remove_file(struct file *filp)
{
- struct inode * inode = file_inode(filp);
- struct file_lock *fl;
- struct file_lock **before;
- LIST_HEAD(dispose);
-
- if (!inode->i_flock)
+ if (!file_inode(filp)->i_flctx)
return;
+ /* remove any OFD locks */
locks_remove_posix(filp, filp);
- if (filp->f_op->flock) {
- struct file_lock fl = {
- .fl_owner = filp,
- .fl_pid = current->tgid,
- .fl_file = filp,
- .fl_flags = FL_FLOCK,
- .fl_type = F_UNLCK,
- .fl_end = OFFSET_MAX,
- };
- filp->f_op->flock(filp, F_SETLKW, &fl);
- if (fl.fl_ops && fl.fl_ops->fl_release_private)
- fl.fl_ops->fl_release_private(&fl);
- }
-
- spin_lock(&inode->i_lock);
- before = &inode->i_flock;
+ /* remove flock locks */
+ locks_remove_flock(filp);
- while ((fl = *before) != NULL) {
- if (fl->fl_file == filp) {
- if (IS_LEASE(fl)) {
- lease_modify(before, F_UNLCK, &dispose);
- continue;
- }
-
- /*
- * There's a leftover lock on the list of a type that
- * we didn't expect to see. Most likely a classic
- * POSIX lock that ended up not getting released
- * properly, or that raced onto the list somehow. Log
- * some info about it and then just remove it from
- * the list.
- */
- WARN(!IS_FLOCK(fl),
- "leftover lock: dev=%u:%u ino=%lu type=%hhd flags=0x%x start=%lld end=%lld\n",
- MAJOR(inode->i_sb->s_dev),
- MINOR(inode->i_sb->s_dev), inode->i_ino,
- fl->fl_type, fl->fl_flags,
- fl->fl_start, fl->fl_end);
-
- locks_delete_lock(before, &dispose);
- continue;
- }
- before = &fl->fl_next;
- }
- spin_unlock(&inode->i_lock);
- locks_dispose_list(&dispose);
+ /* remove any leases */
+ locks_remove_lease(filp);
}
/**
@@ -2621,6 +2650,9 @@ static int __init filelock_init(void)
{
int i;
+ flctx_cache = kmem_cache_create("file_lock_ctx",
+ sizeof(struct file_lock_context), 0, SLAB_PANIC, NULL);
+
filelock_cache = kmem_cache_create("file_lock_cache",
sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
diff --git a/fs/mount.h b/fs/mount.h
index 0ad6f760ce52..6a61c2b3e385 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -2,6 +2,7 @@
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/ns_common.h>
+#include <linux/fs_pin.h>
struct mnt_namespace {
atomic_t count;
@@ -62,7 +63,8 @@ struct mount {
int mnt_group_id; /* peer group identifier */
int mnt_expiry_mark; /* true if marked for expiry */
struct hlist_head mnt_pins;
- struct path mnt_ex_mountpoint;
+ struct fs_pin mnt_umount;
+ struct dentry *mnt_ex_mountpoint;
};
#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
diff --git a/fs/namei.c b/fs/namei.c
index bc35b02883bb..96ca11dea4a2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -118,15 +118,6 @@
* POSIX.1 2.4: an empty pathname is invalid (ENOENT).
* PATH_MAX includes the nul terminator --RR.
*/
-void final_putname(struct filename *name)
-{
- if (name->separate) {
- __putname(name->name);
- kfree(name);
- } else {
- __putname(name);
- }
-}
#define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename))
@@ -145,6 +136,7 @@ getname_flags(const char __user *filename, int flags, int *empty)
result = __getname();
if (unlikely(!result))
return ERR_PTR(-ENOMEM);
+ result->refcnt = 1;
/*
* First, try to embed the struct filename inside the names_cache
@@ -179,6 +171,7 @@ recopy:
}
result->name = kname;
result->separate = true;
+ result->refcnt = 1;
max = PATH_MAX;
goto recopy;
}
@@ -202,7 +195,7 @@ recopy:
return result;
error:
- final_putname(result);
+ putname(result);
return err;
}
@@ -212,43 +205,56 @@ getname(const char __user * filename)
return getname_flags(filename, 0, NULL);
}
-/*
- * The "getname_kernel()" interface doesn't do pathnames longer
- * than EMBEDDED_NAME_MAX. Deal with it - you're a kernel user.
- */
struct filename *
getname_kernel(const char * filename)
{
struct filename *result;
- char *kname;
- int len;
-
- len = strlen(filename);
- if (len >= EMBEDDED_NAME_MAX)
- return ERR_PTR(-ENAMETOOLONG);
+ int len = strlen(filename) + 1;
result = __getname();
if (unlikely(!result))
return ERR_PTR(-ENOMEM);
- kname = (char *)result + sizeof(*result);
- result->name = kname;
+ if (len <= EMBEDDED_NAME_MAX) {
+ result->name = (char *)(result) + sizeof(*result);
+ result->separate = false;
+ } else if (len <= PATH_MAX) {
+ struct filename *tmp;
+
+ tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);
+ if (unlikely(!tmp)) {
+ __putname(result);
+ return ERR_PTR(-ENOMEM);
+ }
+ tmp->name = (char *)result;
+ tmp->separate = true;
+ result = tmp;
+ } else {
+ __putname(result);
+ return ERR_PTR(-ENAMETOOLONG);
+ }
+ memcpy((char *)result->name, filename, len);
result->uptr = NULL;
result->aname = NULL;
- result->separate = false;
+ result->refcnt = 1;
+ audit_getname(result);
- strlcpy(kname, filename, EMBEDDED_NAME_MAX);
return result;
}
-#ifdef CONFIG_AUDITSYSCALL
void putname(struct filename *name)
{
- if (unlikely(!audit_dummy_context()))
- return audit_putname(name);
- final_putname(name);
+ BUG_ON(name->refcnt <= 0);
+
+ if (--name->refcnt > 0)
+ return;
+
+ if (name->separate) {
+ __putname(name->name);
+ kfree(name);
+ } else
+ __putname(name);
}
-#endif
static int check_acl(struct inode *inode, int mask)
{
@@ -2036,31 +2042,47 @@ static int filename_lookup(int dfd, struct filename *name,
static int do_path_lookup(int dfd, const char *name,
unsigned int flags, struct nameidata *nd)
{
- struct filename filename = { .name = name };
+ struct filename *filename = getname_kernel(name);
+ int retval = PTR_ERR(filename);
- return filename_lookup(dfd, &filename, flags, nd);
+ if (!IS_ERR(filename)) {
+ retval = filename_lookup(dfd, filename, flags, nd);
+ putname(filename);
+ }
+ return retval;
}
/* does lookup, returns the object with parent locked */
struct dentry *kern_path_locked(const char *name, struct path *path)
{
+ struct filename *filename = getname_kernel(name);
struct nameidata nd;
struct dentry *d;
- int err = do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, &nd);
- if (err)
- return ERR_PTR(err);
+ int err;
+
+ if (IS_ERR(filename))
+ return ERR_CAST(filename);
+
+ err = filename_lookup(AT_FDCWD, filename, LOOKUP_PARENT, &nd);
+ if (err) {
+ d = ERR_PTR(err);
+ goto out;
+ }
if (nd.last_type != LAST_NORM) {
path_put(&nd.path);
- return ERR_PTR(-EINVAL);
+ d = ERR_PTR(-EINVAL);
+ goto out;
}
mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
d = __lookup_hash(&nd.last, nd.path.dentry, 0);
if (IS_ERR(d)) {
mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
path_put(&nd.path);
- return d;
+ goto out;
}
*path = nd.path;
+out:
+ putname(filename);
return d;
}
@@ -2351,13 +2373,17 @@ static int
filename_mountpoint(int dfd, struct filename *s, struct path *path,
unsigned int flags)
{
- int error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_RCU);
+ int error;
+ if (IS_ERR(s))
+ return PTR_ERR(s);
+ error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_RCU);
if (unlikely(error == -ECHILD))
error = path_mountpoint(dfd, s->name, path, flags);
if (unlikely(error == -ESTALE))
error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_REVAL);
if (likely(!error))
audit_inode(s, path->dentry, 0);
+ putname(s);
return error;
}
@@ -2379,21 +2405,14 @@ int
user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags,
struct path *path)
{
- struct filename *s = getname(name);
- int error;
- if (IS_ERR(s))
- return PTR_ERR(s);
- error = filename_mountpoint(dfd, s, path, flags);
- putname(s);
- return error;
+ return filename_mountpoint(dfd, getname(name), path, flags);
}
int
kern_path_mountpoint(int dfd, const char *name, struct path *path,
unsigned int flags)
{
- struct filename s = {.name = name};
- return filename_mountpoint(dfd, &s, path, flags);
+ return filename_mountpoint(dfd, getname_kernel(name), path, flags);
}
EXPORT_SYMBOL(kern_path_mountpoint);
@@ -3273,7 +3292,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
{
struct nameidata nd;
struct file *file;
- struct filename filename = { .name = name };
+ struct filename *filename;
int flags = op->lookup_flags | LOOKUP_ROOT;
nd.root.mnt = mnt;
@@ -3282,15 +3301,20 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
return ERR_PTR(-ELOOP);
- file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_RCU);
+ filename = getname_kernel(name);
+ if (unlikely(IS_ERR(filename)))
+ return ERR_CAST(filename);
+
+ file = path_openat(-1, filename, &nd, op, flags | LOOKUP_RCU);
if (unlikely(file == ERR_PTR(-ECHILD)))
- file = path_openat(-1, &filename, &nd, op, flags);
+ file = path_openat(-1, filename, &nd, op, flags);
if (unlikely(file == ERR_PTR(-ESTALE)))
- file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_REVAL);
+ file = path_openat(-1, filename, &nd, op, flags | LOOKUP_REVAL);
+ putname(filename);
return file;
}
-struct dentry *kern_path_create(int dfd, const char *pathname,
+static struct dentry *filename_create(int dfd, struct filename *name,
struct path *path, unsigned int lookup_flags)
{
struct dentry *dentry = ERR_PTR(-EEXIST);
@@ -3305,7 +3329,7 @@ struct dentry *kern_path_create(int dfd, const char *pathname,
*/
lookup_flags &= LOOKUP_REVAL;
- error = do_path_lookup(dfd, pathname, LOOKUP_PARENT|lookup_flags, &nd);
+ error = filename_lookup(dfd, name, LOOKUP_PARENT|lookup_flags, &nd);
if (error)
return ERR_PTR(error);
@@ -3359,6 +3383,19 @@ out:
path_put(&nd.path);
return dentry;
}
+
+struct dentry *kern_path_create(int dfd, const char *pathname,
+ struct path *path, unsigned int lookup_flags)
+{
+ struct filename *filename = getname_kernel(pathname);
+ struct dentry *res;
+
+ if (IS_ERR(filename))
+ return ERR_CAST(filename);
+ res = filename_create(dfd, filename, path, lookup_flags);
+ putname(filename);
+ return res;
+}
EXPORT_SYMBOL(kern_path_create);
void done_path_create(struct path *path, struct dentry *dentry)
@@ -3377,7 +3414,7 @@ struct dentry *user_path_create(int dfd, const char __user *pathname,
struct dentry *res;
if (IS_ERR(tmp))
return ERR_CAST(tmp);
- res = kern_path_create(dfd, tmp->name, path, lookup_flags);
+ res = filename_create(dfd, tmp, path, lookup_flags);
putname(tmp);
return res;
}
diff --git a/fs/namespace.c b/fs/namespace.c
index cd1e9681a0cf..72a286e0d33e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -190,6 +190,14 @@ unsigned int mnt_get_count(struct mount *mnt)
#endif
}
+static void drop_mountpoint(struct fs_pin *p)
+{
+ struct mount *m = container_of(p, struct mount, mnt_umount);
+ dput(m->mnt_ex_mountpoint);
+ pin_remove(p);
+ mntput(&m->mnt);
+}
+
static struct mount *alloc_vfsmnt(const char *name)
{
struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -201,7 +209,7 @@ static struct mount *alloc_vfsmnt(const char *name)
goto out_free_cache;
if (name) {
- mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
+ mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL);
if (!mnt->mnt_devname)
goto out_free_id;
}
@@ -229,12 +237,13 @@ static struct mount *alloc_vfsmnt(const char *name)
#ifdef CONFIG_FSNOTIFY
INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
#endif
+ init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
}
return mnt;
#ifdef CONFIG_SMP
out_free_devname:
- kfree(mnt->mnt_devname);
+ kfree_const(mnt->mnt_devname);
#endif
out_free_id:
mnt_free_id(mnt);
@@ -568,7 +577,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
static void free_vfsmnt(struct mount *mnt)
{
- kfree(mnt->mnt_devname);
+ kfree_const(mnt->mnt_devname);
#ifdef CONFIG_SMP
free_percpu(mnt->mnt_pcp);
#endif
@@ -1289,7 +1298,6 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static void namespace_unlock(void)
{
- struct mount *mnt;
struct hlist_head head = unmounted;
if (likely(hlist_empty(&head))) {
@@ -1299,23 +1307,11 @@ static void namespace_unlock(void)
head.first->pprev = &head.first;
INIT_HLIST_HEAD(&unmounted);
-
- /* undo decrements we'd done in umount_tree() */
- hlist_for_each_entry(mnt, &head, mnt_hash)
- if (mnt->mnt_ex_mountpoint.mnt)
- mntget(mnt->mnt_ex_mountpoint.mnt);
-
up_write(&namespace_sem);
synchronize_rcu();
- while (!hlist_empty(&head)) {
- mnt = hlist_entry(head.first, struct mount, mnt_hash);
- hlist_del_init(&mnt->mnt_hash);
- if (mnt->mnt_ex_mountpoint.mnt)
- path_put(&mnt->mnt_ex_mountpoint);
- mntput(&mnt->mnt);
- }
+ group_pin_kill(&head);
}
static inline void namespace_lock(void)
@@ -1334,7 +1330,6 @@ void umount_tree(struct mount *mnt, int how)
{
HLIST_HEAD(tmp_list);
struct mount *p;
- struct mount *last = NULL;
for (p = mnt; p; p = next_mnt(p, mnt)) {
hlist_del_init_rcu(&p->mnt_hash);
@@ -1347,33 +1342,28 @@ void umount_tree(struct mount *mnt, int how)
if (how)
propagate_umount(&tmp_list);
- hlist_for_each_entry(p, &tmp_list, mnt_hash) {
+ while (!hlist_empty(&tmp_list)) {
+ p = hlist_entry(tmp_list.first, struct mount, mnt_hash);
+ hlist_del_init_rcu(&p->mnt_hash);
list_del_init(&p->mnt_expire);
list_del_init(&p->mnt_list);
__touch_mnt_namespace(p->mnt_ns);
p->mnt_ns = NULL;
if (how < 2)
p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
+
+ pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt, &unmounted);
if (mnt_has_parent(p)) {
hlist_del_init(&p->mnt_mp_list);
put_mountpoint(p->mnt_mp);
mnt_add_count(p->mnt_parent, -1);
- /* move the reference to mountpoint into ->mnt_ex_mountpoint */
- p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint;
- p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt;
+ /* old mountpoint will be dropped when we can do that */
+ p->mnt_ex_mountpoint = p->mnt_mountpoint;
p->mnt_mountpoint = p->mnt.mnt_root;
p->mnt_parent = p;
p->mnt_mp = NULL;
}
change_mnt_propagation(p, MS_PRIVATE);
- last = p;
- }
- if (last) {
- last->mnt_hash.next = unmounted.first;
- if (unmounted.first)
- unmounted.first->pprev = &last->mnt_hash.next;
- unmounted.first = tmp_list.first;
- unmounted.first->pprev = &unmounted.first;
}
}
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 008960101520..e7ca827d7694 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -77,6 +77,7 @@ static int ncp_hash_dentry(const struct dentry *, struct qstr *);
static int ncp_compare_dentry(const struct dentry *, const struct dentry *,
unsigned int, const char *, const struct qstr *);
static int ncp_delete_dentry(const struct dentry *);
+static void ncp_d_prune(struct dentry *dentry);
const struct dentry_operations ncp_dentry_operations =
{
@@ -84,6 +85,7 @@ const struct dentry_operations ncp_dentry_operations =
.d_hash = ncp_hash_dentry,
.d_compare = ncp_compare_dentry,
.d_delete = ncp_delete_dentry,
+ .d_prune = ncp_d_prune,
};
#define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber])
@@ -384,42 +386,6 @@ finished:
return val;
}
-static struct dentry *
-ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
-{
- struct dentry *dent = dentry;
-
- if (d_validate(dent, parent)) {
- if (dent->d_name.len <= NCP_MAXPATHLEN &&
- (unsigned long)dent->d_fsdata == fpos) {
- if (!dent->d_inode) {
- dput(dent);
- dent = NULL;
- }
- return dent;
- }
- dput(dent);
- }
-
- /* If a pointer is invalid, we search the dentry. */
- spin_lock(&parent->d_lock);
- list_for_each_entry(dent, &parent->d_subdirs, d_child) {
- if ((unsigned long)dent->d_fsdata == fpos) {
- if (dent->d_inode)
- dget(dent);
- else
- dent = NULL;
- spin_unlock(&parent->d_lock);
- goto out;
- }
- }
- spin_unlock(&parent->d_lock);
- return NULL;
-
-out:
- return dent;
-}
-
static time_t ncp_obtain_mtime(struct dentry *dentry)
{
struct inode *inode = dentry->d_inode;
@@ -435,6 +401,20 @@ static time_t ncp_obtain_mtime(struct dentry *dentry)
return ncp_date_dos2unix(i.modifyTime, i.modifyDate);
}
+static inline void
+ncp_invalidate_dircache_entries(struct dentry *parent)
+{
+ struct ncp_server *server = NCP_SERVER(parent->d_inode);
+ struct dentry *dentry;
+
+ spin_lock(&parent->d_lock);
+ list_for_each_entry(dentry, &parent->d_subdirs, d_child) {
+ dentry->d_fsdata = NULL;
+ ncp_age_dentry(server, dentry);
+ }
+ spin_unlock(&parent->d_lock);
+}
+
static int ncp_readdir(struct file *file, struct dir_context *ctx)
{
struct dentry *dentry = file->f_path.dentry;
@@ -500,10 +480,21 @@ static int ncp_readdir(struct file *file, struct dir_context *ctx)
struct dentry *dent;
bool over;
- dent = ncp_dget_fpos(ctl.cache->dentry[ctl.idx],
- dentry, ctx->pos);
- if (!dent)
+ spin_lock(&dentry->d_lock);
+ if (!(NCP_FINFO(inode)->flags & NCPI_DIR_CACHE)) {
+ spin_unlock(&dentry->d_lock);
+ goto invalid_cache;
+ }
+ dent = ctl.cache->dentry[ctl.idx];
+ if (unlikely(!lockref_get_not_dead(&dent->d_lockref))) {
+ spin_unlock(&dentry->d_lock);
+ goto invalid_cache;
+ }
+ spin_unlock(&dentry->d_lock);
+ if (!dent->d_inode) {
+ dput(dent);
goto invalid_cache;
+ }
over = !dir_emit(ctx, dent->d_name.name,
dent->d_name.len,
dent->d_inode->i_ino, DT_UNKNOWN);
@@ -548,6 +539,9 @@ init_cache:
ctl.filled = 0;
ctl.valid = 1;
read_really:
+ spin_lock(&dentry->d_lock);
+ NCP_FINFO(inode)->flags |= NCPI_DIR_CACHE;
+ spin_unlock(&dentry->d_lock);
if (ncp_is_server_root(inode)) {
ncp_read_volume_list(file, ctx, &ctl);
} else {
@@ -573,6 +567,13 @@ out:
return result;
}
+static void ncp_d_prune(struct dentry *dentry)
+{
+ if (!dentry->d_fsdata) /* not referenced from page cache */
+ return;
+ NCP_FINFO(dentry->d_parent->d_inode)->flags &= ~NCPI_DIR_CACHE;
+}
+
static int
ncp_fill_cache(struct file *file, struct dir_context *ctx,
struct ncp_cache_control *ctrl, struct ncp_entry_info *entry,
@@ -630,6 +631,10 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
d_instantiate(newdent, inode);
if (!hashed)
d_rehash(newdent);
+ } else {
+ spin_lock(&dentry->d_lock);
+ NCP_FINFO(inode)->flags &= ~NCPI_DIR_CACHE;
+ spin_unlock(&dentry->d_lock);
}
} else {
struct inode *inode = newdent->d_inode;
@@ -639,12 +644,6 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
mutex_unlock(&inode->i_mutex);
}
- if (newdent->d_inode) {
- ino = newdent->d_inode->i_ino;
- newdent->d_fsdata = (void *) ctl.fpos;
- ncp_new_dentry(newdent);
- }
-
if (ctl.idx >= NCP_DIRCACHE_SIZE) {
if (ctl.page) {
kunmap(ctl.page);
@@ -660,8 +659,13 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
ctl.cache = kmap(ctl.page);
}
if (ctl.cache) {
- ctl.cache->dentry[ctl.idx] = newdent;
- valid = 1;
+ if (newdent->d_inode) {
+ newdent->d_fsdata = newdent;
+ ctl.cache->dentry[ctl.idx] = newdent;
+ ino = newdent->d_inode->i_ino;
+ ncp_new_dentry(newdent);
+ }
+ valid = 1;
}
dput(newdent);
end_advance:
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index e31e589369a4..01a9e16e9782 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -267,7 +267,6 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
if (inode) {
atomic_set(&NCP_FINFO(inode)->opened, info->opened);
- inode->i_mapping->backing_dev_info = sb->s_bdi;
inode->i_ino = info->ino;
ncp_set_attr(inode, info);
if (S_ISREG(inode->i_mode)) {
@@ -560,7 +559,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
server = NCP_SBP(sb);
memset(server, 0, sizeof(*server));
- error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY);
+ error = bdi_setup_and_register(&server->bdi, "ncpfs");
if (error)
goto out_fput;
diff --git a/fs/ncpfs/ncp_fs_i.h b/fs/ncpfs/ncp_fs_i.h
index 4b0bec477846..c4794504f843 100644
--- a/fs/ncpfs/ncp_fs_i.h
+++ b/fs/ncpfs/ncp_fs_i.h
@@ -22,6 +22,7 @@ struct ncp_inode_info {
int access;
int flags;
#define NCPI_KLUDGE_SYMLINK 0x0001
+#define NCPI_DIR_CACHE 0x0002
__u8 file_handle[6];
struct inode vfs_inode;
};
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index b785f74bfe3c..250e443a07f3 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -184,36 +184,6 @@ ncp_new_dentry(struct dentry* dentry)
dentry->d_time = jiffies;
}
-static inline void
-ncp_renew_dentries(struct dentry *parent)
-{
- struct ncp_server *server = NCP_SERVER(parent->d_inode);
- struct dentry *dentry;
-
- spin_lock(&parent->d_lock);
- list_for_each_entry(dentry, &parent->d_subdirs, d_child) {
- if (dentry->d_fsdata == NULL)
- ncp_age_dentry(server, dentry);
- else
- ncp_new_dentry(dentry);
- }
- spin_unlock(&parent->d_lock);
-}
-
-static inline void
-ncp_invalidate_dircache_entries(struct dentry *parent)
-{
- struct ncp_server *server = NCP_SERVER(parent->d_inode);
- struct dentry *dentry;
-
- spin_lock(&parent->d_lock);
- list_for_each_entry(dentry, &parent->d_subdirs, d_child) {
- dentry->d_fsdata = NULL;
- ncp_age_dentry(server, dentry);
- }
- spin_unlock(&parent->d_lock);
-}
-
struct ncp_cache_head {
time_t mtime;
unsigned long time; /* cache age */
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 3dece03f2fc8..c7abc10279af 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -128,6 +128,11 @@ config PNFS_OBJLAYOUT
depends on NFS_V4_1 && SCSI_OSD_ULD
default NFS_V4
+config PNFS_FLEXFILE_LAYOUT
+ tristate
+ depends on NFS_V4_1 && NFS_V3
+ default m
+
config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
string "NFSv4.1 Implementation ID Domain"
depends on NFS_V4_1
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 04cb830fa09f..1e987acf20c9 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -27,9 +27,10 @@ nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o
dns_resolve.o nfs4trace.o
nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o
-nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o
+nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o pnfs_nfs.o
nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o
obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/
obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
+obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += flexfilelayout/
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 77fec6a55f57..1cac3c175d18 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -860,12 +860,14 @@ static const struct nfs_pageio_ops bl_pg_read_ops = {
.pg_init = bl_pg_init_read,
.pg_test = bl_pg_test_read,
.pg_doio = pnfs_generic_pg_readpages,
+ .pg_cleanup = pnfs_generic_pg_cleanup,
};
static const struct nfs_pageio_ops bl_pg_write_ops = {
.pg_init = bl_pg_init_write,
.pg_test = bl_pg_test_write,
.pg_doio = pnfs_generic_pg_writepages,
+ .pg_cleanup = pnfs_generic_pg_cleanup,
};
static struct pnfs_layoutdriver_type blocklayout_type = {
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index b8fb3a4ef649..351be9205bf8 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -128,22 +128,24 @@ nfs41_callback_svc(void *vrqstp)
if (try_to_freeze())
continue;
- prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
+ prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_UNINTERRUPTIBLE);
spin_lock_bh(&serv->sv_cb_lock);
if (!list_empty(&serv->sv_cb_list)) {
req = list_first_entry(&serv->sv_cb_list,
struct rpc_rqst, rq_bc_list);
list_del(&req->rq_bc_list);
spin_unlock_bh(&serv->sv_cb_lock);
+ finish_wait(&serv->sv_cb_waitq, &wq);
dprintk("Invoking bc_svc_process()\n");
error = bc_svc_process(serv, req, rqstp);
dprintk("bc_svc_process() returned w/ error code= %d\n",
error);
} else {
spin_unlock_bh(&serv->sv_cb_lock);
- schedule();
+ /* schedule_timeout to game the hung task watchdog */
+ schedule_timeout(60 * HZ);
+ finish_wait(&serv->sv_cb_waitq, &wq);
}
- finish_wait(&serv->sv_cb_waitq, &wq);
}
return 0;
}
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index e36a9d78ea49..197806fb87ff 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -427,6 +427,8 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
if (clp == NULL)
goto out;
+ if (!(clp->cl_session->flags & SESSION4_BACK_CHAN))
+ goto out;
tbl = &clp->cl_session->bc_slot_table;
spin_lock(&tbl->slot_tbl_lock);
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index f4ccfe6521ec..19ca95cdfd9b 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -313,7 +313,7 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
goto out;
}
- args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL);
+ args->devs = kmalloc_array(n, sizeof(*args->devs), GFP_KERNEL);
if (!args->devs) {
status = htonl(NFS4ERR_DELAY);
goto out;
@@ -415,7 +415,7 @@ static __be32 decode_rc_list(struct xdr_stream *xdr,
rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t));
if (unlikely(p == NULL))
goto out;
- rc_list->rcl_refcalls = kmalloc(rc_list->rcl_nrefcalls *
+ rc_list->rcl_refcalls = kmalloc_array(rc_list->rcl_nrefcalls,
sizeof(*rc_list->rcl_refcalls),
GFP_KERNEL);
if (unlikely(rc_list->rcl_refcalls == NULL))
@@ -464,8 +464,10 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp,
for (i = 0; i < args->csa_nrclists; i++) {
status = decode_rc_list(xdr, &args->csa_rclists[i]);
- if (status)
+ if (status) {
+ args->csa_nrclists = i;
goto out_free;
+ }
}
}
status = 0;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 7f3f60641344..a1f0685b42ff 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -85,25 +85,30 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
{
struct inode *inode = state->inode;
struct file_lock *fl;
+ struct file_lock_context *flctx = inode->i_flctx;
+ struct list_head *list;
int status = 0;
- if (inode->i_flock == NULL)
+ if (flctx == NULL)
goto out;
- /* Protect inode->i_flock using the i_lock */
- spin_lock(&inode->i_lock);
- for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
- if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
- continue;
+ list = &flctx->flc_posix;
+ spin_lock(&flctx->flc_lock);
+restart:
+ list_for_each_entry(fl, list, fl_list) {
if (nfs_file_open_context(fl->fl_file) != ctx)
continue;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&flctx->flc_lock);
status = nfs4_lock_delegation_recall(fl, state, stateid);
if (status < 0)
goto out;
- spin_lock(&inode->i_lock);
+ spin_lock(&flctx->flc_lock);
}
- spin_unlock(&inode->i_lock);
+ if (list == &flctx->flc_posix) {
+ list = &flctx->flc_flock;
+ goto restart;
+ }
+ spin_unlock(&flctx->flc_lock);
out:
return status;
}
@@ -175,7 +180,6 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
delegation->cred = get_rpccred(cred);
clear_bit(NFS_DELEGATION_NEED_RECLAIM,
&delegation->flags);
- NFS_I(inode)->delegation_state = delegation->type;
spin_unlock(&delegation->lock);
put_rpccred(oldcred);
rcu_read_unlock();
@@ -270,7 +274,6 @@ nfs_detach_delegation_locked(struct nfs_inode *nfsi,
set_bit(NFS_DELEGATION_RETURNING, &delegation->flags);
list_del_rcu(&delegation->super_list);
delegation->inode = NULL;
- nfsi->delegation_state = 0;
rcu_assign_pointer(nfsi->delegation, NULL);
spin_unlock(&delegation->lock);
return delegation;
@@ -301,6 +304,17 @@ nfs_inode_detach_delegation(struct inode *inode)
return nfs_detach_delegation(nfsi, delegation, server);
}
+static void
+nfs_update_inplace_delegation(struct nfs_delegation *delegation,
+ const struct nfs_delegation *update)
+{
+ if (nfs4_stateid_is_newer(&update->stateid, &delegation->stateid)) {
+ delegation->stateid.seqid = update->stateid.seqid;
+ smp_wmb();
+ delegation->type = update->type;
+ }
+}
+
/**
* nfs_inode_set_delegation - set up a delegation on an inode
* @inode: inode to which delegation applies
@@ -334,9 +348,11 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
old_delegation = rcu_dereference_protected(nfsi->delegation,
lockdep_is_held(&clp->cl_lock));
if (old_delegation != NULL) {
- if (nfs4_stateid_match(&delegation->stateid,
- &old_delegation->stateid) &&
- delegation->type == old_delegation->type) {
+ /* Is this an update of the existing delegation? */
+ if (nfs4_stateid_match_other(&old_delegation->stateid,
+ &delegation->stateid)) {
+ nfs_update_inplace_delegation(old_delegation,
+ delegation);
goto out;
}
/*
@@ -360,7 +376,6 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
goto out;
}
list_add_rcu(&delegation->super_list, &server->delegations);
- nfsi->delegation_state = delegation->type;
rcu_assign_pointer(nfsi->delegation, delegation);
delegation = NULL;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 10bf07280f4a..e907c8cf732e 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -66,6 +66,10 @@ static struct kmem_cache *nfs_direct_cachep;
/*
* This represents a set of asynchronous requests that we're waiting on
*/
+struct nfs_direct_mirror {
+ ssize_t count;
+};
+
struct nfs_direct_req {
struct kref kref; /* release manager */
@@ -78,8 +82,13 @@ struct nfs_direct_req {
/* completion state */
atomic_t io_count; /* i/os we're waiting for */
spinlock_t lock; /* protect completion state */
+
+ struct nfs_direct_mirror mirrors[NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX];
+ int mirror_count;
+
ssize_t count, /* bytes actually processed */
bytes_left, /* bytes left to be sent */
+ io_start, /* start of IO */
error; /* any reported error */
struct completion completion; /* wait for i/o completion */
@@ -108,26 +117,56 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
return atomic_dec_and_test(&dreq->io_count);
}
+void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq)
+{
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+}
+EXPORT_SYMBOL_GPL(nfs_direct_set_resched_writes);
+
+static void
+nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
+{
+ int i;
+ ssize_t count;
+
+ WARN_ON_ONCE(hdr->pgio_mirror_idx >= dreq->mirror_count);
+
+ count = dreq->mirrors[hdr->pgio_mirror_idx].count;
+ if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) {
+ count = hdr->io_start + hdr->good_bytes - dreq->io_start;
+ dreq->mirrors[hdr->pgio_mirror_idx].count = count;
+ }
+
+ /* update the dreq->count by finding the minimum agreed count from all
+ * mirrors */
+ count = dreq->mirrors[0].count;
+
+ for (i = 1; i < dreq->mirror_count; i++)
+ count = min(count, dreq->mirrors[i].count);
+
+ dreq->count = count;
+}
+
/*
* nfs_direct_select_verf - select the right verifier
* @dreq - direct request possibly spanning multiple servers
* @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs
- * @ds_idx - index of data server in data server list, only valid if ds_clp set
+ * @commit_idx - commit bucket index for the DS
*
* returns the correct verifier to use given the role of the server
*/
static struct nfs_writeverf *
nfs_direct_select_verf(struct nfs_direct_req *dreq,
struct nfs_client *ds_clp,
- int ds_idx)
+ int commit_idx)
{
struct nfs_writeverf *verfp = &dreq->verf;
#ifdef CONFIG_NFS_V4_1
if (ds_clp) {
/* pNFS is in use, use the DS verf */
- if (ds_idx >= 0 && ds_idx < dreq->ds_cinfo.nbuckets)
- verfp = &dreq->ds_cinfo.buckets[ds_idx].direct_verf;
+ if (commit_idx >= 0 && commit_idx < dreq->ds_cinfo.nbuckets)
+ verfp = &dreq->ds_cinfo.buckets[commit_idx].direct_verf;
else
WARN_ON_ONCE(1);
}
@@ -148,8 +187,7 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
{
struct nfs_writeverf *verfp;
- verfp = nfs_direct_select_verf(dreq, hdr->ds_clp,
- hdr->ds_idx);
+ verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx);
WARN_ON_ONCE(verfp->committed >= 0);
memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
WARN_ON_ONCE(verfp->committed < 0);
@@ -169,8 +207,7 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
{
struct nfs_writeverf *verfp;
- verfp = nfs_direct_select_verf(dreq, hdr->ds_clp,
- hdr->ds_idx);
+ verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx);
if (verfp->committed < 0) {
nfs_direct_set_hdr_verf(dreq, hdr);
return 0;
@@ -193,7 +230,11 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
verfp = nfs_direct_select_verf(dreq, data->ds_clp,
data->ds_commit_index);
- WARN_ON_ONCE(verfp->committed < 0);
+
+ /* verifier not set so always fail */
+ if (verfp->committed < 0)
+ return 1;
+
return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
}
@@ -212,6 +253,12 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
*/
ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
{
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
+
+ /* we only support swap file calling nfs_direct_IO */
+ if (!IS_SWAPFILE(inode))
+ return 0;
+
#ifndef CONFIG_NFS_SWAP
dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n",
iocb->ki_filp, (long long) pos, iter->nr_segs);
@@ -236,13 +283,25 @@ static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
struct nfs_direct_req *dreq)
{
- cinfo->lock = &dreq->lock;
+ cinfo->lock = &dreq->inode->i_lock;
cinfo->mds = &dreq->mds_cinfo;
cinfo->ds = &dreq->ds_cinfo;
cinfo->dreq = dreq;
cinfo->completion_ops = &nfs_direct_commit_completion_ops;
}
+static inline void nfs_direct_setup_mirroring(struct nfs_direct_req *dreq,
+ struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req)
+{
+ int mirror_count = 1;
+
+ if (pgio->pg_ops->pg_get_mirror_count)
+ mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
+
+ dreq->mirror_count = mirror_count;
+}
+
static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
{
struct nfs_direct_req *dreq;
@@ -257,6 +316,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
INIT_LIST_HEAD(&dreq->mds_cinfo.list);
dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */
INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
+ dreq->mirror_count = 1;
spin_lock_init(&dreq->lock);
return dreq;
@@ -363,7 +423,8 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0))
dreq->error = hdr->error;
else
- dreq->count += hdr->good_bytes;
+ nfs_direct_good_bytes(dreq, hdr);
+
spin_unlock(&dreq->lock);
while (!list_empty(&hdr->pages)) {
@@ -541,6 +602,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
dreq->inode = inode;
dreq->bytes_left = count;
+ dreq->io_start = pos;
dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
l_ctx = nfs_get_lock_context(dreq->ctx);
if (IS_ERR(l_ctx)) {
@@ -573,6 +635,20 @@ out:
return result;
}
+static void
+nfs_direct_write_scan_commit_list(struct inode *inode,
+ struct list_head *list,
+ struct nfs_commit_info *cinfo)
+{
+ spin_lock(cinfo->lock);
+#ifdef CONFIG_NFS_V4_1
+ if (cinfo->ds != NULL && cinfo->ds->nwritten != 0)
+ NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
+#endif
+ nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
+ spin_unlock(cinfo->lock);
+}
+
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
struct nfs_pageio_descriptor desc;
@@ -580,20 +656,23 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
LIST_HEAD(reqs);
struct nfs_commit_info cinfo;
LIST_HEAD(failed);
+ int i;
nfs_init_cinfo_from_dreq(&cinfo, dreq);
- pnfs_recover_commit_reqs(dreq->inode, &reqs, &cinfo);
- spin_lock(cinfo.lock);
- nfs_scan_commit_list(&cinfo.mds->list, &reqs, &cinfo, 0);
- spin_unlock(cinfo.lock);
+ nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
dreq->count = 0;
+ for (i = 0; i < dreq->mirror_count; i++)
+ dreq->mirrors[i].count = 0;
get_dreq(dreq);
nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false,
&nfs_direct_write_completion_ops);
desc.pg_dreq = dreq;
+ req = nfs_list_entry(reqs.next);
+ nfs_direct_setup_mirroring(dreq, &desc, req);
+
list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
if (!nfs_pageio_add_request(&desc, req)) {
nfs_list_remove_request(req);
@@ -640,7 +719,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
nfs_list_remove_request(req);
if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) {
/* Note the rewrite will go through mds */
- nfs_mark_request_commit(req, NULL, &cinfo);
+ nfs_mark_request_commit(req, NULL, &cinfo, 0);
} else
nfs_release_request(req);
nfs_unlock_and_release_request(req);
@@ -715,7 +794,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
dreq->error = hdr->error;
}
if (dreq->error == 0) {
- dreq->count += hdr->good_bytes;
+ nfs_direct_good_bytes(dreq, hdr);
if (nfs_write_need_commit(hdr)) {
if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
request_commit = true;
@@ -739,7 +818,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
nfs_list_remove_request(req);
if (request_commit) {
kref_get(&req->wb_kref);
- nfs_mark_request_commit(req, hdr->lseg, &cinfo);
+ nfs_mark_request_commit(req, hdr->lseg, &cinfo,
+ hdr->ds_commit_idx);
}
nfs_unlock_and_release_request(req);
}
@@ -820,6 +900,9 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
result = PTR_ERR(req);
break;
}
+
+ nfs_direct_setup_mirroring(dreq, &desc, req);
+
nfs_lock_request(req);
req->wb_index = pos >> PAGE_SHIFT;
req->wb_offset = pos & ~PAGE_MASK;
@@ -928,6 +1011,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
dreq->inode = inode;
dreq->bytes_left = count;
+ dreq->io_start = pos;
dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
l_ctx = nfs_get_lock_context(dreq->ctx);
if (IS_ERR(l_ctx)) {
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 2ab6f00dba5b..94712fc781fa 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -646,7 +646,6 @@ static const struct vm_operations_struct nfs_file_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = nfs_vm_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
};
static int nfs_need_sync_write(struct file *filp, struct inode *inode)
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 7afb52f6a25a..91e88a7ecef0 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -118,13 +118,6 @@ static void filelayout_reset_read(struct nfs_pgio_header *hdr)
}
}
-static void filelayout_fenceme(struct inode *inode, struct pnfs_layout_hdr *lo)
-{
- if (!test_and_clear_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
- return;
- pnfs_return_layout(inode);
-}
-
static int filelayout_async_handle_error(struct rpc_task *task,
struct nfs4_state *state,
struct nfs_client *clp,
@@ -207,7 +200,7 @@ static int filelayout_async_handle_error(struct rpc_task *task,
dprintk("%s DS connection error %d\n", __func__,
task->tk_status);
nfs4_mark_deviceid_unavailable(devid);
- set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
+ pnfs_error_mark_layout_for_return(inode, lseg);
rpc_wake_up(&tbl->slot_tbl_waitq);
/* fall through */
default:
@@ -339,16 +332,6 @@ static void filelayout_read_count_stats(struct rpc_task *task, void *data)
rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
}
-static void filelayout_read_release(void *data)
-{
- struct nfs_pgio_header *hdr = data;
- struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
-
- filelayout_fenceme(lo->plh_inode, lo);
- nfs_put_client(hdr->ds_clp);
- hdr->mds_ops->rpc_release(data);
-}
-
static int filelayout_write_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
@@ -371,17 +354,6 @@ static int filelayout_write_done_cb(struct rpc_task *task,
return 0;
}
-/* Fake up some data that will cause nfs_commit_release to retry the writes. */
-static void prepare_to_resend_writes(struct nfs_commit_data *data)
-{
- struct nfs_page *first = nfs_list_entry(data->pages.next);
-
- data->task.tk_status = 0;
- memcpy(&data->verf.verifier, &first->wb_verf,
- sizeof(data->verf.verifier));
- data->verf.verifier.data[0]++; /* ensure verifier mismatch */
-}
-
static int filelayout_commit_done_cb(struct rpc_task *task,
struct nfs_commit_data *data)
{
@@ -393,7 +365,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
switch (err) {
case -NFS4ERR_RESET_TO_MDS:
- prepare_to_resend_writes(data);
+ pnfs_generic_prepare_to_resend_writes(data);
return -EAGAIN;
case -EAGAIN:
rpc_restart_call_prepare(task);
@@ -451,16 +423,6 @@ static void filelayout_write_count_stats(struct rpc_task *task, void *data)
rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
}
-static void filelayout_write_release(void *data)
-{
- struct nfs_pgio_header *hdr = data;
- struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
-
- filelayout_fenceme(lo->plh_inode, lo);
- nfs_put_client(hdr->ds_clp);
- hdr->mds_ops->rpc_release(data);
-}
-
static void filelayout_commit_prepare(struct rpc_task *task, void *data)
{
struct nfs_commit_data *wdata = data;
@@ -471,14 +433,6 @@ static void filelayout_commit_prepare(struct rpc_task *task, void *data)
task);
}
-static void filelayout_write_commit_done(struct rpc_task *task, void *data)
-{
- struct nfs_commit_data *wdata = data;
-
- /* Note this may cause RPC to be resent */
- wdata->mds_ops->rpc_call_done(task, data);
-}
-
static void filelayout_commit_count_stats(struct rpc_task *task, void *data)
{
struct nfs_commit_data *cdata = data;
@@ -486,35 +440,25 @@ static void filelayout_commit_count_stats(struct rpc_task *task, void *data)
rpc_count_iostats(task, NFS_SERVER(cdata->inode)->client->cl_metrics);
}
-static void filelayout_commit_release(void *calldata)
-{
- struct nfs_commit_data *data = calldata;
-
- data->completion_ops->completion(data);
- pnfs_put_lseg(data->lseg);
- nfs_put_client(data->ds_clp);
- nfs_commitdata_release(data);
-}
-
static const struct rpc_call_ops filelayout_read_call_ops = {
.rpc_call_prepare = filelayout_read_prepare,
.rpc_call_done = filelayout_read_call_done,
.rpc_count_stats = filelayout_read_count_stats,
- .rpc_release = filelayout_read_release,
+ .rpc_release = pnfs_generic_rw_release,
};
static const struct rpc_call_ops filelayout_write_call_ops = {
.rpc_call_prepare = filelayout_write_prepare,
.rpc_call_done = filelayout_write_call_done,
.rpc_count_stats = filelayout_write_count_stats,
- .rpc_release = filelayout_write_release,
+ .rpc_release = pnfs_generic_rw_release,
};
static const struct rpc_call_ops filelayout_commit_call_ops = {
.rpc_call_prepare = filelayout_commit_prepare,
- .rpc_call_done = filelayout_write_commit_done,
+ .rpc_call_done = pnfs_generic_write_commit_done,
.rpc_count_stats = filelayout_commit_count_stats,
- .rpc_release = filelayout_commit_release,
+ .rpc_release = pnfs_generic_commit_release,
};
static enum pnfs_try_status
@@ -548,7 +492,7 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr)
/* No multipath support. Use first DS */
atomic_inc(&ds->ds_clp->cl_count);
hdr->ds_clp = ds->ds_clp;
- hdr->ds_idx = idx;
+ hdr->ds_commit_idx = idx;
fh = nfs4_fl_select_ds_fh(lseg, j);
if (fh)
hdr->args.fh = fh;
@@ -557,8 +501,9 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr)
hdr->mds_offset = offset;
/* Perform an asynchronous read to ds */
- nfs_initiate_pgio(ds_clnt, hdr,
- &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN);
+ nfs_initiate_pgio(ds_clnt, hdr, hdr->cred,
+ NFS_PROTO(hdr->inode), &filelayout_read_call_ops,
+ 0, RPC_TASK_SOFTCONN);
return PNFS_ATTEMPTED;
}
@@ -591,16 +536,16 @@ filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
hdr->pgio_done_cb = filelayout_write_done_cb;
atomic_inc(&ds->ds_clp->cl_count);
hdr->ds_clp = ds->ds_clp;
- hdr->ds_idx = idx;
+ hdr->ds_commit_idx = idx;
fh = nfs4_fl_select_ds_fh(lseg, j);
if (fh)
hdr->args.fh = fh;
hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
/* Perform an asynchronous write */
- nfs_initiate_pgio(ds_clnt, hdr,
- &filelayout_write_call_ops, sync,
- RPC_TASK_SOFTCONN);
+ nfs_initiate_pgio(ds_clnt, hdr, hdr->cred,
+ NFS_PROTO(hdr->inode), &filelayout_write_call_ops,
+ sync, RPC_TASK_SOFTCONN);
return PNFS_ATTEMPTED;
}
@@ -988,12 +933,14 @@ static const struct nfs_pageio_ops filelayout_pg_read_ops = {
.pg_init = filelayout_pg_init_read,
.pg_test = filelayout_pg_test,
.pg_doio = pnfs_generic_pg_readpages,
+ .pg_cleanup = pnfs_generic_pg_cleanup,
};
static const struct nfs_pageio_ops filelayout_pg_write_ops = {
.pg_init = filelayout_pg_init_write,
.pg_test = filelayout_pg_test,
.pg_doio = pnfs_generic_pg_writepages,
+ .pg_cleanup = pnfs_generic_pg_cleanup,
};
static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
@@ -1004,87 +951,28 @@ static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
return j;
}
-/* The generic layer is about to remove the req from the commit list.
- * If this will make the bucket empty, it will need to put the lseg reference.
- * Note this is must be called holding the inode (/cinfo) lock
- */
-static void
-filelayout_clear_request_commit(struct nfs_page *req,
- struct nfs_commit_info *cinfo)
-{
- struct pnfs_layout_segment *freeme = NULL;
-
- if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
- goto out;
- cinfo->ds->nwritten--;
- if (list_is_singular(&req->wb_list)) {
- struct pnfs_commit_bucket *bucket;
-
- bucket = list_first_entry(&req->wb_list,
- struct pnfs_commit_bucket,
- written);
- freeme = bucket->wlseg;
- bucket->wlseg = NULL;
- }
-out:
- nfs_request_remove_commit_list(req, cinfo);
- pnfs_put_lseg_locked(freeme);
-}
-
static void
filelayout_mark_request_commit(struct nfs_page *req,
struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo)
+ struct nfs_commit_info *cinfo,
+ u32 ds_commit_idx)
{
struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
u32 i, j;
- struct list_head *list;
- struct pnfs_commit_bucket *buckets;
if (fl->commit_through_mds) {
- list = &cinfo->mds->list;
- spin_lock(cinfo->lock);
- goto mds_commit;
- }
-
- /* Note that we are calling nfs4_fl_calc_j_index on each page
- * that ends up being committed to a data server. An attractive
- * alternative is to add a field to nfs_write_data and nfs_page
- * to store the value calculated in filelayout_write_pagelist
- * and just use that here.
- */
- j = nfs4_fl_calc_j_index(lseg, req_offset(req));
- i = select_bucket_index(fl, j);
- spin_lock(cinfo->lock);
- buckets = cinfo->ds->buckets;
- list = &buckets[i].written;
- if (list_empty(list)) {
- /* Non-empty buckets hold a reference on the lseg. That ref
- * is normally transferred to the COMMIT call and released
- * there. It could also be released if the last req is pulled
- * off due to a rewrite, in which case it will be done in
- * filelayout_clear_request_commit
+ nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo);
+ } else {
+ /* Note that we are calling nfs4_fl_calc_j_index on each page
+ * that ends up being committed to a data server. An attractive
+ * alternative is to add a field to nfs_write_data and nfs_page
+ * to store the value calculated in filelayout_write_pagelist
+ * and just use that here.
*/
- buckets[i].wlseg = pnfs_get_lseg(lseg);
- }
- set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
- cinfo->ds->nwritten++;
-
-mds_commit:
- /* nfs_request_add_commit_list(). We need to add req to list without
- * dropping cinfo lock.
- */
- set_bit(PG_CLEAN, &(req)->wb_flags);
- nfs_list_add_request(req, list);
- cinfo->mds->ncommit++;
- spin_unlock(cinfo->lock);
- if (!cinfo->dreq) {
- inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
- inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
- BDI_RECLAIMABLE);
- __mark_inode_dirty(req->wb_context->dentry->d_inode,
- I_DIRTY_DATASYNC);
+ j = nfs4_fl_calc_j_index(lseg, req_offset(req));
+ i = select_bucket_index(fl, j);
+ pnfs_layout_mark_request_commit(req, lseg, cinfo, i);
}
}
@@ -1138,101 +1026,15 @@ static int filelayout_initiate_commit(struct nfs_commit_data *data, int how)
fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
if (fh)
data->args.fh = fh;
- return nfs_initiate_commit(ds_clnt, data,
+ return nfs_initiate_commit(ds_clnt, data, NFS_PROTO(data->inode),
&filelayout_commit_call_ops, how,
RPC_TASK_SOFTCONN);
out_err:
- prepare_to_resend_writes(data);
- filelayout_commit_release(data);
+ pnfs_generic_prepare_to_resend_writes(data);
+ pnfs_generic_commit_release(data);
return -EAGAIN;
}
-static int
-transfer_commit_list(struct list_head *src, struct list_head *dst,
- struct nfs_commit_info *cinfo, int max)
-{
- struct nfs_page *req, *tmp;
- int ret = 0;
-
- list_for_each_entry_safe(req, tmp, src, wb_list) {
- if (!nfs_lock_request(req))
- continue;
- kref_get(&req->wb_kref);
- if (cond_resched_lock(cinfo->lock))
- list_safe_reset_next(req, tmp, wb_list);
- nfs_request_remove_commit_list(req, cinfo);
- clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
- nfs_list_add_request(req, dst);
- ret++;
- if ((ret == max) && !cinfo->dreq)
- break;
- }
- return ret;
-}
-
-/* Note called with cinfo->lock held. */
-static int
-filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
- struct nfs_commit_info *cinfo,
- int max)
-{
- struct list_head *src = &bucket->written;
- struct list_head *dst = &bucket->committing;
- int ret;
-
- ret = transfer_commit_list(src, dst, cinfo, max);
- if (ret) {
- cinfo->ds->nwritten -= ret;
- cinfo->ds->ncommitting += ret;
- bucket->clseg = bucket->wlseg;
- if (list_empty(src))
- bucket->wlseg = NULL;
- else
- pnfs_get_lseg(bucket->clseg);
- }
- return ret;
-}
-
-/* Move reqs from written to committing lists, returning count of number moved.
- * Note called with cinfo->lock held.
- */
-static int filelayout_scan_commit_lists(struct nfs_commit_info *cinfo,
- int max)
-{
- int i, rv = 0, cnt;
-
- for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
- cnt = filelayout_scan_ds_commit_list(&cinfo->ds->buckets[i],
- cinfo, max);
- max -= cnt;
- rv += cnt;
- }
- return rv;
-}
-
-/* Pull everything off the committing lists and dump into @dst */
-static void filelayout_recover_commit_reqs(struct list_head *dst,
- struct nfs_commit_info *cinfo)
-{
- struct pnfs_commit_bucket *b;
- struct pnfs_layout_segment *freeme;
- int i;
-
-restart:
- spin_lock(cinfo->lock);
- for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
- if (transfer_commit_list(&b->written, dst, cinfo, 0)) {
- freeme = b->wlseg;
- b->wlseg = NULL;
- spin_unlock(cinfo->lock);
- pnfs_put_lseg(freeme);
- goto restart;
- }
- }
- cinfo->ds->nwritten = 0;
- spin_unlock(cinfo->lock);
-}
-
/* filelayout_search_commit_reqs - Search lists in @cinfo for the head reqest
* for @page
* @cinfo - commit info for current inode
@@ -1263,108 +1065,14 @@ filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
return NULL;
}
-static void filelayout_retry_commit(struct nfs_commit_info *cinfo, int idx)
-{
- struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
- struct pnfs_commit_bucket *bucket;
- struct pnfs_layout_segment *freeme;
- int i;
-
- for (i = idx; i < fl_cinfo->nbuckets; i++) {
- bucket = &fl_cinfo->buckets[i];
- if (list_empty(&bucket->committing))
- continue;
- nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
- spin_lock(cinfo->lock);
- freeme = bucket->clseg;
- bucket->clseg = NULL;
- spin_unlock(cinfo->lock);
- pnfs_put_lseg(freeme);
- }
-}
-
-static unsigned int
-alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
-{
- struct pnfs_ds_commit_info *fl_cinfo;
- struct pnfs_commit_bucket *bucket;
- struct nfs_commit_data *data;
- int i;
- unsigned int nreq = 0;
-
- fl_cinfo = cinfo->ds;
- bucket = fl_cinfo->buckets;
- for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
- if (list_empty(&bucket->committing))
- continue;
- data = nfs_commitdata_alloc();
- if (!data)
- break;
- data->ds_commit_index = i;
- spin_lock(cinfo->lock);
- data->lseg = bucket->clseg;
- bucket->clseg = NULL;
- spin_unlock(cinfo->lock);
- list_add(&data->pages, list);
- nreq++;
- }
-
- /* Clean up on error */
- filelayout_retry_commit(cinfo, i);
- /* Caller will clean up entries put on list */
- return nreq;
-}
-
-/* This follows nfs_commit_list pretty closely */
static int
filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
int how, struct nfs_commit_info *cinfo)
{
- struct nfs_commit_data *data, *tmp;
- LIST_HEAD(list);
- unsigned int nreq = 0;
-
- if (!list_empty(mds_pages)) {
- data = nfs_commitdata_alloc();
- if (data != NULL) {
- data->lseg = NULL;
- list_add(&data->pages, &list);
- nreq++;
- } else {
- nfs_retry_commit(mds_pages, NULL, cinfo);
- filelayout_retry_commit(cinfo, 0);
- cinfo->completion_ops->error_cleanup(NFS_I(inode));
- return -ENOMEM;
- }
- }
-
- nreq += alloc_ds_commits(cinfo, &list);
-
- if (nreq == 0) {
- cinfo->completion_ops->error_cleanup(NFS_I(inode));
- goto out;
- }
-
- atomic_add(nreq, &cinfo->mds->rpcs_out);
-
- list_for_each_entry_safe(data, tmp, &list, pages) {
- list_del_init(&data->pages);
- if (!data->lseg) {
- nfs_init_commit(data, mds_pages, NULL, cinfo);
- nfs_initiate_commit(NFS_CLIENT(inode), data,
- data->mds_ops, how, 0);
- } else {
- struct pnfs_commit_bucket *buckets;
-
- buckets = cinfo->ds->buckets;
- nfs_init_commit(data, &buckets[data->ds_commit_index].committing, data->lseg, cinfo);
- filelayout_initiate_commit(data, how);
- }
- }
-out:
- cinfo->ds->ncommitting = 0;
- return PNFS_ATTEMPTED;
+ return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo,
+ filelayout_initiate_commit);
}
+
static struct nfs4_deviceid_node *
filelayout_alloc_deviceid_node(struct nfs_server *server,
struct pnfs_device *pdev, gfp_t gfp_flags)
@@ -1421,9 +1129,9 @@ static struct pnfs_layoutdriver_type filelayout_type = {
.pg_write_ops = &filelayout_pg_write_ops,
.get_ds_info = &filelayout_get_ds_info,
.mark_request_commit = filelayout_mark_request_commit,
- .clear_request_commit = filelayout_clear_request_commit,
- .scan_commit_lists = filelayout_scan_commit_lists,
- .recover_commit_reqs = filelayout_recover_commit_reqs,
+ .clear_request_commit = pnfs_generic_clear_request_commit,
+ .scan_commit_lists = pnfs_generic_scan_commit_lists,
+ .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
.search_commit_reqs = filelayout_search_commit_reqs,
.commit_pagelist = filelayout_commit_pagelist,
.read_pagelist = filelayout_read_pagelist,
diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h
index 7c9f800c49d7..2896cb833a11 100644
--- a/fs/nfs/filelayout/filelayout.h
+++ b/fs/nfs/filelayout/filelayout.h
@@ -33,13 +33,6 @@
#include "../pnfs.h"
/*
- * Default data server connection timeout and retrans vaules.
- * Set by module paramters dataserver_timeo and dataserver_retrans.
- */
-#define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */
-#define NFS4_DEF_DS_RETRANS 5
-
-/*
* Field testing shows we need to support up to 4096 stripe indices.
* We store each index as a u8 (u32 on the wire) to keep the memory footprint
* reasonable. This in turn means we support a maximum of 256
@@ -48,32 +41,11 @@
#define NFS4_PNFS_MAX_STRIPE_CNT 4096
#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */
-/* error codes for internal use */
-#define NFS4ERR_RESET_TO_MDS 12001
-
enum stripetype4 {
STRIPE_SPARSE = 1,
STRIPE_DENSE = 2
};
-/* Individual ip address */
-struct nfs4_pnfs_ds_addr {
- struct sockaddr_storage da_addr;
- size_t da_addrlen;
- struct list_head da_node; /* nfs4_pnfs_dev_hlist dev_dslist */
- char *da_remotestr; /* human readable addr+port */
-};
-
-struct nfs4_pnfs_ds {
- struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */
- char *ds_remotestr; /* comma sep list of addrs */
- struct list_head ds_addrs;
- struct nfs_client *ds_clp;
- atomic_t ds_count;
- unsigned long ds_state;
-#define NFS4DS_CONNECTING 0 /* ds is establishing connection */
-};
-
struct nfs4_file_layout_dsaddr {
struct nfs4_deviceid_node id_node;
u32 stripe_count;
@@ -119,17 +91,6 @@ FILELAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg)
return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node;
}
-static inline void
-filelayout_mark_devid_invalid(struct nfs4_deviceid_node *node)
-{
- u32 *p = (u32 *)&node->deviceid;
-
- printk(KERN_WARNING "NFS: Deviceid [%x%x%x%x] marked out of use.\n",
- p[0], p[1], p[2], p[3]);
-
- set_bit(NFS_DEVICEID_INVALID, &node->flags);
-}
-
static inline bool
filelayout_test_devid_invalid(struct nfs4_deviceid_node *node)
{
@@ -142,7 +103,6 @@ filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node);
extern struct nfs_fh *
nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
-extern void print_ds(struct nfs4_pnfs_ds *ds);
u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index bfecac781f19..4f372e224603 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -31,7 +31,6 @@
#include <linux/nfs_fs.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
-#include <linux/sunrpc/addr.h>
#include "../internal.h"
#include "../nfs4session.h"
@@ -42,183 +41,6 @@
static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;
-/*
- * Data server cache
- *
- * Data servers can be mapped to different device ids.
- * nfs4_pnfs_ds reference counting
- * - set to 1 on allocation
- * - incremented when a device id maps a data server already in the cache.
- * - decremented when deviceid is removed from the cache.
- */
-static DEFINE_SPINLOCK(nfs4_ds_cache_lock);
-static LIST_HEAD(nfs4_data_server_cache);
-
-/* Debug routines */
-void
-print_ds(struct nfs4_pnfs_ds *ds)
-{
- if (ds == NULL) {
- printk("%s NULL device\n", __func__);
- return;
- }
- printk(" ds %s\n"
- " ref count %d\n"
- " client %p\n"
- " cl_exchange_flags %x\n",
- ds->ds_remotestr,
- atomic_read(&ds->ds_count), ds->ds_clp,
- ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
-}
-
-static bool
-same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
-{
- struct sockaddr_in *a, *b;
- struct sockaddr_in6 *a6, *b6;
-
- if (addr1->sa_family != addr2->sa_family)
- return false;
-
- switch (addr1->sa_family) {
- case AF_INET:
- a = (struct sockaddr_in *)addr1;
- b = (struct sockaddr_in *)addr2;
-
- if (a->sin_addr.s_addr == b->sin_addr.s_addr &&
- a->sin_port == b->sin_port)
- return true;
- break;
-
- case AF_INET6:
- a6 = (struct sockaddr_in6 *)addr1;
- b6 = (struct sockaddr_in6 *)addr2;
-
- /* LINKLOCAL addresses must have matching scope_id */
- if (ipv6_addr_src_scope(&a6->sin6_addr) ==
- IPV6_ADDR_SCOPE_LINKLOCAL &&
- a6->sin6_scope_id != b6->sin6_scope_id)
- return false;
-
- if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) &&
- a6->sin6_port == b6->sin6_port)
- return true;
- break;
-
- default:
- dprintk("%s: unhandled address family: %u\n",
- __func__, addr1->sa_family);
- return false;
- }
-
- return false;
-}
-
-static bool
-_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
- const struct list_head *dsaddrs2)
-{
- struct nfs4_pnfs_ds_addr *da1, *da2;
-
- /* step through both lists, comparing as we go */
- for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
- da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
- da1 != NULL && da2 != NULL;
- da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
- da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
- if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
- (struct sockaddr *)&da2->da_addr))
- return false;
- }
- if (da1 == NULL && da2 == NULL)
- return true;
-
- return false;
-}
-
-/*
- * Lookup DS by addresses. nfs4_ds_cache_lock is held
- */
-static struct nfs4_pnfs_ds *
-_data_server_lookup_locked(const struct list_head *dsaddrs)
-{
- struct nfs4_pnfs_ds *ds;
-
- list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
- if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
- return ds;
- return NULL;
-}
-
-/*
- * Create an rpc connection to the nfs4_pnfs_ds data server
- * Currently only supports IPv4 and IPv6 addresses
- */
-static int
-nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
-{
- struct nfs_client *clp = ERR_PTR(-EIO);
- struct nfs4_pnfs_ds_addr *da;
- int status = 0;
-
- dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
- mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);
-
- list_for_each_entry(da, &ds->ds_addrs, da_node) {
- dprintk("%s: DS %s: trying address %s\n",
- __func__, ds->ds_remotestr, da->da_remotestr);
-
- clp = nfs4_set_ds_client(mds_srv->nfs_client,
- (struct sockaddr *)&da->da_addr,
- da->da_addrlen, IPPROTO_TCP,
- dataserver_timeo, dataserver_retrans);
- if (!IS_ERR(clp))
- break;
- }
-
- if (IS_ERR(clp)) {
- status = PTR_ERR(clp);
- goto out;
- }
-
- status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time);
- if (status)
- goto out_put;
-
- smp_wmb();
- ds->ds_clp = clp;
- dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
-out:
- return status;
-out_put:
- nfs_put_client(clp);
- goto out;
-}
-
-static void
-destroy_ds(struct nfs4_pnfs_ds *ds)
-{
- struct nfs4_pnfs_ds_addr *da;
-
- dprintk("--> %s\n", __func__);
- ifdebug(FACILITY)
- print_ds(ds);
-
- nfs_put_client(ds->ds_clp);
-
- while (!list_empty(&ds->ds_addrs)) {
- da = list_first_entry(&ds->ds_addrs,
- struct nfs4_pnfs_ds_addr,
- da_node);
- list_del_init(&da->da_node);
- kfree(da->da_remotestr);
- kfree(da);
- }
-
- kfree(ds->ds_remotestr);
- kfree(ds);
-}
-
void
nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
{
@@ -229,259 +51,13 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
for (i = 0; i < dsaddr->ds_num; i++) {
ds = dsaddr->ds_list[i];
- if (ds != NULL) {
- if (atomic_dec_and_lock(&ds->ds_count,
- &nfs4_ds_cache_lock)) {
- list_del_init(&ds->ds_node);
- spin_unlock(&nfs4_ds_cache_lock);
- destroy_ds(ds);
- }
- }
+ if (ds != NULL)
+ nfs4_pnfs_ds_put(ds);
}
kfree(dsaddr->stripe_indices);
kfree(dsaddr);
}
-/*
- * Create a string with a human readable address and port to avoid
- * complicated setup around many dprinks.
- */
-static char *
-nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags)
-{
- struct nfs4_pnfs_ds_addr *da;
- char *remotestr;
- size_t len;
- char *p;
-
- len = 3; /* '{', '}' and eol */
- list_for_each_entry(da, dsaddrs, da_node) {
- len += strlen(da->da_remotestr) + 1; /* string plus comma */
- }
-
- remotestr = kzalloc(len, gfp_flags);
- if (!remotestr)
- return NULL;
-
- p = remotestr;
- *(p++) = '{';
- len--;
- list_for_each_entry(da, dsaddrs, da_node) {
- size_t ll = strlen(da->da_remotestr);
-
- if (ll > len)
- goto out_err;
-
- memcpy(p, da->da_remotestr, ll);
- p += ll;
- len -= ll;
-
- if (len < 1)
- goto out_err;
- (*p++) = ',';
- len--;
- }
- if (len < 2)
- goto out_err;
- *(p++) = '}';
- *p = '\0';
- return remotestr;
-out_err:
- kfree(remotestr);
- return NULL;
-}
-
-static struct nfs4_pnfs_ds *
-nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
-{
- struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;
- char *remotestr;
-
- if (list_empty(dsaddrs)) {
- dprintk("%s: no addresses defined\n", __func__);
- goto out;
- }
-
- ds = kzalloc(sizeof(*ds), gfp_flags);
- if (!ds)
- goto out;
-
- /* this is only used for debugging, so it's ok if its NULL */
- remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);
-
- spin_lock(&nfs4_ds_cache_lock);
- tmp_ds = _data_server_lookup_locked(dsaddrs);
- if (tmp_ds == NULL) {
- INIT_LIST_HEAD(&ds->ds_addrs);
- list_splice_init(dsaddrs, &ds->ds_addrs);
- ds->ds_remotestr = remotestr;
- atomic_set(&ds->ds_count, 1);
- INIT_LIST_HEAD(&ds->ds_node);
- ds->ds_clp = NULL;
- list_add(&ds->ds_node, &nfs4_data_server_cache);
- dprintk("%s add new data server %s\n", __func__,
- ds->ds_remotestr);
- } else {
- kfree(remotestr);
- kfree(ds);
- atomic_inc(&tmp_ds->ds_count);
- dprintk("%s data server %s found, inc'ed ds_count to %d\n",
- __func__, tmp_ds->ds_remotestr,
- atomic_read(&tmp_ds->ds_count));
- ds = tmp_ds;
- }
- spin_unlock(&nfs4_ds_cache_lock);
-out:
- return ds;
-}
-
-/*
- * Currently only supports ipv4, ipv6 and one multi-path address.
- */
-static struct nfs4_pnfs_ds_addr *
-decode_ds_addr(struct net *net, struct xdr_stream *streamp, gfp_t gfp_flags)
-{
- struct nfs4_pnfs_ds_addr *da = NULL;
- char *buf, *portstr;
- __be16 port;
- int nlen, rlen;
- int tmp[2];
- __be32 *p;
- char *netid, *match_netid;
- size_t len, match_netid_len;
- char *startsep = "";
- char *endsep = "";
-
-
- /* r_netid */
- p = xdr_inline_decode(streamp, 4);
- if (unlikely(!p))
- goto out_err;
- nlen = be32_to_cpup(p++);
-
- p = xdr_inline_decode(streamp, nlen);
- if (unlikely(!p))
- goto out_err;
-
- netid = kmalloc(nlen+1, gfp_flags);
- if (unlikely(!netid))
- goto out_err;
-
- netid[nlen] = '\0';
- memcpy(netid, p, nlen);
-
- /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
- p = xdr_inline_decode(streamp, 4);
- if (unlikely(!p))
- goto out_free_netid;
- rlen = be32_to_cpup(p);
-
- p = xdr_inline_decode(streamp, rlen);
- if (unlikely(!p))
- goto out_free_netid;
-
- /* port is ".ABC.DEF", 8 chars max */
- if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) {
- dprintk("%s: Invalid address, length %d\n", __func__,
- rlen);
- goto out_free_netid;
- }
- buf = kmalloc(rlen + 1, gfp_flags);
- if (!buf) {
- dprintk("%s: Not enough memory\n", __func__);
- goto out_free_netid;
- }
- buf[rlen] = '\0';
- memcpy(buf, p, rlen);
-
- /* replace port '.' with '-' */
- portstr = strrchr(buf, '.');
- if (!portstr) {
- dprintk("%s: Failed finding expected dot in port\n",
- __func__);
- goto out_free_buf;
- }
- *portstr = '-';
-
- /* find '.' between address and port */
- portstr = strrchr(buf, '.');
- if (!portstr) {
- dprintk("%s: Failed finding expected dot between address and "
- "port\n", __func__);
- goto out_free_buf;
- }
- *portstr = '\0';
-
- da = kzalloc(sizeof(*da), gfp_flags);
- if (unlikely(!da))
- goto out_free_buf;
-
- INIT_LIST_HEAD(&da->da_node);
-
- if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr,
- sizeof(da->da_addr))) {
- dprintk("%s: error parsing address %s\n", __func__, buf);
- goto out_free_da;
- }
-
- portstr++;
- sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]);
- port = htons((tmp[0] << 8) | (tmp[1]));
-
- switch (da->da_addr.ss_family) {
- case AF_INET:
- ((struct sockaddr_in *)&da->da_addr)->sin_port = port;
- da->da_addrlen = sizeof(struct sockaddr_in);
- match_netid = "tcp";
- match_netid_len = 3;
- break;
-
- case AF_INET6:
- ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port;
- da->da_addrlen = sizeof(struct sockaddr_in6);
- match_netid = "tcp6";
- match_netid_len = 4;
- startsep = "[";
- endsep = "]";
- break;
-
- default:
- dprintk("%s: unsupported address family: %u\n",
- __func__, da->da_addr.ss_family);
- goto out_free_da;
- }
-
- if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) {
- dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n",
- __func__, netid, match_netid);
- goto out_free_da;
- }
-
- /* save human readable address */
- len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7;
- da->da_remotestr = kzalloc(len, gfp_flags);
-
- /* NULL is ok, only used for dprintk */
- if (da->da_remotestr)
- snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep,
- buf, endsep, ntohs(port));
-
- dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr);
- kfree(buf);
- kfree(netid);
- return da;
-
-out_free_da:
- kfree(da);
-out_free_buf:
- dprintk("%s: Error parsing DS addr: %s\n", __func__, buf);
- kfree(buf);
-out_free_netid:
- kfree(netid);
-out_err:
- return NULL;
-}
-
/* Decode opaque device data and return the result */
struct nfs4_file_layout_dsaddr *
nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
@@ -584,8 +160,8 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
mp_count = be32_to_cpup(p); /* multipath count */
for (j = 0; j < mp_count; j++) {
- da = decode_ds_addr(server->nfs_client->cl_net,
- &stream, gfp_flags);
+ da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
+ &stream, gfp_flags);
if (da)
list_add_tail(&da->da_node, &dsaddrs);
}
@@ -681,22 +257,7 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
return flseg->fh_array[i];
}
-static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
-{
- might_sleep();
- wait_on_bit_action(&ds->ds_state, NFS4DS_CONNECTING,
- nfs_wait_bit_killable, TASK_KILLABLE);
-}
-
-static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
-{
- smp_mb__before_atomic();
- clear_bit(NFS4DS_CONNECTING, &ds->ds_state);
- smp_mb__after_atomic();
- wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);
-}
-
-
+/* Upon return, either ds is connected, or ds is NULL */
struct nfs4_pnfs_ds *
nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
{
@@ -704,29 +265,23 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
struct nfs4_pnfs_ds *ret = ds;
+ struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
if (ds == NULL) {
printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
__func__, ds_idx);
- filelayout_mark_devid_invalid(devid);
+ pnfs_generic_mark_devid_invalid(devid);
goto out;
}
smp_rmb();
if (ds->ds_clp)
goto out_test_devid;
- if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) {
- struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
- int err;
-
- err = nfs4_ds_connect(s, ds);
- if (err)
- nfs4_mark_deviceid_unavailable(devid);
- nfs4_clear_ds_conn_bit(ds);
- } else {
- /* Either ds is connected, or ds is NULL */
- nfs4_wait_ds_connect(ds);
- }
+ nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo,
+ dataserver_retrans, 4,
+ s->nfs_client->cl_minorversion,
+ s->nfs_client->cl_rpcclient->cl_auth->au_flavor);
+
out_test_devid:
if (filelayout_test_devid_unavailable(devid))
ret = NULL;
diff --git a/fs/nfs/flexfilelayout/Makefile b/fs/nfs/flexfilelayout/Makefile
new file mode 100644
index 000000000000..1d2c9f6bbcd4
--- /dev/null
+++ b/fs/nfs/flexfilelayout/Makefile
@@ -0,0 +1,5 @@
+#
+# Makefile for the pNFS Flexfile Layout Driver kernel module
+#
+obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += nfs_layout_flexfiles.o
+nfs_layout_flexfiles-y := flexfilelayout.o flexfilelayoutdev.o
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
new file mode 100644
index 000000000000..315cc68945b9
--- /dev/null
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -0,0 +1,1533 @@
+/*
+ * Module for pnfs flexfile layout driver.
+ *
+ * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
+ *
+ * Tao Peng <bergwolf@primarydata.com>
+ */
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/module.h>
+
+#include <linux/sunrpc/metrics.h>
+#include <linux/nfs_idmap.h>
+
+#include "flexfilelayout.h"
+#include "../nfs4session.h"
+#include "../internal.h"
+#include "../delegation.h"
+#include "../nfs4trace.h"
+#include "../iostat.h"
+#include "../nfs.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+#define FF_LAYOUT_POLL_RETRY_MAX (15*HZ)
+
+static struct pnfs_layout_hdr *
+ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
+{
+ struct nfs4_flexfile_layout *ffl;
+
+ ffl = kzalloc(sizeof(*ffl), gfp_flags);
+ if (ffl) {
+ INIT_LIST_HEAD(&ffl->error_list);
+ return &ffl->generic_hdr;
+ } else
+ return NULL;
+}
+
+static void
+ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ struct nfs4_ff_layout_ds_err *err, *n;
+
+ list_for_each_entry_safe(err, n, &FF_LAYOUT_FROM_HDR(lo)->error_list,
+ list) {
+ list_del(&err->list);
+ kfree(err);
+ }
+ kfree(FF_LAYOUT_FROM_HDR(lo));
+}
+
+static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE);
+ if (unlikely(p == NULL))
+ return -ENOBUFS;
+ memcpy(stateid, p, NFS4_STATEID_SIZE);
+ dprintk("%s: stateid id= [%x%x%x%x]\n", __func__,
+ p[0], p[1], p[2], p[3]);
+ return 0;
+}
+
+static int decode_deviceid(struct xdr_stream *xdr, struct nfs4_deviceid *devid)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, NFS4_DEVICEID4_SIZE);
+ if (unlikely(!p))
+ return -ENOBUFS;
+ memcpy(devid, p, NFS4_DEVICEID4_SIZE);
+ nfs4_print_deviceid(devid);
+ return 0;
+}
+
+static int decode_nfs_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
+{
+ __be32 *p;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -ENOBUFS;
+ fh->size = be32_to_cpup(p++);
+ if (fh->size > sizeof(struct nfs_fh)) {
+ printk(KERN_ERR "NFS flexfiles: Too big fh received %d\n",
+ fh->size);
+ return -EOVERFLOW;
+ }
+ /* fh.data */
+ p = xdr_inline_decode(xdr, fh->size);
+ if (unlikely(!p))
+ return -ENOBUFS;
+ memcpy(&fh->data, p, fh->size);
+ dprintk("%s: fh len %d\n", __func__, fh->size);
+
+ return 0;
+}
+
+/*
+ * Currently only stringified uids and gids are accepted.
+ * I.e., kerberos is not supported to the DSes, so no pricipals.
+ *
+ * That means that one common function will suffice, but when
+ * principals are added, this should be split to accomodate
+ * calls to both nfs_map_name_to_uid() and nfs_map_group_to_gid().
+ */
+static int
+decode_name(struct xdr_stream *xdr, u32 *id)
+{
+ __be32 *p;
+ int len;
+
+ /* opaque_length(4)*/
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ return -ENOBUFS;
+ len = be32_to_cpup(p++);
+ if (len < 0)
+ return -EINVAL;
+
+ dprintk("%s: len %u\n", __func__, len);
+
+ /* opaque body */
+ p = xdr_inline_decode(xdr, len);
+ if (unlikely(!p))
+ return -ENOBUFS;
+
+ if (!nfs_map_string_to_numeric((char *)p, len, id))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
+{
+ int i;
+
+ if (fls->mirror_array) {
+ for (i = 0; i < fls->mirror_array_cnt; i++) {
+ /* normally mirror_ds is freed in
+ * .free_deviceid_node but we still do it here
+ * for .alloc_lseg error path */
+ if (fls->mirror_array[i]) {
+ kfree(fls->mirror_array[i]->fh_versions);
+ nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
+ kfree(fls->mirror_array[i]);
+ }
+ }
+ kfree(fls->mirror_array);
+ fls->mirror_array = NULL;
+ }
+}
+
+static int ff_layout_check_layout(struct nfs4_layoutget_res *lgr)
+{
+ int ret = 0;
+
+ dprintk("--> %s\n", __func__);
+
+ /* FIXME: remove this check when layout segment support is added */
+ if (lgr->range.offset != 0 ||
+ lgr->range.length != NFS4_MAX_UINT64) {
+ dprintk("%s Only whole file layouts supported. Use MDS i/o\n",
+ __func__);
+ ret = -EINVAL;
+ }
+
+ dprintk("--> %s returns %d\n", __func__, ret);
+ return ret;
+}
+
+static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
+{
+ if (fls) {
+ ff_layout_free_mirror_array(fls);
+ kfree(fls);
+ }
+}
+
+static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
+{
+ struct nfs4_ff_layout_mirror *tmp;
+ int i, j;
+
+ for (i = 0; i < fls->mirror_array_cnt - 1; i++) {
+ for (j = i + 1; j < fls->mirror_array_cnt; j++)
+ if (fls->mirror_array[i]->efficiency <
+ fls->mirror_array[j]->efficiency) {
+ tmp = fls->mirror_array[i];
+ fls->mirror_array[i] = fls->mirror_array[j];
+ fls->mirror_array[j] = tmp;
+ }
+ }
+}
+
+static struct pnfs_layout_segment *
+ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
+ struct nfs4_layoutget_res *lgr,
+ gfp_t gfp_flags)
+{
+ struct pnfs_layout_segment *ret;
+ struct nfs4_ff_layout_segment *fls = NULL;
+ struct xdr_stream stream;
+ struct xdr_buf buf;
+ struct page *scratch;
+ u64 stripe_unit;
+ u32 mirror_array_cnt;
+ __be32 *p;
+ int i, rc;
+
+ dprintk("--> %s\n", __func__);
+ scratch = alloc_page(gfp_flags);
+ if (!scratch)
+ return ERR_PTR(-ENOMEM);
+
+ xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages,
+ lgr->layoutp->len);
+ xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+
+ /* stripe unit and mirror_array_cnt */
+ rc = -EIO;
+ p = xdr_inline_decode(&stream, 8 + 4);
+ if (!p)
+ goto out_err_free;
+
+ p = xdr_decode_hyper(p, &stripe_unit);
+ mirror_array_cnt = be32_to_cpup(p++);
+ dprintk("%s: stripe_unit=%llu mirror_array_cnt=%u\n", __func__,
+ stripe_unit, mirror_array_cnt);
+
+ if (mirror_array_cnt > NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT ||
+ mirror_array_cnt == 0)
+ goto out_err_free;
+
+ rc = -ENOMEM;
+ fls = kzalloc(sizeof(*fls), gfp_flags);
+ if (!fls)
+ goto out_err_free;
+
+ fls->mirror_array_cnt = mirror_array_cnt;
+ fls->stripe_unit = stripe_unit;
+ fls->mirror_array = kcalloc(fls->mirror_array_cnt,
+ sizeof(fls->mirror_array[0]), gfp_flags);
+ if (fls->mirror_array == NULL)
+ goto out_err_free;
+
+ for (i = 0; i < fls->mirror_array_cnt; i++) {
+ struct nfs4_deviceid devid;
+ struct nfs4_deviceid_node *idnode;
+ u32 ds_count;
+ u32 fh_count;
+ int j;
+
+ rc = -EIO;
+ p = xdr_inline_decode(&stream, 4);
+ if (!p)
+ goto out_err_free;
+ ds_count = be32_to_cpup(p);
+
+ /* FIXME: allow for striping? */
+ if (ds_count != 1)
+ goto out_err_free;
+
+ fls->mirror_array[i] =
+ kzalloc(sizeof(struct nfs4_ff_layout_mirror),
+ gfp_flags);
+ if (fls->mirror_array[i] == NULL) {
+ rc = -ENOMEM;
+ goto out_err_free;
+ }
+
+ spin_lock_init(&fls->mirror_array[i]->lock);
+ fls->mirror_array[i]->ds_count = ds_count;
+
+ /* deviceid */
+ rc = decode_deviceid(&stream, &devid);
+ if (rc)
+ goto out_err_free;
+
+ idnode = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode),
+ &devid, lh->plh_lc_cred,
+ gfp_flags);
+ /*
+ * upon success, mirror_ds is allocated by previous
+ * getdeviceinfo, or newly by .alloc_deviceid_node
+ * nfs4_find_get_deviceid failure is indeed getdeviceinfo falure
+ */
+ if (idnode)
+ fls->mirror_array[i]->mirror_ds =
+ FF_LAYOUT_MIRROR_DS(idnode);
+ else
+ goto out_err_free;
+
+ /* efficiency */
+ rc = -EIO;
+ p = xdr_inline_decode(&stream, 4);
+ if (!p)
+ goto out_err_free;
+ fls->mirror_array[i]->efficiency = be32_to_cpup(p);
+
+ /* stateid */
+ rc = decode_stateid(&stream, &fls->mirror_array[i]->stateid);
+ if (rc)
+ goto out_err_free;
+
+ /* fh */
+ p = xdr_inline_decode(&stream, 4);
+ if (!p)
+ goto out_err_free;
+ fh_count = be32_to_cpup(p);
+
+ fls->mirror_array[i]->fh_versions =
+ kzalloc(fh_count * sizeof(struct nfs_fh),
+ gfp_flags);
+ if (fls->mirror_array[i]->fh_versions == NULL) {
+ rc = -ENOMEM;
+ goto out_err_free;
+ }
+
+ for (j = 0; j < fh_count; j++) {
+ rc = decode_nfs_fh(&stream,
+ &fls->mirror_array[i]->fh_versions[j]);
+ if (rc)
+ goto out_err_free;
+ }
+
+ fls->mirror_array[i]->fh_versions_cnt = fh_count;
+
+ /* user */
+ rc = decode_name(&stream, &fls->mirror_array[i]->uid);
+ if (rc)
+ goto out_err_free;
+
+ /* group */
+ rc = decode_name(&stream, &fls->mirror_array[i]->gid);
+ if (rc)
+ goto out_err_free;
+
+ dprintk("%s: uid %d gid %d\n", __func__,
+ fls->mirror_array[i]->uid,
+ fls->mirror_array[i]->gid);
+ }
+
+ ff_layout_sort_mirrors(fls);
+ rc = ff_layout_check_layout(lgr);
+ if (rc)
+ goto out_err_free;
+
+ ret = &fls->generic_hdr;
+ dprintk("<-- %s (success)\n", __func__);
+out_free_page:
+ __free_page(scratch);
+ return ret;
+out_err_free:
+ _ff_layout_free_lseg(fls);
+ ret = ERR_PTR(rc);
+ dprintk("<-- %s (%d)\n", __func__, rc);
+ goto out_free_page;
+}
+
+static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout)
+{
+ struct pnfs_layout_segment *lseg;
+
+ list_for_each_entry(lseg, &layout->plh_segs, pls_list)
+ if (lseg->pls_range.iomode == IOMODE_RW)
+ return true;
+
+ return false;
+}
+
+static void
+ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
+{
+ struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
+ int i;
+
+ dprintk("--> %s\n", __func__);
+
+ for (i = 0; i < fls->mirror_array_cnt; i++) {
+ if (fls->mirror_array[i]) {
+ nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
+ fls->mirror_array[i]->mirror_ds = NULL;
+ if (fls->mirror_array[i]->cred) {
+ put_rpccred(fls->mirror_array[i]->cred);
+ fls->mirror_array[i]->cred = NULL;
+ }
+ }
+ }
+
+ if (lseg->pls_range.iomode == IOMODE_RW) {
+ struct nfs4_flexfile_layout *ffl;
+ struct inode *inode;
+
+ ffl = FF_LAYOUT_FROM_HDR(lseg->pls_layout);
+ inode = ffl->generic_hdr.plh_inode;
+ spin_lock(&inode->i_lock);
+ if (!ff_layout_has_rw_segments(lseg->pls_layout)) {
+ ffl->commit_info.nbuckets = 0;
+ kfree(ffl->commit_info.buckets);
+ ffl->commit_info.buckets = NULL;
+ }
+ spin_unlock(&inode->i_lock);
+ }
+ _ff_layout_free_lseg(fls);
+}
+
+/* Return 1 until we have multiple lsegs support */
+static int
+ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
+{
+ return 1;
+}
+
+static int
+ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo,
+ gfp_t gfp_flags)
+{
+ struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
+ struct pnfs_commit_bucket *buckets;
+ int size;
+
+ if (cinfo->ds->nbuckets != 0) {
+ /* This assumes there is only one RW lseg per file.
+ * To support multiple lseg per file, we need to
+ * change struct pnfs_commit_bucket to allow dynamic
+ * increasing nbuckets.
+ */
+ return 0;
+ }
+
+ size = ff_layout_get_lseg_count(fls) * FF_LAYOUT_MIRROR_COUNT(lseg);
+
+ buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket),
+ gfp_flags);
+ if (!buckets)
+ return -ENOMEM;
+ else {
+ int i;
+
+ spin_lock(cinfo->lock);
+ if (cinfo->ds->nbuckets != 0)
+ kfree(buckets);
+ else {
+ cinfo->ds->buckets = buckets;
+ cinfo->ds->nbuckets = size;
+ for (i = 0; i < size; i++) {
+ INIT_LIST_HEAD(&buckets[i].written);
+ INIT_LIST_HEAD(&buckets[i].committing);
+ /* mark direct verifier as unset */
+ buckets[i].direct_verf.committed =
+ NFS_INVALID_STABLE_HOW;
+ }
+ }
+ spin_unlock(cinfo->lock);
+ return 0;
+ }
+}
+
+static struct nfs4_pnfs_ds *
+ff_layout_choose_best_ds_for_read(struct nfs_pageio_descriptor *pgio,
+ int *best_idx)
+{
+ struct nfs4_ff_layout_segment *fls;
+ struct nfs4_pnfs_ds *ds;
+ int idx;
+
+ fls = FF_LAYOUT_LSEG(pgio->pg_lseg);
+ /* mirrors are sorted by efficiency */
+ for (idx = 0; idx < fls->mirror_array_cnt; idx++) {
+ ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, idx, false);
+ if (ds) {
+ *best_idx = idx;
+ return ds;
+ }
+ }
+
+ return NULL;
+}
+
+static void
+ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req)
+{
+ struct nfs_pgio_mirror *pgm;
+ struct nfs4_ff_layout_mirror *mirror;
+ struct nfs4_pnfs_ds *ds;
+ int ds_idx;
+
+ /* Use full layout for now */
+ if (!pgio->pg_lseg)
+ pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+ req->wb_context,
+ 0,
+ NFS4_MAX_UINT64,
+ IOMODE_READ,
+ GFP_KERNEL);
+ /* If no lseg, fall back to read through mds */
+ if (pgio->pg_lseg == NULL)
+ goto out_mds;
+
+ ds = ff_layout_choose_best_ds_for_read(pgio, &ds_idx);
+ if (!ds)
+ goto out_mds;
+ mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
+
+ pgio->pg_mirror_idx = ds_idx;
+
+ /* read always uses only one mirror - idx 0 for pgio layer */
+ pgm = &pgio->pg_mirrors[0];
+ pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize;
+
+ return;
+out_mds:
+ pnfs_put_lseg(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ nfs_pageio_reset_read_mds(pgio);
+}
+
+static void
+ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req)
+{
+ struct nfs4_ff_layout_mirror *mirror;
+ struct nfs_pgio_mirror *pgm;
+ struct nfs_commit_info cinfo;
+ struct nfs4_pnfs_ds *ds;
+ int i;
+ int status;
+
+ if (!pgio->pg_lseg)
+ pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+ req->wb_context,
+ 0,
+ NFS4_MAX_UINT64,
+ IOMODE_RW,
+ GFP_NOFS);
+ /* If no lseg, fall back to write through mds */
+ if (pgio->pg_lseg == NULL)
+ goto out_mds;
+
+ nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq);
+ status = ff_layout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS);
+ if (status < 0)
+ goto out_mds;
+
+ /* Use a direct mapping of ds_idx to pgio mirror_idx */
+ if (WARN_ON_ONCE(pgio->pg_mirror_count !=
+ FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg)))
+ goto out_mds;
+
+ for (i = 0; i < pgio->pg_mirror_count; i++) {
+ ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true);
+ if (!ds)
+ goto out_mds;
+ pgm = &pgio->pg_mirrors[i];
+ mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
+ pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize;
+ }
+
+ return;
+
+out_mds:
+ pnfs_put_lseg(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ nfs_pageio_reset_write_mds(pgio);
+}
+
+static unsigned int
+ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req)
+{
+ if (!pgio->pg_lseg)
+ pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+ req->wb_context,
+ 0,
+ NFS4_MAX_UINT64,
+ IOMODE_RW,
+ GFP_NOFS);
+ if (pgio->pg_lseg)
+ return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
+
+ /* no lseg means that pnfs is not in use, so no mirroring here */
+ pnfs_put_lseg(pgio->pg_lseg);
+ pgio->pg_lseg = NULL;
+ nfs_pageio_reset_write_mds(pgio);
+ return 1;
+}
+
+static const struct nfs_pageio_ops ff_layout_pg_read_ops = {
+ .pg_init = ff_layout_pg_init_read,
+ .pg_test = pnfs_generic_pg_test,
+ .pg_doio = pnfs_generic_pg_readpages,
+ .pg_cleanup = pnfs_generic_pg_cleanup,
+};
+
+static const struct nfs_pageio_ops ff_layout_pg_write_ops = {
+ .pg_init = ff_layout_pg_init_write,
+ .pg_test = pnfs_generic_pg_test,
+ .pg_doio = pnfs_generic_pg_writepages,
+ .pg_get_mirror_count = ff_layout_pg_get_mirror_count_write,
+ .pg_cleanup = pnfs_generic_pg_cleanup,
+};
+
+static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
+{
+ struct rpc_task *task = &hdr->task;
+
+ pnfs_layoutcommit_inode(hdr->inode, false);
+
+ if (retry_pnfs) {
+ dprintk("%s Reset task %5u for i/o through pNFS "
+ "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
+ hdr->task.tk_pid,
+ hdr->inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(hdr->inode),
+ hdr->args.count,
+ (unsigned long long)hdr->args.offset);
+
+ if (!hdr->dreq) {
+ struct nfs_open_context *ctx;
+
+ ctx = nfs_list_entry(hdr->pages.next)->wb_context;
+ set_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
+ hdr->completion_ops->error_cleanup(&hdr->pages);
+ } else {
+ nfs_direct_set_resched_writes(hdr->dreq);
+ /* fake unstable write to let common nfs resend pages */
+ hdr->verf.committed = NFS_UNSTABLE;
+ hdr->good_bytes = 0;
+ }
+ return;
+ }
+
+ if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ dprintk("%s Reset task %5u for i/o through MDS "
+ "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
+ hdr->task.tk_pid,
+ hdr->inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(hdr->inode),
+ hdr->args.count,
+ (unsigned long long)hdr->args.offset);
+
+ task->tk_status = pnfs_write_done_resend_to_mds(hdr);
+ }
+}
+
+static void ff_layout_reset_read(struct nfs_pgio_header *hdr)
+{
+ struct rpc_task *task = &hdr->task;
+
+ pnfs_layoutcommit_inode(hdr->inode, false);
+
+ if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
+ dprintk("%s Reset task %5u for i/o through MDS "
+ "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
+ hdr->task.tk_pid,
+ hdr->inode->i_sb->s_id,
+ (unsigned long long)NFS_FILEID(hdr->inode),
+ hdr->args.count,
+ (unsigned long long)hdr->args.offset);
+
+ task->tk_status = pnfs_read_done_resend_to_mds(hdr);
+ }
+}
+
+static int ff_layout_async_handle_error_v4(struct rpc_task *task,
+ struct nfs4_state *state,
+ struct nfs_client *clp,
+ struct pnfs_layout_segment *lseg,
+ int idx)
+{
+ struct pnfs_layout_hdr *lo = lseg->pls_layout;
+ struct inode *inode = lo->plh_inode;
+ struct nfs_server *mds_server = NFS_SERVER(inode);
+
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+ struct nfs_client *mds_client = mds_server->nfs_client;
+ struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;
+
+ if (task->tk_status >= 0)
+ return 0;
+
+ switch (task->tk_status) {
+ /* MDS state errors */
+ case -NFS4ERR_DELEG_REVOKED:
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_BAD_STATEID:
+ if (state == NULL)
+ break;
+ nfs_remove_bad_delegation(state->inode);
+ case -NFS4ERR_OPENMODE:
+ if (state == NULL)
+ break;
+ if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
+ goto out_bad_stateid;
+ goto wait_on_recovery;
+ case -NFS4ERR_EXPIRED:
+ if (state != NULL) {
+ if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
+ goto out_bad_stateid;
+ }
+ nfs4_schedule_lease_recovery(mds_client);
+ goto wait_on_recovery;
+ /* DS session errors */
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_BADSLOT:
+ case -NFS4ERR_BAD_HIGH_SLOT:
+ case -NFS4ERR_DEADSESSION:
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ case -NFS4ERR_SEQ_FALSE_RETRY:
+ case -NFS4ERR_SEQ_MISORDERED:
+ dprintk("%s ERROR %d, Reset session. Exchangeid "
+ "flags 0x%x\n", __func__, task->tk_status,
+ clp->cl_exchange_flags);
+ nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
+ break;
+ case -NFS4ERR_DELAY:
+ case -NFS4ERR_GRACE:
+ rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX);
+ break;
+ case -NFS4ERR_RETRY_UNCACHED_REP:
+ break;
+ /* Invalidate Layout errors */
+ case -NFS4ERR_PNFS_NO_LAYOUT:
+ case -ESTALE: /* mapped NFS4ERR_STALE */
+ case -EBADHANDLE: /* mapped NFS4ERR_BADHANDLE */
+ case -EISDIR: /* mapped NFS4ERR_ISDIR */
+ case -NFS4ERR_FHEXPIRED:
+ case -NFS4ERR_WRONG_TYPE:
+ dprintk("%s Invalid layout error %d\n", __func__,
+ task->tk_status);
+ /*
+ * Destroy layout so new i/o will get a new layout.
+ * Layout will not be destroyed until all current lseg
+ * references are put. Mark layout as invalid to resend failed
+ * i/o and all i/o waiting on the slot table to the MDS until
+ * layout is destroyed and a new valid layout is obtained.
+ */
+ pnfs_destroy_layout(NFS_I(inode));
+ rpc_wake_up(&tbl->slot_tbl_waitq);
+ goto reset;
+ /* RPC connection errors */
+ case -ECONNREFUSED:
+ case -EHOSTDOWN:
+ case -EHOSTUNREACH:
+ case -ENETUNREACH:
+ case -EIO:
+ case -ETIMEDOUT:
+ case -EPIPE:
+ dprintk("%s DS connection error %d\n", __func__,
+ task->tk_status);
+ nfs4_mark_deviceid_unavailable(devid);
+ rpc_wake_up(&tbl->slot_tbl_waitq);
+ /* fall through */
+ default:
+ if (ff_layout_has_available_ds(lseg))
+ return -NFS4ERR_RESET_TO_PNFS;
+reset:
+ dprintk("%s Retry through MDS. Error %d\n", __func__,
+ task->tk_status);
+ return -NFS4ERR_RESET_TO_MDS;
+ }
+out:
+ task->tk_status = 0;
+ return -EAGAIN;
+out_bad_stateid:
+ task->tk_status = -EIO;
+ return 0;
+wait_on_recovery:
+ rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL);
+ if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0)
+ rpc_wake_up_queued_task(&mds_client->cl_rpcwaitq, task);
+ goto out;
+}
+
+/* Retry all errors through either pNFS or MDS except for -EJUKEBOX */
+static int ff_layout_async_handle_error_v3(struct rpc_task *task,
+ struct pnfs_layout_segment *lseg,
+ int idx)
+{
+ struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
+
+ if (task->tk_status >= 0)
+ return 0;
+
+ if (task->tk_status != -EJUKEBOX) {
+ dprintk("%s DS connection error %d\n", __func__,
+ task->tk_status);
+ nfs4_mark_deviceid_unavailable(devid);
+ if (ff_layout_has_available_ds(lseg))
+ return -NFS4ERR_RESET_TO_PNFS;
+ else
+ return -NFS4ERR_RESET_TO_MDS;
+ }
+
+ if (task->tk_status == -EJUKEBOX)
+ nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
+ task->tk_status = 0;
+ rpc_restart_call(task);
+ rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
+ return -EAGAIN;
+}
+
+static int ff_layout_async_handle_error(struct rpc_task *task,
+ struct nfs4_state *state,
+ struct nfs_client *clp,
+ struct pnfs_layout_segment *lseg,
+ int idx)
+{
+ int vers = clp->cl_nfs_mod->rpc_vers->number;
+
+ switch (vers) {
+ case 3:
+ return ff_layout_async_handle_error_v3(task, lseg, idx);
+ case 4:
+ return ff_layout_async_handle_error_v4(task, state, clp,
+ lseg, idx);
+ default:
+ /* should never happen */
+ WARN_ON_ONCE(1);
+ return 0;
+ }
+}
+
+static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
+ int idx, u64 offset, u64 length,
+ u32 status, int opnum)
+{
+ struct nfs4_ff_layout_mirror *mirror;
+ int err;
+
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
+ mirror, offset, length, status, opnum,
+ GFP_NOIO);
+ dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
+}
+
+/* NFS_PROTO call done callback routines */
+
+static int ff_layout_read_done_cb(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ struct inode *inode;
+ int err;
+
+ trace_nfs4_pnfs_read(hdr, task->tk_status);
+ if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
+ hdr->res.op_status = NFS4ERR_NXIO;
+ if (task->tk_status < 0 && hdr->res.op_status)
+ ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
+ hdr->args.offset, hdr->args.count,
+ hdr->res.op_status, OP_READ);
+ err = ff_layout_async_handle_error(task, hdr->args.context->state,
+ hdr->ds_clp, hdr->lseg,
+ hdr->pgio_mirror_idx);
+
+ switch (err) {
+ case -NFS4ERR_RESET_TO_PNFS:
+ set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+ &hdr->lseg->pls_layout->plh_flags);
+ pnfs_read_resend_pnfs(hdr);
+ return task->tk_status;
+ case -NFS4ERR_RESET_TO_MDS:
+ inode = hdr->lseg->pls_layout->plh_inode;
+ pnfs_error_mark_layout_for_return(inode, hdr->lseg);
+ ff_layout_reset_read(hdr);
+ return task->tk_status;
+ case -EAGAIN:
+ rpc_restart_call_prepare(task);
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
+/*
+ * We reference the rpc_cred of the first WRITE that triggers the need for
+ * a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
+ * rfc5661 is not clear about which credential should be used.
+ *
+ * Flexlayout client should treat DS replied FILE_SYNC as DATA_SYNC, so
+ * to follow http://www.rfc-editor.org/errata_search.php?rfc=5661&eid=2751
+ * we always send layoutcommit after DS writes.
+ */
+static void
+ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr)
+{
+ pnfs_set_layoutcommit(hdr);
+ dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
+ (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
+}
+
+static bool
+ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
+{
+ /* No mirroring for now */
+ struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx);
+
+ return ff_layout_test_devid_unavailable(node);
+}
+
+static int ff_layout_read_prepare_common(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
+ rpc_exit(task, -EIO);
+ return -EIO;
+ }
+ if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
+ dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
+ if (ff_layout_has_available_ds(hdr->lseg))
+ pnfs_read_resend_pnfs(hdr);
+ else
+ ff_layout_reset_read(hdr);
+ rpc_exit(task, 0);
+ return -EAGAIN;
+ }
+ hdr->pgio_done_cb = ff_layout_read_done_cb;
+
+ return 0;
+}
+
+/*
+ * Call ops for the async read/write cases
+ * In the case of dense layouts, the offset needs to be reset to its
+ * original value.
+ */
+static void ff_layout_read_prepare_v3(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ if (ff_layout_read_prepare_common(task, hdr))
+ return;
+
+ rpc_call_start(task);
+}
+
+static int ff_layout_setup_sequence(struct nfs_client *ds_clp,
+ struct nfs4_sequence_args *args,
+ struct nfs4_sequence_res *res,
+ struct rpc_task *task)
+{
+ if (ds_clp->cl_session)
+ return nfs41_setup_sequence(ds_clp->cl_session,
+ args,
+ res,
+ task);
+ return nfs40_setup_sequence(ds_clp->cl_slot_tbl,
+ args,
+ res,
+ task);
+}
+
+static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ if (ff_layout_read_prepare_common(task, hdr))
+ return;
+
+ if (ff_layout_setup_sequence(hdr->ds_clp,
+ &hdr->args.seq_args,
+ &hdr->res.seq_res,
+ task))
+ return;
+
+ if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+ hdr->args.lock_context, FMODE_READ) == -EIO)
+ rpc_exit(task, -EIO); /* lost lock, terminate I/O */
+}
+
+static void ff_layout_read_call_done(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
+
+ if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
+ task->tk_status == 0) {
+ nfs4_sequence_done(task, &hdr->res.seq_res);
+ return;
+ }
+
+ /* Note this may cause RPC to be resent */
+ hdr->mds_ops->rpc_call_done(task, hdr);
+}
+
+static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ rpc_count_iostats_metrics(task,
+ &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]);
+}
+
+static int ff_layout_write_done_cb(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ struct inode *inode;
+ int err;
+
+ trace_nfs4_pnfs_write(hdr, task->tk_status);
+ if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
+ hdr->res.op_status = NFS4ERR_NXIO;
+ if (task->tk_status < 0 && hdr->res.op_status)
+ ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
+ hdr->args.offset, hdr->args.count,
+ hdr->res.op_status, OP_WRITE);
+ err = ff_layout_async_handle_error(task, hdr->args.context->state,
+ hdr->ds_clp, hdr->lseg,
+ hdr->pgio_mirror_idx);
+
+ switch (err) {
+ case -NFS4ERR_RESET_TO_PNFS:
+ case -NFS4ERR_RESET_TO_MDS:
+ inode = hdr->lseg->pls_layout->plh_inode;
+ pnfs_error_mark_layout_for_return(inode, hdr->lseg);
+ if (err == -NFS4ERR_RESET_TO_PNFS) {
+ pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
+ ff_layout_reset_write(hdr, true);
+ } else {
+ pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
+ ff_layout_reset_write(hdr, false);
+ }
+ return task->tk_status;
+ case -EAGAIN:
+ rpc_restart_call_prepare(task);
+ return -EAGAIN;
+ }
+
+ if (hdr->res.verf->committed == NFS_FILE_SYNC ||
+ hdr->res.verf->committed == NFS_DATA_SYNC)
+ ff_layout_set_layoutcommit(hdr);
+
+ return 0;
+}
+
+static int ff_layout_commit_done_cb(struct rpc_task *task,
+ struct nfs_commit_data *data)
+{
+ struct inode *inode;
+ int err;
+
+ trace_nfs4_pnfs_commit_ds(data, task->tk_status);
+ if (task->tk_status == -ETIMEDOUT && !data->res.op_status)
+ data->res.op_status = NFS4ERR_NXIO;
+ if (task->tk_status < 0 && data->res.op_status)
+ ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index,
+ data->args.offset, data->args.count,
+ data->res.op_status, OP_COMMIT);
+ err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
+ data->lseg, data->ds_commit_index);
+
+ switch (err) {
+ case -NFS4ERR_RESET_TO_PNFS:
+ case -NFS4ERR_RESET_TO_MDS:
+ inode = data->lseg->pls_layout->plh_inode;
+ pnfs_error_mark_layout_for_return(inode, data->lseg);
+ if (err == -NFS4ERR_RESET_TO_PNFS)
+ pnfs_set_retry_layoutget(data->lseg->pls_layout);
+ else
+ pnfs_clear_retry_layoutget(data->lseg->pls_layout);
+ pnfs_generic_prepare_to_resend_writes(data);
+ return -EAGAIN;
+ case -EAGAIN:
+ rpc_restart_call_prepare(task);
+ return -EAGAIN;
+ }
+
+ if (data->verf.committed == NFS_UNSTABLE)
+ pnfs_commit_set_layoutcommit(data);
+
+ return 0;
+}
+
+static int ff_layout_write_prepare_common(struct rpc_task *task,
+ struct nfs_pgio_header *hdr)
+{
+ if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
+ rpc_exit(task, -EIO);
+ return -EIO;
+ }
+
+ if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
+ bool retry_pnfs;
+
+ retry_pnfs = ff_layout_has_available_ds(hdr->lseg);
+ dprintk("%s task %u reset io to %s\n", __func__,
+ task->tk_pid, retry_pnfs ? "pNFS" : "MDS");
+ ff_layout_reset_write(hdr, retry_pnfs);
+ rpc_exit(task, 0);
+ return -EAGAIN;
+ }
+
+ return 0;
+}
+
+static void ff_layout_write_prepare_v3(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ if (ff_layout_write_prepare_common(task, hdr))
+ return;
+
+ rpc_call_start(task);
+}
+
+static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ if (ff_layout_write_prepare_common(task, hdr))
+ return;
+
+ if (ff_layout_setup_sequence(hdr->ds_clp,
+ &hdr->args.seq_args,
+ &hdr->res.seq_res,
+ task))
+ return;
+
+ if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
+ hdr->args.lock_context, FMODE_WRITE) == -EIO)
+ rpc_exit(task, -EIO); /* lost lock, terminate I/O */
+}
+
+static void ff_layout_write_call_done(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
+ task->tk_status == 0) {
+ nfs4_sequence_done(task, &hdr->res.seq_res);
+ return;
+ }
+
+ /* Note this may cause RPC to be resent */
+ hdr->mds_ops->rpc_call_done(task, hdr);
+}
+
+static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ rpc_count_iostats_metrics(task,
+ &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
+}
+
+static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
+{
+ rpc_call_start(task);
+}
+
+static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
+{
+ struct nfs_commit_data *wdata = data;
+
+ ff_layout_setup_sequence(wdata->ds_clp,
+ &wdata->args.seq_args,
+ &wdata->res.seq_res,
+ task);
+}
+
+static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
+{
+ struct nfs_commit_data *cdata = data;
+
+ rpc_count_iostats_metrics(task,
+ &NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]);
+}
+
+static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
+ .rpc_call_prepare = ff_layout_read_prepare_v3,
+ .rpc_call_done = ff_layout_read_call_done,
+ .rpc_count_stats = ff_layout_read_count_stats,
+ .rpc_release = pnfs_generic_rw_release,
+};
+
+static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
+ .rpc_call_prepare = ff_layout_read_prepare_v4,
+ .rpc_call_done = ff_layout_read_call_done,
+ .rpc_count_stats = ff_layout_read_count_stats,
+ .rpc_release = pnfs_generic_rw_release,
+};
+
+static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
+ .rpc_call_prepare = ff_layout_write_prepare_v3,
+ .rpc_call_done = ff_layout_write_call_done,
+ .rpc_count_stats = ff_layout_write_count_stats,
+ .rpc_release = pnfs_generic_rw_release,
+};
+
+static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
+ .rpc_call_prepare = ff_layout_write_prepare_v4,
+ .rpc_call_done = ff_layout_write_call_done,
+ .rpc_count_stats = ff_layout_write_count_stats,
+ .rpc_release = pnfs_generic_rw_release,
+};
+
+static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
+ .rpc_call_prepare = ff_layout_commit_prepare_v3,
+ .rpc_call_done = pnfs_generic_write_commit_done,
+ .rpc_count_stats = ff_layout_commit_count_stats,
+ .rpc_release = pnfs_generic_commit_release,
+};
+
+static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
+ .rpc_call_prepare = ff_layout_commit_prepare_v4,
+ .rpc_call_done = pnfs_generic_write_commit_done,
+ .rpc_count_stats = ff_layout_commit_count_stats,
+ .rpc_release = pnfs_generic_commit_release,
+};
+
+static enum pnfs_try_status
+ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
+{
+ struct pnfs_layout_segment *lseg = hdr->lseg;
+ struct nfs4_pnfs_ds *ds;
+ struct rpc_clnt *ds_clnt;
+ struct rpc_cred *ds_cred;
+ loff_t offset = hdr->args.offset;
+ u32 idx = hdr->pgio_mirror_idx;
+ int vers;
+ struct nfs_fh *fh;
+
+ dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
+ __func__, hdr->inode->i_ino,
+ hdr->args.pgbase, (size_t)hdr->args.count, offset);
+
+ ds = nfs4_ff_layout_prepare_ds(lseg, idx, false);
+ if (!ds)
+ goto out_failed;
+
+ ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
+ hdr->inode);
+ if (IS_ERR(ds_clnt))
+ goto out_failed;
+
+ ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
+ if (IS_ERR(ds_cred))
+ goto out_failed;
+
+ vers = nfs4_ff_layout_ds_version(lseg, idx);
+
+ dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
+ ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers);
+
+ atomic_inc(&ds->ds_clp->cl_count);
+ hdr->ds_clp = ds->ds_clp;
+ fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
+ if (fh)
+ hdr->args.fh = fh;
+
+ /*
+ * Note that if we ever decide to split across DSes,
+ * then we may need to handle dense-like offsets.
+ */
+ hdr->args.offset = offset;
+ hdr->mds_offset = offset;
+
+ /* Perform an asynchronous read to ds */
+ nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
+ vers == 3 ? &ff_layout_read_call_ops_v3 :
+ &ff_layout_read_call_ops_v4,
+ 0, RPC_TASK_SOFTCONN);
+
+ return PNFS_ATTEMPTED;
+
+out_failed:
+ if (ff_layout_has_available_ds(lseg))
+ return PNFS_TRY_AGAIN;
+ return PNFS_NOT_ATTEMPTED;
+}
+
+/* Perform async writes. */
+static enum pnfs_try_status
+ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
+{
+ struct pnfs_layout_segment *lseg = hdr->lseg;
+ struct nfs4_pnfs_ds *ds;
+ struct rpc_clnt *ds_clnt;
+ struct rpc_cred *ds_cred;
+ loff_t offset = hdr->args.offset;
+ int vers;
+ struct nfs_fh *fh;
+ int idx = hdr->pgio_mirror_idx;
+
+ ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
+ if (!ds)
+ return PNFS_NOT_ATTEMPTED;
+
+ ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
+ hdr->inode);
+ if (IS_ERR(ds_clnt))
+ return PNFS_NOT_ATTEMPTED;
+
+ ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
+ if (IS_ERR(ds_cred))
+ return PNFS_NOT_ATTEMPTED;
+
+ vers = nfs4_ff_layout_ds_version(lseg, idx);
+
+ dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d vers %d\n",
+ __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
+ offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count),
+ vers);
+
+ hdr->pgio_done_cb = ff_layout_write_done_cb;
+ atomic_inc(&ds->ds_clp->cl_count);
+ hdr->ds_clp = ds->ds_clp;
+ hdr->ds_commit_idx = idx;
+ fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
+ if (fh)
+ hdr->args.fh = fh;
+
+ /*
+ * Note that if we ever decide to split across DSes,
+ * then we may need to handle dense-like offsets.
+ */
+ hdr->args.offset = offset;
+
+ /* Perform an asynchronous write */
+ nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
+ vers == 3 ? &ff_layout_write_call_ops_v3 :
+ &ff_layout_write_call_ops_v4,
+ sync, RPC_TASK_SOFTCONN);
+ return PNFS_ATTEMPTED;
+}
+
+static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
+{
+ return i;
+}
+
+static struct nfs_fh *
+select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i)
+{
+ struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
+
+ /* FIXME: Assume that there is only one NFS version available
+ * for the DS.
+ */
+ return &flseg->mirror_array[i]->fh_versions[0];
+}
+
+static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
+{
+ struct pnfs_layout_segment *lseg = data->lseg;
+ struct nfs4_pnfs_ds *ds;
+ struct rpc_clnt *ds_clnt;
+ struct rpc_cred *ds_cred;
+ u32 idx;
+ int vers;
+ struct nfs_fh *fh;
+
+ idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
+ ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
+ if (!ds)
+ goto out_err;
+
+ ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
+ data->inode);
+ if (IS_ERR(ds_clnt))
+ goto out_err;
+
+ ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred);
+ if (IS_ERR(ds_cred))
+ goto out_err;
+
+ vers = nfs4_ff_layout_ds_version(lseg, idx);
+
+ dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__,
+ data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count),
+ vers);
+ data->commit_done_cb = ff_layout_commit_done_cb;
+ data->cred = ds_cred;
+ atomic_inc(&ds->ds_clp->cl_count);
+ data->ds_clp = ds->ds_clp;
+ fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
+ if (fh)
+ data->args.fh = fh;
+ return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
+ vers == 3 ? &ff_layout_commit_call_ops_v3 :
+ &ff_layout_commit_call_ops_v4,
+ how, RPC_TASK_SOFTCONN);
+out_err:
+ pnfs_generic_prepare_to_resend_writes(data);
+ pnfs_generic_commit_release(data);
+ return -EAGAIN;
+}
+
+static int
+ff_layout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
+ int how, struct nfs_commit_info *cinfo)
+{
+ return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo,
+ ff_layout_initiate_commit);
+}
+
+static struct pnfs_ds_commit_info *
+ff_layout_get_ds_info(struct inode *inode)
+{
+ struct pnfs_layout_hdr *layout = NFS_I(inode)->layout;
+
+ if (layout == NULL)
+ return NULL;
+
+ return &FF_LAYOUT_FROM_HDR(layout)->commit_info;
+}
+
+static void
+ff_layout_free_deveiceid_node(struct nfs4_deviceid_node *d)
+{
+ nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds,
+ id_node));
+}
+
+static int ff_layout_encode_ioerr(struct nfs4_flexfile_layout *flo,
+ struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args)
+{
+ struct pnfs_layout_hdr *hdr = &flo->generic_hdr;
+ __be32 *start;
+ int count = 0, ret = 0;
+
+ start = xdr_reserve_space(xdr, 4);
+ if (unlikely(!start))
+ return -E2BIG;
+
+ /* This assume we always return _ALL_ layouts */
+ spin_lock(&hdr->plh_inode->i_lock);
+ ret = ff_layout_encode_ds_ioerr(flo, xdr, &count, &args->range);
+ spin_unlock(&hdr->plh_inode->i_lock);
+
+ *start = cpu_to_be32(count);
+
+ return ret;
+}
+
+/* report nothing for now */
+static void ff_layout_encode_iostats(struct nfs4_flexfile_layout *flo,
+ struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args)
+{
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, 4);
+ if (likely(p))
+ *p = cpu_to_be32(0);
+}
+
+static struct nfs4_deviceid_node *
+ff_layout_alloc_deviceid_node(struct nfs_server *server,
+ struct pnfs_device *pdev, gfp_t gfp_flags)
+{
+ struct nfs4_ff_layout_ds *dsaddr;
+
+ dsaddr = nfs4_ff_alloc_deviceid_node(server, pdev, gfp_flags);
+ if (!dsaddr)
+ return NULL;
+ return &dsaddr->id_node;
+}
+
+static void
+ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo,
+ struct xdr_stream *xdr,
+ const struct nfs4_layoutreturn_args *args)
+{
+ struct nfs4_flexfile_layout *flo = FF_LAYOUT_FROM_HDR(lo);
+ __be32 *start;
+
+ dprintk("%s: Begin\n", __func__);
+ start = xdr_reserve_space(xdr, 4);
+ BUG_ON(!start);
+
+ if (ff_layout_encode_ioerr(flo, xdr, args))
+ goto out;
+
+ ff_layout_encode_iostats(flo, xdr, args);
+out:
+ *start = cpu_to_be32((xdr->p - start - 1) * 4);
+ dprintk("%s: Return\n", __func__);
+}
+
+static struct pnfs_layoutdriver_type flexfilelayout_type = {
+ .id = LAYOUT_FLEX_FILES,
+ .name = "LAYOUT_FLEX_FILES",
+ .owner = THIS_MODULE,
+ .alloc_layout_hdr = ff_layout_alloc_layout_hdr,
+ .free_layout_hdr = ff_layout_free_layout_hdr,
+ .alloc_lseg = ff_layout_alloc_lseg,
+ .free_lseg = ff_layout_free_lseg,
+ .pg_read_ops = &ff_layout_pg_read_ops,
+ .pg_write_ops = &ff_layout_pg_write_ops,
+ .get_ds_info = ff_layout_get_ds_info,
+ .free_deviceid_node = ff_layout_free_deveiceid_node,
+ .mark_request_commit = pnfs_layout_mark_request_commit,
+ .clear_request_commit = pnfs_generic_clear_request_commit,
+ .scan_commit_lists = pnfs_generic_scan_commit_lists,
+ .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
+ .commit_pagelist = ff_layout_commit_pagelist,
+ .read_pagelist = ff_layout_read_pagelist,
+ .write_pagelist = ff_layout_write_pagelist,
+ .alloc_deviceid_node = ff_layout_alloc_deviceid_node,
+ .encode_layoutreturn = ff_layout_encode_layoutreturn,
+};
+
+static int __init nfs4flexfilelayout_init(void)
+{
+ printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n",
+ __func__);
+ return pnfs_register_layoutdriver(&flexfilelayout_type);
+}
+
+static void __exit nfs4flexfilelayout_exit(void)
+{
+ printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n",
+ __func__);
+ pnfs_unregister_layoutdriver(&flexfilelayout_type);
+}
+
+MODULE_ALIAS("nfs-layouttype4-4");
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("The NFSv4 flexfile layout driver");
+
+module_init(nfs4flexfilelayout_init);
+module_exit(nfs4flexfilelayout_exit);
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
new file mode 100644
index 000000000000..070f20445b2d
--- /dev/null
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -0,0 +1,155 @@
+/*
+ * NFSv4 flexfile layout driver data structures.
+ *
+ * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
+ *
+ * Tao Peng <bergwolf@primarydata.com>
+ */
+
+#ifndef FS_NFS_NFS4FLEXFILELAYOUT_H
+#define FS_NFS_NFS4FLEXFILELAYOUT_H
+
+#include "../pnfs.h"
+
+/* XXX: Let's filter out insanely large mirror count for now to avoid oom
+ * due to network error etc. */
+#define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096
+
+struct nfs4_ff_ds_version {
+ u32 version;
+ u32 minor_version;
+ u32 rsize;
+ u32 wsize;
+ bool tightly_coupled;
+};
+
+/* chained in global deviceid hlist */
+struct nfs4_ff_layout_ds {
+ struct nfs4_deviceid_node id_node;
+ u32 ds_versions_cnt;
+ struct nfs4_ff_ds_version *ds_versions;
+ struct nfs4_pnfs_ds *ds;
+};
+
+struct nfs4_ff_layout_ds_err {
+ struct list_head list; /* linked in mirror error_list */
+ u64 offset;
+ u64 length;
+ int status;
+ enum nfs_opnum4 opnum;
+ nfs4_stateid stateid;
+ struct nfs4_deviceid deviceid;
+};
+
+struct nfs4_ff_layout_mirror {
+ u32 ds_count;
+ u32 efficiency;
+ struct nfs4_ff_layout_ds *mirror_ds;
+ u32 fh_versions_cnt;
+ struct nfs_fh *fh_versions;
+ nfs4_stateid stateid;
+ struct nfs4_string user_name;
+ struct nfs4_string group_name;
+ u32 uid;
+ u32 gid;
+ struct rpc_cred *cred;
+ spinlock_t lock;
+};
+
+struct nfs4_ff_layout_segment {
+ struct pnfs_layout_segment generic_hdr;
+ u64 stripe_unit;
+ u32 mirror_array_cnt;
+ struct nfs4_ff_layout_mirror **mirror_array;
+};
+
+struct nfs4_flexfile_layout {
+ struct pnfs_layout_hdr generic_hdr;
+ struct pnfs_ds_commit_info commit_info;
+ struct list_head error_list; /* nfs4_ff_layout_ds_err */
+};
+
+static inline struct nfs4_flexfile_layout *
+FF_LAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo)
+{
+ return container_of(lo, struct nfs4_flexfile_layout, generic_hdr);
+}
+
+static inline struct nfs4_ff_layout_segment *
+FF_LAYOUT_LSEG(struct pnfs_layout_segment *lseg)
+{
+ return container_of(lseg,
+ struct nfs4_ff_layout_segment,
+ generic_hdr);
+}
+
+static inline struct nfs4_deviceid_node *
+FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx)
+{
+ if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt ||
+ FF_LAYOUT_LSEG(lseg)->mirror_array[idx] == NULL ||
+ FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds == NULL)
+ return NULL;
+ return &FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds->id_node;
+}
+
+static inline struct nfs4_ff_layout_ds *
+FF_LAYOUT_MIRROR_DS(struct nfs4_deviceid_node *node)
+{
+ return container_of(node, struct nfs4_ff_layout_ds, id_node);
+}
+
+static inline struct nfs4_ff_layout_mirror *
+FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx)
+{
+ if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt)
+ return NULL;
+ return FF_LAYOUT_LSEG(lseg)->mirror_array[idx];
+}
+
+static inline u32
+FF_LAYOUT_MIRROR_COUNT(struct pnfs_layout_segment *lseg)
+{
+ return FF_LAYOUT_LSEG(lseg)->mirror_array_cnt;
+}
+
+static inline bool
+ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node)
+{
+ return nfs4_test_deviceid_unavailable(node);
+}
+
+static inline int
+nfs4_ff_layout_ds_version(struct pnfs_layout_segment *lseg, u32 ds_idx)
+{
+ return FF_LAYOUT_COMP(lseg, ds_idx)->mirror_ds->ds_versions[0].version;
+}
+
+struct nfs4_ff_layout_ds *
+nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+ gfp_t gfp_flags);
+void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds);
+void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds);
+int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
+ struct nfs4_ff_layout_mirror *mirror, u64 offset,
+ u64 length, int status, enum nfs_opnum4 opnum,
+ gfp_t gfp_flags);
+int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
+ struct xdr_stream *xdr, int *count,
+ const struct pnfs_layout_range *range);
+struct nfs_fh *
+nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx);
+
+struct nfs4_pnfs_ds *
+nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
+ bool fail_return);
+
+struct rpc_clnt *
+nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg,
+ u32 ds_idx,
+ struct nfs_client *ds_clp,
+ struct inode *inode);
+struct rpc_cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg,
+ u32 ds_idx, struct rpc_cred *mdscred);
+bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg);
+#endif /* FS_NFS_NFS4FLEXFILELAYOUT_H */
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
new file mode 100644
index 000000000000..e2c01f204a95
--- /dev/null
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -0,0 +1,552 @@
+/*
+ * Device operations for the pnfs nfs4 file layout driver.
+ *
+ * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
+ *
+ * Tao Peng <bergwolf@primarydata.com>
+ */
+
+#include <linux/nfs_fs.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/sunrpc/addr.h>
+
+#include "../internal.h"
+#include "../nfs4session.h"
+#include "flexfilelayout.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS_LD
+
+static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
+static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;
+
+void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
+{
+ if (mirror_ds)
+ nfs4_put_deviceid_node(&mirror_ds->id_node);
+}
+
+void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
+{
+ nfs4_print_deviceid(&mirror_ds->id_node.deviceid);
+ nfs4_pnfs_ds_put(mirror_ds->ds);
+ kfree(mirror_ds);
+}
+
+/* Decode opaque device data and construct new_ds using it */
+struct nfs4_ff_layout_ds *
+nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
+ gfp_t gfp_flags)
+{
+ struct xdr_stream stream;
+ struct xdr_buf buf;
+ struct page *scratch;
+ struct list_head dsaddrs;
+ struct nfs4_pnfs_ds_addr *da;
+ struct nfs4_ff_layout_ds *new_ds = NULL;
+ struct nfs4_ff_ds_version *ds_versions = NULL;
+ u32 mp_count;
+ u32 version_count;
+ __be32 *p;
+ int i, ret = -ENOMEM;
+
+ /* set up xdr stream */
+ scratch = alloc_page(gfp_flags);
+ if (!scratch)
+ goto out_err;
+
+ new_ds = kzalloc(sizeof(struct nfs4_ff_layout_ds), gfp_flags);
+ if (!new_ds)
+ goto out_scratch;
+
+ nfs4_init_deviceid_node(&new_ds->id_node,
+ server,
+ &pdev->dev_id);
+ INIT_LIST_HEAD(&dsaddrs);
+
+ xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
+ xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
+
+ /* multipath count */
+ p = xdr_inline_decode(&stream, 4);
+ if (unlikely(!p))
+ goto out_err_drain_dsaddrs;
+ mp_count = be32_to_cpup(p);
+ dprintk("%s: multipath ds count %d\n", __func__, mp_count);
+
+ for (i = 0; i < mp_count; i++) {
+ /* multipath ds */
+ da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
+ &stream, gfp_flags);
+ if (da)
+ list_add_tail(&da->da_node, &dsaddrs);
+ }
+ if (list_empty(&dsaddrs)) {
+ dprintk("%s: no suitable DS addresses found\n",
+ __func__);
+ ret = -ENOMEDIUM;
+ goto out_err_drain_dsaddrs;
+ }
+
+ /* version count */
+ p = xdr_inline_decode(&stream, 4);
+ if (unlikely(!p))
+ goto out_err_drain_dsaddrs;
+ version_count = be32_to_cpup(p);
+ dprintk("%s: version count %d\n", __func__, version_count);
+
+ ds_versions = kzalloc(version_count * sizeof(struct nfs4_ff_ds_version),
+ gfp_flags);
+ if (!ds_versions)
+ goto out_scratch;
+
+ for (i = 0; i < version_count; i++) {
+ /* 20 = version(4) + minor_version(4) + rsize(4) + wsize(4) +
+ * tightly_coupled(4) */
+ p = xdr_inline_decode(&stream, 20);
+ if (unlikely(!p))
+ goto out_err_drain_dsaddrs;
+ ds_versions[i].version = be32_to_cpup(p++);
+ ds_versions[i].minor_version = be32_to_cpup(p++);
+ ds_versions[i].rsize = nfs_block_size(be32_to_cpup(p++), NULL);
+ ds_versions[i].wsize = nfs_block_size(be32_to_cpup(p++), NULL);
+ ds_versions[i].tightly_coupled = be32_to_cpup(p);
+
+ if (ds_versions[i].rsize > NFS_MAX_FILE_IO_SIZE)
+ ds_versions[i].rsize = NFS_MAX_FILE_IO_SIZE;
+ if (ds_versions[i].wsize > NFS_MAX_FILE_IO_SIZE)
+ ds_versions[i].wsize = NFS_MAX_FILE_IO_SIZE;
+
+ if (ds_versions[i].version != 3 || ds_versions[i].minor_version != 0) {
+ dprintk("%s: [%d] unsupported ds version %d-%d\n", __func__,
+ i, ds_versions[i].version,
+ ds_versions[i].minor_version);
+ ret = -EPROTONOSUPPORT;
+ goto out_err_drain_dsaddrs;
+ }
+
+ dprintk("%s: [%d] vers %u minor_ver %u rsize %u wsize %u coupled %d\n",
+ __func__, i, ds_versions[i].version,
+ ds_versions[i].minor_version,
+ ds_versions[i].rsize,
+ ds_versions[i].wsize,
+ ds_versions[i].tightly_coupled);
+ }
+
+ new_ds->ds_versions = ds_versions;
+ new_ds->ds_versions_cnt = version_count;
+
+ new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
+ if (!new_ds->ds)
+ goto out_err_drain_dsaddrs;
+
+ /* If DS was already in cache, free ds addrs */
+ while (!list_empty(&dsaddrs)) {
+ da = list_first_entry(&dsaddrs,
+ struct nfs4_pnfs_ds_addr,
+ da_node);
+ list_del_init(&da->da_node);
+ kfree(da->da_remotestr);
+ kfree(da);
+ }
+
+ __free_page(scratch);
+ return new_ds;
+
+out_err_drain_dsaddrs:
+ while (!list_empty(&dsaddrs)) {
+ da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr,
+ da_node);
+ list_del_init(&da->da_node);
+ kfree(da->da_remotestr);
+ kfree(da);
+ }
+
+ kfree(ds_versions);
+out_scratch:
+ __free_page(scratch);
+out_err:
+ kfree(new_ds);
+
+ dprintk("%s ERROR: returning %d\n", __func__, ret);
+ return NULL;
+}
+
+static u64
+end_offset(u64 start, u64 len)
+{
+ u64 end;
+
+ end = start + len;
+ return end >= start ? end : NFS4_MAX_UINT64;
+}
+
+static void extend_ds_error(struct nfs4_ff_layout_ds_err *err,
+ u64 offset, u64 length)
+{
+ u64 end;
+
+ end = max_t(u64, end_offset(err->offset, err->length),
+ end_offset(offset, length));
+ err->offset = min_t(u64, err->offset, offset);
+ err->length = end - err->offset;
+}
+
+static bool ds_error_can_merge(struct nfs4_ff_layout_ds_err *err, u64 offset,
+ u64 length, int status, enum nfs_opnum4 opnum,
+ nfs4_stateid *stateid,
+ struct nfs4_deviceid *deviceid)
+{
+ return err->status == status && err->opnum == opnum &&
+ nfs4_stateid_match(&err->stateid, stateid) &&
+ !memcmp(&err->deviceid, deviceid, sizeof(*deviceid)) &&
+ end_offset(err->offset, err->length) >= offset &&
+ err->offset <= end_offset(offset, length);
+}
+
+static bool merge_ds_error(struct nfs4_ff_layout_ds_err *old,
+ struct nfs4_ff_layout_ds_err *new)
+{
+ if (!ds_error_can_merge(old, new->offset, new->length, new->status,
+ new->opnum, &new->stateid, &new->deviceid))
+ return false;
+
+ extend_ds_error(old, new->offset, new->length);
+ return true;
+}
+
+static bool
+ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo,
+ struct nfs4_ff_layout_ds_err *dserr)
+{
+ struct nfs4_ff_layout_ds_err *err;
+
+ list_for_each_entry(err, &flo->error_list, list) {
+ if (merge_ds_error(err, dserr)) {
+ return true;
+ }
+ }
+
+ list_add(&dserr->list, &flo->error_list);
+ return false;
+}
+
+static bool
+ff_layout_update_ds_error(struct nfs4_flexfile_layout *flo, u64 offset,
+ u64 length, int status, enum nfs_opnum4 opnum,
+ nfs4_stateid *stateid, struct nfs4_deviceid *deviceid)
+{
+ bool found = false;
+ struct nfs4_ff_layout_ds_err *err;
+
+ list_for_each_entry(err, &flo->error_list, list) {
+ if (ds_error_can_merge(err, offset, length, status, opnum,
+ stateid, deviceid)) {
+ found = true;
+ extend_ds_error(err, offset, length);
+ break;
+ }
+ }
+
+ return found;
+}
+
+int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
+ struct nfs4_ff_layout_mirror *mirror, u64 offset,
+ u64 length, int status, enum nfs_opnum4 opnum,
+ gfp_t gfp_flags)
+{
+ struct nfs4_ff_layout_ds_err *dserr;
+ bool needfree;
+
+ if (status == 0)
+ return 0;
+
+ if (mirror->mirror_ds == NULL)
+ return -EINVAL;
+
+ spin_lock(&flo->generic_hdr.plh_inode->i_lock);
+ if (ff_layout_update_ds_error(flo, offset, length, status, opnum,
+ &mirror->stateid,
+ &mirror->mirror_ds->id_node.deviceid)) {
+ spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
+ return 0;
+ }
+ spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
+ dserr = kmalloc(sizeof(*dserr), gfp_flags);
+ if (!dserr)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&dserr->list);
+ dserr->offset = offset;
+ dserr->length = length;
+ dserr->status = status;
+ dserr->opnum = opnum;
+ nfs4_stateid_copy(&dserr->stateid, &mirror->stateid);
+ memcpy(&dserr->deviceid, &mirror->mirror_ds->id_node.deviceid,
+ NFS4_DEVICEID4_SIZE);
+
+ spin_lock(&flo->generic_hdr.plh_inode->i_lock);
+ needfree = ff_layout_add_ds_error_locked(flo, dserr);
+ spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
+ if (needfree)
+ kfree(dserr);
+
+ return 0;
+}
+
+/* currently we only support AUTH_NONE and AUTH_SYS */
+static rpc_authflavor_t
+nfs4_ff_layout_choose_authflavor(struct nfs4_ff_layout_mirror *mirror)
+{
+ if (mirror->uid == (u32)-1)
+ return RPC_AUTH_NULL;
+ return RPC_AUTH_UNIX;
+}
+
+/* fetch cred for NFSv3 DS */
+static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror,
+ struct nfs4_pnfs_ds *ds)
+{
+ if (ds->ds_clp && !mirror->cred &&
+ mirror->mirror_ds->ds_versions[0].version == 3) {
+ struct rpc_auth *auth = ds->ds_clp->cl_rpcclient->cl_auth;
+ struct rpc_cred *cred;
+ struct auth_cred acred = {
+ .uid = make_kuid(&init_user_ns, mirror->uid),
+ .gid = make_kgid(&init_user_ns, mirror->gid),
+ };
+
+ /* AUTH_NULL ignores acred */
+ cred = auth->au_ops->lookup_cred(auth, &acred, 0);
+ if (IS_ERR(cred)) {
+ dprintk("%s: lookup_cred failed with %ld\n",
+ __func__, PTR_ERR(cred));
+ return PTR_ERR(cred);
+ } else {
+ mirror->cred = cred;
+ }
+ }
+ return 0;
+}
+
+struct nfs_fh *
+nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx)
+{
+ struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
+ struct nfs_fh *fh = NULL;
+ struct nfs4_deviceid_node *devid;
+
+ if (mirror == NULL || mirror->mirror_ds == NULL ||
+ mirror->mirror_ds->ds == NULL) {
+ printk(KERN_ERR "NFS: %s: No data server for mirror offset index %d\n",
+ __func__, mirror_idx);
+ if (mirror && mirror->mirror_ds) {
+ devid = &mirror->mirror_ds->id_node;
+ pnfs_generic_mark_devid_invalid(devid);
+ }
+ goto out;
+ }
+
+ /* FIXME: For now assume there is only 1 version available for the DS */
+ fh = &mirror->fh_versions[0];
+out:
+ return fh;
+}
+
+/* Upon return, either ds is connected, or ds is NULL */
+struct nfs4_pnfs_ds *
+nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
+ bool fail_return)
+{
+ struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
+ struct nfs4_pnfs_ds *ds = NULL;
+ struct nfs4_deviceid_node *devid;
+ struct inode *ino = lseg->pls_layout->plh_inode;
+ struct nfs_server *s = NFS_SERVER(ino);
+ unsigned int max_payload;
+ rpc_authflavor_t flavor;
+
+ if (mirror == NULL || mirror->mirror_ds == NULL ||
+ mirror->mirror_ds->ds == NULL) {
+ printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
+ __func__, ds_idx);
+ if (mirror && mirror->mirror_ds) {
+ devid = &mirror->mirror_ds->id_node;
+ pnfs_generic_mark_devid_invalid(devid);
+ }
+ goto out;
+ }
+
+ devid = &mirror->mirror_ds->id_node;
+ if (ff_layout_test_devid_unavailable(devid))
+ goto out;
+
+ ds = mirror->mirror_ds->ds;
+ /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
+ smp_rmb();
+ if (ds->ds_clp)
+ goto out;
+
+ flavor = nfs4_ff_layout_choose_authflavor(mirror);
+
+ /* FIXME: For now we assume the server sent only one version of NFS
+ * to use for the DS.
+ */
+ nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo,
+ dataserver_retrans,
+ mirror->mirror_ds->ds_versions[0].version,
+ mirror->mirror_ds->ds_versions[0].minor_version,
+ flavor);
+
+ /* connect success, check rsize/wsize limit */
+ if (ds->ds_clp) {
+ max_payload =
+ nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient),
+ NULL);
+ if (mirror->mirror_ds->ds_versions[0].rsize > max_payload)
+ mirror->mirror_ds->ds_versions[0].rsize = max_payload;
+ if (mirror->mirror_ds->ds_versions[0].wsize > max_payload)
+ mirror->mirror_ds->ds_versions[0].wsize = max_payload;
+ } else {
+ ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
+ mirror, lseg->pls_range.offset,
+ lseg->pls_range.length, NFS4ERR_NXIO,
+ OP_ILLEGAL, GFP_NOIO);
+ if (fail_return) {
+ pnfs_error_mark_layout_for_return(ino, lseg);
+ if (ff_layout_has_available_ds(lseg))
+ pnfs_set_retry_layoutget(lseg->pls_layout);
+ else
+ pnfs_clear_retry_layoutget(lseg->pls_layout);
+
+ } else {
+ if (ff_layout_has_available_ds(lseg))
+ set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+ &lseg->pls_layout->plh_flags);
+ else {
+ pnfs_error_mark_layout_for_return(ino, lseg);
+ pnfs_clear_retry_layoutget(lseg->pls_layout);
+ }
+ }
+ }
+
+ if (ff_layout_update_mirror_cred(mirror, ds))
+ ds = NULL;
+out:
+ return ds;
+}
+
+struct rpc_cred *
+ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx,
+ struct rpc_cred *mdscred)
+{
+ struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
+ struct rpc_cred *cred = ERR_PTR(-EINVAL);
+
+ if (!nfs4_ff_layout_prepare_ds(lseg, ds_idx, true))
+ goto out;
+
+ if (mirror && mirror->cred)
+ cred = mirror->cred;
+ else
+ cred = mdscred;
+out:
+ return cred;
+}
+
+/**
+* Find or create a DS rpc client with th MDS server rpc client auth flavor
+* in the nfs_client cl_ds_clients list.
+*/
+struct rpc_clnt *
+nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, u32 ds_idx,
+ struct nfs_client *ds_clp, struct inode *inode)
+{
+ struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
+
+ switch (mirror->mirror_ds->ds_versions[0].version) {
+ case 3:
+ /* For NFSv3 DS, flavor is set when creating DS connections */
+ return ds_clp->cl_rpcclient;
+ case 4:
+ return nfs4_find_or_create_ds_client(ds_clp, inode);
+ default:
+ BUG();
+ }
+}
+
+static bool is_range_intersecting(u64 offset1, u64 length1,
+ u64 offset2, u64 length2)
+{
+ u64 end1 = end_offset(offset1, length1);
+ u64 end2 = end_offset(offset2, length2);
+
+ return (end1 == NFS4_MAX_UINT64 || end1 > offset2) &&
+ (end2 == NFS4_MAX_UINT64 || end2 > offset1);
+}
+
+/* called with inode i_lock held */
+int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
+ struct xdr_stream *xdr, int *count,
+ const struct pnfs_layout_range *range)
+{
+ struct nfs4_ff_layout_ds_err *err, *n;
+ __be32 *p;
+
+ list_for_each_entry_safe(err, n, &flo->error_list, list) {
+ if (!is_range_intersecting(err->offset, err->length,
+ range->offset, range->length))
+ continue;
+ /* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE)
+ * + deviceid(NFS4_DEVICEID4_SIZE) + status(4) + opnum(4)
+ */
+ p = xdr_reserve_space(xdr,
+ 24 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE);
+ if (unlikely(!p))
+ return -ENOBUFS;
+ p = xdr_encode_hyper(p, err->offset);
+ p = xdr_encode_hyper(p, err->length);
+ p = xdr_encode_opaque_fixed(p, &err->stateid,
+ NFS4_STATEID_SIZE);
+ p = xdr_encode_opaque_fixed(p, &err->deviceid,
+ NFS4_DEVICEID4_SIZE);
+ *p++ = cpu_to_be32(err->status);
+ *p++ = cpu_to_be32(err->opnum);
+ *count += 1;
+ list_del(&err->list);
+ dprintk("%s: offset %llu length %llu status %d op %d count %d\n",
+ __func__, err->offset, err->length, err->status,
+ err->opnum, *count);
+ kfree(err);
+ }
+
+ return 0;
+}
+
+bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
+{
+ struct nfs4_ff_layout_mirror *mirror;
+ struct nfs4_deviceid_node *devid;
+ int idx;
+
+ for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ if (mirror && mirror->mirror_ds) {
+ devid = &mirror->mirror_ds->id_node;
+ if (!ff_layout_test_devid_unavailable(devid))
+ return true;
+ }
+ }
+
+ return false;
+}
+
+module_param(dataserver_retrans, uint, 0644);
+MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client "
+ "retries a request before it attempts further "
+ " recovery action.");
+module_param(dataserver_timeo, uint, 0644);
+MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the "
+ "NFSv4.1 client waits for a response from a "
+ " data server before it retries an NFS request.");
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 2f5db844c172..857e2a99acc8 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -152,7 +152,7 @@ void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *f
nfs_fattr_free_group_name(fattr);
}
-static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
+int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
{
unsigned long val;
char buf[16];
@@ -166,6 +166,7 @@ static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *re
*res = val;
return 1;
}
+EXPORT_SYMBOL_GPL(nfs_map_string_to_numeric);
static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen)
{
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 4bffe637ea32..83107be3dd01 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -352,8 +352,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
nfs_attr_check_mountpoint(sb, fattr);
- if (((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) &&
- !nfs_attr_use_mounted_on_fileid(fattr))
+ if (nfs_attr_use_mounted_on_fileid(fattr))
+ fattr->fileid = fattr->mounted_on_fileid;
+ else if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0)
goto out_no_inode;
if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0)
goto out_no_inode;
@@ -387,7 +388,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
if (S_ISREG(inode->i_mode)) {
inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops;
inode->i_data.a_ops = &nfs_file_aops;
- inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info;
} else if (S_ISDIR(inode->i_mode)) {
inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
inode->i_fop = &nfs_dir_operations;
@@ -506,10 +506,15 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
attr->ia_valid &= ~ATTR_MODE;
if (attr->ia_valid & ATTR_SIZE) {
+ loff_t i_size;
+
BUG_ON(!S_ISREG(inode->i_mode));
- if (attr->ia_size == i_size_read(inode))
+ i_size = i_size_read(inode);
+ if (attr->ia_size == i_size)
attr->ia_valid &= ~ATTR_SIZE;
+ else if (attr->ia_size < i_size && IS_SWAPFILE(inode))
+ return -ETXTBSY;
}
/* Optimization: if the end result is no change, don't RPC */
@@ -1770,7 +1775,6 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
#if IS_ENABLED(CONFIG_NFS_V4)
INIT_LIST_HEAD(&nfsi->open_states);
nfsi->delegation = NULL;
- nfsi->delegation_state = 0;
init_rwsem(&nfsi->rwsem);
nfsi->layout = NULL;
#endif
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index efaa31c70fbe..b802fb3a2d99 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -6,6 +6,7 @@
#include <linux/mount.h>
#include <linux/security.h>
#include <linux/crc32.h>
+#include <linux/nfs_page.h>
#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
@@ -31,8 +32,6 @@ static inline int nfs_attr_use_mounted_on_fileid(struct nfs_fattr *fattr)
(((fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) &&
((fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) == 0)))
return 0;
-
- fattr->fileid = fattr->mounted_on_fileid;
return 1;
}
@@ -189,9 +188,15 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
const struct sockaddr *ds_addr,
int ds_addrlen, int ds_proto,
unsigned int ds_timeo,
- unsigned int ds_retrans);
+ unsigned int ds_retrans,
+ u32 minor_version,
+ rpc_authflavor_t au_flavor);
extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
struct inode *);
+extern struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
+ const struct sockaddr *ds_addr, int ds_addrlen,
+ int ds_proto, unsigned int ds_timeo,
+ unsigned int ds_retrans, rpc_authflavor_t au_flavor);
#ifdef CONFIG_PROC_FS
extern int __init nfs_fs_proc_init(void);
extern void nfs_fs_proc_exit(void);
@@ -244,9 +249,12 @@ struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *);
void nfs_pgio_header_free(struct nfs_pgio_header *);
void nfs_pgio_data_destroy(struct nfs_pgio_header *);
int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
-int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_header *,
- const struct rpc_call_ops *, int, int);
+int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
+ struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops,
+ const struct rpc_call_ops *call_ops, int how, int flags);
void nfs_free_request(struct nfs_page *req);
+struct nfs_pgio_mirror *
+nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc);
static inline void nfs_iocounter_init(struct nfs_io_counter *c)
{
@@ -254,6 +262,12 @@ static inline void nfs_iocounter_init(struct nfs_io_counter *c)
atomic_set(&c->io_count, 0);
}
+static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc)
+{
+ WARN_ON_ONCE(desc->pg_mirror_count < 1);
+ return desc->pg_mirror_count > 1;
+}
+
/* nfs2xdr.c */
extern struct rpc_procinfo nfs_procedures[];
extern int nfs2_decode_dirent(struct xdr_stream *,
@@ -377,7 +391,7 @@ extern struct rpc_stat nfs_rpcstat;
extern int __init register_nfs_fs(void);
extern void __exit unregister_nfs_fs(void);
-extern void nfs_sb_active(struct super_block *sb);
+extern bool nfs_sb_active(struct super_block *sb);
extern void nfs_sb_deactive(struct super_block *sb);
/* namespace.c */
@@ -416,7 +430,6 @@ int nfs_show_options(struct seq_file *, struct dentry *);
int nfs_show_devname(struct seq_file *, struct dentry *);
int nfs_show_path(struct seq_file *, struct dentry *);
int nfs_show_stats(struct seq_file *, struct dentry *);
-void nfs_put_super(struct super_block *);
int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
/* write.c */
@@ -429,6 +442,7 @@ extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
extern void nfs_commit_prepare(struct rpc_task *task, void *calldata);
extern int nfs_initiate_commit(struct rpc_clnt *clnt,
struct nfs_commit_data *data,
+ const struct nfs_rpc_ops *nfs_ops,
const struct rpc_call_ops *call_ops,
int how, int flags);
extern void nfs_init_commit(struct nfs_commit_data *data,
@@ -442,13 +456,15 @@ int nfs_scan_commit(struct inode *inode, struct list_head *dst,
struct nfs_commit_info *cinfo);
void nfs_mark_request_commit(struct nfs_page *req,
struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo);
+ struct nfs_commit_info *cinfo,
+ u32 ds_commit_idx);
int nfs_write_need_commit(struct nfs_pgio_header *);
int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
int how, struct nfs_commit_info *cinfo);
void nfs_retry_commit(struct list_head *page_list,
struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo);
+ struct nfs_commit_info *cinfo,
+ u32 ds_commit_idx);
void nfs_commitdata_release(struct nfs_commit_data *data);
void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
struct nfs_commit_info *cinfo);
@@ -459,6 +475,7 @@ void nfs_init_cinfo(struct nfs_commit_info *cinfo,
struct nfs_direct_req *dreq);
int nfs_key_timeout_notify(struct file *filp, struct inode *inode);
bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx);
+void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio);
#ifdef CONFIG_MIGRATION
extern int nfs_migrate_page(struct address_space *,
@@ -482,6 +499,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode)
inode_dio_wait(inode);
}
extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
+extern void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq);
/* nfs4proc.c */
extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
@@ -495,6 +513,26 @@ extern int nfs41_walk_client_list(struct nfs_client *clp,
struct nfs_client **result,
struct rpc_cred *cred);
+static inline struct inode *nfs_igrab_and_active(struct inode *inode)
+{
+ inode = igrab(inode);
+ if (inode != NULL && !nfs_sb_active(inode->i_sb)) {
+ iput(inode);
+ inode = NULL;
+ }
+ return inode;
+}
+
+static inline void nfs_iput_and_deactive(struct inode *inode)
+{
+ if (inode != NULL) {
+ struct super_block *sb = inode->i_sb;
+
+ iput(inode);
+ nfs_sb_deactive(sb);
+ }
+}
+
/*
* Determine the device name as a string
*/
@@ -560,6 +598,19 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
}
/*
+ * Record the page as unstable and mark its inode as dirty.
+ */
+static inline
+void nfs_mark_page_unstable(struct page *page)
+{
+ struct inode *inode = page_file_mapping(page)->host;
+
+ inc_zone_page_state(page, NR_UNSTABLE_NFS);
+ inc_bdi_stat(inode_to_bdi(inode), BDI_RECLAIMABLE);
+ __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+}
+
+/*
* Determine the number of bytes of data the page contains
*/
static inline
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 5f61b83f4a1c..b4e03ed8599d 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -481,7 +481,8 @@ out_overflow:
* void;
* };
*/
-static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result)
+static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result,
+ __u32 *op_status)
{
enum nfs_stat status;
int error;
@@ -489,6 +490,8 @@ static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result)
error = decode_stat(xdr, &status);
if (unlikely(error))
goto out;
+ if (op_status)
+ *op_status = status;
if (status != NFS_OK)
goto out_default;
error = decode_fattr(xdr, result);
@@ -808,7 +811,7 @@ out_default:
static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr,
struct nfs_fattr *result)
{
- return decode_attrstat(xdr, result);
+ return decode_attrstat(xdr, result, NULL);
}
static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr,
@@ -865,6 +868,7 @@ static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr,
error = decode_stat(xdr, &status);
if (unlikely(error))
goto out;
+ result->op_status = status;
if (status != NFS_OK)
goto out_default;
error = decode_fattr(xdr, result->fattr);
@@ -882,7 +886,7 @@ static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr,
{
/* All NFSv2 writes are "file sync" writes */
result->verf->committed = NFS_FILE_SYNC;
- return decode_attrstat(xdr, result->fattr);
+ return decode_attrstat(xdr, result->fattr, &result->op_status);
}
/**
diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h
index 333ae4068506..e134d6548ab7 100644
--- a/fs/nfs/nfs3_fs.h
+++ b/fs/nfs/nfs3_fs.h
@@ -30,5 +30,7 @@ struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subver
struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
struct nfs_fattr *, rpc_authflavor_t);
+/* nfs3super.c */
+extern struct nfs_subversion nfs_v3;
#endif /* __LINUX_FS_NFS_NFS3_FS_H */
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index 8c1b437c5403..9e9fa347a948 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -1,5 +1,6 @@
#include <linux/nfs_fs.h>
#include <linux/nfs_mount.h>
+#include <linux/sunrpc/addr.h>
#include "internal.h"
#include "nfs3_fs.h"
@@ -64,3 +65,43 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source,
nfs_init_server_aclclient(server);
return server;
}
+
+/*
+ * Set up a pNFS Data Server client over NFSv3.
+ *
+ * Return any existing nfs_client that matches server address,port,version
+ * and minorversion.
+ *
+ * For a new nfs_client, use a soft mount (default), a low retrans and a
+ * low timeout interval so that if a connection is lost, we retry through
+ * the MDS.
+ */
+struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
+ const struct sockaddr *ds_addr, int ds_addrlen,
+ int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
+ rpc_authflavor_t au_flavor)
+{
+ struct nfs_client_initdata cl_init = {
+ .addr = ds_addr,
+ .addrlen = ds_addrlen,
+ .nfs_mod = &nfs_v3,
+ .proto = ds_proto,
+ .net = mds_clp->cl_net,
+ };
+ struct rpc_timeout ds_timeout;
+ struct nfs_client *clp;
+ char buf[INET6_ADDRSTRLEN + 1];
+
+ /* fake a hostname because lockd wants it */
+ if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0)
+ return ERR_PTR(-EINVAL);
+ cl_init.hostname = buf;
+
+ /* Use the MDS nfs_client cl_ipaddr. */
+ nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
+ clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
+ au_flavor);
+
+ return clp;
+}
+EXPORT_SYMBOL_GPL(nfs3_set_ds_client);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 524f9f837408..78e557c3ab87 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -800,6 +800,9 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
{
struct inode *inode = hdr->inode;
+ if (hdr->pgio_done_cb != NULL)
+ return hdr->pgio_done_cb(task, hdr);
+
if (nfs3_async_handle_jukebox(task, inode))
return -EAGAIN;
@@ -825,6 +828,9 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
{
struct inode *inode = hdr->inode;
+ if (hdr->pgio_done_cb != NULL)
+ return hdr->pgio_done_cb(task, hdr);
+
if (nfs3_async_handle_jukebox(task, inode))
return -EAGAIN;
if (task->tk_status >= 0)
@@ -845,6 +851,9 @@ static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commi
static int nfs3_commit_done(struct rpc_task *task, struct nfs_commit_data *data)
{
+ if (data->commit_done_cb != NULL)
+ return data->commit_done_cb(task, data);
+
if (nfs3_async_handle_jukebox(task, data->inode))
return -EAGAIN;
nfs_refresh_inode(data->inode, data->res.fattr);
diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c
index 6af29c2da352..5c4394e4656b 100644
--- a/fs/nfs/nfs3super.c
+++ b/fs/nfs/nfs3super.c
@@ -7,7 +7,7 @@
#include "nfs3_fs.h"
#include "nfs.h"
-static struct nfs_subversion nfs_v3 = {
+struct nfs_subversion nfs_v3 = {
.owner = THIS_MODULE,
.nfs_fs = &nfs_fs_type,
.rpc_vers = &nfs_version3,
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 8f4cbe7f4aa8..2a932fdc57cb 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1636,6 +1636,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
error = decode_post_op_attr(xdr, result->fattr);
if (unlikely(error))
goto out;
+ result->op_status = status;
if (status != NFS3_OK)
goto out_status;
error = decode_read3resok(xdr, result);
@@ -1708,6 +1709,7 @@ static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr,
error = decode_wcc_data(xdr, result->fattr);
if (unlikely(error))
goto out;
+ result->op_status = status;
if (status != NFS3_OK)
goto out_status;
error = decode_write3resok(xdr, result);
@@ -2323,6 +2325,7 @@ static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req,
error = decode_wcc_data(xdr, result->fattr);
if (unlikely(error))
goto out;
+ result->op_status = status;
if (status != NFS3_OK)
goto out_status;
error = decode_writeverf3(xdr, &result->verf->verifier);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a08178764cf9..fdef424b0cd3 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -44,6 +44,7 @@ enum nfs4_client_state {
#define NFS4_RENEW_TIMEOUT 0x01
#define NFS4_RENEW_DELEGATION_CB 0x02
+struct nfs_seqid_counter;
struct nfs4_minor_version_ops {
u32 minor_version;
unsigned init_caps;
@@ -56,6 +57,8 @@ struct nfs4_minor_version_ops {
struct nfs_fsinfo *);
void (*free_lock_state)(struct nfs_server *,
struct nfs4_lock_state *);
+ struct nfs_seqid *
+ (*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
const struct rpc_call_ops *call_sync_ops;
const struct nfs4_state_recovery_ops *reboot_recovery_ops;
const struct nfs4_state_recovery_ops *nograce_recovery_ops;
@@ -443,6 +446,12 @@ extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid);
extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid);
extern void nfs_release_seqid(struct nfs_seqid *seqid);
extern void nfs_free_seqid(struct nfs_seqid *seqid);
+extern int nfs40_setup_sequence(struct nfs4_slot_table *tbl,
+ struct nfs4_sequence_args *args,
+ struct nfs4_sequence_res *res,
+ struct rpc_task *task);
+extern int nfs4_sequence_done(struct rpc_task *task,
+ struct nfs4_sequence_res *res);
extern void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp);
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 03311259b0c4..8646af9b11d2 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -228,6 +228,7 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
kfree(clp->cl_serverowner);
kfree(clp->cl_serverscope);
kfree(clp->cl_implid);
+ kfree(clp->cl_owner_id);
}
void nfs4_free_client(struct nfs_client *clp)
@@ -452,6 +453,14 @@ static void nfs4_swap_callback_idents(struct nfs_client *keep,
spin_unlock(&nn->nfs_client_lock);
}
+static bool nfs4_match_client_owner_id(const struct nfs_client *clp1,
+ const struct nfs_client *clp2)
+{
+ if (clp1->cl_owner_id == NULL || clp2->cl_owner_id == NULL)
+ return true;
+ return strcmp(clp1->cl_owner_id, clp2->cl_owner_id) == 0;
+}
+
/**
* nfs40_walk_client_list - Find server that recognizes a client ID
*
@@ -483,9 +492,6 @@ int nfs40_walk_client_list(struct nfs_client *new,
if (pos->rpc_ops != new->rpc_ops)
continue;
- if (pos->cl_proto != new->cl_proto)
- continue;
-
if (pos->cl_minorversion != new->cl_minorversion)
continue;
@@ -510,6 +516,9 @@ int nfs40_walk_client_list(struct nfs_client *new,
if (pos->cl_clientid != new->cl_clientid)
continue;
+ if (!nfs4_match_client_owner_id(pos, new))
+ continue;
+
atomic_inc(&pos->cl_count);
spin_unlock(&nn->nfs_client_lock);
@@ -566,20 +575,14 @@ static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b)
}
/*
- * Returns true if the server owners match
+ * Returns true if the server major ids match
*/
static bool
-nfs4_match_serverowners(struct nfs_client *a, struct nfs_client *b)
+nfs4_check_clientid_trunking(struct nfs_client *a, struct nfs_client *b)
{
struct nfs41_server_owner *o1 = a->cl_serverowner;
struct nfs41_server_owner *o2 = b->cl_serverowner;
- if (o1->minor_id != o2->minor_id) {
- dprintk("NFS: --> %s server owner minor IDs do not match\n",
- __func__);
- return false;
- }
-
if (o1->major_id_sz != o2->major_id_sz)
goto out_major_mismatch;
if (memcmp(o1->major_id, o2->major_id, o1->major_id_sz) != 0)
@@ -621,9 +624,6 @@ int nfs41_walk_client_list(struct nfs_client *new,
if (pos->rpc_ops != new->rpc_ops)
continue;
- if (pos->cl_proto != new->cl_proto)
- continue;
-
if (pos->cl_minorversion != new->cl_minorversion)
continue;
@@ -639,7 +639,7 @@ int nfs41_walk_client_list(struct nfs_client *new,
prev = pos;
status = nfs_wait_client_init_complete(pos);
- if (status == 0) {
+ if (pos->cl_cons_state == NFS_CS_SESSION_INITING) {
nfs4_schedule_lease_recovery(pos);
status = nfs4_wait_clnt_recover(pos);
}
@@ -654,7 +654,19 @@ int nfs41_walk_client_list(struct nfs_client *new,
if (!nfs4_match_clientids(pos, new))
continue;
- if (!nfs4_match_serverowners(pos, new))
+ /*
+ * Note that session trunking is just a special subcase of
+ * client id trunking. In either case, we want to fall back
+ * to using the existing nfs_client.
+ */
+ if (!nfs4_check_clientid_trunking(pos, new))
+ continue;
+
+ /* Unlike NFSv4.0, we know that NFSv4.1 always uses the
+ * uniform string, however someone might switch the
+ * uniquifier string on us.
+ */
+ if (!nfs4_match_client_owner_id(pos, new))
continue;
atomic_inc(&pos->cl_count);
@@ -837,14 +849,15 @@ error:
*/
struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
const struct sockaddr *ds_addr, int ds_addrlen,
- int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans)
+ int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
+ u32 minor_version, rpc_authflavor_t au_flavor)
{
struct nfs_client_initdata cl_init = {
.addr = ds_addr,
.addrlen = ds_addrlen,
.nfs_mod = &nfs_v4,
.proto = ds_proto,
- .minorversion = mds_clp->cl_minorversion,
+ .minorversion = minor_version,
.net = mds_clp->cl_net,
};
struct rpc_timeout ds_timeout;
@@ -862,7 +875,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
*/
nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
- mds_clp->cl_rpcclient->cl_auth->au_flavor);
+ au_flavor);
dprintk("<-- %s %p\n", __func__, clp);
return clp;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e7f8d5ff2581..88180ac5ea0e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -495,12 +495,11 @@ static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args)
args->sa_privileged = 1;
}
-static int nfs40_setup_sequence(const struct nfs_server *server,
- struct nfs4_sequence_args *args,
- struct nfs4_sequence_res *res,
- struct rpc_task *task)
+int nfs40_setup_sequence(struct nfs4_slot_table *tbl,
+ struct nfs4_sequence_args *args,
+ struct nfs4_sequence_res *res,
+ struct rpc_task *task)
{
- struct nfs4_slot_table *tbl = server->nfs_client->cl_slot_tbl;
struct nfs4_slot *slot;
/* slot already allocated? */
@@ -535,6 +534,7 @@ out_sleep:
spin_unlock(&tbl->slot_tbl_lock);
return -EAGAIN;
}
+EXPORT_SYMBOL_GPL(nfs40_setup_sequence);
static int nfs40_sequence_done(struct rpc_task *task,
struct nfs4_sequence_res *res)
@@ -694,8 +694,7 @@ out_retry:
}
EXPORT_SYMBOL_GPL(nfs41_sequence_done);
-static int nfs4_sequence_done(struct rpc_task *task,
- struct nfs4_sequence_res *res)
+int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
{
if (res->sr_slot == NULL)
return 1;
@@ -703,6 +702,7 @@ static int nfs4_sequence_done(struct rpc_task *task,
return nfs40_sequence_done(task, res);
return nfs41_sequence_done(task, res);
}
+EXPORT_SYMBOL_GPL(nfs4_sequence_done);
int nfs41_setup_sequence(struct nfs4_session *session,
struct nfs4_sequence_args *args,
@@ -777,7 +777,8 @@ static int nfs4_setup_sequence(const struct nfs_server *server,
int ret = 0;
if (!session)
- return nfs40_setup_sequence(server, args, res, task);
+ return nfs40_setup_sequence(server->nfs_client->cl_slot_tbl,
+ args, res, task);
dprintk("--> %s clp %p session %p sr_slot %u\n",
__func__, session->clp, session, res->sr_slot ?
@@ -818,14 +819,16 @@ static int nfs4_setup_sequence(const struct nfs_server *server,
struct nfs4_sequence_res *res,
struct rpc_task *task)
{
- return nfs40_setup_sequence(server, args, res, task);
+ return nfs40_setup_sequence(server->nfs_client->cl_slot_tbl,
+ args, res, task);
}
-static int nfs4_sequence_done(struct rpc_task *task,
- struct nfs4_sequence_res *res)
+int nfs4_sequence_done(struct rpc_task *task,
+ struct nfs4_sequence_res *res)
{
return nfs40_sequence_done(task, res);
}
+EXPORT_SYMBOL_GPL(nfs4_sequence_done);
#endif /* !CONFIG_NFS_V4_1 */
@@ -937,6 +940,31 @@ static bool nfs4_clear_cap_atomic_open_v1(struct nfs_server *server,
return true;
}
+static u32
+nfs4_map_atomic_open_share(struct nfs_server *server,
+ fmode_t fmode, int openflags)
+{
+ u32 res = 0;
+
+ switch (fmode & (FMODE_READ | FMODE_WRITE)) {
+ case FMODE_READ:
+ res = NFS4_SHARE_ACCESS_READ;
+ break;
+ case FMODE_WRITE:
+ res = NFS4_SHARE_ACCESS_WRITE;
+ break;
+ case FMODE_READ|FMODE_WRITE:
+ res = NFS4_SHARE_ACCESS_BOTH;
+ }
+ if (!(server->caps & NFS_CAP_ATOMIC_OPEN_V1))
+ goto out;
+ /* Want no delegation if we're using O_DIRECT */
+ if (openflags & O_DIRECT)
+ res |= NFS4_SHARE_WANT_NO_DELEG;
+out:
+ return res;
+}
+
static enum open_claim_type4
nfs4_map_atomic_open_claim(struct nfs_server *server,
enum open_claim_type4 claim)
@@ -977,6 +1005,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
struct dentry *parent = dget_parent(dentry);
struct inode *dir = parent->d_inode;
struct nfs_server *server = NFS_SERVER(dir);
+ struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
struct nfs4_opendata *p;
p = kzalloc(sizeof(*p), gfp_mask);
@@ -987,8 +1016,9 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
if (IS_ERR(p->f_label))
goto err_free_p;
- p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask);
- if (p->o_arg.seqid == NULL)
+ alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
+ p->o_arg.seqid = alloc_seqid(&sp->so_seqid, gfp_mask);
+ if (IS_ERR(p->o_arg.seqid))
goto err_free_label;
nfs_sb_active(dentry->d_sb);
p->dentry = dget(dentry);
@@ -997,6 +1027,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
atomic_inc(&sp->so_count);
p->o_arg.open_flags = flags;
p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);
+ p->o_arg.share_access = nfs4_map_atomic_open_share(server,
+ fmode, flags);
/* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS
* will return permission denied for all bits until close */
if (!(flags & O_EXCL)) {
@@ -1117,8 +1149,6 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)
return 0;
if ((delegation->type & fmode) != fmode)
return 0;
- if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags))
- return 0;
if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
return 0;
nfs_mark_delegation_referenced(delegation);
@@ -1169,6 +1199,16 @@ static bool nfs_need_update_open_stateid(struct nfs4_state *state,
return false;
}
+static void nfs_resync_open_stateid_locked(struct nfs4_state *state)
+{
+ if (state->n_wronly)
+ set_bit(NFS_O_WRONLY_STATE, &state->flags);
+ if (state->n_rdonly)
+ set_bit(NFS_O_RDONLY_STATE, &state->flags);
+ if (state->n_rdwr)
+ set_bit(NFS_O_RDWR_STATE, &state->flags);
+}
+
static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
nfs4_stateid *stateid, fmode_t fmode)
{
@@ -1187,8 +1227,12 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
}
if (stateid == NULL)
return;
- if (!nfs_need_update_open_stateid(state, stateid))
+ /* Handle races with OPEN */
+ if (!nfs4_stateid_match_other(stateid, &state->open_stateid) ||
+ !nfs4_stateid_is_newer(stateid, &state->open_stateid)) {
+ nfs_resync_open_stateid_locked(state);
return;
+ }
if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
nfs4_stateid_copy(&state->stateid, stateid);
nfs4_stateid_copy(&state->open_stateid, stateid);
@@ -1283,6 +1327,23 @@ no_delegation:
return ret;
}
+static bool nfs4_update_lock_stateid(struct nfs4_lock_state *lsp,
+ const nfs4_stateid *stateid)
+{
+ struct nfs4_state *state = lsp->ls_state;
+ bool ret = false;
+
+ spin_lock(&state->state_lock);
+ if (!nfs4_stateid_match_other(stateid, &lsp->ls_stateid))
+ goto out_noupdate;
+ if (!nfs4_stateid_is_newer(stateid, &lsp->ls_stateid))
+ goto out_noupdate;
+ nfs4_stateid_copy(&lsp->ls_stateid, stateid);
+ ret = true;
+out_noupdate:
+ spin_unlock(&state->state_lock);
+ return ret;
+}
static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmode)
{
@@ -1681,8 +1742,8 @@ static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata)
{
struct nfs4_opendata *data = calldata;
- nfs40_setup_sequence(data->o_arg.server, &data->c_arg.seq_args,
- &data->c_res.seq_res, task);
+ nfs40_setup_sequence(data->o_arg.server->nfs_client->cl_slot_tbl,
+ &data->c_arg.seq_args, &data->c_res.seq_res, task);
}
static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
@@ -2589,6 +2650,11 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
case -NFS4ERR_OLD_STATEID:
case -NFS4ERR_BAD_STATEID:
case -NFS4ERR_EXPIRED:
+ if (!nfs4_stateid_match(&calldata->arg.stateid,
+ &state->stateid)) {
+ rpc_restart_call_prepare(task);
+ goto out_release;
+ }
if (calldata->arg.fmode == 0)
break;
default:
@@ -2621,6 +2687,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags);
is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags);
is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags);
+ nfs4_stateid_copy(&calldata->arg.stateid, &state->stateid);
/* Calculate the change in open mode */
calldata->arg.fmode = 0;
if (state->n_rdwr == 0) {
@@ -2655,6 +2722,9 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
goto out_wait;
}
}
+ calldata->arg.share_access =
+ nfs4_map_atomic_open_share(NFS_SERVER(inode),
+ calldata->arg.fmode, 0);
nfs_fattr_init(calldata->res.fattr);
calldata->timestamp = jiffies;
@@ -2677,45 +2747,10 @@ static const struct rpc_call_ops nfs4_close_ops = {
.rpc_release = nfs4_free_closedata,
};
-static bool nfs4_state_has_opener(struct nfs4_state *state)
-{
- /* first check existing openers */
- if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0 &&
- state->n_rdonly != 0)
- return true;
-
- if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0 &&
- state->n_wronly != 0)
- return true;
-
- if (test_bit(NFS_O_RDWR_STATE, &state->flags) != 0 &&
- state->n_rdwr != 0)
- return true;
-
- return false;
-}
-
static bool nfs4_roc(struct inode *inode)
{
- struct nfs_inode *nfsi = NFS_I(inode);
- struct nfs_open_context *ctx;
- struct nfs4_state *state;
-
- spin_lock(&inode->i_lock);
- list_for_each_entry(ctx, &nfsi->open_files, list) {
- state = ctx->state;
- if (state == NULL)
- continue;
- if (nfs4_state_has_opener(state)) {
- spin_unlock(&inode->i_lock);
- return false;
- }
- }
- spin_unlock(&inode->i_lock);
-
- if (nfs4_check_delegation(inode, FMODE_READ))
+ if (!nfs_have_layout(inode))
return false;
-
return pnfs_roc(inode);
}
@@ -2733,6 +2768,7 @@ static bool nfs4_roc(struct inode *inode)
int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
{
struct nfs_server *server = NFS_SERVER(state->inode);
+ struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
struct nfs4_closedata *calldata;
struct nfs4_state_owner *sp = state->owner;
struct rpc_task *task;
@@ -2759,10 +2795,10 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
calldata->inode = state->inode;
calldata->state = state;
calldata->arg.fh = NFS_FH(state->inode);
- calldata->arg.stateid = &state->open_stateid;
/* Serialization for the sequence id */
- calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid, gfp_mask);
- if (calldata->arg.seqid == NULL)
+ alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
+ calldata->arg.seqid = alloc_seqid(&state->owner->so_seqid, gfp_mask);
+ if (IS_ERR(calldata->arg.seqid))
goto out_free_calldata;
calldata->arg.fmode = 0;
calldata->arg.bitmask = server->cache_consistency_bitmask;
@@ -4917,11 +4953,14 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp,
}
static unsigned int
-nfs4_init_nonuniform_client_string(const struct nfs_client *clp,
+nfs4_init_nonuniform_client_string(struct nfs_client *clp,
char *buf, size_t len)
{
unsigned int result;
+ if (clp->cl_owner_id != NULL)
+ return strlcpy(buf, clp->cl_owner_id, len);
+
rcu_read_lock();
result = scnprintf(buf, len, "Linux NFSv4.0 %s/%s %s",
clp->cl_ipaddr,
@@ -4930,24 +4969,32 @@ nfs4_init_nonuniform_client_string(const struct nfs_client *clp,
rpc_peeraddr2str(clp->cl_rpcclient,
RPC_DISPLAY_PROTO));
rcu_read_unlock();
+ clp->cl_owner_id = kstrdup(buf, GFP_KERNEL);
return result;
}
static unsigned int
-nfs4_init_uniform_client_string(const struct nfs_client *clp,
+nfs4_init_uniform_client_string(struct nfs_client *clp,
char *buf, size_t len)
{
const char *nodename = clp->cl_rpcclient->cl_nodename;
+ unsigned int result;
+
+ if (clp->cl_owner_id != NULL)
+ return strlcpy(buf, clp->cl_owner_id, len);
if (nfs4_client_id_uniquifier[0] != '\0')
- return scnprintf(buf, len, "Linux NFSv%u.%u %s/%s",
+ result = scnprintf(buf, len, "Linux NFSv%u.%u %s/%s",
clp->rpc_ops->version,
clp->cl_minorversion,
nfs4_client_id_uniquifier,
nodename);
- return scnprintf(buf, len, "Linux NFSv%u.%u %s",
+ else
+ result = scnprintf(buf, len, "Linux NFSv%u.%u %s",
clp->rpc_ops->version, clp->cl_minorversion,
nodename);
+ clp->cl_owner_id = kstrdup(buf, GFP_KERNEL);
+ return result;
}
/*
@@ -5128,9 +5175,13 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
static void nfs4_delegreturn_release(void *calldata)
{
struct nfs4_delegreturndata *data = calldata;
+ struct inode *inode = data->inode;
- if (data->roc)
- pnfs_roc_release(data->inode);
+ if (inode) {
+ if (data->roc)
+ pnfs_roc_release(inode);
+ nfs_iput_and_deactive(inode);
+ }
kfree(calldata);
}
@@ -5187,9 +5238,9 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
nfs_fattr_init(data->res.fattr);
data->timestamp = jiffies;
data->rpc_status = 0;
- data->inode = inode;
- data->roc = list_empty(&NFS_I(inode)->open_files) ?
- pnfs_roc(inode) : false;
+ data->inode = nfs_igrab_and_active(inode);
+ if (data->inode)
+ data->roc = nfs4_roc(inode);
task_setup_data.callback_data = data;
msg.rpc_argp = &data->args;
@@ -5344,7 +5395,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
p->arg.fl = &p->fl;
p->arg.seqid = seqid;
p->res.seqid = seqid;
- p->arg.stateid = &lsp->ls_stateid;
p->lsp = lsp;
atomic_inc(&lsp->ls_count);
/* Ensure we don't close file until we're done freeing locks! */
@@ -5371,14 +5421,18 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
return;
switch (task->tk_status) {
case 0:
- nfs4_stateid_copy(&calldata->lsp->ls_stateid,
- &calldata->res.stateid);
renew_lease(calldata->server, calldata->timestamp);
- break;
+ do_vfs_lock(calldata->fl.fl_file, &calldata->fl);
+ if (nfs4_update_lock_stateid(calldata->lsp,
+ &calldata->res.stateid))
+ break;
case -NFS4ERR_BAD_STATEID:
case -NFS4ERR_OLD_STATEID:
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_EXPIRED:
+ if (!nfs4_stateid_match(&calldata->arg.stateid,
+ &calldata->lsp->ls_stateid))
+ rpc_restart_call_prepare(task);
break;
default:
if (nfs4_async_handle_error(task, calldata->server,
@@ -5394,6 +5448,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
goto out_wait;
+ nfs4_stateid_copy(&calldata->arg.stateid, &calldata->lsp->ls_stateid);
if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) {
/* Note: exit _without_ running nfs4_locku_done */
goto out_no_action;
@@ -5464,6 +5519,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
struct nfs_seqid *seqid;
struct nfs4_lock_state *lsp;
struct rpc_task *task;
+ struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
int status = 0;
unsigned char fl_flags = request->fl_flags;
@@ -5487,9 +5543,10 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
lsp = request->fl_u.nfs4_fl.owner;
if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) == 0)
goto out;
- seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL);
+ alloc_seqid = NFS_SERVER(inode)->nfs_client->cl_mvops->alloc_seqid;
+ seqid = alloc_seqid(&lsp->ls_seqid, GFP_KERNEL);
status = -ENOMEM;
- if (seqid == NULL)
+ if (IS_ERR(seqid))
goto out;
task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid);
status = PTR_ERR(task);
@@ -5522,6 +5579,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
struct nfs4_lockdata *p;
struct inode *inode = lsp->ls_state->inode;
struct nfs_server *server = NFS_SERVER(inode);
+ struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
p = kzalloc(sizeof(*p), gfp_mask);
if (p == NULL)
@@ -5530,12 +5588,12 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
p->arg.fh = NFS_FH(inode);
p->arg.fl = &p->fl;
p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask);
- if (p->arg.open_seqid == NULL)
+ if (IS_ERR(p->arg.open_seqid))
goto out_free;
- p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid, gfp_mask);
- if (p->arg.lock_seqid == NULL)
+ alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
+ p->arg.lock_seqid = alloc_seqid(&lsp->ls_seqid, gfp_mask);
+ if (IS_ERR(p->arg.lock_seqid))
goto out_free_seqid;
- p->arg.lock_stateid = &lsp->ls_stateid;
p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
p->arg.lock_owner.id = lsp->ls_seqid.owner_id;
p->arg.lock_owner.s_dev = server->s_dev;
@@ -5562,15 +5620,19 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0)
goto out_wait;
/* Do we need to do an open_to_lock_owner? */
- if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) {
+ if (!test_bit(NFS_LOCK_INITIALIZED, &data->lsp->ls_flags)) {
if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) {
goto out_release_lock_seqid;
}
- data->arg.open_stateid = &state->open_stateid;
+ nfs4_stateid_copy(&data->arg.open_stateid,
+ &state->open_stateid);
data->arg.new_lock_owner = 1;
data->res.open_seqid = data->arg.open_seqid;
- } else
+ } else {
data->arg.new_lock_owner = 0;
+ nfs4_stateid_copy(&data->arg.lock_stateid,
+ &data->lsp->ls_stateid);
+ }
if (!nfs4_valid_open_stateid(state)) {
data->rpc_status = -EBADF;
task->tk_action = NULL;
@@ -5594,6 +5656,7 @@ out_wait:
static void nfs4_lock_done(struct rpc_task *task, void *calldata)
{
struct nfs4_lockdata *data = calldata;
+ struct nfs4_lock_state *lsp = data->lsp;
dprintk("%s: begin!\n", __func__);
@@ -5601,18 +5664,36 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
return;
data->rpc_status = task->tk_status;
- if (data->arg.new_lock_owner != 0) {
- if (data->rpc_status == 0)
- nfs_confirm_seqid(&data->lsp->ls_seqid, 0);
- else
- goto out;
- }
- if (data->rpc_status == 0) {
- nfs4_stateid_copy(&data->lsp->ls_stateid, &data->res.stateid);
- set_bit(NFS_LOCK_INITIALIZED, &data->lsp->ls_flags);
- renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp);
+ switch (task->tk_status) {
+ case 0:
+ renew_lease(NFS_SERVER(data->ctx->dentry->d_inode),
+ data->timestamp);
+ if (data->arg.new_lock) {
+ data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS);
+ if (do_vfs_lock(data->fl.fl_file, &data->fl) < 0) {
+ rpc_restart_call_prepare(task);
+ break;
+ }
+ }
+ if (data->arg.new_lock_owner != 0) {
+ nfs_confirm_seqid(&lsp->ls_seqid, 0);
+ nfs4_stateid_copy(&lsp->ls_stateid, &data->res.stateid);
+ set_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
+ } else if (!nfs4_update_lock_stateid(lsp, &data->res.stateid))
+ rpc_restart_call_prepare(task);
+ break;
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_OLD_STATEID:
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_EXPIRED:
+ if (data->arg.new_lock_owner != 0) {
+ if (!nfs4_stateid_match(&data->arg.open_stateid,
+ &lsp->ls_state->open_stateid))
+ rpc_restart_call_prepare(task);
+ } else if (!nfs4_stateid_match(&data->arg.lock_stateid,
+ &lsp->ls_stateid))
+ rpc_restart_call_prepare(task);
}
-out:
dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status);
}
@@ -5693,7 +5774,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
if (recovery_type == NFS_LOCK_RECLAIM)
data->arg.reclaim = NFS_LOCK_RECLAIM;
nfs4_set_sequence_privileged(&data->arg.seq_args);
- }
+ } else
+ data->arg.new_lock = 1;
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return PTR_ERR(task);
@@ -5817,10 +5899,8 @@ static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *reques
static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
{
- struct nfs4_state_owner *sp = state->owner;
struct nfs_inode *nfsi = NFS_I(state->inode);
unsigned char fl_flags = request->fl_flags;
- unsigned int seq;
int status = -ENOLCK;
if ((fl_flags & FL_POSIX) &&
@@ -5840,25 +5920,11 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
/* ...but avoid races with delegation recall... */
request->fl_flags = fl_flags & ~FL_SLEEP;
status = do_vfs_lock(request->fl_file, request);
- goto out_unlock;
- }
- seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
- up_read(&nfsi->rwsem);
- status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);
- if (status != 0)
+ up_read(&nfsi->rwsem);
goto out;
- down_read(&nfsi->rwsem);
- if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) {
- status = -NFS4ERR_DELAY;
- goto out_unlock;
}
- /* Note: we always want to sleep here! */
- request->fl_flags = fl_flags | FL_SLEEP;
- if (do_vfs_lock(request->fl_file, request) < 0)
- printk(KERN_WARNING "NFS: %s: VFS is out of sync with lock "
- "manager!\n", __func__);
-out_unlock:
up_read(&nfsi->rwsem);
+ status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);
out:
request->fl_flags = fl_flags;
return status;
@@ -5965,8 +6031,8 @@ static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata
{
struct nfs_release_lockowner_data *data = calldata;
struct nfs_server *server = data->server;
- nfs40_setup_sequence(server, &data->args.seq_args,
- &data->res.seq_res, task);
+ nfs40_setup_sequence(server->nfs_client->cl_slot_tbl,
+ &data->args.seq_args, &data->res.seq_res, task);
data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
data->timestamp = jiffies;
}
@@ -6582,47 +6648,47 @@ nfs41_same_server_scope(struct nfs41_server_scope *a,
int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred)
{
int status;
+ struct nfs41_bind_conn_to_session_args args = {
+ .client = clp,
+ .dir = NFS4_CDFC4_FORE_OR_BOTH,
+ };
struct nfs41_bind_conn_to_session_res res;
struct rpc_message msg = {
.rpc_proc =
&nfs4_procedures[NFSPROC4_CLNT_BIND_CONN_TO_SESSION],
- .rpc_argp = clp,
+ .rpc_argp = &args,
.rpc_resp = &res,
.rpc_cred = cred,
};
dprintk("--> %s\n", __func__);
- res.session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS);
- if (unlikely(res.session == NULL)) {
- status = -ENOMEM;
- goto out;
- }
+ nfs4_copy_sessionid(&args.sessionid, &clp->cl_session->sess_id);
+ if (!(clp->cl_session->flags & SESSION4_BACK_CHAN))
+ args.dir = NFS4_CDFC4_FORE;
status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
trace_nfs4_bind_conn_to_session(clp, status);
if (status == 0) {
- if (memcmp(res.session->sess_id.data,
+ if (memcmp(res.sessionid.data,
clp->cl_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) {
dprintk("NFS: %s: Session ID mismatch\n", __func__);
status = -EIO;
- goto out_session;
+ goto out;
}
- if (res.dir != NFS4_CDFS4_BOTH) {
+ if ((res.dir & args.dir) != res.dir || res.dir == 0) {
dprintk("NFS: %s: Unexpected direction from server\n",
__func__);
status = -EIO;
- goto out_session;
+ goto out;
}
- if (res.use_conn_in_rdma_mode) {
+ if (res.use_conn_in_rdma_mode != args.use_conn_in_rdma_mode) {
dprintk("NFS: %s: Server returned RDMA mode = true\n",
__func__);
status = -EIO;
- goto out_session;
+ goto out;
}
}
-out_session:
- kfree(res.session);
out:
dprintk("<-- %s status= %d\n", __func__, status);
return status;
@@ -7100,10 +7166,11 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
args->bc_attrs.max_reqs);
}
-static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
+static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args,
+ struct nfs41_create_session_res *res)
{
struct nfs4_channel_attrs *sent = &args->fc_attrs;
- struct nfs4_channel_attrs *rcvd = &session->fc_attrs;
+ struct nfs4_channel_attrs *rcvd = &res->fc_attrs;
if (rcvd->max_resp_sz > sent->max_resp_sz)
return -EINVAL;
@@ -7122,11 +7189,14 @@ static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args
return 0;
}
-static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
+static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args,
+ struct nfs41_create_session_res *res)
{
struct nfs4_channel_attrs *sent = &args->bc_attrs;
- struct nfs4_channel_attrs *rcvd = &session->bc_attrs;
+ struct nfs4_channel_attrs *rcvd = &res->bc_attrs;
+ if (!(res->flags & SESSION4_BACK_CHAN))
+ goto out;
if (rcvd->max_rqst_sz > sent->max_rqst_sz)
return -EINVAL;
if (rcvd->max_resp_sz < sent->max_resp_sz)
@@ -7138,18 +7208,30 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args
return -EINVAL;
if (rcvd->max_reqs != sent->max_reqs)
return -EINVAL;
+out:
return 0;
}
static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args,
- struct nfs4_session *session)
+ struct nfs41_create_session_res *res)
{
int ret;
- ret = nfs4_verify_fore_channel_attrs(args, session);
+ ret = nfs4_verify_fore_channel_attrs(args, res);
if (ret)
return ret;
- return nfs4_verify_back_channel_attrs(args, session);
+ return nfs4_verify_back_channel_attrs(args, res);
+}
+
+static void nfs4_update_session(struct nfs4_session *session,
+ struct nfs41_create_session_res *res)
+{
+ nfs4_copy_sessionid(&session->sess_id, &res->sessionid);
+ session->flags = res->flags;
+ memcpy(&session->fc_attrs, &res->fc_attrs, sizeof(session->fc_attrs));
+ if (res->flags & SESSION4_BACK_CHAN)
+ memcpy(&session->bc_attrs, &res->bc_attrs,
+ sizeof(session->bc_attrs));
}
static int _nfs4_proc_create_session(struct nfs_client *clp,
@@ -7158,11 +7240,12 @@ static int _nfs4_proc_create_session(struct nfs_client *clp,
struct nfs4_session *session = clp->cl_session;
struct nfs41_create_session_args args = {
.client = clp,
+ .clientid = clp->cl_clientid,
+ .seqid = clp->cl_seqid,
.cb_program = NFS4_CALLBACK,
};
- struct nfs41_create_session_res res = {
- .client = clp,
- };
+ struct nfs41_create_session_res res;
+
struct rpc_message msg = {
.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE_SESSION],
.rpc_argp = &args,
@@ -7179,11 +7262,15 @@ static int _nfs4_proc_create_session(struct nfs_client *clp,
if (!status) {
/* Verify the session's negotiated channel_attrs values */
- status = nfs4_verify_channel_attrs(&args, session);
+ status = nfs4_verify_channel_attrs(&args, &res);
/* Increment the clientid slot sequence id */
- clp->cl_seqid++;
+ if (clp->cl_seqid == res.seqid)
+ clp->cl_seqid++;
+ if (status)
+ goto out;
+ nfs4_update_session(session, &res);
}
-
+out:
return status;
}
@@ -7528,6 +7615,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
return;
if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
NFS_I(lgp->args.inode)->layout,
+ &lgp->args.range,
lgp->args.ctx->state)) {
rpc_exit(task, NFS4_OK);
}
@@ -7783,9 +7871,13 @@ static void nfs4_layoutreturn_release(void *calldata)
spin_lock(&lo->plh_inode->i_lock);
if (lrp->res.lrs_present)
pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
+ pnfs_clear_layoutreturn_waitbit(lo);
+ clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags);
+ rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
lo->plh_block_lgets--;
spin_unlock(&lo->plh_inode->i_lock);
pnfs_put_layout_hdr(lrp->args.layout);
+ nfs_iput_and_deactive(lrp->inode);
kfree(calldata);
dprintk("<-- %s\n", __func__);
}
@@ -7796,7 +7888,7 @@ static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
.rpc_release = nfs4_layoutreturn_release,
};
-int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
+int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
{
struct rpc_task *task;
struct rpc_message msg = {
@@ -7811,14 +7903,23 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
.callback_ops = &nfs4_layoutreturn_call_ops,
.callback_data = lrp,
};
- int status;
+ int status = 0;
dprintk("--> %s\n", __func__);
+ if (!sync) {
+ lrp->inode = nfs_igrab_and_active(lrp->args.inode);
+ if (!lrp->inode) {
+ nfs4_layoutreturn_release(lrp);
+ return -EAGAIN;
+ }
+ task_setup_data.flags |= RPC_TASK_ASYNC;
+ }
nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1);
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return PTR_ERR(task);
- status = task->tk_status;
+ if (sync)
+ status = task->tk_status;
trace_nfs4_layoutreturn(lrp->args.inode, status);
dprintk("<-- %s status=%d\n", __func__, status);
rpc_put_task(task);
@@ -7912,6 +8013,7 @@ static void nfs4_layoutcommit_release(void *calldata)
nfs_post_op_update_inode_force_wcc(data->args.inode,
data->res.fattr);
put_rpccred(data->cred);
+ nfs_iput_and_deactive(data->inode);
kfree(data);
}
@@ -7936,7 +8038,6 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
.rpc_message = &msg,
.callback_ops = &nfs4_layoutcommit_ops,
.callback_data = data,
- .flags = RPC_TASK_ASYNC,
};
struct rpc_task *task;
int status = 0;
@@ -7947,18 +8048,21 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
data->args.lastbytewritten,
data->args.inode->i_ino);
+ if (!sync) {
+ data->inode = nfs_igrab_and_active(data->args.inode);
+ if (data->inode == NULL) {
+ nfs4_layoutcommit_release(data);
+ return -EAGAIN;
+ }
+ task_setup_data.flags = RPC_TASK_ASYNC;
+ }
nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return PTR_ERR(task);
- if (sync == false)
- goto out;
- status = nfs4_wait_for_completion_rpc_task(task);
- if (status != 0)
- goto out;
- status = task->tk_status;
+ if (sync)
+ status = task->tk_status;
trace_nfs4_layoutcommit(data->args.inode, status);
-out:
dprintk("%s: status %d\n", __func__, status);
rpc_put_task(task);
return status;
@@ -8386,6 +8490,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
.match_stateid = nfs4_match_stateid,
.find_root_sec = nfs4_find_root_sec,
.free_lock_state = nfs4_release_lockowner,
+ .alloc_seqid = nfs_alloc_seqid,
.call_sync_ops = &nfs40_call_sync_ops,
.reboot_recovery_ops = &nfs40_reboot_recovery_ops,
.nograce_recovery_ops = &nfs40_nograce_recovery_ops,
@@ -8394,6 +8499,12 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
};
#if defined(CONFIG_NFS_V4_1)
+static struct nfs_seqid *
+nfs_alloc_no_seqid(struct nfs_seqid_counter *arg1, gfp_t arg2)
+{
+ return NULL;
+}
+
static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
.minor_version = 1,
.init_caps = NFS_CAP_READDIRPLUS
@@ -8407,6 +8518,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
.match_stateid = nfs41_match_stateid,
.find_root_sec = nfs41_find_root_sec,
.free_lock_state = nfs41_free_lock_state,
+ .alloc_seqid = nfs_alloc_no_seqid,
.call_sync_ops = &nfs41_call_sync_ops,
.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
@@ -8433,6 +8545,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
.find_root_sec = nfs41_find_root_sec,
.free_lock_state = nfs41_free_lock_state,
.call_sync_ops = &nfs41_call_sync_ops,
+ .alloc_seqid = nfs_alloc_no_seqid,
.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
.state_renewal_ops = &nfs41_state_renewal_ops,
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index e799dc3c3b1d..e23366effcfb 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -450,7 +450,7 @@ int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
tbl = &ses->fc_slot_table;
tbl->session = ses;
status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
- if (status) /* -ENOMEM */
+ if (status || !(ses->flags & SESSION4_BACK_CHAN)) /* -ENOMEM */
return status;
/* Back channel */
tbl = &ses->bc_slot_table;
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index b34ada9bc6a2..fc46c7455898 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -118,6 +118,12 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
return 0;
}
+static inline void nfs4_copy_sessionid(struct nfs4_sessionid *dst,
+ const struct nfs4_sessionid *src)
+{
+ memcpy(dst->data, src->data, NFS4_MAX_SESSIONID_LEN);
+}
+
#ifdef CONFIG_CRC32
/*
* nfs_session_id_hash - calculate the crc32 hash for the session id
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 5194933ed419..5ad908e9ce9c 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1003,11 +1003,11 @@ struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_m
struct nfs_seqid *new;
new = kmalloc(sizeof(*new), gfp_mask);
- if (new != NULL) {
- new->sequence = counter;
- INIT_LIST_HEAD(&new->list);
- new->task = NULL;
- }
+ if (new == NULL)
+ return ERR_PTR(-ENOMEM);
+ new->sequence = counter;
+ INIT_LIST_HEAD(&new->list);
+ new->task = NULL;
return new;
}
@@ -1015,7 +1015,7 @@ void nfs_release_seqid(struct nfs_seqid *seqid)
{
struct nfs_seqid_counter *sequence;
- if (list_empty(&seqid->list))
+ if (seqid == NULL || list_empty(&seqid->list))
return;
sequence = seqid->sequence;
spin_lock(&sequence->lock);
@@ -1071,13 +1071,15 @@ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid)
{
- struct nfs4_state_owner *sp = container_of(seqid->sequence,
- struct nfs4_state_owner, so_seqid);
- struct nfs_server *server = sp->so_server;
+ struct nfs4_state_owner *sp;
+
+ if (seqid == NULL)
+ return;
+ sp = container_of(seqid->sequence, struct nfs4_state_owner, so_seqid);
if (status == -NFS4ERR_BAD_SEQID)
nfs4_drop_state_owner(sp);
- if (!nfs4_has_session(server->nfs_client))
+ if (!nfs4_has_session(sp->so_server->nfs_client))
nfs_increment_seqid(status, seqid);
}
@@ -1088,14 +1090,18 @@ void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid)
*/
void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid)
{
- nfs_increment_seqid(status, seqid);
+ if (seqid != NULL)
+ nfs_increment_seqid(status, seqid);
}
int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)
{
- struct nfs_seqid_counter *sequence = seqid->sequence;
+ struct nfs_seqid_counter *sequence;
int status = 0;
+ if (seqid == NULL)
+ goto out;
+ sequence = seqid->sequence;
spin_lock(&sequence->lock);
seqid->task = task;
if (list_empty(&seqid->list))
@@ -1106,6 +1112,7 @@ int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)
status = -EAGAIN;
unlock:
spin_unlock(&sequence->lock);
+out:
return status;
}
@@ -1366,49 +1373,55 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
struct nfs_inode *nfsi = NFS_I(inode);
struct file_lock *fl;
int status = 0;
+ struct file_lock_context *flctx = inode->i_flctx;
+ struct list_head *list;
- if (inode->i_flock == NULL)
+ if (flctx == NULL)
return 0;
+ list = &flctx->flc_posix;
+
/* Guard against delegation returns and new lock/unlock calls */
down_write(&nfsi->rwsem);
- /* Protect inode->i_flock using the BKL */
- spin_lock(&inode->i_lock);
- for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
- if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
- continue;
+ spin_lock(&flctx->flc_lock);
+restart:
+ list_for_each_entry(fl, list, fl_list) {
if (nfs_file_open_context(fl->fl_file)->state != state)
continue;
- spin_unlock(&inode->i_lock);
+ spin_unlock(&flctx->flc_lock);
status = ops->recover_lock(state, fl);
switch (status) {
- case 0:
- break;
- case -ESTALE:
- case -NFS4ERR_ADMIN_REVOKED:
- case -NFS4ERR_STALE_STATEID:
- case -NFS4ERR_BAD_STATEID:
- case -NFS4ERR_EXPIRED:
- case -NFS4ERR_NO_GRACE:
- case -NFS4ERR_STALE_CLIENTID:
- case -NFS4ERR_BADSESSION:
- case -NFS4ERR_BADSLOT:
- case -NFS4ERR_BAD_HIGH_SLOT:
- case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
- goto out;
- default:
- printk(KERN_ERR "NFS: %s: unhandled error %d\n",
- __func__, status);
- case -ENOMEM:
- case -NFS4ERR_DENIED:
- case -NFS4ERR_RECLAIM_BAD:
- case -NFS4ERR_RECLAIM_CONFLICT:
- /* kill_proc(fl->fl_pid, SIGLOST, 1); */
- status = 0;
+ case 0:
+ break;
+ case -ESTALE:
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_EXPIRED:
+ case -NFS4ERR_NO_GRACE:
+ case -NFS4ERR_STALE_CLIENTID:
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_BADSLOT:
+ case -NFS4ERR_BAD_HIGH_SLOT:
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ goto out;
+ default:
+ pr_err("NFS: %s: unhandled error %d\n",
+ __func__, status);
+ case -ENOMEM:
+ case -NFS4ERR_DENIED:
+ case -NFS4ERR_RECLAIM_BAD:
+ case -NFS4ERR_RECLAIM_CONFLICT:
+ /* kill_proc(fl->fl_pid, SIGLOST, 1); */
+ status = 0;
}
- spin_lock(&inode->i_lock);
+ spin_lock(&flctx->flc_lock);
}
- spin_unlock(&inode->i_lock);
+ if (list == &flctx->flc_posix) {
+ list = &flctx->flc_flock;
+ goto restart;
+ }
+ spin_unlock(&flctx->flc_lock);
out:
up_write(&nfsi->rwsem);
return status;
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 6f340f02f2ba..75090feeafad 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -53,7 +53,6 @@ static const struct super_operations nfs4_sops = {
.destroy_inode = nfs_destroy_inode,
.write_inode = nfs4_write_inode,
.drop_inode = nfs_drop_inode,
- .put_super = nfs_put_super,
.statfs = nfs_statfs,
.evict_inode = nfs4_evict_inode,
.umount_begin = nfs_umount_begin,
@@ -346,6 +345,9 @@ out:
static void __exit exit_nfs_v4(void)
{
+ /* Not called in the _init(), conditionally loaded */
+ nfs4_pnfs_v3_ds_connect_unload();
+
unregister_nfs_version(&nfs_v4);
nfs4_unregister_sysctl();
nfs_idmap_quit();
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index cb4376b78ed9..5c399ec41079 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -946,7 +946,10 @@ static void encode_uint64(struct xdr_stream *xdr, u64 n)
static void encode_nfs4_seqid(struct xdr_stream *xdr,
const struct nfs_seqid *seqid)
{
- encode_uint32(xdr, seqid->sequence->counter);
+ if (seqid != NULL)
+ encode_uint32(xdr, seqid->sequence->counter);
+ else
+ encode_uint32(xdr, 0);
}
static void encode_compound_hdr(struct xdr_stream *xdr,
@@ -1125,7 +1128,7 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg
{
encode_op_hdr(xdr, OP_CLOSE, decode_close_maxsz, hdr);
encode_nfs4_seqid(xdr, arg->seqid);
- encode_nfs4_stateid(xdr, arg->stateid);
+ encode_nfs4_stateid(xdr, &arg->stateid);
}
static void encode_commit(struct xdr_stream *xdr, const struct nfs_commitargs *args, struct compound_hdr *hdr)
@@ -1301,12 +1304,12 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
*p = cpu_to_be32(args->new_lock_owner);
if (args->new_lock_owner){
encode_nfs4_seqid(xdr, args->open_seqid);
- encode_nfs4_stateid(xdr, args->open_stateid);
+ encode_nfs4_stateid(xdr, &args->open_stateid);
encode_nfs4_seqid(xdr, args->lock_seqid);
encode_lockowner(xdr, &args->lock_owner);
}
else {
- encode_nfs4_stateid(xdr, args->lock_stateid);
+ encode_nfs4_stateid(xdr, &args->lock_stateid);
encode_nfs4_seqid(xdr, args->lock_seqid);
}
}
@@ -1330,7 +1333,7 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
encode_op_hdr(xdr, OP_LOCKU, decode_locku_maxsz, hdr);
encode_uint32(xdr, nfs4_lock_type(args->fl, 0));
encode_nfs4_seqid(xdr, args->seqid);
- encode_nfs4_stateid(xdr, args->stateid);
+ encode_nfs4_stateid(xdr, &args->stateid);
p = reserve_space(xdr, 16);
p = xdr_encode_hyper(p, args->fl->fl_start);
xdr_encode_hyper(p, nfs4_lock_length(args->fl));
@@ -1348,24 +1351,12 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
encode_string(xdr, name->len, name->name);
}
-static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
+static void encode_share_access(struct xdr_stream *xdr, u32 share_access)
{
__be32 *p;
p = reserve_space(xdr, 8);
- switch (fmode & (FMODE_READ|FMODE_WRITE)) {
- case FMODE_READ:
- *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ);
- break;
- case FMODE_WRITE:
- *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_WRITE);
- break;
- case FMODE_READ|FMODE_WRITE:
- *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_BOTH);
- break;
- default:
- *p++ = cpu_to_be32(0);
- }
+ *p++ = cpu_to_be32(share_access);
*p = cpu_to_be32(0); /* for linux, share_deny = 0 always */
}
@@ -1377,7 +1368,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
* owner 4 = 32
*/
encode_nfs4_seqid(xdr, arg->seqid);
- encode_share_access(xdr, arg->fmode);
+ encode_share_access(xdr, arg->share_access);
p = reserve_space(xdr, 36);
p = xdr_encode_hyper(p, arg->clientid);
*p++ = cpu_to_be32(24);
@@ -1530,9 +1521,9 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co
static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
{
encode_op_hdr(xdr, OP_OPEN_DOWNGRADE, decode_open_downgrade_maxsz, hdr);
- encode_nfs4_stateid(xdr, arg->stateid);
+ encode_nfs4_stateid(xdr, &arg->stateid);
encode_nfs4_seqid(xdr, arg->seqid);
- encode_share_access(xdr, arg->fmode);
+ encode_share_access(xdr, arg->share_access);
}
static void
@@ -1724,17 +1715,17 @@ static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, stru
#if defined(CONFIG_NFS_V4_1)
/* NFSv4.1 operations */
static void encode_bind_conn_to_session(struct xdr_stream *xdr,
- struct nfs4_session *session,
+ struct nfs41_bind_conn_to_session_args *args,
struct compound_hdr *hdr)
{
__be32 *p;
encode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION,
decode_bind_conn_to_session_maxsz, hdr);
- encode_opaque_fixed(xdr, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
+ encode_opaque_fixed(xdr, args->sessionid.data, NFS4_MAX_SESSIONID_LEN);
p = xdr_reserve_space(xdr, 8);
- *p++ = cpu_to_be32(NFS4_CDFC4_BACK_OR_BOTH);
- *p = 0; /* use_conn_in_rdma_mode = False */
+ *p++ = cpu_to_be32(args->dir);
+ *p = (args->use_conn_in_rdma_mode) ? cpu_to_be32(1) : cpu_to_be32(0);
}
static void encode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map)
@@ -1801,9 +1792,8 @@ static void encode_create_session(struct xdr_stream *xdr,
struct compound_hdr *hdr)
{
__be32 *p;
- char machine_name[NFS4_MAX_MACHINE_NAME_LEN];
- uint32_t len;
struct nfs_client *clp = args->client;
+ struct rpc_clnt *clnt = clp->cl_rpcclient;
struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
u32 max_resp_sz_cached;
@@ -1814,13 +1804,10 @@ static void encode_create_session(struct xdr_stream *xdr,
max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE +
RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT;
- len = scnprintf(machine_name, sizeof(machine_name), "%s",
- clp->cl_ipaddr);
-
encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr);
- p = reserve_space(xdr, 16 + 2*28 + 20 + len + 12);
- p = xdr_encode_hyper(p, clp->cl_clientid);
- *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */
+ p = reserve_space(xdr, 16 + 2*28 + 20 + clnt->cl_nodelen + 12);
+ p = xdr_encode_hyper(p, args->clientid);
+ *p++ = cpu_to_be32(args->seqid); /*Sequence id */
*p++ = cpu_to_be32(args->flags); /*flags */
/* Fore Channel */
@@ -1847,7 +1834,7 @@ static void encode_create_session(struct xdr_stream *xdr,
/* authsys_parms rfc1831 */
*p++ = cpu_to_be32(nn->boot_time.tv_nsec); /* stamp */
- p = xdr_encode_opaque(p, machine_name, len);
+ p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen);
*p++ = cpu_to_be32(0); /* UID */
*p++ = cpu_to_be32(0); /* GID */
*p = cpu_to_be32(0); /* No more gids */
@@ -2012,11 +1999,11 @@ encode_layoutreturn(struct xdr_stream *xdr,
p = reserve_space(xdr, 16);
*p++ = cpu_to_be32(0); /* reclaim. always 0 for now */
*p++ = cpu_to_be32(args->layout_type);
- *p++ = cpu_to_be32(IOMODE_ANY);
+ *p++ = cpu_to_be32(args->range.iomode);
*p = cpu_to_be32(RETURN_FILE);
p = reserve_space(xdr, 16);
- p = xdr_encode_hyper(p, 0);
- p = xdr_encode_hyper(p, NFS4_MAX_UINT64);
+ p = xdr_encode_hyper(p, args->range.offset);
+ p = xdr_encode_hyper(p, args->range.length);
spin_lock(&args->inode->i_lock);
encode_nfs4_stateid(xdr, &args->stateid);
spin_unlock(&args->inode->i_lock);
@@ -2747,14 +2734,14 @@ static void nfs4_xdr_enc_fsid_present(struct rpc_rqst *req,
*/
static void nfs4_xdr_enc_bind_conn_to_session(struct rpc_rqst *req,
struct xdr_stream *xdr,
- struct nfs_client *clp)
+ struct nfs41_bind_conn_to_session_args *args)
{
struct compound_hdr hdr = {
- .minorversion = clp->cl_mvops->minor_version,
+ .minorversion = args->client->cl_mvops->minor_version,
};
encode_compound_hdr(xdr, req, &hdr);
- encode_bind_conn_to_session(xdr, clp->cl_session, &hdr);
+ encode_bind_conn_to_session(xdr, args, &hdr);
encode_nops(&hdr);
}
@@ -4936,20 +4923,13 @@ out_overflow:
return -EIO;
}
-static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
+static int decode_rw_delegation(struct xdr_stream *xdr,
+ uint32_t delegation_type,
+ struct nfs_openres *res)
{
__be32 *p;
- uint32_t delegation_type;
int status;
- p = xdr_inline_decode(xdr, 4);
- if (unlikely(!p))
- goto out_overflow;
- delegation_type = be32_to_cpup(p);
- if (delegation_type == NFS4_OPEN_DELEGATE_NONE) {
- res->delegation_type = 0;
- return 0;
- }
status = decode_stateid(xdr, &res->delegation);
if (unlikely(status))
return status;
@@ -4973,6 +4953,52 @@ out_overflow:
return -EIO;
}
+static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
+{
+ __be32 *p;
+ uint32_t why_no_delegation;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ why_no_delegation = be32_to_cpup(p);
+ switch (why_no_delegation) {
+ case WND4_CONTENTION:
+ case WND4_RESOURCE:
+ xdr_inline_decode(xdr, 4);
+ /* Ignore for now */
+ }
+ return 0;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+}
+
+static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
+{
+ __be32 *p;
+ uint32_t delegation_type;
+
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_overflow;
+ delegation_type = be32_to_cpup(p);
+ res->delegation_type = 0;
+ switch (delegation_type) {
+ case NFS4_OPEN_DELEGATE_NONE:
+ return 0;
+ case NFS4_OPEN_DELEGATE_READ:
+ case NFS4_OPEN_DELEGATE_WRITE:
+ return decode_rw_delegation(xdr, delegation_type, res);
+ case NFS4_OPEN_DELEGATE_NONE_EXT:
+ return decode_no_delegation(xdr, res);
+ }
+ return -EIO;
+out_overflow:
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+}
+
static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
{
__be32 *p;
@@ -5587,7 +5613,7 @@ static int decode_bind_conn_to_session(struct xdr_stream *xdr,
status = decode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION);
if (!status)
- status = decode_sessionid(xdr, &res->session->sess_id);
+ status = decode_sessionid(xdr, &res->sessionid);
if (unlikely(status))
return status;
@@ -5615,12 +5641,10 @@ static int decode_create_session(struct xdr_stream *xdr,
{
__be32 *p;
int status;
- struct nfs_client *clp = res->client;
- struct nfs4_session *session = clp->cl_session;
status = decode_op_hdr(xdr, OP_CREATE_SESSION);
if (!status)
- status = decode_sessionid(xdr, &session->sess_id);
+ status = decode_sessionid(xdr, &res->sessionid);
if (unlikely(status))
return status;
@@ -5628,13 +5652,13 @@ static int decode_create_session(struct xdr_stream *xdr,
p = xdr_inline_decode(xdr, 8);
if (unlikely(!p))
goto out_overflow;
- clp->cl_seqid = be32_to_cpup(p++);
- session->flags = be32_to_cpup(p);
+ res->seqid = be32_to_cpup(p++);
+ res->flags = be32_to_cpup(p);
/* Channel attributes */
- status = decode_chan_attrs(xdr, &session->fc_attrs);
+ status = decode_chan_attrs(xdr, &res->fc_attrs);
if (!status)
- status = decode_chan_attrs(xdr, &session->bc_attrs);
+ status = decode_chan_attrs(xdr, &res->bc_attrs);
return status;
out_overflow:
print_overflow_msg(__func__, xdr);
@@ -6567,6 +6591,7 @@ static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
int status;
status = decode_compound_hdr(xdr, &hdr);
+ res->op_status = hdr.status;
if (status)
goto out;
status = decode_sequence(xdr, &res->seq_res, rqstp);
@@ -6592,6 +6617,7 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
int status;
status = decode_compound_hdr(xdr, &hdr);
+ res->op_status = hdr.status;
if (status)
goto out;
status = decode_sequence(xdr, &res->seq_res, rqstp);
@@ -6621,6 +6647,7 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
int status;
status = decode_compound_hdr(xdr, &hdr);
+ res->op_status = hdr.status;
if (status)
goto out;
status = decode_sequence(xdr, &res->seq_res, rqstp);
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index cd3c910d2d12..9bc9f04fb7f6 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -261,11 +261,11 @@ static int __init root_nfs_data(char *cmdline)
*/
len = snprintf(nfs_export_path, sizeof(nfs_export_path),
tmp, utsname()->nodename);
- if (len > (int)sizeof(nfs_export_path))
+ if (len >= (int)sizeof(nfs_export_path))
goto out_devnametoolong;
len = snprintf(nfs_root_device, sizeof(nfs_root_device),
"%pI4:%s", &servaddr, nfs_export_path);
- if (len > (int)sizeof(nfs_root_device))
+ if (len >= (int)sizeof(nfs_root_device))
goto out_devnametoolong;
retval = 0;
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 9e5bc42180e4..24e1d7403c0b 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -537,11 +537,12 @@ int objio_write_pagelist(struct nfs_pgio_header *hdr, int how)
static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio,
struct nfs_page *prev, struct nfs_page *req)
{
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(pgio);
unsigned int size;
size = pnfs_generic_pg_test(pgio, prev, req);
- if (!size || pgio->pg_count + req->wb_bytes >
+ if (!size || mirror->pg_count + req->wb_bytes >
(unsigned long)pgio->pg_layout_private)
return 0;
@@ -607,12 +608,14 @@ static const struct nfs_pageio_ops objio_pg_read_ops = {
.pg_init = objio_init_read,
.pg_test = objio_pg_test,
.pg_doio = pnfs_generic_pg_readpages,
+ .pg_cleanup = pnfs_generic_pg_cleanup,
};
static const struct nfs_pageio_ops objio_pg_write_ops = {
.pg_init = objio_init_write,
.pg_test = objio_pg_test,
.pg_doio = pnfs_generic_pg_writepages,
+ .pg_cleanup = pnfs_generic_pg_cleanup,
};
static struct pnfs_layoutdriver_type objlayout_type = {
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 2b5e769beb16..d57190a0d533 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -42,21 +42,35 @@ static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)
return p->pagevec != NULL;
}
+struct nfs_pgio_mirror *
+nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc)
+{
+ return nfs_pgio_has_mirroring(desc) ?
+ &desc->pg_mirrors[desc->pg_mirror_idx] :
+ &desc->pg_mirrors[0];
+}
+EXPORT_SYMBOL_GPL(nfs_pgio_current_mirror);
+
void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
struct nfs_pgio_header *hdr,
void (*release)(struct nfs_pgio_header *hdr))
{
- hdr->req = nfs_list_entry(desc->pg_list.next);
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
+
+ hdr->req = nfs_list_entry(mirror->pg_list.next);
hdr->inode = desc->pg_inode;
hdr->cred = hdr->req->wb_context->cred;
hdr->io_start = req_offset(hdr->req);
- hdr->good_bytes = desc->pg_count;
+ hdr->good_bytes = mirror->pg_count;
hdr->dreq = desc->pg_dreq;
hdr->layout_private = desc->pg_layout_private;
hdr->release = release;
hdr->completion_ops = desc->pg_completion_ops;
if (hdr->completion_ops->init_hdr)
hdr->completion_ops->init_hdr(hdr);
+
+ hdr->pgio_mirror_idx = desc->pg_mirror_idx;
}
EXPORT_SYMBOL_GPL(nfs_pgheader_init);
@@ -480,7 +494,10 @@ nfs_wait_on_request(struct nfs_page *req)
size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
struct nfs_page *prev, struct nfs_page *req)
{
- if (desc->pg_count > desc->pg_bsize) {
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
+
+ if (mirror->pg_count > mirror->pg_bsize) {
/* should never happen */
WARN_ON_ONCE(1);
return 0;
@@ -490,11 +507,11 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
* Limit the request size so that we can still allocate a page array
* for it without upsetting the slab allocator.
*/
- if (((desc->pg_count + req->wb_bytes) >> PAGE_SHIFT) *
+ if (((mirror->pg_count + req->wb_bytes) >> PAGE_SHIFT) *
sizeof(struct page) > PAGE_SIZE)
return 0;
- return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes);
+ return min(mirror->pg_bsize - mirror->pg_count, (size_t)req->wb_bytes);
}
EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
@@ -597,13 +614,14 @@ static void nfs_pgio_prepare(struct rpc_task *task, void *calldata)
}
int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
+ struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops,
const struct rpc_call_ops *call_ops, int how, int flags)
{
struct rpc_task *task;
struct rpc_message msg = {
.rpc_argp = &hdr->args,
.rpc_resp = &hdr->res,
- .rpc_cred = hdr->cred,
+ .rpc_cred = cred,
};
struct rpc_task_setup task_setup_data = {
.rpc_client = clnt,
@@ -616,7 +634,7 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
};
int ret = 0;
- hdr->rw_ops->rw_initiate(hdr, &msg, &task_setup_data, how);
+ hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how);
dprintk("NFS: %5u initiated pgio call "
"(req %s/%llu, %u bytes @ offset %llu)\n",
@@ -650,10 +668,18 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio);
static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
struct nfs_pgio_header *hdr)
{
+ struct nfs_pgio_mirror *mirror;
+ u32 midx;
+
set_bit(NFS_IOHDR_REDO, &hdr->flags);
nfs_pgio_data_destroy(hdr);
hdr->completion_ops->completion(hdr);
- desc->pg_completion_ops->error_cleanup(&desc->pg_list);
+ /* TODO: Make sure it's right to clean up all mirrors here
+ * and not just hdr->pgio_mirror_idx */
+ for (midx = 0; midx < desc->pg_mirror_count; midx++) {
+ mirror = &desc->pg_mirrors[midx];
+ desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
+ }
return -ENOMEM;
}
@@ -670,6 +696,17 @@ static void nfs_pgio_release(void *calldata)
hdr->completion_ops->completion(hdr);
}
+static void nfs_pageio_mirror_init(struct nfs_pgio_mirror *mirror,
+ unsigned int bsize)
+{
+ INIT_LIST_HEAD(&mirror->pg_list);
+ mirror->pg_bytes_written = 0;
+ mirror->pg_count = 0;
+ mirror->pg_bsize = bsize;
+ mirror->pg_base = 0;
+ mirror->pg_recoalesce = 0;
+}
+
/**
* nfs_pageio_init - initialise a page io descriptor
* @desc: pointer to descriptor
@@ -686,13 +723,10 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
size_t bsize,
int io_flags)
{
- INIT_LIST_HEAD(&desc->pg_list);
- desc->pg_bytes_written = 0;
- desc->pg_count = 0;
- desc->pg_bsize = bsize;
- desc->pg_base = 0;
+ struct nfs_pgio_mirror *new;
+ int i;
+
desc->pg_moreio = 0;
- desc->pg_recoalesce = 0;
desc->pg_inode = inode;
desc->pg_ops = pg_ops;
desc->pg_completion_ops = compl_ops;
@@ -702,6 +736,26 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
desc->pg_lseg = NULL;
desc->pg_dreq = NULL;
desc->pg_layout_private = NULL;
+ desc->pg_bsize = bsize;
+
+ desc->pg_mirror_count = 1;
+ desc->pg_mirror_idx = 0;
+
+ if (pg_ops->pg_get_mirror_count) {
+ /* until we have a request, we don't have an lseg and no
+ * idea how many mirrors there will be */
+ new = kcalloc(NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX,
+ sizeof(struct nfs_pgio_mirror), GFP_KERNEL);
+ desc->pg_mirrors_dynamic = new;
+ desc->pg_mirrors = new;
+
+ for (i = 0; i < NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX; i++)
+ nfs_pageio_mirror_init(&desc->pg_mirrors[i], bsize);
+ } else {
+ desc->pg_mirrors_dynamic = NULL;
+ desc->pg_mirrors = desc->pg_mirrors_static;
+ nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize);
+ }
}
EXPORT_SYMBOL_GPL(nfs_pageio_init);
@@ -737,14 +791,16 @@ static void nfs_pgio_result(struct rpc_task *task, void *calldata)
int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
struct nfs_pgio_header *hdr)
{
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
struct nfs_page *req;
struct page **pages,
*last_page;
- struct list_head *head = &desc->pg_list;
+ struct list_head *head = &mirror->pg_list;
struct nfs_commit_info cinfo;
unsigned int pagecount, pageused;
- pagecount = nfs_page_array_len(desc->pg_base, desc->pg_count);
+ pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count);
if (!nfs_pgarray_set(&hdr->page_array, pagecount))
return nfs_pgio_error(desc, hdr);
@@ -772,7 +828,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
desc->pg_ioflags &= ~FLUSH_COND_STABLE;
/* Set up the argument struct */
- nfs_pgio_rpcsetup(hdr, desc->pg_count, 0, desc->pg_ioflags, &cinfo);
+ nfs_pgio_rpcsetup(hdr, mirror->pg_count, 0, desc->pg_ioflags, &cinfo);
desc->pg_rpc_callops = &nfs_pgio_common_ops;
return 0;
}
@@ -780,23 +836,74 @@ EXPORT_SYMBOL_GPL(nfs_generic_pgio);
static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
{
+ struct nfs_pgio_mirror *mirror;
struct nfs_pgio_header *hdr;
int ret;
+ mirror = nfs_pgio_current_mirror(desc);
+
hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
if (!hdr) {
- desc->pg_completion_ops->error_cleanup(&desc->pg_list);
+ /* TODO: make sure this is right with mirroring - or
+ * should it back out all mirrors? */
+ desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
return -ENOMEM;
}
nfs_pgheader_init(desc, hdr, nfs_pgio_header_free);
ret = nfs_generic_pgio(desc, hdr);
if (ret == 0)
ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode),
- hdr, desc->pg_rpc_callops,
+ hdr,
+ hdr->cred,
+ NFS_PROTO(hdr->inode),
+ desc->pg_rpc_callops,
desc->pg_ioflags, 0);
return ret;
}
+/*
+ * nfs_pageio_setup_mirroring - determine if mirroring is to be used
+ * by calling the pg_get_mirror_count op
+ */
+static int nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req)
+{
+ int mirror_count = 1;
+
+ if (!pgio->pg_ops->pg_get_mirror_count)
+ return 0;
+
+ mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
+
+ if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX)
+ return -EINVAL;
+
+ if (WARN_ON_ONCE(!pgio->pg_mirrors_dynamic))
+ return -EINVAL;
+
+ pgio->pg_mirror_count = mirror_count;
+
+ return 0;
+}
+
+/*
+ * nfs_pageio_stop_mirroring - stop using mirroring (set mirror count to 1)
+ */
+void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio)
+{
+ pgio->pg_mirror_count = 1;
+ pgio->pg_mirror_idx = 0;
+}
+
+static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio)
+{
+ pgio->pg_mirror_count = 1;
+ pgio->pg_mirror_idx = 0;
+ pgio->pg_mirrors = pgio->pg_mirrors_static;
+ kfree(pgio->pg_mirrors_dynamic);
+ pgio->pg_mirrors_dynamic = NULL;
+}
+
static bool nfs_match_open_context(const struct nfs_open_context *ctx1,
const struct nfs_open_context *ctx2)
{
@@ -826,11 +933,15 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
struct nfs_pageio_descriptor *pgio)
{
size_t size;
+ struct file_lock_context *flctx;
if (prev) {
if (!nfs_match_open_context(req->wb_context, prev->wb_context))
return false;
- if (req->wb_context->dentry->d_inode->i_flock != NULL &&
+ flctx = req->wb_context->dentry->d_inode->i_flctx;
+ if (flctx != NULL &&
+ !(list_empty_careful(&flctx->flc_posix) &&
+ list_empty_careful(&flctx->flc_flock)) &&
!nfs_match_lock_context(req->wb_lock_context,
prev->wb_lock_context))
return false;
@@ -863,19 +974,22 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
struct nfs_page *req)
{
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
struct nfs_page *prev = NULL;
- if (desc->pg_count != 0) {
- prev = nfs_list_entry(desc->pg_list.prev);
+
+ if (mirror->pg_count != 0) {
+ prev = nfs_list_entry(mirror->pg_list.prev);
} else {
if (desc->pg_ops->pg_init)
desc->pg_ops->pg_init(desc, req);
- desc->pg_base = req->wb_pgbase;
+ mirror->pg_base = req->wb_pgbase;
}
if (!nfs_can_coalesce_requests(prev, req, desc))
return 0;
nfs_list_remove_request(req);
- nfs_list_add_request(req, &desc->pg_list);
- desc->pg_count += req->wb_bytes;
+ nfs_list_add_request(req, &mirror->pg_list);
+ mirror->pg_count += req->wb_bytes;
return 1;
}
@@ -884,16 +998,19 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
*/
static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
{
- if (!list_empty(&desc->pg_list)) {
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
+
+ if (!list_empty(&mirror->pg_list)) {
int error = desc->pg_ops->pg_doio(desc);
if (error < 0)
desc->pg_error = error;
else
- desc->pg_bytes_written += desc->pg_count;
+ mirror->pg_bytes_written += mirror->pg_count;
}
- if (list_empty(&desc->pg_list)) {
- desc->pg_count = 0;
- desc->pg_base = 0;
+ if (list_empty(&mirror->pg_list)) {
+ mirror->pg_count = 0;
+ mirror->pg_base = 0;
}
}
@@ -911,6 +1028,8 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
struct nfs_page *req)
{
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
struct nfs_page *subreq;
unsigned int bytes_left = 0;
unsigned int offset, pgbase;
@@ -934,7 +1053,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
nfs_pageio_doio(desc);
if (desc->pg_error < 0)
return 0;
- if (desc->pg_recoalesce)
+ if (mirror->pg_recoalesce)
return 0;
/* retry add_request for this subreq */
nfs_page_group_lock(req, false);
@@ -972,14 +1091,16 @@ err_ptr:
static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
{
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
LIST_HEAD(head);
do {
- list_splice_init(&desc->pg_list, &head);
- desc->pg_bytes_written -= desc->pg_count;
- desc->pg_count = 0;
- desc->pg_base = 0;
- desc->pg_recoalesce = 0;
+ list_splice_init(&mirror->pg_list, &head);
+ mirror->pg_bytes_written -= mirror->pg_count;
+ mirror->pg_count = 0;
+ mirror->pg_base = 0;
+ mirror->pg_recoalesce = 0;
+
desc->pg_moreio = 0;
while (!list_empty(&head)) {
@@ -993,11 +1114,11 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
return 0;
break;
}
- } while (desc->pg_recoalesce);
+ } while (mirror->pg_recoalesce);
return 1;
}
-int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
+static int nfs_pageio_add_request_mirror(struct nfs_pageio_descriptor *desc,
struct nfs_page *req)
{
int ret;
@@ -1010,9 +1131,80 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
break;
ret = nfs_do_recoalesce(desc);
} while (ret);
+
return ret;
}
+int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
+ struct nfs_page *req)
+{
+ u32 midx;
+ unsigned int pgbase, offset, bytes;
+ struct nfs_page *dupreq, *lastreq;
+
+ pgbase = req->wb_pgbase;
+ offset = req->wb_offset;
+ bytes = req->wb_bytes;
+
+ nfs_pageio_setup_mirroring(desc, req);
+
+ for (midx = 0; midx < desc->pg_mirror_count; midx++) {
+ if (midx) {
+ nfs_page_group_lock(req, false);
+
+ /* find the last request */
+ for (lastreq = req->wb_head;
+ lastreq->wb_this_page != req->wb_head;
+ lastreq = lastreq->wb_this_page)
+ ;
+
+ dupreq = nfs_create_request(req->wb_context,
+ req->wb_page, lastreq, pgbase, bytes);
+
+ if (IS_ERR(dupreq)) {
+ nfs_page_group_unlock(req);
+ return 0;
+ }
+
+ nfs_lock_request(dupreq);
+ nfs_page_group_unlock(req);
+ dupreq->wb_offset = offset;
+ dupreq->wb_index = req->wb_index;
+ } else
+ dupreq = req;
+
+ if (nfs_pgio_has_mirroring(desc))
+ desc->pg_mirror_idx = midx;
+ if (!nfs_pageio_add_request_mirror(desc, dupreq))
+ return 0;
+ }
+
+ return 1;
+}
+
+/*
+ * nfs_pageio_complete_mirror - Complete I/O on the current mirror of an
+ * nfs_pageio_descriptor
+ * @desc: pointer to io descriptor
+ */
+static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc,
+ u32 mirror_idx)
+{
+ struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[mirror_idx];
+ u32 restore_idx = desc->pg_mirror_idx;
+
+ if (nfs_pgio_has_mirroring(desc))
+ desc->pg_mirror_idx = mirror_idx;
+ for (;;) {
+ nfs_pageio_doio(desc);
+ if (!mirror->pg_recoalesce)
+ break;
+ if (!nfs_do_recoalesce(desc))
+ break;
+ }
+ desc->pg_mirror_idx = restore_idx;
+}
+
/*
* nfs_pageio_resend - Transfer requests to new descriptor and resend
* @hdr - the pgio header to move request from
@@ -1046,18 +1238,19 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
EXPORT_SYMBOL_GPL(nfs_pageio_resend);
/**
- * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor
+ * nfs_pageio_complete - Complete I/O then cleanup an nfs_pageio_descriptor
* @desc: pointer to io descriptor
*/
void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
{
- for (;;) {
- nfs_pageio_doio(desc);
- if (!desc->pg_recoalesce)
- break;
- if (!nfs_do_recoalesce(desc))
- break;
- }
+ u32 midx;
+
+ for (midx = 0; midx < desc->pg_mirror_count; midx++)
+ nfs_pageio_complete_mirror(desc, midx);
+
+ if (desc->pg_ops->pg_cleanup)
+ desc->pg_ops->pg_cleanup(desc);
+ nfs_pageio_cleanup_mirroring(desc);
}
/**
@@ -1073,10 +1266,17 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
*/
void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
{
- if (!list_empty(&desc->pg_list)) {
- struct nfs_page *prev = nfs_list_entry(desc->pg_list.prev);
- if (index != prev->wb_index + 1)
- nfs_pageio_complete(desc);
+ struct nfs_pgio_mirror *mirror;
+ struct nfs_page *prev;
+ u32 midx;
+
+ for (midx = 0; midx < desc->pg_mirror_count; midx++) {
+ mirror = &desc->pg_mirrors[midx];
+ if (!list_empty(&mirror->pg_list)) {
+ prev = nfs_list_entry(mirror->pg_list.prev);
+ if (index != prev->wb_index + 1)
+ nfs_pageio_complete_mirror(desc, midx);
+ }
}
}
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 0a5dda4d85c2..4f802b02fbb9 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -34,6 +34,7 @@
#include "pnfs.h"
#include "iostat.h"
#include "nfs4trace.h"
+#include "delegation.h"
#define NFSDBG_FACILITY NFSDBG_PNFS
#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)
@@ -50,6 +51,10 @@ static DEFINE_SPINLOCK(pnfs_spinlock);
*/
static LIST_HEAD(pnfs_modules_tbl);
+static int
+pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
+ enum pnfs_iomode iomode, bool sync);
+
/* Return the registered pnfs layout driver module matching given id */
static struct pnfs_layoutdriver_type *
find_pnfs_driver_locked(u32 id)
@@ -238,6 +243,8 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
struct inode *inode = lo->plh_inode;
if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
+ if (!list_empty(&lo->plh_segs))
+ WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
pnfs_detach_layout_hdr(lo);
spin_unlock(&inode->i_lock);
pnfs_free_layout_hdr(lo);
@@ -337,6 +344,48 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
}
+/* Return true if layoutreturn is needed */
+static bool
+pnfs_layout_need_return(struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_layout_segment *s;
+
+ if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
+ return false;
+
+ list_for_each_entry(s, &lo->plh_segs, pls_list)
+ if (s != lseg && test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
+ return false;
+
+ return true;
+}
+
+static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg,
+ struct pnfs_layout_hdr *lo, struct inode *inode)
+{
+ lo = lseg->pls_layout;
+ inode = lo->plh_inode;
+
+ spin_lock(&inode->i_lock);
+ if (pnfs_layout_need_return(lo, lseg)) {
+ nfs4_stateid stateid;
+ enum pnfs_iomode iomode;
+
+ stateid = lo->plh_stateid;
+ iomode = lo->plh_return_iomode;
+ /* decreased in pnfs_send_layoutreturn() */
+ lo->plh_block_lgets++;
+ lo->plh_return_iomode = 0;
+ spin_unlock(&inode->i_lock);
+ pnfs_get_layout_hdr(lo);
+
+ /* Send an async layoutreturn so we dont deadlock */
+ pnfs_send_layoutreturn(lo, stateid, iomode, false);
+ } else
+ spin_unlock(&inode->i_lock);
+}
+
void
pnfs_put_lseg(struct pnfs_layout_segment *lseg)
{
@@ -349,8 +398,17 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
atomic_read(&lseg->pls_refcount),
test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
+
+ /* Handle the case where refcount != 1 */
+ if (atomic_add_unless(&lseg->pls_refcount, -1, 1))
+ return;
+
lo = lseg->pls_layout;
inode = lo->plh_inode;
+ /* Do we need a layoutreturn? */
+ if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
+ pnfs_layoutreturn_before_put_lseg(lseg, lo, inode);
+
if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
pnfs_get_layout_hdr(lo);
pnfs_layout_remove_lseg(lo, lseg);
@@ -543,6 +601,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
pnfs_get_layout_hdr(lo);
pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
+ pnfs_clear_retry_layoutget(lo);
spin_unlock(&nfsi->vfs_inode.i_lock);
pnfs_free_lseg_list(&tmp_list);
pnfs_put_layout_hdr(lo);
@@ -740,25 +799,37 @@ pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
}
+static bool
+pnfs_layout_returning(const struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_range *range)
+{
+ return test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
+ (lo->plh_return_iomode == IOMODE_ANY ||
+ lo->plh_return_iomode == range->iomode);
+}
+
/* lget is set to 1 if called from inside send_layoutget call chain */
static bool
-pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo, int lget)
+pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_range *range, int lget)
{
return lo->plh_block_lgets ||
test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
(list_empty(&lo->plh_segs) &&
- (atomic_read(&lo->plh_outstanding) > lget));
+ (atomic_read(&lo->plh_outstanding) > lget)) ||
+ pnfs_layout_returning(lo, range);
}
int
pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_range *range,
struct nfs4_state *open_state)
{
int status = 0;
dprintk("--> %s\n", __func__);
spin_lock(&lo->plh_inode->i_lock);
- if (pnfs_layoutgets_blocked(lo, 1)) {
+ if (pnfs_layoutgets_blocked(lo, range, 1)) {
status = -EAGAIN;
} else if (!nfs4_valid_open_stateid(open_state)) {
status = -EBADF;
@@ -825,7 +896,9 @@ send_layoutget(struct pnfs_layout_hdr *lo,
pnfs_layout_io_set_failed(lo, range->iomode);
}
return NULL;
- }
+ } else
+ pnfs_layout_clear_fail_bit(lo,
+ pnfs_iomode_to_fail_bit(range->iomode));
return lseg;
}
@@ -845,6 +918,49 @@ static void pnfs_clear_layoutcommit(struct inode *inode,
}
}
+void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
+{
+ clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags);
+ smp_mb__after_atomic();
+ wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
+}
+
+static int
+pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
+ enum pnfs_iomode iomode, bool sync)
+{
+ struct inode *ino = lo->plh_inode;
+ struct nfs4_layoutreturn *lrp;
+ int status = 0;
+
+ lrp = kzalloc(sizeof(*lrp), GFP_NOFS);
+ if (unlikely(lrp == NULL)) {
+ status = -ENOMEM;
+ spin_lock(&ino->i_lock);
+ lo->plh_block_lgets--;
+ pnfs_clear_layoutreturn_waitbit(lo);
+ rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
+ spin_unlock(&ino->i_lock);
+ pnfs_put_layout_hdr(lo);
+ goto out;
+ }
+
+ lrp->args.stateid = stateid;
+ lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
+ lrp->args.inode = ino;
+ lrp->args.range.iomode = iomode;
+ lrp->args.range.offset = 0;
+ lrp->args.range.length = NFS4_MAX_UINT64;
+ lrp->args.layout = lo;
+ lrp->clp = NFS_SERVER(ino)->nfs_client;
+ lrp->cred = lo->plh_lc_cred;
+
+ status = nfs4_proc_layoutreturn(lrp, sync);
+out:
+ dprintk("<-- %s status: %d\n", __func__, status);
+ return status;
+}
+
/*
* Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
* when the layout segment list is empty.
@@ -859,7 +975,6 @@ _pnfs_return_layout(struct inode *ino)
struct pnfs_layout_hdr *lo = NULL;
struct nfs_inode *nfsi = NFS_I(ino);
LIST_HEAD(tmp_list);
- struct nfs4_layoutreturn *lrp;
nfs4_stateid stateid;
int status = 0, empty;
@@ -901,24 +1016,7 @@ _pnfs_return_layout(struct inode *ino)
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&tmp_list);
- lrp = kzalloc(sizeof(*lrp), GFP_KERNEL);
- if (unlikely(lrp == NULL)) {
- status = -ENOMEM;
- spin_lock(&ino->i_lock);
- lo->plh_block_lgets--;
- spin_unlock(&ino->i_lock);
- pnfs_put_layout_hdr(lo);
- goto out;
- }
-
- lrp->args.stateid = stateid;
- lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
- lrp->args.inode = ino;
- lrp->args.layout = lo;
- lrp->clp = NFS_SERVER(ino)->nfs_client;
- lrp->cred = lo->plh_lc_cred;
-
- status = nfs4_proc_layoutreturn(lrp);
+ status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
out:
dprintk("<-- %s status: %d\n", __func__, status);
return status;
@@ -954,31 +1052,60 @@ pnfs_commit_and_return_layout(struct inode *inode)
bool pnfs_roc(struct inode *ino)
{
+ struct nfs_inode *nfsi = NFS_I(ino);
+ struct nfs_open_context *ctx;
+ struct nfs4_state *state;
struct pnfs_layout_hdr *lo;
struct pnfs_layout_segment *lseg, *tmp;
+ nfs4_stateid stateid;
LIST_HEAD(tmp_list);
- bool found = false;
+ bool found = false, layoutreturn = false;
spin_lock(&ino->i_lock);
- lo = NFS_I(ino)->layout;
+ lo = nfsi->layout;
if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
- goto out_nolayout;
+ goto out_noroc;
+
+ /* Don't return layout if we hold a delegation */
+ if (nfs4_check_delegation(ino, FMODE_READ))
+ goto out_noroc;
+
+ list_for_each_entry(ctx, &nfsi->open_files, list) {
+ state = ctx->state;
+ /* Don't return layout if there is open file state */
+ if (state != NULL && state->state != 0)
+ goto out_noroc;
+ }
+
+ pnfs_clear_retry_layoutget(lo);
list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
mark_lseg_invalid(lseg, &tmp_list);
found = true;
}
if (!found)
- goto out_nolayout;
+ goto out_noroc;
lo->plh_block_lgets++;
pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */
spin_unlock(&ino->i_lock);
pnfs_free_lseg_list(&tmp_list);
return true;
-out_nolayout:
+out_noroc:
+ if (lo) {
+ stateid = lo->plh_stateid;
+ layoutreturn =
+ test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+ &lo->plh_flags);
+ if (layoutreturn) {
+ lo->plh_block_lgets++;
+ pnfs_get_layout_hdr(lo);
+ }
+ }
spin_unlock(&ino->i_lock);
+ if (layoutreturn)
+ pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
return false;
}
@@ -1013,8 +1140,9 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
struct nfs_inode *nfsi = NFS_I(ino);
struct pnfs_layout_hdr *lo;
struct pnfs_layout_segment *lseg;
+ nfs4_stateid stateid;
u32 current_seqid;
- bool found = false;
+ bool found = false, layoutreturn = false;
spin_lock(&ino->i_lock);
list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
@@ -1031,7 +1159,21 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
*/
*barrier = current_seqid + atomic_read(&lo->plh_outstanding);
out:
+ if (!found) {
+ stateid = lo->plh_stateid;
+ layoutreturn =
+ test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+ &lo->plh_flags);
+ if (layoutreturn) {
+ lo->plh_block_lgets++;
+ pnfs_get_layout_hdr(lo);
+ }
+ }
spin_unlock(&ino->i_lock);
+ if (layoutreturn) {
+ rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
+ pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, false);
+ }
return found;
}
@@ -1178,6 +1320,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
+ !test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) &&
pnfs_lseg_range_match(&lseg->pls_range, range)) {
ret = pnfs_get_lseg(lseg);
break;
@@ -1266,6 +1409,35 @@ static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
return ret;
}
+/* stop waiting if someone clears NFS_LAYOUT_RETRY_LAYOUTGET bit. */
+static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key)
+{
+ if (!test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, key->flags))
+ return 1;
+ return nfs_wait_bit_killable(key);
+}
+
+static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
+{
+ /*
+ * send layoutcommit as it can hold up layoutreturn due to lseg
+ * reference
+ */
+ pnfs_layoutcommit_inode(lo->plh_inode, false);
+ return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
+ pnfs_layoutget_retry_bit_wait,
+ TASK_UNINTERRUPTIBLE);
+}
+
+static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo)
+{
+ unsigned long *bitlock = &lo->plh_flags;
+
+ clear_bit_unlock(NFS_LAYOUT_FIRST_LAYOUTGET, bitlock);
+ smp_mb__after_atomic();
+ wake_up_bit(bitlock, NFS_LAYOUT_FIRST_LAYOUTGET);
+}
+
/*
* Layout segment is retreived from the server if not cached.
* The appropriate layout segment is referenced and returned to the caller.
@@ -1296,6 +1468,8 @@ pnfs_update_layout(struct inode *ino,
if (pnfs_within_mdsthreshold(ctx, ino, iomode))
goto out;
+lookup_again:
+ first = false;
spin_lock(&ino->i_lock);
lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
if (lo == NULL) {
@@ -1310,27 +1484,62 @@ pnfs_update_layout(struct inode *ino,
}
/* if LAYOUTGET already failed once we don't try again */
- if (pnfs_layout_io_test_failed(lo, iomode))
+ if (pnfs_layout_io_test_failed(lo, iomode) &&
+ !pnfs_should_retry_layoutget(lo))
goto out_unlock;
- /* Check to see if the layout for the given range already exists */
- lseg = pnfs_find_lseg(lo, &arg);
- if (lseg)
- goto out_unlock;
+ first = list_empty(&lo->plh_segs);
+ if (first) {
+ /* The first layoutget for the file. Need to serialize per
+ * RFC 5661 Errata 3208.
+ */
+ if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
+ &lo->plh_flags)) {
+ spin_unlock(&ino->i_lock);
+ wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET,
+ TASK_UNINTERRUPTIBLE);
+ pnfs_put_layout_hdr(lo);
+ goto lookup_again;
+ }
+ } else {
+ /* Check to see if the layout for the given range
+ * already exists
+ */
+ lseg = pnfs_find_lseg(lo, &arg);
+ if (lseg)
+ goto out_unlock;
+ }
+
+ /*
+ * Because we free lsegs before sending LAYOUTRETURN, we need to wait
+ * for LAYOUTRETURN even if first is true.
+ */
+ if (!lseg && pnfs_should_retry_layoutget(lo) &&
+ test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
+ spin_unlock(&ino->i_lock);
+ dprintk("%s wait for layoutreturn\n", __func__);
+ if (pnfs_prepare_to_retry_layoutget(lo)) {
+ if (first)
+ pnfs_clear_first_layoutget(lo);
+ pnfs_put_layout_hdr(lo);
+ dprintk("%s retrying\n", __func__);
+ goto lookup_again;
+ }
+ goto out_put_layout_hdr;
+ }
- if (pnfs_layoutgets_blocked(lo, 0))
+ if (pnfs_layoutgets_blocked(lo, &arg, 0))
goto out_unlock;
atomic_inc(&lo->plh_outstanding);
-
- first = list_empty(&lo->plh_layouts) ? true : false;
spin_unlock(&ino->i_lock);
- if (first) {
+ if (list_empty(&lo->plh_layouts)) {
/* The lo must be on the clp list if there is any
* chance of a CB_LAYOUTRECALL(FILE) coming in.
*/
spin_lock(&clp->cl_lock);
- list_add_tail(&lo->plh_layouts, &server->layouts);
+ if (list_empty(&lo->plh_layouts))
+ list_add_tail(&lo->plh_layouts, &server->layouts);
spin_unlock(&clp->cl_lock);
}
@@ -1343,8 +1552,11 @@ pnfs_update_layout(struct inode *ino,
arg.length = PAGE_CACHE_ALIGN(arg.length);
lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
+ pnfs_clear_retry_layoutget(lo);
atomic_dec(&lo->plh_outstanding);
out_put_layout_hdr:
+ if (first)
+ pnfs_clear_first_layoutget(lo);
pnfs_put_layout_hdr(lo);
out:
dprintk("%s: inode %s/%llu pNFS layout segment %s for "
@@ -1393,7 +1605,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
goto out_forget_reply;
}
- if (pnfs_layoutgets_blocked(lo, 1)) {
+ if (pnfs_layoutgets_blocked(lo, &lgp->args.range, 1)) {
dprintk("%s forget reply due to state\n", __func__);
goto out_forget_reply;
}
@@ -1440,24 +1652,79 @@ out_forget_reply:
goto out;
}
+static void
+pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
+ struct list_head *tmp_list,
+ struct pnfs_layout_range *return_range)
+{
+ struct pnfs_layout_segment *lseg, *next;
+
+ dprintk("%s:Begin lo %p\n", __func__, lo);
+
+ if (list_empty(&lo->plh_segs))
+ return;
+
+ list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
+ if (should_free_lseg(&lseg->pls_range, return_range)) {
+ dprintk("%s: marking lseg %p iomode %d "
+ "offset %llu length %llu\n", __func__,
+ lseg, lseg->pls_range.iomode,
+ lseg->pls_range.offset,
+ lseg->pls_range.length);
+ set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
+ mark_lseg_invalid(lseg, tmp_list);
+ }
+}
+
+void pnfs_error_mark_layout_for_return(struct inode *inode,
+ struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
+ int iomode = pnfs_iomode_to_fail_bit(lseg->pls_range.iomode);
+ struct pnfs_layout_range range = {
+ .iomode = lseg->pls_range.iomode,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+ LIST_HEAD(free_me);
+
+ spin_lock(&inode->i_lock);
+ /* set failure bit so that pnfs path will be retried later */
+ pnfs_layout_set_fail_bit(lo, iomode);
+ set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
+ if (lo->plh_return_iomode == 0)
+ lo->plh_return_iomode = range.iomode;
+ else if (lo->plh_return_iomode != range.iomode)
+ lo->plh_return_iomode = IOMODE_ANY;
+ /*
+ * mark all matching lsegs so that we are sure to have no live
+ * segments at hand when sending layoutreturn. See pnfs_put_lseg()
+ * for how it works.
+ */
+ pnfs_mark_matching_lsegs_return(lo, &free_me, &range);
+ spin_unlock(&inode->i_lock);
+ pnfs_free_lseg_list(&free_me);
+}
+EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
+
void
pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
u64 rd_size = req->wb_bytes;
- WARN_ON_ONCE(pgio->pg_lseg != NULL);
-
- if (pgio->pg_dreq == NULL)
- rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
- else
- rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
-
- pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
- req->wb_context,
- req_offset(req),
- rd_size,
- IOMODE_READ,
- GFP_KERNEL);
+ if (pgio->pg_lseg == NULL) {
+ if (pgio->pg_dreq == NULL)
+ rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
+ else
+ rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
+
+ pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+ req->wb_context,
+ req_offset(req),
+ rd_size,
+ IOMODE_READ,
+ GFP_KERNEL);
+ }
/* If no lseg, fall back to read through mds */
if (pgio->pg_lseg == NULL)
nfs_pageio_reset_read_mds(pgio);
@@ -1469,27 +1736,36 @@ void
pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req, u64 wb_size)
{
- WARN_ON_ONCE(pgio->pg_lseg != NULL);
-
- pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
- req->wb_context,
- req_offset(req),
- wb_size,
- IOMODE_RW,
- GFP_NOFS);
+ if (pgio->pg_lseg == NULL)
+ pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
+ req->wb_context,
+ req_offset(req),
+ wb_size,
+ IOMODE_RW,
+ GFP_NOFS);
/* If no lseg, fall back to write through mds */
if (pgio->pg_lseg == NULL)
nfs_pageio_reset_write_mds(pgio);
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);
+void
+pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *desc)
+{
+ if (desc->pg_lseg) {
+ pnfs_put_lseg(desc->pg_lseg);
+ desc->pg_lseg = NULL;
+ }
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_cleanup);
+
/*
* Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
* of bytes (maximum @req->wb_bytes) that can be coalesced.
*/
size_t
-pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
- struct nfs_page *req)
+pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *prev, struct nfs_page *req)
{
unsigned int size;
u64 seg_end, req_start, seg_left;
@@ -1513,10 +1789,16 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
seg_end = end_offset(pgio->pg_lseg->pls_range.offset,
pgio->pg_lseg->pls_range.length);
req_start = req_offset(req);
- WARN_ON_ONCE(req_start > seg_end);
+ WARN_ON_ONCE(req_start >= seg_end);
/* start of request is past the last byte of this segment */
- if (req_start >= seg_end)
+ if (req_start >= seg_end) {
+ /* reference the new lseg */
+ if (pgio->pg_ops->pg_cleanup)
+ pgio->pg_ops->pg_cleanup(pgio);
+ if (pgio->pg_ops->pg_init)
+ pgio->pg_ops->pg_init(pgio, req);
return 0;
+ }
/* adjust 'size' iff there are fewer bytes left in the
* segment than what nfs_generic_pg_test returned */
@@ -1571,10 +1853,12 @@ static void
pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
struct nfs_pgio_header *hdr)
{
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
- list_splice_tail_init(&hdr->pages, &desc->pg_list);
+ list_splice_tail_init(&hdr->pages, &mirror->pg_list);
nfs_pageio_reset_write_mds(desc);
- desc->pg_recoalesce = 1;
+ mirror->pg_recoalesce = 1;
}
nfs_pgio_data_destroy(hdr);
}
@@ -1608,11 +1892,9 @@ pnfs_do_write(struct nfs_pageio_descriptor *desc,
struct pnfs_layout_segment *lseg = desc->pg_lseg;
enum pnfs_try_status trypnfs;
- desc->pg_lseg = NULL;
trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
if (trypnfs == PNFS_NOT_ATTEMPTED)
pnfs_write_through_mds(desc, hdr);
- pnfs_put_lseg(lseg);
}
static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
@@ -1625,24 +1907,23 @@ EXPORT_SYMBOL_GPL(pnfs_writehdr_free);
int
pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
{
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
struct nfs_pgio_header *hdr;
int ret;
hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
if (!hdr) {
- desc->pg_completion_ops->error_cleanup(&desc->pg_list);
- pnfs_put_lseg(desc->pg_lseg);
- desc->pg_lseg = NULL;
+ desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
return -ENOMEM;
}
nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
+
hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
ret = nfs_generic_pgio(desc, hdr);
- if (ret != 0) {
- pnfs_put_lseg(desc->pg_lseg);
- desc->pg_lseg = NULL;
- } else
+ if (!ret)
pnfs_do_write(desc, hdr, desc->pg_ioflags);
+
return ret;
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
@@ -1687,10 +1968,12 @@ static void
pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
struct nfs_pgio_header *hdr)
{
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
- list_splice_tail_init(&hdr->pages, &desc->pg_list);
+ list_splice_tail_init(&hdr->pages, &mirror->pg_list);
nfs_pageio_reset_read_mds(desc);
- desc->pg_recoalesce = 1;
+ mirror->pg_recoalesce = 1;
}
nfs_pgio_data_destroy(hdr);
}
@@ -1719,18 +2002,29 @@ pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
return trypnfs;
}
+/* Resend all requests through pnfs. */
+int pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
+{
+ struct nfs_pageio_descriptor pgio;
+
+ nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops);
+ return nfs_pageio_resend(&pgio, hdr);
+}
+EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs);
+
static void
pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
{
const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
struct pnfs_layout_segment *lseg = desc->pg_lseg;
enum pnfs_try_status trypnfs;
+ int err = 0;
- desc->pg_lseg = NULL;
trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
- if (trypnfs == PNFS_NOT_ATTEMPTED)
+ if (trypnfs == PNFS_TRY_AGAIN)
+ err = pnfs_read_resend_pnfs(hdr);
+ if (trypnfs == PNFS_NOT_ATTEMPTED || err)
pnfs_read_through_mds(desc, hdr);
- pnfs_put_lseg(lseg);
}
static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
@@ -1743,24 +2037,20 @@ EXPORT_SYMBOL_GPL(pnfs_readhdr_free);
int
pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
{
+ struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
struct nfs_pgio_header *hdr;
int ret;
hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
if (!hdr) {
- desc->pg_completion_ops->error_cleanup(&desc->pg_list);
- ret = -ENOMEM;
- pnfs_put_lseg(desc->pg_lseg);
- desc->pg_lseg = NULL;
- return ret;
+ desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
+ return -ENOMEM;
}
nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
ret = nfs_generic_pgio(desc, hdr);
- if (ret != 0) {
- pnfs_put_lseg(desc->pg_lseg);
- desc->pg_lseg = NULL;
- } else
+ if (!ret)
pnfs_do_read(desc, hdr);
return ret;
}
@@ -1966,6 +2256,7 @@ clear_layoutcommitting:
pnfs_clear_layoutcommitting(inode);
goto out;
}
+EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode);
struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
{
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 9ae5b765b073..635f0865671c 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -38,6 +38,25 @@ enum {
NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
NFS_LSEG_ROC, /* roc bit received from server */
NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */
+ NFS_LSEG_LAYOUTRETURN, /* layoutreturn bit set for layoutreturn */
+};
+
+/* Individual ip address */
+struct nfs4_pnfs_ds_addr {
+ struct sockaddr_storage da_addr;
+ size_t da_addrlen;
+ struct list_head da_node; /* nfs4_pnfs_dev_hlist dev_dslist */
+ char *da_remotestr; /* human readable addr+port */
+};
+
+struct nfs4_pnfs_ds {
+ struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */
+ char *ds_remotestr; /* comma sep list of addrs */
+ struct list_head ds_addrs;
+ struct nfs_client *ds_clp;
+ atomic_t ds_count;
+ unsigned long ds_state;
+#define NFS4DS_CONNECTING 0 /* ds is establishing connection */
};
struct pnfs_layout_segment {
@@ -53,19 +72,34 @@ struct pnfs_layout_segment {
enum pnfs_try_status {
PNFS_ATTEMPTED = 0,
PNFS_NOT_ATTEMPTED = 1,
+ PNFS_TRY_AGAIN = 2,
};
#ifdef CONFIG_NFS_V4_1
#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
+/*
+ * Default data server connection timeout and retrans vaules.
+ * Set by module parameters dataserver_timeo and dataserver_retrans.
+ */
+#define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */
+#define NFS4_DEF_DS_RETRANS 5
+
+/* error codes for internal use */
+#define NFS4ERR_RESET_TO_MDS 12001
+#define NFS4ERR_RESET_TO_PNFS 12002
+
enum {
NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
NFS_LAYOUT_ROC, /* some lseg had roc bit set */
NFS_LAYOUT_RETURN, /* Return this layout ASAP */
+ NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */
NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
+ NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */
+ NFS_LAYOUT_RETRY_LAYOUTGET, /* Retry layoutget */
};
enum layoutdriver_policy_flags {
@@ -106,7 +140,8 @@ struct pnfs_layoutdriver_type {
struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode);
void (*mark_request_commit) (struct nfs_page *req,
struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo);
+ struct nfs_commit_info *cinfo,
+ u32 ds_commit_idx);
void (*clear_request_commit) (struct nfs_page *req,
struct nfs_commit_info *cinfo);
int (*scan_commit_lists) (struct nfs_commit_info *cinfo,
@@ -154,6 +189,7 @@ struct pnfs_layout_hdr {
u32 plh_barrier; /* ignore lower seqids */
unsigned long plh_retry_timestamp;
unsigned long plh_flags;
+ enum pnfs_iomode plh_return_iomode;
loff_t plh_lwb; /* last write byte for layoutcommit */
struct rpc_cred *plh_lc_cred; /* layoutcommit cred */
struct inode *plh_inode;
@@ -185,7 +221,7 @@ extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
struct pnfs_device *dev,
struct rpc_cred *cred);
extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags);
-extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp);
+extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync);
/* pnfs.c */
void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);
@@ -198,6 +234,7 @@ void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *
int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req, u64 wb_size);
+void pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *);
int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc);
size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
struct nfs_page *prev, struct nfs_page *req);
@@ -217,6 +254,7 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
bool update_barrier);
int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
struct pnfs_layout_hdr *lo,
+ struct pnfs_layout_range *range,
struct nfs4_state *open_state);
int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
@@ -233,17 +271,21 @@ int _pnfs_return_layout(struct inode *);
int pnfs_commit_and_return_layout(struct inode *);
void pnfs_ld_write_done(struct nfs_pgio_header *);
void pnfs_ld_read_done(struct nfs_pgio_header *);
+int pnfs_read_resend_pnfs(struct nfs_pgio_header *);
struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
struct nfs_open_context *ctx,
loff_t pos,
u64 count,
enum pnfs_iomode iomode,
gfp_t gfp_flags);
+void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo);
void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *);
int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
+void pnfs_error_mark_layout_for_return(struct inode *inode,
+ struct pnfs_layout_segment *lseg);
/* nfs4_deviceid_flags */
enum {
@@ -275,6 +317,43 @@ void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node);
bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
void nfs4_deviceid_purge_client(const struct nfs_client *);
+/* pnfs_nfs.c */
+void pnfs_generic_clear_request_commit(struct nfs_page *req,
+ struct nfs_commit_info *cinfo);
+void pnfs_generic_commit_release(void *calldata);
+void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data);
+void pnfs_generic_rw_release(void *data);
+void pnfs_generic_recover_commit_reqs(struct list_head *dst,
+ struct nfs_commit_info *cinfo);
+int pnfs_generic_commit_pagelist(struct inode *inode,
+ struct list_head *mds_pages,
+ int how,
+ struct nfs_commit_info *cinfo,
+ int (*initiate_commit)(struct nfs_commit_data *data,
+ int how));
+int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max);
+void pnfs_generic_write_commit_done(struct rpc_task *task, void *data);
+void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds);
+struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs,
+ gfp_t gfp_flags);
+void nfs4_pnfs_v3_ds_connect_unload(void);
+void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
+ struct nfs4_deviceid_node *devid, unsigned int timeo,
+ unsigned int retrans, u32 version, u32 minor_version,
+ rpc_authflavor_t au_flavor);
+struct nfs4_pnfs_ds_addr *nfs4_decode_mp_ds_addr(struct net *net,
+ struct xdr_stream *xdr,
+ gfp_t gfp_flags);
+void pnfs_layout_mark_request_commit(struct nfs_page *req,
+ struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo,
+ u32 ds_commit_idx);
+
+static inline bool nfs_have_layout(struct inode *inode)
+{
+ return NFS_I(inode)->layout != NULL;
+}
+
static inline struct nfs4_deviceid_node *
nfs4_get_deviceid(struct nfs4_deviceid_node *d)
{
@@ -282,6 +361,26 @@ nfs4_get_deviceid(struct nfs4_deviceid_node *d)
return d;
}
+static inline void pnfs_set_retry_layoutget(struct pnfs_layout_hdr *lo)
+{
+ if (!test_and_set_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags))
+ atomic_inc(&lo->plh_refcount);
+}
+
+static inline void pnfs_clear_retry_layoutget(struct pnfs_layout_hdr *lo)
+{
+ if (test_and_clear_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags)) {
+ atomic_dec(&lo->plh_refcount);
+ /* wake up waiters for LAYOUTRETURN as that is not needed */
+ wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
+ }
+}
+
+static inline bool pnfs_should_retry_layoutget(struct pnfs_layout_hdr *lo)
+{
+ return test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags);
+}
+
static inline struct pnfs_layout_segment *
pnfs_get_lseg(struct pnfs_layout_segment *lseg)
{
@@ -317,16 +416,22 @@ pnfs_get_ds_info(struct inode *inode)
return ld->get_ds_info(inode);
}
+static inline void
+pnfs_generic_mark_devid_invalid(struct nfs4_deviceid_node *node)
+{
+ set_bit(NFS_DEVICEID_INVALID, &node->flags);
+}
+
static inline bool
pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo)
+ struct nfs_commit_info *cinfo, u32 ds_commit_idx)
{
struct inode *inode = req->wb_context->dentry->d_inode;
struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
if (lseg == NULL || ld->mark_request_commit == NULL)
return false;
- ld->mark_request_commit(req, lseg, cinfo);
+ ld->mark_request_commit(req, lseg, cinfo, ds_commit_idx);
return true;
}
@@ -352,15 +457,6 @@ pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
return NFS_SERVER(inode)->pnfs_curr_ld->scan_commit_lists(cinfo, max);
}
-static inline void
-pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
- struct nfs_commit_info *cinfo)
-{
- if (cinfo->ds == NULL || cinfo->ds->nwritten == 0)
- return;
- NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
-}
-
static inline struct nfs_page *
pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
struct page *page)
@@ -427,6 +523,11 @@ static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id)
#endif /* NFS_DEBUG */
#else /* CONFIG_NFS_V4_1 */
+static inline bool nfs_have_layout(struct inode *inode)
+{
+ return false;
+}
+
static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
{
}
@@ -513,7 +614,7 @@ pnfs_get_ds_info(struct inode *inode)
static inline bool
pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo)
+ struct nfs_commit_info *cinfo, u32 ds_commit_idx)
{
return false;
}
@@ -531,12 +632,6 @@ pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
return 0;
}
-static inline void
-pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
- struct nfs_commit_info *cinfo)
-{
-}
-
static inline struct nfs_page *
pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
struct page *page)
@@ -568,6 +663,10 @@ static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
return NULL;
}
+static inline void nfs4_pnfs_v3_ds_connect_unload(void)
+{
+}
+
#endif /* CONFIG_NFS_V4_1 */
#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
new file mode 100644
index 000000000000..54e36b38fb5f
--- /dev/null
+++ b/fs/nfs/pnfs_nfs.c
@@ -0,0 +1,870 @@
+/*
+ * Common NFS I/O operations for the pnfs file based
+ * layout drivers.
+ *
+ * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
+ *
+ * Tom Haynes <loghyr@primarydata.com>
+ */
+
+#include <linux/nfs_fs.h>
+#include <linux/nfs_page.h>
+#include <linux/sunrpc/addr.h>
+#include <linux/module.h>
+
+#include "nfs4session.h"
+#include "internal.h"
+#include "pnfs.h"
+
+#define NFSDBG_FACILITY NFSDBG_PNFS
+
+void pnfs_generic_rw_release(void *data)
+{
+ struct nfs_pgio_header *hdr = data;
+
+ nfs_put_client(hdr->ds_clp);
+ hdr->mds_ops->rpc_release(data);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_rw_release);
+
+/* Fake up some data that will cause nfs_commit_release to retry the writes. */
+void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data)
+{
+ struct nfs_page *first = nfs_list_entry(data->pages.next);
+
+ data->task.tk_status = 0;
+ memcpy(&data->verf.verifier, &first->wb_verf,
+ sizeof(data->verf.verifier));
+ data->verf.verifier.data[0]++; /* ensure verifier mismatch */
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_prepare_to_resend_writes);
+
+void pnfs_generic_write_commit_done(struct rpc_task *task, void *data)
+{
+ struct nfs_commit_data *wdata = data;
+
+ /* Note this may cause RPC to be resent */
+ wdata->mds_ops->rpc_call_done(task, data);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_write_commit_done);
+
+void pnfs_generic_commit_release(void *calldata)
+{
+ struct nfs_commit_data *data = calldata;
+
+ data->completion_ops->completion(data);
+ pnfs_put_lseg(data->lseg);
+ nfs_put_client(data->ds_clp);
+ nfs_commitdata_release(data);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_commit_release);
+
+/* The generic layer is about to remove the req from the commit list.
+ * If this will make the bucket empty, it will need to put the lseg reference.
+ * Note this must be called holding the inode (/cinfo) lock
+ */
+void
+pnfs_generic_clear_request_commit(struct nfs_page *req,
+ struct nfs_commit_info *cinfo)
+{
+ struct pnfs_layout_segment *freeme = NULL;
+
+ if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
+ goto out;
+ cinfo->ds->nwritten--;
+ if (list_is_singular(&req->wb_list)) {
+ struct pnfs_commit_bucket *bucket;
+
+ bucket = list_first_entry(&req->wb_list,
+ struct pnfs_commit_bucket,
+ written);
+ freeme = bucket->wlseg;
+ bucket->wlseg = NULL;
+ }
+out:
+ nfs_request_remove_commit_list(req, cinfo);
+ pnfs_put_lseg_locked(freeme);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_clear_request_commit);
+
+static int
+pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst,
+ struct nfs_commit_info *cinfo, int max)
+{
+ struct nfs_page *req, *tmp;
+ int ret = 0;
+
+ list_for_each_entry_safe(req, tmp, src, wb_list) {
+ if (!nfs_lock_request(req))
+ continue;
+ kref_get(&req->wb_kref);
+ if (cond_resched_lock(cinfo->lock))
+ list_safe_reset_next(req, tmp, wb_list);
+ nfs_request_remove_commit_list(req, cinfo);
+ clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
+ nfs_list_add_request(req, dst);
+ ret++;
+ if ((ret == max) && !cinfo->dreq)
+ break;
+ }
+ return ret;
+}
+
+static int
+pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
+ struct nfs_commit_info *cinfo,
+ int max)
+{
+ struct list_head *src = &bucket->written;
+ struct list_head *dst = &bucket->committing;
+ int ret;
+
+ lockdep_assert_held(cinfo->lock);
+ ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max);
+ if (ret) {
+ cinfo->ds->nwritten -= ret;
+ cinfo->ds->ncommitting += ret;
+ bucket->clseg = bucket->wlseg;
+ if (list_empty(src))
+ bucket->wlseg = NULL;
+ else
+ pnfs_get_lseg(bucket->clseg);
+ }
+ return ret;
+}
+
+/* Move reqs from written to committing lists, returning count
+ * of number moved.
+ */
+int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo,
+ int max)
+{
+ int i, rv = 0, cnt;
+
+ lockdep_assert_held(cinfo->lock);
+ for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
+ cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i],
+ cinfo, max);
+ max -= cnt;
+ rv += cnt;
+ }
+ return rv;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_scan_commit_lists);
+
+/* Pull everything off the committing lists and dump into @dst. */
+void pnfs_generic_recover_commit_reqs(struct list_head *dst,
+ struct nfs_commit_info *cinfo)
+{
+ struct pnfs_commit_bucket *b;
+ struct pnfs_layout_segment *freeme;
+ int i;
+
+ lockdep_assert_held(cinfo->lock);
+restart:
+ for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
+ if (pnfs_generic_transfer_commit_list(&b->written, dst,
+ cinfo, 0)) {
+ freeme = b->wlseg;
+ b->wlseg = NULL;
+ spin_unlock(cinfo->lock);
+ pnfs_put_lseg(freeme);
+ spin_lock(cinfo->lock);
+ goto restart;
+ }
+ }
+ cinfo->ds->nwritten = 0;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs);
+
+static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
+{
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+ struct pnfs_commit_bucket *bucket;
+ struct pnfs_layout_segment *freeme;
+ int i;
+
+ for (i = idx; i < fl_cinfo->nbuckets; i++) {
+ bucket = &fl_cinfo->buckets[i];
+ if (list_empty(&bucket->committing))
+ continue;
+ nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo, i);
+ spin_lock(cinfo->lock);
+ freeme = bucket->clseg;
+ bucket->clseg = NULL;
+ spin_unlock(cinfo->lock);
+ pnfs_put_lseg(freeme);
+ }
+}
+
+static unsigned int
+pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
+ struct list_head *list)
+{
+ struct pnfs_ds_commit_info *fl_cinfo;
+ struct pnfs_commit_bucket *bucket;
+ struct nfs_commit_data *data;
+ int i;
+ unsigned int nreq = 0;
+
+ fl_cinfo = cinfo->ds;
+ bucket = fl_cinfo->buckets;
+ for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
+ if (list_empty(&bucket->committing))
+ continue;
+ data = nfs_commitdata_alloc();
+ if (!data)
+ break;
+ data->ds_commit_index = i;
+ spin_lock(cinfo->lock);
+ data->lseg = bucket->clseg;
+ bucket->clseg = NULL;
+ spin_unlock(cinfo->lock);
+ list_add(&data->pages, list);
+ nreq++;
+ }
+
+ /* Clean up on error */
+ pnfs_generic_retry_commit(cinfo, i);
+ return nreq;
+}
+
+/* This follows nfs_commit_list pretty closely */
+int
+pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
+ int how, struct nfs_commit_info *cinfo,
+ int (*initiate_commit)(struct nfs_commit_data *data,
+ int how))
+{
+ struct nfs_commit_data *data, *tmp;
+ LIST_HEAD(list);
+ unsigned int nreq = 0;
+
+ if (!list_empty(mds_pages)) {
+ data = nfs_commitdata_alloc();
+ if (data != NULL) {
+ data->lseg = NULL;
+ list_add(&data->pages, &list);
+ nreq++;
+ } else {
+ nfs_retry_commit(mds_pages, NULL, cinfo, 0);
+ pnfs_generic_retry_commit(cinfo, 0);
+ cinfo->completion_ops->error_cleanup(NFS_I(inode));
+ return -ENOMEM;
+ }
+ }
+
+ nreq += pnfs_generic_alloc_ds_commits(cinfo, &list);
+
+ if (nreq == 0) {
+ cinfo->completion_ops->error_cleanup(NFS_I(inode));
+ goto out;
+ }
+
+ atomic_add(nreq, &cinfo->mds->rpcs_out);
+
+ list_for_each_entry_safe(data, tmp, &list, pages) {
+ list_del_init(&data->pages);
+ if (!data->lseg) {
+ nfs_init_commit(data, mds_pages, NULL, cinfo);
+ nfs_initiate_commit(NFS_CLIENT(inode), data,
+ NFS_PROTO(data->inode),
+ data->mds_ops, how, 0);
+ } else {
+ struct pnfs_commit_bucket *buckets;
+
+ buckets = cinfo->ds->buckets;
+ nfs_init_commit(data,
+ &buckets[data->ds_commit_index].committing,
+ data->lseg,
+ cinfo);
+ initiate_commit(data, how);
+ }
+ }
+out:
+ cinfo->ds->ncommitting = 0;
+ return PNFS_ATTEMPTED;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_commit_pagelist);
+
+/*
+ * Data server cache
+ *
+ * Data servers can be mapped to different device ids.
+ * nfs4_pnfs_ds reference counting
+ * - set to 1 on allocation
+ * - incremented when a device id maps a data server already in the cache.
+ * - decremented when deviceid is removed from the cache.
+ */
+static DEFINE_SPINLOCK(nfs4_ds_cache_lock);
+static LIST_HEAD(nfs4_data_server_cache);
+
+/* Debug routines */
+static void
+print_ds(struct nfs4_pnfs_ds *ds)
+{
+ if (ds == NULL) {
+ printk(KERN_WARNING "%s NULL device\n", __func__);
+ return;
+ }
+ printk(KERN_WARNING " ds %s\n"
+ " ref count %d\n"
+ " client %p\n"
+ " cl_exchange_flags %x\n",
+ ds->ds_remotestr,
+ atomic_read(&ds->ds_count), ds->ds_clp,
+ ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
+}
+
+static bool
+same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
+{
+ struct sockaddr_in *a, *b;
+ struct sockaddr_in6 *a6, *b6;
+
+ if (addr1->sa_family != addr2->sa_family)
+ return false;
+
+ switch (addr1->sa_family) {
+ case AF_INET:
+ a = (struct sockaddr_in *)addr1;
+ b = (struct sockaddr_in *)addr2;
+
+ if (a->sin_addr.s_addr == b->sin_addr.s_addr &&
+ a->sin_port == b->sin_port)
+ return true;
+ break;
+
+ case AF_INET6:
+ a6 = (struct sockaddr_in6 *)addr1;
+ b6 = (struct sockaddr_in6 *)addr2;
+
+ /* LINKLOCAL addresses must have matching scope_id */
+ if (ipv6_addr_src_scope(&a6->sin6_addr) ==
+ IPV6_ADDR_SCOPE_LINKLOCAL &&
+ a6->sin6_scope_id != b6->sin6_scope_id)
+ return false;
+
+ if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) &&
+ a6->sin6_port == b6->sin6_port)
+ return true;
+ break;
+
+ default:
+ dprintk("%s: unhandled address family: %u\n",
+ __func__, addr1->sa_family);
+ return false;
+ }
+
+ return false;
+}
+
+static bool
+_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
+ const struct list_head *dsaddrs2)
+{
+ struct nfs4_pnfs_ds_addr *da1, *da2;
+
+ /* step through both lists, comparing as we go */
+ for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
+ da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
+ da1 != NULL && da2 != NULL;
+ da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
+ da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
+ if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
+ (struct sockaddr *)&da2->da_addr))
+ return false;
+ }
+ if (da1 == NULL && da2 == NULL)
+ return true;
+
+ return false;
+}
+
+/*
+ * Lookup DS by addresses. nfs4_ds_cache_lock is held
+ */
+static struct nfs4_pnfs_ds *
+_data_server_lookup_locked(const struct list_head *dsaddrs)
+{
+ struct nfs4_pnfs_ds *ds;
+
+ list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
+ if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
+ return ds;
+ return NULL;
+}
+
+static void destroy_ds(struct nfs4_pnfs_ds *ds)
+{
+ struct nfs4_pnfs_ds_addr *da;
+
+ dprintk("--> %s\n", __func__);
+ ifdebug(FACILITY)
+ print_ds(ds);
+
+ nfs_put_client(ds->ds_clp);
+
+ while (!list_empty(&ds->ds_addrs)) {
+ da = list_first_entry(&ds->ds_addrs,
+ struct nfs4_pnfs_ds_addr,
+ da_node);
+ list_del_init(&da->da_node);
+ kfree(da->da_remotestr);
+ kfree(da);
+ }
+
+ kfree(ds->ds_remotestr);
+ kfree(ds);
+}
+
+void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds)
+{
+ if (atomic_dec_and_lock(&ds->ds_count,
+ &nfs4_ds_cache_lock)) {
+ list_del_init(&ds->ds_node);
+ spin_unlock(&nfs4_ds_cache_lock);
+ destroy_ds(ds);
+ }
+}
+EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_put);
+
+/*
+ * Create a string with a human readable address and port to avoid
+ * complicated setup around many dprinks.
+ */
+static char *
+nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags)
+{
+ struct nfs4_pnfs_ds_addr *da;
+ char *remotestr;
+ size_t len;
+ char *p;
+
+ len = 3; /* '{', '}' and eol */
+ list_for_each_entry(da, dsaddrs, da_node) {
+ len += strlen(da->da_remotestr) + 1; /* string plus comma */
+ }
+
+ remotestr = kzalloc(len, gfp_flags);
+ if (!remotestr)
+ return NULL;
+
+ p = remotestr;
+ *(p++) = '{';
+ len--;
+ list_for_each_entry(da, dsaddrs, da_node) {
+ size_t ll = strlen(da->da_remotestr);
+
+ if (ll > len)
+ goto out_err;
+
+ memcpy(p, da->da_remotestr, ll);
+ p += ll;
+ len -= ll;
+
+ if (len < 1)
+ goto out_err;
+ (*p++) = ',';
+ len--;
+ }
+ if (len < 2)
+ goto out_err;
+ *(p++) = '}';
+ *p = '\0';
+ return remotestr;
+out_err:
+ kfree(remotestr);
+ return NULL;
+}
+
+/*
+ * Given a list of multipath struct nfs4_pnfs_ds_addr, add it to ds cache if
+ * uncached and return cached struct nfs4_pnfs_ds.
+ */
+struct nfs4_pnfs_ds *
+nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
+{
+ struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;
+ char *remotestr;
+
+ if (list_empty(dsaddrs)) {
+ dprintk("%s: no addresses defined\n", __func__);
+ goto out;
+ }
+
+ ds = kzalloc(sizeof(*ds), gfp_flags);
+ if (!ds)
+ goto out;
+
+ /* this is only used for debugging, so it's ok if its NULL */
+ remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);
+
+ spin_lock(&nfs4_ds_cache_lock);
+ tmp_ds = _data_server_lookup_locked(dsaddrs);
+ if (tmp_ds == NULL) {
+ INIT_LIST_HEAD(&ds->ds_addrs);
+ list_splice_init(dsaddrs, &ds->ds_addrs);
+ ds->ds_remotestr = remotestr;
+ atomic_set(&ds->ds_count, 1);
+ INIT_LIST_HEAD(&ds->ds_node);
+ ds->ds_clp = NULL;
+ list_add(&ds->ds_node, &nfs4_data_server_cache);
+ dprintk("%s add new data server %s\n", __func__,
+ ds->ds_remotestr);
+ } else {
+ kfree(remotestr);
+ kfree(ds);
+ atomic_inc(&tmp_ds->ds_count);
+ dprintk("%s data server %s found, inc'ed ds_count to %d\n",
+ __func__, tmp_ds->ds_remotestr,
+ atomic_read(&tmp_ds->ds_count));
+ ds = tmp_ds;
+ }
+ spin_unlock(&nfs4_ds_cache_lock);
+out:
+ return ds;
+}
+EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_add);
+
+static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
+{
+ might_sleep();
+ wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING,
+ TASK_KILLABLE);
+}
+
+static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
+{
+ smp_mb__before_atomic();
+ clear_bit(NFS4DS_CONNECTING, &ds->ds_state);
+ smp_mb__after_atomic();
+ wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);
+}
+
+static struct nfs_client *(*get_v3_ds_connect)(
+ struct nfs_client *mds_clp,
+ const struct sockaddr *ds_addr,
+ int ds_addrlen,
+ int ds_proto,
+ unsigned int ds_timeo,
+ unsigned int ds_retrans,
+ rpc_authflavor_t au_flavor);
+
+static bool load_v3_ds_connect(void)
+{
+ if (!get_v3_ds_connect) {
+ get_v3_ds_connect = symbol_request(nfs3_set_ds_client);
+ WARN_ON_ONCE(!get_v3_ds_connect);
+ }
+
+ return(get_v3_ds_connect != NULL);
+}
+
+void __exit nfs4_pnfs_v3_ds_connect_unload(void)
+{
+ if (get_v3_ds_connect) {
+ symbol_put(nfs3_set_ds_client);
+ get_v3_ds_connect = NULL;
+ }
+}
+EXPORT_SYMBOL_GPL(nfs4_pnfs_v3_ds_connect_unload);
+
+static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
+ struct nfs4_pnfs_ds *ds,
+ unsigned int timeo,
+ unsigned int retrans,
+ rpc_authflavor_t au_flavor)
+{
+ struct nfs_client *clp = ERR_PTR(-EIO);
+ struct nfs4_pnfs_ds_addr *da;
+ int status = 0;
+
+ dprintk("--> %s DS %s au_flavor %d\n", __func__,
+ ds->ds_remotestr, au_flavor);
+
+ if (!load_v3_ds_connect())
+ goto out;
+
+ list_for_each_entry(da, &ds->ds_addrs, da_node) {
+ dprintk("%s: DS %s: trying address %s\n",
+ __func__, ds->ds_remotestr, da->da_remotestr);
+
+ clp = get_v3_ds_connect(mds_srv->nfs_client,
+ (struct sockaddr *)&da->da_addr,
+ da->da_addrlen, IPPROTO_TCP,
+ timeo, retrans, au_flavor);
+ if (!IS_ERR(clp))
+ break;
+ }
+
+ if (IS_ERR(clp)) {
+ status = PTR_ERR(clp);
+ goto out;
+ }
+
+ smp_wmb();
+ ds->ds_clp = clp;
+ dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
+out:
+ return status;
+}
+
+static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
+ struct nfs4_pnfs_ds *ds,
+ unsigned int timeo,
+ unsigned int retrans,
+ u32 minor_version,
+ rpc_authflavor_t au_flavor)
+{
+ struct nfs_client *clp = ERR_PTR(-EIO);
+ struct nfs4_pnfs_ds_addr *da;
+ int status = 0;
+
+ dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
+ au_flavor);
+
+ list_for_each_entry(da, &ds->ds_addrs, da_node) {
+ dprintk("%s: DS %s: trying address %s\n",
+ __func__, ds->ds_remotestr, da->da_remotestr);
+
+ clp = nfs4_set_ds_client(mds_srv->nfs_client,
+ (struct sockaddr *)&da->da_addr,
+ da->da_addrlen, IPPROTO_TCP,
+ timeo, retrans, minor_version,
+ au_flavor);
+ if (!IS_ERR(clp))
+ break;
+ }
+
+ if (IS_ERR(clp)) {
+ status = PTR_ERR(clp);
+ goto out;
+ }
+
+ status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time);
+ if (status)
+ goto out_put;
+
+ smp_wmb();
+ ds->ds_clp = clp;
+ dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
+out:
+ return status;
+out_put:
+ nfs_put_client(clp);
+ goto out;
+}
+
+/*
+ * Create an rpc connection to the nfs4_pnfs_ds data server.
+ * Currently only supports IPv4 and IPv6 addresses.
+ * If connection fails, make devid unavailable.
+ */
+void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
+ struct nfs4_deviceid_node *devid, unsigned int timeo,
+ unsigned int retrans, u32 version,
+ u32 minor_version, rpc_authflavor_t au_flavor)
+{
+ if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) {
+ int err = 0;
+
+ if (version == 3) {
+ err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo,
+ retrans, au_flavor);
+ } else if (version == 4) {
+ err = _nfs4_pnfs_v4_ds_connect(mds_srv, ds, timeo,
+ retrans, minor_version,
+ au_flavor);
+ } else {
+ dprintk("%s: unsupported DS version %d\n", __func__,
+ version);
+ err = -EPROTONOSUPPORT;
+ }
+
+ if (err)
+ nfs4_mark_deviceid_unavailable(devid);
+ nfs4_clear_ds_conn_bit(ds);
+ } else {
+ nfs4_wait_ds_connect(ds);
+ }
+}
+EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_connect);
+
+/*
+ * Currently only supports ipv4, ipv6 and one multi-path address.
+ */
+struct nfs4_pnfs_ds_addr *
+nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags)
+{
+ struct nfs4_pnfs_ds_addr *da = NULL;
+ char *buf, *portstr;
+ __be16 port;
+ int nlen, rlen;
+ int tmp[2];
+ __be32 *p;
+ char *netid, *match_netid;
+ size_t len, match_netid_len;
+ char *startsep = "";
+ char *endsep = "";
+
+
+ /* r_netid */
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_err;
+ nlen = be32_to_cpup(p++);
+
+ p = xdr_inline_decode(xdr, nlen);
+ if (unlikely(!p))
+ goto out_err;
+
+ netid = kmalloc(nlen+1, gfp_flags);
+ if (unlikely(!netid))
+ goto out_err;
+
+ netid[nlen] = '\0';
+ memcpy(netid, p, nlen);
+
+ /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p))
+ goto out_free_netid;
+ rlen = be32_to_cpup(p);
+
+ p = xdr_inline_decode(xdr, rlen);
+ if (unlikely(!p))
+ goto out_free_netid;
+
+ /* port is ".ABC.DEF", 8 chars max */
+ if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) {
+ dprintk("%s: Invalid address, length %d\n", __func__,
+ rlen);
+ goto out_free_netid;
+ }
+ buf = kmalloc(rlen + 1, gfp_flags);
+ if (!buf) {
+ dprintk("%s: Not enough memory\n", __func__);
+ goto out_free_netid;
+ }
+ buf[rlen] = '\0';
+ memcpy(buf, p, rlen);
+
+ /* replace port '.' with '-' */
+ portstr = strrchr(buf, '.');
+ if (!portstr) {
+ dprintk("%s: Failed finding expected dot in port\n",
+ __func__);
+ goto out_free_buf;
+ }
+ *portstr = '-';
+
+ /* find '.' between address and port */
+ portstr = strrchr(buf, '.');
+ if (!portstr) {
+ dprintk("%s: Failed finding expected dot between address and "
+ "port\n", __func__);
+ goto out_free_buf;
+ }
+ *portstr = '\0';
+
+ da = kzalloc(sizeof(*da), gfp_flags);
+ if (unlikely(!da))
+ goto out_free_buf;
+
+ INIT_LIST_HEAD(&da->da_node);
+
+ if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr,
+ sizeof(da->da_addr))) {
+ dprintk("%s: error parsing address %s\n", __func__, buf);
+ goto out_free_da;
+ }
+
+ portstr++;
+ sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]);
+ port = htons((tmp[0] << 8) | (tmp[1]));
+
+ switch (da->da_addr.ss_family) {
+ case AF_INET:
+ ((struct sockaddr_in *)&da->da_addr)->sin_port = port;
+ da->da_addrlen = sizeof(struct sockaddr_in);
+ match_netid = "tcp";
+ match_netid_len = 3;
+ break;
+
+ case AF_INET6:
+ ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port;
+ da->da_addrlen = sizeof(struct sockaddr_in6);
+ match_netid = "tcp6";
+ match_netid_len = 4;
+ startsep = "[";
+ endsep = "]";
+ break;
+
+ default:
+ dprintk("%s: unsupported address family: %u\n",
+ __func__, da->da_addr.ss_family);
+ goto out_free_da;
+ }
+
+ if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) {
+ dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n",
+ __func__, netid, match_netid);
+ goto out_free_da;
+ }
+
+ /* save human readable address */
+ len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7;
+ da->da_remotestr = kzalloc(len, gfp_flags);
+
+ /* NULL is ok, only used for dprintk */
+ if (da->da_remotestr)
+ snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep,
+ buf, endsep, ntohs(port));
+
+ dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr);
+ kfree(buf);
+ kfree(netid);
+ return da;
+
+out_free_da:
+ kfree(da);
+out_free_buf:
+ dprintk("%s: Error parsing DS addr: %s\n", __func__, buf);
+ kfree(buf);
+out_free_netid:
+ kfree(netid);
+out_err:
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(nfs4_decode_mp_ds_addr);
+
+void
+pnfs_layout_mark_request_commit(struct nfs_page *req,
+ struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo,
+ u32 ds_commit_idx)
+{
+ struct list_head *list;
+ struct pnfs_commit_bucket *buckets;
+
+ spin_lock(cinfo->lock);
+ buckets = cinfo->ds->buckets;
+ list = &buckets[ds_commit_idx].written;
+ if (list_empty(list)) {
+ /* Non-empty buckets hold a reference on the lseg. That ref
+ * is normally transferred to the COMMIT call and released
+ * there. It could also be released if the last req is pulled
+ * off due to a rewrite, in which case it will be done in
+ * pnfs_common_clear_request_commit
+ */
+ WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL);
+ buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg);
+ }
+ set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
+ cinfo->ds->nwritten++;
+ spin_unlock(cinfo->lock);
+
+ nfs_request_add_commit_list(req, list, cinfo);
+}
+EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index c91a4799c562..568ecf0a880f 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -70,8 +70,15 @@ EXPORT_SYMBOL_GPL(nfs_pageio_init_read);
void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
{
+ struct nfs_pgio_mirror *mirror;
+
pgio->pg_ops = &nfs_pgio_rw_ops;
- pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize;
+
+ /* read path should never have more than one mirror */
+ WARN_ON_ONCE(pgio->pg_mirror_count != 1);
+
+ mirror = &pgio->pg_mirrors[0];
+ mirror->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize;
}
EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
@@ -81,6 +88,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
struct nfs_page *new;
unsigned int len;
struct nfs_pageio_descriptor pgio;
+ struct nfs_pgio_mirror *pgm;
len = nfs_page_length(page);
if (len == 0)
@@ -97,7 +105,13 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
&nfs_async_read_completion_ops);
nfs_pageio_add_request(&pgio, new);
nfs_pageio_complete(&pgio);
- NFS_I(inode)->read_io += pgio.pg_bytes_written;
+
+ /* It doesn't make sense to do mirrored reads! */
+ WARN_ON_ONCE(pgio.pg_mirror_count != 1);
+
+ pgm = &pgio.pg_mirrors[0];
+ NFS_I(inode)->read_io += pgm->pg_bytes_written;
+
return 0;
}
@@ -168,13 +182,14 @@ out:
static void nfs_initiate_read(struct nfs_pgio_header *hdr,
struct rpc_message *msg,
+ const struct nfs_rpc_ops *rpc_ops,
struct rpc_task_setup *task_setup_data, int how)
{
struct inode *inode = hdr->inode;
int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
task_setup_data->flags |= swap_flags;
- NFS_PROTO(inode)->read_setup(hdr, msg);
+ rpc_ops->read_setup(hdr, msg);
}
static void
@@ -351,6 +366,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
struct nfs_pageio_descriptor pgio;
+ struct nfs_pgio_mirror *pgm;
struct nfs_readdesc desc = {
.pgio = &pgio,
};
@@ -386,10 +402,15 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
&nfs_async_read_completion_ops);
ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
-
nfs_pageio_complete(&pgio);
- NFS_I(inode)->read_io += pgio.pg_bytes_written;
- npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+ /* It doesn't make sense to do mirrored reads! */
+ WARN_ON_ONCE(pgio.pg_mirror_count != 1);
+
+ pgm = &pgio.pg_mirrors[0];
+ NFS_I(inode)->read_io += pgm->pg_bytes_written;
+ npages = (pgm->pg_bytes_written + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
nfs_add_stats(inode, NFSIOS_READPAGES, npages);
read_complete:
put_nfs_open_context(desc.ctx);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 31a11b0e885d..322b2de02988 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -311,7 +311,6 @@ const struct super_operations nfs_sops = {
.destroy_inode = nfs_destroy_inode,
.write_inode = nfs_write_inode,
.drop_inode = nfs_drop_inode,
- .put_super = nfs_put_super,
.statfs = nfs_statfs,
.evict_inode = nfs_evict_inode,
.umount_begin = nfs_umount_begin,
@@ -405,12 +404,15 @@ void __exit unregister_nfs_fs(void)
unregister_filesystem(&nfs_fs_type);
}
-void nfs_sb_active(struct super_block *sb)
+bool nfs_sb_active(struct super_block *sb)
{
struct nfs_server *server = NFS_SB(sb);
- if (atomic_inc_return(&server->active) == 1)
- atomic_inc(&sb->s_active);
+ if (!atomic_inc_not_zero(&sb->s_active))
+ return false;
+ if (atomic_inc_return(&server->active) != 1)
+ atomic_dec(&sb->s_active);
+ return true;
}
EXPORT_SYMBOL_GPL(nfs_sb_active);
@@ -2569,7 +2571,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
error = nfs_bdi_register(server);
if (error) {
mntroot = ERR_PTR(error);
- goto error_splat_bdi;
+ goto error_splat_super;
}
server->super = s;
}
@@ -2601,9 +2603,6 @@ error_splat_root:
dput(mntroot);
mntroot = ERR_PTR(error);
error_splat_super:
- if (server && !s->s_root)
- bdi_unregister(&server->backing_dev_info);
-error_splat_bdi:
deactivate_locked_super(s);
goto out;
}
@@ -2651,27 +2650,19 @@ out:
EXPORT_SYMBOL_GPL(nfs_fs_mount);
/*
- * Ensure that we unregister the bdi before kill_anon_super
- * releases the device name
- */
-void nfs_put_super(struct super_block *s)
-{
- struct nfs_server *server = NFS_SB(s);
-
- bdi_unregister(&server->backing_dev_info);
-}
-EXPORT_SYMBOL_GPL(nfs_put_super);
-
-/*
* Destroy an NFS2/3 superblock
*/
void nfs_kill_super(struct super_block *s)
{
struct nfs_server *server = NFS_SB(s);
+ dev_t dev = s->s_dev;
+
+ generic_shutdown_super(s);
- kill_anon_super(s);
nfs_fscache_release_super_cookie(s);
+
nfs_free_server(server);
+ free_anon_bdev(dev);
}
EXPORT_SYMBOL_GPL(nfs_kill_super);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index af3af685a9e3..595d81e354d1 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -473,13 +473,18 @@ try_again:
do {
/*
* Subrequests are always contiguous, non overlapping
- * and in order. If not, it's a programming error.
+ * and in order - but may be repeated (mirrored writes).
*/
- WARN_ON_ONCE(subreq->wb_offset !=
- (head->wb_offset + total_bytes));
-
- /* keep track of how many bytes this group covers */
- total_bytes += subreq->wb_bytes;
+ if (subreq->wb_offset == (head->wb_offset + total_bytes)) {
+ /* keep track of how many bytes this group covers */
+ total_bytes += subreq->wb_bytes;
+ } else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset ||
+ ((subreq->wb_offset + subreq->wb_bytes) >
+ (head->wb_offset + total_bytes)))) {
+ nfs_page_group_unlock(head);
+ spin_unlock(&inode->i_lock);
+ return ERR_PTR(-EIO);
+ }
if (!nfs_lock_request(subreq)) {
/* releases page group bit lock and
@@ -784,13 +789,8 @@ nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
nfs_list_add_request(req, dst);
cinfo->mds->ncommit++;
spin_unlock(cinfo->lock);
- if (!cinfo->dreq) {
- inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
- inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
- BDI_RECLAIMABLE);
- __mark_inode_dirty(req->wb_context->dentry->d_inode,
- I_DIRTY_DATASYNC);
- }
+ if (!cinfo->dreq)
+ nfs_mark_page_unstable(req->wb_page);
}
EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
@@ -842,9 +842,9 @@ EXPORT_SYMBOL_GPL(nfs_init_cinfo);
*/
void
nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo)
+ struct nfs_commit_info *cinfo, u32 ds_commit_idx)
{
- if (pnfs_mark_request_commit(req, lseg, cinfo))
+ if (pnfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx))
return;
nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo);
}
@@ -853,7 +853,7 @@ static void
nfs_clear_page_commit(struct page *page)
{
dec_zone_page_state(page, NR_UNSTABLE_NFS);
- dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE);
+ dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE);
}
/* Called holding inode (/cinfo) lock */
@@ -900,7 +900,8 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
}
if (nfs_write_need_commit(hdr)) {
memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf));
- nfs_mark_request_commit(req, hdr->lseg, &cinfo);
+ nfs_mark_request_commit(req, hdr->lseg, &cinfo,
+ hdr->pgio_mirror_idx);
goto next;
}
remove_req:
@@ -1091,6 +1092,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
{
struct nfs_open_context *ctx = nfs_file_open_context(file);
struct nfs_lock_context *l_ctx;
+ struct file_lock_context *flctx = file_inode(file)->i_flctx;
struct nfs_page *req;
int do_flush, status;
/*
@@ -1109,7 +1111,9 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
do_flush = req->wb_page != page || req->wb_context != ctx;
/* for now, flush if more than 1 request in page_group */
do_flush |= req->wb_this_page != req;
- if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) {
+ if (l_ctx && flctx &&
+ !(list_empty_careful(&flctx->flc_posix) &&
+ list_empty_careful(&flctx->flc_flock))) {
do_flush |= l_ctx->lockowner.l_owner != current->files
|| l_ctx->lockowner.l_pid != current->tgid;
}
@@ -1170,6 +1174,13 @@ out:
return PageUptodate(page) != 0;
}
+static bool
+is_whole_file_wrlock(struct file_lock *fl)
+{
+ return fl->fl_start == 0 && fl->fl_end == OFFSET_MAX &&
+ fl->fl_type == F_WRLCK;
+}
+
/* If we know the page is up to date, and we're not using byte range locks (or
* if we have the whole file locked for writing), it may be more efficient to
* extend the write to cover the entire page in order to avoid fragmentation
@@ -1180,17 +1191,36 @@ out:
*/
static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode)
{
+ int ret;
+ struct file_lock_context *flctx = inode->i_flctx;
+ struct file_lock *fl;
+
if (file->f_flags & O_DSYNC)
return 0;
if (!nfs_write_pageuptodate(page, inode))
return 0;
if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
return 1;
- if (inode->i_flock == NULL || (inode->i_flock->fl_start == 0 &&
- inode->i_flock->fl_end == OFFSET_MAX &&
- inode->i_flock->fl_type != F_RDLCK))
- return 1;
- return 0;
+ if (!flctx || (list_empty_careful(&flctx->flc_flock) &&
+ list_empty_careful(&flctx->flc_posix)))
+ return 0;
+
+ /* Check to see if there are whole file write locks */
+ ret = 0;
+ spin_lock(&flctx->flc_lock);
+ if (!list_empty(&flctx->flc_posix)) {
+ fl = list_first_entry(&flctx->flc_posix, struct file_lock,
+ fl_list);
+ if (is_whole_file_wrlock(fl))
+ ret = 1;
+ } else if (!list_empty(&flctx->flc_flock)) {
+ fl = list_first_entry(&flctx->flc_flock, struct file_lock,
+ fl_list);
+ if (fl->fl_type == F_WRLCK)
+ ret = 1;
+ }
+ spin_unlock(&flctx->flc_lock);
+ return ret;
}
/*
@@ -1240,15 +1270,15 @@ static int flush_task_priority(int how)
static void nfs_initiate_write(struct nfs_pgio_header *hdr,
struct rpc_message *msg,
+ const struct nfs_rpc_ops *rpc_ops,
struct rpc_task_setup *task_setup_data, int how)
{
- struct inode *inode = hdr->inode;
int priority = flush_task_priority(how);
task_setup_data->priority = priority;
- NFS_PROTO(inode)->write_setup(hdr, msg);
+ rpc_ops->write_setup(hdr, msg);
- nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client,
+ nfs4_state_protect_write(NFS_SERVER(hdr->inode)->nfs_client,
&task_setup_data->rpc_client, msg, hdr);
}
@@ -1298,8 +1328,14 @@ EXPORT_SYMBOL_GPL(nfs_pageio_init_write);
void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
{
+ struct nfs_pgio_mirror *mirror;
+
pgio->pg_ops = &nfs_pgio_rw_ops;
- pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize;
+
+ nfs_pageio_stop_mirroring(pgio);
+
+ mirror = &pgio->pg_mirrors[0];
+ mirror->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize;
}
EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds);
@@ -1465,6 +1501,7 @@ void nfs_commitdata_release(struct nfs_commit_data *data)
EXPORT_SYMBOL_GPL(nfs_commitdata_release);
int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
+ const struct nfs_rpc_ops *nfs_ops,
const struct rpc_call_ops *call_ops,
int how, int flags)
{
@@ -1486,7 +1523,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
.priority = priority,
};
/* Set up the initial task struct. */
- NFS_PROTO(data->inode)->commit_setup(data, &msg);
+ nfs_ops->commit_setup(data, &msg);
dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
@@ -1554,19 +1591,17 @@ EXPORT_SYMBOL_GPL(nfs_init_commit);
void nfs_retry_commit(struct list_head *page_list,
struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo)
+ struct nfs_commit_info *cinfo,
+ u32 ds_commit_idx)
{
struct nfs_page *req;
while (!list_empty(page_list)) {
req = nfs_list_entry(page_list->next);
nfs_list_remove_request(req);
- nfs_mark_request_commit(req, lseg, cinfo);
- if (!cinfo->dreq) {
- dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
- dec_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
- BDI_RECLAIMABLE);
- }
+ nfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx);
+ if (!cinfo->dreq)
+ nfs_clear_page_commit(req->wb_page);
nfs_unlock_and_release_request(req);
}
}
@@ -1589,10 +1624,10 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
/* Set up the argument struct */
nfs_init_commit(data, head, NULL, cinfo);
atomic_inc(&cinfo->mds->rpcs_out);
- return nfs_initiate_commit(NFS_CLIENT(inode), data, data->mds_ops,
- how, 0);
+ return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode),
+ data->mds_ops, how, 0);
out_bad:
- nfs_retry_commit(head, NULL, cinfo);
+ nfs_retry_commit(head, NULL, cinfo, 0);
cinfo->completion_ops->error_cleanup(NFS_I(inode));
return -ENOMEM;
}
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 73395156bdb4..683bf718aead 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -82,6 +82,16 @@ config NFSD_V4
If unsure, say N.
+config NFSD_PNFS
+ bool "NFSv4.1 server support for Parallel NFS (pNFS)"
+ depends on NFSD_V4
+ help
+ This option enables support for the parallel NFS features of the
+ minor version 1 of the NFSv4 protocol (RFC5661) in the kernel's NFS
+ server.
+
+ If unsure, say N.
+
config NFSD_V4_SECURITY_LABEL
bool "Provide Security Label support for NFSv4 server"
depends on NFSD_V4 && SECURITY
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index af32ef06b4fe..9a6028e120c6 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -2,9 +2,14 @@
# Makefile for the Linux nfs server
#
+ccflags-y += -I$(src) # needed for trace events
+
obj-$(CONFIG_NFSD) += nfsd.o
-nfsd-y := nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
+# this one should be compiled first, as the tracing macros can easily blow up
+nfsd-y += trace.o
+
+nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o
nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o
nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
@@ -12,3 +17,4 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
nfs4acl.o nfs4callback.o nfs4recover.o
+nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o blocklayout.o blocklayoutxdr.o
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
new file mode 100644
index 000000000000..cdbc78c72542
--- /dev/null
+++ b/fs/nfsd/blocklayout.c
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include <linux/exportfs.h>
+#include <linux/genhd.h>
+#include <linux/slab.h>
+
+#include <linux/nfsd/debug.h>
+
+#include "blocklayoutxdr.h"
+#include "pnfs.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_PNFS
+
+
+static int
+nfsd4_block_get_device_info_simple(struct super_block *sb,
+ struct nfsd4_getdeviceinfo *gdp)
+{
+ struct pnfs_block_deviceaddr *dev;
+ struct pnfs_block_volume *b;
+
+ dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
+ sizeof(struct pnfs_block_volume), GFP_KERNEL);
+ if (!dev)
+ return -ENOMEM;
+ gdp->gd_device = dev;
+
+ dev->nr_volumes = 1;
+ b = &dev->volumes[0];
+
+ b->type = PNFS_BLOCK_VOLUME_SIMPLE;
+ b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
+ return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
+ &b->simple.offset);
+}
+
+static __be32
+nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
+ struct nfsd4_getdeviceinfo *gdp)
+{
+ if (sb->s_bdev != sb->s_bdev->bd_contains)
+ return nfserr_inval;
+ return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
+}
+
+static __be32
+nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
+ struct nfsd4_layoutget *args)
+{
+ struct nfsd4_layout_seg *seg = &args->lg_seg;
+ struct super_block *sb = inode->i_sb;
+ u32 block_size = (1 << inode->i_blkbits);
+ struct pnfs_block_extent *bex;
+ struct iomap iomap;
+ u32 device_generation = 0;
+ int error;
+
+ /*
+ * We do not attempt to support I/O smaller than the fs block size,
+ * or not aligned to it.
+ */
+ if (args->lg_minlength < block_size) {
+ dprintk("pnfsd: I/O too small\n");
+ goto out_layoutunavailable;
+ }
+ if (seg->offset & (block_size - 1)) {
+ dprintk("pnfsd: I/O misaligned\n");
+ goto out_layoutunavailable;
+ }
+
+ /*
+ * Some clients barf on non-zero block numbers for NONE or INVALID
+ * layouts, so make sure to zero the whole structure.
+ */
+ error = -ENOMEM;
+ bex = kzalloc(sizeof(*bex), GFP_KERNEL);
+ if (!bex)
+ goto out_error;
+ args->lg_content = bex;
+
+ error = sb->s_export_op->map_blocks(inode, seg->offset, seg->length,
+ &iomap, seg->iomode != IOMODE_READ,
+ &device_generation);
+ if (error) {
+ if (error == -ENXIO)
+ goto out_layoutunavailable;
+ goto out_error;
+ }
+
+ if (iomap.length < args->lg_minlength) {
+ dprintk("pnfsd: extent smaller than minlength\n");
+ goto out_layoutunavailable;
+ }
+
+ switch (iomap.type) {
+ case IOMAP_MAPPED:
+ if (seg->iomode == IOMODE_READ)
+ bex->es = PNFS_BLOCK_READ_DATA;
+ else
+ bex->es = PNFS_BLOCK_READWRITE_DATA;
+ bex->soff = (iomap.blkno << 9);
+ break;
+ case IOMAP_UNWRITTEN:
+ if (seg->iomode & IOMODE_RW) {
+ /*
+ * Crack monkey special case from section 2.3.1.
+ */
+ if (args->lg_minlength == 0) {
+ dprintk("pnfsd: no soup for you!\n");
+ goto out_layoutunavailable;
+ }
+
+ bex->es = PNFS_BLOCK_INVALID_DATA;
+ bex->soff = (iomap.blkno << 9);
+ break;
+ }
+ /*FALLTHRU*/
+ case IOMAP_HOLE:
+ if (seg->iomode == IOMODE_READ) {
+ bex->es = PNFS_BLOCK_NONE_DATA;
+ break;
+ }
+ /*FALLTHRU*/
+ case IOMAP_DELALLOC:
+ default:
+ WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type);
+ goto out_layoutunavailable;
+ }
+
+ error = nfsd4_set_deviceid(&bex->vol_id, fhp, device_generation);
+ if (error)
+ goto out_error;
+ bex->foff = iomap.offset;
+ bex->len = iomap.length;
+
+ seg->offset = iomap.offset;
+ seg->length = iomap.length;
+
+ dprintk("GET: %lld:%lld %d\n", bex->foff, bex->len, bex->es);
+ return 0;
+
+out_error:
+ seg->length = 0;
+ return nfserrno(error);
+out_layoutunavailable:
+ seg->length = 0;
+ return nfserr_layoutunavailable;
+}
+
+static __be32
+nfsd4_block_proc_layoutcommit(struct inode *inode,
+ struct nfsd4_layoutcommit *lcp)
+{
+ loff_t new_size = lcp->lc_last_wr + 1;
+ struct iattr iattr = { .ia_valid = 0 };
+ struct iomap *iomaps;
+ int nr_iomaps;
+ int error;
+
+ nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
+ lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
+ if (nr_iomaps < 0)
+ return nfserrno(nr_iomaps);
+
+ if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
+ timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
+ lcp->lc_mtime = current_fs_time(inode->i_sb);
+ iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME;
+ iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime;
+
+ if (new_size > i_size_read(inode)) {
+ iattr.ia_valid |= ATTR_SIZE;
+ iattr.ia_size = new_size;
+ }
+
+ error = inode->i_sb->s_export_op->commit_blocks(inode, iomaps,
+ nr_iomaps, &iattr);
+ kfree(iomaps);
+ return nfserrno(error);
+}
+
+const struct nfsd4_layout_ops bl_layout_ops = {
+ .proc_getdeviceinfo = nfsd4_block_proc_getdeviceinfo,
+ .encode_getdeviceinfo = nfsd4_block_encode_getdeviceinfo,
+ .proc_layoutget = nfsd4_block_proc_layoutget,
+ .encode_layoutget = nfsd4_block_encode_layoutget,
+ .proc_layoutcommit = nfsd4_block_proc_layoutcommit,
+};
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
new file mode 100644
index 000000000000..9da89fddab33
--- /dev/null
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include <linux/sunrpc/svc.h>
+#include <linux/exportfs.h>
+#include <linux/nfs4.h>
+
+#include "nfsd.h"
+#include "blocklayoutxdr.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_PNFS
+
+
+__be32
+nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
+ struct nfsd4_layoutget *lgp)
+{
+ struct pnfs_block_extent *b = lgp->lg_content;
+ int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32);
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, sizeof(__be32) + len);
+ if (!p)
+ return nfserr_toosmall;
+
+ *p++ = cpu_to_be32(len);
+ *p++ = cpu_to_be32(1); /* we always return a single extent */
+
+ p = xdr_encode_opaque_fixed(p, &b->vol_id,
+ sizeof(struct nfsd4_deviceid));
+ p = xdr_encode_hyper(p, b->foff);
+ p = xdr_encode_hyper(p, b->len);
+ p = xdr_encode_hyper(p, b->soff);
+ *p++ = cpu_to_be32(b->es);
+ return 0;
+}
+
+static int
+nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
+{
+ __be32 *p;
+ int len;
+
+ switch (b->type) {
+ case PNFS_BLOCK_VOLUME_SIMPLE:
+ len = 4 + 4 + 8 + 4 + b->simple.sig_len;
+ p = xdr_reserve_space(xdr, len);
+ if (!p)
+ return -ETOOSMALL;
+
+ *p++ = cpu_to_be32(b->type);
+ *p++ = cpu_to_be32(1); /* single signature */
+ p = xdr_encode_hyper(p, b->simple.offset);
+ p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len);
+ break;
+ default:
+ return -ENOTSUPP;
+ }
+
+ return len;
+}
+
+__be32
+nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
+ struct nfsd4_getdeviceinfo *gdp)
+{
+ struct pnfs_block_deviceaddr *dev = gdp->gd_device;
+ int len = sizeof(__be32), ret, i;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, len + sizeof(__be32));
+ if (!p)
+ return nfserr_resource;
+
+ for (i = 0; i < dev->nr_volumes; i++) {
+ ret = nfsd4_block_encode_volume(xdr, &dev->volumes[i]);
+ if (ret < 0)
+ return nfserrno(ret);
+ len += ret;
+ }
+
+ /*
+ * Fill in the overall length and number of volumes at the beginning
+ * of the layout.
+ */
+ *p++ = cpu_to_be32(len);
+ *p++ = cpu_to_be32(dev->nr_volumes);
+ return 0;
+}
+
+int
+nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
+ u32 block_size)
+{
+ struct iomap *iomaps;
+ u32 nr_iomaps, expected, i;
+
+ if (len < sizeof(u32)) {
+ dprintk("%s: extent array too small: %u\n", __func__, len);
+ return -EINVAL;
+ }
+
+ nr_iomaps = be32_to_cpup(p++);
+ expected = sizeof(__be32) + nr_iomaps * NFS4_BLOCK_EXTENT_SIZE;
+ if (len != expected) {
+ dprintk("%s: extent array size mismatch: %u/%u\n",
+ __func__, len, expected);
+ return -EINVAL;
+ }
+
+ iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
+ if (!iomaps) {
+ dprintk("%s: failed to allocate extent array\n", __func__);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < nr_iomaps; i++) {
+ struct pnfs_block_extent bex;
+
+ memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid));
+ p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid));
+
+ p = xdr_decode_hyper(p, &bex.foff);
+ if (bex.foff & (block_size - 1)) {
+ dprintk("%s: unaligned offset %lld\n",
+ __func__, bex.foff);
+ goto fail;
+ }
+ p = xdr_decode_hyper(p, &bex.len);
+ if (bex.len & (block_size - 1)) {
+ dprintk("%s: unaligned length %lld\n",
+ __func__, bex.foff);
+ goto fail;
+ }
+ p = xdr_decode_hyper(p, &bex.soff);
+ if (bex.soff & (block_size - 1)) {
+ dprintk("%s: unaligned disk offset %lld\n",
+ __func__, bex.soff);
+ goto fail;
+ }
+ bex.es = be32_to_cpup(p++);
+ if (bex.es != PNFS_BLOCK_READWRITE_DATA) {
+ dprintk("%s: incorrect extent state %d\n",
+ __func__, bex.es);
+ goto fail;
+ }
+
+ iomaps[i].offset = bex.foff;
+ iomaps[i].length = bex.len;
+ }
+
+ *iomapp = iomaps;
+ return nr_iomaps;
+fail:
+ kfree(iomaps);
+ return -EINVAL;
+}
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
new file mode 100644
index 000000000000..fdc79037c0e7
--- /dev/null
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -0,0 +1,62 @@
+#ifndef _NFSD_BLOCKLAYOUTXDR_H
+#define _NFSD_BLOCKLAYOUTXDR_H 1
+
+#include <linux/blkdev.h>
+#include "xdr4.h"
+
+struct iomap;
+struct xdr_stream;
+
+enum pnfs_block_extent_state {
+ PNFS_BLOCK_READWRITE_DATA = 0,
+ PNFS_BLOCK_READ_DATA = 1,
+ PNFS_BLOCK_INVALID_DATA = 2,
+ PNFS_BLOCK_NONE_DATA = 3,
+};
+
+struct pnfs_block_extent {
+ struct nfsd4_deviceid vol_id;
+ u64 foff;
+ u64 len;
+ u64 soff;
+ enum pnfs_block_extent_state es;
+};
+#define NFS4_BLOCK_EXTENT_SIZE 44
+
+enum pnfs_block_volume_type {
+ PNFS_BLOCK_VOLUME_SIMPLE = 0,
+ PNFS_BLOCK_VOLUME_SLICE = 1,
+ PNFS_BLOCK_VOLUME_CONCAT = 2,
+ PNFS_BLOCK_VOLUME_STRIPE = 3,
+};
+
+/*
+ * Random upper cap for the uuid length to avoid unbounded allocation.
+ * Not actually limited by the protocol.
+ */
+#define PNFS_BLOCK_UUID_LEN 128
+
+struct pnfs_block_volume {
+ enum pnfs_block_volume_type type;
+ union {
+ struct {
+ u64 offset;
+ u32 sig_len;
+ u8 sig[PNFS_BLOCK_UUID_LEN];
+ } simple;
+ };
+};
+
+struct pnfs_block_deviceaddr {
+ u32 nr_volumes;
+ struct pnfs_block_volume volumes[];
+};
+
+__be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
+ struct nfsd4_getdeviceinfo *gdp);
+__be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
+ struct nfsd4_layoutget *lgp);
+int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
+ u32 block_size);
+
+#endif /* _NFSD_BLOCKLAYOUTXDR_H */
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 30a739d896ff..c3e3b6e55ae2 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -20,6 +20,7 @@
#include "nfsd.h"
#include "nfsfh.h"
#include "netns.h"
+#include "pnfs.h"
#define NFSDDBG_FACILITY NFSDDBG_EXPORT
@@ -545,6 +546,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
exp.ex_client = dom;
exp.cd = cd;
+ exp.ex_devid_map = NULL;
/* expiry */
err = -EINVAL;
@@ -621,6 +623,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
if (!gid_valid(exp.ex_anon_gid))
goto out4;
err = 0;
+
+ nfsd4_setup_layout_type(&exp);
}
expp = svc_export_lookup(&exp);
@@ -703,6 +707,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
new->ex_fslocs.locations = NULL;
new->ex_fslocs.locations_count = 0;
new->ex_fslocs.migrated = 0;
+ new->ex_layout_type = 0;
new->ex_uuid = NULL;
new->cd = item->cd;
}
@@ -717,6 +722,8 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
new->ex_anon_uid = item->ex_anon_uid;
new->ex_anon_gid = item->ex_anon_gid;
new->ex_fsid = item->ex_fsid;
+ new->ex_devid_map = item->ex_devid_map;
+ item->ex_devid_map = NULL;
new->ex_uuid = item->ex_uuid;
item->ex_uuid = NULL;
new->ex_fslocs.locations = item->ex_fslocs.locations;
@@ -725,6 +732,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
item->ex_fslocs.locations_count = 0;
new->ex_fslocs.migrated = item->ex_fslocs.migrated;
item->ex_fslocs.migrated = 0;
+ new->ex_layout_type = item->ex_layout_type;
new->ex_nflavors = item->ex_nflavors;
for (i = 0; i < MAX_SECINFO_LIST; i++) {
new->ex_flavors[i] = item->ex_flavors[i];
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index 04dc8c167b0c..1f52bfcc436f 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -56,6 +56,8 @@ struct svc_export {
struct nfsd4_fs_locations ex_fslocs;
uint32_t ex_nflavors;
struct exp_flavor_info ex_flavors[MAX_SECINFO_LIST];
+ enum pnfs_layouttype ex_layout_type;
+ struct nfsd4_deviceid_map *ex_devid_map;
struct cache_detail *cd;
};
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 7cbdf1b2e4ab..58277859a467 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -546,6 +546,102 @@ out:
return status;
}
+#ifdef CONFIG_NFSD_PNFS
+/*
+ * CB_LAYOUTRECALL4args
+ *
+ * struct layoutrecall_file4 {
+ * nfs_fh4 lor_fh;
+ * offset4 lor_offset;
+ * length4 lor_length;
+ * stateid4 lor_stateid;
+ * };
+ *
+ * union layoutrecall4 switch(layoutrecall_type4 lor_recalltype) {
+ * case LAYOUTRECALL4_FILE:
+ * layoutrecall_file4 lor_layout;
+ * case LAYOUTRECALL4_FSID:
+ * fsid4 lor_fsid;
+ * case LAYOUTRECALL4_ALL:
+ * void;
+ * };
+ *
+ * struct CB_LAYOUTRECALL4args {
+ * layouttype4 clora_type;
+ * layoutiomode4 clora_iomode;
+ * bool clora_changed;
+ * layoutrecall4 clora_recall;
+ * };
+ */
+static void encode_cb_layout4args(struct xdr_stream *xdr,
+ const struct nfs4_layout_stateid *ls,
+ struct nfs4_cb_compound_hdr *hdr)
+{
+ __be32 *p;
+
+ BUG_ON(hdr->minorversion == 0);
+
+ p = xdr_reserve_space(xdr, 5 * 4);
+ *p++ = cpu_to_be32(OP_CB_LAYOUTRECALL);
+ *p++ = cpu_to_be32(ls->ls_layout_type);
+ *p++ = cpu_to_be32(IOMODE_ANY);
+ *p++ = cpu_to_be32(1);
+ *p = cpu_to_be32(RETURN_FILE);
+
+ encode_nfs_fh4(xdr, &ls->ls_stid.sc_file->fi_fhandle);
+
+ p = xdr_reserve_space(xdr, 2 * 8);
+ p = xdr_encode_hyper(p, 0);
+ xdr_encode_hyper(p, NFS4_MAX_UINT64);
+
+ encode_stateid4(xdr, &ls->ls_recall_sid);
+
+ hdr->nops++;
+}
+
+static void nfs4_xdr_enc_cb_layout(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ const struct nfsd4_callback *cb)
+{
+ const struct nfs4_layout_stateid *ls =
+ container_of(cb, struct nfs4_layout_stateid, ls_recall);
+ struct nfs4_cb_compound_hdr hdr = {
+ .ident = 0,
+ .minorversion = cb->cb_minorversion,
+ };
+
+ encode_cb_compound4args(xdr, &hdr);
+ encode_cb_sequence4args(xdr, cb, &hdr);
+ encode_cb_layout4args(xdr, ls, &hdr);
+ encode_cb_nops(&hdr);
+}
+
+static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct nfsd4_callback *cb)
+{
+ struct nfs4_cb_compound_hdr hdr;
+ enum nfsstat4 nfserr;
+ int status;
+
+ status = decode_cb_compound4res(xdr, &hdr);
+ if (unlikely(status))
+ goto out;
+ if (cb) {
+ status = decode_cb_sequence4res(xdr, cb);
+ if (unlikely(status))
+ goto out;
+ }
+ status = decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &nfserr);
+ if (unlikely(status))
+ goto out;
+ if (unlikely(nfserr != NFS4_OK))
+ status = nfs_cb_stat_to_errno(nfserr);
+out:
+ return status;
+}
+#endif /* CONFIG_NFSD_PNFS */
+
/*
* RPC procedure tables
*/
@@ -563,6 +659,9 @@ out:
static struct rpc_procinfo nfs4_cb_procedures[] = {
PROC(CB_NULL, NULL, cb_null, cb_null),
PROC(CB_RECALL, COMPOUND, cb_recall, cb_recall),
+#ifdef CONFIG_NFSD_PNFS
+ PROC(CB_LAYOUT, COMPOUND, cb_layout, cb_layout),
+#endif
};
static struct rpc_version nfs_cb_version4 = {
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
new file mode 100644
index 000000000000..3c1bfa155571
--- /dev/null
+++ b/fs/nfsd/nfs4layouts.c
@@ -0,0 +1,721 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include <linux/kmod.h>
+#include <linux/file.h>
+#include <linux/jhash.h>
+#include <linux/sched.h>
+#include <linux/sunrpc/addr.h>
+
+#include "pnfs.h"
+#include "netns.h"
+#include "trace.h"
+
+#define NFSDDBG_FACILITY NFSDDBG_PNFS
+
+struct nfs4_layout {
+ struct list_head lo_perstate;
+ struct nfs4_layout_stateid *lo_state;
+ struct nfsd4_layout_seg lo_seg;
+};
+
+static struct kmem_cache *nfs4_layout_cache;
+static struct kmem_cache *nfs4_layout_stateid_cache;
+
+static struct nfsd4_callback_ops nfsd4_cb_layout_ops;
+static const struct lock_manager_operations nfsd4_layouts_lm_ops;
+
+const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = {
+ [LAYOUT_BLOCK_VOLUME] = &bl_layout_ops,
+};
+
+/* pNFS device ID to export fsid mapping */
+#define DEVID_HASH_BITS 8
+#define DEVID_HASH_SIZE (1 << DEVID_HASH_BITS)
+#define DEVID_HASH_MASK (DEVID_HASH_SIZE - 1)
+static u64 nfsd_devid_seq = 1;
+static struct list_head nfsd_devid_hash[DEVID_HASH_SIZE];
+static DEFINE_SPINLOCK(nfsd_devid_lock);
+
+static inline u32 devid_hashfn(u64 idx)
+{
+ return jhash_2words(idx, idx >> 32, 0) & DEVID_HASH_MASK;
+}
+
+static void
+nfsd4_alloc_devid_map(const struct svc_fh *fhp)
+{
+ const struct knfsd_fh *fh = &fhp->fh_handle;
+ size_t fsid_len = key_len(fh->fh_fsid_type);
+ struct nfsd4_deviceid_map *map, *old;
+ int i;
+
+ map = kzalloc(sizeof(*map) + fsid_len, GFP_KERNEL);
+ if (!map)
+ return;
+
+ map->fsid_type = fh->fh_fsid_type;
+ memcpy(&map->fsid, fh->fh_fsid, fsid_len);
+
+ spin_lock(&nfsd_devid_lock);
+ if (fhp->fh_export->ex_devid_map)
+ goto out_unlock;
+
+ for (i = 0; i < DEVID_HASH_SIZE; i++) {
+ list_for_each_entry(old, &nfsd_devid_hash[i], hash) {
+ if (old->fsid_type != fh->fh_fsid_type)
+ continue;
+ if (memcmp(old->fsid, fh->fh_fsid,
+ key_len(old->fsid_type)))
+ continue;
+
+ fhp->fh_export->ex_devid_map = old;
+ goto out_unlock;
+ }
+ }
+
+ map->idx = nfsd_devid_seq++;
+ list_add_tail_rcu(&map->hash, &nfsd_devid_hash[devid_hashfn(map->idx)]);
+ fhp->fh_export->ex_devid_map = map;
+ map = NULL;
+
+out_unlock:
+ spin_unlock(&nfsd_devid_lock);
+ kfree(map);
+}
+
+struct nfsd4_deviceid_map *
+nfsd4_find_devid_map(int idx)
+{
+ struct nfsd4_deviceid_map *map, *ret = NULL;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(map, &nfsd_devid_hash[devid_hashfn(idx)], hash)
+ if (map->idx == idx)
+ ret = map;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+int
+nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
+ u32 device_generation)
+{
+ if (!fhp->fh_export->ex_devid_map) {
+ nfsd4_alloc_devid_map(fhp);
+ if (!fhp->fh_export->ex_devid_map)
+ return -ENOMEM;
+ }
+
+ id->fsid_idx = fhp->fh_export->ex_devid_map->idx;
+ id->generation = device_generation;
+ id->pad = 0;
+ return 0;
+}
+
+void nfsd4_setup_layout_type(struct svc_export *exp)
+{
+ struct super_block *sb = exp->ex_path.mnt->mnt_sb;
+
+ if (exp->ex_flags & NFSEXP_NOPNFS)
+ return;
+
+ if (sb->s_export_op->get_uuid &&
+ sb->s_export_op->map_blocks &&
+ sb->s_export_op->commit_blocks)
+ exp->ex_layout_type = LAYOUT_BLOCK_VOLUME;
+}
+
+static void
+nfsd4_free_layout_stateid(struct nfs4_stid *stid)
+{
+ struct nfs4_layout_stateid *ls = layoutstateid(stid);
+ struct nfs4_client *clp = ls->ls_stid.sc_client;
+ struct nfs4_file *fp = ls->ls_stid.sc_file;
+
+ trace_layoutstate_free(&ls->ls_stid.sc_stateid);
+
+ spin_lock(&clp->cl_lock);
+ list_del_init(&ls->ls_perclnt);
+ spin_unlock(&clp->cl_lock);
+
+ spin_lock(&fp->fi_lock);
+ list_del_init(&ls->ls_perfile);
+ spin_unlock(&fp->fi_lock);
+
+ vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls);
+ fput(ls->ls_file);
+
+ if (ls->ls_recalled)
+ atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls);
+
+ kmem_cache_free(nfs4_layout_stateid_cache, ls);
+}
+
+static int
+nfsd4_layout_setlease(struct nfs4_layout_stateid *ls)
+{
+ struct file_lock *fl;
+ int status;
+
+ fl = locks_alloc_lock();
+ if (!fl)
+ return -ENOMEM;
+ locks_init_lock(fl);
+ fl->fl_lmops = &nfsd4_layouts_lm_ops;
+ fl->fl_flags = FL_LAYOUT;
+ fl->fl_type = F_RDLCK;
+ fl->fl_end = OFFSET_MAX;
+ fl->fl_owner = ls;
+ fl->fl_pid = current->tgid;
+ fl->fl_file = ls->ls_file;
+
+ status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL);
+ if (status) {
+ locks_free_lock(fl);
+ return status;
+ }
+ BUG_ON(fl != NULL);
+ return 0;
+}
+
+static struct nfs4_layout_stateid *
+nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
+ struct nfs4_stid *parent, u32 layout_type)
+{
+ struct nfs4_client *clp = cstate->clp;
+ struct nfs4_file *fp = parent->sc_file;
+ struct nfs4_layout_stateid *ls;
+ struct nfs4_stid *stp;
+
+ stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache);
+ if (!stp)
+ return NULL;
+ stp->sc_free = nfsd4_free_layout_stateid;
+ get_nfs4_file(fp);
+ stp->sc_file = fp;
+
+ ls = layoutstateid(stp);
+ INIT_LIST_HEAD(&ls->ls_perclnt);
+ INIT_LIST_HEAD(&ls->ls_perfile);
+ spin_lock_init(&ls->ls_lock);
+ INIT_LIST_HEAD(&ls->ls_layouts);
+ ls->ls_layout_type = layout_type;
+ nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops,
+ NFSPROC4_CLNT_CB_LAYOUT);
+
+ if (parent->sc_type == NFS4_DELEG_STID)
+ ls->ls_file = get_file(fp->fi_deleg_file);
+ else
+ ls->ls_file = find_any_file(fp);
+ BUG_ON(!ls->ls_file);
+
+ if (nfsd4_layout_setlease(ls)) {
+ put_nfs4_file(fp);
+ kmem_cache_free(nfs4_layout_stateid_cache, ls);
+ return NULL;
+ }
+
+ spin_lock(&clp->cl_lock);
+ stp->sc_type = NFS4_LAYOUT_STID;
+ list_add(&ls->ls_perclnt, &clp->cl_lo_states);
+ spin_unlock(&clp->cl_lock);
+
+ spin_lock(&fp->fi_lock);
+ list_add(&ls->ls_perfile, &fp->fi_lo_states);
+ spin_unlock(&fp->fi_lock);
+
+ trace_layoutstate_alloc(&ls->ls_stid.sc_stateid);
+ return ls;
+}
+
+__be32
+nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate, stateid_t *stateid,
+ bool create, u32 layout_type, struct nfs4_layout_stateid **lsp)
+{
+ struct nfs4_layout_stateid *ls;
+ struct nfs4_stid *stid;
+ unsigned char typemask = NFS4_LAYOUT_STID;
+ __be32 status;
+
+ if (create)
+ typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID);
+
+ status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid,
+ net_generic(SVC_NET(rqstp), nfsd_net_id));
+ if (status)
+ goto out;
+
+ if (!fh_match(&cstate->current_fh.fh_handle,
+ &stid->sc_file->fi_fhandle)) {
+ status = nfserr_bad_stateid;
+ goto out_put_stid;
+ }
+
+ if (stid->sc_type != NFS4_LAYOUT_STID) {
+ ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type);
+ nfs4_put_stid(stid);
+
+ status = nfserr_jukebox;
+ if (!ls)
+ goto out;
+ } else {
+ ls = container_of(stid, struct nfs4_layout_stateid, ls_stid);
+
+ status = nfserr_bad_stateid;
+ if (stateid->si_generation > stid->sc_stateid.si_generation)
+ goto out_put_stid;
+ if (layout_type != ls->ls_layout_type)
+ goto out_put_stid;
+ }
+
+ *lsp = ls;
+ return 0;
+
+out_put_stid:
+ nfs4_put_stid(stid);
+out:
+ return status;
+}
+
+static void
+nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls)
+{
+ spin_lock(&ls->ls_lock);
+ if (ls->ls_recalled)
+ goto out_unlock;
+
+ ls->ls_recalled = true;
+ atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls);
+ if (list_empty(&ls->ls_layouts))
+ goto out_unlock;
+
+ trace_layout_recall(&ls->ls_stid.sc_stateid);
+
+ atomic_inc(&ls->ls_stid.sc_count);
+ update_stateid(&ls->ls_stid.sc_stateid);
+ memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
+ nfsd4_run_cb(&ls->ls_recall);
+
+out_unlock:
+ spin_unlock(&ls->ls_lock);
+}
+
+static inline u64
+layout_end(struct nfsd4_layout_seg *seg)
+{
+ u64 end = seg->offset + seg->length;
+ return end >= seg->offset ? end : NFS4_MAX_UINT64;
+}
+
+static void
+layout_update_len(struct nfsd4_layout_seg *lo, u64 end)
+{
+ if (end == NFS4_MAX_UINT64)
+ lo->length = NFS4_MAX_UINT64;
+ else
+ lo->length = end - lo->offset;
+}
+
+static bool
+layouts_overlapping(struct nfs4_layout *lo, struct nfsd4_layout_seg *s)
+{
+ if (s->iomode != IOMODE_ANY && s->iomode != lo->lo_seg.iomode)
+ return false;
+ if (layout_end(&lo->lo_seg) <= s->offset)
+ return false;
+ if (layout_end(s) <= lo->lo_seg.offset)
+ return false;
+ return true;
+}
+
+static bool
+layouts_try_merge(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *new)
+{
+ if (lo->iomode != new->iomode)
+ return false;
+ if (layout_end(new) < lo->offset)
+ return false;
+ if (layout_end(lo) < new->offset)
+ return false;
+
+ lo->offset = min(lo->offset, new->offset);
+ layout_update_len(lo, max(layout_end(lo), layout_end(new)));
+ return true;
+}
+
+static __be32
+nfsd4_recall_conflict(struct nfs4_layout_stateid *ls)
+{
+ struct nfs4_file *fp = ls->ls_stid.sc_file;
+ struct nfs4_layout_stateid *l, *n;
+ __be32 nfserr = nfs_ok;
+
+ assert_spin_locked(&fp->fi_lock);
+
+ list_for_each_entry_safe(l, n, &fp->fi_lo_states, ls_perfile) {
+ if (l != ls) {
+ nfsd4_recall_file_layout(l);
+ nfserr = nfserr_recallconflict;
+ }
+ }
+
+ return nfserr;
+}
+
+__be32
+nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls)
+{
+ struct nfsd4_layout_seg *seg = &lgp->lg_seg;
+ struct nfs4_file *fp = ls->ls_stid.sc_file;
+ struct nfs4_layout *lp, *new = NULL;
+ __be32 nfserr;
+
+ spin_lock(&fp->fi_lock);
+ nfserr = nfsd4_recall_conflict(ls);
+ if (nfserr)
+ goto out;
+ spin_lock(&ls->ls_lock);
+ list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) {
+ if (layouts_try_merge(&lp->lo_seg, seg))
+ goto done;
+ }
+ spin_unlock(&ls->ls_lock);
+ spin_unlock(&fp->fi_lock);
+
+ new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL);
+ if (!new)
+ return nfserr_jukebox;
+ memcpy(&new->lo_seg, seg, sizeof(lp->lo_seg));
+ new->lo_state = ls;
+
+ spin_lock(&fp->fi_lock);
+ nfserr = nfsd4_recall_conflict(ls);
+ if (nfserr)
+ goto out;
+ spin_lock(&ls->ls_lock);
+ list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) {
+ if (layouts_try_merge(&lp->lo_seg, seg))
+ goto done;
+ }
+
+ atomic_inc(&ls->ls_stid.sc_count);
+ list_add_tail(&new->lo_perstate, &ls->ls_layouts);
+ new = NULL;
+done:
+ update_stateid(&ls->ls_stid.sc_stateid);
+ memcpy(&lgp->lg_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
+ spin_unlock(&ls->ls_lock);
+out:
+ spin_unlock(&fp->fi_lock);
+ if (new)
+ kmem_cache_free(nfs4_layout_cache, new);
+ return nfserr;
+}
+
+static void
+nfsd4_free_layouts(struct list_head *reaplist)
+{
+ while (!list_empty(reaplist)) {
+ struct nfs4_layout *lp = list_first_entry(reaplist,
+ struct nfs4_layout, lo_perstate);
+
+ list_del(&lp->lo_perstate);
+ nfs4_put_stid(&lp->lo_state->ls_stid);
+ kmem_cache_free(nfs4_layout_cache, lp);
+ }
+}
+
+static void
+nfsd4_return_file_layout(struct nfs4_layout *lp, struct nfsd4_layout_seg *seg,
+ struct list_head *reaplist)
+{
+ struct nfsd4_layout_seg *lo = &lp->lo_seg;
+ u64 end = layout_end(lo);
+
+ if (seg->offset <= lo->offset) {
+ if (layout_end(seg) >= end) {
+ list_move_tail(&lp->lo_perstate, reaplist);
+ return;
+ }
+ end = seg->offset;
+ } else {
+ /* retain the whole layout segment on a split. */
+ if (layout_end(seg) < end) {
+ dprintk("%s: split not supported\n", __func__);
+ return;
+ }
+
+ lo->offset = layout_end(seg);
+ }
+
+ layout_update_len(lo, end);
+}
+
+__be32
+nfsd4_return_file_layouts(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_layoutreturn *lrp)
+{
+ struct nfs4_layout_stateid *ls;
+ struct nfs4_layout *lp, *n;
+ LIST_HEAD(reaplist);
+ __be32 nfserr;
+ int found = 0;
+
+ nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lrp->lr_sid,
+ false, lrp->lr_layout_type,
+ &ls);
+ if (nfserr) {
+ trace_layout_return_lookup_fail(&lrp->lr_sid);
+ return nfserr;
+ }
+
+ spin_lock(&ls->ls_lock);
+ list_for_each_entry_safe(lp, n, &ls->ls_layouts, lo_perstate) {
+ if (layouts_overlapping(lp, &lrp->lr_seg)) {
+ nfsd4_return_file_layout(lp, &lrp->lr_seg, &reaplist);
+ found++;
+ }
+ }
+ if (!list_empty(&ls->ls_layouts)) {
+ if (found) {
+ update_stateid(&ls->ls_stid.sc_stateid);
+ memcpy(&lrp->lr_sid, &ls->ls_stid.sc_stateid,
+ sizeof(stateid_t));
+ }
+ lrp->lrs_present = 1;
+ } else {
+ trace_layoutstate_unhash(&ls->ls_stid.sc_stateid);
+ nfs4_unhash_stid(&ls->ls_stid);
+ lrp->lrs_present = 0;
+ }
+ spin_unlock(&ls->ls_lock);
+
+ nfs4_put_stid(&ls->ls_stid);
+ nfsd4_free_layouts(&reaplist);
+ return nfs_ok;
+}
+
+__be32
+nfsd4_return_client_layouts(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_layoutreturn *lrp)
+{
+ struct nfs4_layout_stateid *ls, *n;
+ struct nfs4_client *clp = cstate->clp;
+ struct nfs4_layout *lp, *t;
+ LIST_HEAD(reaplist);
+
+ lrp->lrs_present = 0;
+
+ spin_lock(&clp->cl_lock);
+ list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) {
+ if (lrp->lr_return_type == RETURN_FSID &&
+ !fh_fsid_match(&ls->ls_stid.sc_file->fi_fhandle,
+ &cstate->current_fh.fh_handle))
+ continue;
+
+ spin_lock(&ls->ls_lock);
+ list_for_each_entry_safe(lp, t, &ls->ls_layouts, lo_perstate) {
+ if (lrp->lr_seg.iomode == IOMODE_ANY ||
+ lrp->lr_seg.iomode == lp->lo_seg.iomode)
+ list_move_tail(&lp->lo_perstate, &reaplist);
+ }
+ spin_unlock(&ls->ls_lock);
+ }
+ spin_unlock(&clp->cl_lock);
+
+ nfsd4_free_layouts(&reaplist);
+ return 0;
+}
+
+static void
+nfsd4_return_all_layouts(struct nfs4_layout_stateid *ls,
+ struct list_head *reaplist)
+{
+ spin_lock(&ls->ls_lock);
+ list_splice_init(&ls->ls_layouts, reaplist);
+ spin_unlock(&ls->ls_lock);
+}
+
+void
+nfsd4_return_all_client_layouts(struct nfs4_client *clp)
+{
+ struct nfs4_layout_stateid *ls, *n;
+ LIST_HEAD(reaplist);
+
+ spin_lock(&clp->cl_lock);
+ list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt)
+ nfsd4_return_all_layouts(ls, &reaplist);
+ spin_unlock(&clp->cl_lock);
+
+ nfsd4_free_layouts(&reaplist);
+}
+
+void
+nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp)
+{
+ struct nfs4_layout_stateid *ls, *n;
+ LIST_HEAD(reaplist);
+
+ spin_lock(&fp->fi_lock);
+ list_for_each_entry_safe(ls, n, &fp->fi_lo_states, ls_perfile) {
+ if (ls->ls_stid.sc_client == clp)
+ nfsd4_return_all_layouts(ls, &reaplist);
+ }
+ spin_unlock(&fp->fi_lock);
+
+ nfsd4_free_layouts(&reaplist);
+}
+
+static void
+nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
+{
+ struct nfs4_client *clp = ls->ls_stid.sc_client;
+ char addr_str[INET6_ADDRSTRLEN];
+ static char *envp[] = {
+ "HOME=/",
+ "TERM=linux",
+ "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+ NULL
+ };
+ char *argv[8];
+ int error;
+
+ rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str));
+
+ nfsd4_cb_layout_fail(ls);
+
+ printk(KERN_WARNING
+ "nfsd: client %s failed to respond to layout recall. "
+ " Fencing..\n", addr_str);
+
+ argv[0] = "/sbin/nfsd-recall-failed";
+ argv[1] = addr_str;
+ argv[2] = ls->ls_file->f_path.mnt->mnt_sb->s_id;
+ argv[3] = NULL;
+
+ error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+ if (error) {
+ printk(KERN_ERR "nfsd: fence failed for client %s: %d!\n",
+ addr_str, error);
+ }
+}
+
+static int
+nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
+{
+ struct nfs4_layout_stateid *ls =
+ container_of(cb, struct nfs4_layout_stateid, ls_recall);
+ LIST_HEAD(reaplist);
+
+ switch (task->tk_status) {
+ case 0:
+ return 1;
+ case -NFS4ERR_NOMATCHING_LAYOUT:
+ trace_layout_recall_done(&ls->ls_stid.sc_stateid);
+ task->tk_status = 0;
+ return 1;
+ case -NFS4ERR_DELAY:
+ /* Poll the client until it's done with the layout */
+ /* FIXME: cap number of retries.
+ * The pnfs standard states that we need to only expire
+ * the client after at-least "lease time" .eg lease-time * 2
+ * when failing to communicate a recall
+ */
+ rpc_delay(task, HZ/100); /* 10 mili-seconds */
+ return 0;
+ default:
+ /*
+ * Unknown error or non-responding client, we'll need to fence.
+ */
+ nfsd4_cb_layout_fail(ls);
+ return -1;
+ }
+}
+
+static void
+nfsd4_cb_layout_release(struct nfsd4_callback *cb)
+{
+ struct nfs4_layout_stateid *ls =
+ container_of(cb, struct nfs4_layout_stateid, ls_recall);
+ LIST_HEAD(reaplist);
+
+ trace_layout_recall_release(&ls->ls_stid.sc_stateid);
+
+ nfsd4_return_all_layouts(ls, &reaplist);
+ nfsd4_free_layouts(&reaplist);
+ nfs4_put_stid(&ls->ls_stid);
+}
+
+static struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
+ .done = nfsd4_cb_layout_done,
+ .release = nfsd4_cb_layout_release,
+};
+
+static bool
+nfsd4_layout_lm_break(struct file_lock *fl)
+{
+ /*
+ * We don't want the locks code to timeout the lease for us;
+ * we'll remove it ourself if a layout isn't returned
+ * in time:
+ */
+ fl->fl_break_time = 0;
+ nfsd4_recall_file_layout(fl->fl_owner);
+ return false;
+}
+
+static int
+nfsd4_layout_lm_change(struct file_lock *onlist, int arg,
+ struct list_head *dispose)
+{
+ BUG_ON(!(arg & F_UNLCK));
+ return lease_modify(onlist, arg, dispose);
+}
+
+static const struct lock_manager_operations nfsd4_layouts_lm_ops = {
+ .lm_break = nfsd4_layout_lm_break,
+ .lm_change = nfsd4_layout_lm_change,
+};
+
+int
+nfsd4_init_pnfs(void)
+{
+ int i;
+
+ for (i = 0; i < DEVID_HASH_SIZE; i++)
+ INIT_LIST_HEAD(&nfsd_devid_hash[i]);
+
+ nfs4_layout_cache = kmem_cache_create("nfs4_layout",
+ sizeof(struct nfs4_layout), 0, 0, NULL);
+ if (!nfs4_layout_cache)
+ return -ENOMEM;
+
+ nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid",
+ sizeof(struct nfs4_layout_stateid), 0, 0, NULL);
+ if (!nfs4_layout_stateid_cache) {
+ kmem_cache_destroy(nfs4_layout_cache);
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+void
+nfsd4_exit_pnfs(void)
+{
+ int i;
+
+ kmem_cache_destroy(nfs4_layout_cache);
+ kmem_cache_destroy(nfs4_layout_stateid_cache);
+
+ for (i = 0; i < DEVID_HASH_SIZE; i++) {
+ struct nfsd4_deviceid_map *map, *n;
+
+ list_for_each_entry_safe(map, n, &nfsd_devid_hash[i], hash)
+ kfree(map);
+ }
+}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index ac71d13c69ef..d30bea8d0277 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -43,6 +43,8 @@
#include "current_stateid.h"
#include "netns.h"
#include "acl.h"
+#include "pnfs.h"
+#include "trace.h"
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
#include <linux/security.h>
@@ -1178,6 +1180,259 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return status == nfserr_same ? nfs_ok : status;
}
+#ifdef CONFIG_NFSD_PNFS
+static const struct nfsd4_layout_ops *
+nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type)
+{
+ if (!exp->ex_layout_type) {
+ dprintk("%s: export does not support pNFS\n", __func__);
+ return NULL;
+ }
+
+ if (exp->ex_layout_type != layout_type) {
+ dprintk("%s: layout type %d not supported\n",
+ __func__, layout_type);
+ return NULL;
+ }
+
+ return nfsd4_layout_ops[layout_type];
+}
+
+static __be32
+nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_getdeviceinfo *gdp)
+{
+ const struct nfsd4_layout_ops *ops;
+ struct nfsd4_deviceid_map *map;
+ struct svc_export *exp;
+ __be32 nfserr;
+
+ dprintk("%s: layout_type %u dev_id [0x%llx:0x%x] maxcnt %u\n",
+ __func__,
+ gdp->gd_layout_type,
+ gdp->gd_devid.fsid_idx, gdp->gd_devid.generation,
+ gdp->gd_maxcount);
+
+ map = nfsd4_find_devid_map(gdp->gd_devid.fsid_idx);
+ if (!map) {
+ dprintk("%s: couldn't find device ID to export mapping!\n",
+ __func__);
+ return nfserr_noent;
+ }
+
+ exp = rqst_exp_find(rqstp, map->fsid_type, map->fsid);
+ if (IS_ERR(exp)) {
+ dprintk("%s: could not find device id\n", __func__);
+ return nfserr_noent;
+ }
+
+ nfserr = nfserr_layoutunavailable;
+ ops = nfsd4_layout_verify(exp, gdp->gd_layout_type);
+ if (!ops)
+ goto out;
+
+ nfserr = nfs_ok;
+ if (gdp->gd_maxcount != 0)
+ nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp);
+
+ gdp->gd_notify_types &= ops->notify_types;
+ exp_put(exp);
+out:
+ return nfserr;
+}
+
+static __be32
+nfsd4_layoutget(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_layoutget *lgp)
+{
+ struct svc_fh *current_fh = &cstate->current_fh;
+ const struct nfsd4_layout_ops *ops;
+ struct nfs4_layout_stateid *ls;
+ __be32 nfserr;
+ int accmode;
+
+ switch (lgp->lg_seg.iomode) {
+ case IOMODE_READ:
+ accmode = NFSD_MAY_READ;
+ break;
+ case IOMODE_RW:
+ accmode = NFSD_MAY_READ | NFSD_MAY_WRITE;
+ break;
+ default:
+ dprintk("%s: invalid iomode %d\n",
+ __func__, lgp->lg_seg.iomode);
+ nfserr = nfserr_badiomode;
+ goto out;
+ }
+
+ nfserr = fh_verify(rqstp, current_fh, 0, accmode);
+ if (nfserr)
+ goto out;
+
+ nfserr = nfserr_layoutunavailable;
+ ops = nfsd4_layout_verify(current_fh->fh_export, lgp->lg_layout_type);
+ if (!ops)
+ goto out;
+
+ /*
+ * Verify minlength and range as per RFC5661:
+ * o If loga_length is less than loga_minlength,
+ * the metadata server MUST return NFS4ERR_INVAL.
+ * o If the sum of loga_offset and loga_minlength exceeds
+ * NFS4_UINT64_MAX, and loga_minlength is not
+ * NFS4_UINT64_MAX, the error NFS4ERR_INVAL MUST result.
+ * o If the sum of loga_offset and loga_length exceeds
+ * NFS4_UINT64_MAX, and loga_length is not NFS4_UINT64_MAX,
+ * the error NFS4ERR_INVAL MUST result.
+ */
+ nfserr = nfserr_inval;
+ if (lgp->lg_seg.length < lgp->lg_minlength ||
+ (lgp->lg_minlength != NFS4_MAX_UINT64 &&
+ lgp->lg_minlength > NFS4_MAX_UINT64 - lgp->lg_seg.offset) ||
+ (lgp->lg_seg.length != NFS4_MAX_UINT64 &&
+ lgp->lg_seg.length > NFS4_MAX_UINT64 - lgp->lg_seg.offset))
+ goto out;
+ if (lgp->lg_seg.length == 0)
+ goto out;
+
+ nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lgp->lg_sid,
+ true, lgp->lg_layout_type, &ls);
+ if (nfserr) {
+ trace_layout_get_lookup_fail(&lgp->lg_sid);
+ goto out;
+ }
+
+ nfserr = nfserr_recallconflict;
+ if (atomic_read(&ls->ls_stid.sc_file->fi_lo_recalls))
+ goto out_put_stid;
+
+ nfserr = ops->proc_layoutget(current_fh->fh_dentry->d_inode,
+ current_fh, lgp);
+ if (nfserr)
+ goto out_put_stid;
+
+ nfserr = nfsd4_insert_layout(lgp, ls);
+
+out_put_stid:
+ nfs4_put_stid(&ls->ls_stid);
+out:
+ return nfserr;
+}
+
+static __be32
+nfsd4_layoutcommit(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_layoutcommit *lcp)
+{
+ const struct nfsd4_layout_seg *seg = &lcp->lc_seg;
+ struct svc_fh *current_fh = &cstate->current_fh;
+ const struct nfsd4_layout_ops *ops;
+ loff_t new_size = lcp->lc_last_wr + 1;
+ struct inode *inode;
+ struct nfs4_layout_stateid *ls;
+ __be32 nfserr;
+
+ nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_WRITE);
+ if (nfserr)
+ goto out;
+
+ nfserr = nfserr_layoutunavailable;
+ ops = nfsd4_layout_verify(current_fh->fh_export, lcp->lc_layout_type);
+ if (!ops)
+ goto out;
+ inode = current_fh->fh_dentry->d_inode;
+
+ nfserr = nfserr_inval;
+ if (new_size <= seg->offset) {
+ dprintk("pnfsd: last write before layout segment\n");
+ goto out;
+ }
+ if (new_size > seg->offset + seg->length) {
+ dprintk("pnfsd: last write beyond layout segment\n");
+ goto out;
+ }
+ if (!lcp->lc_newoffset && new_size > i_size_read(inode)) {
+ dprintk("pnfsd: layoutcommit beyond EOF\n");
+ goto out;
+ }
+
+ nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lcp->lc_sid,
+ false, lcp->lc_layout_type,
+ &ls);
+ if (nfserr) {
+ trace_layout_commit_lookup_fail(&lcp->lc_sid);
+ /* fixup error code as per RFC5661 */
+ if (nfserr == nfserr_bad_stateid)
+ nfserr = nfserr_badlayout;
+ goto out;
+ }
+
+ nfserr = ops->proc_layoutcommit(inode, lcp);
+ if (nfserr)
+ goto out_put_stid;
+
+ if (new_size > i_size_read(inode)) {
+ lcp->lc_size_chg = 1;
+ lcp->lc_newsize = new_size;
+ } else {
+ lcp->lc_size_chg = 0;
+ }
+
+out_put_stid:
+ nfs4_put_stid(&ls->ls_stid);
+out:
+ return nfserr;
+}
+
+static __be32
+nfsd4_layoutreturn(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_layoutreturn *lrp)
+{
+ struct svc_fh *current_fh = &cstate->current_fh;
+ __be32 nfserr;
+
+ nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
+ if (nfserr)
+ goto out;
+
+ nfserr = nfserr_layoutunavailable;
+ if (!nfsd4_layout_verify(current_fh->fh_export, lrp->lr_layout_type))
+ goto out;
+
+ switch (lrp->lr_seg.iomode) {
+ case IOMODE_READ:
+ case IOMODE_RW:
+ case IOMODE_ANY:
+ break;
+ default:
+ dprintk("%s: invalid iomode %d\n", __func__,
+ lrp->lr_seg.iomode);
+ nfserr = nfserr_inval;
+ goto out;
+ }
+
+ switch (lrp->lr_return_type) {
+ case RETURN_FILE:
+ nfserr = nfsd4_return_file_layouts(rqstp, cstate, lrp);
+ break;
+ case RETURN_FSID:
+ case RETURN_ALL:
+ nfserr = nfsd4_return_client_layouts(rqstp, cstate, lrp);
+ break;
+ default:
+ dprintk("%s: invalid return_type %d\n", __func__,
+ lrp->lr_return_type);
+ nfserr = nfserr_inval;
+ break;
+ }
+out:
+ return nfserr;
+}
+#endif /* CONFIG_NFSD_PNFS */
+
/*
* NULL call.
*/
@@ -1679,6 +1934,36 @@ static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd
op_encode_channel_attrs_maxsz) * sizeof(__be32);
}
+#ifdef CONFIG_NFSD_PNFS
+/*
+ * At this stage we don't really know what layout driver will handle the request,
+ * so we need to define an arbitrary upper bound here.
+ */
+#define MAX_LAYOUT_SIZE 128
+static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size +
+ 1 /* logr_return_on_close */ +
+ op_encode_stateid_maxsz +
+ 1 /* nr of layouts */ +
+ MAX_LAYOUT_SIZE) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_layoutcommit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size +
+ 1 /* locr_newsize */ +
+ 2 /* ns_size */) * sizeof(__be32);
+}
+
+static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+ return (op_encode_hdr_size +
+ 1 /* lrs_stateid */ +
+ op_encode_stateid_maxsz) * sizeof(__be32);
+}
+#endif /* CONFIG_NFSD_PNFS */
+
static struct nfsd4_operation nfsd4_ops[] = {
[OP_ACCESS] = {
.op_func = (nfsd4op_func)nfsd4_access,
@@ -1966,6 +2251,31 @@ static struct nfsd4_operation nfsd4_ops[] = {
.op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
},
+#ifdef CONFIG_NFSD_PNFS
+ [OP_GETDEVICEINFO] = {
+ .op_func = (nfsd4op_func)nfsd4_getdeviceinfo,
+ .op_flags = ALLOWED_WITHOUT_FH,
+ .op_name = "OP_GETDEVICEINFO",
+ },
+ [OP_LAYOUTGET] = {
+ .op_func = (nfsd4op_func)nfsd4_layoutget,
+ .op_flags = OP_MODIFIES_SOMETHING,
+ .op_name = "OP_LAYOUTGET",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutget_rsize,
+ },
+ [OP_LAYOUTCOMMIT] = {
+ .op_func = (nfsd4op_func)nfsd4_layoutcommit,
+ .op_flags = OP_MODIFIES_SOMETHING,
+ .op_name = "OP_LAYOUTCOMMIT",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutcommit_rsize,
+ },
+ [OP_LAYOUTRETURN] = {
+ .op_func = (nfsd4op_func)nfsd4_layoutreturn,
+ .op_flags = OP_MODIFIES_SOMETHING,
+ .op_name = "OP_LAYOUTRETURN",
+ .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutreturn_rsize,
+ },
+#endif /* CONFIG_NFSD_PNFS */
/* NFSv4.2 operations */
[OP_ALLOCATE] = {
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3550a9c87616..f6b2a09f793f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -48,6 +48,7 @@
#include "current_stateid.h"
#include "netns.h"
+#include "pnfs.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
@@ -150,16 +151,6 @@ renew_client_locked(struct nfs4_client *clp)
clp->cl_time = get_seconds();
}
-static inline void
-renew_client(struct nfs4_client *clp)
-{
- struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
-
- spin_lock(&nn->client_lock);
- renew_client_locked(clp);
- spin_unlock(&nn->client_lock);
-}
-
static void put_client_renew_locked(struct nfs4_client *clp)
{
struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
@@ -282,7 +273,7 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu)
kmem_cache_free(file_slab, fp);
}
-static inline void
+void
put_nfs4_file(struct nfs4_file *fi)
{
might_lock(&state_lock);
@@ -295,12 +286,6 @@ put_nfs4_file(struct nfs4_file *fi)
}
}
-static inline void
-get_nfs4_file(struct nfs4_file *fi)
-{
- atomic_inc(&fi->fi_ref);
-}
-
static struct file *
__nfs4_get_fd(struct nfs4_file *f, int oflag)
{
@@ -358,7 +343,7 @@ find_readable_file(struct nfs4_file *f)
return ret;
}
-static struct file *
+struct file *
find_any_file(struct nfs4_file *f)
{
struct file *ret;
@@ -408,14 +393,6 @@ static unsigned int file_hashval(struct knfsd_fh *fh)
return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1);
}
-static bool nfsd_fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
-{
- return fh1->fh_size == fh2->fh_size &&
- !memcmp(fh1->fh_base.fh_pad,
- fh2->fh_base.fh_pad,
- fh1->fh_size);
-}
-
static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
static void
@@ -494,7 +471,7 @@ static void nfs4_file_put_access(struct nfs4_file *fp, u32 access)
__nfs4_file_put_access(fp, O_RDONLY);
}
-static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
+struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
struct kmem_cache *slab)
{
struct nfs4_stid *stid;
@@ -688,17 +665,17 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp)
struct file *filp = NULL;
spin_lock(&fp->fi_lock);
- if (fp->fi_deleg_file && atomic_dec_and_test(&fp->fi_delegees))
+ if (fp->fi_deleg_file && --fp->fi_delegees == 0)
swap(filp, fp->fi_deleg_file);
spin_unlock(&fp->fi_lock);
if (filp) {
- vfs_setlease(filp, F_UNLCK, NULL, NULL);
+ vfs_setlease(filp, F_UNLCK, NULL, (void **)&fp);
fput(filp);
}
}
-static void unhash_stid(struct nfs4_stid *s)
+void nfs4_unhash_stid(struct nfs4_stid *s)
{
s->sc_type = 0;
}
@@ -1006,7 +983,7 @@ static void unhash_lock_stateid(struct nfs4_ol_stateid *stp)
list_del_init(&stp->st_locks);
unhash_ol_stateid(stp);
- unhash_stid(&stp->st_stid);
+ nfs4_unhash_stid(&stp->st_stid);
}
static void release_lock_stateid(struct nfs4_ol_stateid *stp)
@@ -1518,7 +1495,12 @@ unhash_session(struct nfsd4_session *ses)
static int
STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn)
{
- if (clid->cl_boot == nn->boot_time)
+ /*
+ * We're assuming the clid was not given out from a boot
+ * precisely 2^32 (about 136 years) before this one. That seems
+ * a safe assumption:
+ */
+ if (clid->cl_boot == (u32)nn->boot_time)
return 0;
dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n",
clid->cl_boot, clid->cl_id, nn->boot_time);
@@ -1558,6 +1540,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
INIT_LIST_HEAD(&clp->cl_lru);
INIT_LIST_HEAD(&clp->cl_callbacks);
INIT_LIST_HEAD(&clp->cl_revoked);
+#ifdef CONFIG_NFSD_PNFS
+ INIT_LIST_HEAD(&clp->cl_lo_states);
+#endif
spin_lock_init(&clp->cl_lock);
rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
return clp;
@@ -1662,6 +1647,7 @@ __destroy_client(struct nfs4_client *clp)
nfs4_get_stateowner(&oo->oo_owner);
release_openowner(oo);
}
+ nfsd4_return_all_client_layouts(clp);
nfsd4_shutdown_callback(clp);
if (clp->cl_cb_conn.cb_xprt)
svc_xprt_put(clp->cl_cb_conn.cb_xprt);
@@ -2145,8 +2131,11 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
static void
nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
{
- /* pNFS is not supported */
+#ifdef CONFIG_NFSD_PNFS
+ new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS;
+#else
new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS;
+#endif
/* Referrals are supported, Migration is not. */
new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
@@ -3074,6 +3063,10 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval,
fp->fi_share_deny = 0;
memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
memset(fp->fi_access, 0, sizeof(fp->fi_access));
+#ifdef CONFIG_NFSD_PNFS
+ INIT_LIST_HEAD(&fp->fi_lo_states);
+ atomic_set(&fp->fi_lo_recalls, 0);
+#endif
hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]);
}
@@ -3300,7 +3293,7 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval)
struct nfs4_file *fp;
hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) {
- if (nfsd_fh_match(&fp->fi_fhandle, fh)) {
+ if (fh_match(&fp->fi_fhandle, fh)) {
if (atomic_inc_not_zero(&fp->fi_ref))
return fp;
}
@@ -3308,7 +3301,7 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval)
return NULL;
}
-static struct nfs4_file *
+struct nfs4_file *
find_file(struct knfsd_fh *fh)
{
struct nfs4_file *fp;
@@ -3477,7 +3470,8 @@ nfsd_break_deleg_cb(struct file_lock *fl)
}
static int
-nfsd_change_deleg_cb(struct file_lock **onlist, int arg, struct list_head *dispose)
+nfsd_change_deleg_cb(struct file_lock *onlist, int arg,
+ struct list_head *dispose)
{
if (arg & F_UNLCK)
return lease_modify(onlist, arg, dispose);
@@ -3855,12 +3849,12 @@ static int nfs4_setlease(struct nfs4_delegation *dp)
/* Race breaker */
if (fp->fi_deleg_file) {
status = 0;
- atomic_inc(&fp->fi_delegees);
+ ++fp->fi_delegees;
hash_delegation_locked(dp, fp);
goto out_unlock;
}
fp->fi_deleg_file = filp;
- atomic_set(&fp->fi_delegees, 1);
+ fp->fi_delegees = 1;
hash_delegation_locked(dp, fp);
spin_unlock(&fp->fi_lock);
spin_unlock(&state_lock);
@@ -3897,11 +3891,11 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
status = nfs4_setlease(dp);
goto out;
}
- atomic_inc(&fp->fi_delegees);
if (fp->fi_had_conflict) {
status = -EAGAIN;
goto out_unlock;
}
+ ++fp->fi_delegees;
hash_delegation_locked(dp, fp);
status = 0;
out_unlock:
@@ -4294,7 +4288,7 @@ laundromat_main(struct work_struct *laundry)
static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp)
{
- if (!nfsd_fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle))
+ if (!fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle))
return nfserr_bad_stateid;
return nfs_ok;
}
@@ -4445,7 +4439,7 @@ out_unlock:
return status;
}
-static __be32
+__be32
nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
stateid_t *stateid, unsigned char typemask,
struct nfs4_stid **s, struct nfsd_net *nn)
@@ -4859,6 +4853,9 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
update_stateid(&stp->st_stid.sc_stateid);
memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+ nfsd4_return_all_file_layouts(stp->st_stateowner->so_client,
+ stp->st_stid.sc_file);
+
nfsd4_close_open_stateid(stp);
/* put reference from nfs4_preprocess_seqid_op */
@@ -5556,10 +5553,11 @@ out_nfserr:
static bool
check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
{
- struct file_lock **flpp;
+ struct file_lock *fl;
int status = false;
struct file *filp = find_any_file(fp);
struct inode *inode;
+ struct file_lock_context *flctx;
if (!filp) {
/* Any valid lock stateid should have some sort of access */
@@ -5568,15 +5566,18 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
}
inode = file_inode(filp);
+ flctx = inode->i_flctx;
- spin_lock(&inode->i_lock);
- for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) {
- if ((*flpp)->fl_owner == (fl_owner_t)lowner) {
- status = true;
- break;
+ if (flctx && !list_empty_careful(&flctx->flc_posix)) {
+ spin_lock(&flctx->flc_lock);
+ list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
+ if (fl->fl_owner == (fl_owner_t)lowner) {
+ status = true;
+ break;
+ }
}
+ spin_unlock(&flctx->flc_lock);
}
- spin_unlock(&inode->i_lock);
fput(filp);
return status;
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 15f7b73e0c0f..df5e66caf100 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -47,6 +47,7 @@
#include "state.h"
#include "cache.h"
#include "netns.h"
+#include "pnfs.h"
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
#include <linux/security.h>
@@ -234,6 +235,26 @@ static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
return ret;
}
+/*
+ * We require the high 32 bits of 'seconds' to be 0, and
+ * we ignore all 32 bits of 'nseconds'.
+ */
+static __be32
+nfsd4_decode_time(struct nfsd4_compoundargs *argp, struct timespec *tv)
+{
+ DECODE_HEAD;
+ u64 sec;
+
+ READ_BUF(12);
+ p = xdr_decode_hyper(p, &sec);
+ tv->tv_sec = sec;
+ tv->tv_nsec = be32_to_cpup(p++);
+ if (tv->tv_nsec >= (u32)1000000000)
+ return nfserr_inval;
+
+ DECODE_TAIL;
+}
+
static __be32
nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
{
@@ -267,7 +288,6 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
{
int expected_len, len = 0;
u32 dummy32;
- u64 sec;
char *buf;
DECODE_HEAD;
@@ -358,15 +378,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
dummy32 = be32_to_cpup(p++);
switch (dummy32) {
case NFS4_SET_TO_CLIENT_TIME:
- /* We require the high 32 bits of 'seconds' to be 0, and we ignore
- all 32 bits of 'nseconds'. */
- READ_BUF(12);
len += 12;
- p = xdr_decode_hyper(p, &sec);
- iattr->ia_atime.tv_sec = (time_t)sec;
- iattr->ia_atime.tv_nsec = be32_to_cpup(p++);
- if (iattr->ia_atime.tv_nsec >= (u32)1000000000)
- return nfserr_inval;
+ status = nfsd4_decode_time(argp, &iattr->ia_atime);
+ if (status)
+ return status;
iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET);
break;
case NFS4_SET_TO_SERVER_TIME:
@@ -382,15 +397,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
dummy32 = be32_to_cpup(p++);
switch (dummy32) {
case NFS4_SET_TO_CLIENT_TIME:
- /* We require the high 32 bits of 'seconds' to be 0, and we ignore
- all 32 bits of 'nseconds'. */
- READ_BUF(12);
len += 12;
- p = xdr_decode_hyper(p, &sec);
- iattr->ia_mtime.tv_sec = sec;
- iattr->ia_mtime.tv_nsec = be32_to_cpup(p++);
- if (iattr->ia_mtime.tv_nsec >= (u32)1000000000)
- return nfserr_inval;
+ status = nfsd4_decode_time(argp, &iattr->ia_mtime);
+ if (status)
+ return status;
iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET);
break;
case NFS4_SET_TO_SERVER_TIME:
@@ -1513,6 +1523,127 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str
DECODE_TAIL;
}
+#ifdef CONFIG_NFSD_PNFS
+static __be32
+nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp,
+ struct nfsd4_getdeviceinfo *gdev)
+{
+ DECODE_HEAD;
+ u32 num, i;
+
+ READ_BUF(sizeof(struct nfsd4_deviceid) + 3 * 4);
+ COPYMEM(&gdev->gd_devid, sizeof(struct nfsd4_deviceid));
+ gdev->gd_layout_type = be32_to_cpup(p++);
+ gdev->gd_maxcount = be32_to_cpup(p++);
+ num = be32_to_cpup(p++);
+ if (num) {
+ READ_BUF(4 * num);
+ gdev->gd_notify_types = be32_to_cpup(p++);
+ for (i = 1; i < num; i++) {
+ if (be32_to_cpup(p++)) {
+ status = nfserr_inval;
+ goto out;
+ }
+ }
+ }
+ DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
+ struct nfsd4_layoutget *lgp)
+{
+ DECODE_HEAD;
+
+ READ_BUF(36);
+ lgp->lg_signal = be32_to_cpup(p++);
+ lgp->lg_layout_type = be32_to_cpup(p++);
+ lgp->lg_seg.iomode = be32_to_cpup(p++);
+ p = xdr_decode_hyper(p, &lgp->lg_seg.offset);
+ p = xdr_decode_hyper(p, &lgp->lg_seg.length);
+ p = xdr_decode_hyper(p, &lgp->lg_minlength);
+ nfsd4_decode_stateid(argp, &lgp->lg_sid);
+ READ_BUF(4);
+ lgp->lg_maxcount = be32_to_cpup(p++);
+
+ DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp,
+ struct nfsd4_layoutcommit *lcp)
+{
+ DECODE_HEAD;
+ u32 timechange;
+
+ READ_BUF(20);
+ p = xdr_decode_hyper(p, &lcp->lc_seg.offset);
+ p = xdr_decode_hyper(p, &lcp->lc_seg.length);
+ lcp->lc_reclaim = be32_to_cpup(p++);
+ nfsd4_decode_stateid(argp, &lcp->lc_sid);
+ READ_BUF(4);
+ lcp->lc_newoffset = be32_to_cpup(p++);
+ if (lcp->lc_newoffset) {
+ READ_BUF(8);
+ p = xdr_decode_hyper(p, &lcp->lc_last_wr);
+ } else
+ lcp->lc_last_wr = 0;
+ READ_BUF(4);
+ timechange = be32_to_cpup(p++);
+ if (timechange) {
+ status = nfsd4_decode_time(argp, &lcp->lc_mtime);
+ if (status)
+ return status;
+ } else {
+ lcp->lc_mtime.tv_nsec = UTIME_NOW;
+ }
+ READ_BUF(8);
+ lcp->lc_layout_type = be32_to_cpup(p++);
+
+ /*
+ * Save the layout update in XDR format and let the layout driver deal
+ * with it later.
+ */
+ lcp->lc_up_len = be32_to_cpup(p++);
+ if (lcp->lc_up_len > 0) {
+ READ_BUF(lcp->lc_up_len);
+ READMEM(lcp->lc_up_layout, lcp->lc_up_len);
+ }
+
+ DECODE_TAIL;
+}
+
+static __be32
+nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp,
+ struct nfsd4_layoutreturn *lrp)
+{
+ DECODE_HEAD;
+
+ READ_BUF(16);
+ lrp->lr_reclaim = be32_to_cpup(p++);
+ lrp->lr_layout_type = be32_to_cpup(p++);
+ lrp->lr_seg.iomode = be32_to_cpup(p++);
+ lrp->lr_return_type = be32_to_cpup(p++);
+ if (lrp->lr_return_type == RETURN_FILE) {
+ READ_BUF(16);
+ p = xdr_decode_hyper(p, &lrp->lr_seg.offset);
+ p = xdr_decode_hyper(p, &lrp->lr_seg.length);
+ nfsd4_decode_stateid(argp, &lrp->lr_sid);
+ READ_BUF(4);
+ lrp->lrf_body_len = be32_to_cpup(p++);
+ if (lrp->lrf_body_len > 0) {
+ READ_BUF(lrp->lrf_body_len);
+ READMEM(lrp->lrf_body, lrp->lrf_body_len);
+ }
+ } else {
+ lrp->lr_seg.offset = 0;
+ lrp->lr_seg.length = NFS4_MAX_UINT64;
+ }
+
+ DECODE_TAIL;
+}
+#endif /* CONFIG_NFSD_PNFS */
+
static __be32
nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp,
struct nfsd4_fallocate *fallocate)
@@ -1607,11 +1738,19 @@ static nfsd4_dec nfsd4_dec_ops[] = {
[OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session,
[OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid,
[OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
+#ifdef CONFIG_NFSD_PNFS
+ [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdeviceinfo,
+ [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
+ [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit,
+ [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget,
+ [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn,
+#else
[OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp,
[OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp,
+#endif
[OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name,
[OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence,
[OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
@@ -2539,6 +2678,30 @@ out_acl:
get_parent_attributes(exp, &stat);
p = xdr_encode_hyper(p, stat.ino);
}
+#ifdef CONFIG_NFSD_PNFS
+ if ((bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) ||
+ (bmval2 & FATTR4_WORD2_LAYOUT_TYPES)) {
+ if (exp->ex_layout_type) {
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(1);
+ *p++ = cpu_to_be32(exp->ex_layout_type);
+ } else {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(0);
+ }
+ }
+
+ if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out_resource;
+ *p++ = cpu_to_be32(stat.blksize);
+ }
+#endif /* CONFIG_NFSD_PNFS */
if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
status = nfsd4_encode_security_label(xdr, rqstp, context,
contextlen);
@@ -2768,16 +2931,17 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
if (entry_bytes > cd->rd_maxcount)
goto fail;
cd->rd_maxcount -= entry_bytes;
- if (!cd->rd_dircount)
- goto fail;
/*
* RFC 3530 14.2.24 describes rd_dircount as only a "hint", so
* let's always let through the first entry, at least:
*/
- name_and_cookie = 4 * XDR_QUADLEN(namlen) + 8;
+ if (!cd->rd_dircount)
+ goto fail;
+ name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8;
if (name_and_cookie > cd->rd_dircount && cd->cookie_offset)
goto fail;
cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie);
+
cd->cookie_offset = cookie_offset;
skip_entry:
cd->common.err = nfs_ok;
@@ -3814,6 +3978,156 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
return nfserr;
}
+#ifdef CONFIG_NFSD_PNFS
+static __be32
+nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_getdeviceinfo *gdev)
+{
+ struct xdr_stream *xdr = &resp->xdr;
+ const struct nfsd4_layout_ops *ops =
+ nfsd4_layout_ops[gdev->gd_layout_type];
+ u32 starting_len = xdr->buf->len, needed_len;
+ __be32 *p;
+
+ dprintk("%s: err %d\n", __func__, nfserr);
+ if (nfserr)
+ goto out;
+
+ nfserr = nfserr_resource;
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out;
+
+ *p++ = cpu_to_be32(gdev->gd_layout_type);
+
+ /* If maxcount is 0 then just update notifications */
+ if (gdev->gd_maxcount != 0) {
+ nfserr = ops->encode_getdeviceinfo(xdr, gdev);
+ if (nfserr) {
+ /*
+ * We don't bother to burden the layout drivers with
+ * enforcing gd_maxcount, just tell the client to
+ * come back with a bigger buffer if it's not enough.
+ */
+ if (xdr->buf->len + 4 > gdev->gd_maxcount)
+ goto toosmall;
+ goto out;
+ }
+ }
+
+ nfserr = nfserr_resource;
+ if (gdev->gd_notify_types) {
+ p = xdr_reserve_space(xdr, 4 + 4);
+ if (!p)
+ goto out;
+ *p++ = cpu_to_be32(1); /* bitmap length */
+ *p++ = cpu_to_be32(gdev->gd_notify_types);
+ } else {
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ goto out;
+ *p++ = 0;
+ }
+
+ nfserr = 0;
+out:
+ kfree(gdev->gd_device);
+ dprintk("%s: done: %d\n", __func__, be32_to_cpu(nfserr));
+ return nfserr;
+
+toosmall:
+ dprintk("%s: maxcount too small\n", __func__);
+ needed_len = xdr->buf->len + 4 /* notifications */;
+ xdr_truncate_encode(xdr, starting_len);
+ p = xdr_reserve_space(xdr, 4);
+ if (!p) {
+ nfserr = nfserr_resource;
+ } else {
+ *p++ = cpu_to_be32(needed_len);
+ nfserr = nfserr_toosmall;
+ }
+ goto out;
+}
+
+static __be32
+nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_layoutget *lgp)
+{
+ struct xdr_stream *xdr = &resp->xdr;
+ const struct nfsd4_layout_ops *ops =
+ nfsd4_layout_ops[lgp->lg_layout_type];
+ __be32 *p;
+
+ dprintk("%s: err %d\n", __func__, nfserr);
+ if (nfserr)
+ goto out;
+
+ nfserr = nfserr_resource;
+ p = xdr_reserve_space(xdr, 36 + sizeof(stateid_opaque_t));
+ if (!p)
+ goto out;
+
+ *p++ = cpu_to_be32(1); /* we always set return-on-close */
+ *p++ = cpu_to_be32(lgp->lg_sid.si_generation);
+ p = xdr_encode_opaque_fixed(p, &lgp->lg_sid.si_opaque,
+ sizeof(stateid_opaque_t));
+
+ *p++ = cpu_to_be32(1); /* we always return a single layout */
+ p = xdr_encode_hyper(p, lgp->lg_seg.offset);
+ p = xdr_encode_hyper(p, lgp->lg_seg.length);
+ *p++ = cpu_to_be32(lgp->lg_seg.iomode);
+ *p++ = cpu_to_be32(lgp->lg_layout_type);
+
+ nfserr = ops->encode_layoutget(xdr, lgp);
+out:
+ kfree(lgp->lg_content);
+ return nfserr;
+}
+
+static __be32
+nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_layoutcommit *lcp)
+{
+ struct xdr_stream *xdr = &resp->xdr;
+ __be32 *p;
+
+ if (nfserr)
+ return nfserr;
+
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(lcp->lc_size_chg);
+ if (lcp->lc_size_chg) {
+ p = xdr_reserve_space(xdr, 8);
+ if (!p)
+ return nfserr_resource;
+ p = xdr_encode_hyper(p, lcp->lc_newsize);
+ }
+
+ return nfs_ok;
+}
+
+static __be32
+nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
+ struct nfsd4_layoutreturn *lrp)
+{
+ struct xdr_stream *xdr = &resp->xdr;
+ __be32 *p;
+
+ if (nfserr)
+ return nfserr;
+
+ p = xdr_reserve_space(xdr, 4);
+ if (!p)
+ return nfserr_resource;
+ *p++ = cpu_to_be32(lrp->lrs_present);
+ if (lrp->lrs_present)
+ nfsd4_encode_stateid(xdr, &lrp->lr_sid);
+ return nfs_ok;
+}
+#endif /* CONFIG_NFSD_PNFS */
+
static __be32
nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
struct nfsd4_seek *seek)
@@ -3890,11 +4204,19 @@ static nfsd4_enc nfsd4_enc_ops[] = {
[OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop,
[OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
[OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
+#ifdef CONFIG_NFSD_PNFS
+ [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdeviceinfo,
+ [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
+ [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit,
+ [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget,
+ [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn,
+#else
[OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop,
[OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
[OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
[OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
[OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
+#endif
[OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name,
[OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
[OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 19ace74d35f6..aa47d75ddb26 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -21,6 +21,7 @@
#include "cache.h"
#include "state.h"
#include "netns.h"
+#include "pnfs.h"
/*
* We have a single directory with several nodes in it.
@@ -1258,9 +1259,12 @@ static int __init init_nfsd(void)
retval = nfsd4_init_slabs();
if (retval)
goto out_unregister_pernet;
- retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */
+ retval = nfsd4_init_pnfs();
if (retval)
goto out_free_slabs;
+ retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */
+ if (retval)
+ goto out_exit_pnfs;
nfsd_stat_init(); /* Statistics */
retval = nfsd_reply_cache_init();
if (retval)
@@ -1282,6 +1286,8 @@ out_free_lockd:
out_free_stat:
nfsd_stat_shutdown();
nfsd_fault_inject_cleanup();
+out_exit_pnfs:
+ nfsd4_exit_pnfs();
out_free_slabs:
nfsd4_free_slabs();
out_unregister_pernet:
@@ -1299,6 +1305,7 @@ static void __exit exit_nfsd(void)
nfsd_stat_shutdown();
nfsd_lockd_shutdown();
nfsd4_free_slabs();
+ nfsd4_exit_pnfs();
nfsd_fault_inject_cleanup();
unregister_filesystem(&nfsd_fs_type);
unregister_pernet_subsys(&nfsd_net_ops);
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 33a46a8dfaf7..565c4da1a9eb 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -325,15 +325,27 @@ void nfsd_lockd_shutdown(void);
#define NFSD4_SUPPORTED_ATTRS_WORD2 0
+/* 4.1 */
+#ifdef CONFIG_NFSD_PNFS
+#define PNFSD_SUPPORTED_ATTRS_WORD1 FATTR4_WORD1_FS_LAYOUT_TYPES
+#define PNFSD_SUPPORTED_ATTRS_WORD2 \
+(FATTR4_WORD2_LAYOUT_BLKSIZE | FATTR4_WORD2_LAYOUT_TYPES)
+#else
+#define PNFSD_SUPPORTED_ATTRS_WORD1 0
+#define PNFSD_SUPPORTED_ATTRS_WORD2 0
+#endif /* CONFIG_NFSD_PNFS */
+
#define NFSD4_1_SUPPORTED_ATTRS_WORD0 \
NFSD4_SUPPORTED_ATTRS_WORD0
#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
- NFSD4_SUPPORTED_ATTRS_WORD1
+ (NFSD4_SUPPORTED_ATTRS_WORD1 | PNFSD_SUPPORTED_ATTRS_WORD1)
#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
- (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT)
+ (NFSD4_SUPPORTED_ATTRS_WORD2 | PNFSD_SUPPORTED_ATTRS_WORD2 | \
+ FATTR4_WORD2_SUPPATTR_EXCLCREAT)
+/* 4.2 */
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
#define NFSD4_2_SECURITY_ATTRS FATTR4_WORD2_SECURITY_LABEL
#else
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 08236d70c667..f22920442172 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -187,6 +187,24 @@ fh_init(struct svc_fh *fhp, int maxsize)
return fhp;
}
+static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
+{
+ if (fh1->fh_size != fh2->fh_size)
+ return false;
+ if (memcmp(fh1->fh_base.fh_pad, fh2->fh_base.fh_pad, fh1->fh_size) != 0)
+ return false;
+ return true;
+}
+
+static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
+{
+ if (fh1->fh_fsid_type != fh2->fh_fsid_type)
+ return false;
+ if (memcmp(fh1->fh_fsid, fh2->fh_fsid, key_len(fh1->fh_fsid_type)) != 0)
+ return false;
+ return true;
+}
+
#ifdef CONFIG_NFSD_V3
/*
* The wcc data stored in current_fh should be cleared
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 314f5c8f8f1a..9277cc91c21b 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -119,6 +119,7 @@ struct svc_program nfsd_program = {
static bool nfsd_supported_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1] = {
[0] = 1,
[1] = 1,
+ [2] = 1,
};
int nfsd_vers(int vers, enum vers_op change)
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
new file mode 100644
index 000000000000..d4c4453674c6
--- /dev/null
+++ b/fs/nfsd/pnfs.h
@@ -0,0 +1,86 @@
+#ifndef _FS_NFSD_PNFS_H
+#define _FS_NFSD_PNFS_H 1
+
+#ifdef CONFIG_NFSD_V4
+#include <linux/exportfs.h>
+#include <linux/nfsd/export.h>
+
+#include "state.h"
+#include "xdr4.h"
+
+struct xdr_stream;
+
+struct nfsd4_deviceid_map {
+ struct list_head hash;
+ u64 idx;
+ int fsid_type;
+ u32 fsid[];
+};
+
+struct nfsd4_layout_ops {
+ u32 notify_types;
+
+ __be32 (*proc_getdeviceinfo)(struct super_block *sb,
+ struct nfsd4_getdeviceinfo *gdevp);
+ __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr,
+ struct nfsd4_getdeviceinfo *gdevp);
+
+ __be32 (*proc_layoutget)(struct inode *, const struct svc_fh *fhp,
+ struct nfsd4_layoutget *lgp);
+ __be32 (*encode_layoutget)(struct xdr_stream *,
+ struct nfsd4_layoutget *lgp);
+
+ __be32 (*proc_layoutcommit)(struct inode *inode,
+ struct nfsd4_layoutcommit *lcp);
+};
+
+extern const struct nfsd4_layout_ops *nfsd4_layout_ops[];
+extern const struct nfsd4_layout_ops bl_layout_ops;
+
+__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate, stateid_t *stateid,
+ bool create, u32 layout_type, struct nfs4_layout_stateid **lsp);
+__be32 nfsd4_insert_layout(struct nfsd4_layoutget *lgp,
+ struct nfs4_layout_stateid *ls);
+__be32 nfsd4_return_file_layouts(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_layoutreturn *lrp);
+__be32 nfsd4_return_client_layouts(struct svc_rqst *rqstp,
+ struct nfsd4_compound_state *cstate,
+ struct nfsd4_layoutreturn *lrp);
+int nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
+ u32 device_generation);
+struct nfsd4_deviceid_map *nfsd4_find_devid_map(int idx);
+#endif /* CONFIG_NFSD_V4 */
+
+#ifdef CONFIG_NFSD_PNFS
+void nfsd4_setup_layout_type(struct svc_export *exp);
+void nfsd4_return_all_client_layouts(struct nfs4_client *);
+void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
+ struct nfs4_file *fp);
+int nfsd4_init_pnfs(void);
+void nfsd4_exit_pnfs(void);
+#else
+struct nfs4_client;
+struct nfs4_file;
+
+static inline void nfsd4_setup_layout_type(struct svc_export *exp)
+{
+}
+
+static inline void nfsd4_return_all_client_layouts(struct nfs4_client *clp)
+{
+}
+static inline void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
+ struct nfs4_file *fp)
+{
+}
+static inline void nfsd4_exit_pnfs(void)
+{
+}
+static inline int nfsd4_init_pnfs(void)
+{
+ return 0;
+}
+#endif /* CONFIG_NFSD_PNFS */
+#endif /* _FS_NFSD_PNFS_H */
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 9d3be371240a..4f3bfeb11766 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -92,6 +92,7 @@ struct nfs4_stid {
/* For a deleg stateid kept around only to process free_stateid's: */
#define NFS4_REVOKED_DELEG_STID 16
#define NFS4_CLOSED_DELEG_STID 32
+#define NFS4_LAYOUT_STID 64
unsigned char sc_type;
stateid_t sc_stateid;
struct nfs4_client *sc_client;
@@ -297,6 +298,9 @@ struct nfs4_client {
struct list_head cl_delegations;
struct list_head cl_revoked; /* unacknowledged, revoked 4.1 state */
struct list_head cl_lru; /* tail queue */
+#ifdef CONFIG_NFSD_PNFS
+ struct list_head cl_lo_states; /* outstanding layout states */
+#endif
struct xdr_netobj cl_name; /* id generated by client */
nfs4_verifier cl_verifier; /* generated by client */
time_t cl_time; /* time of last lease renewal */
@@ -493,9 +497,13 @@ struct nfs4_file {
atomic_t fi_access[2];
u32 fi_share_deny;
struct file *fi_deleg_file;
- atomic_t fi_delegees;
+ int fi_delegees;
struct knfsd_fh fi_fhandle;
bool fi_had_conflict;
+#ifdef CONFIG_NFSD_PNFS
+ struct list_head fi_lo_states;
+ atomic_t fi_lo_recalls;
+#endif
};
/*
@@ -528,6 +536,24 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
return container_of(s, struct nfs4_ol_stateid, st_stid);
}
+struct nfs4_layout_stateid {
+ struct nfs4_stid ls_stid;
+ struct list_head ls_perclnt;
+ struct list_head ls_perfile;
+ spinlock_t ls_lock;
+ struct list_head ls_layouts;
+ u32 ls_layout_type;
+ struct file *ls_file;
+ struct nfsd4_callback ls_recall;
+ stateid_t ls_recall_sid;
+ bool ls_recalled;
+};
+
+static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
+{
+ return container_of(s, struct nfs4_layout_stateid, ls_stid);
+}
+
/* flags for preprocess_seqid_op() */
#define RD_STATE 0x00000010
#define WR_STATE 0x00000020
@@ -535,6 +561,7 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
enum nfsd4_cb_op {
NFSPROC4_CLNT_CB_NULL = 0,
NFSPROC4_CLNT_CB_RECALL,
+ NFSPROC4_CLNT_CB_LAYOUT,
NFSPROC4_CLNT_CB_SEQUENCE,
};
@@ -545,6 +572,12 @@ struct nfsd_net;
extern __be32 nfs4_preprocess_stateid_op(struct net *net,
struct nfsd4_compound_state *cstate,
stateid_t *stateid, int flags, struct file **filp);
+__be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
+ stateid_t *stateid, unsigned char typemask,
+ struct nfs4_stid **s, struct nfsd_net *nn);
+struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
+ struct kmem_cache *slab);
+void nfs4_unhash_stid(struct nfs4_stid *s);
void nfs4_put_stid(struct nfs4_stid *s);
void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
extern void nfs4_release_reclaim(struct nfsd_net *);
@@ -567,6 +600,14 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
struct nfsd_net *nn);
extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
+struct nfs4_file *find_file(struct knfsd_fh *fh);
+void put_nfs4_file(struct nfs4_file *fi);
+static inline void get_nfs4_file(struct nfs4_file *fi)
+{
+ atomic_inc(&fi->fi_ref);
+}
+struct file *find_any_file(struct nfs4_file *f);
+
/* grace period management */
void nfsd4_end_grace(struct nfsd_net *nn);
diff --git a/fs/nfsd/trace.c b/fs/nfsd/trace.c
new file mode 100644
index 000000000000..82f89070594c
--- /dev/null
+++ b/fs/nfsd/trace.c
@@ -0,0 +1,5 @@
+
+#include "state.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
new file mode 100644
index 000000000000..c668520c344b
--- /dev/null
+++ b/fs/nfsd/trace.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM nfsd
+
+#if !defined(_NFSD_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _NFSD_TRACE_H
+
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(nfsd_stateid_class,
+ TP_PROTO(stateid_t *stp),
+ TP_ARGS(stp),
+ TP_STRUCT__entry(
+ __field(u32, cl_boot)
+ __field(u32, cl_id)
+ __field(u32, si_id)
+ __field(u32, si_generation)
+ ),
+ TP_fast_assign(
+ __entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
+ __entry->cl_id = stp->si_opaque.so_clid.cl_id;
+ __entry->si_id = stp->si_opaque.so_id;
+ __entry->si_generation = stp->si_generation;
+ ),
+ TP_printk("client %08x:%08x stateid %08x:%08x",
+ __entry->cl_boot,
+ __entry->cl_id,
+ __entry->si_id,
+ __entry->si_generation)
+)
+
+#define DEFINE_STATEID_EVENT(name) \
+DEFINE_EVENT(nfsd_stateid_class, name, \
+ TP_PROTO(stateid_t *stp), \
+ TP_ARGS(stp))
+DEFINE_STATEID_EVENT(layoutstate_alloc);
+DEFINE_STATEID_EVENT(layoutstate_unhash);
+DEFINE_STATEID_EVENT(layoutstate_free);
+DEFINE_STATEID_EVENT(layout_get_lookup_fail);
+DEFINE_STATEID_EVENT(layout_commit_lookup_fail);
+DEFINE_STATEID_EVENT(layout_return_lookup_fail);
+DEFINE_STATEID_EVENT(layout_recall);
+DEFINE_STATEID_EVENT(layout_recall_done);
+DEFINE_STATEID_EVENT(layout_recall_fail);
+DEFINE_STATEID_EVENT(layout_recall_release);
+
+#endif /* _NFSD_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace
+#include <trace/define_trace.h>
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 90a5925bd6ab..0bda93e58e1b 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -428,6 +428,61 @@ struct nfsd4_reclaim_complete {
u32 rca_one_fs;
};
+struct nfsd4_deviceid {
+ u64 fsid_idx;
+ u32 generation;
+ u32 pad;
+};
+
+struct nfsd4_layout_seg {
+ u32 iomode;
+ u64 offset;
+ u64 length;
+};
+
+struct nfsd4_getdeviceinfo {
+ struct nfsd4_deviceid gd_devid; /* request */
+ u32 gd_layout_type; /* request */
+ u32 gd_maxcount; /* request */
+ u32 gd_notify_types;/* request - response */
+ void *gd_device; /* response */
+};
+
+struct nfsd4_layoutget {
+ u64 lg_minlength; /* request */
+ u32 lg_signal; /* request */
+ u32 lg_layout_type; /* request */
+ u32 lg_maxcount; /* request */
+ stateid_t lg_sid; /* request/response */
+ struct nfsd4_layout_seg lg_seg; /* request/response */
+ void *lg_content; /* response */
+};
+
+struct nfsd4_layoutcommit {
+ stateid_t lc_sid; /* request */
+ struct nfsd4_layout_seg lc_seg; /* request */
+ u32 lc_reclaim; /* request */
+ u32 lc_newoffset; /* request */
+ u64 lc_last_wr; /* request */
+ struct timespec lc_mtime; /* request */
+ u32 lc_layout_type; /* request */
+ u32 lc_up_len; /* layout length */
+ void *lc_up_layout; /* decoded by callback */
+ u32 lc_size_chg; /* boolean for response */
+ u64 lc_newsize; /* response */
+};
+
+struct nfsd4_layoutreturn {
+ u32 lr_return_type; /* request */
+ u32 lr_layout_type; /* request */
+ struct nfsd4_layout_seg lr_seg; /* request */
+ u32 lr_reclaim; /* request */
+ u32 lrf_body_len; /* request */
+ void *lrf_body; /* request */
+ stateid_t lr_sid; /* request/response */
+ u32 lrs_present; /* response */
+};
+
struct nfsd4_fallocate {
/* request */
stateid_t falloc_stateid;
@@ -491,6 +546,10 @@ struct nfsd4_op {
struct nfsd4_reclaim_complete reclaim_complete;
struct nfsd4_test_stateid test_stateid;
struct nfsd4_free_stateid free_stateid;
+ struct nfsd4_getdeviceinfo getdeviceinfo;
+ struct nfsd4_layoutget layoutget;
+ struct nfsd4_layoutcommit layoutcommit;
+ struct nfsd4_layoutreturn layoutreturn;
/* NFSv4.2 */
struct nfsd4_fallocate allocate;
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
index c5c55dfb91a9..c47f6fdb111a 100644
--- a/fs/nfsd/xdr4cb.h
+++ b/fs/nfsd/xdr4cb.h
@@ -21,3 +21,10 @@
#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \
cb_sequence_dec_sz + \
op_dec_sz)
+#define NFS4_enc_cb_layout_sz (cb_compound_enc_hdr_sz + \
+ cb_sequence_enc_sz + \
+ 1 + 3 + \
+ enc_nfs4_fh_sz + 4)
+#define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \
+ cb_sequence_dec_sz + \
+ op_dec_sz)
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 3a03e0aea1fb..a8c728acb7a8 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -128,7 +128,6 @@ static const struct vm_operations_struct nilfs_file_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = nilfs_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
};
static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 57ceaf33d177..748ca238915a 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -172,7 +172,6 @@ int nilfs_init_gcinode(struct inode *inode)
inode->i_mode = S_IFREG;
mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
inode->i_mapping->a_ops = &empty_aops;
- inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
ii->i_flags = 0;
nilfs_bmap_init_gc(ii->i_bmap);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index c4dcd1db57ee..892cf5ffdb8e 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -429,7 +429,6 @@ int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz)
inode->i_mode = S_IFREG;
mapping_set_gfp_mask(inode->i_mapping, gfp_mask);
- inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
inode->i_op = &def_mdt_iops;
inode->i_fop = &def_mdt_fops;
@@ -457,13 +456,12 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode,
struct nilfs_shadow_map *shadow)
{
struct nilfs_mdt_info *mi = NILFS_MDT(inode);
- struct backing_dev_info *bdi = inode->i_sb->s_bdi;
INIT_LIST_HEAD(&shadow->frozen_buffers);
address_space_init_once(&shadow->frozen_data);
- nilfs_mapping_init(&shadow->frozen_data, inode, bdi);
+ nilfs_mapping_init(&shadow->frozen_data, inode);
address_space_init_once(&shadow->frozen_btnodes);
- nilfs_mapping_init(&shadow->frozen_btnodes, inode, bdi);
+ nilfs_mapping_init(&shadow->frozen_btnodes, inode);
mi->mi_shadow = shadow;
return 0;
}
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 91093cd74f0d..385704027575 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -141,7 +141,6 @@ enum {
* @ti_save: Backup of journal_info field of task_struct
* @ti_flags: Flags
* @ti_count: Nest level
- * @ti_garbage: List of inode to be put when releasing semaphore
*/
struct nilfs_transaction_info {
u32 ti_magic;
@@ -150,7 +149,6 @@ struct nilfs_transaction_info {
one of other filesystems has a bug. */
unsigned short ti_flags;
unsigned short ti_count;
- struct list_head ti_garbage;
};
/* ti_magic */
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index da276640f776..700ecbcca55d 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -461,14 +461,12 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
return nc;
}
-void nilfs_mapping_init(struct address_space *mapping, struct inode *inode,
- struct backing_dev_info *bdi)
+void nilfs_mapping_init(struct address_space *mapping, struct inode *inode)
{
mapping->host = inode;
mapping->flags = 0;
mapping_set_gfp_mask(mapping, GFP_NOFS);
mapping->private_data = NULL;
- mapping->backing_dev_info = bdi;
mapping->a_ops = &empty_aops;
}
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index ef30c5c2426f..a43b8287d012 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -57,8 +57,7 @@ int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
void nilfs_copy_back_pages(struct address_space *, struct address_space *);
void nilfs_clear_dirty_page(struct page *, bool);
void nilfs_clear_dirty_pages(struct address_space *, bool);
-void nilfs_mapping_init(struct address_space *mapping, struct inode *inode,
- struct backing_dev_info *bdi);
+void nilfs_mapping_init(struct address_space *mapping, struct inode *inode);
unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
sector_t start_blk,
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 7ef18fc656c2..469086b9f99b 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -305,7 +305,6 @@ static void nilfs_transaction_lock(struct super_block *sb,
ti->ti_count = 0;
ti->ti_save = cur_ti;
ti->ti_magic = NILFS_TI_MAGIC;
- INIT_LIST_HEAD(&ti->ti_garbage);
current->journal_info = ti;
for (;;) {
@@ -332,8 +331,6 @@ static void nilfs_transaction_unlock(struct super_block *sb)
up_write(&nilfs->ns_segctor_sem);
current->journal_info = ti->ti_save;
- if (!list_empty(&ti->ti_garbage))
- nilfs_dispose_list(nilfs, &ti->ti_garbage, 0);
}
static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
@@ -746,6 +743,15 @@ static void nilfs_dispose_list(struct the_nilfs *nilfs,
}
}
+static void nilfs_iput_work_func(struct work_struct *work)
+{
+ struct nilfs_sc_info *sci = container_of(work, struct nilfs_sc_info,
+ sc_iput_work);
+ struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
+
+ nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 0);
+}
+
static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs,
struct nilfs_root *root)
{
@@ -1900,8 +1906,8 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci,
static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
struct the_nilfs *nilfs)
{
- struct nilfs_transaction_info *ti = current->journal_info;
struct nilfs_inode_info *ii, *n;
+ int defer_iput = false;
spin_lock(&nilfs->ns_inode_lock);
list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
@@ -1912,9 +1918,24 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
clear_bit(NILFS_I_BUSY, &ii->i_state);
brelse(ii->i_bh);
ii->i_bh = NULL;
- list_move_tail(&ii->i_dirty, &ti->ti_garbage);
+ list_del_init(&ii->i_dirty);
+ if (!ii->vfs_inode.i_nlink) {
+ /*
+ * Defer calling iput() to avoid a deadlock
+ * over I_SYNC flag for inodes with i_nlink == 0
+ */
+ list_add_tail(&ii->i_dirty, &sci->sc_iput_queue);
+ defer_iput = true;
+ } else {
+ spin_unlock(&nilfs->ns_inode_lock);
+ iput(&ii->vfs_inode);
+ spin_lock(&nilfs->ns_inode_lock);
+ }
}
spin_unlock(&nilfs->ns_inode_lock);
+
+ if (defer_iput)
+ schedule_work(&sci->sc_iput_work);
}
/*
@@ -2583,6 +2604,8 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb,
INIT_LIST_HEAD(&sci->sc_segbufs);
INIT_LIST_HEAD(&sci->sc_write_logs);
INIT_LIST_HEAD(&sci->sc_gc_inodes);
+ INIT_LIST_HEAD(&sci->sc_iput_queue);
+ INIT_WORK(&sci->sc_iput_work, nilfs_iput_work_func);
init_timer(&sci->sc_timer);
sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
@@ -2609,6 +2632,8 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
ret = nilfs_segctor_construct(sci, SC_LSEG_SR);
nilfs_transaction_unlock(sci->sc_super);
+ flush_work(&sci->sc_iput_work);
+
} while (ret && retrycount-- > 0);
}
@@ -2633,6 +2658,9 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
|| sci->sc_seq_request != sci->sc_seq_done);
spin_unlock(&sci->sc_state_lock);
+ if (flush_work(&sci->sc_iput_work))
+ flag = true;
+
if (flag || !nilfs_segctor_confirm(sci))
nilfs_segctor_write_out(sci);
@@ -2642,6 +2670,12 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1);
}
+ if (!list_empty(&sci->sc_iput_queue)) {
+ nilfs_warning(sci->sc_super, __func__,
+ "iput queue is not empty\n");
+ nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1);
+ }
+
WARN_ON(!list_empty(&sci->sc_segbufs));
WARN_ON(!list_empty(&sci->sc_write_logs));
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 38a1d0013314..a48d6de1e02c 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -26,6 +26,7 @@
#include <linux/types.h>
#include <linux/fs.h>
#include <linux/buffer_head.h>
+#include <linux/workqueue.h>
#include <linux/nilfs2_fs.h>
#include "nilfs.h"
@@ -92,6 +93,8 @@ struct nilfs_segsum_pointer {
* @sc_nblk_inc: Block count of current generation
* @sc_dirty_files: List of files to be written
* @sc_gc_inodes: List of GC inodes having blocks to be written
+ * @sc_iput_queue: list of inodes for which iput should be done
+ * @sc_iput_work: work struct to defer iput call
* @sc_freesegs: array of segment numbers to be freed
* @sc_nfreesegs: number of segments on @sc_freesegs
* @sc_dsync_inode: inode whose data pages are written for a sync operation
@@ -135,6 +138,8 @@ struct nilfs_sc_info {
struct list_head sc_dirty_files;
struct list_head sc_gc_inodes;
+ struct list_head sc_iput_queue;
+ struct work_struct sc_iput_work;
__u64 *sc_freesegs;
size_t sc_nfreesegs;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 2e5b3ec85b8f..5bc2a1cf73c3 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -166,7 +166,7 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
ii->i_state = 0;
ii->i_cno = 0;
ii->vfs_inode.i_version = 1;
- nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode, sb->s_bdi);
+ nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode);
return &ii->vfs_inode;
}
@@ -1057,7 +1057,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
{
struct the_nilfs *nilfs;
struct nilfs_root *fsroot;
- struct backing_dev_info *bdi;
__u64 cno;
int err;
@@ -1077,8 +1076,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
sb->s_time_gran = 1;
sb->s_max_links = NILFS_LINK_MAX;
- bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
- sb->s_bdi = bdi ? : &default_backing_dev_info;
+ sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info;
err = load_nilfs(nilfs, sb);
if (err)
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index 22c629eedd82..2a24249b30af 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -1,5 +1,6 @@
config FSNOTIFY
def_bool n
+ select SRCU
source "fs/notify/dnotify/Kconfig"
source "fs/notify/inotify/Kconfig"
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 30d3addfad75..51ceb8107284 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -140,7 +140,7 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
}
if (S_ISDIR(path->dentry->d_inode->i_mode) &&
- (marks_ignored_mask & FS_ISDIR))
+ !(marks_mask & FS_ISDIR & ~marks_ignored_mask))
return false;
if (event_mask & marks_mask & ~marks_ignored_mask)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index c991616acca9..cf275500a665 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -259,16 +259,15 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
struct fsnotify_event *kevent;
char __user *start;
int ret;
- DEFINE_WAIT(wait);
+ DEFINE_WAIT_FUNC(wait, woken_wake_function);
start = buf;
group = file->private_data;
pr_debug("%s: group=%p\n", __func__, group);
+ add_wait_queue(&group->notification_waitq, &wait);
while (1) {
- prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE);
-
mutex_lock(&group->notification_mutex);
kevent = get_one_event(group, count);
mutex_unlock(&group->notification_mutex);
@@ -289,7 +288,8 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
if (start != buf)
break;
- schedule();
+
+ wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
continue;
}
@@ -318,8 +318,8 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
buf += ret;
count -= ret;
}
+ remove_wait_queue(&group->notification_waitq, &wait);
- finish_wait(&group->notification_waitq, &wait);
if (start != buf && ret != -EFAULT)
ret = buf - start;
return ret;
@@ -487,20 +487,27 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
unsigned int flags,
int *destroy)
{
- __u32 oldmask;
+ __u32 oldmask = 0;
spin_lock(&fsn_mark->lock);
if (!(flags & FAN_MARK_IGNORED_MASK)) {
+ __u32 tmask = fsn_mark->mask & ~mask;
+
+ if (flags & FAN_MARK_ONDIR)
+ tmask &= ~FAN_ONDIR;
+
oldmask = fsn_mark->mask;
- fsnotify_set_mark_mask_locked(fsn_mark, (oldmask & ~mask));
+ fsnotify_set_mark_mask_locked(fsn_mark, tmask);
} else {
- oldmask = fsn_mark->ignored_mask;
- fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask & ~mask));
+ __u32 tmask = fsn_mark->ignored_mask & ~mask;
+ if (flags & FAN_MARK_ONDIR)
+ tmask &= ~FAN_ONDIR;
+
+ fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
}
+ *destroy = !(fsn_mark->mask | fsn_mark->ignored_mask);
spin_unlock(&fsn_mark->lock);
- *destroy = !(oldmask & ~mask);
-
return mask & oldmask;
}
@@ -569,20 +576,22 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
spin_lock(&fsn_mark->lock);
if (!(flags & FAN_MARK_IGNORED_MASK)) {
+ __u32 tmask = fsn_mark->mask | mask;
+
+ if (flags & FAN_MARK_ONDIR)
+ tmask |= FAN_ONDIR;
+
oldmask = fsn_mark->mask;
- fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask));
+ fsnotify_set_mark_mask_locked(fsn_mark, tmask);
} else {
__u32 tmask = fsn_mark->ignored_mask | mask;
+ if (flags & FAN_MARK_ONDIR)
+ tmask |= FAN_ONDIR;
+
fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
}
-
- if (!(flags & FAN_MARK_ONDIR)) {
- __u32 tmask = fsn_mark->ignored_mask | FAN_ONDIR;
- fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
- }
-
spin_unlock(&fsn_mark->lock);
return mask & ~oldmask;
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 643faa44f22b..1da9b2d184dc 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -19,6 +19,7 @@
* Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include <linux/gfp.h>
#include <linux/pagemap.h>
@@ -2091,7 +2092,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
count = iov_length(iov, nr_segs);
pos = *ppos;
/* We can write back this queue in page reclaim. */
- current->backing_dev_info = mapping->backing_dev_info;
+ current->backing_dev_info = inode_to_bdi(inode);
written = 0;
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err)
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 7e8282dcea2a..c58a1bcfda0f 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -245,16 +245,14 @@ int ocfs2_set_acl(handle_t *handle,
ret = posix_acl_equiv_mode(acl, &mode);
if (ret < 0)
return ret;
- else {
- if (ret == 0)
- acl = NULL;
- ret = ocfs2_acl_set_mode(inode, di_bh,
- handle, mode);
- if (ret)
- return ret;
+ if (ret == 0)
+ acl = NULL;
- }
+ ret = ocfs2_acl_set_mode(inode, di_bh,
+ handle, mode);
+ if (ret)
+ return ret;
}
break;
case ACL_TYPE_DEFAULT:
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index fcae9ef1a328..044158bd22be 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6873,7 +6873,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
mlog_errno(ret);
- goto out_unlock;
+ goto out;
}
ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
@@ -6931,7 +6931,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
if (ret) {
mlog_errno(ret);
need_free = 1;
- goto out_commit;
+ goto out_unlock;
}
page_end = PAGE_CACHE_SIZE;
@@ -6964,12 +6964,16 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
if (ret) {
mlog_errno(ret);
need_free = 1;
- goto out_commit;
+ goto out_unlock;
}
inode->i_blocks = ocfs2_inode_sector_count(inode);
}
+out_unlock:
+ if (pages)
+ ocfs2_unlock_and_free_pages(pages, num_pages);
+
out_commit:
if (ret < 0 && did_quota)
dquot_free_space_nodirty(inode,
@@ -6989,15 +6993,11 @@ out_commit:
ocfs2_commit_trans(osb, handle);
-out_unlock:
+out:
if (data_ac)
ocfs2_free_alloc_context(data_ac);
-
-out:
- if (pages) {
- ocfs2_unlock_and_free_pages(pages, num_pages);
+ if (pages)
kfree(pages);
- }
return ret;
}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 46d93e941f3d..44db1808cdb5 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -28,6 +28,7 @@
#include <linux/pipe_fs_i.h>
#include <linux/mpage.h>
#include <linux/quotaops.h>
+#include <linux/blkdev.h>
#include <cluster/masklog.h>
@@ -47,6 +48,9 @@
#include "ocfs2_trace.h"
#include "buffer_head_io.h"
+#include "dir.h"
+#include "namei.h"
+#include "sysfile.h"
static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
@@ -506,18 +510,21 @@ bail:
*
* called like this: dio->get_blocks(dio->inode, fs_startblk,
* fs_count, map_bh, dio->rw == WRITE);
- *
- * Note that we never bother to allocate blocks here, and thus ignore the
- * create argument.
*/
static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
{
int ret;
+ u32 cpos = 0;
+ int alloc_locked = 0;
u64 p_blkno, inode_blocks, contig_blocks;
unsigned int ext_flags;
unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
+ unsigned long len = bh_result->b_size;
+ unsigned int clusters_to_alloc = 0;
+
+ cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock);
/* This function won't even be called if the request isn't all
* nicely aligned and of the right size, so there's no need
@@ -539,6 +546,40 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
/* We should already CoW the refcounted extent in case of create. */
BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
+ /* allocate blocks if no p_blkno is found, and create == 1 */
+ if (!p_blkno && create) {
+ ret = ocfs2_inode_lock(inode, NULL, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ alloc_locked = 1;
+
+ /* fill hole, allocate blocks can't be larger than the size
+ * of the hole */
+ clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
+ if (clusters_to_alloc > contig_blocks)
+ clusters_to_alloc = contig_blocks;
+
+ /* allocate extent and insert them into the extent tree */
+ ret = ocfs2_extend_allocation(inode, cpos,
+ clusters_to_alloc, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto bail;
+ }
+
+ ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
+ &contig_blocks, &ext_flags);
+ if (ret < 0) {
+ mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
+ (unsigned long long)iblock);
+ ret = -EIO;
+ goto bail;
+ }
+ }
+
/*
* get_more_blocks() expects us to describe a hole by clearing
* the mapped bit on bh_result().
@@ -556,6 +597,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
contig_blocks = max_blocks;
bh_result->b_size = contig_blocks << blocksize_bits;
bail:
+ if (alloc_locked)
+ ocfs2_inode_unlock(inode, 1);
return ret;
}
@@ -597,6 +640,184 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
return try_to_free_buffers(page);
}
+static int ocfs2_is_overwrite(struct ocfs2_super *osb,
+ struct inode *inode, loff_t offset)
+{
+ int ret = 0;
+ u32 v_cpos = 0;
+ u32 p_cpos = 0;
+ unsigned int num_clusters = 0;
+ unsigned int ext_flags = 0;
+
+ v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
+ ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
+ &num_clusters, &ext_flags);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+
+ if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN))
+ return 1;
+
+ return 0;
+}
+
+static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
+ struct iov_iter *iter,
+ loff_t offset)
+{
+ ssize_t ret = 0;
+ ssize_t written = 0;
+ bool orphaned = false;
+ int is_overwrite = 0;
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file)->i_mapping->host;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct buffer_head *di_bh = NULL;
+ size_t count = iter->count;
+ journal_t *journal = osb->journal->j_journal;
+ u32 zero_len;
+ int cluster_align;
+ loff_t final_size = offset + count;
+ int append_write = offset >= i_size_read(inode) ? 1 : 0;
+ unsigned int num_clusters = 0;
+ unsigned int ext_flags = 0;
+
+ {
+ u64 o = offset;
+
+ zero_len = do_div(o, 1 << osb->s_clustersize_bits);
+ cluster_align = !zero_len;
+ }
+
+ /*
+ * when final_size > inode->i_size, inode->i_size will be
+ * updated after direct write, so add the inode to orphan
+ * dir first.
+ */
+ if (final_size > i_size_read(inode)) {
+ ret = ocfs2_add_inode_to_orphan(osb, inode);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto out;
+ }
+ orphaned = true;
+ }
+
+ if (append_write) {
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto clean_orphan;
+ }
+
+ if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+ ret = ocfs2_zero_extend(inode, di_bh, offset);
+ else
+ ret = ocfs2_extend_no_holes(inode, di_bh, offset,
+ offset);
+ if (ret < 0) {
+ mlog_errno(ret);
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+ goto clean_orphan;
+ }
+
+ is_overwrite = ocfs2_is_overwrite(osb, inode, offset);
+ if (is_overwrite < 0) {
+ mlog_errno(is_overwrite);
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+ goto clean_orphan;
+ }
+
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+ di_bh = NULL;
+ }
+
+ written = __blockdev_direct_IO(WRITE, iocb, inode, inode->i_sb->s_bdev,
+ iter, offset,
+ ocfs2_direct_IO_get_blocks,
+ ocfs2_dio_end_io, NULL, 0);
+ if (unlikely(written < 0)) {
+ loff_t i_size = i_size_read(inode);
+
+ if (offset + count > i_size) {
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto clean_orphan;
+ }
+
+ if (i_size == i_size_read(inode)) {
+ ret = ocfs2_truncate_file(inode, di_bh,
+ i_size);
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+ goto clean_orphan;
+ }
+ }
+
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+
+ ret = jbd2_journal_force_commit(journal);
+ if (ret < 0)
+ mlog_errno(ret);
+ }
+ } else if (written < 0 && append_write && !is_overwrite &&
+ !cluster_align) {
+ u32 p_cpos = 0;
+ u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
+
+ ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
+ &num_clusters, &ext_flags);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto clean_orphan;
+ }
+
+ BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN));
+
+ ret = blkdev_issue_zeroout(osb->sb->s_bdev,
+ p_cpos << (osb->s_clustersize_bits - 9),
+ zero_len >> 9, GFP_KERNEL, false);
+ if (ret < 0)
+ mlog_errno(ret);
+ }
+
+clean_orphan:
+ if (orphaned) {
+ int tmp_ret;
+ int update_isize = written > 0 ? 1 : 0;
+ loff_t end = update_isize ? offset + written : 0;
+
+ tmp_ret = ocfs2_del_inode_from_orphan(osb, inode,
+ update_isize, end);
+ if (tmp_ret < 0) {
+ ret = tmp_ret;
+ goto out;
+ }
+
+ tmp_ret = jbd2_journal_force_commit(journal);
+ if (tmp_ret < 0) {
+ ret = tmp_ret;
+ mlog_errno(tmp_ret);
+ }
+ }
+
+out:
+ if (ret >= 0)
+ ret = written;
+ return ret;
+}
+
static ssize_t ocfs2_direct_IO(int rw,
struct kiocb *iocb,
struct iov_iter *iter,
@@ -604,6 +825,9 @@ static ssize_t ocfs2_direct_IO(int rw,
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file)->i_mapping->host;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ int full_coherency = !(osb->s_mount_opt &
+ OCFS2_MOUNT_COHERENCY_BUFFERED);
/*
* Fallback to buffered I/O if we see an inode without
@@ -612,14 +836,20 @@ static ssize_t ocfs2_direct_IO(int rw,
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
return 0;
- /* Fallback to buffered I/O if we are appending. */
- if (i_size_read(inode) <= offset)
+ /* Fallback to buffered I/O if we are appending and
+ * concurrent O_DIRECT writes are allowed.
+ */
+ if (i_size_read(inode) <= offset && !full_coherency)
return 0;
- return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
+ if (rw == READ)
+ return __blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev,
iter, offset,
ocfs2_direct_IO_get_blocks,
ocfs2_dio_end_io, NULL, 0);
+ else
+ return ocfs2_direct_IO_write(iocb, iter, offset);
}
static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 2e355e0f8335..56c403a563bc 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1016,7 +1016,8 @@ void o2net_fill_node_map(unsigned long *map, unsigned bytes)
memset(map, 0, bytes);
for (node = 0; node < O2NM_MAX_NODES; ++node) {
- o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret);
+ if (!o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret))
+ continue;
if (!ret) {
set_bit(node, map);
sc_put(sc);
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index dc024367110a..b95e7df5b76a 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -107,12 +107,12 @@ struct o2net_node {
struct list_head nn_status_list;
/* connects are attempted from when heartbeat comes up until either hb
- * goes down, the node is unconfigured, no connect attempts succeed
- * before O2NET_CONN_IDLE_DELAY, or a connect succeeds. connect_work
- * is queued from set_nn_state both from hb up and from itself if a
- * connect attempt fails and so can be self-arming. shutdown is
- * careful to first mark the nn such that no connects will be attempted
- * before canceling delayed connect work and flushing the queue. */
+ * goes down, the node is unconfigured, or a connect succeeds.
+ * connect_work is queued from set_nn_state both from hb up and from
+ * itself if a connect attempt fails and so can be self-arming.
+ * shutdown is careful to first mark the nn such that no connects will
+ * be attempted before canceling delayed connect work and flushing the
+ * queue. */
struct delayed_work nn_connect_work;
unsigned long nn_last_connect_attempt;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 319e786175af..b08050bd3f2e 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -3456,10 +3456,8 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
int blocksize = dir->i_sb->s_blocksize;
status = ocfs2_read_dir_block(dir, 0, &bh, 0);
- if (status) {
- mlog_errno(status);
+ if (status)
goto bail;
- }
rec_len = OCFS2_DIR_REC_LEN(namelen);
offset = 0;
@@ -3480,10 +3478,9 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
status = ocfs2_read_dir_block(dir,
offset >> sb->s_blocksize_bits,
&bh, 0);
- if (status) {
- mlog_errno(status);
+ if (status)
goto bail;
- }
+
/* move to next block */
de = (struct ocfs2_dir_entry *) bh->b_data;
}
@@ -3513,7 +3510,6 @@ next:
de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
}
- status = 0;
bail:
brelse(bh);
if (status)
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index b46278f9ae44..fd6bbbbd7d78 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -385,8 +385,12 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
head = &res->granted;
list_for_each_entry(lock, head, list) {
- if (lock->ml.cookie == cookie)
+ /* if lock is found but unlock is pending ignore the bast */
+ if (lock->ml.cookie == cookie) {
+ if (lock->unlock_pending)
+ break;
goto do_ast;
+ }
}
mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, "
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 149eb556b8c6..825136070d2c 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -406,7 +406,7 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len)
}
spin_unlock(&dlm->spinlock);
- out += snprintf(buf + out, len - out, "Total on list: %ld\n", total);
+ out += snprintf(buf + out, len - out, "Total on list: %lu\n", total);
return out;
}
@@ -464,7 +464,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len)
spin_unlock(&dlm->master_lock);
out += snprintf(buf + out, len - out,
- "Total: %ld, Longest: %ld\n", total, longest);
+ "Total: %lu, Longest: %lu\n", total, longest);
return out;
}
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 50a59d2337b2..7df88a6dd626 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -674,20 +674,6 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm)
spin_unlock(&dlm->spinlock);
}
-int dlm_joined(struct dlm_ctxt *dlm)
-{
- int ret = 0;
-
- spin_lock(&dlm_domain_lock);
-
- if (dlm->dlm_state == DLM_CTXT_JOINED)
- ret = 1;
-
- spin_unlock(&dlm_domain_lock);
-
- return ret;
-}
-
int dlm_shutting_down(struct dlm_ctxt *dlm)
{
int ret = 0;
diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h
index 2f7f60bfeb3b..fd6122a38dbd 100644
--- a/fs/ocfs2/dlm/dlmdomain.h
+++ b/fs/ocfs2/dlm/dlmdomain.h
@@ -28,7 +28,6 @@
extern spinlock_t dlm_domain_lock;
extern struct list_head dlm_domains;
-int dlm_joined(struct dlm_ctxt *dlm);
int dlm_shutting_down(struct dlm_ctxt *dlm);
void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
int node_num);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 79b5af5e6a7b..ce12e0b1a31f 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1070,6 +1070,9 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
dead_node, dlm->name);
list_del_init(&lock->list);
dlm_lock_put(lock);
+ /* Can't schedule DLM_UNLOCK_FREE_LOCK
+ * - do manually */
+ dlm_lock_put(lock);
break;
}
}
@@ -2023,11 +2026,8 @@ leave:
dlm_lockres_drop_inflight_ref(dlm, res);
spin_unlock(&res->spinlock);
- if (ret < 0) {
+ if (ret < 0)
mlog_errno(ret);
- if (newlock)
- dlm_lock_put(newlock);
- }
return ret;
}
@@ -2349,6 +2349,10 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
dead_node, dlm->name);
list_del_init(&lock->list);
dlm_lock_put(lock);
+ /* Can't schedule
+ * DLM_UNLOCK_FREE_LOCK
+ * - do manually */
+ dlm_lock_put(lock);
break;
}
}
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 57c40e34f56f..061ba6a91bf2 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -390,12 +390,6 @@ clear_fields:
ip->ip_conn = NULL;
}
-static struct backing_dev_info dlmfs_backing_dev_info = {
- .name = "ocfs2-dlmfs",
- .ra_pages = 0, /* No readahead */
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
-
static struct inode *dlmfs_get_root_inode(struct super_block *sb)
{
struct inode *inode = new_inode(sb);
@@ -404,7 +398,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
if (inode) {
inode->i_ino = get_next_ino();
inode_init_owner(inode, NULL, mode);
- inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inc_nlink(inode);
@@ -428,7 +421,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
inode->i_ino = get_next_ino();
inode_init_owner(inode, parent, mode);
- inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
ip = DLMFS_I(inode);
@@ -643,10 +635,6 @@ static int __init init_dlmfs_fs(void)
int status;
int cleanup_inode = 0, cleanup_worker = 0;
- status = bdi_init(&dlmfs_backing_dev_info);
- if (status)
- return status;
-
dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
sizeof(struct dlmfs_inode_private),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
@@ -673,7 +661,6 @@ bail:
kmem_cache_destroy(dlmfs_inode_cache);
if (cleanup_worker)
destroy_workqueue(user_dlm_worker);
- bdi_destroy(&dlmfs_backing_dev_info);
} else
printk("OCFS2 User DLM kernel interface loaded\n");
return status;
@@ -693,7 +680,6 @@ static void __exit exit_dlmfs_fs(void)
rcu_barrier();
kmem_cache_destroy(dlmfs_inode_cache);
- bdi_destroy(&dlmfs_backing_dev_info);
}
MODULE_AUTHOR("Oracle");
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 1c423af04c69..11849a44dc5a 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3750,6 +3750,9 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
break;
spin_unlock(&dentry_attach_lock);
+ if (S_ISDIR(dl->dl_inode->i_mode))
+ shrink_dcache_parent(dentry);
+
mlog(0, "d_delete(%pd);\n", dentry);
/*
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 3950693dd0f6..46e0d4e857c7 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -295,7 +295,7 @@ out:
return ret;
}
-static int ocfs2_set_inode_size(handle_t *handle,
+int ocfs2_set_inode_size(handle_t *handle,
struct inode *inode,
struct buffer_head *fe_bh,
u64 new_i_size)
@@ -441,7 +441,7 @@ out:
return status;
}
-static int ocfs2_truncate_file(struct inode *inode,
+int ocfs2_truncate_file(struct inode *inode,
struct buffer_head *di_bh,
u64 new_i_size)
{
@@ -569,7 +569,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
handle_t *handle = NULL;
struct ocfs2_alloc_context *data_ac = NULL;
struct ocfs2_alloc_context *meta_ac = NULL;
- enum ocfs2_alloc_restarted why;
+ enum ocfs2_alloc_restarted why = RESTART_NONE;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_extent_tree et;
int did_quota = 0;
@@ -709,6 +709,13 @@ leave:
return status;
}
+int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
+ u32 clusters_to_add, int mark_unwritten)
+{
+ return __ocfs2_extend_allocation(inode, logical_start,
+ clusters_to_add, mark_unwritten);
+}
+
/*
* While a write will already be ordering the data, a truncate will not.
* Thus, we need to explicitly order the zeroed pages.
@@ -2109,6 +2116,9 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = dentry->d_inode;
loff_t saved_pos = 0, end;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ int full_coherency = !(osb->s_mount_opt &
+ OCFS2_MOUNT_COHERENCY_BUFFERED);
/*
* We start with a read level meta lock and only jump to an ex
@@ -2197,7 +2207,16 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
* one node could wind up truncating another
* nodes writes.
*/
- if (end > i_size_read(inode)) {
+ if (end > i_size_read(inode) && !full_coherency) {
+ *direct_io = 0;
+ break;
+ }
+
+ /*
+ * Fallback to old way if the feature bit is not set.
+ */
+ if (end > i_size_read(inode) &&
+ !ocfs2_supports_append_dio(osb)) {
*direct_io = 0;
break;
}
@@ -2210,7 +2229,13 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
*/
ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
if (ret == 1) {
- *direct_io = 0;
+ /*
+ * Fallback to old way if the feature bit is not set.
+ * Otherwise try dio first and then complete the rest
+ * request through buffer io.
+ */
+ if (!ocfs2_supports_append_dio(osb))
+ *direct_io = 0;
ret = 0;
} else if (ret < 0)
mlog_errno(ret);
@@ -2243,6 +2268,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
u32 old_clusters;
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
+ struct address_space *mapping = file->f_mapping;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
int full_coherency = !(osb->s_mount_opt &
OCFS2_MOUNT_COHERENCY_BUFFERED);
@@ -2357,13 +2383,53 @@ relock:
iov_iter_truncate(from, count);
if (direct_io) {
+ loff_t endbyte;
+ ssize_t written_buffered;
written = generic_file_direct_write(iocb, from, *ppos);
- if (written < 0) {
+ if (written < 0 || written == count) {
ret = written;
goto out_dio;
}
+
+ /*
+ * for completing the rest of the request.
+ */
+ *ppos += written;
+ count -= written;
+ written_buffered = generic_perform_write(file, from, *ppos);
+ /*
+ * If generic_file_buffered_write() returned a synchronous error
+ * then we want to return the number of bytes which were
+ * direct-written, or the error code if that was zero. Note
+ * that this differs from normal direct-io semantics, which
+ * will return -EFOO even if some bytes were written.
+ */
+ if (written_buffered < 0) {
+ ret = written_buffered;
+ goto out_dio;
+ }
+
+ iocb->ki_pos = *ppos + written_buffered;
+ /* We need to ensure that the page cache pages are written to
+ * disk and invalidated to preserve the expected O_DIRECT
+ * semantics.
+ */
+ endbyte = *ppos + written_buffered - 1;
+ ret = filemap_write_and_wait_range(file->f_mapping, *ppos,
+ endbyte);
+ if (ret == 0) {
+ written += written_buffered;
+ invalidate_mapping_pages(mapping,
+ *ppos >> PAGE_CACHE_SHIFT,
+ endbyte >> PAGE_CACHE_SHIFT);
+ } else {
+ /*
+ * We don't know how much we wrote, so just return
+ * the number of bytes which were direct-written
+ */
+ }
} else {
- current->backing_dev_info = file->f_mapping->backing_dev_info;
+ current->backing_dev_info = inode_to_bdi(inode);
written = generic_perform_write(file, from, *ppos);
if (likely(written >= 0))
iocb->ki_pos = *ppos + written;
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 97bf761c9e7c..e8c62f22215c 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -51,13 +51,22 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
enum ocfs2_alloc_restarted *reason_ret);
+int ocfs2_set_inode_size(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *fe_bh,
+ u64 new_i_size);
int ocfs2_simple_size_update(struct inode *inode,
struct buffer_head *di_bh,
u64 new_i_size);
+int ocfs2_truncate_file(struct inode *inode,
+ struct buffer_head *di_bh,
+ u64 new_i_size);
int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
u64 new_i_size, u64 zero_to);
int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
loff_t zero_to);
+int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
+ u32 clusters_to_add, int mark_unwritten);
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index c8b25de9efbb..3025c0da6b8a 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -648,7 +648,7 @@ static int ocfs2_remove_inode(struct inode *inode,
if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
- orphan_dir_bh);
+ orphan_dir_bh, false);
if (status < 0) {
mlog_errno(status);
goto bail_commit;
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index ca3431ee7f24..5e86b247c821 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -81,6 +81,8 @@ struct ocfs2_inode_info
tid_t i_sync_tid;
tid_t i_datasync_tid;
+ wait_queue_head_t append_dio_wq;
+
struct dquot *i_dquot[MAXQUOTAS];
};
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 4f502382180f..ff531928269e 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -50,6 +50,8 @@
#include "sysfile.h"
#include "uptodate.h"
#include "quota.h"
+#include "file.h"
+#include "namei.h"
#include "buffer_head_io.h"
#include "ocfs2_trace.h"
@@ -69,13 +71,15 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
static int ocfs2_trylock_journal(struct ocfs2_super *osb,
int slot_num);
static int ocfs2_recover_orphans(struct ocfs2_super *osb,
- int slot);
+ int slot,
+ enum ocfs2_orphan_reco_type orphan_reco_type);
static int ocfs2_commit_thread(void *arg);
static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
int slot_num,
struct ocfs2_dinode *la_dinode,
struct ocfs2_dinode *tl_dinode,
- struct ocfs2_quota_recovery *qrec);
+ struct ocfs2_quota_recovery *qrec,
+ enum ocfs2_orphan_reco_type orphan_reco_type);
static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
{
@@ -149,7 +153,8 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb)
return 0;
}
-void ocfs2_queue_replay_slots(struct ocfs2_super *osb)
+void ocfs2_queue_replay_slots(struct ocfs2_super *osb,
+ enum ocfs2_orphan_reco_type orphan_reco_type)
{
struct ocfs2_replay_map *replay_map = osb->replay_map;
int i;
@@ -163,7 +168,8 @@ void ocfs2_queue_replay_slots(struct ocfs2_super *osb)
for (i = 0; i < replay_map->rm_slots; i++)
if (replay_map->rm_replay_slots[i])
ocfs2_queue_recovery_completion(osb->journal, i, NULL,
- NULL, NULL);
+ NULL, NULL,
+ orphan_reco_type);
replay_map->rm_state = REPLAY_DONE;
}
@@ -1174,6 +1180,7 @@ struct ocfs2_la_recovery_item {
struct ocfs2_dinode *lri_la_dinode;
struct ocfs2_dinode *lri_tl_dinode;
struct ocfs2_quota_recovery *lri_qrec;
+ enum ocfs2_orphan_reco_type lri_orphan_reco_type;
};
/* Does the second half of the recovery process. By this point, the
@@ -1195,6 +1202,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
struct ocfs2_dinode *la_dinode, *tl_dinode;
struct ocfs2_la_recovery_item *item, *n;
struct ocfs2_quota_recovery *qrec;
+ enum ocfs2_orphan_reco_type orphan_reco_type;
LIST_HEAD(tmp_la_list);
trace_ocfs2_complete_recovery(
@@ -1212,6 +1220,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
la_dinode = item->lri_la_dinode;
tl_dinode = item->lri_tl_dinode;
qrec = item->lri_qrec;
+ orphan_reco_type = item->lri_orphan_reco_type;
trace_ocfs2_complete_recovery_slot(item->lri_slot,
la_dinode ? le64_to_cpu(la_dinode->i_blkno) : 0,
@@ -1236,7 +1245,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
kfree(tl_dinode);
}
- ret = ocfs2_recover_orphans(osb, item->lri_slot);
+ ret = ocfs2_recover_orphans(osb, item->lri_slot,
+ orphan_reco_type);
if (ret < 0)
mlog_errno(ret);
@@ -1261,7 +1271,8 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
int slot_num,
struct ocfs2_dinode *la_dinode,
struct ocfs2_dinode *tl_dinode,
- struct ocfs2_quota_recovery *qrec)
+ struct ocfs2_quota_recovery *qrec,
+ enum ocfs2_orphan_reco_type orphan_reco_type)
{
struct ocfs2_la_recovery_item *item;
@@ -1285,6 +1296,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
item->lri_slot = slot_num;
item->lri_tl_dinode = tl_dinode;
item->lri_qrec = qrec;
+ item->lri_orphan_reco_type = orphan_reco_type;
spin_lock(&journal->j_lock);
list_add_tail(&item->lri_list, &journal->j_la_cleanups);
@@ -1304,7 +1316,8 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
/* No need to queue up our truncate_log as regular cleanup will catch
* that */
ocfs2_queue_recovery_completion(journal, osb->slot_num,
- osb->local_alloc_copy, NULL, NULL);
+ osb->local_alloc_copy, NULL, NULL,
+ ORPHAN_NEED_TRUNCATE);
ocfs2_schedule_truncate_log_flush(osb, 0);
osb->local_alloc_copy = NULL;
@@ -1312,7 +1325,7 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
/* queue to recover orphan slots for all offline slots */
ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
- ocfs2_queue_replay_slots(osb);
+ ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE);
ocfs2_free_replay_slots(osb);
}
@@ -1323,7 +1336,8 @@ void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
osb->slot_num,
NULL,
NULL,
- osb->quota_rec);
+ osb->quota_rec,
+ ORPHAN_NEED_TRUNCATE);
osb->quota_rec = NULL;
}
}
@@ -1360,7 +1374,7 @@ restart:
/* queue recovery for our own slot */
ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
- NULL, NULL);
+ NULL, NULL, ORPHAN_NO_NEED_TRUNCATE);
spin_lock(&osb->osb_lock);
while (rm->rm_used) {
@@ -1419,13 +1433,14 @@ skip_recovery:
continue;
}
ocfs2_queue_recovery_completion(osb->journal, rm_quota[i],
- NULL, NULL, qrec);
+ NULL, NULL, qrec,
+ ORPHAN_NEED_TRUNCATE);
}
ocfs2_super_unlock(osb, 1);
/* queue recovery for offline slots */
- ocfs2_queue_replay_slots(osb);
+ ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE);
bail:
mutex_lock(&osb->recovery_lock);
@@ -1447,7 +1462,6 @@ bail:
* requires that we call do_exit(). And it isn't exported, but
* complete_and_exit() seems to be a minimal wrapper around it. */
complete_and_exit(NULL, status);
- return status;
}
void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
@@ -1712,7 +1726,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
/* This will kfree the memory pointed to by la_copy and tl_copy */
ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
- tl_copy, NULL);
+ tl_copy, NULL, ORPHAN_NEED_TRUNCATE);
status = 0;
done:
@@ -1902,7 +1916,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
for (i = 0; i < osb->max_slots; i++)
ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL,
- NULL);
+ NULL, ORPHAN_NO_NEED_TRUNCATE);
/*
* We queued a recovery on orphan slots, increment the sequence
* number and update LVB so other node will skip the scan for a while
@@ -2001,6 +2015,13 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
if (IS_ERR(iter))
return 0;
+ /* Skip inodes which are already added to recover list, since dio may
+ * happen concurrently with unlink/rename */
+ if (OCFS2_I(iter)->ip_next_orphan) {
+ iput(iter);
+ return 0;
+ }
+
trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno);
/* No locking is required for the next_orphan queue as there
* is only ever a single process doing orphan recovery. */
@@ -2109,7 +2130,8 @@ static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb,
* advertising our state to ocfs2_delete_inode().
*/
static int ocfs2_recover_orphans(struct ocfs2_super *osb,
- int slot)
+ int slot,
+ enum ocfs2_orphan_reco_type orphan_reco_type)
{
int ret = 0;
struct inode *inode = NULL;
@@ -2133,13 +2155,60 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
(unsigned long long)oi->ip_blkno);
iter = oi->ip_next_orphan;
+ oi->ip_next_orphan = NULL;
+
+ /*
+ * We need to take and drop the inode lock to
+ * force read inode from disk.
+ */
+ ret = ocfs2_inode_lock(inode, NULL, 0);
+ if (ret) {
+ mlog_errno(ret);
+ goto next;
+ }
+ ocfs2_inode_unlock(inode, 0);
+
+ if (inode->i_nlink == 0) {
+ spin_lock(&oi->ip_lock);
+ /* Set the proper information to get us going into
+ * ocfs2_delete_inode. */
+ oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
+ spin_unlock(&oi->ip_lock);
+ } else if (orphan_reco_type == ORPHAN_NEED_TRUNCATE) {
+ struct buffer_head *di_bh = NULL;
+
+ ret = ocfs2_rw_lock(inode, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto next;
+ }
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret < 0) {
+ ocfs2_rw_unlock(inode, 1);
+ mlog_errno(ret);
+ goto next;
+ }
+
+ ret = ocfs2_truncate_file(inode, di_bh,
+ i_size_read(inode));
+ ocfs2_inode_unlock(inode, 1);
+ ocfs2_rw_unlock(inode, 1);
+ brelse(di_bh);
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto next;
+ }
+
+ ret = ocfs2_del_inode_from_orphan(osb, inode, 0, 0);
+ if (ret)
+ mlog_errno(ret);
- spin_lock(&oi->ip_lock);
- /* Set the proper information to get us going into
- * ocfs2_delete_inode. */
- oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
- spin_unlock(&oi->ip_lock);
+ wake_up(&OCFS2_I(inode)->append_dio_wq);
+ } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */
+next:
iput(inode);
inode = iter;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 7f8cde94abfe..f4cd3c3e9fb7 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -472,6 +472,11 @@ static inline int ocfs2_unlink_credits(struct super_block *sb)
* orphan dir index leaf */
#define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4)
+/* dinode + orphan dir dinode + extent tree leaf block + orphan dir entry +
+ * orphan dir index root + orphan dir index leaf */
+#define OCFS2_INODE_ADD_TO_ORPHAN_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 4)
+#define OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS OCFS2_INODE_ADD_TO_ORPHAN_CREDITS
+
/* dinode update, old dir dinode update, new dir dinode update, old
* dir dir entry, new dir dir entry, dir entry update for renaming
* directory + target unlink + 3 x dir index leaves */
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 10d66c75cecb..9581d190f6e1 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -173,7 +173,6 @@ out:
static const struct vm_operations_struct ocfs2_file_vm_ops = {
.fault = ocfs2_fault,
.page_mkwrite = ocfs2_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
};
int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index b931e04e3388..b5c3a5ea3ee6 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -79,7 +79,8 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
struct inode **ret_orphan_dir,
u64 blkno,
char *name,
- struct ocfs2_dir_lookup_result *lookup);
+ struct ocfs2_dir_lookup_result *lookup,
+ bool dio);
static int ocfs2_orphan_add(struct ocfs2_super *osb,
handle_t *handle,
@@ -87,15 +88,26 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
struct buffer_head *fe_bh,
char *name,
struct ocfs2_dir_lookup_result *lookup,
- struct inode *orphan_dir_inode);
+ struct inode *orphan_dir_inode,
+ bool dio);
static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
handle_t *handle,
struct inode *inode,
const char *symname);
+static int ocfs2_double_lock(struct ocfs2_super *osb,
+ struct buffer_head **bh1,
+ struct inode *inode1,
+ struct buffer_head **bh2,
+ struct inode *inode2,
+ int rename);
+
+static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2);
/* An orphan dir name is an 8 byte value, printed as a hex string */
#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
+#define OCFS2_DIO_ORPHAN_PREFIX "dio-"
+#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4
static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
@@ -678,8 +690,10 @@ static int ocfs2_link(struct dentry *old_dentry,
{
handle_t *handle;
struct inode *inode = old_dentry->d_inode;
+ struct inode *old_dir = old_dentry->d_parent->d_inode;
int err;
struct buffer_head *fe_bh = NULL;
+ struct buffer_head *old_dir_bh = NULL;
struct buffer_head *parent_fe_bh = NULL;
struct ocfs2_dinode *fe = NULL;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
@@ -696,19 +710,33 @@ static int ocfs2_link(struct dentry *old_dentry,
dquot_initialize(dir);
- err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT);
+ err = ocfs2_double_lock(osb, &old_dir_bh, old_dir,
+ &parent_fe_bh, dir, 0);
if (err < 0) {
if (err != -ENOENT)
mlog_errno(err);
return err;
}
+ /* make sure both dirs have bhs
+ * get an extra ref on old_dir_bh if old==new */
+ if (!parent_fe_bh) {
+ if (old_dir_bh) {
+ parent_fe_bh = old_dir_bh;
+ get_bh(parent_fe_bh);
+ } else {
+ mlog(ML_ERROR, "%s: no old_dir_bh!\n", osb->uuid_str);
+ err = -EIO;
+ goto out;
+ }
+ }
+
if (!dir->i_nlink) {
err = -ENOENT;
goto out;
}
- err = ocfs2_lookup_ino_from_name(dir, old_dentry->d_name.name,
+ err = ocfs2_lookup_ino_from_name(old_dir, old_dentry->d_name.name,
old_dentry->d_name.len, &old_de_ino);
if (err) {
err = -ENOENT;
@@ -801,10 +829,11 @@ out_unlock_inode:
ocfs2_inode_unlock(inode, 1);
out:
- ocfs2_inode_unlock(dir, 1);
+ ocfs2_double_unlock(old_dir, dir);
brelse(fe_bh);
brelse(parent_fe_bh);
+ brelse(old_dir_bh);
ocfs2_free_dir_lookup_result(&lookup);
@@ -927,7 +956,8 @@ static int ocfs2_unlink(struct inode *dir,
if (ocfs2_inode_is_unlinkable(inode)) {
status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
OCFS2_I(inode)->ip_blkno,
- orphan_name, &orphan_insert);
+ orphan_name, &orphan_insert,
+ false);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -979,7 +1009,7 @@ static int ocfs2_unlink(struct inode *dir,
if (is_unlinkable) {
status = ocfs2_orphan_add(osb, handle, inode, fe_bh,
- orphan_name, &orphan_insert, orphan_dir);
+ orphan_name, &orphan_insert, orphan_dir, false);
if (status < 0)
mlog_errno(status);
}
@@ -1072,14 +1102,15 @@ static int ocfs2_check_if_ancestor(struct ocfs2_super *osb,
}
/*
- * The only place this should be used is rename!
+ * The only place this should be used is rename and link!
* if they have the same id, then the 1st one is the only one locked.
*/
static int ocfs2_double_lock(struct ocfs2_super *osb,
struct buffer_head **bh1,
struct inode *inode1,
struct buffer_head **bh2,
- struct inode *inode2)
+ struct inode *inode2,
+ int rename)
{
int status;
int inode1_is_ancestor, inode2_is_ancestor;
@@ -1127,7 +1158,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
}
/* lock id2 */
status = ocfs2_inode_lock_nested(inode2, bh2, 1,
- OI_LS_RENAME1);
+ rename == 1 ? OI_LS_RENAME1 : OI_LS_PARENT);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
@@ -1136,7 +1167,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
}
/* lock id1 */
- status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_RENAME2);
+ status = ocfs2_inode_lock_nested(inode1, bh1, 1,
+ rename == 1 ? OI_LS_RENAME2 : OI_LS_PARENT);
if (status < 0) {
/*
* An error return must mean that no cluster locks
@@ -1252,7 +1284,7 @@ static int ocfs2_rename(struct inode *old_dir,
/* if old and new are the same, this'll just do one lock. */
status = ocfs2_double_lock(osb, &old_dir_bh, old_dir,
- &new_dir_bh, new_dir);
+ &new_dir_bh, new_dir, 1);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1413,7 +1445,8 @@ static int ocfs2_rename(struct inode *old_dir,
if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
OCFS2_I(new_inode)->ip_blkno,
- orphan_name, &orphan_insert);
+ orphan_name, &orphan_insert,
+ false);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1480,7 +1513,7 @@ static int ocfs2_rename(struct inode *old_dir,
if (should_add_orphan) {
status = ocfs2_orphan_add(osb, handle, new_inode,
newfe_bh, orphan_name,
- &orphan_insert, orphan_dir);
+ &orphan_insert, orphan_dir, false);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -2061,12 +2094,28 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
struct buffer_head *orphan_dir_bh,
u64 blkno,
char *name,
- struct ocfs2_dir_lookup_result *lookup)
+ struct ocfs2_dir_lookup_result *lookup,
+ bool dio)
{
int ret;
struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb);
+ int namelen = dio ?
+ (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) :
+ OCFS2_ORPHAN_NAMELEN;
+
+ if (dio) {
+ ret = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s",
+ OCFS2_DIO_ORPHAN_PREFIX);
+ if (ret != OCFS2_DIO_ORPHAN_PREFIX_LEN) {
+ ret = -EINVAL;
+ mlog_errno(ret);
+ return ret;
+ }
- ret = ocfs2_blkno_stringify(blkno, name);
+ ret = ocfs2_blkno_stringify(blkno,
+ name + OCFS2_DIO_ORPHAN_PREFIX_LEN);
+ } else
+ ret = ocfs2_blkno_stringify(blkno, name);
if (ret < 0) {
mlog_errno(ret);
return ret;
@@ -2074,7 +2123,7 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
orphan_dir_bh, name,
- OCFS2_ORPHAN_NAMELEN, lookup);
+ namelen, lookup);
if (ret < 0) {
mlog_errno(ret);
return ret;
@@ -2101,7 +2150,8 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
struct inode **ret_orphan_dir,
u64 blkno,
char *name,
- struct ocfs2_dir_lookup_result *lookup)
+ struct ocfs2_dir_lookup_result *lookup,
+ bool dio)
{
struct inode *orphan_dir_inode = NULL;
struct buffer_head *orphan_dir_bh = NULL;
@@ -2115,7 +2165,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
}
ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh,
- blkno, name, lookup);
+ blkno, name, lookup, dio);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -2143,12 +2193,16 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
struct buffer_head *fe_bh,
char *name,
struct ocfs2_dir_lookup_result *lookup,
- struct inode *orphan_dir_inode)
+ struct inode *orphan_dir_inode,
+ bool dio)
{
struct buffer_head *orphan_dir_bh = NULL;
int status = 0;
struct ocfs2_dinode *orphan_fe;
struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
+ int namelen = dio ?
+ (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) :
+ OCFS2_ORPHAN_NAMELEN;
trace_ocfs2_orphan_add_begin(
(unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -2192,7 +2246,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
ocfs2_journal_dirty(handle, orphan_dir_bh);
status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
- OCFS2_ORPHAN_NAMELEN, inode,
+ namelen, inode,
OCFS2_I(inode)->ip_blkno,
orphan_dir_bh, lookup);
if (status < 0) {
@@ -2200,13 +2254,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
goto rollback;
}
- fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL);
- OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
+ if (dio) {
+ /* Update flag OCFS2_DIO_ORPHANED_FL and record the orphan
+ * slot.
+ */
+ fe->i_flags |= cpu_to_le32(OCFS2_DIO_ORPHANED_FL);
+ fe->i_dio_orphaned_slot = cpu_to_le16(osb->slot_num);
+ } else {
+ fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL);
+ OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
- /* Record which orphan dir our inode now resides
- * in. delete_inode will use this to determine which orphan
- * dir to lock. */
- fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
+ /* Record which orphan dir our inode now resides
+ * in. delete_inode will use this to determine which orphan
+ * dir to lock. */
+ fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
+ }
ocfs2_journal_dirty(handle, fe_bh);
@@ -2231,14 +2293,28 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
handle_t *handle,
struct inode *orphan_dir_inode,
struct inode *inode,
- struct buffer_head *orphan_dir_bh)
+ struct buffer_head *orphan_dir_bh,
+ bool dio)
{
- char name[OCFS2_ORPHAN_NAMELEN + 1];
+ const int namelen = OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN;
+ char name[namelen + 1];
struct ocfs2_dinode *orphan_fe;
int status = 0;
struct ocfs2_dir_lookup_result lookup = { NULL, };
- status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
+ if (dio) {
+ status = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s",
+ OCFS2_DIO_ORPHAN_PREFIX);
+ if (status != OCFS2_DIO_ORPHAN_PREFIX_LEN) {
+ status = -EINVAL;
+ mlog_errno(status);
+ return status;
+ }
+
+ status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno,
+ name + OCFS2_DIO_ORPHAN_PREFIX_LEN);
+ } else
+ status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -2246,10 +2322,10 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
trace_ocfs2_orphan_del(
(unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
- name, OCFS2_ORPHAN_NAMELEN);
+ name, namelen);
/* find it's spot in the orphan directory */
- status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode,
+ status = ocfs2_find_entry(name, namelen, orphan_dir_inode,
&lookup);
if (status) {
mlog_errno(status);
@@ -2349,7 +2425,8 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir,
}
ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh,
- di_blkno, orphan_name, orphan_insert);
+ di_blkno, orphan_name, orphan_insert,
+ false);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -2455,7 +2532,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
di = (struct ocfs2_dinode *)new_di_bh->b_data;
status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
- &orphan_insert, orphan_dir);
+ &orphan_insert, orphan_dir, false);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -2500,6 +2577,186 @@ leave:
return status;
}
+static int ocfs2_dio_orphan_recovered(struct inode *inode)
+{
+ int ret;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di = NULL;
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return 0;
+ }
+
+ di = (struct ocfs2_dinode *) di_bh->b_data;
+ ret = !(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL));
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+
+ return ret;
+}
+
+#define OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL 10000
+int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
+ struct inode *inode)
+{
+ char orphan_name[OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN + 1];
+ struct inode *orphan_dir_inode = NULL;
+ struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
+ struct buffer_head *di_bh = NULL;
+ int status = 0;
+ handle_t *handle = NULL;
+ struct ocfs2_dinode *di = NULL;
+
+restart:
+ status = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+
+ di = (struct ocfs2_dinode *) di_bh->b_data;
+ /*
+ * Another append dio crashed?
+ * If so, wait for recovery first.
+ */
+ if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+ wait_event_interruptible_timeout(OCFS2_I(inode)->append_dio_wq,
+ ocfs2_dio_orphan_recovered(inode),
+ msecs_to_jiffies(OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL));
+ goto restart;
+ }
+
+ status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode,
+ OCFS2_I(inode)->ip_blkno,
+ orphan_name,
+ &orphan_insert,
+ true);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail_unlock_inode;
+ }
+
+ handle = ocfs2_start_trans(osb,
+ OCFS2_INODE_ADD_TO_ORPHAN_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ goto bail_unlock_orphan;
+ }
+
+ status = ocfs2_orphan_add(osb, handle, inode, di_bh, orphan_name,
+ &orphan_insert, orphan_dir_inode, true);
+ if (status)
+ mlog_errno(status);
+
+ ocfs2_commit_trans(osb, handle);
+
+bail_unlock_orphan:
+ ocfs2_inode_unlock(orphan_dir_inode, 1);
+ mutex_unlock(&orphan_dir_inode->i_mutex);
+ iput(orphan_dir_inode);
+
+ ocfs2_free_dir_lookup_result(&orphan_insert);
+
+bail_unlock_inode:
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+
+bail:
+ return status;
+}
+
+int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
+ struct inode *inode, int update_isize,
+ loff_t end)
+{
+ struct inode *orphan_dir_inode = NULL;
+ struct buffer_head *orphan_dir_bh = NULL;
+ struct buffer_head *di_bh = NULL;
+ struct ocfs2_dinode *di = NULL;
+ handle_t *handle = NULL;
+ int status = 0;
+
+ status = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ di = (struct ocfs2_dinode *) di_bh->b_data;
+
+ orphan_dir_inode = ocfs2_get_system_file_inode(osb,
+ ORPHAN_DIR_SYSTEM_INODE,
+ le16_to_cpu(di->i_dio_orphaned_slot));
+ if (!orphan_dir_inode) {
+ status = -ENOENT;
+ mlog_errno(status);
+ goto bail_unlock_inode;
+ }
+
+ mutex_lock(&orphan_dir_inode->i_mutex);
+ status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
+ if (status < 0) {
+ mutex_unlock(&orphan_dir_inode->i_mutex);
+ iput(orphan_dir_inode);
+ mlog_errno(status);
+ goto bail_unlock_inode;
+ }
+
+ handle = ocfs2_start_trans(osb,
+ OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ goto bail_unlock_orphan;
+ }
+
+ BUG_ON(!(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)));
+
+ status = ocfs2_orphan_del(osb, handle, orphan_dir_inode,
+ inode, orphan_dir_bh, true);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail_commit;
+ }
+
+ status = ocfs2_journal_access_di(handle,
+ INODE_CACHE(inode),
+ di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail_commit;
+ }
+
+ di->i_flags &= ~cpu_to_le32(OCFS2_DIO_ORPHANED_FL);
+ di->i_dio_orphaned_slot = 0;
+
+ if (update_isize) {
+ status = ocfs2_set_inode_size(handle, inode, di_bh, end);
+ if (status)
+ mlog_errno(status);
+ } else
+ ocfs2_journal_dirty(handle, di_bh);
+
+bail_commit:
+ ocfs2_commit_trans(osb, handle);
+
+bail_unlock_orphan:
+ ocfs2_inode_unlock(orphan_dir_inode, 1);
+ mutex_unlock(&orphan_dir_inode->i_mutex);
+ brelse(orphan_dir_bh);
+ iput(orphan_dir_inode);
+
+bail_unlock_inode:
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+
+bail:
+ return status;
+}
+
int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
struct inode *inode,
struct dentry *dentry)
@@ -2588,7 +2845,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
}
status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
- orphan_dir_bh);
+ orphan_dir_bh, false);
if (status < 0) {
mlog_errno(status);
goto out_commit;
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index e5d059d4f115..5ddecce172fa 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -34,10 +34,16 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
handle_t *handle,
struct inode *orphan_dir_inode,
struct inode *inode,
- struct buffer_head *orphan_dir_bh);
+ struct buffer_head *orphan_dir_bh,
+ bool dio);
int ocfs2_create_inode_in_orphan(struct inode *dir,
int mode,
struct inode **new_inode);
+int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
+ struct inode *inode);
+int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
+ struct inode *inode, int update_isize,
+ loff_t end);
int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
struct inode *new_inode,
struct dentry *new_dentry);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 7d6b7d090452..8490c64d34fe 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -209,6 +209,11 @@ struct ocfs2_lock_res {
#endif
};
+enum ocfs2_orphan_reco_type {
+ ORPHAN_NO_NEED_TRUNCATE = 0,
+ ORPHAN_NEED_TRUNCATE,
+};
+
enum ocfs2_orphan_scan_state {
ORPHAN_SCAN_ACTIVE,
ORPHAN_SCAN_INACTIVE
@@ -279,6 +284,8 @@ enum ocfs2_mount_options
writes */
OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */
OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
+
+ OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */
};
#define OCFS2_OSB_SOFT_RO 0x0001
@@ -493,6 +500,14 @@ static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb)
return 0;
}
+static inline int ocfs2_supports_append_dio(struct ocfs2_super *osb)
+{
+ if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_APPEND_DIO)
+ return 1;
+ return 0;
+}
+
+
static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb)
{
if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA)
@@ -724,6 +739,16 @@ static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb,
return clusters;
}
+static inline unsigned int ocfs2_bytes_to_clusters(struct super_block *sb,
+ u64 bytes)
+{
+ int cl_bits = OCFS2_SB(sb)->s_clustersize_bits;
+ unsigned int clusters;
+
+ clusters = (unsigned int)(bytes >> cl_bits);
+ return clusters;
+}
+
static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb,
u64 bytes)
{
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 938387a10d5d..20e37a3ed26f 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -105,7 +105,8 @@
| OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)
#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
| OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
- | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
+ | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA \
+ | OCFS2_FEATURE_RO_COMPAT_APPEND_DIO)
/*
* Heartbeat-only devices are missing journals and other files. The
@@ -199,6 +200,11 @@
#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002
#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004
+/*
+ * Append Direct IO support
+ */
+#define OCFS2_FEATURE_RO_COMPAT_APPEND_DIO 0x0008
+
/* The byte offset of the first backup block will be 1G.
* The following will be 4G, 16G, 64G, 256G and 1T.
*/
@@ -229,6 +235,8 @@
#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */
#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */
#define OCFS2_QUOTA_FL (0x00001000) /* Quota file */
+#define OCFS2_DIO_ORPHANED_FL (0X00002000) /* On the orphan list especially
+ * for dio */
/*
* Flags on ocfs2_dinode.i_dyn_features
@@ -729,7 +737,9 @@ struct ocfs2_dinode {
inode belongs to. Only valid
if allocated from a
discontiguous block group */
-/*A0*/ __le64 i_reserved2[3];
+/*A0*/ __le16 i_dio_orphaned_slot; /* only used for append dio write */
+ __le16 i_reserved1[3];
+ __le64 i_reserved2[2];
/*B8*/ union {
__le64 i_pad1; /* Generic way to refer to this
64bit union */
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 1eae330193a6..b6d51333ad02 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -48,6 +48,7 @@ struct ocfs2_quota_recovery {
/* In-memory structure with quota header information */
struct ocfs2_mem_dqinfo {
unsigned int dqi_type; /* Quota type this structure describes */
+ unsigned int dqi_flags; /* Flags OLQF_* */
unsigned int dqi_chunks; /* Number of chunks in local quota file */
unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */
unsigned int dqi_syncms; /* How often should we sync with other nodes */
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 10b653930ee2..3d0b63d34225 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -73,12 +73,6 @@ static loff_t ol_dqblk_off(struct super_block *sb, int c, int off)
ol_dqblk_block_off(sb, c, off);
}
-/* Compute block number from given offset */
-static inline unsigned int ol_dqblk_file_block(struct super_block *sb, loff_t off)
-{
- return off >> sb->s_blocksize_bits;
-}
-
static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off)
{
return off & ((1 << sb->s_blocksize_bits) - 1);
@@ -292,7 +286,7 @@ static void olq_update_info(struct buffer_head *bh, void *private)
ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
OCFS2_LOCAL_INFO_OFF);
spin_lock(&dq_data_lock);
- ldinfo->dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
+ ldinfo->dqi_flags = cpu_to_le32(oinfo->dqi_flags);
ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks);
ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks);
spin_unlock(&dq_data_lock);
@@ -701,8 +695,8 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
/* We don't need the lock and we have to acquire quota file locks
* which will later depend on this lock */
mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
- info->dqi_maxblimit = 0x7fffffffffffffffLL;
- info->dqi_maxilimit = 0x7fffffffffffffffLL;
+ info->dqi_max_spc_limit = 0x7fffffffffffffffLL;
+ info->dqi_max_ino_limit = 0x7fffffffffffffffLL;
oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS);
if (!oinfo) {
mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota"
@@ -737,13 +731,13 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
}
ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
OCFS2_LOCAL_INFO_OFF);
- info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
+ oinfo->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
oinfo->dqi_libh = bh;
/* We crashed when using local quota file? */
- if (!(info->dqi_flags & OLQF_CLEAN)) {
+ if (!(oinfo->dqi_flags & OLQF_CLEAN)) {
rec = OCFS2_SB(sb)->quota_rec;
if (!rec) {
rec = ocfs2_alloc_quota_recovery();
@@ -772,7 +766,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
}
/* Now mark quota file as used */
- info->dqi_flags &= ~OLQF_CLEAN;
+ oinfo->dqi_flags &= ~OLQF_CLEAN;
status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info);
if (status < 0) {
mlog_errno(status);
@@ -857,7 +851,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
goto out;
/* Mark local file as clean */
- info->dqi_flags |= OLQF_CLEAN;
+ oinfo->dqi_flags |= OLQF_CLEAN;
status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
oinfo->dqi_libh,
olq_update_info,
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index d81f6e2a97f5..ee541f92dab4 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2428,8 +2428,6 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
get_bh(prev_bh);
}
- rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
-
trace_ocfs2_calc_refcount_meta_credits_iterate(
recs_add, (unsigned long long)cpos, clusters,
(unsigned long long)le64_to_cpu(rec.r_cpos),
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index 41ffd36c689c..6a348b0294ab 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -39,7 +39,7 @@
#define OCFS2_CHECK_RESERVATIONS
#endif
-DEFINE_SPINLOCK(resv_lock);
+static DEFINE_SPINLOCK(resv_lock);
#define OCFS2_MIN_RESV_WINDOW_BITS 8
#define OCFS2_MAX_RESV_WINDOW_BITS 1024
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 83723179e1ec..26675185b886 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -191,6 +191,7 @@ enum {
Opt_coherency_full,
Opt_resv_level,
Opt_dir_resv_level,
+ Opt_journal_async_commit,
Opt_err,
};
@@ -222,6 +223,7 @@ static const match_table_t tokens = {
{Opt_coherency_full, "coherency=full"},
{Opt_resv_level, "resv_level=%u"},
{Opt_dir_resv_level, "dir_resv_level=%u"},
+ {Opt_journal_async_commit, "journal_async_commit"},
{Opt_err, NULL}
};
@@ -1000,36 +1002,6 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
}
}
-/* Handle quota on quotactl */
-static int ocfs2_quota_on(struct super_block *sb, int type, int format_id)
-{
- unsigned int feature[OCFS2_MAXQUOTAS] = {
- OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
- OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
-
- if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
- return -EINVAL;
-
- return dquot_enable(sb_dqopt(sb)->files[type], type,
- format_id, DQUOT_LIMITS_ENABLED);
-}
-
-/* Handle quota off quotactl */
-static int ocfs2_quota_off(struct super_block *sb, int type)
-{
- return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
-}
-
-static const struct quotactl_ops ocfs2_quotactl_ops = {
- .quota_on_meta = ocfs2_quota_on,
- .quota_off = ocfs2_quota_off,
- .quota_sync = dquot_quota_sync,
- .get_info = dquot_get_dqinfo,
- .set_info = dquot_set_dqinfo,
- .get_dqblk = dquot_get_dqblk,
- .set_dqblk = dquot_set_dqblk,
-};
-
static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
{
struct dentry *root;
@@ -1500,6 +1472,9 @@ static int ocfs2_parse_options(struct super_block *sb,
option < OCFS2_MAX_RESV_LEVEL)
mopt->dir_resv_level = option;
break;
+ case Opt_journal_async_commit:
+ mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT;
+ break;
default:
mlog(ML_ERROR,
"Unrecognized mount option \"%s\" "
@@ -1606,6 +1581,9 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
if (osb->osb_dir_resv_level != osb->osb_resv_level)
seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level);
+ if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT)
+ seq_printf(s, ",journal_async_commit");
+
return 0;
}
@@ -1768,6 +1746,8 @@ static void ocfs2_inode_init_once(void *data)
ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
ocfs2_lock_res_init_once(&oi->ip_open_lockres);
+ init_waitqueue_head(&oi->append_dio_wq);
+
ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode),
&ocfs2_inode_caching_ops);
@@ -2079,7 +2059,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
sb->s_op = &ocfs2_sops;
sb->s_d_op = &ocfs2_dentry_ops;
sb->s_export_op = &ocfs2_export_ops;
- sb->s_qcop = &ocfs2_quotactl_ops;
+ sb->s_qcop = &dquot_quotactl_sysfile_ops;
sb->dq_op = &ocfs2_quota_operations;
sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
sb->s_xattr = ocfs2_xattr_handlers;
@@ -2475,6 +2455,15 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
goto finally;
}
+ if (osb->s_mount_opt & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT)
+ jbd2_journal_set_features(osb->journal->j_journal,
+ JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+ else
+ jbd2_journal_clear_features(osb->journal->j_journal,
+ JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+ JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+
if (dirty) {
/* recover my local alloc if we didn't unmount cleanly. */
status = ocfs2_begin_local_alloc_recovery(osb,
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 662f8dee149f..85b190dc132f 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -5334,16 +5334,6 @@ out:
return ret;
}
-static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
- struct ocfs2_xattr_bucket *bucket,
- int offs)
-{
- int block_off = offs >> inode->i_sb->s_blocksize_bits;
-
- offs = offs % inode->i_sb->s_blocksize;
- return bucket_block(bucket, block_off) + offs;
-}
-
/*
* Truncate the specified xe_off entry in xattr bucket.
* bucket is indicated by header_bh and len is the new length.
diff --git a/fs/open.c b/fs/open.c
index 813be037b412..33f9cbf2610b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -667,11 +667,8 @@ int open_check_o_direct(struct file *f)
{
/* NB: we're sure to have correct a_ops only after f_op->open */
if (f->f_flags & O_DIRECT) {
- if (!f->f_mapping->a_ops ||
- ((!f->f_mapping->a_ops->direct_IO) &&
- (!f->f_mapping->a_ops->get_xip_mem))) {
+ if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
return -EINVAL;
- }
}
return 0;
}
@@ -971,8 +968,14 @@ struct file *file_open_name(struct filename *name, int flags, umode_t mode)
*/
struct file *filp_open(const char *filename, int flags, umode_t mode)
{
- struct filename name = {.name = filename};
- return file_open_name(&name, flags, mode);
+ struct filename *name = getname_kernel(filename);
+ struct file *file = ERR_CAST(name);
+
+ if (!IS_ERR(name)) {
+ file = file_open_name(name, flags, mode);
+ putname(name);
+ }
+ return file;
}
EXPORT_SYMBOL(filp_open);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index bd117d065b82..1295a00ca316 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -81,6 +81,7 @@
#include <linux/pid_namespace.h>
#include <linux/ptrace.h>
#include <linux/tracehook.h>
+#include <linux/string_helpers.h>
#include <linux/user_namespace.h>
#include <asm/pgtable.h>
@@ -89,39 +90,18 @@
static inline void task_name(struct seq_file *m, struct task_struct *p)
{
- int i;
- char *buf, *end;
- char *name;
+ char *buf;
char tcomm[sizeof(p->comm)];
get_task_comm(tcomm, p);
seq_puts(m, "Name:\t");
- end = m->buf + m->size;
buf = m->buf + m->count;
- name = tcomm;
- i = sizeof(tcomm);
- while (i && (buf < end)) {
- unsigned char c = *name;
- name++;
- i--;
- *buf = c;
- if (!c)
- break;
- if (c == '\\') {
- buf++;
- if (buf < end)
- *buf++ = c;
- continue;
- }
- if (c == '\n') {
- *buf++ = '\\';
- if (buf < end)
- *buf++ = 'n';
- continue;
- }
- buf++;
- }
+
+ /* Ignore error for now */
+ string_escape_str(tcomm, &buf, m->size - m->count,
+ ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+
m->count = buf - m->buf;
seq_putc(m, '\n');
}
@@ -336,12 +316,10 @@ static inline void task_context_switch_counts(struct seq_file *m,
static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
{
- seq_puts(m, "Cpus_allowed:\t");
- seq_cpumask(m, &task->cpus_allowed);
- seq_putc(m, '\n');
- seq_puts(m, "Cpus_allowed_list:\t");
- seq_cpumask_list(m, &task->cpus_allowed);
- seq_putc(m, '\n');
+ seq_printf(m, "Cpus_allowed:\t%*pb\n",
+ cpumask_pr_args(&task->cpus_allowed));
+ seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
+ cpumask_pr_args(&task->cpus_allowed));
}
int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 7fea13229f33..3309f59d421b 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -122,7 +122,7 @@ static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct inode *inode = dentry->d_inode;
- struct proc_dir_entry *de = PROC_I(inode)->pde;
+ struct proc_dir_entry *de = PDE(inode);
if (de && de->nlink)
set_nlink(inode, de->nlink);
@@ -350,29 +350,12 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
if (ret)
return ret;
- if (S_ISDIR(dp->mode)) {
- dp->proc_fops = &proc_dir_operations;
- dp->proc_iops = &proc_dir_inode_operations;
- dir->nlink++;
- } else if (S_ISLNK(dp->mode)) {
- dp->proc_iops = &proc_link_inode_operations;
- } else if (S_ISREG(dp->mode)) {
- BUG_ON(dp->proc_fops == NULL);
- dp->proc_iops = &proc_file_inode_operations;
- } else {
- WARN_ON(1);
- proc_free_inum(dp->low_ino);
- return -EINVAL;
- }
-
spin_lock(&proc_subdir_lock);
dp->parent = dir;
if (pde_subdir_insert(dir, dp) == false) {
WARN(1, "proc_dir_entry '%s/%s' already registered\n",
dir->name, dp->name);
spin_unlock(&proc_subdir_lock);
- if (S_ISDIR(dp->mode))
- dir->nlink--;
proc_free_inum(dp->low_ino);
return -EEXIST;
}
@@ -431,6 +414,7 @@ struct proc_dir_entry *proc_symlink(const char *name,
ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL);
if (ent->data) {
strcpy((char*)ent->data,dest);
+ ent->proc_iops = &proc_link_inode_operations;
if (proc_register(parent, ent) < 0) {
kfree(ent->data);
kfree(ent);
@@ -456,8 +440,12 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
ent = __proc_create(&parent, name, S_IFDIR | mode, 2);
if (ent) {
ent->data = data;
+ ent->proc_fops = &proc_dir_operations;
+ ent->proc_iops = &proc_dir_inode_operations;
+ parent->nlink++;
if (proc_register(parent, ent) < 0) {
kfree(ent);
+ parent->nlink--;
ent = NULL;
}
}
@@ -493,6 +481,8 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
return NULL;
}
+ BUG_ON(proc_fops == NULL);
+
if ((mode & S_IALLUGO) == 0)
mode |= S_IRUGO;
pde = __proc_create(&parent, name, mode, 1);
@@ -500,6 +490,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
goto out;
pde->proc_fops = proc_fops;
pde->data = data;
+ pde->proc_iops = &proc_file_inode_operations;
if (proc_register(parent, pde) < 0)
goto out_free;
return pde;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 8420a2f80811..13a50a32652d 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -40,7 +40,7 @@ static void proc_evict_inode(struct inode *inode)
put_pid(PROC_I(inode)->pid);
/* Let go of any associated proc directory entry */
- de = PROC_I(inode)->pde;
+ de = PDE(inode);
if (de)
pde_put(de);
head = PROC_I(inode)->sysctl;
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 1e3187da1fed..7eee2d8b97d9 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -5,6 +5,7 @@
#include <linux/ksm.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
+#include <linux/huge_mm.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/hugetlb.h>
@@ -121,9 +122,18 @@ u64 stable_page_flags(struct page *page)
* just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon
* to make sure a given page is a thp, not a non-huge compound page.
*/
- else if (PageTransCompound(page) && (PageLRU(compound_head(page)) ||
- PageAnon(compound_head(page))))
- u |= 1 << KPF_THP;
+ else if (PageTransCompound(page)) {
+ struct page *head = compound_head(page);
+
+ if (PageLRU(head) || PageAnon(head))
+ u |= 1 << KPF_THP;
+ else if (is_huge_zero_page(head)) {
+ u |= 1 << KPF_ZERO_PAGE;
+ u |= 1 << KPF_THP;
+ }
+ } else if (is_zero_pfn(page_to_pfn(page)))
+ u |= 1 << KPF_ZERO_PAGE;
+
/*
* Caveats on high order pages: page->_count will only be set
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 246eae84b13b..956b75d61809 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -21,7 +21,7 @@
void task_mem(struct seq_file *m, struct mm_struct *mm)
{
- unsigned long data, text, lib, swap;
+ unsigned long data, text, lib, swap, ptes, pmds;
unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
/*
@@ -42,6 +42,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
swap = get_mm_counter(mm, MM_SWAPENTS);
+ ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes);
+ pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
seq_printf(m,
"VmPeak:\t%8lu kB\n"
"VmSize:\t%8lu kB\n"
@@ -54,6 +56,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
"VmExe:\t%8lu kB\n"
"VmLib:\t%8lu kB\n"
"VmPTE:\t%8lu kB\n"
+ "VmPMD:\t%8lu kB\n"
"VmSwap:\t%8lu kB\n",
hiwater_vm << (PAGE_SHIFT-10),
total_vm << (PAGE_SHIFT-10),
@@ -63,8 +66,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
total_rss << (PAGE_SHIFT-10),
data << (PAGE_SHIFT-10),
mm->stack_vm << (PAGE_SHIFT-10), text, lib,
- (PTRS_PER_PTE * sizeof(pte_t) *
- atomic_long_read(&mm->nr_ptes)) >> 10,
+ ptes >> 10,
+ pmds >> 10,
swap << (PAGE_SHIFT-10));
}
@@ -433,7 +436,6 @@ const struct file_operations proc_tid_maps_operations = {
#ifdef CONFIG_PROC_PAGE_MONITOR
struct mem_size_stats {
- struct vm_area_struct *vma;
unsigned long resident;
unsigned long shared_clean;
unsigned long shared_dirty;
@@ -443,7 +445,6 @@ struct mem_size_stats {
unsigned long anonymous;
unsigned long anonymous_thp;
unsigned long swap;
- unsigned long nonlinear;
u64 pss;
};
@@ -483,8 +484,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
struct mm_walk *walk)
{
struct mem_size_stats *mss = walk->private;
- struct vm_area_struct *vma = mss->vma;
- pgoff_t pgoff = linear_page_index(vma, addr);
+ struct vm_area_struct *vma = walk->vma;
struct page *page = NULL;
if (pte_present(*pte)) {
@@ -496,17 +496,10 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
mss->swap += PAGE_SIZE;
else if (is_migration_entry(swpent))
page = migration_entry_to_page(swpent);
- } else if (pte_file(*pte)) {
- if (pte_to_pgoff(*pte) != pgoff)
- mss->nonlinear += PAGE_SIZE;
}
if (!page)
return;
-
- if (page->index != pgoff)
- mss->nonlinear += PAGE_SIZE;
-
smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte));
}
@@ -515,7 +508,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
struct mm_walk *walk)
{
struct mem_size_stats *mss = walk->private;
- struct vm_area_struct *vma = mss->vma;
+ struct vm_area_struct *vma = walk->vma;
struct page *page;
/* FOLL_DUMP will return -EFAULT on huge zero page */
@@ -536,8 +529,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
- struct mem_size_stats *mss = walk->private;
- struct vm_area_struct *vma = mss->vma;
+ struct vm_area_struct *vma = walk->vma;
pte_t *pte;
spinlock_t *ptl;
@@ -596,7 +588,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_ACCOUNT)] = "ac",
[ilog2(VM_NORESERVE)] = "nr",
[ilog2(VM_HUGETLB)] = "ht",
- [ilog2(VM_NONLINEAR)] = "nl",
[ilog2(VM_ARCH_1)] = "ar",
[ilog2(VM_DONTDUMP)] = "dd",
#ifdef CONFIG_MEM_SOFT_DIRTY
@@ -630,10 +621,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
};
memset(&mss, 0, sizeof mss);
- mss.vma = vma;
/* mmap_sem is held in m_start */
- if (vma->vm_mm && !is_vm_hugetlb_page(vma))
- walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
+ walk_page_vma(vma, &smaps_walk);
show_map_vma(m, vma, is_pid);
@@ -668,10 +657,6 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
(vma->vm_flags & VM_LOCKED) ?
(unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
- if (vma->vm_flags & VM_NONLINEAR)
- seq_printf(m, "Nonlinear: %8lu kB\n",
- mss.nonlinear >> 10);
-
show_smap_vma_flags(m, vma);
m_cache_vma(m, vma);
return 0;
@@ -747,18 +732,18 @@ enum clear_refs_types {
CLEAR_REFS_ANON,
CLEAR_REFS_MAPPED,
CLEAR_REFS_SOFT_DIRTY,
+ CLEAR_REFS_MM_HIWATER_RSS,
CLEAR_REFS_LAST,
};
struct clear_refs_private {
- struct vm_area_struct *vma;
enum clear_refs_types type;
};
+#ifdef CONFIG_MEM_SOFT_DIRTY
static inline void clear_soft_dirty(struct vm_area_struct *vma,
unsigned long addr, pte_t *pte)
{
-#ifdef CONFIG_MEM_SOFT_DIRTY
/*
* The soft-dirty tracker uses #PF-s to catch writes
* to pages, so write-protect the pte as well. See the
@@ -772,24 +757,63 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
} else if (is_swap_pte(ptent)) {
ptent = pte_swp_clear_soft_dirty(ptent);
- } else if (pte_file(ptent)) {
- ptent = pte_file_clear_soft_dirty(ptent);
}
set_pte_at(vma->vm_mm, addr, pte, ptent);
-#endif
}
+static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t pmd = *pmdp;
+
+ pmd = pmd_wrprotect(pmd);
+ pmd = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
+
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ vma->vm_flags &= ~VM_SOFTDIRTY;
+
+ set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
+}
+
+#else
+
+static inline void clear_soft_dirty(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *pte)
+{
+}
+
+static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+}
+#endif
+
static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
struct clear_refs_private *cp = walk->private;
- struct vm_area_struct *vma = cp->vma;
+ struct vm_area_struct *vma = walk->vma;
pte_t *pte, ptent;
spinlock_t *ptl;
struct page *page;
- split_huge_page_pmd(vma, addr, pmd);
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
+ clear_soft_dirty_pmd(vma, addr, pmd);
+ goto out;
+ }
+
+ page = pmd_page(*pmd);
+
+ /* Clear accessed and referenced bits. */
+ pmdp_test_and_clear_young(vma, addr, pmd);
+ ClearPageReferenced(page);
+out:
+ spin_unlock(ptl);
+ return 0;
+ }
+
if (pmd_trans_unstable(pmd))
return 0;
@@ -818,6 +842,28 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
return 0;
}
+static int clear_refs_test_walk(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct clear_refs_private *cp = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+
+ if (vma->vm_flags & VM_PFNMAP)
+ return 1;
+
+ /*
+ * Writing 1 to /proc/pid/clear_refs affects all pages.
+ * Writing 2 to /proc/pid/clear_refs only affects anonymous pages.
+ * Writing 3 to /proc/pid/clear_refs only affects file mapped pages.
+ * Writing 4 to /proc/pid/clear_refs affects all pages.
+ */
+ if (cp->type == CLEAR_REFS_ANON && vma->vm_file)
+ return 1;
+ if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file)
+ return 1;
+ return 0;
+}
+
static ssize_t clear_refs_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
@@ -858,9 +904,22 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
};
struct mm_walk clear_refs_walk = {
.pmd_entry = clear_refs_pte_range,
+ .test_walk = clear_refs_test_walk,
.mm = mm,
.private = &cp,
};
+
+ if (type == CLEAR_REFS_MM_HIWATER_RSS) {
+ /*
+ * Writing 5 to /proc/pid/clear_refs resets the peak
+ * resident set size to this mm's current rss value.
+ */
+ down_write(&mm->mmap_sem);
+ reset_mm_hiwater_rss(mm);
+ up_write(&mm->mmap_sem);
+ goto out_mm;
+ }
+
down_read(&mm->mmap_sem);
if (type == CLEAR_REFS_SOFT_DIRTY) {
for (vma = mm->mmap; vma; vma = vma->vm_next) {
@@ -877,32 +936,12 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
}
mmu_notifier_invalidate_range_start(mm, 0, -1);
}
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- cp.vma = vma;
- if (is_vm_hugetlb_page(vma))
- continue;
- /*
- * Writing 1 to /proc/pid/clear_refs affects all pages.
- *
- * Writing 2 to /proc/pid/clear_refs only affects
- * Anonymous pages.
- *
- * Writing 3 to /proc/pid/clear_refs only affects file
- * mapped pages.
- *
- * Writing 4 to /proc/pid/clear_refs affects all pages.
- */
- if (type == CLEAR_REFS_ANON && vma->vm_file)
- continue;
- if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
- continue;
- walk_page_range(vma->vm_start, vma->vm_end,
- &clear_refs_walk);
- }
+ walk_page_range(0, ~0UL, &clear_refs_walk);
if (type == CLEAR_REFS_SOFT_DIRTY)
mmu_notifier_invalidate_range_end(mm, 0, -1);
flush_tlb_mm(mm);
up_read(&mm->mmap_sem);
+out_mm:
mmput(mm);
}
put_task_struct(task);
@@ -1066,15 +1105,13 @@ static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemap
static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
- struct vm_area_struct *vma;
+ struct vm_area_struct *vma = walk->vma;
struct pagemapread *pm = walk->private;
spinlock_t *ptl;
- pte_t *pte;
+ pte_t *pte, *orig_pte;
int err = 0;
- /* find the first VMA at or above 'addr' */
- vma = find_vma(walk->mm, addr);
- if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
int pmd_flags2;
if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
@@ -1100,51 +1137,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (pmd_trans_unstable(pmd))
return 0;
- while (1) {
- /* End of address space hole, which we mark as non-present. */
- unsigned long hole_end;
-
- if (vma)
- hole_end = min(end, vma->vm_start);
- else
- hole_end = end;
-
- for (; addr < hole_end; addr += PAGE_SIZE) {
- pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
-
- err = add_to_pagemap(addr, &pme, pm);
- if (err)
- return err;
- }
-
- if (!vma || vma->vm_start >= end)
- break;
- /*
- * We can't possibly be in a hugetlb VMA. In general,
- * for a mm_walk with a pmd_entry and a hugetlb_entry,
- * the pmd_entry can only be called on addresses in a
- * hugetlb if the walk starts in a non-hugetlb VMA and
- * spans a hugepage VMA. Since pagemap_read walks are
- * PMD-sized and PMD-aligned, this will never be true.
- */
- BUG_ON(is_vm_hugetlb_page(vma));
-
- /* Addresses in the VMA. */
- for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
- pagemap_entry_t pme;
- pte = pte_offset_map(pmd, addr);
- pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
- pte_unmap(pte);
- err = add_to_pagemap(addr, &pme, pm);
- if (err)
- return err;
- }
+ /*
+ * We can assume that @vma always points to a valid one and @end never
+ * goes beyond vma->vm_end.
+ */
+ orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ for (; addr < end; pte++, addr += PAGE_SIZE) {
+ pagemap_entry_t pme;
- if (addr == end)
+ pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
+ err = add_to_pagemap(addr, &pme, pm);
+ if (err)
break;
-
- vma = find_vma(walk->mm, addr);
}
+ pte_unmap_unlock(orig_pte, ptl);
cond_resched();
@@ -1170,15 +1176,12 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
struct mm_walk *walk)
{
struct pagemapread *pm = walk->private;
- struct vm_area_struct *vma;
+ struct vm_area_struct *vma = walk->vma;
int err = 0;
int flags2;
pagemap_entry_t pme;
- vma = find_vma(walk->mm, addr);
- WARN_ON_ONCE(!vma);
-
- if (vma && (vma->vm_flags & VM_SOFTDIRTY))
+ if (vma->vm_flags & VM_SOFTDIRTY)
flags2 = __PM_SOFT_DIRTY;
else
flags2 = 0;
@@ -1338,7 +1341,6 @@ const struct file_operations proc_pagemap_operations = {
#ifdef CONFIG_NUMA
struct numa_maps {
- struct vm_area_struct *vma;
unsigned long pages;
unsigned long anon;
unsigned long active;
@@ -1407,18 +1409,17 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
- struct numa_maps *md;
+ struct numa_maps *md = walk->private;
+ struct vm_area_struct *vma = walk->vma;
spinlock_t *ptl;
pte_t *orig_pte;
pte_t *pte;
- md = walk->private;
-
- if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
pte_t huge_pte = *(pte_t *)pmd;
struct page *page;
- page = can_gather_numa_stats(huge_pte, md->vma, addr);
+ page = can_gather_numa_stats(huge_pte, vma, addr);
if (page)
gather_stats(page, md, pte_dirty(huge_pte),
HPAGE_PMD_SIZE/PAGE_SIZE);
@@ -1430,7 +1431,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
return 0;
orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
do {
- struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
+ struct page *page = can_gather_numa_stats(*pte, vma, addr);
if (!page)
continue;
gather_stats(page, md, pte_dirty(*pte), 1);
@@ -1440,7 +1441,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
return 0;
}
#ifdef CONFIG_HUGETLB_PAGE
-static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
+static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end, struct mm_walk *walk)
{
struct numa_maps *md;
@@ -1459,7 +1460,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
}
#else
-static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
+static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end, struct mm_walk *walk)
{
return 0;
@@ -1477,7 +1478,12 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
struct numa_maps *md = &numa_priv->md;
struct file *file = vma->vm_file;
struct mm_struct *mm = vma->vm_mm;
- struct mm_walk walk = {};
+ struct mm_walk walk = {
+ .hugetlb_entry = gather_hugetlb_stats,
+ .pmd_entry = gather_pte_stats,
+ .private = md,
+ .mm = mm,
+ };
struct mempolicy *pol;
char buffer[64];
int nid;
@@ -1488,13 +1494,6 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
/* Ensure we start with an empty set of numa_maps statistics. */
memset(md, 0, sizeof(*md));
- md->vma = vma;
-
- walk.hugetlb_entry = gather_hugetbl_stats;
- walk.pmd_entry = gather_pte_stats;
- walk.private = md;
- walk.mm = mm;
-
pol = __get_vma_policy(vma, vma->vm_start);
if (pol) {
mpol_to_str(buffer, sizeof(buffer), pol);
@@ -1528,7 +1527,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
if (is_vm_hugetlb_page(vma))
seq_puts(m, " huge");
- walk_page_range(vma->vm_start, vma->vm_end, &walk);
+ /* mmap_sem is held by m_start */
+ walk_page_vma(vma, &walk);
if (!md->pages)
goto out;
@@ -1557,6 +1557,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
for_each_node_state(nid, N_MEMORY)
if (md->node[nid])
seq_printf(m, " N%d=%lu", nid, md->node[nid]);
+
+ seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10);
out:
seq_putc(m, '\n');
m_cache_vma(m, vma);
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index a90d6d354199..4e61388ec03d 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -546,8 +546,8 @@ static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr)
nhdr_ptr = notes_section;
while (nhdr_ptr->n_namesz != 0) {
sz = sizeof(Elf64_Nhdr) +
- ((nhdr_ptr->n_namesz + 3) & ~3) +
- ((nhdr_ptr->n_descsz + 3) & ~3);
+ (((u64)nhdr_ptr->n_namesz + 3) & ~3) +
+ (((u64)nhdr_ptr->n_descsz + 3) & ~3);
if ((real_sz + sz) > max_sz) {
pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n",
nhdr_ptr->n_namesz, nhdr_ptr->n_descsz);
@@ -732,8 +732,8 @@ static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr)
nhdr_ptr = notes_section;
while (nhdr_ptr->n_namesz != 0) {
sz = sizeof(Elf32_Nhdr) +
- ((nhdr_ptr->n_namesz + 3) & ~3) +
- ((nhdr_ptr->n_descsz + 3) & ~3);
+ (((u64)nhdr_ptr->n_namesz + 3) & ~3) +
+ (((u64)nhdr_ptr->n_descsz + 3) & ~3);
if ((real_sz + sz) > max_sz) {
pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n",
nhdr_ptr->n_namesz, nhdr_ptr->n_descsz);
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 0f96f71ab32b..8db932da4009 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -44,6 +44,7 @@ static int show_sb_opts(struct seq_file *m, struct super_block *sb)
{ MS_SYNCHRONOUS, ",sync" },
{ MS_DIRSYNC, ",dirsync" },
{ MS_MANDLOCK, ",mand" },
+ { MS_LAZYTIME, ",lazytime" },
{ 0, NULL }
};
const struct proc_fs_info *fs_infop;
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index 983d9510becc..916b8e23d968 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -21,6 +21,16 @@ config PSTORE_CONSOLE
When the option is enabled, pstore will log all kernel
messages, even if no oops or panic happened.
+config PSTORE_PMSG
+ bool "Log user space messages"
+ depends on PSTORE
+ help
+ When the option is enabled, pstore will export a character
+ interface /dev/pmsg0 to log user space messages. On reboot
+ data can be retrieved from /sys/fs/pstore/pmsg-ramoops-[ID].
+
+ If unsure, say N.
+
config PSTORE_FTRACE
bool "Persistent function tracer"
depends on PSTORE
diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile
index 4c9095c2781e..e647d8e81712 100644
--- a/fs/pstore/Makefile
+++ b/fs/pstore/Makefile
@@ -7,5 +7,7 @@ obj-y += pstore.o
pstore-objs += inode.o platform.o
obj-$(CONFIG_PSTORE_FTRACE) += ftrace.o
+obj-$(CONFIG_PSTORE_PMSG) += pmsg.o
+
ramoops-objs += ram.o ram_core.o
obj-$(CONFIG_PSTORE_RAM) += ramoops.o
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 50416602774d..b32ce53d24ee 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -338,32 +338,38 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
switch (type) {
case PSTORE_TYPE_DMESG:
- sprintf(name, "dmesg-%s-%lld%s", psname, id,
- compressed ? ".enc.z" : "");
+ scnprintf(name, sizeof(name), "dmesg-%s-%lld%s",
+ psname, id, compressed ? ".enc.z" : "");
break;
case PSTORE_TYPE_CONSOLE:
- sprintf(name, "console-%s-%lld", psname, id);
+ scnprintf(name, sizeof(name), "console-%s-%lld", psname, id);
break;
case PSTORE_TYPE_FTRACE:
- sprintf(name, "ftrace-%s-%lld", psname, id);
+ scnprintf(name, sizeof(name), "ftrace-%s-%lld", psname, id);
break;
case PSTORE_TYPE_MCE:
- sprintf(name, "mce-%s-%lld", psname, id);
+ scnprintf(name, sizeof(name), "mce-%s-%lld", psname, id);
break;
case PSTORE_TYPE_PPC_RTAS:
- sprintf(name, "rtas-%s-%lld", psname, id);
+ scnprintf(name, sizeof(name), "rtas-%s-%lld", psname, id);
break;
case PSTORE_TYPE_PPC_OF:
- sprintf(name, "powerpc-ofw-%s-%lld", psname, id);
+ scnprintf(name, sizeof(name), "powerpc-ofw-%s-%lld",
+ psname, id);
break;
case PSTORE_TYPE_PPC_COMMON:
- sprintf(name, "powerpc-common-%s-%lld", psname, id);
+ scnprintf(name, sizeof(name), "powerpc-common-%s-%lld",
+ psname, id);
+ break;
+ case PSTORE_TYPE_PMSG:
+ scnprintf(name, sizeof(name), "pmsg-%s-%lld", psname, id);
break;
case PSTORE_TYPE_UNKNOWN:
- sprintf(name, "unknown-%s-%lld", psname, id);
+ scnprintf(name, sizeof(name), "unknown-%s-%lld", psname, id);
break;
default:
- sprintf(name, "type%d-%s-%lld", type, psname, id);
+ scnprintf(name, sizeof(name), "type%d-%s-%lld",
+ type, psname, id);
break;
}
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 3b3d305277c4..c36ba2cd0b5d 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -45,6 +45,12 @@ extern void pstore_register_ftrace(void);
static inline void pstore_register_ftrace(void) {}
#endif
+#ifdef CONFIG_PSTORE_PMSG
+extern void pstore_register_pmsg(void);
+#else
+static inline void pstore_register_pmsg(void) {}
+#endif
+
extern struct pstore_info *psinfo;
extern void pstore_set_kmsg_bytes(int);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 0a9b72cdfeca..c4c9a10c5760 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -301,7 +301,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
if (big_oops_buf) {
dst = big_oops_buf;
- hsize = sprintf(dst, "%s#%d Part%d\n", why,
+ hsize = sprintf(dst, "%s#%d Part%u\n", why,
oopscount, part);
size = big_oops_buf_sz - hsize;
@@ -321,7 +321,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
}
} else {
dst = psinfo->buf;
- hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount,
+ hsize = sprintf(dst, "%s#%d Part%u\n", why, oopscount,
part);
size = psinfo->bufsize - hsize;
dst += hsize;
@@ -447,6 +447,7 @@ int pstore_register(struct pstore_info *psi)
if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) {
pstore_register_console();
pstore_register_ftrace();
+ pstore_register_pmsg();
}
if (pstore_update_ms >= 0) {
diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c
new file mode 100644
index 000000000000..feb5dd2948b4
--- /dev/null
+++ b/fs/pstore/pmsg.c
@@ -0,0 +1,114 @@
+/*
+ * Copyright 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include "internal.h"
+
+static DEFINE_MUTEX(pmsg_lock);
+#define PMSG_MAX_BOUNCE_BUFFER_SIZE (2*PAGE_SIZE)
+
+static ssize_t write_pmsg(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ size_t i, buffer_size;
+ char *buffer;
+
+ if (!count)
+ return 0;
+
+ if (!access_ok(VERIFY_READ, buf, count))
+ return -EFAULT;
+
+ buffer_size = count;
+ if (buffer_size > PMSG_MAX_BOUNCE_BUFFER_SIZE)
+ buffer_size = PMSG_MAX_BOUNCE_BUFFER_SIZE;
+ buffer = vmalloc(buffer_size);
+
+ mutex_lock(&pmsg_lock);
+ for (i = 0; i < count; ) {
+ size_t c = min(count - i, buffer_size);
+ u64 id;
+ long ret;
+
+ ret = __copy_from_user(buffer, buf + i, c);
+ if (unlikely(ret != 0)) {
+ mutex_unlock(&pmsg_lock);
+ vfree(buffer);
+ return -EFAULT;
+ }
+ psinfo->write_buf(PSTORE_TYPE_PMSG, 0, &id, 0, buffer, 0, c,
+ psinfo);
+
+ i += c;
+ }
+
+ mutex_unlock(&pmsg_lock);
+ vfree(buffer);
+ return count;
+}
+
+static const struct file_operations pmsg_fops = {
+ .owner = THIS_MODULE,
+ .llseek = noop_llseek,
+ .write = write_pmsg,
+};
+
+static struct class *pmsg_class;
+static int pmsg_major;
+#define PMSG_NAME "pmsg"
+#undef pr_fmt
+#define pr_fmt(fmt) PMSG_NAME ": " fmt
+
+static char *pmsg_devnode(struct device *dev, umode_t *mode)
+{
+ if (mode)
+ *mode = 0220;
+ return NULL;
+}
+
+void pstore_register_pmsg(void)
+{
+ struct device *pmsg_device;
+
+ pmsg_major = register_chrdev(0, PMSG_NAME, &pmsg_fops);
+ if (pmsg_major < 0) {
+ pr_err("register_chrdev failed\n");
+ goto err;
+ }
+
+ pmsg_class = class_create(THIS_MODULE, PMSG_NAME);
+ if (IS_ERR(pmsg_class)) {
+ pr_err("device class file already in use\n");
+ goto err_class;
+ }
+ pmsg_class->devnode = pmsg_devnode;
+
+ pmsg_device = device_create(pmsg_class, NULL, MKDEV(pmsg_major, 0),
+ NULL, "%s%d", PMSG_NAME, 0);
+ if (IS_ERR(pmsg_device)) {
+ pr_err("failed to create device\n");
+ goto err_device;
+ }
+ return;
+
+err_device:
+ class_destroy(pmsg_class);
+err_class:
+ unregister_chrdev(pmsg_major, PMSG_NAME);
+err:
+ return;
+}
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 8613e5b35c22..39d1373128e9 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -51,6 +51,10 @@ static ulong ramoops_ftrace_size = MIN_MEM_SIZE;
module_param_named(ftrace_size, ramoops_ftrace_size, ulong, 0400);
MODULE_PARM_DESC(ftrace_size, "size of ftrace log");
+static ulong ramoops_pmsg_size = MIN_MEM_SIZE;
+module_param_named(pmsg_size, ramoops_pmsg_size, ulong, 0400);
+MODULE_PARM_DESC(pmsg_size, "size of user space message log");
+
static ulong mem_address;
module_param(mem_address, ulong, 0400);
MODULE_PARM_DESC(mem_address,
@@ -82,12 +86,14 @@ struct ramoops_context {
struct persistent_ram_zone **przs;
struct persistent_ram_zone *cprz;
struct persistent_ram_zone *fprz;
+ struct persistent_ram_zone *mprz;
phys_addr_t phys_addr;
unsigned long size;
unsigned int memtype;
size_t record_size;
size_t console_size;
size_t ftrace_size;
+ size_t pmsg_size;
int dump_oops;
struct persistent_ram_ecc_info ecc_info;
unsigned int max_dump_cnt;
@@ -96,6 +102,7 @@ struct ramoops_context {
unsigned int dump_read_cnt;
unsigned int console_read_cnt;
unsigned int ftrace_read_cnt;
+ unsigned int pmsg_read_cnt;
struct pstore_info pstore;
};
@@ -109,6 +116,7 @@ static int ramoops_pstore_open(struct pstore_info *psi)
cxt->dump_read_cnt = 0;
cxt->console_read_cnt = 0;
cxt->ftrace_read_cnt = 0;
+ cxt->pmsg_read_cnt = 0;
return 0;
}
@@ -164,6 +172,12 @@ static int ramoops_read_kmsg_hdr(char *buffer, struct timespec *time,
return header_length;
}
+static bool prz_ok(struct persistent_ram_zone *prz)
+{
+ return !!prz && !!(persistent_ram_old_size(prz) +
+ persistent_ram_ecc_string(prz, NULL, 0));
+}
+
static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
int *count, struct timespec *time,
char **buf, bool *compressed,
@@ -178,13 +192,16 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt,
cxt->max_dump_cnt, id, type,
PSTORE_TYPE_DMESG, 1);
- if (!prz)
+ if (!prz_ok(prz))
prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt,
1, id, type, PSTORE_TYPE_CONSOLE, 0);
- if (!prz)
+ if (!prz_ok(prz))
prz = ramoops_get_next_prz(&cxt->fprz, &cxt->ftrace_read_cnt,
1, id, type, PSTORE_TYPE_FTRACE, 0);
- if (!prz)
+ if (!prz_ok(prz))
+ prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt,
+ 1, id, type, PSTORE_TYPE_PMSG, 0);
+ if (!prz_ok(prz))
return 0;
if (!persistent_ram_old(prz))
@@ -252,6 +269,11 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
return -ENOMEM;
persistent_ram_write(cxt->fprz, buf, size);
return 0;
+ } else if (type == PSTORE_TYPE_PMSG) {
+ if (!cxt->mprz)
+ return -ENOMEM;
+ persistent_ram_write(cxt->mprz, buf, size);
+ return 0;
}
if (type != PSTORE_TYPE_DMESG)
@@ -309,6 +331,9 @@ static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
case PSTORE_TYPE_FTRACE:
prz = cxt->fprz;
break;
+ case PSTORE_TYPE_PMSG:
+ prz = cxt->mprz;
+ break;
default:
return -EINVAL;
}
@@ -435,7 +460,7 @@ static int ramoops_probe(struct platform_device *pdev)
goto fail_out;
if (!pdata->mem_size || (!pdata->record_size && !pdata->console_size &&
- !pdata->ftrace_size)) {
+ !pdata->ftrace_size && !pdata->pmsg_size)) {
pr_err("The memory size and the record/console size must be "
"non-zero\n");
goto fail_out;
@@ -447,6 +472,8 @@ static int ramoops_probe(struct platform_device *pdev)
pdata->console_size = rounddown_pow_of_two(pdata->console_size);
if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size))
pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size);
+ if (pdata->pmsg_size && !is_power_of_2(pdata->pmsg_size))
+ pdata->pmsg_size = rounddown_pow_of_two(pdata->pmsg_size);
cxt->size = pdata->mem_size;
cxt->phys_addr = pdata->mem_address;
@@ -454,12 +481,14 @@ static int ramoops_probe(struct platform_device *pdev)
cxt->record_size = pdata->record_size;
cxt->console_size = pdata->console_size;
cxt->ftrace_size = pdata->ftrace_size;
+ cxt->pmsg_size = pdata->pmsg_size;
cxt->dump_oops = pdata->dump_oops;
cxt->ecc_info = pdata->ecc_info;
paddr = cxt->phys_addr;
- dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size;
+ dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size
+ - cxt->pmsg_size;
err = ramoops_init_przs(dev, cxt, &paddr, dump_mem_sz);
if (err)
goto fail_out;
@@ -474,13 +503,9 @@ static int ramoops_probe(struct platform_device *pdev)
if (err)
goto fail_init_fprz;
- if (!cxt->przs && !cxt->cprz && !cxt->fprz) {
- pr_err("memory size too small, minimum is %zu\n",
- cxt->console_size + cxt->record_size +
- cxt->ftrace_size);
- err = -EINVAL;
- goto fail_cnt;
- }
+ err = ramoops_init_prz(dev, cxt, &cxt->mprz, &paddr, cxt->pmsg_size, 0);
+ if (err)
+ goto fail_init_mprz;
cxt->pstore.data = cxt;
/*
@@ -525,7 +550,8 @@ fail_buf:
kfree(cxt->pstore.buf);
fail_clear:
cxt->pstore.bufsize = 0;
-fail_cnt:
+ kfree(cxt->mprz);
+fail_init_mprz:
kfree(cxt->fprz);
fail_init_fprz:
kfree(cxt->cprz);
@@ -583,6 +609,7 @@ static void ramoops_register_dummy(void)
dummy_data->record_size = record_size;
dummy_data->console_size = ramoops_console_size;
dummy_data->ftrace_size = ramoops_ftrace_size;
+ dummy_data->pmsg_size = ramoops_pmsg_size;
dummy_data->dump_oops = dump_oops;
/*
* For backwards compatibility ramoops.ecc=1 means 16 bytes ECC
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index c51df1dd237e..4a09975aac90 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -5,6 +5,7 @@
config QUOTA
bool "Quota support"
select QUOTACTL
+ select SRCU
help
If you say Y here, you will be able to set per user limits for disk
usage (also called disk quotas). Currently, it works for the
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 8f0acef3d184..0ccd4ba3a246 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1248,7 +1248,7 @@ static int ignore_hardlimit(struct dquot *dquot)
return capable(CAP_SYS_RESOURCE) &&
(info->dqi_format->qf_fmt_id != QFMT_VFS_OLD ||
- !(info->dqi_flags & V1_DQF_RSQUASH));
+ !(info->dqi_flags & DQF_ROOT_SQUASH));
}
/* needs dq_data_lock */
@@ -2385,41 +2385,106 @@ out:
}
EXPORT_SYMBOL(dquot_quota_on_mount);
-static inline qsize_t qbtos(qsize_t blocks)
+static int dquot_quota_enable(struct super_block *sb, unsigned int flags)
{
- return blocks << QIF_DQBLKSIZE_BITS;
+ int ret;
+ int type;
+ struct quota_info *dqopt = sb_dqopt(sb);
+
+ if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE))
+ return -ENOSYS;
+ /* Accounting cannot be turned on while fs is mounted */
+ flags &= ~(FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT);
+ if (!flags)
+ return -EINVAL;
+ for (type = 0; type < MAXQUOTAS; type++) {
+ if (!(flags & qtype_enforce_flag(type)))
+ continue;
+ /* Can't enforce without accounting */
+ if (!sb_has_quota_usage_enabled(sb, type))
+ return -EINVAL;
+ ret = dquot_enable(dqopt->files[type], type,
+ dqopt->info[type].dqi_fmt_id,
+ DQUOT_LIMITS_ENABLED);
+ if (ret < 0)
+ goto out_err;
+ }
+ return 0;
+out_err:
+ /* Backout enforcement enablement we already did */
+ for (type--; type >= 0; type--) {
+ if (flags & qtype_enforce_flag(type))
+ dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
+ }
+ /* Error code translation for better compatibility with XFS */
+ if (ret == -EBUSY)
+ ret = -EEXIST;
+ return ret;
}
-static inline qsize_t stoqb(qsize_t space)
+static int dquot_quota_disable(struct super_block *sb, unsigned int flags)
{
- return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS;
+ int ret;
+ int type;
+ struct quota_info *dqopt = sb_dqopt(sb);
+
+ if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE))
+ return -ENOSYS;
+ /*
+ * We don't support turning off accounting via quotactl. In principle
+ * quota infrastructure can do this but filesystems don't expect
+ * userspace to be able to do it.
+ */
+ if (flags &
+ (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT))
+ return -EOPNOTSUPP;
+
+ /* Filter out limits not enabled */
+ for (type = 0; type < MAXQUOTAS; type++)
+ if (!sb_has_quota_limits_enabled(sb, type))
+ flags &= ~qtype_enforce_flag(type);
+ /* Nothing left? */
+ if (!flags)
+ return -EEXIST;
+ for (type = 0; type < MAXQUOTAS; type++) {
+ if (flags & qtype_enforce_flag(type)) {
+ ret = dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
+ if (ret < 0)
+ goto out_err;
+ }
+ }
+ return 0;
+out_err:
+ /* Backout enforcement disabling we already did */
+ for (type--; type >= 0; type--) {
+ if (flags & qtype_enforce_flag(type))
+ dquot_enable(dqopt->files[type], type,
+ dqopt->info[type].dqi_fmt_id,
+ DQUOT_LIMITS_ENABLED);
+ }
+ return ret;
}
/* Generic routine for getting common part of quota structure */
-static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
+static void do_get_dqblk(struct dquot *dquot, struct qc_dqblk *di)
{
struct mem_dqblk *dm = &dquot->dq_dqb;
memset(di, 0, sizeof(*di));
- di->d_version = FS_DQUOT_VERSION;
- di->d_flags = dquot->dq_id.type == USRQUOTA ?
- FS_USER_QUOTA : FS_GROUP_QUOTA;
- di->d_id = from_kqid_munged(current_user_ns(), dquot->dq_id);
-
spin_lock(&dq_data_lock);
- di->d_blk_hardlimit = stoqb(dm->dqb_bhardlimit);
- di->d_blk_softlimit = stoqb(dm->dqb_bsoftlimit);
+ di->d_spc_hardlimit = dm->dqb_bhardlimit;
+ di->d_spc_softlimit = dm->dqb_bsoftlimit;
di->d_ino_hardlimit = dm->dqb_ihardlimit;
di->d_ino_softlimit = dm->dqb_isoftlimit;
- di->d_bcount = dm->dqb_curspace + dm->dqb_rsvspace;
- di->d_icount = dm->dqb_curinodes;
- di->d_btimer = dm->dqb_btime;
- di->d_itimer = dm->dqb_itime;
+ di->d_space = dm->dqb_curspace + dm->dqb_rsvspace;
+ di->d_ino_count = dm->dqb_curinodes;
+ di->d_spc_timer = dm->dqb_btime;
+ di->d_ino_timer = dm->dqb_itime;
spin_unlock(&dq_data_lock);
}
int dquot_get_dqblk(struct super_block *sb, struct kqid qid,
- struct fs_disk_quota *di)
+ struct qc_dqblk *di)
{
struct dquot *dquot;
@@ -2433,70 +2498,70 @@ int dquot_get_dqblk(struct super_block *sb, struct kqid qid,
}
EXPORT_SYMBOL(dquot_get_dqblk);
-#define VFS_FS_DQ_MASK \
- (FS_DQ_BCOUNT | FS_DQ_BSOFT | FS_DQ_BHARD | \
- FS_DQ_ICOUNT | FS_DQ_ISOFT | FS_DQ_IHARD | \
- FS_DQ_BTIMER | FS_DQ_ITIMER)
+#define VFS_QC_MASK \
+ (QC_SPACE | QC_SPC_SOFT | QC_SPC_HARD | \
+ QC_INO_COUNT | QC_INO_SOFT | QC_INO_HARD | \
+ QC_SPC_TIMER | QC_INO_TIMER)
/* Generic routine for setting common part of quota structure */
-static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
+static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di)
{
struct mem_dqblk *dm = &dquot->dq_dqb;
int check_blim = 0, check_ilim = 0;
struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];
- if (di->d_fieldmask & ~VFS_FS_DQ_MASK)
+ if (di->d_fieldmask & ~VFS_QC_MASK)
return -EINVAL;
- if (((di->d_fieldmask & FS_DQ_BSOFT) &&
- (di->d_blk_softlimit > dqi->dqi_maxblimit)) ||
- ((di->d_fieldmask & FS_DQ_BHARD) &&
- (di->d_blk_hardlimit > dqi->dqi_maxblimit)) ||
- ((di->d_fieldmask & FS_DQ_ISOFT) &&
- (di->d_ino_softlimit > dqi->dqi_maxilimit)) ||
- ((di->d_fieldmask & FS_DQ_IHARD) &&
- (di->d_ino_hardlimit > dqi->dqi_maxilimit)))
+ if (((di->d_fieldmask & QC_SPC_SOFT) &&
+ di->d_spc_softlimit > dqi->dqi_max_spc_limit) ||
+ ((di->d_fieldmask & QC_SPC_HARD) &&
+ di->d_spc_hardlimit > dqi->dqi_max_spc_limit) ||
+ ((di->d_fieldmask & QC_INO_SOFT) &&
+ (di->d_ino_softlimit > dqi->dqi_max_ino_limit)) ||
+ ((di->d_fieldmask & QC_INO_HARD) &&
+ (di->d_ino_hardlimit > dqi->dqi_max_ino_limit)))
return -ERANGE;
spin_lock(&dq_data_lock);
- if (di->d_fieldmask & FS_DQ_BCOUNT) {
- dm->dqb_curspace = di->d_bcount - dm->dqb_rsvspace;
+ if (di->d_fieldmask & QC_SPACE) {
+ dm->dqb_curspace = di->d_space - dm->dqb_rsvspace;
check_blim = 1;
set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
}
- if (di->d_fieldmask & FS_DQ_BSOFT)
- dm->dqb_bsoftlimit = qbtos(di->d_blk_softlimit);
- if (di->d_fieldmask & FS_DQ_BHARD)
- dm->dqb_bhardlimit = qbtos(di->d_blk_hardlimit);
- if (di->d_fieldmask & (FS_DQ_BSOFT | FS_DQ_BHARD)) {
+ if (di->d_fieldmask & QC_SPC_SOFT)
+ dm->dqb_bsoftlimit = di->d_spc_softlimit;
+ if (di->d_fieldmask & QC_SPC_HARD)
+ dm->dqb_bhardlimit = di->d_spc_hardlimit;
+ if (di->d_fieldmask & (QC_SPC_SOFT | QC_SPC_HARD)) {
check_blim = 1;
set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
}
- if (di->d_fieldmask & FS_DQ_ICOUNT) {
- dm->dqb_curinodes = di->d_icount;
+ if (di->d_fieldmask & QC_INO_COUNT) {
+ dm->dqb_curinodes = di->d_ino_count;
check_ilim = 1;
set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
}
- if (di->d_fieldmask & FS_DQ_ISOFT)
+ if (di->d_fieldmask & QC_INO_SOFT)
dm->dqb_isoftlimit = di->d_ino_softlimit;
- if (di->d_fieldmask & FS_DQ_IHARD)
+ if (di->d_fieldmask & QC_INO_HARD)
dm->dqb_ihardlimit = di->d_ino_hardlimit;
- if (di->d_fieldmask & (FS_DQ_ISOFT | FS_DQ_IHARD)) {
+ if (di->d_fieldmask & (QC_INO_SOFT | QC_INO_HARD)) {
check_ilim = 1;
set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
}
- if (di->d_fieldmask & FS_DQ_BTIMER) {
- dm->dqb_btime = di->d_btimer;
+ if (di->d_fieldmask & QC_SPC_TIMER) {
+ dm->dqb_btime = di->d_spc_timer;
check_blim = 1;
set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
}
- if (di->d_fieldmask & FS_DQ_ITIMER) {
- dm->dqb_itime = di->d_itimer;
+ if (di->d_fieldmask & QC_INO_TIMER) {
+ dm->dqb_itime = di->d_ino_timer;
check_ilim = 1;
set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
}
@@ -2506,7 +2571,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
dm->dqb_curspace < dm->dqb_bsoftlimit) {
dm->dqb_btime = 0;
clear_bit(DQ_BLKS_B, &dquot->dq_flags);
- } else if (!(di->d_fieldmask & FS_DQ_BTIMER))
+ } else if (!(di->d_fieldmask & QC_SPC_TIMER))
/* Set grace only if user hasn't provided his own... */
dm->dqb_btime = get_seconds() + dqi->dqi_bgrace;
}
@@ -2515,7 +2580,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
dm->dqb_curinodes < dm->dqb_isoftlimit) {
dm->dqb_itime = 0;
clear_bit(DQ_INODES_B, &dquot->dq_flags);
- } else if (!(di->d_fieldmask & FS_DQ_ITIMER))
+ } else if (!(di->d_fieldmask & QC_INO_TIMER))
/* Set grace only if user hasn't provided his own... */
dm->dqb_itime = get_seconds() + dqi->dqi_igrace;
}
@@ -2531,7 +2596,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
}
int dquot_set_dqblk(struct super_block *sb, struct kqid qid,
- struct fs_disk_quota *di)
+ struct qc_dqblk *di)
{
struct dquot *dquot;
int rc;
@@ -2582,6 +2647,14 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
goto out;
}
mi = sb_dqopt(sb)->info + type;
+ if (ii->dqi_valid & IIF_FLAGS) {
+ if (ii->dqi_flags & ~DQF_SETINFO_MASK ||
+ (ii->dqi_flags & DQF_ROOT_SQUASH &&
+ mi->dqi_format->qf_fmt_id != QFMT_VFS_OLD)) {
+ err = -EINVAL;
+ goto out;
+ }
+ }
spin_lock(&dq_data_lock);
if (ii->dqi_valid & IIF_BGRACE)
mi->dqi_bgrace = ii->dqi_bgrace;
@@ -2611,6 +2684,17 @@ const struct quotactl_ops dquot_quotactl_ops = {
};
EXPORT_SYMBOL(dquot_quotactl_ops);
+const struct quotactl_ops dquot_quotactl_sysfile_ops = {
+ .quota_enable = dquot_quota_enable,
+ .quota_disable = dquot_quota_disable,
+ .quota_sync = dquot_quota_sync,
+ .get_info = dquot_get_dqinfo,
+ .set_info = dquot_set_dqinfo,
+ .get_dqblk = dquot_get_dqblk,
+ .set_dqblk = dquot_set_dqblk
+};
+EXPORT_SYMBOL(dquot_quotactl_sysfile_ops);
+
static int do_proc_dqstats(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 2aa4151f99d2..d14a799c7785 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -66,18 +66,40 @@ static int quota_sync_all(int type)
return ret;
}
+unsigned int qtype_enforce_flag(int type)
+{
+ switch (type) {
+ case USRQUOTA:
+ return FS_QUOTA_UDQ_ENFD;
+ case GRPQUOTA:
+ return FS_QUOTA_GDQ_ENFD;
+ case PRJQUOTA:
+ return FS_QUOTA_PDQ_ENFD;
+ }
+ return 0;
+}
+
static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
struct path *path)
{
- if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_on_meta)
+ if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_enable)
return -ENOSYS;
- if (sb->s_qcop->quota_on_meta)
- return sb->s_qcop->quota_on_meta(sb, type, id);
+ if (sb->s_qcop->quota_enable)
+ return sb->s_qcop->quota_enable(sb, qtype_enforce_flag(type));
if (IS_ERR(path))
return PTR_ERR(path);
return sb->s_qcop->quota_on(sb, type, id, path);
}
+static int quota_quotaoff(struct super_block *sb, int type)
+{
+ if (!sb->s_qcop->quota_off && !sb->s_qcop->quota_disable)
+ return -ENOSYS;
+ if (sb->s_qcop->quota_disable)
+ return sb->s_qcop->quota_disable(sb, qtype_enforce_flag(type));
+ return sb->s_qcop->quota_off(sb, type);
+}
+
static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
{
__u32 fmt;
@@ -118,17 +140,27 @@ static int quota_setinfo(struct super_block *sb, int type, void __user *addr)
return sb->s_qcop->set_info(sb, type, &info);
}
-static void copy_to_if_dqblk(struct if_dqblk *dst, struct fs_disk_quota *src)
+static inline qsize_t qbtos(qsize_t blocks)
+{
+ return blocks << QIF_DQBLKSIZE_BITS;
+}
+
+static inline qsize_t stoqb(qsize_t space)
+{
+ return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS;
+}
+
+static void copy_to_if_dqblk(struct if_dqblk *dst, struct qc_dqblk *src)
{
memset(dst, 0, sizeof(*dst));
- dst->dqb_bhardlimit = src->d_blk_hardlimit;
- dst->dqb_bsoftlimit = src->d_blk_softlimit;
- dst->dqb_curspace = src->d_bcount;
+ dst->dqb_bhardlimit = stoqb(src->d_spc_hardlimit);
+ dst->dqb_bsoftlimit = stoqb(src->d_spc_softlimit);
+ dst->dqb_curspace = src->d_space;
dst->dqb_ihardlimit = src->d_ino_hardlimit;
dst->dqb_isoftlimit = src->d_ino_softlimit;
- dst->dqb_curinodes = src->d_icount;
- dst->dqb_btime = src->d_btimer;
- dst->dqb_itime = src->d_itimer;
+ dst->dqb_curinodes = src->d_ino_count;
+ dst->dqb_btime = src->d_spc_timer;
+ dst->dqb_itime = src->d_ino_timer;
dst->dqb_valid = QIF_ALL;
}
@@ -136,7 +168,7 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id,
void __user *addr)
{
struct kqid qid;
- struct fs_disk_quota fdq;
+ struct qc_dqblk fdq;
struct if_dqblk idq;
int ret;
@@ -154,36 +186,36 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id,
return 0;
}
-static void copy_from_if_dqblk(struct fs_disk_quota *dst, struct if_dqblk *src)
+static void copy_from_if_dqblk(struct qc_dqblk *dst, struct if_dqblk *src)
{
- dst->d_blk_hardlimit = src->dqb_bhardlimit;
- dst->d_blk_softlimit = src->dqb_bsoftlimit;
- dst->d_bcount = src->dqb_curspace;
+ dst->d_spc_hardlimit = qbtos(src->dqb_bhardlimit);
+ dst->d_spc_softlimit = qbtos(src->dqb_bsoftlimit);
+ dst->d_space = src->dqb_curspace;
dst->d_ino_hardlimit = src->dqb_ihardlimit;
dst->d_ino_softlimit = src->dqb_isoftlimit;
- dst->d_icount = src->dqb_curinodes;
- dst->d_btimer = src->dqb_btime;
- dst->d_itimer = src->dqb_itime;
+ dst->d_ino_count = src->dqb_curinodes;
+ dst->d_spc_timer = src->dqb_btime;
+ dst->d_ino_timer = src->dqb_itime;
dst->d_fieldmask = 0;
if (src->dqb_valid & QIF_BLIMITS)
- dst->d_fieldmask |= FS_DQ_BSOFT | FS_DQ_BHARD;
+ dst->d_fieldmask |= QC_SPC_SOFT | QC_SPC_HARD;
if (src->dqb_valid & QIF_SPACE)
- dst->d_fieldmask |= FS_DQ_BCOUNT;
+ dst->d_fieldmask |= QC_SPACE;
if (src->dqb_valid & QIF_ILIMITS)
- dst->d_fieldmask |= FS_DQ_ISOFT | FS_DQ_IHARD;
+ dst->d_fieldmask |= QC_INO_SOFT | QC_INO_HARD;
if (src->dqb_valid & QIF_INODES)
- dst->d_fieldmask |= FS_DQ_ICOUNT;
+ dst->d_fieldmask |= QC_INO_COUNT;
if (src->dqb_valid & QIF_BTIME)
- dst->d_fieldmask |= FS_DQ_BTIMER;
+ dst->d_fieldmask |= QC_SPC_TIMER;
if (src->dqb_valid & QIF_ITIME)
- dst->d_fieldmask |= FS_DQ_ITIMER;
+ dst->d_fieldmask |= QC_INO_TIMER;
}
static int quota_setquota(struct super_block *sb, int type, qid_t id,
void __user *addr)
{
- struct fs_disk_quota fdq;
+ struct qc_dqblk fdq;
struct if_dqblk idq;
struct kqid qid;
@@ -198,15 +230,26 @@ static int quota_setquota(struct super_block *sb, int type, qid_t id,
return sb->s_qcop->set_dqblk(sb, qid, &fdq);
}
-static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr)
+static int quota_enable(struct super_block *sb, void __user *addr)
+{
+ __u32 flags;
+
+ if (copy_from_user(&flags, addr, sizeof(flags)))
+ return -EFAULT;
+ if (!sb->s_qcop->quota_enable)
+ return -ENOSYS;
+ return sb->s_qcop->quota_enable(sb, flags);
+}
+
+static int quota_disable(struct super_block *sb, void __user *addr)
{
__u32 flags;
if (copy_from_user(&flags, addr, sizeof(flags)))
return -EFAULT;
- if (!sb->s_qcop->set_xstate)
+ if (!sb->s_qcop->quota_disable)
return -ENOSYS;
- return sb->s_qcop->set_xstate(sb, flags, cmd);
+ return sb->s_qcop->quota_disable(sb, flags);
}
static int quota_getxstate(struct super_block *sb, void __user *addr)
@@ -247,10 +290,78 @@ static int quota_getxstatev(struct super_block *sb, void __user *addr)
return ret;
}
+/*
+ * XFS defines BBTOB and BTOBB macros inside fs/xfs/ and we cannot move them
+ * out of there as xfsprogs rely on definitions being in that header file. So
+ * just define same functions here for quota purposes.
+ */
+#define XFS_BB_SHIFT 9
+
+static inline u64 quota_bbtob(u64 blocks)
+{
+ return blocks << XFS_BB_SHIFT;
+}
+
+static inline u64 quota_btobb(u64 bytes)
+{
+ return (bytes + (1 << XFS_BB_SHIFT) - 1) >> XFS_BB_SHIFT;
+}
+
+static void copy_from_xfs_dqblk(struct qc_dqblk *dst, struct fs_disk_quota *src)
+{
+ dst->d_spc_hardlimit = quota_bbtob(src->d_blk_hardlimit);
+ dst->d_spc_softlimit = quota_bbtob(src->d_blk_softlimit);
+ dst->d_ino_hardlimit = src->d_ino_hardlimit;
+ dst->d_ino_softlimit = src->d_ino_softlimit;
+ dst->d_space = quota_bbtob(src->d_bcount);
+ dst->d_ino_count = src->d_icount;
+ dst->d_ino_timer = src->d_itimer;
+ dst->d_spc_timer = src->d_btimer;
+ dst->d_ino_warns = src->d_iwarns;
+ dst->d_spc_warns = src->d_bwarns;
+ dst->d_rt_spc_hardlimit = quota_bbtob(src->d_rtb_hardlimit);
+ dst->d_rt_spc_softlimit = quota_bbtob(src->d_rtb_softlimit);
+ dst->d_rt_space = quota_bbtob(src->d_rtbcount);
+ dst->d_rt_spc_timer = src->d_rtbtimer;
+ dst->d_rt_spc_warns = src->d_rtbwarns;
+ dst->d_fieldmask = 0;
+ if (src->d_fieldmask & FS_DQ_ISOFT)
+ dst->d_fieldmask |= QC_INO_SOFT;
+ if (src->d_fieldmask & FS_DQ_IHARD)
+ dst->d_fieldmask |= QC_INO_HARD;
+ if (src->d_fieldmask & FS_DQ_BSOFT)
+ dst->d_fieldmask |= QC_SPC_SOFT;
+ if (src->d_fieldmask & FS_DQ_BHARD)
+ dst->d_fieldmask |= QC_SPC_HARD;
+ if (src->d_fieldmask & FS_DQ_RTBSOFT)
+ dst->d_fieldmask |= QC_RT_SPC_SOFT;
+ if (src->d_fieldmask & FS_DQ_RTBHARD)
+ dst->d_fieldmask |= QC_RT_SPC_HARD;
+ if (src->d_fieldmask & FS_DQ_BTIMER)
+ dst->d_fieldmask |= QC_SPC_TIMER;
+ if (src->d_fieldmask & FS_DQ_ITIMER)
+ dst->d_fieldmask |= QC_INO_TIMER;
+ if (src->d_fieldmask & FS_DQ_RTBTIMER)
+ dst->d_fieldmask |= QC_RT_SPC_TIMER;
+ if (src->d_fieldmask & FS_DQ_BWARNS)
+ dst->d_fieldmask |= QC_SPC_WARNS;
+ if (src->d_fieldmask & FS_DQ_IWARNS)
+ dst->d_fieldmask |= QC_INO_WARNS;
+ if (src->d_fieldmask & FS_DQ_RTBWARNS)
+ dst->d_fieldmask |= QC_RT_SPC_WARNS;
+ if (src->d_fieldmask & FS_DQ_BCOUNT)
+ dst->d_fieldmask |= QC_SPACE;
+ if (src->d_fieldmask & FS_DQ_ICOUNT)
+ dst->d_fieldmask |= QC_INO_COUNT;
+ if (src->d_fieldmask & FS_DQ_RTBCOUNT)
+ dst->d_fieldmask |= QC_RT_SPACE;
+}
+
static int quota_setxquota(struct super_block *sb, int type, qid_t id,
void __user *addr)
{
struct fs_disk_quota fdq;
+ struct qc_dqblk qdq;
struct kqid qid;
if (copy_from_user(&fdq, addr, sizeof(fdq)))
@@ -260,13 +371,44 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id,
qid = make_kqid(current_user_ns(), type, id);
if (!qid_valid(qid))
return -EINVAL;
- return sb->s_qcop->set_dqblk(sb, qid, &fdq);
+ copy_from_xfs_dqblk(&qdq, &fdq);
+ return sb->s_qcop->set_dqblk(sb, qid, &qdq);
+}
+
+static void copy_to_xfs_dqblk(struct fs_disk_quota *dst, struct qc_dqblk *src,
+ int type, qid_t id)
+{
+ memset(dst, 0, sizeof(*dst));
+ dst->d_version = FS_DQUOT_VERSION;
+ dst->d_id = id;
+ if (type == USRQUOTA)
+ dst->d_flags = FS_USER_QUOTA;
+ else if (type == PRJQUOTA)
+ dst->d_flags = FS_PROJ_QUOTA;
+ else
+ dst->d_flags = FS_GROUP_QUOTA;
+ dst->d_blk_hardlimit = quota_btobb(src->d_spc_hardlimit);
+ dst->d_blk_softlimit = quota_btobb(src->d_spc_softlimit);
+ dst->d_ino_hardlimit = src->d_ino_hardlimit;
+ dst->d_ino_softlimit = src->d_ino_softlimit;
+ dst->d_bcount = quota_btobb(src->d_space);
+ dst->d_icount = src->d_ino_count;
+ dst->d_itimer = src->d_ino_timer;
+ dst->d_btimer = src->d_spc_timer;
+ dst->d_iwarns = src->d_ino_warns;
+ dst->d_bwarns = src->d_spc_warns;
+ dst->d_rtb_hardlimit = quota_btobb(src->d_rt_spc_hardlimit);
+ dst->d_rtb_softlimit = quota_btobb(src->d_rt_spc_softlimit);
+ dst->d_rtbcount = quota_btobb(src->d_rt_space);
+ dst->d_rtbtimer = src->d_rt_spc_timer;
+ dst->d_rtbwarns = src->d_rt_spc_warns;
}
static int quota_getxquota(struct super_block *sb, int type, qid_t id,
void __user *addr)
{
struct fs_disk_quota fdq;
+ struct qc_dqblk qdq;
struct kqid qid;
int ret;
@@ -275,8 +417,11 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
qid = make_kqid(current_user_ns(), type, id);
if (!qid_valid(qid))
return -EINVAL;
- ret = sb->s_qcop->get_dqblk(sb, qid, &fdq);
- if (!ret && copy_to_user(addr, &fdq, sizeof(fdq)))
+ ret = sb->s_qcop->get_dqblk(sb, qid, &qdq);
+ if (ret)
+ return ret;
+ copy_to_xfs_dqblk(&fdq, &qdq, type, id);
+ if (copy_to_user(addr, &fdq, sizeof(fdq)))
return -EFAULT;
return ret;
}
@@ -317,9 +462,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
case Q_QUOTAON:
return quota_quotaon(sb, type, cmd, id, path);
case Q_QUOTAOFF:
- if (!sb->s_qcop->quota_off)
- return -ENOSYS;
- return sb->s_qcop->quota_off(sb, type);
+ return quota_quotaoff(sb, type);
case Q_GETFMT:
return quota_getfmt(sb, type, addr);
case Q_GETINFO:
@@ -335,8 +478,9 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
return -ENOSYS;
return sb->s_qcop->quota_sync(sb, type);
case Q_XQUOTAON:
+ return quota_enable(sb, addr);
case Q_XQUOTAOFF:
- return quota_setxstate(sb, cmd, addr);
+ return quota_disable(sb, addr);
case Q_XQUOTARM:
return quota_rmxquota(sb, addr);
case Q_XGETQSTAT:
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index 469c6848b322..8fe79beced5c 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -169,8 +169,8 @@ static int v1_read_file_info(struct super_block *sb, int type)
}
ret = 0;
/* limits are stored as unsigned 32-bit data */
- dqopt->info[type].dqi_maxblimit = 0xffffffff;
- dqopt->info[type].dqi_maxilimit = 0xffffffff;
+ dqopt->info[type].dqi_max_spc_limit = 0xffffffffULL << QUOTABLOCK_BITS;
+ dqopt->info[type].dqi_max_ino_limit = 0xffffffff;
dqopt->info[type].dqi_igrace =
dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME;
dqopt->info[type].dqi_bgrace =
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index 02751ec695c5..9cb10d7197f7 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -117,16 +117,17 @@ static int v2_read_file_info(struct super_block *sb, int type)
qinfo = info->dqi_priv;
if (version == 0) {
/* limits are stored as unsigned 32-bit data */
- info->dqi_maxblimit = 0xffffffff;
- info->dqi_maxilimit = 0xffffffff;
+ info->dqi_max_spc_limit = 0xffffffffULL << QUOTABLOCK_BITS;
+ info->dqi_max_ino_limit = 0xffffffff;
} else {
- /* used space is stored as unsigned 64-bit value */
- info->dqi_maxblimit = 0xffffffffffffffffULL; /* 2^64-1 */
- info->dqi_maxilimit = 0xffffffffffffffffULL;
+ /* used space is stored as unsigned 64-bit value in bytes */
+ info->dqi_max_spc_limit = 0xffffffffffffffffULL; /* 2^64-1 */
+ info->dqi_max_ino_limit = 0xffffffffffffffffULL;
}
info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
- info->dqi_flags = le32_to_cpu(dinfo.dqi_flags);
+ /* No flags currently supported */
+ info->dqi_flags = 0;
qinfo->dqi_sb = sb;
qinfo->dqi_type = type;
qinfo->dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
@@ -157,7 +158,8 @@ static int v2_write_file_info(struct super_block *sb, int type)
info->dqi_flags &= ~DQF_INFO_DIRTY;
dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace);
dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
- dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
+ /* No flags currently supported */
+ dinfo.dqi_flags = cpu_to_le32(0);
spin_unlock(&dq_data_lock);
dinfo.dqi_blocks = cpu_to_le32(qinfo->dqi_blocks);
dinfo.dqi_free_blk = cpu_to_le32(qinfo->dqi_free_blk);
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index bbafbde3471a..f6ab41b39612 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -34,7 +34,14 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
unsigned long flags);
static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma);
+static unsigned ramfs_mmap_capabilities(struct file *file)
+{
+ return NOMMU_MAP_DIRECT | NOMMU_MAP_COPY | NOMMU_MAP_READ |
+ NOMMU_MAP_WRITE | NOMMU_MAP_EXEC;
+}
+
const struct file_operations ramfs_file_operations = {
+ .mmap_capabilities = ramfs_mmap_capabilities,
.mmap = ramfs_nommu_mmap,
.get_unmapped_area = ramfs_nommu_get_unmapped_area,
.read = new_sync_read,
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index d365b1c4eb3c..889d558b4e05 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -50,14 +50,6 @@ static const struct address_space_operations ramfs_aops = {
.set_page_dirty = __set_page_dirty_no_writeback,
};
-static struct backing_dev_info ramfs_backing_dev_info = {
- .name = "ramfs",
- .ra_pages = 0, /* No readahead */
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK |
- BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY |
- BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP,
-};
-
struct inode *ramfs_get_inode(struct super_block *sb,
const struct inode *dir, umode_t mode, dev_t dev)
{
@@ -67,7 +59,6 @@ struct inode *ramfs_get_inode(struct super_block *sb,
inode->i_ino = get_next_ino();
inode_init_owner(inode, dir, mode);
inode->i_mapping->a_ops = &ramfs_aops;
- inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
mapping_set_unevictable(inode->i_mapping);
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -267,19 +258,9 @@ static struct file_system_type ramfs_fs_type = {
int __init init_ramfs_fs(void)
{
static unsigned long once;
- int err;
if (test_and_set_bit(0, &once))
return 0;
-
- err = bdi_init(&ramfs_backing_dev_info);
- if (err)
- return err;
-
- err = register_filesystem(&ramfs_fs_type);
- if (err)
- bdi_destroy(&ramfs_backing_dev_info);
-
- return err;
+ return register_filesystem(&ramfs_fs_type);
}
fs_initcall(init_ramfs_fs);
diff --git a/fs/read_write.c b/fs/read_write.c
index c0805c93b6fa..8e1b68786d66 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -333,6 +333,52 @@ out_putf:
}
#endif
+ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos)
+{
+ struct kiocb kiocb;
+ ssize_t ret;
+
+ if (!file->f_op->read_iter)
+ return -EINVAL;
+
+ init_sync_kiocb(&kiocb, file);
+ kiocb.ki_pos = *ppos;
+ kiocb.ki_nbytes = iov_iter_count(iter);
+
+ iter->type |= READ;
+ ret = file->f_op->read_iter(&kiocb, iter);
+ if (ret == -EIOCBQUEUED)
+ ret = wait_on_sync_kiocb(&kiocb);
+
+ if (ret > 0)
+ *ppos = kiocb.ki_pos;
+ return ret;
+}
+EXPORT_SYMBOL(vfs_iter_read);
+
+ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos)
+{
+ struct kiocb kiocb;
+ ssize_t ret;
+
+ if (!file->f_op->write_iter)
+ return -EINVAL;
+
+ init_sync_kiocb(&kiocb, file);
+ kiocb.ki_pos = *ppos;
+ kiocb.ki_nbytes = iov_iter_count(iter);
+
+ iter->type |= WRITE;
+ ret = file->f_op->write_iter(&kiocb, iter);
+ if (ret == -EIOCBQUEUED)
+ ret = wait_on_sync_kiocb(&kiocb);
+
+ if (ret > 0)
+ *ppos = kiocb.ki_pos;
+ return ret;
+}
+EXPORT_SYMBOL(vfs_iter_write);
+
/*
* rw_verify_area doesn't like huge counts. We limit
* them to something that fits in "int" so that others
@@ -358,7 +404,7 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
return retval;
}
- if (unlikely(inode->i_flock && mandatory_lock(inode))) {
+ if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
retval = locks_mandatory_area(
read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
inode, file, pos, count);
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index a7eec9888f10..e72401e1f995 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2766,7 +2766,7 @@ static int reiserfs_write_begin(struct file *file,
int old_ref = 0;
inode = mapping->host;
- *fsdata = 0;
+ *fsdata = NULL;
if (flags & AOP_FLAG_CONT_EXPAND &&
(pos & (inode->i_sb->s_blocksize - 1)) == 0) {
pos ++;
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
index ea06c7554860..7da9e2153953 100644
--- a/fs/romfs/mmap-nommu.c
+++ b/fs/romfs/mmap-nommu.c
@@ -70,6 +70,15 @@ static int romfs_mmap(struct file *file, struct vm_area_struct *vma)
return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS;
}
+static unsigned romfs_mmap_capabilities(struct file *file)
+{
+ struct mtd_info *mtd = file_inode(file)->i_sb->s_mtd;
+
+ if (!mtd)
+ return NOMMU_MAP_COPY;
+ return mtd_mmap_capabilities(mtd);
+}
+
const struct file_operations romfs_ro_fops = {
.llseek = generic_file_llseek,
.read = new_sync_read,
@@ -77,4 +86,5 @@ const struct file_operations romfs_ro_fops = {
.splice_read = generic_file_splice_read,
.mmap = romfs_mmap,
.get_unmapped_area = romfs_get_unmapped_area,
+ .mmap_capabilities = romfs_mmap_capabilities,
};
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index e98dd88197d5..268733cda397 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -355,9 +355,6 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
case ROMFH_REG:
i->i_fop = &romfs_ro_fops;
i->i_data.a_ops = &romfs_aops;
- if (i->i_sb->s_mtd)
- i->i_data.backing_dev_info =
- i->i_sb->s_mtd->backing_dev_info;
if (nextfh & ROMFH_EXEC)
mode |= S_IXUGO;
break;
diff --git a/fs/select.c b/fs/select.c
index 467bb1cb3ea5..f684c750e08a 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -971,7 +971,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
if (ret == -EINTR) {
struct restart_block *restart_block;
- restart_block = &current_thread_info()->restart_block;
+ restart_block = &current->restart_block;
restart_block->fn = do_restart_poll;
restart_block->poll.ufds = ufds;
restart_block->poll.nfds = nfds;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index dbf3a59c86bb..555f82155be8 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -539,38 +539,6 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc)
return res;
}
-int seq_bitmap(struct seq_file *m, const unsigned long *bits,
- unsigned int nr_bits)
-{
- if (m->count < m->size) {
- int len = bitmap_scnprintf(m->buf + m->count,
- m->size - m->count, bits, nr_bits);
- if (m->count + len < m->size) {
- m->count += len;
- return 0;
- }
- }
- seq_set_overflow(m);
- return -1;
-}
-EXPORT_SYMBOL(seq_bitmap);
-
-int seq_bitmap_list(struct seq_file *m, const unsigned long *bits,
- unsigned int nr_bits)
-{
- if (m->count < m->size) {
- int len = bitmap_scnlistprintf(m->buf + m->count,
- m->size - m->count, bits, nr_bits);
- if (m->count + len < m->size) {
- m->count += len;
- return 0;
- }
- }
- seq_set_overflow(m);
- return -1;
-}
-EXPORT_SYMBOL(seq_bitmap_list);
-
static void *single_start(struct seq_file *p, loff_t *pos)
{
return NULL + (*pos == 0);
diff --git a/fs/splice.c b/fs/splice.c
index 75c6058eabf2..7968da96bebb 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -961,7 +961,6 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
splice_from_pipe_begin(&sd);
while (sd.total_len) {
struct iov_iter from;
- struct kiocb kiocb;
size_t left;
int n, idx;
@@ -1005,29 +1004,15 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
left -= this_len;
}
- /* ... iov_iter */
- from.type = ITER_BVEC | WRITE;
- from.bvec = array;
- from.nr_segs = n;
- from.count = sd.total_len - left;
- from.iov_offset = 0;
-
- /* ... and iocb */
- init_sync_kiocb(&kiocb, out);
- kiocb.ki_pos = sd.pos;
- kiocb.ki_nbytes = sd.total_len - left;
-
- /* now, send it */
- ret = out->f_op->write_iter(&kiocb, &from);
- if (-EIOCBQUEUED == ret)
- ret = wait_on_sync_kiocb(&kiocb);
-
+ iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n,
+ sd.total_len - left);
+ ret = vfs_iter_write(out, &from, &sd.pos);
if (ret <= 0)
break;
sd.num_spliced += ret;
sd.total_len -= ret;
- *ppos = sd.pos = kiocb.ki_pos;
+ *ppos = sd.pos;
/* dismiss the fully eaten buffers, adjust the partial one */
while (ret) {
diff --git a/fs/super.c b/fs/super.c
index eae088f6aaae..65a53efc1cf4 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -36,8 +36,8 @@
#include "internal.h"
-LIST_HEAD(super_blocks);
-DEFINE_SPINLOCK(sb_lock);
+static LIST_HEAD(super_blocks);
+static DEFINE_SPINLOCK(sb_lock);
static char *sb_writers_name[SB_FREEZE_LEVELS] = {
"sb_writers",
@@ -75,10 +75,10 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
return SHRINK_STOP;
if (sb->s_op->nr_cached_objects)
- fs_objects = sb->s_op->nr_cached_objects(sb, sc->nid);
+ fs_objects = sb->s_op->nr_cached_objects(sb, sc);
- inodes = list_lru_count_node(&sb->s_inode_lru, sc->nid);
- dentries = list_lru_count_node(&sb->s_dentry_lru, sc->nid);
+ inodes = list_lru_shrink_count(&sb->s_inode_lru, sc);
+ dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc);
total_objects = dentries + inodes + fs_objects + 1;
if (!total_objects)
total_objects = 1;
@@ -86,19 +86,23 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
/* proportion the scan between the caches */
dentries = mult_frac(sc->nr_to_scan, dentries, total_objects);
inodes = mult_frac(sc->nr_to_scan, inodes, total_objects);
+ fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects);
/*
* prune the dcache first as the icache is pinned by it, then
* prune the icache, followed by the filesystem specific caches
+ *
+ * Ensure that we always scan at least one object - memcg kmem
+ * accounting uses this to fully empty the caches.
*/
- freed = prune_dcache_sb(sb, dentries, sc->nid);
- freed += prune_icache_sb(sb, inodes, sc->nid);
+ sc->nr_to_scan = dentries + 1;
+ freed = prune_dcache_sb(sb, sc);
+ sc->nr_to_scan = inodes + 1;
+ freed += prune_icache_sb(sb, sc);
if (fs_objects) {
- fs_objects = mult_frac(sc->nr_to_scan, fs_objects,
- total_objects);
- freed += sb->s_op->free_cached_objects(sb, fs_objects,
- sc->nid);
+ sc->nr_to_scan = fs_objects + 1;
+ freed += sb->s_op->free_cached_objects(sb, sc);
}
drop_super(sb);
@@ -118,17 +122,14 @@ static unsigned long super_cache_count(struct shrinker *shrink,
* scalability bottleneck. The counts could get updated
* between super_cache_count and super_cache_scan anyway.
* Call to super_cache_count with shrinker_rwsem held
- * ensures the safety of call to list_lru_count_node() and
+ * ensures the safety of call to list_lru_shrink_count() and
* s_op->nr_cached_objects().
*/
if (sb->s_op && sb->s_op->nr_cached_objects)
- total_objects = sb->s_op->nr_cached_objects(sb,
- sc->nid);
+ total_objects = sb->s_op->nr_cached_objects(sb, sc);
- total_objects += list_lru_count_node(&sb->s_dentry_lru,
- sc->nid);
- total_objects += list_lru_count_node(&sb->s_inode_lru,
- sc->nid);
+ total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc);
+ total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc);
total_objects = vfs_pressure_ratio(total_objects);
return total_objects;
@@ -185,15 +186,15 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
}
init_waitqueue_head(&s->s_writers.wait);
init_waitqueue_head(&s->s_writers.wait_unfrozen);
+ s->s_bdi = &noop_backing_dev_info;
s->s_flags = flags;
- s->s_bdi = &default_backing_dev_info;
INIT_HLIST_NODE(&s->s_instances);
INIT_HLIST_BL_HEAD(&s->s_anon);
INIT_LIST_HEAD(&s->s_inodes);
- if (list_lru_init(&s->s_dentry_lru))
+ if (list_lru_init_memcg(&s->s_dentry_lru))
goto fail;
- if (list_lru_init(&s->s_inode_lru))
+ if (list_lru_init_memcg(&s->s_inode_lru))
goto fail;
init_rwsem(&s->s_umount);
@@ -229,7 +230,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
s->s_shrink.scan_objects = super_cache_scan;
s->s_shrink.count_objects = super_cache_count;
s->s_shrink.batch = 1024;
- s->s_shrink.flags = SHRINKER_NUMA_AWARE;
+ s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
return s;
fail:
@@ -284,6 +285,14 @@ void deactivate_locked_super(struct super_block *s)
unregister_shrinker(&s->s_shrink);
fs->kill_sb(s);
+ /*
+ * Since list_lru_destroy() may sleep, we cannot call it from
+ * put_super(), where we hold the sb_lock. Therefore we destroy
+ * the lru lists right now.
+ */
+ list_lru_destroy(&s->s_dentry_lru);
+ list_lru_destroy(&s->s_inode_lru);
+
put_filesystem(fs);
put_super(s);
} else {
@@ -706,9 +715,9 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
if (remount_ro) {
- if (sb->s_pins.first) {
+ if (!hlist_empty(&sb->s_pins)) {
up_write(&sb->s_umount);
- sb_pin_kill(sb);
+ group_pin_kill(&sb->s_pins);
down_write(&sb->s_umount);
if (!sb->s_root)
return 0;
@@ -863,10 +872,7 @@ EXPORT_SYMBOL(free_anon_bdev);
int set_anon_super(struct super_block *s, void *data)
{
- int error = get_anon_bdev(&s->s_dev);
- if (!error)
- s->s_bdi = &noop_backing_dev_info;
- return error;
+ return get_anon_bdev(&s->s_dev);
}
EXPORT_SYMBOL(set_anon_super);
@@ -1111,7 +1117,6 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
sb = root->d_sb;
BUG_ON(!sb);
WARN_ON(!sb->s_bdi);
- WARN_ON(sb->s_bdi == &default_backing_dev_info);
sb->s_flags |= MS_BORN;
error = security_sb_kern_mount(sb, flags, secdata);
diff --git a/fs/sync.c b/fs/sync.c
index 01d9f18a70b5..fbc98ee62044 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -177,8 +177,16 @@ SYSCALL_DEFINE1(syncfs, int, fd)
*/
int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
{
+ struct inode *inode = file->f_mapping->host;
+
if (!file->f_op->fsync)
return -EINVAL;
+ if (!datasync && (inode->i_state & I_DIRTY_TIME)) {
+ spin_lock(&inode->i_lock);
+ inode->i_state &= ~I_DIRTY_TIME;
+ spin_unlock(&inode->i_lock);
+ mark_inode_dirty_sync(inode);
+ }
return file->f_op->fsync(file, start, end, datasync);
}
EXPORT_SYMBOL(vfs_fsync_range);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index dfe928a9540f..7c2867b44141 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -295,7 +295,7 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent,
key = attr->key ?: (struct lock_class_key *)&attr->skey;
#endif
kn = __kernfs_create_file(parent, attr->name, mode & 0777, size, ops,
- (void *)attr, ns, true, key);
+ (void *)attr, ns, key);
if (IS_ERR(kn)) {
if (PTR_ERR(kn) == -EEXIST)
sysfs_warn_dup(parent, attr->name);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 7d2a860ba788..2554d8835b48 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -99,7 +99,7 @@ static int internal_create_group(struct kobject *kobj, int update,
return -EINVAL;
if (!grp->attrs && !grp->bin_attrs) {
WARN(1, "sysfs: (bin_)attrs not set by subsystem for group: %s/%s\n",
- kobj->name, grp->name ? "" : grp->name);
+ kobj->name, grp->name ?: "");
return -EINVAL;
}
if (grp->name) {
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 7ed13e1e216a..4cfb3e82c56f 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2032,6 +2032,8 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
long long blk_offs;
struct ubifs_data_node *dn = node;
+ ubifs_assert(zbr->len >= UBIFS_DATA_NODE_SZ);
+
/*
* Search the inode node this data node belongs to and insert
* it to the RB-tree of inodes.
@@ -2060,6 +2062,8 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
struct ubifs_dent_node *dent = node;
struct fsck_inode *fscki1;
+ ubifs_assert(zbr->len >= UBIFS_DENT_NODE_SZ);
+
err = ubifs_validate_entry(c, dent);
if (err)
goto out_dump;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index ea41649e4ca5..0fa6c803992e 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -108,8 +108,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
inode->i_mtime = inode->i_atime = inode->i_ctime =
ubifs_current_time(inode);
inode->i_mapping->nrpages = 0;
- /* Disable readahead */
- inode->i_mapping->backing_dev_info = &c->bdi;
switch (mode & S_IFMT) {
case S_IFREG:
@@ -272,6 +270,10 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
goto out_budg;
}
+ err = ubifs_init_security(dir, inode, &dentry->d_name);
+ if (err)
+ goto out_cancel;
+
mutex_lock(&dir_ui->ui_mutex);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
@@ -728,6 +730,10 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
goto out_budg;
}
+ err = ubifs_init_security(dir, inode, &dentry->d_name);
+ if (err)
+ goto out_cancel;
+
mutex_lock(&dir_ui->ui_mutex);
insert_inode_hash(inode);
inc_nlink(inode);
@@ -808,6 +814,10 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
ui->data = dev;
ui->data_len = devlen;
+ err = ubifs_init_security(dir, inode, &dentry->d_name);
+ if (err)
+ goto out_cancel;
+
mutex_lock(&dir_ui->ui_mutex);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
@@ -884,6 +894,10 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
ui->data_len = len;
inode->i_size = ubifs_inode(inode)->ui_size = len;
+ err = ubifs_init_security(dir, inode, &dentry->d_name);
+ if (err)
+ goto out_cancel;
+
mutex_lock(&dir_ui->ui_mutex);
dir->i_size += sz_change;
dir_ui->ui_size = dir->i_size;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 538519ee37d9..e627c0acf626 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1536,7 +1536,6 @@ static const struct vm_operations_struct ubifs_file_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = ubifs_vm_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
};
static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1574,6 +1573,10 @@ const struct inode_operations ubifs_symlink_inode_operations = {
.follow_link = ubifs_follow_link,
.setattr = ubifs_setattr,
.getattr = ubifs_getattr,
+ .setxattr = ubifs_setxattr,
+ .getxattr = ubifs_getxattr,
+ .listxattr = ubifs_listxattr,
+ .removexattr = ubifs_removexattr,
};
const struct file_operations ubifs_file_operations = {
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 3187925e9879..9b40a1c5e160 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -1028,9 +1028,22 @@ int ubifs_replay_journal(struct ubifs_info *c)
do {
err = replay_log_leb(c, lnum, 0, c->sbuf);
- if (err == 1)
- /* We hit the end of the log */
- break;
+ if (err == 1) {
+ if (lnum != c->lhead_lnum)
+ /* We hit the end of the log */
+ break;
+
+ /*
+ * The head of the log must always start with the
+ * "commit start" node on a properly formatted UBIFS.
+ * But we found no nodes at all, which means that
+ * someting went wrong and we cannot proceed mounting
+ * the file-system.
+ */
+ ubifs_err("no UBIFS nodes found at the log head LEB %d:%d, possibly corrupted",
+ lnum, 0);
+ err = -EINVAL;
+ }
if (err)
goto out;
lnum = ubifs_next_log_lnum(c, lnum);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 106bf20629ce..93e946561c5c 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -156,9 +156,6 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
if (err)
goto out_invalid;
- /* Disable read-ahead */
- inode->i_mapping->backing_dev_info = &c->bdi;
-
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_mapping->a_ops = &ubifs_file_address_operations;
@@ -2017,7 +2014,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
* Read-ahead will be disabled because @c->bdi.ra_pages is 0.
*/
c->bdi.name = "ubifs",
- c->bdi.capabilities = BDI_CAP_MAP_COPY;
+ c->bdi.capabilities = 0;
err = bdi_init(&c->bdi);
if (err)
goto out_close;
@@ -2039,6 +2036,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
if (c->max_inode_sz > MAX_LFS_FILESIZE)
sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE;
sb->s_op = &ubifs_super_operations;
+ sb->s_xattr = ubifs_xattr_handlers;
mutex_lock(&c->umount_mutex);
err = mount_ubifs(c);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index c4fe900c67ab..bc04b9c69891 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -36,6 +36,7 @@
#include <linux/mtd/ubi.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
+#include <linux/security.h>
#include "ubifs-media.h"
/* Version of this UBIFS implementation */
@@ -1465,6 +1466,7 @@ extern spinlock_t ubifs_infos_lock;
extern atomic_long_t ubifs_clean_zn_cnt;
extern struct kmem_cache *ubifs_inode_slab;
extern const struct super_operations ubifs_super_operations;
+extern const struct xattr_handler *ubifs_xattr_handlers[];
extern const struct address_space_operations ubifs_file_address_operations;
extern const struct file_operations ubifs_file_operations;
extern const struct inode_operations ubifs_file_inode_operations;
@@ -1754,6 +1756,8 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
size_t size);
ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size);
int ubifs_removexattr(struct dentry *dentry, const char *name);
+int ubifs_init_security(struct inode *dentry, struct inode *inode,
+ const struct qstr *qstr);
/* super.c */
struct inode *ubifs_iget(struct super_block *sb, unsigned long inum);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 5e0a63b1b0d5..a92be244a6fb 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -100,24 +100,30 @@ static const struct file_operations empty_fops;
static int create_xattr(struct ubifs_info *c, struct inode *host,
const struct qstr *nm, const void *value, int size)
{
- int err;
+ int err, names_len;
struct inode *inode;
struct ubifs_inode *ui, *host_ui = ubifs_inode(host);
struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
.new_ino_d = ALIGN(size, 8), .dirtied_ino = 1,
.dirtied_ino_d = ALIGN(host_ui->data_len, 8) };
- if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE)
+ if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE) {
+ ubifs_err("inode %lu already has too many xattrs (%d), cannot create more",
+ host->i_ino, host_ui->xattr_cnt);
return -ENOSPC;
+ }
/*
* Linux limits the maximum size of the extended attribute names list
* to %XATTR_LIST_MAX. This means we should not allow creating more
* extended attributes if the name list becomes larger. This limitation
* is artificial for UBIFS, though.
*/
- if (host_ui->xattr_names + host_ui->xattr_cnt +
- nm->len + 1 > XATTR_LIST_MAX)
+ names_len = host_ui->xattr_names + host_ui->xattr_cnt + nm->len + 1;
+ if (names_len > XATTR_LIST_MAX) {
+ ubifs_err("cannot add one more xattr name to inode %lu, total names length would become %d, max. is %d",
+ host->i_ino, names_len, XATTR_LIST_MAX);
return -ENOSPC;
+ }
err = ubifs_budget_space(c, &req);
if (err)
@@ -293,18 +299,16 @@ static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum)
return ERR_PTR(-EINVAL);
}
-int ubifs_setxattr(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags)
+static int setxattr(struct inode *host, const char *name, const void *value,
+ size_t size, int flags)
{
- struct inode *inode, *host = dentry->d_inode;
+ struct inode *inode;
struct ubifs_info *c = host->i_sb->s_fs_info;
struct qstr nm = QSTR_INIT(name, strlen(name));
struct ubifs_dent_node *xent;
union ubifs_key key;
int err, type;
- dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd", name,
- host->i_ino, dentry, size);
ubifs_assert(mutex_is_locked(&host->i_mutex));
if (size > UBIFS_MAX_INO_DATA)
@@ -356,6 +360,15 @@ out_free:
return err;
}
+int ubifs_setxattr(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
+{
+ dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd",
+ name, dentry->d_inode->i_ino, dentry, size);
+
+ return setxattr(dentry->d_inode, name, value, size, flags);
+}
+
ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
size_t size)
{
@@ -568,3 +581,84 @@ out_free:
kfree(xent);
return err;
}
+
+static size_t security_listxattr(struct dentry *d, char *list, size_t list_size,
+ const char *name, size_t name_len, int flags)
+{
+ const int prefix_len = XATTR_SECURITY_PREFIX_LEN;
+ const size_t total_len = prefix_len + name_len + 1;
+
+ if (list && total_len <= list_size) {
+ memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
+ memcpy(list + prefix_len, name, name_len);
+ list[prefix_len + name_len] = '\0';
+ }
+
+ return total_len;
+}
+
+static int security_getxattr(struct dentry *d, const char *name, void *buffer,
+ size_t size, int flags)
+{
+ return ubifs_getxattr(d, name, buffer, size);
+}
+
+static int security_setxattr(struct dentry *d, const char *name,
+ const void *value, size_t size, int flags,
+ int handler_flags)
+{
+ return ubifs_setxattr(d, name, value, size, flags);
+}
+
+static const struct xattr_handler ubifs_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .list = security_listxattr,
+ .get = security_getxattr,
+ .set = security_setxattr,
+};
+
+const struct xattr_handler *ubifs_xattr_handlers[] = {
+ &ubifs_xattr_security_handler,
+ NULL,
+};
+
+static int init_xattrs(struct inode *inode, const struct xattr *xattr_array,
+ void *fs_info)
+{
+ const struct xattr *xattr;
+ char *name;
+ int err = 0;
+
+ for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+ name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
+ strlen(xattr->name) + 1, GFP_NOFS);
+ if (!name) {
+ err = -ENOMEM;
+ break;
+ }
+ strcpy(name, XATTR_SECURITY_PREFIX);
+ strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+ err = setxattr(inode, name, xattr->value, xattr->value_len, 0);
+ kfree(name);
+ if (err < 0)
+ break;
+ }
+
+ return err;
+}
+
+int ubifs_init_security(struct inode *dentry, struct inode *inode,
+ const struct qstr *qstr)
+{
+ int err;
+
+ mutex_lock(&inode->i_mutex);
+ err = security_inode_init_security(inode, dentry, qstr,
+ &init_xattrs, 0);
+ mutex_unlock(&inode->i_mutex);
+
+ if (err)
+ ubifs_err("cannot initialize security for inode %lu, error %d",
+ inode->i_ino, err);
+ return err;
+}
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
index 0e0e99bd6bce..c6e17a744c3b 100644
--- a/fs/udf/Kconfig
+++ b/fs/udf/Kconfig
@@ -2,10 +2,12 @@ config UDF_FS
tristate "UDF file system support"
select CRC_ITU_T
help
- This is the new file system used on some CD-ROMs and DVDs. Say Y if
- you intend to mount DVD discs or CDRW's written in packet mode, or
- if written to by other UDF utilities, such as DirectCD.
- Please read <file:Documentation/filesystems/udf.txt>.
+ This is a file system used on some CD-ROMs and DVDs. Since the
+ file system is supported by multiple operating systems and is more
+ compatible with standard unix file systems, it is also suitable for
+ removable USB disks. Say Y if you intend to mount DVD discs or CDRW's
+ written in packet mode, or if you want to use UDF for removable USB
+ disks. Please read <file:Documentation/filesystems/udf.txt>.
To compile this file system support as a module, choose M here: the
module will be called udf.
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index a012c51caffd..05e90edd1992 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -57,6 +57,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
sector_t offset;
int i, num, ret = 0;
struct extent_position epos = { NULL, 0, {0, 0} };
+ struct super_block *sb = dir->i_sb;
if (ctx->pos == 0) {
if (!dir_emit_dot(file, ctx))
@@ -76,16 +77,16 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
if (nf_pos == 0)
nf_pos = udf_ext0_offset(dir);
- fibh.soffset = fibh.eoffset = nf_pos & (dir->i_sb->s_blocksize - 1);
+ fibh.soffset = fibh.eoffset = nf_pos & (sb->s_blocksize - 1);
if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
- if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits,
+ if (inode_bmap(dir, nf_pos >> sb->s_blocksize_bits,
&epos, &eloc, &elen, &offset)
!= (EXT_RECORDED_ALLOCATED >> 30)) {
ret = -ENOENT;
goto out;
}
- block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
- if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
+ block = udf_get_lb_pblock(sb, &eloc, offset);
+ if ((++offset << sb->s_blocksize_bits) < elen) {
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
epos.offset -= sizeof(struct short_ad);
else if (iinfo->i_alloc_type ==
@@ -95,18 +96,18 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
offset = 0;
}
- if (!(fibh.sbh = fibh.ebh = udf_tread(dir->i_sb, block))) {
+ if (!(fibh.sbh = fibh.ebh = udf_tread(sb, block))) {
ret = -EIO;
goto out;
}
- if (!(offset & ((16 >> (dir->i_sb->s_blocksize_bits - 9)) - 1))) {
- i = 16 >> (dir->i_sb->s_blocksize_bits - 9);
- if (i + offset > (elen >> dir->i_sb->s_blocksize_bits))
- i = (elen >> dir->i_sb->s_blocksize_bits) - offset;
+ if (!(offset & ((16 >> (sb->s_blocksize_bits - 9)) - 1))) {
+ i = 16 >> (sb->s_blocksize_bits - 9);
+ if (i + offset > (elen >> sb->s_blocksize_bits))
+ i = (elen >> sb->s_blocksize_bits) - offset;
for (num = 0; i > 0; i--) {
- block = udf_get_lb_pblock(dir->i_sb, &eloc, offset + i);
- tmp = udf_tgetblk(dir->i_sb, block);
+ block = udf_get_lb_pblock(sb, &eloc, offset + i);
+ tmp = udf_tgetblk(sb, block);
if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp))
bha[num++] = tmp;
else
@@ -152,12 +153,12 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
}
if ((cfi.fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) {
- if (!UDF_QUERY_FLAG(dir->i_sb, UDF_FLAG_UNDELETE))
+ if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE))
continue;
}
if ((cfi.fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) {
- if (!UDF_QUERY_FLAG(dir->i_sb, UDF_FLAG_UNHIDE))
+ if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE))
continue;
}
@@ -167,12 +168,12 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
continue;
}
- flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
+ flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN);
if (!flen)
continue;
tloc = lelb_to_cpu(cfi.icb.extLocation);
- iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0);
+ iblock = udf_get_lb_pblock(sb, &tloc, 0);
if (!dir_emit(ctx, fname, flen, iblock, DT_UNKNOWN))
goto out;
} /* end while */
diff --git a/fs/udf/file.c b/fs/udf/file.c
index bb15771b92ae..08f3555fbeac 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -224,7 +224,7 @@ out:
static int udf_release_file(struct inode *inode, struct file *filp)
{
if (filp->f_mode & FMODE_WRITE &&
- atomic_read(&inode->i_writecount) > 1) {
+ atomic_read(&inode->i_writecount) == 1) {
/*
* Grab i_mutex to avoid races with writes changing i_size
* while we are running.
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index c9b4df5810d5..a445d599098d 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -750,7 +750,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
/* Are we beyond EOF? */
if (etype == -1) {
int ret;
- isBeyondEOF = 1;
+ isBeyondEOF = true;
if (count) {
if (c)
laarr[0] = laarr[1];
@@ -792,7 +792,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
endnum = c + 1;
lastblock = 1;
} else {
- isBeyondEOF = 0;
+ isBeyondEOF = false;
endnum = startnum = ((count > 2) ? 2 : count);
/* if the current extent is in position 0,
@@ -1288,6 +1288,7 @@ static int udf_read_inode(struct inode *inode, bool hidden_inode)
struct kernel_lb_addr *iloc = &iinfo->i_location;
unsigned int link_count;
unsigned int indirections = 0;
+ int bs = inode->i_sb->s_blocksize;
int ret = -EIO;
reread:
@@ -1374,38 +1375,35 @@ reread:
if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) {
iinfo->i_efe = 1;
iinfo->i_use = 0;
- ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+ ret = udf_alloc_i_data(inode, bs -
sizeof(struct extendedFileEntry));
if (ret)
goto out;
memcpy(iinfo->i_ext.i_data,
bh->b_data + sizeof(struct extendedFileEntry),
- inode->i_sb->s_blocksize -
- sizeof(struct extendedFileEntry));
+ bs - sizeof(struct extendedFileEntry));
} else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) {
iinfo->i_efe = 0;
iinfo->i_use = 0;
- ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
- sizeof(struct fileEntry));
+ ret = udf_alloc_i_data(inode, bs - sizeof(struct fileEntry));
if (ret)
goto out;
memcpy(iinfo->i_ext.i_data,
bh->b_data + sizeof(struct fileEntry),
- inode->i_sb->s_blocksize - sizeof(struct fileEntry));
+ bs - sizeof(struct fileEntry));
} else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) {
iinfo->i_efe = 0;
iinfo->i_use = 1;
iinfo->i_lenAlloc = le32_to_cpu(
((struct unallocSpaceEntry *)bh->b_data)->
lengthAllocDescs);
- ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+ ret = udf_alloc_i_data(inode, bs -
sizeof(struct unallocSpaceEntry));
if (ret)
goto out;
memcpy(iinfo->i_ext.i_data,
bh->b_data + sizeof(struct unallocSpaceEntry),
- inode->i_sb->s_blocksize -
- sizeof(struct unallocSpaceEntry));
+ bs - sizeof(struct unallocSpaceEntry));
return 0;
}
@@ -1489,6 +1487,28 @@ reread:
}
inode->i_generation = iinfo->i_unique;
+ /*
+ * Sanity check length of allocation descriptors and extended attrs to
+ * avoid integer overflows
+ */
+ if (iinfo->i_lenEAttr > bs || iinfo->i_lenAlloc > bs)
+ goto out;
+ /* Now do exact checks */
+ if (udf_file_entry_alloc_offset(inode) + iinfo->i_lenAlloc > bs)
+ goto out;
+ /* Sanity checks for files in ICB so that we don't get confused later */
+ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
+ /*
+ * For file in ICB data is stored in allocation descriptor
+ * so sizes should match
+ */
+ if (iinfo->i_lenAlloc != inode->i_size)
+ goto out;
+ /* File in ICB has to fit in there... */
+ if (inode->i_size > bs - udf_file_entry_alloc_offset(inode))
+ goto out;
+ }
+
switch (fe->icbTag.fileType) {
case ICBTAG_FILE_TYPE_DIRECTORY:
inode->i_op = &udf_dir_inode_operations;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index c12e260fd6c4..33b246b82c98 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -159,18 +159,19 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
struct udf_inode_info *dinfo = UDF_I(dir);
int isdotdot = child->len == 2 &&
child->name[0] == '.' && child->name[1] == '.';
+ struct super_block *sb = dir->i_sb;
size = udf_ext0_offset(dir) + dir->i_size;
f_pos = udf_ext0_offset(dir);
fibh->sbh = fibh->ebh = NULL;
- fibh->soffset = fibh->eoffset = f_pos & (dir->i_sb->s_blocksize - 1);
+ fibh->soffset = fibh->eoffset = f_pos & (sb->s_blocksize - 1);
if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
- if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos,
+ if (inode_bmap(dir, f_pos >> sb->s_blocksize_bits, &epos,
&eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30))
goto out_err;
- block = udf_get_lb_pblock(dir->i_sb, &eloc, offset);
- if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
+ block = udf_get_lb_pblock(sb, &eloc, offset);
+ if ((++offset << sb->s_blocksize_bits) < elen) {
if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
epos.offset -= sizeof(struct short_ad);
else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
@@ -178,7 +179,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
} else
offset = 0;
- fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block);
+ fibh->sbh = fibh->ebh = udf_tread(sb, block);
if (!fibh->sbh)
goto out_err;
}
@@ -217,12 +218,12 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
}
if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) {
- if (!UDF_QUERY_FLAG(dir->i_sb, UDF_FLAG_UNDELETE))
+ if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE))
continue;
}
if ((cfi->fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) {
- if (!UDF_QUERY_FLAG(dir->i_sb, UDF_FLAG_UNHIDE))
+ if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE))
continue;
}
@@ -233,7 +234,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
if (!lfi)
continue;
- flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
+ flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN);
if (flen && udf_match(flen, fname, child->len, child->name))
goto out_ok;
}
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 3ccb2f11fc76..f169411c4ea0 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1599,7 +1599,7 @@ static noinline int udf_process_sequence(
struct udf_vds_record *curr;
struct generic_desc *gd;
struct volDescPtr *vdp;
- int done = 0;
+ bool done = false;
uint32_t vdsn;
uint16_t ident;
long next_s = 0, next_e = 0;
@@ -1680,7 +1680,7 @@ static noinline int udf_process_sequence(
lastblock = next_e;
next_s = next_e = 0;
} else
- done = 1;
+ done = true;
break;
}
brelse(bh);
@@ -2300,6 +2300,7 @@ static void udf_put_super(struct super_block *sb)
udf_close_lvid(sb);
brelse(sbi->s_lvid_bh);
udf_sb_free_partitions(sb);
+ mutex_destroy(&sbi->s_alloc_mutex);
kfree(sb->s_fs_info);
sb->s_fs_info = NULL;
}
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 6fb7945c1e6e..ac10ca939f26 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -30,49 +30,73 @@
#include <linux/buffer_head.h>
#include "udf_i.h"
-static void udf_pc_to_char(struct super_block *sb, unsigned char *from,
- int fromlen, unsigned char *to)
+static int udf_pc_to_char(struct super_block *sb, unsigned char *from,
+ int fromlen, unsigned char *to, int tolen)
{
struct pathComponent *pc;
int elen = 0;
+ int comp_len;
unsigned char *p = to;
+ /* Reserve one byte for terminating \0 */
+ tolen--;
while (elen < fromlen) {
pc = (struct pathComponent *)(from + elen);
+ elen += sizeof(struct pathComponent);
switch (pc->componentType) {
case 1:
/*
* Symlink points to some place which should be agreed
* upon between originator and receiver of the media. Ignore.
*/
- if (pc->lengthComponentIdent > 0)
+ if (pc->lengthComponentIdent > 0) {
+ elen += pc->lengthComponentIdent;
break;
+ }
/* Fall through */
case 2:
+ if (tolen == 0)
+ return -ENAMETOOLONG;
p = to;
*p++ = '/';
+ tolen--;
break;
case 3:
+ if (tolen < 3)
+ return -ENAMETOOLONG;
memcpy(p, "../", 3);
p += 3;
+ tolen -= 3;
break;
case 4:
+ if (tolen < 2)
+ return -ENAMETOOLONG;
memcpy(p, "./", 2);
p += 2;
+ tolen -= 2;
/* that would be . - just ignore */
break;
case 5:
- p += udf_get_filename(sb, pc->componentIdent, p,
- pc->lengthComponentIdent);
+ elen += pc->lengthComponentIdent;
+ if (elen > fromlen)
+ return -EIO;
+ comp_len = udf_get_filename(sb, pc->componentIdent,
+ pc->lengthComponentIdent,
+ p, tolen);
+ p += comp_len;
+ tolen -= comp_len;
+ if (tolen == 0)
+ return -ENAMETOOLONG;
*p++ = '/';
+ tolen--;
break;
}
- elen += sizeof(struct pathComponent) + pc->lengthComponentIdent;
}
if (p > to + 1)
p[-1] = '\0';
else
p[0] = '\0';
+ return 0;
}
static int udf_symlink_filler(struct file *file, struct page *page)
@@ -80,11 +104,17 @@ static int udf_symlink_filler(struct file *file, struct page *page)
struct inode *inode = page->mapping->host;
struct buffer_head *bh = NULL;
unsigned char *symlink;
- int err = -EIO;
+ int err;
unsigned char *p = kmap(page);
struct udf_inode_info *iinfo;
uint32_t pos;
+ /* We don't support symlinks longer than one block */
+ if (inode->i_size > inode->i_sb->s_blocksize) {
+ err = -ENAMETOOLONG;
+ goto out_unmap;
+ }
+
iinfo = UDF_I(inode);
pos = udf_block_map(inode, 0);
@@ -94,14 +124,18 @@ static int udf_symlink_filler(struct file *file, struct page *page)
} else {
bh = sb_bread(inode->i_sb, pos);
- if (!bh)
- goto out;
+ if (!bh) {
+ err = -EIO;
+ goto out_unlock_inode;
+ }
symlink = bh->b_data;
}
- udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p);
+ err = udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p, PAGE_SIZE);
brelse(bh);
+ if (err)
+ goto out_unlock_inode;
up_read(&iinfo->i_data_sem);
SetPageUptodate(page);
@@ -109,9 +143,10 @@ static int udf_symlink_filler(struct file *file, struct page *page)
unlock_page(page);
return 0;
-out:
+out_unlock_inode:
up_read(&iinfo->i_data_sem);
SetPageError(page);
+out_unmap:
kunmap(page);
unlock_page(page);
return err;
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 1cc3c993ebd0..47bb3f5ca360 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -211,7 +211,8 @@ udf_get_lb_pblock(struct super_block *sb, struct kernel_lb_addr *loc,
}
/* unicode.c */
-extern int udf_get_filename(struct super_block *, uint8_t *, uint8_t *, int);
+extern int udf_get_filename(struct super_block *, uint8_t *, int, uint8_t *,
+ int);
extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *,
int);
extern int udf_build_ustr(struct ustr *, dstring *, int);
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index afd470e588ff..b84fee372734 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -28,7 +28,8 @@
#include "udf_sb.h"
-static int udf_translate_to_linux(uint8_t *, uint8_t *, int, uint8_t *, int);
+static int udf_translate_to_linux(uint8_t *, int, uint8_t *, int, uint8_t *,
+ int);
static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen)
{
@@ -333,8 +334,8 @@ try_again:
return u_len + 1;
}
-int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname,
- int flen)
+int udf_get_filename(struct super_block *sb, uint8_t *sname, int slen,
+ uint8_t *dname, int dlen)
{
struct ustr *filename, *unifilename;
int len = 0;
@@ -347,7 +348,7 @@ int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname,
if (!unifilename)
goto out1;
- if (udf_build_ustr_exact(unifilename, sname, flen))
+ if (udf_build_ustr_exact(unifilename, sname, slen))
goto out2;
if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
@@ -366,7 +367,8 @@ int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname,
} else
goto out2;
- len = udf_translate_to_linux(dname, filename->u_name, filename->u_len,
+ len = udf_translate_to_linux(dname, dlen,
+ filename->u_name, filename->u_len,
unifilename->u_name, unifilename->u_len);
out2:
kfree(unifilename);
@@ -403,10 +405,12 @@ int udf_put_filename(struct super_block *sb, const uint8_t *sname,
#define EXT_MARK '.'
#define CRC_MARK '#'
#define EXT_SIZE 5
+/* Number of chars we need to store generated CRC to make filename unique */
+#define CRC_LEN 5
-static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
- int udfLen, uint8_t *fidName,
- int fidNameLen)
+static int udf_translate_to_linux(uint8_t *newName, int newLen,
+ uint8_t *udfName, int udfLen,
+ uint8_t *fidName, int fidNameLen)
{
int index, newIndex = 0, needsCRC = 0;
int extIndex = 0, newExtIndex = 0, hasExt = 0;
@@ -439,7 +443,7 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
newExtIndex = newIndex;
}
}
- if (newIndex < 256)
+ if (newIndex < newLen)
newName[newIndex++] = curr;
else
needsCRC = 1;
@@ -467,13 +471,13 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
}
ext[localExtIndex++] = curr;
}
- maxFilenameLen = 250 - localExtIndex;
+ maxFilenameLen = newLen - CRC_LEN - localExtIndex;
if (newIndex > maxFilenameLen)
newIndex = maxFilenameLen;
else
newIndex = newExtIndex;
- } else if (newIndex > 250)
- newIndex = 250;
+ } else if (newIndex > newLen - CRC_LEN)
+ newIndex = newLen - CRC_LEN;
newName[newIndex++] = CRC_MARK;
valueCRC = crc_itu_t(0, fidName, fidNameLen);
newName[newIndex++] = hex_asc_upper_hi(valueCRC >> 8);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index da73801301d5..8092d3759a5e 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -95,22 +95,18 @@
void lock_ufs(struct super_block *sb)
{
-#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT)
struct ufs_sb_info *sbi = UFS_SB(sb);
mutex_lock(&sbi->mutex);
sbi->mutex_owner = current;
-#endif
}
void unlock_ufs(struct super_block *sb)
{
-#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT)
struct ufs_sb_info *sbi = UFS_SB(sb);
sbi->mutex_owner = NULL;
mutex_unlock(&sbi->mutex);
-#endif
}
static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
@@ -1415,9 +1411,11 @@ static struct kmem_cache * ufs_inode_cachep;
static struct inode *ufs_alloc_inode(struct super_block *sb)
{
struct ufs_inode_info *ei;
- ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS);
+
+ ei = kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS);
if (!ei)
return NULL;
+
ei->vfs_inode.i_version = 1;
return &ei->vfs_inode;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index bb502a391792..1790b00bea7a 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1488,6 +1488,7 @@ xfs_buf_iomove(
static enum lru_status
xfs_buftarg_wait_rele(
struct list_head *item,
+ struct list_lru_one *lru,
spinlock_t *lru_lock,
void *arg)
@@ -1509,7 +1510,7 @@ xfs_buftarg_wait_rele(
*/
atomic_set(&bp->b_lru_ref, 0);
bp->b_state |= XFS_BSTATE_DISPOSE;
- list_move(item, dispose);
+ list_lru_isolate_move(lru, item, dispose);
spin_unlock(&bp->b_lock);
return LRU_REMOVED;
}
@@ -1546,6 +1547,7 @@ xfs_wait_buftarg(
static enum lru_status
xfs_buftarg_isolate(
struct list_head *item,
+ struct list_lru_one *lru,
spinlock_t *lru_lock,
void *arg)
{
@@ -1569,7 +1571,7 @@ xfs_buftarg_isolate(
}
bp->b_state |= XFS_BSTATE_DISPOSE;
- list_move(item, dispose);
+ list_lru_isolate_move(lru, item, dispose);
spin_unlock(&bp->b_lock);
return LRU_REMOVED;
}
@@ -1583,10 +1585,9 @@ xfs_buftarg_shrink_scan(
struct xfs_buftarg, bt_shrinker);
LIST_HEAD(dispose);
unsigned long freed;
- unsigned long nr_to_scan = sc->nr_to_scan;
- freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate,
- &dispose, &nr_to_scan);
+ freed = list_lru_shrink_walk(&btp->bt_lru, sc,
+ xfs_buftarg_isolate, &dispose);
while (!list_empty(&dispose)) {
struct xfs_buf *bp;
@@ -1605,7 +1606,7 @@ xfs_buftarg_shrink_count(
{
struct xfs_buftarg *btp = container_of(shrink,
struct xfs_buftarg, bt_shrinker);
- return list_lru_count_node(&btp->bt_lru, sc->nid);
+ return list_lru_shrink_count(&btp->bt_lru, sc);
}
void
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 56dcfce8d7d6..ce615d12fb44 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -740,7 +740,7 @@ xfs_file_buffered_aio_write(
iov_iter_truncate(from, count);
/* We can write back this queue in page reclaim */
- current->backing_dev_info = mapping->backing_dev_info;
+ current->backing_dev_info = inode_to_bdi(inode);
write_retry:
trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
@@ -1414,5 +1414,4 @@ static const struct vm_operations_struct xfs_file_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = xfs_vm_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
};
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 3e8186279541..53cc2aaf8d2b 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -430,6 +430,7 @@ struct xfs_qm_isolate {
static enum lru_status
xfs_qm_dquot_isolate(
struct list_head *item,
+ struct list_lru_one *lru,
spinlock_t *lru_lock,
void *arg)
__releases(lru_lock) __acquires(lru_lock)
@@ -450,7 +451,7 @@ xfs_qm_dquot_isolate(
XFS_STATS_INC(xs_qm_dqwants);
trace_xfs_dqreclaim_want(dqp);
- list_del_init(&dqp->q_lru);
+ list_lru_isolate(lru, &dqp->q_lru);
XFS_STATS_DEC(xs_qm_dquot_unused);
return LRU_REMOVED;
}
@@ -494,7 +495,7 @@ xfs_qm_dquot_isolate(
xfs_dqunlock(dqp);
ASSERT(dqp->q_nrefs == 0);
- list_move_tail(&dqp->q_lru, &isol->dispose);
+ list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose);
XFS_STATS_DEC(xs_qm_dquot_unused);
trace_xfs_dqreclaim_done(dqp);
XFS_STATS_INC(xs_qm_dqreclaims);
@@ -523,7 +524,6 @@ xfs_qm_shrink_scan(
struct xfs_qm_isolate isol;
unsigned long freed;
int error;
- unsigned long nr_to_scan = sc->nr_to_scan;
if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
return 0;
@@ -531,8 +531,8 @@ xfs_qm_shrink_scan(
INIT_LIST_HEAD(&isol.buffers);
INIT_LIST_HEAD(&isol.dispose);
- freed = list_lru_walk_node(&qi->qi_lru, sc->nid, xfs_qm_dquot_isolate, &isol,
- &nr_to_scan);
+ freed = list_lru_shrink_walk(&qi->qi_lru, sc,
+ xfs_qm_dquot_isolate, &isol);
error = xfs_buf_delwri_submit(&isol.buffers);
if (error)
@@ -557,7 +557,7 @@ xfs_qm_shrink_count(
struct xfs_quotainfo *qi = container_of(shrink,
struct xfs_quotainfo, qi_shrinker);
- return list_lru_count_node(&qi->qi_lru, sc->nid);
+ return list_lru_shrink_count(&qi->qi_lru, sc);
}
/*
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index d6e4d88094ab..0d4d3590cf85 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -165,9 +165,9 @@ extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);
/* quota ops */
extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t,
- uint, struct fs_disk_quota *);
+ uint, struct qc_dqblk *);
extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
- struct fs_disk_quota *);
+ struct qc_dqblk *);
extern int xfs_qm_scall_getqstat(struct xfs_mount *,
struct fs_quota_stat *);
extern int xfs_qm_scall_getqstatv(struct xfs_mount *,
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index b8a565edb4ae..9b965db45800 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -39,7 +39,6 @@ STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
uint);
STATIC uint xfs_qm_export_flags(uint);
-STATIC uint xfs_qm_export_qtype_flags(uint);
/*
* Turn off quota accounting and/or enforcement for all udquots and/or
@@ -326,22 +325,16 @@ xfs_qm_scall_quotaon(
return -EINVAL;
}
- /* No fs can turn on quotas with a delayed effect */
- ASSERT((flags & XFS_ALL_QUOTA_ACCT) == 0);
-
/*
* Can't enforce without accounting. We check the superblock
* qflags here instead of m_qflags because rootfs can have
* quota acct on ondisk without m_qflags' knowing.
*/
- if (((flags & XFS_UQUOTA_ACCT) == 0 &&
- (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
+ if (((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
(flags & XFS_UQUOTA_ENFD)) ||
- ((flags & XFS_GQUOTA_ACCT) == 0 &&
- (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
+ ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
(flags & XFS_GQUOTA_ENFD)) ||
- ((flags & XFS_PQUOTA_ACCT) == 0 &&
- (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
+ ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
(flags & XFS_PQUOTA_ENFD))) {
xfs_debug(mp,
"%s: Can't enforce without acct, flags=%x sbflags=%x",
@@ -380,8 +373,7 @@ xfs_qm_scall_quotaon(
((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) !=
(mp->m_qflags & XFS_PQUOTA_ACCT)) ||
((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) !=
- (mp->m_qflags & XFS_GQUOTA_ACCT)) ||
- (flags & XFS_ALL_QUOTA_ENFD) == 0)
+ (mp->m_qflags & XFS_GQUOTA_ACCT)))
return 0;
if (! XFS_IS_QUOTA_RUNNING(mp))
@@ -418,20 +410,12 @@ xfs_qm_scall_getqstat(
memset(out, 0, sizeof(fs_quota_stat_t));
out->qs_version = FS_QSTAT_VERSION;
- if (!xfs_sb_version_hasquota(&mp->m_sb)) {
- out->qs_uquota.qfs_ino = NULLFSINO;
- out->qs_gquota.qfs_ino = NULLFSINO;
- return 0;
- }
-
out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
(XFS_ALL_QUOTA_ACCT|
XFS_ALL_QUOTA_ENFD));
- if (q) {
- uip = q->qi_uquotaip;
- gip = q->qi_gquotaip;
- pip = q->qi_pquotaip;
- }
+ uip = q->qi_uquotaip;
+ gip = q->qi_gquotaip;
+ pip = q->qi_pquotaip;
if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
0, 0, &uip) == 0)
@@ -477,14 +461,13 @@ xfs_qm_scall_getqstat(
if (temppqip)
IRELE(pip);
}
- if (q) {
- out->qs_incoredqs = q->qi_dquots;
- out->qs_btimelimit = q->qi_btimelimit;
- out->qs_itimelimit = q->qi_itimelimit;
- out->qs_rtbtimelimit = q->qi_rtbtimelimit;
- out->qs_bwarnlimit = q->qi_bwarnlimit;
- out->qs_iwarnlimit = q->qi_iwarnlimit;
- }
+ out->qs_incoredqs = q->qi_dquots;
+ out->qs_btimelimit = q->qi_btimelimit;
+ out->qs_itimelimit = q->qi_itimelimit;
+ out->qs_rtbtimelimit = q->qi_rtbtimelimit;
+ out->qs_bwarnlimit = q->qi_bwarnlimit;
+ out->qs_iwarnlimit = q->qi_iwarnlimit;
+
return 0;
}
@@ -505,13 +488,6 @@ xfs_qm_scall_getqstatv(
bool tempgqip = false;
bool temppqip = false;
- if (!xfs_sb_version_hasquota(&mp->m_sb)) {
- out->qs_uquota.qfs_ino = NULLFSINO;
- out->qs_gquota.qfs_ino = NULLFSINO;
- out->qs_pquota.qfs_ino = NULLFSINO;
- return 0;
- }
-
out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
(XFS_ALL_QUOTA_ACCT|
XFS_ALL_QUOTA_ENFD));
@@ -519,11 +495,9 @@ xfs_qm_scall_getqstatv(
out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
out->qs_pquota.qfs_ino = mp->m_sb.sb_pquotino;
- if (q) {
- uip = q->qi_uquotaip;
- gip = q->qi_gquotaip;
- pip = q->qi_pquotaip;
- }
+ uip = q->qi_uquotaip;
+ gip = q->qi_gquotaip;
+ pip = q->qi_pquotaip;
if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
0, 0, &uip) == 0)
@@ -558,19 +532,18 @@ xfs_qm_scall_getqstatv(
if (temppqip)
IRELE(pip);
}
- if (q) {
- out->qs_incoredqs = q->qi_dquots;
- out->qs_btimelimit = q->qi_btimelimit;
- out->qs_itimelimit = q->qi_itimelimit;
- out->qs_rtbtimelimit = q->qi_rtbtimelimit;
- out->qs_bwarnlimit = q->qi_bwarnlimit;
- out->qs_iwarnlimit = q->qi_iwarnlimit;
- }
+ out->qs_incoredqs = q->qi_dquots;
+ out->qs_btimelimit = q->qi_btimelimit;
+ out->qs_itimelimit = q->qi_itimelimit;
+ out->qs_rtbtimelimit = q->qi_rtbtimelimit;
+ out->qs_bwarnlimit = q->qi_bwarnlimit;
+ out->qs_iwarnlimit = q->qi_iwarnlimit;
+
return 0;
}
-#define XFS_DQ_MASK \
- (FS_DQ_LIMIT_MASK | FS_DQ_TIMER_MASK | FS_DQ_WARNS_MASK)
+#define XFS_QC_MASK \
+ (QC_LIMIT_MASK | QC_TIMER_MASK | QC_WARNS_MASK)
/*
* Adjust quota limits, and start/stop timers accordingly.
@@ -580,7 +553,7 @@ xfs_qm_scall_setqlim(
struct xfs_mount *mp,
xfs_dqid_t id,
uint type,
- fs_disk_quota_t *newlim)
+ struct qc_dqblk *newlim)
{
struct xfs_quotainfo *q = mp->m_quotainfo;
struct xfs_disk_dquot *ddq;
@@ -589,9 +562,9 @@ xfs_qm_scall_setqlim(
int error;
xfs_qcnt_t hard, soft;
- if (newlim->d_fieldmask & ~XFS_DQ_MASK)
+ if (newlim->d_fieldmask & ~XFS_QC_MASK)
return -EINVAL;
- if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0)
+ if ((newlim->d_fieldmask & XFS_QC_MASK) == 0)
return 0;
/*
@@ -629,11 +602,11 @@ xfs_qm_scall_setqlim(
/*
* Make sure that hardlimits are >= soft limits before changing.
*/
- hard = (newlim->d_fieldmask & FS_DQ_BHARD) ?
- (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_hardlimit) :
+ hard = (newlim->d_fieldmask & QC_SPC_HARD) ?
+ (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_hardlimit) :
be64_to_cpu(ddq->d_blk_hardlimit);
- soft = (newlim->d_fieldmask & FS_DQ_BSOFT) ?
- (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_softlimit) :
+ soft = (newlim->d_fieldmask & QC_SPC_SOFT) ?
+ (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_softlimit) :
be64_to_cpu(ddq->d_blk_softlimit);
if (hard == 0 || hard >= soft) {
ddq->d_blk_hardlimit = cpu_to_be64(hard);
@@ -646,11 +619,11 @@ xfs_qm_scall_setqlim(
} else {
xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft);
}
- hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ?
- (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) :
+ hard = (newlim->d_fieldmask & QC_RT_SPC_HARD) ?
+ (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_hardlimit) :
be64_to_cpu(ddq->d_rtb_hardlimit);
- soft = (newlim->d_fieldmask & FS_DQ_RTBSOFT) ?
- (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_softlimit) :
+ soft = (newlim->d_fieldmask & QC_RT_SPC_SOFT) ?
+ (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_softlimit) :
be64_to_cpu(ddq->d_rtb_softlimit);
if (hard == 0 || hard >= soft) {
ddq->d_rtb_hardlimit = cpu_to_be64(hard);
@@ -663,10 +636,10 @@ xfs_qm_scall_setqlim(
xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft);
}
- hard = (newlim->d_fieldmask & FS_DQ_IHARD) ?
+ hard = (newlim->d_fieldmask & QC_INO_HARD) ?
(xfs_qcnt_t) newlim->d_ino_hardlimit :
be64_to_cpu(ddq->d_ino_hardlimit);
- soft = (newlim->d_fieldmask & FS_DQ_ISOFT) ?
+ soft = (newlim->d_fieldmask & QC_INO_SOFT) ?
(xfs_qcnt_t) newlim->d_ino_softlimit :
be64_to_cpu(ddq->d_ino_softlimit);
if (hard == 0 || hard >= soft) {
@@ -683,12 +656,12 @@ xfs_qm_scall_setqlim(
/*
* Update warnings counter(s) if requested
*/
- if (newlim->d_fieldmask & FS_DQ_BWARNS)
- ddq->d_bwarns = cpu_to_be16(newlim->d_bwarns);
- if (newlim->d_fieldmask & FS_DQ_IWARNS)
- ddq->d_iwarns = cpu_to_be16(newlim->d_iwarns);
- if (newlim->d_fieldmask & FS_DQ_RTBWARNS)
- ddq->d_rtbwarns = cpu_to_be16(newlim->d_rtbwarns);
+ if (newlim->d_fieldmask & QC_SPC_WARNS)
+ ddq->d_bwarns = cpu_to_be16(newlim->d_spc_warns);
+ if (newlim->d_fieldmask & QC_INO_WARNS)
+ ddq->d_iwarns = cpu_to_be16(newlim->d_ino_warns);
+ if (newlim->d_fieldmask & QC_RT_SPC_WARNS)
+ ddq->d_rtbwarns = cpu_to_be16(newlim->d_rt_spc_warns);
if (id == 0) {
/*
@@ -698,24 +671,24 @@ xfs_qm_scall_setqlim(
* soft and hard limit values (already done, above), and
* for warnings.
*/
- if (newlim->d_fieldmask & FS_DQ_BTIMER) {
- q->qi_btimelimit = newlim->d_btimer;
- ddq->d_btimer = cpu_to_be32(newlim->d_btimer);
+ if (newlim->d_fieldmask & QC_SPC_TIMER) {
+ q->qi_btimelimit = newlim->d_spc_timer;
+ ddq->d_btimer = cpu_to_be32(newlim->d_spc_timer);
}
- if (newlim->d_fieldmask & FS_DQ_ITIMER) {
- q->qi_itimelimit = newlim->d_itimer;
- ddq->d_itimer = cpu_to_be32(newlim->d_itimer);
+ if (newlim->d_fieldmask & QC_INO_TIMER) {
+ q->qi_itimelimit = newlim->d_ino_timer;
+ ddq->d_itimer = cpu_to_be32(newlim->d_ino_timer);
}
- if (newlim->d_fieldmask & FS_DQ_RTBTIMER) {
- q->qi_rtbtimelimit = newlim->d_rtbtimer;
- ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer);
+ if (newlim->d_fieldmask & QC_RT_SPC_TIMER) {
+ q->qi_rtbtimelimit = newlim->d_rt_spc_timer;
+ ddq->d_rtbtimer = cpu_to_be32(newlim->d_rt_spc_timer);
}
- if (newlim->d_fieldmask & FS_DQ_BWARNS)
- q->qi_bwarnlimit = newlim->d_bwarns;
- if (newlim->d_fieldmask & FS_DQ_IWARNS)
- q->qi_iwarnlimit = newlim->d_iwarns;
- if (newlim->d_fieldmask & FS_DQ_RTBWARNS)
- q->qi_rtbwarnlimit = newlim->d_rtbwarns;
+ if (newlim->d_fieldmask & QC_SPC_WARNS)
+ q->qi_bwarnlimit = newlim->d_spc_warns;
+ if (newlim->d_fieldmask & QC_INO_WARNS)
+ q->qi_iwarnlimit = newlim->d_ino_warns;
+ if (newlim->d_fieldmask & QC_RT_SPC_WARNS)
+ q->qi_rtbwarnlimit = newlim->d_rt_spc_warns;
} else {
/*
* If the user is now over quota, start the timelimit.
@@ -820,7 +793,7 @@ xfs_qm_scall_getquota(
struct xfs_mount *mp,
xfs_dqid_t id,
uint type,
- struct fs_disk_quota *dst)
+ struct qc_dqblk *dst)
{
struct xfs_dquot *dqp;
int error;
@@ -844,28 +817,25 @@ xfs_qm_scall_getquota(
}
memset(dst, 0, sizeof(*dst));
- dst->d_version = FS_DQUOT_VERSION;
- dst->d_flags = xfs_qm_export_qtype_flags(dqp->q_core.d_flags);
- dst->d_id = be32_to_cpu(dqp->q_core.d_id);
- dst->d_blk_hardlimit =
- XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit));
- dst->d_blk_softlimit =
- XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit));
+ dst->d_spc_hardlimit =
+ XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit));
+ dst->d_spc_softlimit =
+ XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit));
dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
- dst->d_bcount = XFS_FSB_TO_BB(mp, dqp->q_res_bcount);
- dst->d_icount = dqp->q_res_icount;
- dst->d_btimer = be32_to_cpu(dqp->q_core.d_btimer);
- dst->d_itimer = be32_to_cpu(dqp->q_core.d_itimer);
- dst->d_iwarns = be16_to_cpu(dqp->q_core.d_iwarns);
- dst->d_bwarns = be16_to_cpu(dqp->q_core.d_bwarns);
- dst->d_rtb_hardlimit =
- XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit));
- dst->d_rtb_softlimit =
- XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit));
- dst->d_rtbcount = XFS_FSB_TO_BB(mp, dqp->q_res_rtbcount);
- dst->d_rtbtimer = be32_to_cpu(dqp->q_core.d_rtbtimer);
- dst->d_rtbwarns = be16_to_cpu(dqp->q_core.d_rtbwarns);
+ dst->d_space = XFS_FSB_TO_B(mp, dqp->q_res_bcount);
+ dst->d_ino_count = dqp->q_res_icount;
+ dst->d_spc_timer = be32_to_cpu(dqp->q_core.d_btimer);
+ dst->d_ino_timer = be32_to_cpu(dqp->q_core.d_itimer);
+ dst->d_ino_warns = be16_to_cpu(dqp->q_core.d_iwarns);
+ dst->d_spc_warns = be16_to_cpu(dqp->q_core.d_bwarns);
+ dst->d_rt_spc_hardlimit =
+ XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit));
+ dst->d_rt_spc_softlimit =
+ XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit));
+ dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_res_rtbcount);
+ dst->d_rt_spc_timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
+ dst->d_rt_spc_warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
/*
* Internally, we don't reset all the timers when quota enforcement
@@ -878,23 +848,23 @@ xfs_qm_scall_getquota(
dqp->q_core.d_flags == XFS_DQ_GROUP) ||
(!XFS_IS_PQUOTA_ENFORCED(mp) &&
dqp->q_core.d_flags == XFS_DQ_PROJ)) {
- dst->d_btimer = 0;
- dst->d_itimer = 0;
- dst->d_rtbtimer = 0;
+ dst->d_spc_timer = 0;
+ dst->d_ino_timer = 0;
+ dst->d_rt_spc_timer = 0;
}
#ifdef DEBUG
- if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) ||
- (XFS_IS_GQUOTA_ENFORCED(mp) && dst->d_flags == FS_GROUP_QUOTA) ||
- (XFS_IS_PQUOTA_ENFORCED(mp) && dst->d_flags == FS_PROJ_QUOTA)) &&
- dst->d_id != 0) {
- if ((dst->d_bcount > dst->d_blk_softlimit) &&
- (dst->d_blk_softlimit > 0)) {
- ASSERT(dst->d_btimer != 0);
+ if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) ||
+ (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) ||
+ (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) &&
+ id != 0) {
+ if ((dst->d_space > dst->d_spc_softlimit) &&
+ (dst->d_spc_softlimit > 0)) {
+ ASSERT(dst->d_spc_timer != 0);
}
- if ((dst->d_icount > dst->d_ino_softlimit) &&
+ if ((dst->d_ino_count > dst->d_ino_softlimit) &&
(dst->d_ino_softlimit > 0)) {
- ASSERT(dst->d_itimer != 0);
+ ASSERT(dst->d_ino_timer != 0);
}
}
#endif
@@ -904,26 +874,6 @@ out_put:
}
STATIC uint
-xfs_qm_export_qtype_flags(
- uint flags)
-{
- /*
- * Can't be more than one, or none.
- */
- ASSERT((flags & (FS_PROJ_QUOTA | FS_USER_QUOTA)) !=
- (FS_PROJ_QUOTA | FS_USER_QUOTA));
- ASSERT((flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)) !=
- (FS_PROJ_QUOTA | FS_GROUP_QUOTA));
- ASSERT((flags & (FS_USER_QUOTA | FS_GROUP_QUOTA)) !=
- (FS_USER_QUOTA | FS_GROUP_QUOTA));
- ASSERT((flags & (FS_PROJ_QUOTA|FS_USER_QUOTA|FS_GROUP_QUOTA)) != 0);
-
- return (flags & XFS_DQ_USER) ?
- FS_USER_QUOTA : (flags & XFS_DQ_PROJ) ?
- FS_PROJ_QUOTA : FS_GROUP_QUOTA;
-}
-
-STATIC uint
xfs_qm_export_flags(
uint flags)
{
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 7542bbeca6a1..6923905ab33d 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -64,19 +64,10 @@ xfs_fs_get_xstatev(
return xfs_qm_scall_getqstatv(mp, fqs);
}
-STATIC int
-xfs_fs_set_xstate(
- struct super_block *sb,
- unsigned int uflags,
- int op)
+static unsigned int
+xfs_quota_flags(unsigned int uflags)
{
- struct xfs_mount *mp = XFS_M(sb);
- unsigned int flags = 0;
-
- if (sb->s_flags & MS_RDONLY)
- return -EROFS;
- if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp))
- return -ENOSYS;
+ unsigned int flags = 0;
if (uflags & FS_QUOTA_UDQ_ACCT)
flags |= XFS_UQUOTA_ACCT;
@@ -91,16 +82,39 @@ xfs_fs_set_xstate(
if (uflags & FS_QUOTA_PDQ_ENFD)
flags |= XFS_PQUOTA_ENFD;
- switch (op) {
- case Q_XQUOTAON:
- return xfs_qm_scall_quotaon(mp, flags);
- case Q_XQUOTAOFF:
- if (!XFS_IS_QUOTA_ON(mp))
- return -EINVAL;
- return xfs_qm_scall_quotaoff(mp, flags);
- }
+ return flags;
+}
+
+STATIC int
+xfs_quota_enable(
+ struct super_block *sb,
+ unsigned int uflags)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+
+ if (sb->s_flags & MS_RDONLY)
+ return -EROFS;
+ if (!XFS_IS_QUOTA_RUNNING(mp))
+ return -ENOSYS;
+
+ return xfs_qm_scall_quotaon(mp, xfs_quota_flags(uflags));
+}
+
+STATIC int
+xfs_quota_disable(
+ struct super_block *sb,
+ unsigned int uflags)
+{
+ struct xfs_mount *mp = XFS_M(sb);
+
+ if (sb->s_flags & MS_RDONLY)
+ return -EROFS;
+ if (!XFS_IS_QUOTA_RUNNING(mp))
+ return -ENOSYS;
+ if (!XFS_IS_QUOTA_ON(mp))
+ return -EINVAL;
- return -EINVAL;
+ return xfs_qm_scall_quotaoff(mp, xfs_quota_flags(uflags));
}
STATIC int
@@ -131,7 +145,7 @@ STATIC int
xfs_fs_get_dqblk(
struct super_block *sb,
struct kqid qid,
- struct fs_disk_quota *fdq)
+ struct qc_dqblk *qdq)
{
struct xfs_mount *mp = XFS_M(sb);
@@ -141,14 +155,14 @@ xfs_fs_get_dqblk(
return -ESRCH;
return xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid),
- xfs_quota_type(qid.type), fdq);
+ xfs_quota_type(qid.type), qdq);
}
STATIC int
xfs_fs_set_dqblk(
struct super_block *sb,
struct kqid qid,
- struct fs_disk_quota *fdq)
+ struct qc_dqblk *qdq)
{
struct xfs_mount *mp = XFS_M(sb);
@@ -160,13 +174,14 @@ xfs_fs_set_dqblk(
return -ESRCH;
return xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid),
- xfs_quota_type(qid.type), fdq);
+ xfs_quota_type(qid.type), qdq);
}
const struct quotactl_ops xfs_quotactl_operations = {
.get_xstatev = xfs_fs_get_xstatev,
.get_xstate = xfs_fs_get_xstate,
- .set_xstate = xfs_fs_set_xstate,
+ .quota_enable = xfs_quota_enable,
+ .quota_disable = xfs_quota_disable,
.rm_xquota = xfs_fs_rm_xquota,
.get_dqblk = xfs_fs_get_dqblk,
.set_dqblk = xfs_fs_set_dqblk,
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index f2449fd86926..8fcc4ccc5c79 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1537,7 +1537,7 @@ xfs_fs_mount(
static long
xfs_fs_nr_cached_objects(
struct super_block *sb,
- int nid)
+ struct shrink_control *sc)
{
return xfs_reclaim_inodes_count(XFS_M(sb));
}
@@ -1545,10 +1545,9 @@ xfs_fs_nr_cached_objects(
static long
xfs_fs_free_cached_objects(
struct super_block *sb,
- long nr_to_scan,
- int nid)
+ struct shrink_control *sc)
{
- return xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan);
+ return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
}
static const struct super_operations xfs_super_operations = {