summaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/Makefile5
-rw-r--r--fs/9p/acl.c65
-rw-r--r--fs/9p/cache.h1
-rw-r--r--fs/9p/vfs_file.c5
-rw-r--r--fs/9p/vfs_inode.c3
-rw-r--r--fs/9p/vfs_inode_dotl.c3
-rw-r--r--fs/9p/xattr.c42
-rw-r--r--fs/9p/xattr.h3
-rw-r--r--fs/9p/xattr_security.c80
-rw-r--r--fs/9p/xattr_trusted.c80
-rw-r--r--fs/9p/xattr_user.c80
-rw-r--r--fs/Makefile7
-rw-r--r--fs/binfmt_elf.c22
-rw-r--r--fs/binfmt_elf_fdpic.c68
-rw-r--r--fs/block_dev.c24
-rw-r--r--fs/btrfs/backref.c14
-rw-r--r--fs/btrfs/btrfs_inode.h2
-rw-r--r--fs/btrfs/check-integrity.c8
-rw-r--r--fs/btrfs/compression.c104
-rw-r--r--fs/btrfs/ctree.c10
-rw-r--r--fs/btrfs/ctree.h182
-rw-r--r--fs/btrfs/delayed-inode.c4
-rw-r--r--fs/btrfs/delayed-ref.c190
-rw-r--r--fs/btrfs/delayed-ref.h23
-rw-r--r--fs/btrfs/dev-replace.c55
-rw-r--r--fs/btrfs/disk-io.c177
-rw-r--r--fs/btrfs/disk-io.h2
-rw-r--r--fs/btrfs/export.c10
-rw-r--r--fs/btrfs/extent-tree.c599
-rw-r--r--fs/btrfs/extent_io.c249
-rw-r--r--fs/btrfs/extent_io.h19
-rw-r--r--fs/btrfs/file.c246
-rw-r--r--fs/btrfs/free-space-cache.c83
-rw-r--r--fs/btrfs/free-space-cache.h1
-rw-r--r--fs/btrfs/inode-item.c2
-rw-r--r--fs/btrfs/inode-map.c6
-rw-r--r--fs/btrfs/inode.c296
-rw-r--r--fs/btrfs/ioctl.c288
-rw-r--r--fs/btrfs/locking.c12
-rw-r--r--fs/btrfs/ordered-data.c70
-rw-r--r--fs/btrfs/ordered-data.h2
-rw-r--r--fs/btrfs/props.c13
-rw-r--r--fs/btrfs/qgroup.c242
-rw-r--r--fs/btrfs/qgroup.h31
-rw-r--r--fs/btrfs/raid56.c6
-rw-r--r--fs/btrfs/reada.c8
-rw-r--r--fs/btrfs/relocation.c26
-rw-r--r--fs/btrfs/root-tree.c11
-rw-r--r--fs/btrfs/scrub.c217
-rw-r--r--fs/btrfs/send.c220
-rw-r--r--fs/btrfs/super.c59
-rw-r--r--fs/btrfs/sysfs.c52
-rw-r--r--fs/btrfs/sysfs.h4
-rw-r--r--fs/btrfs/tests/free-space-tests.c22
-rw-r--r--fs/btrfs/transaction.c162
-rw-r--r--fs/btrfs/transaction.h26
-rw-r--r--fs/btrfs/tree-log.c24
-rw-r--r--fs/btrfs/volumes.c452
-rw-r--r--fs/btrfs/volumes.h24
-rw-r--r--fs/btrfs/xattr.c4
-rw-r--r--fs/buffer.c26
-rw-r--r--fs/cachefiles/internal.h2
-rw-r--r--fs/cachefiles/namei.c2
-rw-r--r--fs/cachefiles/rdwr.c73
-rw-r--r--fs/ceph/addr.c7
-rw-r--r--fs/ceph/cache.c2
-rw-r--r--fs/ceph/caps.c76
-rw-r--r--fs/ceph/file.c87
-rw-r--r--fs/ceph/inode.c1
-rw-r--r--fs/ceph/locks.c4
-rw-r--r--fs/ceph/mds_client.c57
-rw-r--r--fs/ceph/mds_client.h3
-rw-r--r--fs/ceph/super.h1
-rw-r--r--fs/cifs/cifs_spnego.c6
-rw-r--r--fs/cifs/cifsacl.c25
-rw-r--r--fs/cifs/cifsencrypt.c53
-rw-r--r--fs/cifs/cifsfs.c21
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h8
-rw-r--r--fs/cifs/connect.c87
-rw-r--r--fs/cifs/file.c8
-rw-r--r--fs/cifs/inode.c34
-rw-r--r--fs/cifs/ioctl.c17
-rw-r--r--fs/cifs/readdir.c2
-rw-r--r--fs/cifs/sess.c2
-rw-r--r--fs/cifs/smb2file.c19
-rw-r--r--fs/cifs/smb2ops.c18
-rw-r--r--fs/cifs/smb2pdu.c207
-rw-r--r--fs/cifs/smb2pdu.h45
-rw-r--r--fs/cifs/smbfsctl.h2
-rw-r--r--fs/compat_ioctl.c2
-rw-r--r--fs/coredump.c35
-rw-r--r--fs/dax.c76
-rw-r--r--fs/debugfs/file.c177
-rw-r--r--fs/debugfs/inode.c12
-rw-r--r--fs/direct-io.c25
-rw-r--r--fs/dlm/plock.c6
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h5
-rw-r--r--fs/ecryptfs/inode.c2
-rw-r--r--fs/exofs/namei.c3
-rw-r--r--fs/ext2/ext2.h11
-rw-r--r--fs/ext2/file.c84
-rw-r--r--fs/ext2/inode.c10
-rw-r--r--fs/ext2/namei.c3
-rw-r--r--fs/ext2/super.c3
-rw-r--r--fs/ext2/xattr.c7
-rw-r--r--fs/ext2/xattr_security.c15
-rw-r--r--fs/ext2/xattr_trusted.c15
-rw-r--r--fs/ext2/xattr_user.c15
-rw-r--r--fs/ext4/Kconfig2
-rw-r--r--fs/ext4/Makefile2
-rw-r--r--fs/ext4/balloc.c92
-rw-r--r--fs/ext4/block_validity.c2
-rw-r--r--fs/ext4/crypto.c52
-rw-r--r--fs/ext4/crypto_fname.c2
-rw-r--r--fs/ext4/crypto_key.c20
-rw-r--r--fs/ext4/crypto_policy.c3
-rw-r--r--fs/ext4/dir.c7
-rw-r--r--fs/ext4/ext4.h177
-rw-r--r--fs/ext4/ext4_jbd2.c6
-rw-r--r--fs/ext4/ext4_jbd2.h10
-rw-r--r--fs/ext4/extents.c88
-rw-r--r--fs/ext4/extents_status.c60
-rw-r--r--fs/ext4/extents_status.h2
-rw-r--r--fs/ext4/ialloc.c113
-rw-r--r--fs/ext4/indirect.c5
-rw-r--r--fs/ext4/inline.c3
-rw-r--r--fs/ext4/inode.c89
-rw-r--r--fs/ext4/ioctl.c15
-rw-r--r--fs/ext4/mballoc.c87
-rw-r--r--fs/ext4/migrate.c9
-rw-r--r--fs/ext4/mmp.c8
-rw-r--r--fs/ext4/namei.c41
-rw-r--r--fs/ext4/page-io.c5
-rw-r--r--fs/ext4/readpage.c6
-rw-r--r--fs/ext4/resize.c34
-rw-r--r--fs/ext4/super.c697
-rw-r--r--fs/ext4/symlink.c2
-rw-r--r--fs/ext4/sysfs.c448
-rw-r--r--fs/ext4/xattr.c39
-rw-r--r--fs/ext4/xattr_security.c15
-rw-r--r--fs/ext4/xattr_trusted.c15
-rw-r--r--fs/ext4/xattr_user.c15
-rw-r--r--fs/f2fs/checkpoint.c49
-rw-r--r--fs/f2fs/crypto_key.c4
-rw-r--r--fs/f2fs/data.c176
-rw-r--r--fs/f2fs/debug.c36
-rw-r--r--fs/f2fs/dir.c19
-rw-r--r--fs/f2fs/extent_cache.c195
-rw-r--r--fs/f2fs/f2fs.h86
-rw-r--r--fs/f2fs/file.c327
-rw-r--r--fs/f2fs/gc.c77
-rw-r--r--fs/f2fs/gc.h6
-rw-r--r--fs/f2fs/inline.c42
-rw-r--r--fs/f2fs/inode.c8
-rw-r--r--fs/f2fs/namei.c22
-rw-r--r--fs/f2fs/node.c26
-rw-r--r--fs/f2fs/node.h4
-rw-r--r--fs/f2fs/recovery.c15
-rw-r--r--fs/f2fs/segment.c206
-rw-r--r--fs/f2fs/segment.h4
-rw-r--r--fs/f2fs/super.c37
-rw-r--r--fs/f2fs/xattr.c60
-rw-r--r--fs/file.c94
-rw-r--r--fs/fs-writeback.c62
-rw-r--r--fs/fscache/cookie.c2
-rw-r--r--fs/fscache/netfs.c38
-rw-r--r--fs/fscache/object-list.c4
-rw-r--r--fs/fscache/page.c8
-rw-r--r--fs/fuse/file.c2
-rw-r--r--fs/gfs2/dir.c7
-rw-r--r--fs/gfs2/file.c10
-rw-r--r--fs/gfs2/glock.c84
-rw-r--r--fs/gfs2/glock.h4
-rw-r--r--fs/gfs2/glops.c10
-rw-r--r--fs/gfs2/incore.h3
-rw-r--r--fs/gfs2/lock_dlm.c2
-rw-r--r--fs/gfs2/main.c2
-rw-r--r--fs/gfs2/ops_fstype.c4
-rw-r--r--fs/gfs2/rgrp.c9
-rw-r--r--fs/gfs2/trans.c4
-rw-r--r--fs/gfs2/xattr.c13
-rw-r--r--fs/hfsplus/xattr.c21
-rw-r--r--fs/hfsplus/xattr_security.c21
-rw-r--r--fs/hfsplus/xattr_trusted.c21
-rw-r--r--fs/hfsplus/xattr_user.c21
-rw-r--r--fs/hpfs/namei.c2
-rw-r--r--fs/inode.c1
-rw-r--r--fs/jbd2/checkpoint.c8
-rw-r--r--fs/jbd2/commit.c22
-rw-r--r--fs/jbd2/journal.c27
-rw-r--r--fs/jbd2/recovery.c26
-rw-r--r--fs/jbd2/revoke.c4
-rw-r--r--fs/jbd2/transaction.c4
-rw-r--r--fs/jffs2/background.c7
-rw-r--r--fs/jffs2/dir.c3
-rw-r--r--fs/jffs2/malloc.c27
-rw-r--r--fs/jffs2/readinode.c6
-rw-r--r--fs/jffs2/security.c16
-rw-r--r--fs/jffs2/wbuf.c3
-rw-r--r--fs/jffs2/xattr.c9
-rw-r--r--fs/jffs2/xattr_trusted.c19
-rw-r--r--fs/jffs2/xattr_user.c16
-rw-r--r--fs/jfs/namei.c3
-rw-r--r--fs/jfs/super.c3
-rw-r--r--fs/lockd/clntproc.c13
-rw-r--r--fs/lockd/host.c8
-rw-r--r--fs/lockd/mon.c125
-rw-r--r--fs/lockd/netns.h4
-rw-r--r--fs/lockd/svc.c2
-rw-r--r--fs/lockd/svc4proc.c2
-rw-r--r--fs/lockd/svcproc.c2
-rw-r--r--fs/locks.c106
-rw-r--r--fs/logfs/dev_bdev.c4
-rw-r--r--fs/logfs/segment.c2
-rw-r--r--fs/mpage.c38
-rw-r--r--fs/namei.c21
-rw-r--r--fs/ncpfs/dir.c4
-rw-r--r--fs/nfs/blocklayout/blocklayout.c7
-rw-r--r--fs/nfs/callback.c40
-rw-r--r--fs/nfs/callback.h12
-rw-r--r--fs/nfs/callback_proc.c2
-rw-r--r--fs/nfs/callback_xdr.c39
-rw-r--r--fs/nfs/client.c1
-rw-r--r--fs/nfs/delegation.c14
-rw-r--r--fs/nfs/delegation.h2
-rw-r--r--fs/nfs/dir.c3
-rw-r--r--fs/nfs/direct.c7
-rw-r--r--fs/nfs/file.c19
-rw-r--r--fs/nfs/filelayout/filelayout.c31
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c40
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.h7
-rw-r--r--fs/nfs/mount_clnt.c4
-rw-r--r--fs/nfs/nfs42.h1
-rw-r--r--fs/nfs/nfs42proc.c75
-rw-r--r--fs/nfs/nfs42xdr.c97
-rw-r--r--fs/nfs/nfs4_fs.h6
-rw-r--r--fs/nfs/nfs4file.c136
-rw-r--r--fs/nfs/nfs4idmap.c4
-rw-r--r--fs/nfs/nfs4proc.c371
-rw-r--r--fs/nfs/nfs4state.c5
-rw-r--r--fs/nfs/nfs4trace.h2
-rw-r--r--fs/nfs/nfs4xdr.c53
-rw-r--r--fs/nfs/nfsroot.c2
-rw-r--r--fs/nfs/objlayout/objio_osd.c2
-rw-r--r--fs/nfs/pagelist.c2
-rw-r--r--fs/nfs/pnfs.c47
-rw-r--r--fs/nfs/pnfs.h7
-rw-r--r--fs/nfs/read.c12
-rw-r--r--fs/nfs/super.c2
-rw-r--r--fs/nfs/write.c24
-rw-r--r--fs/nfsd/blocklayout.c8
-rw-r--r--fs/nfsd/nfs3xdr.c4
-rw-r--r--fs/nfsd/nfs4layouts.c34
-rw-r--r--fs/nfsd/nfs4proc.c4
-rw-r--r--fs/nfsd/nfs4state.c265
-rw-r--r--fs/nfsd/nfscache.c32
-rw-r--r--fs/nfsd/nfsfh.c5
-rw-r--r--fs/nfsd/nfsfh.h20
-rw-r--r--fs/nfsd/state.h43
-rw-r--r--fs/nfsd/trace.c2
-rw-r--r--fs/nfsd/trace.h2
-rw-r--r--fs/nfsd/vfs.c4
-rw-r--r--fs/nfsd/vfs.h4
-rw-r--r--fs/nfsd/xdr4.h2
-rw-r--r--fs/nilfs2/alloc.c308
-rw-r--r--fs/nilfs2/alloc.h1
-rw-r--r--fs/nilfs2/btree.c7
-rw-r--r--fs/nilfs2/dat.c2
-rw-r--r--fs/nilfs2/file.c2
-rw-r--r--fs/nilfs2/inode.c4
-rw-r--r--fs/nilfs2/mdt.c6
-rw-r--r--fs/nilfs2/mdt.h2
-rw-r--r--fs/nilfs2/namei.c3
-rw-r--r--fs/nilfs2/recovery.c4
-rw-r--r--fs/nilfs2/segment.c107
-rw-r--r--fs/nilfs2/segment.h3
-rw-r--r--fs/nilfs2/sufile.c11
-rw-r--r--fs/nilfs2/super.c17
-rw-r--r--fs/notify/fdinfo.c9
-rw-r--r--fs/notify/inotify/inotify_user.c14
-rw-r--r--fs/ntfs/file.c4
-rw-r--r--fs/ocfs2/aops.c2
-rw-r--r--fs/ocfs2/cluster/heartbeat.c19
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c4
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c12
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c12
-rw-r--r--fs/ocfs2/dlm/dlmthread.c3
-rw-r--r--fs/ocfs2/dlmglue.c3
-rw-r--r--fs/ocfs2/inode.h2
-rw-r--r--fs/ocfs2/journal.c105
-rw-r--r--fs/ocfs2/locks.c8
-rw-r--r--fs/ocfs2/namei.c13
-rw-r--r--fs/ocfs2/namei.h3
-rw-r--r--fs/ocfs2/refcounttree.c5
-rw-r--r--fs/ocfs2/suballoc.c5
-rw-r--r--fs/ocfs2/xattr.c45
-rw-r--r--fs/overlayfs/copy_up.c6
-rw-r--r--fs/overlayfs/inode.c3
-rw-r--r--fs/overlayfs/super.c2
-rw-r--r--fs/pipe.c18
-rw-r--r--fs/posix_acl.c36
-rw-r--r--fs/proc/array.c26
-rw-r--r--fs/proc/base.c19
-rw-r--r--fs/proc/fd.c14
-rw-r--r--fs/proc/meminfo.c7
-rw-r--r--fs/proc/proc_sysctl.c2
-rw-r--r--fs/proc/task_mmu.c64
-rw-r--r--fs/pstore/Kconfig2
-rw-r--r--fs/pstore/Makefile6
-rw-r--r--fs/pstore/ftrace.c25
-rw-r--r--fs/pstore/inode.c11
-rw-r--r--fs/pstore/internal.h6
-rw-r--r--fs/pstore/platform.c47
-rw-r--r--fs/pstore/pmsg.c9
-rw-r--r--fs/pstore/ram.c19
-rw-r--r--fs/ramfs/file-nommu.c5
-rw-r--r--fs/reiserfs/namei.c3
-rw-r--r--fs/reiserfs/xattr.c16
-rw-r--r--fs/reiserfs/xattr_security.c13
-rw-r--r--fs/reiserfs/xattr_trusted.c13
-rw-r--r--fs/reiserfs/xattr_user.c13
-rw-r--r--fs/seq_file.c51
-rw-r--r--fs/splice.c2
-rw-r--r--fs/squashfs/xattr.c86
-rw-r--r--fs/stat.c2
-rw-r--r--fs/sync.c10
-rw-r--r--fs/sysfs/file.c4
-rw-r--r--fs/sysfs/group.c44
-rw-r--r--fs/tracefs/inode.c6
-rw-r--r--fs/ubifs/Kconfig15
-rw-r--r--fs/ubifs/debug.c8
-rw-r--r--fs/ubifs/dir.c11
-rw-r--r--fs/ubifs/file.c50
-rw-r--r--fs/ubifs/lpt.c6
-rw-r--r--fs/ubifs/misc.h9
-rw-r--r--fs/ubifs/recovery.c8
-rw-r--r--fs/ubifs/super.c13
-rw-r--r--fs/ubifs/tnc.c3
-rw-r--r--fs/ubifs/ubifs.h8
-rw-r--r--fs/ubifs/xattr.c55
-rw-r--r--fs/userfaultfd.c12
-rw-r--r--fs/xattr.c39
-rw-r--r--fs/xfs/Makefile2
-rw-r--r--fs/xfs/kmem.c10
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c30
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h8
-rw-r--r--fs/xfs/libxfs/xfs_attr.c6
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c3
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c2
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c65
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h13
-rw-r--r--fs/xfs/libxfs/xfs_btree.c21
-rw-r--r--fs/xfs/libxfs/xfs_btree.h39
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c4
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c6
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c3
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c3
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c3
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c3
-rw-r--r--fs/xfs/libxfs/xfs_format.h18
-rw-r--r--fs/xfs/libxfs/xfs_fs.h10
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c10
-rw-r--r--fs/xfs/libxfs/xfs_sb.c10
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c7
-rw-r--r--fs/xfs/xfs_acl.c14
-rw-r--r--fs/xfs/xfs_acl.h4
-rw-r--r--fs/xfs/xfs_aops.c119
-rw-r--r--fs/xfs/xfs_aops.h3
-rw-r--r--fs/xfs/xfs_attr_list.c2
-rw-r--r--fs/xfs/xfs_bmap_util.c38
-rw-r--r--fs/xfs/xfs_buf.c21
-rw-r--r--fs/xfs/xfs_dir2_readdir.c2
-rw-r--r--fs/xfs/xfs_dquot.c14
-rw-r--r--fs/xfs/xfs_file.c116
-rw-r--r--fs/xfs/xfs_icache.c18
-rw-r--r--fs/xfs/xfs_inode.c8
-rw-r--r--fs/xfs/xfs_inode_item.c1
-rw-r--r--fs/xfs/xfs_inode_item.h1
-rw-r--r--fs/xfs/xfs_ioctl.c23
-rw-r--r--fs/xfs/xfs_ioctl32.c2
-rw-r--r--fs/xfs/xfs_iomap.c70
-rw-r--r--fs/xfs/xfs_iops.c4
-rw-r--r--fs/xfs/xfs_linux.h7
-rw-r--r--fs/xfs/xfs_log.c93
-rw-r--r--fs/xfs/xfs_log.h1
-rw-r--r--fs/xfs/xfs_log_priv.h51
-rw-r--r--fs/xfs/xfs_log_recover.c14
-rw-r--r--fs/xfs/xfs_message.c7
-rw-r--r--fs/xfs/xfs_mount.c21
-rw-r--r--fs/xfs/xfs_mount.h5
-rw-r--r--fs/xfs/xfs_pnfs.c5
-rw-r--r--fs/xfs/xfs_qm.c16
-rw-r--r--fs/xfs/xfs_stats.c93
-rw-r--r--fs/xfs/xfs_stats.h36
-rw-r--r--fs/xfs/xfs_super.c57
-rw-r--r--fs/xfs/xfs_sysctl.c15
-rw-r--r--fs/xfs/xfs_sysfs.c185
-rw-r--r--fs/xfs/xfs_sysfs.h1
-rw-r--r--fs/xfs/xfs_trace.h2
-rw-r--r--fs/xfs/xfs_trans.c6
-rw-r--r--fs/xfs/xfs_trans_ail.c13
-rw-r--r--fs/xfs/xfs_trans_inode.c9
-rw-r--r--fs/xfs/xfs_xattr.c41
404 files changed, 9656 insertions, 5536 deletions
diff --git a/fs/9p/Makefile b/fs/9p/Makefile
index ff7be98f84f2..9619ccadd2fc 100644
--- a/fs/9p/Makefile
+++ b/fs/9p/Makefile
@@ -10,10 +10,7 @@ obj-$(CONFIG_9P_FS) := 9p.o
vfs_dentry.o \
v9fs.o \
fid.o \
- xattr.o \
- xattr_user.o \
- xattr_trusted.o
+ xattr.o
9p-$(CONFIG_9P_FSCACHE) += cache.o
9p-$(CONFIG_9P_FS_POSIX_ACL) += acl.o
-9p-$(CONFIG_9P_FS_SECURITY) += xattr_security.o
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index 31c010372660..a7e28890f5ef 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -212,26 +212,9 @@ int v9fs_acl_mode(struct inode *dir, umode_t *modep,
return 0;
}
-static int v9fs_remote_get_acl(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
-{
- char *full_name;
-
- switch (type) {
- case ACL_TYPE_ACCESS:
- full_name = POSIX_ACL_XATTR_ACCESS;
- break;
- case ACL_TYPE_DEFAULT:
- full_name = POSIX_ACL_XATTR_DEFAULT;
- break;
- default:
- BUG();
- }
- return v9fs_xattr_get(dentry, full_name, buffer, size);
-}
-
-static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int v9fs_xattr_get_acl(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
struct v9fs_session_info *v9ses;
struct posix_acl *acl;
@@ -245,9 +228,9 @@ static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
* We allow set/get/list of acl when access=client is not specified
*/
if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
- return v9fs_remote_get_acl(dentry, name, buffer, size, type);
+ return v9fs_xattr_get(dentry, handler->prefix, buffer, size);
- acl = v9fs_get_cached_acl(d_inode(dentry), type);
+ acl = v9fs_get_cached_acl(d_inode(dentry), handler->flags);
if (IS_ERR(acl))
return PTR_ERR(acl);
if (acl == NULL)
@@ -258,29 +241,9 @@ static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name,
return error;
}
-static int v9fs_remote_set_acl(struct dentry *dentry, const char *name,
- const void *value, size_t size,
- int flags, int type)
-{
- char *full_name;
-
- switch (type) {
- case ACL_TYPE_ACCESS:
- full_name = POSIX_ACL_XATTR_ACCESS;
- break;
- case ACL_TYPE_DEFAULT:
- full_name = POSIX_ACL_XATTR_DEFAULT;
- break;
- default:
- BUG();
- }
- return v9fs_xattr_set(dentry, full_name, value, size, flags);
-}
-
-
-static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
- const void *value, size_t size,
- int flags, int type)
+static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
int retval;
struct posix_acl *acl;
@@ -296,8 +259,8 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
* xattr value. We leave it to the server to validate
*/
if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
- return v9fs_remote_set_acl(dentry, name,
- value, size, flags, type);
+ return v9fs_xattr_set(dentry, handler->prefix, value, size,
+ flags);
if (S_ISLNK(inode->i_mode))
return -EOPNOTSUPP;
@@ -316,9 +279,8 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
} else
acl = NULL;
- switch (type) {
+ switch (handler->flags) {
case ACL_TYPE_ACCESS:
- name = POSIX_ACL_XATTR_ACCESS;
if (acl) {
umode_t mode = inode->i_mode;
retval = posix_acl_equiv_mode(acl, &mode);
@@ -349,7 +311,6 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
}
break;
case ACL_TYPE_DEFAULT:
- name = POSIX_ACL_XATTR_DEFAULT;
if (!S_ISDIR(inode->i_mode)) {
retval = acl ? -EINVAL : 0;
goto err_out;
@@ -358,9 +319,9 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
default:
BUG();
}
- retval = v9fs_xattr_set(dentry, name, value, size, flags);
+ retval = v9fs_xattr_set(dentry, handler->prefix, value, size, flags);
if (!retval)
- set_cached_acl(inode, type, acl);
+ set_cached_acl(inode, handler->flags, acl);
err_out:
posix_acl_release(acl);
return retval;
diff --git a/fs/9p/cache.h b/fs/9p/cache.h
index 2f9675491095..247e47e54bcc 100644
--- a/fs/9p/cache.h
+++ b/fs/9p/cache.h
@@ -21,6 +21,7 @@
*/
#ifndef _9P_CACHE_H
+#define _9P_CACHE_H
#ifdef CONFIG_9P_FSCACHE
#include <linux/fscache.h>
#include <linux/spinlock.h>
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 3abc447783aa..7bf835f85bc8 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -161,7 +161,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
if ((fl->fl_flags & FL_POSIX) != FL_POSIX)
BUG();
- res = posix_lock_file_wait(filp, fl);
+ res = locks_lock_file_wait(filp, fl);
if (res < 0)
goto out;
@@ -231,7 +231,8 @@ out_unlock:
if (res < 0 && fl->fl_type != F_UNLCK) {
fl_type = fl->fl_type;
fl->fl_type = F_UNLCK;
- res = posix_lock_file_wait(filp, fl);
+ /* Even if this fails we want to return the remote error */
+ locks_lock_file_wait(filp, fl);
fl->fl_type = fl_type;
}
out:
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index b1dc51888048..699941e90667 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1368,9 +1368,6 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rde
dir->i_ino, dentry, mode,
MAJOR(rdev), MINOR(rdev));
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
/* build extension */
if (S_ISBLK(mode))
sprintf(name, "b %u %u", MAJOR(rdev), MINOR(rdev));
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index e8aa57dc8d6d..cb899af1babc 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -829,9 +829,6 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
dir->i_ino, dentry, omode,
MAJOR(rdev), MINOR(rdev));
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
v9ses = v9fs_inode2v9ses(dir);
dir_dentry = dentry->d_parent;
dfid = v9fs_fid_lookup(dir_dentry);
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index 0cf44b6cccd6..e3d026ac382e 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -137,6 +137,48 @@ ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
return v9fs_xattr_get(dentry, NULL, buffer, buffer_size);
}
+static int v9fs_xattr_handler_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
+{
+ const char *full_name = xattr_full_name(handler, name);
+
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+ return v9fs_xattr_get(dentry, full_name, buffer, size);
+}
+
+static int v9fs_xattr_handler_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
+{
+ const char *full_name = xattr_full_name(handler, name);
+
+ if (strcmp(name, "") == 0)
+ return -EINVAL;
+ return v9fs_xattr_set(dentry, full_name, value, size, flags);
+}
+
+static struct xattr_handler v9fs_xattr_user_handler = {
+ .prefix = XATTR_USER_PREFIX,
+ .get = v9fs_xattr_handler_get,
+ .set = v9fs_xattr_handler_set,
+};
+
+static struct xattr_handler v9fs_xattr_trusted_handler = {
+ .prefix = XATTR_TRUSTED_PREFIX,
+ .get = v9fs_xattr_handler_get,
+ .set = v9fs_xattr_handler_set,
+};
+
+#ifdef CONFIG_9P_FS_SECURITY
+static struct xattr_handler v9fs_xattr_security_handler = {
+ .prefix = XATTR_SECURITY_PREFIX,
+ .get = v9fs_xattr_handler_get,
+ .set = v9fs_xattr_handler_set,
+};
+#endif
+
const struct xattr_handler *v9fs_xattr_handlers[] = {
&v9fs_xattr_user_handler,
&v9fs_xattr_trusted_handler,
diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h
index d3e2ea3840be..c63c3bea5de5 100644
--- a/fs/9p/xattr.h
+++ b/fs/9p/xattr.h
@@ -19,9 +19,6 @@
#include <net/9p/client.h>
extern const struct xattr_handler *v9fs_xattr_handlers[];
-extern struct xattr_handler v9fs_xattr_user_handler;
-extern struct xattr_handler v9fs_xattr_trusted_handler;
-extern struct xattr_handler v9fs_xattr_security_handler;
extern const struct xattr_handler v9fs_xattr_acl_access_handler;
extern const struct xattr_handler v9fs_xattr_acl_default_handler;
diff --git a/fs/9p/xattr_security.c b/fs/9p/xattr_security.c
deleted file mode 100644
index cb247a142a6e..000000000000
--- a/fs/9p/xattr_security.c
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright IBM Corporation, 2010
- * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- */
-
-
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include "xattr.h"
-
-static int v9fs_xattr_security_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
-{
- int retval;
- char *full_name;
- size_t name_len;
- size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
-
- if (name == NULL)
- return -EINVAL;
-
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- name_len = strlen(name);
- full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
- if (!full_name)
- return -ENOMEM;
- memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len);
- memcpy(full_name+prefix_len, name, name_len);
- full_name[prefix_len + name_len] = '\0';
-
- retval = v9fs_xattr_get(dentry, full_name, buffer, size);
- kfree(full_name);
- return retval;
-}
-
-static int v9fs_xattr_security_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
-{
- int retval;
- char *full_name;
- size_t name_len;
- size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
-
- if (name == NULL)
- return -EINVAL;
-
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- name_len = strlen(name);
- full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
- if (!full_name)
- return -ENOMEM;
- memcpy(full_name, XATTR_SECURITY_PREFIX, prefix_len);
- memcpy(full_name + prefix_len, name, name_len);
- full_name[prefix_len + name_len] = '\0';
-
- retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
- kfree(full_name);
- return retval;
-}
-
-struct xattr_handler v9fs_xattr_security_handler = {
- .prefix = XATTR_SECURITY_PREFIX,
- .get = v9fs_xattr_security_get,
- .set = v9fs_xattr_security_set,
-};
diff --git a/fs/9p/xattr_trusted.c b/fs/9p/xattr_trusted.c
deleted file mode 100644
index e30d33b8a3fb..000000000000
--- a/fs/9p/xattr_trusted.c
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright IBM Corporation, 2010
- * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- */
-
-
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include "xattr.h"
-
-static int v9fs_xattr_trusted_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
-{
- int retval;
- char *full_name;
- size_t name_len;
- size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
-
- if (name == NULL)
- return -EINVAL;
-
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- name_len = strlen(name);
- full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
- if (!full_name)
- return -ENOMEM;
- memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len);
- memcpy(full_name+prefix_len, name, name_len);
- full_name[prefix_len + name_len] = '\0';
-
- retval = v9fs_xattr_get(dentry, full_name, buffer, size);
- kfree(full_name);
- return retval;
-}
-
-static int v9fs_xattr_trusted_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
-{
- int retval;
- char *full_name;
- size_t name_len;
- size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
-
- if (name == NULL)
- return -EINVAL;
-
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- name_len = strlen(name);
- full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
- if (!full_name)
- return -ENOMEM;
- memcpy(full_name, XATTR_TRUSTED_PREFIX, prefix_len);
- memcpy(full_name + prefix_len, name, name_len);
- full_name[prefix_len + name_len] = '\0';
-
- retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
- kfree(full_name);
- return retval;
-}
-
-struct xattr_handler v9fs_xattr_trusted_handler = {
- .prefix = XATTR_TRUSTED_PREFIX,
- .get = v9fs_xattr_trusted_get,
- .set = v9fs_xattr_trusted_set,
-};
diff --git a/fs/9p/xattr_user.c b/fs/9p/xattr_user.c
deleted file mode 100644
index d0b701b72080..000000000000
--- a/fs/9p/xattr_user.c
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright IBM Corporation, 2010
- * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2.1 of the GNU Lesser General Public License
- * as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- */
-
-
-#include <linux/module.h>
-#include <linux/string.h>
-#include <linux/fs.h>
-#include <linux/slab.h>
-#include "xattr.h"
-
-static int v9fs_xattr_user_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
-{
- int retval;
- char *full_name;
- size_t name_len;
- size_t prefix_len = XATTR_USER_PREFIX_LEN;
-
- if (name == NULL)
- return -EINVAL;
-
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- name_len = strlen(name);
- full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
- if (!full_name)
- return -ENOMEM;
- memcpy(full_name, XATTR_USER_PREFIX, prefix_len);
- memcpy(full_name+prefix_len, name, name_len);
- full_name[prefix_len + name_len] = '\0';
-
- retval = v9fs_xattr_get(dentry, full_name, buffer, size);
- kfree(full_name);
- return retval;
-}
-
-static int v9fs_xattr_user_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
-{
- int retval;
- char *full_name;
- size_t name_len;
- size_t prefix_len = XATTR_USER_PREFIX_LEN;
-
- if (name == NULL)
- return -EINVAL;
-
- if (strcmp(name, "") == 0)
- return -EINVAL;
-
- name_len = strlen(name);
- full_name = kmalloc(prefix_len + name_len + 1 , GFP_KERNEL);
- if (!full_name)
- return -ENOMEM;
- memcpy(full_name, XATTR_USER_PREFIX, prefix_len);
- memcpy(full_name + prefix_len, name, name_len);
- full_name[prefix_len + name_len] = '\0';
-
- retval = v9fs_xattr_set(dentry, full_name, value, size, flags);
- kfree(full_name);
- return retval;
-}
-
-struct xattr_handler v9fs_xattr_user_handler = {
- .prefix = XATTR_USER_PREFIX,
- .get = v9fs_xattr_user_get,
- .set = v9fs_xattr_user_set,
-};
diff --git a/fs/Makefile b/fs/Makefile
index f79cf4043e60..79f522575cba 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -63,10 +63,11 @@ obj-$(CONFIG_DLM) += dlm/
# Do not add any filesystems before this line
obj-$(CONFIG_FSCACHE) += fscache/
obj-$(CONFIG_REISERFS_FS) += reiserfs/
-obj-$(CONFIG_EXT2_FS) += ext2/
-# We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2
-# unless explicitly requested by rootfstype
obj-$(CONFIG_EXT4_FS) += ext4/
+# We place ext4 before ext2 so that clean ext3 root fs's do NOT mount using the
+# ext2 driver, which doesn't know about journalling! Explicitly request ext2
+# by giving the rootfstype= parameter.
+obj-$(CONFIG_EXT2_FS) += ext2/
obj-$(CONFIG_JBD2) += jbd2/
obj-$(CONFIG_CRAMFS) += cramfs/
obj-$(CONFIG_SQUASHFS) += squashfs/
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 6b659967898e..3a93755e880f 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -35,6 +35,7 @@
#include <linux/utsname.h>
#include <linux/coredump.h>
#include <linux/sched.h>
+#include <linux/dax.h>
#include <asm/uaccess.h>
#include <asm/param.h>
#include <asm/page.h>
@@ -487,7 +488,7 @@ static inline int arch_elf_pt_proc(struct elfhdr *ehdr,
}
/**
- * arch_check_elf() - check a PT_LOPROC..PT_HIPROC ELF program header
+ * arch_check_elf() - check an ELF executable
* @ehdr: The main ELF header
* @has_interp: True if the ELF has an interpreter, else false.
* @state: Architecture-specific state preserved throughout the process
@@ -759,16 +760,16 @@ static int load_elf_binary(struct linux_binprm *bprm)
*/
would_dump(bprm, interpreter);
- retval = kernel_read(interpreter, 0, bprm->buf,
- BINPRM_BUF_SIZE);
- if (retval != BINPRM_BUF_SIZE) {
+ /* Get the exec headers */
+ retval = kernel_read(interpreter, 0,
+ (void *)&loc->interp_elf_ex,
+ sizeof(loc->interp_elf_ex));
+ if (retval != sizeof(loc->interp_elf_ex)) {
if (retval >= 0)
retval = -EIO;
goto out_free_dentry;
}
- /* Get the exec headers */
- loc->interp_elf_ex = *((struct elfhdr *)bprm->buf);
break;
}
elf_ppnt++;
@@ -1236,6 +1237,15 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma,
if (vma->vm_flags & VM_DONTDUMP)
return 0;
+ /* support for DAX */
+ if (vma_is_dax(vma)) {
+ if ((vma->vm_flags & VM_SHARED) && FILTER(DAX_SHARED))
+ goto whole;
+ if (!(vma->vm_flags & VM_SHARED) && FILTER(DAX_PRIVATE))
+ goto whole;
+ return 0;
+ }
+
/* Hugetlb memory check */
if (vma->vm_flags & VM_HUGETLB) {
if ((vma->vm_flags & VM_SHARED) && FILTER(HUGETLB_SHARED))
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index d3634bfb7fe1..b1adb92e69de 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -35,6 +35,7 @@
#include <linux/elf-fdpic.h>
#include <linux/elfcore.h>
#include <linux/coredump.h>
+#include <linux/dax.h>
#include <asm/uaccess.h>
#include <asm/param.h>
@@ -103,19 +104,36 @@ static void __exit exit_elf_fdpic_binfmt(void)
core_initcall(init_elf_fdpic_binfmt);
module_exit(exit_elf_fdpic_binfmt);
-static int is_elf_fdpic(struct elfhdr *hdr, struct file *file)
+static int is_elf(struct elfhdr *hdr, struct file *file)
{
if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0)
return 0;
if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN)
return 0;
- if (!elf_check_arch(hdr) || !elf_check_fdpic(hdr))
+ if (!elf_check_arch(hdr))
return 0;
if (!file->f_op->mmap)
return 0;
return 1;
}
+#ifndef elf_check_fdpic
+#define elf_check_fdpic(x) 0
+#endif
+
+#ifndef elf_check_const_displacement
+#define elf_check_const_displacement(x) 0
+#endif
+
+static int is_constdisp(struct elfhdr *hdr)
+{
+ if (!elf_check_fdpic(hdr))
+ return 1;
+ if (elf_check_const_displacement(hdr))
+ return 1;
+ return 0;
+}
+
/*****************************************************************************/
/*
* read the program headers table into memory
@@ -191,8 +209,18 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
/* check that this is a binary we know how to deal with */
retval = -ENOEXEC;
- if (!is_elf_fdpic(&exec_params.hdr, bprm->file))
+ if (!is_elf(&exec_params.hdr, bprm->file))
+ goto error;
+ if (!elf_check_fdpic(&exec_params.hdr)) {
+#ifdef CONFIG_MMU
+ /* binfmt_elf handles non-fdpic elf except on nommu */
goto error;
+#else
+ /* nommu can only load ET_DYN (PIE) ELF */
+ if (exec_params.hdr.e_type != ET_DYN)
+ goto error;
+#endif
+ }
/* read the program header table */
retval = elf_fdpic_fetch_phdrs(&exec_params, bprm->file);
@@ -269,13 +297,13 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
}
- if (elf_check_const_displacement(&exec_params.hdr))
+ if (is_constdisp(&exec_params.hdr))
exec_params.flags |= ELF_FDPIC_FLAG_CONSTDISP;
/* perform insanity checks on the interpreter */
if (interpreter_name) {
retval = -ELIBBAD;
- if (!is_elf_fdpic(&interp_params.hdr, interpreter))
+ if (!is_elf(&interp_params.hdr, interpreter))
goto error;
interp_params.flags = ELF_FDPIC_FLAG_PRESENT;
@@ -306,9 +334,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
retval = -ENOEXEC;
if (stack_size == 0)
- goto error;
+ stack_size = 131072UL; /* same as exec.c's default commit */
- if (elf_check_const_displacement(&interp_params.hdr))
+ if (is_constdisp(&interp_params.hdr))
interp_params.flags |= ELF_FDPIC_FLAG_CONSTDISP;
/* flush all traces of the currently running executable */
@@ -319,7 +347,10 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
/* there's now no turning back... the old userspace image is dead,
* defunct, deceased, etc.
*/
- set_personality(PER_LINUX_FDPIC);
+ if (elf_check_fdpic(&exec_params.hdr))
+ set_personality(PER_LINUX_FDPIC);
+ else
+ set_personality(PER_LINUX);
if (elf_read_implies_exec(&exec_params.hdr, executable_stack))
current->personality |= READ_IMPLIES_EXEC;
@@ -374,10 +405,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
PAGE_ALIGN(current->mm->start_brk);
#else
- /* create a stack and brk area big enough for everyone
- * - the brk heap starts at the bottom and works up
- * - the stack starts at the top and works down
- */
+ /* create a stack area and zero-size brk area */
stack_size = (stack_size + PAGE_SIZE - 1) & PAGE_MASK;
if (stack_size < PAGE_SIZE * 2)
stack_size = PAGE_SIZE * 2;
@@ -400,8 +428,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
current->mm->brk = current->mm->start_brk;
current->mm->context.end_brk = current->mm->start_brk;
- current->mm->context.end_brk +=
- (stack_size > PAGE_SIZE) ? (stack_size - PAGE_SIZE) : 0;
current->mm->start_stack = current->mm->start_brk + stack_size;
#endif
@@ -1206,6 +1232,20 @@ static int maydump(struct vm_area_struct *vma, unsigned long mm_flags)
return 0;
}
+ /* support for DAX */
+ if (vma_is_dax(vma)) {
+ if (vma->vm_flags & VM_SHARED) {
+ dump_ok = test_bit(MMF_DUMP_DAX_SHARED, &mm_flags);
+ kdcore("%08lx: %08lx: %s (DAX shared)", vma->vm_start,
+ vma->vm_flags, dump_ok ? "yes" : "no");
+ } else {
+ dump_ok = test_bit(MMF_DUMP_DAX_PRIVATE, &mm_flags);
+ kdcore("%08lx: %08lx: %s (DAX private)", vma->vm_start,
+ vma->vm_flags, dump_ok ? "yes" : "no");
+ }
+ return dump_ok;
+ }
+
/* By default, dump shared memory if mapped from an anonymous file. */
if (vma->vm_flags & VM_SHARED) {
if (file_inode(vma->vm_file)->i_nlink == 0) {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 22ea424ee741..bb0dfb1c7af1 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -50,12 +50,21 @@ struct block_device *I_BDEV(struct inode *inode)
}
EXPORT_SYMBOL(I_BDEV);
-static void bdev_write_inode(struct inode *inode)
+static void bdev_write_inode(struct block_device *bdev)
{
+ struct inode *inode = bdev->bd_inode;
+ int ret;
+
spin_lock(&inode->i_lock);
while (inode->i_state & I_DIRTY) {
spin_unlock(&inode->i_lock);
- WARN_ON_ONCE(write_inode_now(inode, true));
+ ret = write_inode_now(inode, true);
+ if (ret) {
+ char name[BDEVNAME_SIZE];
+ pr_warn_ratelimited("VFS: Dirty inode writeback failed "
+ "for block device %s (err=%d).\n",
+ bdevname(bdev, name), ret);
+ }
spin_lock(&inode->i_lock);
}
spin_unlock(&inode->i_lock);
@@ -1075,7 +1084,7 @@ int revalidate_disk(struct gendisk *disk)
if (disk->fops->revalidate_disk)
ret = disk->fops->revalidate_disk(disk);
-
+ blk_integrity_revalidate(disk);
bdev = bdget_disk(disk, 0);
if (!bdev)
return ret;
@@ -1242,6 +1251,13 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
goto out_clear;
}
bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
+ /*
+ * If the partition is not aligned on a page
+ * boundary, we can't do dax I/O to it.
+ */
+ if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) ||
+ (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
+ bdev->bd_inode->i_flags &= ~S_DAX;
}
} else {
if (bdev->bd_contains == bdev) {
@@ -1497,7 +1513,7 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
* ->release can cause the queue to disappear, so flush all
* dirty data before.
*/
- bdev_write_inode(bdev->bd_inode);
+ bdev_write_inode(bdev);
}
if (bdev->bd_contains == bdev) {
if (disk->fops->release)
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ecbc63d3143e..6dcdb2ec9211 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -362,6 +362,12 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
goto out;
}
+ if (btrfs_test_is_dummy_root(root)) {
+ srcu_read_unlock(&fs_info->subvol_srcu, index);
+ ret = -ENOENT;
+ goto out;
+ }
+
if (path->search_commit_root)
root_level = btrfs_header_level(root->commit_root);
else if (time_seq == (u64)-1)
@@ -1828,7 +1834,6 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
int found = 0;
struct extent_buffer *eb;
struct btrfs_inode_extref *extref;
- struct extent_buffer *leaf;
u32 item_size;
u32 cur_offset;
unsigned long ptr;
@@ -1856,9 +1861,8 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
btrfs_release_path(path);
- leaf = path->nodes[0];
- item_size = btrfs_item_size_nr(leaf, slot);
- ptr = btrfs_item_ptr_offset(leaf, slot);
+ item_size = btrfs_item_size_nr(eb, slot);
+ ptr = btrfs_item_ptr_offset(eb, slot);
cur_offset = 0;
while (cur_offset < item_size) {
@@ -1872,7 +1876,7 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
if (ret)
break;
- cur_offset += btrfs_inode_extref_name_len(leaf, extref);
+ cur_offset += btrfs_inode_extref_name_len(eb, extref);
cur_offset += sizeof(*extref);
}
btrfs_tree_read_unlock_blocking(eb);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 81220b2203c6..0ef5cc13fae2 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -44,8 +44,6 @@
#define BTRFS_INODE_IN_DELALLOC_LIST 9
#define BTRFS_INODE_READDIO_NEED_LOCK 10
#define BTRFS_INODE_HAS_PROPS 11
-/* DIO is ready to submit */
-#define BTRFS_INODE_DIO_READY 12
/*
* The following 3 bits are meant only for the btree inode.
* When any of them is set, it means an error happened while writing an
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 541fbfaed276..0340c57bf377 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -667,7 +667,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
if (NULL == selected_super) {
printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
- return -1;
+ return -ENOMEM;
}
list_for_each_entry(device, dev_head, dev_list) {
@@ -845,8 +845,8 @@ static int btrfsic_process_superblock_dev_mirror(
superblock_tmp->never_written = 0;
superblock_tmp->mirror_num = 1 + superblock_mirror_num;
if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
- printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)"
- " @%llu (%s/%llu/%d)\n",
+ btrfs_info_in_rcu(device->dev_root->fs_info,
+ "new initial S-block (bdev %p, %s) @%llu (%s/%llu/%d)",
superblock_bdev,
rcu_str_deref(device->name), dev_bytenr,
dev_state->name, dev_bytenr,
@@ -1660,7 +1660,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
sizeof(*block_ctx->pagev)) *
num_pages, GFP_NOFS);
if (!block_ctx->mem_to_free)
- return -1;
+ return -ENOMEM;
block_ctx->datav = block_ctx->mem_to_free;
block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
for (i = 0; i < num_pages; i++) {
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 57ee8ca29b06..c473c42d7d6c 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -482,13 +482,12 @@ static noinline int add_ra_bio_pages(struct inode *inode,
goto next;
}
- page = __page_cache_alloc(mapping_gfp_mask(mapping) &
- ~__GFP_FS);
+ page = __page_cache_alloc(mapping_gfp_constraint(mapping,
+ ~__GFP_FS));
if (!page)
break;
- if (add_to_page_cache_lru(page, mapping, pg_index,
- GFP_NOFS)) {
+ if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
page_cache_release(page);
goto next;
}
@@ -745,11 +744,13 @@ out:
return ret;
}
-static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
-static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES];
-static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
-static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
-static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
+static struct {
+ struct list_head idle_ws;
+ spinlock_t ws_lock;
+ int num_ws;
+ atomic_t alloc_ws;
+ wait_queue_head_t ws_wait;
+} btrfs_comp_ws[BTRFS_COMPRESS_TYPES];
static const struct btrfs_compress_op * const btrfs_compress_op[] = {
&btrfs_zlib_compress,
@@ -761,10 +762,10 @@ void __init btrfs_init_compress(void)
int i;
for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
- INIT_LIST_HEAD(&comp_idle_workspace[i]);
- spin_lock_init(&comp_workspace_lock[i]);
- atomic_set(&comp_alloc_workspace[i], 0);
- init_waitqueue_head(&comp_workspace_wait[i]);
+ INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws);
+ spin_lock_init(&btrfs_comp_ws[i].ws_lock);
+ atomic_set(&btrfs_comp_ws[i].alloc_ws, 0);
+ init_waitqueue_head(&btrfs_comp_ws[i].ws_wait);
}
}
@@ -778,38 +779,38 @@ static struct list_head *find_workspace(int type)
int cpus = num_online_cpus();
int idx = type - 1;
- struct list_head *idle_workspace = &comp_idle_workspace[idx];
- spinlock_t *workspace_lock = &comp_workspace_lock[idx];
- atomic_t *alloc_workspace = &comp_alloc_workspace[idx];
- wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx];
- int *num_workspace = &comp_num_workspace[idx];
+ struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws;
+ spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock;
+ atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws;
+ wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait;
+ int *num_ws = &btrfs_comp_ws[idx].num_ws;
again:
- spin_lock(workspace_lock);
- if (!list_empty(idle_workspace)) {
- workspace = idle_workspace->next;
+ spin_lock(ws_lock);
+ if (!list_empty(idle_ws)) {
+ workspace = idle_ws->next;
list_del(workspace);
- (*num_workspace)--;
- spin_unlock(workspace_lock);
+ (*num_ws)--;
+ spin_unlock(ws_lock);
return workspace;
}
- if (atomic_read(alloc_workspace) > cpus) {
+ if (atomic_read(alloc_ws) > cpus) {
DEFINE_WAIT(wait);
- spin_unlock(workspace_lock);
- prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
- if (atomic_read(alloc_workspace) > cpus && !*num_workspace)
+ spin_unlock(ws_lock);
+ prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE);
+ if (atomic_read(alloc_ws) > cpus && !*num_ws)
schedule();
- finish_wait(workspace_wait, &wait);
+ finish_wait(ws_wait, &wait);
goto again;
}
- atomic_inc(alloc_workspace);
- spin_unlock(workspace_lock);
+ atomic_inc(alloc_ws);
+ spin_unlock(ws_lock);
workspace = btrfs_compress_op[idx]->alloc_workspace();
if (IS_ERR(workspace)) {
- atomic_dec(alloc_workspace);
- wake_up(workspace_wait);
+ atomic_dec(alloc_ws);
+ wake_up(ws_wait);
}
return workspace;
}
@@ -821,27 +822,30 @@ again:
static void free_workspace(int type, struct list_head *workspace)
{
int idx = type - 1;
- struct list_head *idle_workspace = &comp_idle_workspace[idx];
- spinlock_t *workspace_lock = &comp_workspace_lock[idx];
- atomic_t *alloc_workspace = &comp_alloc_workspace[idx];
- wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx];
- int *num_workspace = &comp_num_workspace[idx];
-
- spin_lock(workspace_lock);
- if (*num_workspace < num_online_cpus()) {
- list_add(workspace, idle_workspace);
- (*num_workspace)++;
- spin_unlock(workspace_lock);
+ struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws;
+ spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock;
+ atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws;
+ wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait;
+ int *num_ws = &btrfs_comp_ws[idx].num_ws;
+
+ spin_lock(ws_lock);
+ if (*num_ws < num_online_cpus()) {
+ list_add(workspace, idle_ws);
+ (*num_ws)++;
+ spin_unlock(ws_lock);
goto wake;
}
- spin_unlock(workspace_lock);
+ spin_unlock(ws_lock);
btrfs_compress_op[idx]->free_workspace(workspace);
- atomic_dec(alloc_workspace);
+ atomic_dec(alloc_ws);
wake:
+ /*
+ * Make sure counter is updated before we wake up waiters.
+ */
smp_mb();
- if (waitqueue_active(workspace_wait))
- wake_up(workspace_wait);
+ if (waitqueue_active(ws_wait))
+ wake_up(ws_wait);
}
/*
@@ -853,11 +857,11 @@ static void free_workspaces(void)
int i;
for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
- while (!list_empty(&comp_idle_workspace[i])) {
- workspace = comp_idle_workspace[i].next;
+ while (!list_empty(&btrfs_comp_ws[i].idle_ws)) {
+ workspace = btrfs_comp_ws[i].idle_ws.next;
list_del(workspace);
btrfs_compress_op[i]->free_workspace(workspace);
- atomic_dec(&comp_alloc_workspace[i]);
+ atomic_dec(&btrfs_comp_ws[i].alloc_ws);
}
}
}
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 5f745eadf77d..5b8e235c4b6d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1011,7 +1011,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
return ret;
if (refs == 0) {
ret = -EROFS;
- btrfs_std_error(root->fs_info, ret);
+ btrfs_std_error(root->fs_info, ret, NULL);
return ret;
}
} else {
@@ -1927,7 +1927,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
child = read_node_slot(root, mid, 0);
if (!child) {
ret = -EROFS;
- btrfs_std_error(root->fs_info, ret);
+ btrfs_std_error(root->fs_info, ret, NULL);
goto enospc;
}
@@ -2030,7 +2030,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
*/
if (!left) {
ret = -EROFS;
- btrfs_std_error(root->fs_info, ret);
+ btrfs_std_error(root->fs_info, ret, NULL);
goto enospc;
}
wret = balance_node_right(trans, root, mid, left);
@@ -4940,8 +4940,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
{
struct extent_buffer *leaf;
struct btrfs_item *item;
- int last_off;
- int dsize = 0;
+ u32 last_off;
+ u32 dsize = 0;
int ret = 0;
int wret;
int i;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 938efe33be80..8c58191249cc 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -823,8 +823,18 @@ struct btrfs_disk_balance_args {
*/
__le64 profiles;
- /* usage filter */
- __le64 usage;
+ /*
+ * usage filter
+ * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N'
+ * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max
+ */
+ union {
+ __le64 usage;
+ struct {
+ __le32 usage_min;
+ __le32 usage_max;
+ };
+ };
/* devid filter */
__le64 devid;
@@ -846,10 +856,27 @@ struct btrfs_disk_balance_args {
/* BTRFS_BALANCE_ARGS_* */
__le64 flags;
- /* BTRFS_BALANCE_ARGS_LIMIT value */
- __le64 limit;
+ /*
+ * BTRFS_BALANCE_ARGS_LIMIT with value 'limit'
+ * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extend version can use minimum
+ * and maximum
+ */
+ union {
+ __le64 limit;
+ struct {
+ __le32 limit_min;
+ __le32 limit_max;
+ };
+ };
- __le64 unused[7];
+ /*
+ * Process chunks that cross stripes_min..stripes_max devices,
+ * BTRFS_BALANCE_ARGS_STRIPES_RANGE
+ */
+ __le32 stripes_min;
+ __le32 stripes_max;
+
+ __le64 unused[6];
} __attribute__ ((__packed__));
/*
@@ -1154,6 +1181,10 @@ struct btrfs_space_info {
delalloc/allocations */
u64 bytes_readonly; /* total bytes that are read only */
+ u64 max_extent_size; /* This will hold the maximum extent size of
+ the space info if we had an ENOSPC in the
+ allocator. */
+
unsigned int full:1; /* indicates that we cannot allocate any more
chunks for this space */
unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
@@ -1228,6 +1259,9 @@ struct btrfs_free_cluster {
/* first extent starting offset */
u64 window_start;
+ /* We did a full search and couldn't create a cluster */
+ bool fragmented;
+
struct btrfs_block_group_cache *block_group;
/*
* when a cluster is allocated from a block group, we put the
@@ -1943,6 +1977,9 @@ struct btrfs_root {
int send_in_progress;
struct btrfs_subvolume_writers *subv_writers;
atomic_t will_be_snapshoted;
+
+ /* For qgroup metadata space reserve */
+ atomic_t qgroup_meta_rsv;
};
struct btrfs_ioctl_defrag_range_args {
@@ -2145,6 +2182,8 @@ struct btrfs_ioctl_defrag_range_args {
#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR (1 << 22)
#define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23)
+#define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24)
+#define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25)
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
#define BTRFS_DEFAULT_MAX_INLINE (8192)
@@ -2169,6 +2208,18 @@ struct btrfs_ioctl_defrag_range_args {
btrfs_clear_opt(root->fs_info->mount_opt, opt); \
}
+#ifdef CONFIG_BTRFS_DEBUG
+static inline int
+btrfs_should_fragment_free_space(struct btrfs_root *root,
+ struct btrfs_block_group_cache *block_group)
+{
+ return (btrfs_test_opt(root, FRAGMENT_METADATA) &&
+ block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+ (btrfs_test_opt(root, FRAGMENT_DATA) &&
+ block_group->flags & BTRFS_BLOCK_GROUP_DATA);
+}
+#endif
+
/*
* Requests for changes that need to be done during transaction commit.
*
@@ -3316,7 +3367,7 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
{
- return mapping_gfp_mask(mapping) & ~__GFP_FS;
+ return mapping_gfp_constraint(mapping, ~__GFP_FS);
}
/* extent-tree.c */
@@ -3379,7 +3430,8 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 root_objectid, u64 owner,
- u64 offset, struct btrfs_key *ins);
+ u64 offset, u64 ram_bytes,
+ struct btrfs_key *ins);
int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 root_objectid, u64 owner, u64 offset,
@@ -3398,7 +3450,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
int btrfs_free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
- u64 owner, u64 offset, int no_quota);
+ u64 owner, u64 offset);
int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len,
int delalloc);
@@ -3411,7 +3463,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset, int no_quota);
+ u64 root_objectid, u64 owner, u64 offset);
int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
@@ -3449,8 +3501,11 @@ enum btrfs_reserve_flush_enum {
BTRFS_RESERVE_FLUSH_ALL,
};
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes);
-void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
+int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
+int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes);
+void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
+void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+ u64 len);
void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
@@ -3466,8 +3521,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
u64 qgroup_reserved);
int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
-void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
+void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len);
void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
unsigned short type);
@@ -4004,8 +4059,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
/* sysfs.c */
int btrfs_init_sysfs(void);
void btrfs_exit_sysfs(void);
-int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info);
-void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info);
+int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info);
+void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
/* xattr.c */
ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
@@ -4039,14 +4094,102 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
#define btrfs_info(fs_info, fmt, args...) \
btrfs_printk(fs_info, KERN_INFO fmt, ##args)
+/*
+ * Wrappers that use printk_in_rcu
+ */
+#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args)
+
+/*
+ * Wrappers that use a ratelimited printk_in_rcu
+ */
+#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args)
+
+/*
+ * Wrappers that use a ratelimited printk
+ */
+#define btrfs_emerg_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args)
#ifdef DEBUG
#define btrfs_debug(fs_info, fmt, args...) \
btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
+ btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+ btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args)
#else
#define btrfs_debug(fs_info, fmt, args...) \
no_printk(KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+ no_printk(KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
+ no_printk(KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+ no_printk(KERN_DEBUG fmt, ##args)
#endif
+#define btrfs_printk_in_rcu(fs_info, fmt, args...) \
+do { \
+ rcu_read_lock(); \
+ btrfs_printk(fs_info, fmt, ##args); \
+ rcu_read_unlock(); \
+} while (0)
+
+#define btrfs_printk_ratelimited(fs_info, fmt, args...) \
+do { \
+ static DEFINE_RATELIMIT_STATE(_rs, \
+ DEFAULT_RATELIMIT_INTERVAL, \
+ DEFAULT_RATELIMIT_BURST); \
+ if (__ratelimit(&_rs)) \
+ btrfs_printk(fs_info, fmt, ##args); \
+} while (0)
+
+#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \
+do { \
+ rcu_read_lock(); \
+ btrfs_printk_ratelimited(fs_info, fmt, ##args); \
+ rcu_read_unlock(); \
+} while (0)
+
#ifdef CONFIG_BTRFS_ASSERT
__cold
@@ -4127,14 +4270,7 @@ do { \
__LINE__, (errno)); \
} while (0)
-#define btrfs_std_error(fs_info, errno) \
-do { \
- if ((errno)) \
- __btrfs_std_error((fs_info), __func__, \
- __LINE__, (errno), NULL); \
-} while (0)
-
-#define btrfs_error(fs_info, errno, fmt, args...) \
+#define btrfs_std_error(fs_info, errno, fmt, args...) \
do { \
__btrfs_std_error((fs_info), __func__, __LINE__, \
(errno), fmt, ##args); \
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index a2ae42720a6a..e0941fbb913c 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -463,6 +463,10 @@ static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node,
static void finish_one_item(struct btrfs_delayed_root *delayed_root)
{
int seq = atomic_inc_return(&delayed_root->items_seq);
+
+ /*
+ * atomic_dec_return implies a barrier for waitqueue_active
+ */
if ((atomic_dec_return(&delayed_root->items) <
BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) &&
waitqueue_active(&delayed_root->wait))
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index ac3e81da6d4e..e06dd75ad13f 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -197,6 +197,119 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
trans->delayed_ref_updates--;
}
+static bool merge_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head,
+ struct btrfs_delayed_ref_node *ref,
+ u64 seq)
+{
+ struct btrfs_delayed_ref_node *next;
+ bool done = false;
+
+ next = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node,
+ list);
+ while (!done && &next->list != &head->ref_list) {
+ int mod;
+ struct btrfs_delayed_ref_node *next2;
+
+ next2 = list_next_entry(next, list);
+
+ if (next == ref)
+ goto next;
+
+ if (seq && next->seq >= seq)
+ goto next;
+
+ if (next->type != ref->type)
+ goto next;
+
+ if ((ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
+ ref->type == BTRFS_SHARED_BLOCK_REF_KEY) &&
+ comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref),
+ btrfs_delayed_node_to_tree_ref(next),
+ ref->type))
+ goto next;
+ if ((ref->type == BTRFS_EXTENT_DATA_REF_KEY ||
+ ref->type == BTRFS_SHARED_DATA_REF_KEY) &&
+ comp_data_refs(btrfs_delayed_node_to_data_ref(ref),
+ btrfs_delayed_node_to_data_ref(next)))
+ goto next;
+
+ if (ref->action == next->action) {
+ mod = next->ref_mod;
+ } else {
+ if (ref->ref_mod < next->ref_mod) {
+ swap(ref, next);
+ done = true;
+ }
+ mod = -next->ref_mod;
+ }
+
+ drop_delayed_ref(trans, delayed_refs, head, next);
+ ref->ref_mod += mod;
+ if (ref->ref_mod == 0) {
+ drop_delayed_ref(trans, delayed_refs, head, ref);
+ done = true;
+ } else {
+ /*
+ * Can't have multiples of the same ref on a tree block.
+ */
+ WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
+ ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
+ }
+next:
+ next = next2;
+ }
+
+ return done;
+}
+
+void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_head *head)
+{
+ struct btrfs_delayed_ref_node *ref;
+ u64 seq = 0;
+
+ assert_spin_locked(&head->lock);
+
+ if (list_empty(&head->ref_list))
+ return;
+
+ /* We don't have too many refs to merge for data. */
+ if (head->is_data)
+ return;
+
+ spin_lock(&fs_info->tree_mod_seq_lock);
+ if (!list_empty(&fs_info->tree_mod_seq_list)) {
+ struct seq_list *elem;
+
+ elem = list_first_entry(&fs_info->tree_mod_seq_list,
+ struct seq_list, list);
+ seq = elem->seq;
+ }
+ spin_unlock(&fs_info->tree_mod_seq_lock);
+
+ ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node,
+ list);
+ while (&ref->list != &head->ref_list) {
+ if (seq && ref->seq >= seq)
+ goto next;
+
+ if (merge_ref(trans, delayed_refs, head, ref, seq)) {
+ if (list_empty(&head->ref_list))
+ break;
+ ref = list_first_entry(&head->ref_list,
+ struct btrfs_delayed_ref_node,
+ list);
+ continue;
+ }
+next:
+ ref = list_next_entry(ref, list);
+ }
+}
+
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
u64 seq)
@@ -292,8 +405,7 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans,
exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node,
list);
/* No need to compare bytenr nor is_head */
- if (exist->type != ref->type || exist->no_quota != ref->no_quota ||
- exist->seq != ref->seq)
+ if (exist->type != ref->type || exist->seq != ref->seq)
goto add_tail;
if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY ||
@@ -423,7 +535,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref,
struct btrfs_qgroup_extent_record *qrecord,
- u64 bytenr, u64 num_bytes, int action, int is_data)
+ u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved,
+ int action, int is_data)
{
struct btrfs_delayed_ref_head *existing;
struct btrfs_delayed_ref_head *head_ref = NULL;
@@ -432,6 +545,9 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
int count_mod = 1;
int must_insert_reserved = 0;
+ /* If reserved is provided, it must be a data extent. */
+ BUG_ON(!is_data && reserved);
+
/*
* the head node stores the sum of all the mods, so dropping a ref
* should drop the sum in the head node by one.
@@ -476,9 +592,16 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(&head_ref->ref_list);
head_ref->processing = 0;
head_ref->total_ref_mod = count_mod;
+ head_ref->qgroup_reserved = 0;
+ head_ref->qgroup_ref_root = 0;
/* Record qgroup extent info if provided */
if (qrecord) {
+ if (ref_root && reserved) {
+ head_ref->qgroup_ref_root = ref_root;
+ head_ref->qgroup_reserved = reserved;
+ }
+
qrecord->bytenr = bytenr;
qrecord->num_bytes = num_bytes;
qrecord->old_roots = NULL;
@@ -497,6 +620,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
existing = htree_insert(&delayed_refs->href_root,
&head_ref->href_node);
if (existing) {
+ WARN_ON(ref_root && reserved && existing->qgroup_ref_root
+ && existing->qgroup_reserved);
update_existing_head_ref(delayed_refs, &existing->node, ref);
/*
* we've updated the existing ref, free the newly
@@ -524,7 +649,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_head *head_ref,
struct btrfs_delayed_ref_node *ref, u64 bytenr,
u64 num_bytes, u64 parent, u64 ref_root, int level,
- int action, int no_quota)
+ int action)
{
struct btrfs_delayed_tree_ref *full_ref;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -546,7 +671,6 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
ref->action = action;
ref->is_head = 0;
ref->in_tree = 1;
- ref->no_quota = no_quota;
ref->seq = seq;
full_ref = btrfs_delayed_node_to_tree_ref(ref);
@@ -579,7 +703,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_head *head_ref,
struct btrfs_delayed_ref_node *ref, u64 bytenr,
u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
- u64 offset, int action, int no_quota)
+ u64 offset, int action)
{
struct btrfs_delayed_data_ref *full_ref;
struct btrfs_delayed_ref_root *delayed_refs;
@@ -602,7 +726,6 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
ref->action = action;
ref->is_head = 0;
ref->in_tree = 1;
- ref->no_quota = no_quota;
ref->seq = seq;
full_ref = btrfs_delayed_node_to_data_ref(ref);
@@ -633,17 +756,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 ref_root, int level, int action,
- struct btrfs_delayed_extent_op *extent_op,
- int no_quota)
+ struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_delayed_tree_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_qgroup_extent_record *record = NULL;
- if (!is_fstree(ref_root) || !fs_info->quota_enabled)
- no_quota = 0;
-
BUG_ON(extent_op && extent_op->is_data);
ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
if (!ref)
@@ -669,11 +788,10 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
* the spin lock
*/
head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
- bytenr, num_bytes, action, 0);
+ bytenr, num_bytes, 0, 0, action, 0);
add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
- num_bytes, parent, ref_root, level, action,
- no_quota);
+ num_bytes, parent, ref_root, level, action);
spin_unlock(&delayed_refs->lock);
return 0;
@@ -693,18 +811,14 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
u64 parent, u64 ref_root,
- u64 owner, u64 offset, int action,
- struct btrfs_delayed_extent_op *extent_op,
- int no_quota)
+ u64 owner, u64 offset, u64 reserved, int action,
+ struct btrfs_delayed_extent_op *extent_op)
{
struct btrfs_delayed_data_ref *ref;
struct btrfs_delayed_ref_head *head_ref;
struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_qgroup_extent_record *record = NULL;
- if (!is_fstree(ref_root) || !fs_info->quota_enabled)
- no_quota = 0;
-
BUG_ON(extent_op && !extent_op->is_data);
ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
if (!ref)
@@ -736,16 +850,44 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
* the spin lock
*/
head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
- bytenr, num_bytes, action, 1);
+ bytenr, num_bytes, ref_root, reserved,
+ action, 1);
add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
num_bytes, parent, ref_root, owner, offset,
- action, no_quota);
+ action);
spin_unlock(&delayed_refs->lock);
return 0;
}
+int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
+ u64 ref_root, u64 bytenr, u64 num_bytes)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_head *ref_head;
+ int ret = 0;
+
+ if (!fs_info->quota_enabled || !is_fstree(ref_root))
+ return 0;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+
+ spin_lock(&delayed_refs->lock);
+ ref_head = find_ref_head(&delayed_refs->href_root, bytenr, 0);
+ if (!ref_head) {
+ ret = -ENOENT;
+ goto out;
+ }
+ WARN_ON(ref_head->qgroup_reserved || ref_head->qgroup_ref_root);
+ ref_head->qgroup_ref_root = ref_root;
+ ref_head->qgroup_reserved = num_bytes;
+out:
+ spin_unlock(&delayed_refs->lock);
+ return ret;
+}
+
int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
@@ -764,7 +906,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
spin_lock(&delayed_refs->lock);
add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr,
- num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
+ num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD,
extent_op->is_data);
spin_unlock(&delayed_refs->lock);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 13fb5e6090fe..00ed02cbf3e9 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -68,7 +68,6 @@ struct btrfs_delayed_ref_node {
unsigned int action:8;
unsigned int type:8;
- unsigned int no_quota:1;
/* is this node still in the rbtree? */
unsigned int is_head:1;
unsigned int in_tree:1;
@@ -113,6 +112,17 @@ struct btrfs_delayed_ref_head {
int total_ref_mod;
/*
+ * For qgroup reserved space freeing.
+ *
+ * ref_root and reserved will be recorded after
+ * BTRFS_ADD_DELAYED_EXTENT is called.
+ * And will be used to free reserved qgroup space at
+ * run_delayed_refs() time.
+ */
+ u64 qgroup_ref_root;
+ u64 qgroup_reserved;
+
+ /*
* when a new extent is allocated, it is just reserved in memory
* The actual extent isn't inserted into the extent allocation tree
* until the delayed ref is processed. must_insert_reserved is
@@ -233,15 +243,16 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, u64 parent,
u64 ref_root, int level, int action,
- struct btrfs_delayed_extent_op *extent_op,
- int no_quota);
+ struct btrfs_delayed_extent_op *extent_op);
int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
u64 parent, u64 ref_root,
- u64 owner, u64 offset, int action,
- struct btrfs_delayed_extent_op *extent_op,
- int no_quota);
+ u64 owner, u64 offset, u64 reserved, int action,
+ struct btrfs_delayed_extent_op *extent_op);
+int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
+ struct btrfs_trans_handle *trans,
+ u64 ref_root, u64 bytenr, u64 num_bytes);
int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index e54dd5905cee..1e668fb7dd4c 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -327,19 +327,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
args->start.tgtdev_name[0] == '\0')
return -EINVAL;
- /*
- * Here we commit the transaction to make sure commit_total_bytes
- * of all the devices are updated.
- */
- trans = btrfs_attach_transaction(root);
- if (!IS_ERR(trans)) {
- ret = btrfs_commit_transaction(trans, root);
- if (ret)
- return ret;
- } else if (PTR_ERR(trans) != -ENOENT) {
- return PTR_ERR(trans);
- }
-
/* the disk copy procedure reuses the scrub code */
mutex_lock(&fs_info->volume_mutex);
ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
@@ -356,6 +343,19 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
if (ret)
return ret;
+ /*
+ * Here we commit the transaction to make sure commit_total_bytes
+ * of all the devices are updated.
+ */
+ trans = btrfs_attach_transaction(root);
+ if (!IS_ERR(trans)) {
+ ret = btrfs_commit_transaction(trans, root);
+ if (ret)
+ return ret;
+ } else if (PTR_ERR(trans) != -ENOENT) {
+ return PTR_ERR(trans);
+ }
+
btrfs_dev_replace_lock(dev_replace);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
@@ -375,12 +375,8 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
WARN_ON(!tgt_device);
dev_replace->tgtdev = tgt_device;
- ret = btrfs_kobj_add_device(tgt_device->fs_devices, tgt_device);
- if (ret)
- btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret);
-
- printk_in_rcu(KERN_INFO
- "BTRFS: dev_replace from %s (devid %llu) to %s started\n",
+ btrfs_info_in_rcu(root->fs_info,
+ "dev_replace from %s (devid %llu) to %s started",
src_device->missing ? "<missing disk>" :
rcu_str_deref(src_device->name),
src_device->devid,
@@ -401,6 +397,10 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
btrfs_dev_replace_unlock(dev_replace);
+ ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
+ if (ret)
+ btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret);
+
btrfs_wait_ordered_roots(root->fs_info, -1);
/* force writing the updated state information to disk */
@@ -454,8 +454,7 @@ static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
{
clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
- if (waitqueue_active(&fs_info->replace_wait))
- wake_up(&fs_info->replace_wait);
+ wake_up(&fs_info->replace_wait);
}
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
@@ -523,8 +522,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
src_device,
tgt_device);
} else {
- printk_in_rcu(KERN_ERR
- "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
+ btrfs_err_in_rcu(root->fs_info,
+ "btrfs_scrub_dev(%s, %llu, %s) failed %d",
src_device->missing ? "<missing disk>" :
rcu_str_deref(src_device->name),
src_device->devid,
@@ -540,8 +539,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
return scrub_ret;
}
- printk_in_rcu(KERN_INFO
- "BTRFS: dev_replace from %s (devid %llu) to %s finished\n",
+ btrfs_info_in_rcu(root->fs_info,
+ "dev_replace from %s (devid %llu) to %s finished",
src_device->missing ? "<missing disk>" :
rcu_str_deref(src_device->name),
src_device->devid,
@@ -586,7 +585,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
mutex_unlock(&uuid_mutex);
/* replace the sysfs entry */
- btrfs_kobj_rm_device(fs_info->fs_devices, src_device);
+ btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device);
btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
/* write back the superblocks */
@@ -809,8 +808,8 @@ static int btrfs_dev_replace_kthread(void *data)
progress = status_args->status.progress_1000;
kfree(status_args);
progress = div_u64(progress, 10);
- printk_in_rcu(KERN_INFO
- "BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
+ btrfs_info_in_rcu(fs_info,
+ "continuing dev_replace from %s (devid %llu) to %s @%u%%",
dev_replace->srcdev->missing ? "<missing disk>" :
rcu_str_deref(dev_replace->srcdev->name),
dev_replace->srcdev->devid,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0d98aee34fee..974be09e7556 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -319,9 +319,9 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
memcpy(&found, result, csum_size);
read_extent_buffer(buf, &val, 0, csum_size);
- printk_ratelimited(KERN_WARNING
- "BTRFS: %s checksum verify failed on %llu wanted %X found %X "
- "level %d\n",
+ btrfs_warn_rl(fs_info,
+ "%s checksum verify failed on %llu wanted %X found %X "
+ "level %d",
fs_info->sb->s_id, buf->start,
val, found, btrfs_header_level(buf));
if (result != (char *)&inline_result)
@@ -368,9 +368,9 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
ret = 0;
goto out;
}
- printk_ratelimited(KERN_ERR
- "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
- eb->fs_info->sb->s_id, eb->start,
+ btrfs_err_rl(eb->fs_info,
+ "parent transid verify failed on %llu wanted %llu found %llu",
+ eb->start,
parent_transid, btrfs_header_generation(eb));
ret = 1;
@@ -629,15 +629,14 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
found_start = btrfs_header_bytenr(eb);
if (found_start != eb->start) {
- printk_ratelimited(KERN_ERR "BTRFS (device %s): bad tree block start "
- "%llu %llu\n",
- eb->fs_info->sb->s_id, found_start, eb->start);
+ btrfs_err_rl(eb->fs_info, "bad tree block start %llu %llu",
+ found_start, eb->start);
ret = -EIO;
goto err;
}
if (check_tree_block_fsid(root->fs_info, eb)) {
- printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n",
- eb->fs_info->sb->s_id, eb->start);
+ btrfs_err_rl(eb->fs_info, "bad fsid on block %llu",
+ eb->start);
ret = -EIO;
goto err;
}
@@ -802,6 +801,9 @@ static void run_one_async_done(struct btrfs_work *work)
limit = btrfs_async_submit_limit(fs_info);
limit = limit * 2 / 3;
+ /*
+ * atomic_dec_return implies a barrier for waitqueue_active
+ */
if (atomic_dec_return(&fs_info->nr_async_submits) < limit &&
waitqueue_active(&fs_info->async_submit_wait))
wake_up(&fs_info->async_submit_wait);
@@ -1265,6 +1267,7 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
atomic_set(&root->orphan_inodes, 0);
atomic_set(&root->refs, 1);
atomic_set(&root->will_be_snapshoted, 0);
+ atomic_set(&root->qgroup_meta_rsv, 0);
root->log_transid = 0;
root->log_transid_committed = -1;
root->last_log_commit = 0;
@@ -1759,6 +1762,7 @@ static int cleaner_kthread(void *arg)
int again;
struct btrfs_trans_handle *trans;
+ set_freezable();
do {
again = 0;
@@ -2348,8 +2352,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
u64 bytenr = btrfs_super_log_root(disk_super);
if (fs_devices->rw_devices == 0) {
- printk(KERN_WARNING "BTRFS: log replay required "
- "on RO media\n");
+ btrfs_warn(fs_info, "log replay required on RO media");
return -EIO;
}
@@ -2364,12 +2367,12 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
log_tree_root->node = read_tree_block(tree_root, bytenr,
fs_info->generation + 1);
if (IS_ERR(log_tree_root->node)) {
- printk(KERN_ERR "BTRFS: failed to read log tree\n");
+ btrfs_warn(fs_info, "failed to read log tree");
ret = PTR_ERR(log_tree_root->node);
kfree(log_tree_root);
return ret;
} else if (!extent_buffer_uptodate(log_tree_root->node)) {
- printk(KERN_ERR "BTRFS: failed to read log tree\n");
+ btrfs_err(fs_info, "failed to read log tree");
free_extent_buffer(log_tree_root->node);
kfree(log_tree_root);
return -EIO;
@@ -2377,7 +2380,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
/* returns with log_tree_root freed on success */
ret = btrfs_recover_log_trees(log_tree_root);
if (ret) {
- btrfs_error(tree_root->fs_info, ret,
+ btrfs_std_error(tree_root->fs_info, ret,
"Failed to recover log tree");
free_extent_buffer(log_tree_root->node);
kfree(log_tree_root);
@@ -2572,7 +2575,7 @@ int open_ctree(struct super_block *sb,
fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
/* readahead state */
- INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
+ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
spin_lock_init(&fs_info->reada_lock);
fs_info->thread_pool_size = min_t(unsigned long,
@@ -2653,8 +2656,8 @@ int open_ctree(struct super_block *sb,
* Read super block and check the signature bytes only
*/
bh = btrfs_read_dev_super(fs_devices->latest_bdev);
- if (!bh) {
- err = -EINVAL;
+ if (IS_ERR(bh)) {
+ err = PTR_ERR(bh);
goto fail_alloc;
}
@@ -2847,6 +2850,8 @@ int open_ctree(struct super_block *sb,
!extent_buffer_uptodate(chunk_root->node)) {
printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n",
sb->s_id);
+ if (!IS_ERR(chunk_root->node))
+ free_extent_buffer(chunk_root->node);
chunk_root->node = NULL;
goto fail_tree_roots;
}
@@ -2885,6 +2890,8 @@ retry_root_backup:
!extent_buffer_uptodate(tree_root->node)) {
printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n",
sb->s_id);
+ if (!IS_ERR(tree_root->node))
+ free_extent_buffer(tree_root->node);
tree_root->node = NULL;
goto recovery_tree_root;
}
@@ -2933,7 +2940,7 @@ retry_root_backup:
goto fail_fsdev_sysfs;
}
- ret = btrfs_sysfs_add_one(fs_info);
+ ret = btrfs_sysfs_add_mounted(fs_info);
if (ret) {
pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
goto fail_fsdev_sysfs;
@@ -3113,7 +3120,7 @@ fail_cleaner:
filemap_write_and_wait(fs_info->btree_inode->i_mapping);
fail_sysfs:
- btrfs_sysfs_remove_one(fs_info);
+ btrfs_sysfs_remove_mounted(fs_info);
fail_fsdev_sysfs:
btrfs_sysfs_remove_fsid(fs_info->fs_devices);
@@ -3175,8 +3182,8 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
struct btrfs_device *device = (struct btrfs_device *)
bh->b_private;
- printk_ratelimited_in_rcu(KERN_WARNING "BTRFS: lost page write due to "
- "I/O error on %s\n",
+ btrfs_warn_rl_in_rcu(device->dev_root->fs_info,
+ "lost page write due to IO error on %s",
rcu_str_deref(device->name));
/* note, we dont' set_buffer_write_io_error because we have
* our own ways of dealing with the IO errors
@@ -3188,6 +3195,37 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
put_bh(bh);
}
+int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
+ struct buffer_head **bh_ret)
+{
+ struct buffer_head *bh;
+ struct btrfs_super_block *super;
+ u64 bytenr;
+
+ bytenr = btrfs_sb_offset(copy_num);
+ if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
+ return -EINVAL;
+
+ bh = __bread(bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE);
+ /*
+ * If we fail to read from the underlying devices, as of now
+ * the best option we have is to mark it EIO.
+ */
+ if (!bh)
+ return -EIO;
+
+ super = (struct btrfs_super_block *)bh->b_data;
+ if (btrfs_super_bytenr(super) != bytenr ||
+ btrfs_super_magic(super) != BTRFS_MAGIC) {
+ brelse(bh);
+ return -EINVAL;
+ }
+
+ *bh_ret = bh;
+ return 0;
+}
+
+
struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
{
struct buffer_head *bh;
@@ -3195,7 +3233,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
struct btrfs_super_block *super;
int i;
u64 transid = 0;
- u64 bytenr;
+ int ret = -EINVAL;
/* we would like to check all the supers, but that would make
* a btrfs mount succeed after a mkfs from a different FS.
@@ -3203,21 +3241,11 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
* later supers, using BTRFS_SUPER_MIRROR_MAX instead
*/
for (i = 0; i < 1; i++) {
- bytenr = btrfs_sb_offset(i);
- if (bytenr + BTRFS_SUPER_INFO_SIZE >=
- i_size_read(bdev->bd_inode))
- break;
- bh = __bread(bdev, bytenr / 4096,
- BTRFS_SUPER_INFO_SIZE);
- if (!bh)
+ ret = btrfs_read_dev_one_super(bdev, i, &bh);
+ if (ret)
continue;
super = (struct btrfs_super_block *)bh->b_data;
- if (btrfs_super_bytenr(super) != bytenr ||
- btrfs_super_magic(super) != BTRFS_MAGIC) {
- brelse(bh);
- continue;
- }
if (!latest || btrfs_super_generation(super) > transid) {
brelse(latest);
@@ -3227,6 +3255,10 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
brelse(bh);
}
}
+
+ if (!latest)
+ return ERR_PTR(ret);
+
return latest;
}
@@ -3295,8 +3327,9 @@ static int write_dev_supers(struct btrfs_device *device,
bh = __getblk(device->bdev, bytenr / 4096,
BTRFS_SUPER_INFO_SIZE);
if (!bh) {
- printk(KERN_ERR "BTRFS: couldn't get super "
- "buffer head for bytenr %Lu\n", bytenr);
+ btrfs_err(device->dev_root->fs_info,
+ "couldn't get super buffer head for bytenr %llu",
+ bytenr);
errors++;
continue;
}
@@ -3445,22 +3478,31 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
{
- if ((flags & (BTRFS_BLOCK_GROUP_DUP |
- BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_AVAIL_ALLOC_BIT_SINGLE)) ||
- ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0))
- return 0;
+ int raid_type;
+ int min_tolerated = INT_MAX;
- if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
- BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID10))
- return 1;
+ if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
+ (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
+ min_tolerated = min(min_tolerated,
+ btrfs_raid_array[BTRFS_RAID_SINGLE].
+ tolerated_failures);
+
+ for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
+ if (raid_type == BTRFS_RAID_SINGLE)
+ continue;
+ if (!(flags & btrfs_raid_group[raid_type]))
+ continue;
+ min_tolerated = min(min_tolerated,
+ btrfs_raid_array[raid_type].
+ tolerated_failures);
+ }
- if (flags & BTRFS_BLOCK_GROUP_RAID6)
- return 2;
+ if (min_tolerated == INT_MAX) {
+ pr_warn("BTRFS: unknown raid flag: %llu\n", flags);
+ min_tolerated = 0;
+ }
- pr_warn("BTRFS: unknown raid type: %llu\n", flags);
- return 0;
+ return min_tolerated;
}
int btrfs_calc_num_tolerated_disk_barrier_failures(
@@ -3544,7 +3586,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
if (ret) {
mutex_unlock(
&root->fs_info->fs_devices->device_list_mutex);
- btrfs_error(root->fs_info, ret,
+ btrfs_std_error(root->fs_info, ret,
"errors while submitting device barriers.");
return ret;
}
@@ -3584,7 +3626,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
/* FUA is masked off if unsupported and can't be the reason */
- btrfs_error(root->fs_info, -EIO,
+ btrfs_std_error(root->fs_info, -EIO,
"%d errors while writing supers", total_errors);
return -EIO;
}
@@ -3602,7 +3644,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
}
mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
if (total_errors > max_errors) {
- btrfs_error(root->fs_info, -EIO,
+ btrfs_std_error(root->fs_info, -EIO,
"%d errors while writing supers", total_errors);
return -EIO;
}
@@ -3738,6 +3780,9 @@ void close_ctree(struct btrfs_root *root)
fs_info->closing = 1;
smp_mb();
+ /* wait for the qgroup rescan worker to stop */
+ btrfs_qgroup_wait_for_completion(fs_info);
+
/* wait for the uuid_scan task to finish */
down(&fs_info->uuid_tree_rescan_sem);
/* avoid complains from lockdep et al., set sem back to initial state */
@@ -3765,9 +3810,7 @@ void close_ctree(struct btrfs_root *root)
* block groups queued for removal, the deletion will be
* skipped when we quit the cleaner thread.
*/
- mutex_lock(&root->fs_info->cleaner_mutex);
btrfs_delete_unused_bgs(root->fs_info);
- mutex_unlock(&root->fs_info->cleaner_mutex);
ret = btrfs_commit_super(root);
if (ret)
@@ -3790,7 +3833,7 @@ void close_ctree(struct btrfs_root *root)
percpu_counter_sum(&fs_info->delalloc_bytes));
}
- btrfs_sysfs_remove_one(fs_info);
+ btrfs_sysfs_remove_mounted(fs_info);
btrfs_sysfs_remove_fsid(fs_info->fs_devices);
btrfs_free_fs_roots(fs_info);
@@ -4288,25 +4331,6 @@ again:
return 0;
}
-static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans,
- struct btrfs_fs_info *fs_info)
-{
- struct btrfs_ordered_extent *ordered;
-
- spin_lock(&fs_info->trans_lock);
- while (!list_empty(&cur_trans->pending_ordered)) {
- ordered = list_first_entry(&cur_trans->pending_ordered,
- struct btrfs_ordered_extent,
- trans_list);
- list_del_init(&ordered->trans_list);
- spin_unlock(&fs_info->trans_lock);
-
- btrfs_put_ordered_extent(ordered);
- spin_lock(&fs_info->trans_lock);
- }
- spin_unlock(&fs_info->trans_lock);
-}
-
void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
struct btrfs_root *root)
{
@@ -4318,7 +4342,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
cur_trans->state = TRANS_STATE_UNBLOCKED;
wake_up(&root->fs_info->transaction_wait);
- btrfs_free_pending_ordered(cur_trans, root->fs_info);
btrfs_destroy_delayed_inodes(root);
btrfs_assert_delayed_root_empty(root);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index bdfb479ea859..adeb31830b9c 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -60,6 +60,8 @@ void close_ctree(struct btrfs_root *root);
int write_ctree_super(struct btrfs_trans_handle *trans,
struct btrfs_root *root, int max_mirrors);
struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
+int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
+ struct buffer_head **bh_ret);
int btrfs_commit_super(struct btrfs_root *root);
struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
u64 bytenr);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 8d052209f473..2513a7f53334 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -112,11 +112,11 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
u32 generation;
if (fh_type == FILEID_BTRFS_WITH_PARENT) {
- if (fh_len != BTRFS_FID_SIZE_CONNECTABLE)
+ if (fh_len < BTRFS_FID_SIZE_CONNECTABLE)
return NULL;
root_objectid = fid->root_objectid;
} else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) {
- if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT)
+ if (fh_len < BTRFS_FID_SIZE_CONNECTABLE_ROOT)
return NULL;
root_objectid = fid->parent_root_objectid;
} else
@@ -136,11 +136,11 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
u32 generation;
if ((fh_type != FILEID_BTRFS_WITH_PARENT ||
- fh_len != BTRFS_FID_SIZE_CONNECTABLE) &&
+ fh_len < BTRFS_FID_SIZE_CONNECTABLE) &&
(fh_type != FILEID_BTRFS_WITH_PARENT_ROOT ||
- fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) &&
+ fh_len < BTRFS_FID_SIZE_CONNECTABLE_ROOT) &&
(fh_type != FILEID_BTRFS_WITHOUT_PARENT ||
- fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE))
+ fh_len < BTRFS_FID_SIZE_NON_CONNECTABLE))
return NULL;
objectid = fid->objectid;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5411f0ab5683..acf3ed11cfb6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -95,8 +95,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
u64 flags, struct btrfs_disk_key *key,
- int level, struct btrfs_key *ins,
- int no_quota);
+ int level, struct btrfs_key *ins);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 flags,
int force);
@@ -332,6 +331,27 @@ static void put_caching_control(struct btrfs_caching_control *ctl)
kfree(ctl);
}
+#ifdef CONFIG_BTRFS_DEBUG
+static void fragment_free_space(struct btrfs_root *root,
+ struct btrfs_block_group_cache *block_group)
+{
+ u64 start = block_group->key.objectid;
+ u64 len = block_group->key.offset;
+ u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
+ root->nodesize : root->sectorsize;
+ u64 step = chunk << 1;
+
+ while (len > chunk) {
+ btrfs_remove_free_space(block_group, start, chunk);
+ start += step;
+ if (len < step)
+ len = 0;
+ else
+ len -= step;
+ }
+}
+#endif
+
/*
* this is only called by cache_block_group, since we could have freed extents
* we need to check the pinned_extents for any extents that can't be used yet
@@ -388,6 +408,7 @@ static noinline void caching_thread(struct btrfs_work *work)
u64 last = 0;
u32 nritems;
int ret = -ENOMEM;
+ bool wakeup = true;
caching_ctl = container_of(work, struct btrfs_caching_control, work);
block_group = caching_ctl->block_group;
@@ -400,6 +421,15 @@ static noinline void caching_thread(struct btrfs_work *work)
last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
+#ifdef CONFIG_BTRFS_DEBUG
+ /*
+ * If we're fragmenting we don't want to make anybody think we can
+ * allocate from this block group until we've had a chance to fragment
+ * the free space.
+ */
+ if (btrfs_should_fragment_free_space(extent_root, block_group))
+ wakeup = false;
+#endif
/*
* We don't want to deadlock with somebody trying to allocate a new
* extent for the extent root while also trying to search the extent
@@ -441,7 +471,8 @@ next:
if (need_resched() ||
rwsem_is_contended(&fs_info->commit_root_sem)) {
- caching_ctl->progress = last;
+ if (wakeup)
+ caching_ctl->progress = last;
btrfs_release_path(path);
up_read(&fs_info->commit_root_sem);
mutex_unlock(&caching_ctl->mutex);
@@ -464,7 +495,8 @@ next:
key.offset = 0;
key.type = BTRFS_EXTENT_ITEM_KEY;
- caching_ctl->progress = last;
+ if (wakeup)
+ caching_ctl->progress = last;
btrfs_release_path(path);
goto next;
}
@@ -491,7 +523,8 @@ next:
if (total_found > (1024 * 1024 * 2)) {
total_found = 0;
- wake_up(&caching_ctl->wait);
+ if (wakeup)
+ wake_up(&caching_ctl->wait);
}
}
path->slots[0]++;
@@ -501,13 +534,27 @@ next:
total_found += add_new_free_space(block_group, fs_info, last,
block_group->key.objectid +
block_group->key.offset);
- caching_ctl->progress = (u64)-1;
-
spin_lock(&block_group->lock);
block_group->caching_ctl = NULL;
block_group->cached = BTRFS_CACHE_FINISHED;
spin_unlock(&block_group->lock);
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_should_fragment_free_space(extent_root, block_group)) {
+ u64 bytes_used;
+
+ spin_lock(&block_group->space_info->lock);
+ spin_lock(&block_group->lock);
+ bytes_used = block_group->key.offset -
+ btrfs_block_group_used(&block_group->item);
+ block_group->space_info->bytes_used += bytes_used >> 1;
+ spin_unlock(&block_group->lock);
+ spin_unlock(&block_group->space_info->lock);
+ fragment_free_space(extent_root, block_group);
+ }
+#endif
+
+ caching_ctl->progress = (u64)-1;
err:
btrfs_free_path(path);
up_read(&fs_info->commit_root_sem);
@@ -607,6 +654,22 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
}
}
spin_unlock(&cache->lock);
+#ifdef CONFIG_BTRFS_DEBUG
+ if (ret == 1 &&
+ btrfs_should_fragment_free_space(fs_info->extent_root,
+ cache)) {
+ u64 bytes_used;
+
+ spin_lock(&cache->space_info->lock);
+ spin_lock(&cache->lock);
+ bytes_used = cache->key.offset -
+ btrfs_block_group_used(&cache->item);
+ cache->space_info->bytes_used += bytes_used >> 1;
+ spin_unlock(&cache->lock);
+ spin_unlock(&cache->space_info->lock);
+ fragment_free_space(fs_info->extent_root, cache);
+ }
+#endif
mutex_unlock(&caching_ctl->mutex);
wake_up(&caching_ctl->wait);
@@ -2009,8 +2072,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 owner, u64 offset,
- int no_quota)
+ u64 root_objectid, u64 owner, u64 offset)
{
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -2022,12 +2084,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
num_bytes,
parent, root_objectid, (int)owner,
- BTRFS_ADD_DELAYED_REF, NULL, no_quota);
+ BTRFS_ADD_DELAYED_REF, NULL);
} else {
ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
- num_bytes,
- parent, root_objectid, owner, offset,
- BTRFS_ADD_DELAYED_REF, NULL, no_quota);
+ num_bytes, parent, root_objectid,
+ owner, offset, 0,
+ BTRFS_ADD_DELAYED_REF, NULL);
}
return ret;
}
@@ -2048,15 +2110,11 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
u64 num_bytes = node->num_bytes;
u64 refs;
int ret;
- int no_quota = node->no_quota;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
- if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled)
- no_quota = 1;
-
path->reada = 1;
path->leave_spinning = 1;
/* this will setup the path even if it fails to insert the back ref */
@@ -2291,8 +2349,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
parent, ref_root,
extent_op->flags_to_set,
&extent_op->key,
- ref->level, &ins,
- node->no_quota);
+ ref->level, &ins);
} else if (node->action == BTRFS_ADD_DELAYED_REF) {
ret = __btrfs_inc_extent_ref(trans, root, node,
parent, ref_root,
@@ -2345,6 +2402,11 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
node->num_bytes);
}
}
+
+ /* Also free its reserved qgroup space */
+ btrfs_qgroup_free_delayed_ref(root->fs_info,
+ head->qgroup_ref_root,
+ head->qgroup_reserved);
return ret;
}
@@ -2433,7 +2495,21 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
}
}
+ /*
+ * We need to try and merge add/drops of the same ref since we
+ * can run into issues with relocate dropping the implicit ref
+ * and then it being added back again before the drop can
+ * finish. If we merged anything we need to re-loop so we can
+ * get a good ref.
+ * Or we can get node references of the same type that weren't
+ * merged when created due to bumps in the tree mod seq, and
+ * we need to merge them to prevent adding an inline extent
+ * backref before dropping it (triggering a BUG_ON at
+ * insert_inline_extent_backref()).
+ */
spin_lock(&locked_ref->lock);
+ btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
+ locked_ref);
/*
* locked_ref is the head node, so we have to go one
@@ -2828,6 +2904,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *head;
int ret;
int run_all = count == (unsigned long)-1;
+ bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
/* We'll clean this up in btrfs_cleanup_transaction */
if (trans->aborted)
@@ -2844,6 +2921,7 @@ again:
#ifdef SCRAMBLE_DELAYED_REFS
delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
#endif
+ trans->can_flush_pending_bgs = false;
ret = __btrfs_run_delayed_refs(trans, root, count);
if (ret < 0) {
btrfs_abort_transaction(trans, root, ret);
@@ -2893,6 +2971,7 @@ again:
}
out:
assert_qgroups_uptodate(trans);
+ trans->can_flush_pending_bgs = can_flush_pending_bgs;
return 0;
}
@@ -3106,7 +3185,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
int level;
int ret = 0;
int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
- u64, u64, u64, u64, u64, u64, int);
+ u64, u64, u64, u64, u64, u64);
if (btrfs_test_is_dummy_root(root))
@@ -3147,15 +3226,14 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
key.offset -= btrfs_file_extent_offset(buf, fi);
ret = process_func(trans, root, bytenr, num_bytes,
parent, ref_root, key.objectid,
- key.offset, 1);
+ key.offset);
if (ret)
goto fail;
} else {
bytenr = btrfs_node_blockptr(buf, i);
num_bytes = root->nodesize;
ret = process_func(trans, root, bytenr, num_bytes,
- parent, ref_root, level - 1, 0,
- 1);
+ parent, ref_root, level - 1, 0);
if (ret)
goto fail;
}
@@ -3336,6 +3414,15 @@ again:
spin_unlock(&block_group->lock);
/*
+ * We hit an ENOSPC when setting up the cache in this transaction, just
+ * skip doing the setup, we've already cleared the cache so we're safe.
+ */
+ if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
+ ret = -ENOSPC;
+ goto out_put;
+ }
+
+ /*
* Try to preallocate enough space based on how big the block group is.
* Keep in mind this has to include any pinned space which could end up
* taking up quite a bit since it's not folded into the other space
@@ -3348,16 +3435,26 @@ again:
num_pages *= 16;
num_pages *= PAGE_CACHE_SIZE;
- ret = btrfs_check_data_free_space(inode, num_pages, num_pages);
+ ret = btrfs_check_data_free_space(inode, 0, num_pages);
if (ret)
goto out_put;
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
num_pages, num_pages,
&alloc_hint);
+ /*
+ * Our cache requires contiguous chunks so that we don't modify a bunch
+ * of metadata or split extents when writing the cache out, which means
+ * we can enospc if we are heavily fragmented in addition to just normal
+ * out of space conditions. So if we hit this just skip setting up any
+ * other block groups for this transaction, maybe we'll unpin enough
+ * space the next time around.
+ */
if (!ret)
dcs = BTRFS_DC_SETUP;
- btrfs_free_reserved_data_space(inode, num_pages);
+ else if (ret == -ENOSPC)
+ set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
+ btrfs_free_reserved_data_space(inode, 0, num_pages);
out_put:
iput(inode);
@@ -3742,10 +3839,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
found->bytes_reserved = 0;
found->bytes_readonly = 0;
found->bytes_may_use = 0;
- if (total_bytes > 0)
- found->full = 0;
- else
- found->full = 1;
+ found->full = 0;
+ found->max_extent_size = 0;
found->force_alloc = CHUNK_ALLOC_NO_FORCE;
found->chunk_alloc = 0;
found->flush = 0;
@@ -3822,7 +3917,8 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
{
u64 num_devices = root->fs_info->fs_devices->rw_devices;
u64 target;
- u64 tmp;
+ u64 raid_type;
+ u64 allowed = 0;
/*
* see if restripe for this chunk_type is in progress, if so
@@ -3840,31 +3936,26 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
spin_unlock(&root->fs_info->balance_lock);
/* First, mask out the RAID levels which aren't possible */
- if (num_devices == 1)
- flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID5);
- if (num_devices < 3)
- flags &= ~BTRFS_BLOCK_GROUP_RAID6;
- if (num_devices < 4)
- flags &= ~BTRFS_BLOCK_GROUP_RAID10;
-
- tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
- BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
- flags &= ~tmp;
-
- if (tmp & BTRFS_BLOCK_GROUP_RAID6)
- tmp = BTRFS_BLOCK_GROUP_RAID6;
- else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
- tmp = BTRFS_BLOCK_GROUP_RAID5;
- else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
- tmp = BTRFS_BLOCK_GROUP_RAID10;
- else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
- tmp = BTRFS_BLOCK_GROUP_RAID1;
- else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
- tmp = BTRFS_BLOCK_GROUP_RAID0;
-
- return extended_to_chunk(flags | tmp);
+ for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
+ if (num_devices >= btrfs_raid_array[raid_type].devs_min)
+ allowed |= btrfs_raid_group[raid_type];
+ }
+ allowed &= flags;
+
+ if (allowed & BTRFS_BLOCK_GROUP_RAID6)
+ allowed = BTRFS_BLOCK_GROUP_RAID6;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
+ allowed = BTRFS_BLOCK_GROUP_RAID5;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
+ allowed = BTRFS_BLOCK_GROUP_RAID10;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
+ allowed = BTRFS_BLOCK_GROUP_RAID1;
+ else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
+ allowed = BTRFS_BLOCK_GROUP_RAID0;
+
+ flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
+
+ return extended_to_chunk(flags | allowed);
}
static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
@@ -3903,11 +3994,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
return ret;
}
-/*
- * This will check the space that the inode allocates from to make sure we have
- * enough space for bytes.
- */
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
+int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
{
struct btrfs_space_info *data_sinfo;
struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4006,7 +4093,8 @@ commit_trans:
if (IS_ERR(trans))
return PTR_ERR(trans);
if (have_pinned_space >= 0 ||
- trans->transaction->have_free_bgs ||
+ test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
+ &trans->transaction->flags) ||
need_commit > 0) {
ret = btrfs_commit_transaction(trans, root);
if (ret)
@@ -4028,38 +4116,86 @@ commit_trans:
data_sinfo->flags, bytes, 1);
return -ENOSPC;
}
- ret = btrfs_qgroup_reserve(root, write_bytes);
- if (ret)
- goto out;
data_sinfo->bytes_may_use += bytes;
trace_btrfs_space_reservation(root->fs_info, "space_info",
data_sinfo->flags, bytes, 1);
-out:
spin_unlock(&data_sinfo->lock);
return ret;
}
/*
- * Called if we need to clear a data reservation for this inode.
+ * New check_data_free_space() with ability for precious data reservation
+ * Will replace old btrfs_check_data_free_space(), but for patch split,
+ * add a new function first and then replace it.
+ */
+int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ int ret;
+
+ /* align the range */
+ len = round_up(start + len, root->sectorsize) -
+ round_down(start, root->sectorsize);
+ start = round_down(start, root->sectorsize);
+
+ ret = btrfs_alloc_data_chunk_ondemand(inode, len);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Use new btrfs_qgroup_reserve_data to reserve precious data space
+ *
+ * TODO: Find a good method to avoid reserve data space for NOCOW
+ * range, but don't impact performance on quota disable case.
+ */
+ ret = btrfs_qgroup_reserve_data(inode, start, len);
+ return ret;
+}
+
+/*
+ * Called if we need to clear a data reservation for this inode
+ * Normally in a error case.
+ *
+ * This one will *NOT* use accurate qgroup reserved space API, just for case
+ * which we can't sleep and is sure it won't affect qgroup reserved space.
+ * Like clear_bit_hook().
*/
-void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
+void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+ u64 len)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_space_info *data_sinfo;
- /* make sure bytes are sectorsize aligned */
- bytes = ALIGN(bytes, root->sectorsize);
+ /* Make sure the range is aligned to sectorsize */
+ len = round_up(start + len, root->sectorsize) -
+ round_down(start, root->sectorsize);
+ start = round_down(start, root->sectorsize);
data_sinfo = root->fs_info->data_sinfo;
spin_lock(&data_sinfo->lock);
- WARN_ON(data_sinfo->bytes_may_use < bytes);
- data_sinfo->bytes_may_use -= bytes;
+ if (WARN_ON(data_sinfo->bytes_may_use < len))
+ data_sinfo->bytes_may_use = 0;
+ else
+ data_sinfo->bytes_may_use -= len;
trace_btrfs_space_reservation(root->fs_info, "space_info",
- data_sinfo->flags, bytes, 0);
+ data_sinfo->flags, len, 0);
spin_unlock(&data_sinfo->lock);
}
+/*
+ * Called if we need to clear a data reservation for this inode
+ * Normally in a error case.
+ *
+ * This one will handle the per-indoe data rsv map for accurate reserved
+ * space framework.
+ */
+void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
+{
+ btrfs_free_reserved_data_space_noquota(inode, start, len);
+ btrfs_qgroup_free_data(inode, start, len);
+}
+
static void force_metadata_allocation(struct btrfs_fs_info *info)
{
struct list_head *head = &info->space_info;
@@ -4309,7 +4445,8 @@ out:
* the block groups that were made dirty during the lifetime of the
* transaction.
*/
- if (trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) {
+ if (trans->can_flush_pending_bgs &&
+ trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) {
btrfs_create_pending_block_groups(trans, trans->root);
btrfs_trans_release_chunk_metadata(trans);
}
@@ -4890,13 +5027,9 @@ static struct btrfs_block_rsv *get_block_rsv(
{
struct btrfs_block_rsv *block_rsv = NULL;
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
- block_rsv = trans->block_rsv;
-
- if (root == root->fs_info->csum_root && trans->adding_csums)
- block_rsv = trans->block_rsv;
-
- if (root == root->fs_info->uuid_root)
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
+ (root == root->fs_info->csum_root && trans->adding_csums) ||
+ (root == root->fs_info->uuid_root))
block_rsv = trans->block_rsv;
if (!block_rsv)
@@ -5339,7 +5472,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (root->fs_info->quota_enabled) {
/* One for parent inode, two for dir entries */
num_bytes = 3 * root->nodesize;
- ret = btrfs_qgroup_reserve(root, num_bytes);
+ ret = btrfs_qgroup_reserve_meta(root, num_bytes);
if (ret)
return ret;
} else {
@@ -5357,10 +5490,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (ret == -ENOSPC && use_global_rsv)
ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
- if (ret) {
- if (*qgroup_reserved)
- btrfs_qgroup_free(root, *qgroup_reserved);
- }
+ if (ret && *qgroup_reserved)
+ btrfs_qgroup_free_meta(root, *qgroup_reserved);
return ret;
}
@@ -5521,15 +5652,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
spin_unlock(&BTRFS_I(inode)->lock);
if (root->fs_info->quota_enabled) {
- ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize);
+ ret = btrfs_qgroup_reserve_meta(root,
+ nr_extents * root->nodesize);
if (ret)
goto out_fail;
}
ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
if (unlikely(ret)) {
- if (root->fs_info->quota_enabled)
- btrfs_qgroup_free(root, nr_extents * root->nodesize);
+ btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
goto out_fail;
}
@@ -5652,41 +5783,48 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
}
/**
- * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
+ * btrfs_delalloc_reserve_space - reserve data and metadata space for
+ * delalloc
* @inode: inode we're writing to
- * @num_bytes: the number of bytes we want to allocate
+ * @start: start range we are writing to
+ * @len: how long the range we are writing to
+ *
+ * TODO: This function will finally replace old btrfs_delalloc_reserve_space()
*
* This will do the following things
*
- * o reserve space in the data space info for num_bytes
- * o reserve space in the metadata space info based on number of outstanding
+ * o reserve space in data space info for num bytes
+ * and reserve precious corresponding qgroup space
+ * (Done in check_data_free_space)
+ *
+ * o reserve space for metadata space, based on the number of outstanding
* extents and how much csums will be needed
- * o add to the inodes ->delalloc_bytes
+ * also reserve metadata space in a per root over-reserve method.
+ * o add to the inodes->delalloc_bytes
* o add it to the fs_info's delalloc inodes list.
+ * (Above 3 all done in delalloc_reserve_metadata)
*
- * This will return 0 for success and -ENOSPC if there is no space left.
+ * Return 0 for success
+ * Return <0 for error(-ENOSPC or -EQUOT)
*/
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
{
int ret;
- ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes);
- if (ret)
- return ret;
-
- ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
- if (ret) {
- btrfs_free_reserved_data_space(inode, num_bytes);
+ ret = btrfs_check_data_free_space(inode, start, len);
+ if (ret < 0)
return ret;
- }
-
- return 0;
+ ret = btrfs_delalloc_reserve_metadata(inode, len);
+ if (ret < 0)
+ btrfs_free_reserved_data_space(inode, start, len);
+ return ret;
}
/**
* btrfs_delalloc_release_space - release data and metadata space for delalloc
* @inode: inode we're releasing space for
- * @num_bytes: the number of bytes we want to free up
+ * @start: start position of the space already reserved
+ * @len: the len of the space already reserved
*
* This must be matched with a call to btrfs_delalloc_reserve_space. This is
* called in the case that we don't need the metadata AND data reservations
@@ -5695,11 +5833,12 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
* This function will release the metadata space that was not used and will
* decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
* list if there are no delalloc bytes left.
+ * Also it will handle the qgroup reserved space.
*/
-void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
+void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
{
- btrfs_delalloc_release_metadata(inode, num_bytes);
- btrfs_free_reserved_data_space(inode, num_bytes);
+ btrfs_delalloc_release_metadata(inode, len);
+ btrfs_free_reserved_data_space(inode, start, len);
}
static int update_block_group(struct btrfs_trans_handle *trans,
@@ -6064,6 +6203,34 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
update_global_block_rsv(fs_info);
}
+/*
+ * Returns the free cluster for the given space info and sets empty_cluster to
+ * what it should be based on the mount options.
+ */
+static struct btrfs_free_cluster *
+fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
+ u64 *empty_cluster)
+{
+ struct btrfs_free_cluster *ret = NULL;
+ bool ssd = btrfs_test_opt(root, SSD);
+
+ *empty_cluster = 0;
+ if (btrfs_mixed_space_info(space_info))
+ return ret;
+
+ if (ssd)
+ *empty_cluster = 2 * 1024 * 1024;
+ if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
+ ret = &root->fs_info->meta_alloc_cluster;
+ if (!ssd)
+ *empty_cluster = 64 * 1024;
+ } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) {
+ ret = &root->fs_info->data_alloc_cluster;
+ }
+
+ return ret;
+}
+
static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
const bool return_free_space)
{
@@ -6071,7 +6238,10 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
struct btrfs_block_group_cache *cache = NULL;
struct btrfs_space_info *space_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ struct btrfs_free_cluster *cluster = NULL;
u64 len;
+ u64 total_unpinned = 0;
+ u64 empty_cluster = 0;
bool readonly;
while (start <= end) {
@@ -6080,8 +6250,14 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
start >= cache->key.objectid + cache->key.offset) {
if (cache)
btrfs_put_block_group(cache);
+ total_unpinned = 0;
cache = btrfs_lookup_block_group(fs_info, start);
BUG_ON(!cache); /* Logic error */
+
+ cluster = fetch_cluster_info(root,
+ cache->space_info,
+ &empty_cluster);
+ empty_cluster <<= 1;
}
len = cache->key.objectid + cache->key.offset - start;
@@ -6094,12 +6270,27 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
}
start += len;
+ total_unpinned += len;
space_info = cache->space_info;
+ /*
+ * If this space cluster has been marked as fragmented and we've
+ * unpinned enough in this block group to potentially allow a
+ * cluster to be created inside of it go ahead and clear the
+ * fragmented check.
+ */
+ if (cluster && cluster->fragmented &&
+ total_unpinned > empty_cluster) {
+ spin_lock(&cluster->lock);
+ cluster->fragmented = 0;
+ spin_unlock(&cluster->lock);
+ }
+
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
cache->pinned -= len;
space_info->bytes_pinned -= len;
+ space_info->max_extent_size = 0;
percpu_counter_add(&space_info->total_bytes_pinned, -len);
if (cache->ro) {
space_info->bytes_readonly += len;
@@ -6232,7 +6423,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
int extent_slot = 0;
int found_extent = 0;
int num_to_del = 1;
- int no_quota = node->no_quota;
u32 item_size;
u64 refs;
u64 bytenr = node->bytenr;
@@ -6241,9 +6431,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
SKINNY_METADATA);
- if (!info->quota_enabled || !is_fstree(root_objectid))
- no_quota = 1;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -6569,7 +6756,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
buf->start, buf->len,
parent, root->root_key.objectid,
btrfs_header_level(buf),
- BTRFS_DROP_DELAYED_REF, NULL, 0);
+ BTRFS_DROP_DELAYED_REF, NULL);
BUG_ON(ret); /* -ENOMEM */
}
@@ -6617,7 +6804,7 @@ out:
/* Can return -ENOMEM */
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
- u64 owner, u64 offset, int no_quota)
+ u64 owner, u64 offset)
{
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -6640,13 +6827,13 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
num_bytes,
parent, root_objectid, (int)owner,
- BTRFS_DROP_DELAYED_REF, NULL, no_quota);
+ BTRFS_DROP_DELAYED_REF, NULL);
} else {
ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
num_bytes,
parent, root_objectid, owner,
- offset, BTRFS_DROP_DELAYED_REF,
- NULL, no_quota);
+ offset, 0,
+ BTRFS_DROP_DELAYED_REF, NULL);
}
return ret;
}
@@ -6832,7 +7019,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
struct btrfs_block_group_cache *block_group = NULL;
u64 search_start = 0;
u64 max_extent_size = 0;
- int empty_cluster = 2 * 1024 * 1024;
+ u64 empty_cluster = 0;
struct btrfs_space_info *space_info;
int loop = 0;
int index = __get_raid_index(flags);
@@ -6842,6 +7029,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
bool failed_alloc = false;
bool use_cluster = true;
bool have_caching_bg = false;
+ bool orig_have_caching_bg = false;
+ bool full_search = false;
WARN_ON(num_bytes < root->sectorsize);
ins->type = BTRFS_EXTENT_ITEM_KEY;
@@ -6857,36 +7046,47 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
}
/*
- * If the space info is for both data and metadata it means we have a
- * small filesystem and we can't use the clustering stuff.
+ * If our free space is heavily fragmented we may not be able to make
+ * big contiguous allocations, so instead of doing the expensive search
+ * for free space, simply return ENOSPC with our max_extent_size so we
+ * can go ahead and search for a more manageable chunk.
+ *
+ * If our max_extent_size is large enough for our allocation simply
+ * disable clustering since we will likely not be able to find enough
+ * space to create a cluster and induce latency trying.
*/
- if (btrfs_mixed_space_info(space_info))
- use_cluster = false;
-
- if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
- last_ptr = &root->fs_info->meta_alloc_cluster;
- if (!btrfs_test_opt(root, SSD))
- empty_cluster = 64 * 1024;
- }
-
- if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
- btrfs_test_opt(root, SSD)) {
- last_ptr = &root->fs_info->data_alloc_cluster;
+ if (unlikely(space_info->max_extent_size)) {
+ spin_lock(&space_info->lock);
+ if (space_info->max_extent_size &&
+ num_bytes > space_info->max_extent_size) {
+ ins->offset = space_info->max_extent_size;
+ spin_unlock(&space_info->lock);
+ return -ENOSPC;
+ } else if (space_info->max_extent_size) {
+ use_cluster = false;
+ }
+ spin_unlock(&space_info->lock);
}
+ last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster);
if (last_ptr) {
spin_lock(&last_ptr->lock);
if (last_ptr->block_group)
hint_byte = last_ptr->window_start;
+ if (last_ptr->fragmented) {
+ /*
+ * We still set window_start so we can keep track of the
+ * last place we found an allocation to try and save
+ * some time.
+ */
+ hint_byte = last_ptr->window_start;
+ use_cluster = false;
+ }
spin_unlock(&last_ptr->lock);
}
search_start = max(search_start, first_logical_byte(root, 0));
search_start = max(search_start, hint_byte);
-
- if (!last_ptr)
- empty_cluster = 0;
-
if (search_start == hint_byte) {
block_group = btrfs_lookup_block_group(root->fs_info,
search_start);
@@ -6921,6 +7121,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
}
search:
have_caching_bg = false;
+ if (index == 0 || index == __get_raid_index(flags))
+ full_search = true;
down_read(&space_info->groups_sem);
list_for_each_entry(block_group, &space_info->block_groups[index],
list) {
@@ -6954,6 +7156,7 @@ search:
have_block_group:
cached = block_group_cache_done(block_group);
if (unlikely(!cached)) {
+ have_caching_bg = true;
ret = cache_block_group(block_group, 0);
BUG_ON(ret < 0);
ret = 0;
@@ -6968,7 +7171,7 @@ have_block_group:
* Ok we want to try and use the cluster allocator, so
* lets look there
*/
- if (last_ptr) {
+ if (last_ptr && use_cluster) {
struct btrfs_block_group_cache *used_block_group;
unsigned long aligned_cluster;
/*
@@ -7094,6 +7297,16 @@ refill_cluster:
}
unclustered_alloc:
+ /*
+ * We are doing an unclustered alloc, set the fragmented flag so
+ * we don't bother trying to setup a cluster again until we get
+ * more space.
+ */
+ if (unlikely(last_ptr)) {
+ spin_lock(&last_ptr->lock);
+ last_ptr->fragmented = 1;
+ spin_unlock(&last_ptr->lock);
+ }
spin_lock(&block_group->free_space_ctl->tree_lock);
if (cached &&
block_group->free_space_ctl->free_space <
@@ -7126,8 +7339,6 @@ unclustered_alloc:
failed_alloc = true;
goto have_block_group;
} else if (!offset) {
- if (!cached)
- have_caching_bg = true;
goto loop;
}
checks:
@@ -7168,6 +7379,10 @@ loop:
}
up_read(&space_info->groups_sem);
+ if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
+ && !orig_have_caching_bg)
+ orig_have_caching_bg = true;
+
if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
goto search;
@@ -7184,7 +7399,20 @@ loop:
*/
if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
index = 0;
- loop++;
+ if (loop == LOOP_CACHING_NOWAIT) {
+ /*
+ * We want to skip the LOOP_CACHING_WAIT step if we
+ * don't have any unached bgs and we've alrelady done a
+ * full search through.
+ */
+ if (orig_have_caching_bg || !full_search)
+ loop = LOOP_CACHING_WAIT;
+ else
+ loop = LOOP_ALLOC_CHUNK;
+ } else {
+ loop++;
+ }
+
if (loop == LOOP_ALLOC_CHUNK) {
struct btrfs_trans_handle *trans;
int exist = 0;
@@ -7202,6 +7430,15 @@ loop:
ret = do_chunk_alloc(trans, root, flags,
CHUNK_ALLOC_FORCE);
+
+ /*
+ * If we can't allocate a new chunk we've already looped
+ * through at least once, move on to the NO_EMPTY_SIZE
+ * case.
+ */
+ if (ret == -ENOSPC)
+ loop = LOOP_NO_EMPTY_SIZE;
+
/*
* Do not bail out on ENOSPC since we
* can do more things.
@@ -7218,6 +7455,15 @@ loop:
}
if (loop == LOOP_NO_EMPTY_SIZE) {
+ /*
+ * Don't loop again if we already have no empty_size and
+ * no empty_cluster.
+ */
+ if (empty_size == 0 &&
+ empty_cluster == 0) {
+ ret = -ENOSPC;
+ goto out;
+ }
empty_size = 0;
empty_cluster = 0;
}
@@ -7226,11 +7472,20 @@ loop:
} else if (!ins->objectid) {
ret = -ENOSPC;
} else if (ins->objectid) {
+ if (!use_cluster && last_ptr) {
+ spin_lock(&last_ptr->lock);
+ last_ptr->window_start = ins->objectid;
+ spin_unlock(&last_ptr->lock);
+ }
ret = 0;
}
out:
- if (ret == -ENOSPC)
+ if (ret == -ENOSPC) {
+ spin_lock(&space_info->lock);
+ space_info->max_extent_size = max_extent_size;
+ spin_unlock(&space_info->lock);
ins->offset = max_extent_size;
+ }
return ret;
}
@@ -7279,7 +7534,7 @@ int btrfs_reserve_extent(struct btrfs_root *root,
u64 empty_size, u64 hint_byte,
struct btrfs_key *ins, int is_data, int delalloc)
{
- bool final_tried = false;
+ bool final_tried = num_bytes == min_alloc_size;
u64 flags;
int ret;
@@ -7428,8 +7683,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
u64 flags, struct btrfs_disk_key *key,
- int level, struct btrfs_key *ins,
- int no_quota)
+ int level, struct btrfs_key *ins)
{
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -7510,7 +7764,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 root_objectid, u64 owner,
- u64 offset, struct btrfs_key *ins)
+ u64 offset, u64 ram_bytes,
+ struct btrfs_key *ins)
{
int ret;
@@ -7519,7 +7774,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
ins->offset, 0,
root_objectid, owner, offset,
- BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
+ ram_bytes, BTRFS_ADD_DELAYED_EXTENT,
+ NULL);
return ret;
}
@@ -7733,7 +7989,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
ins.objectid, ins.offset,
parent, root_objectid, level,
BTRFS_ADD_DELAYED_EXTENT,
- extent_op, 0);
+ extent_op);
if (ret)
goto out_free_delayed;
}
@@ -8274,14 +8530,15 @@ skip:
ret = account_shared_subtree(trans, root, next,
generation, level - 1);
if (ret) {
- printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+ btrfs_err_rl(root->fs_info,
+ "Error "
"%d accounting shared subtree. Quota "
- "is out of sync, rescan required.\n",
- root->fs_info->sb->s_id, ret);
+ "is out of sync, rescan required.",
+ ret);
}
}
ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
- root->root_key.objectid, level - 1, 0, 0);
+ root->root_key.objectid, level - 1, 0);
BUG_ON(ret); /* -ENOMEM */
}
btrfs_tree_unlock(next);
@@ -8366,10 +8623,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
BUG_ON(ret); /* -ENOMEM */
ret = account_leaf_items(trans, root, eb);
if (ret) {
- printk_ratelimited(KERN_ERR "BTRFS: %s Error "
+ btrfs_err_rl(root->fs_info,
+ "error "
"%d accounting leaf items. Quota "
- "is out of sync, rescan required.\n",
- root->fs_info->sb->s_id, ret);
+ "is out of sync, rescan required.",
+ ret);
}
}
/* make block locked assertion in clean_tree_block happy */
@@ -8668,7 +8926,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
}
if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
- btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
+ btrfs_add_dropped_root(trans, root);
} else {
free_extent_buffer(root->node);
free_extent_buffer(root->commit_root);
@@ -8691,7 +8949,7 @@ out:
if (!for_reloc && root_dropped == false)
btrfs_add_dead_root(root);
if (err && err != -EAGAIN)
- btrfs_std_error(root->fs_info, err);
+ btrfs_std_error(root->fs_info, err, NULL);
return err;
}
@@ -8879,7 +9137,7 @@ again:
* back off and let this transaction commit
*/
mutex_lock(&root->fs_info->ro_block_group_mutex);
- if (trans->transaction->dirty_bg_run) {
+ if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
u64 transid = trans->transid;
mutex_unlock(&root->fs_info->ro_block_group_mutex);
@@ -9563,7 +9821,9 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
struct btrfs_block_group_item item;
struct btrfs_key key;
int ret = 0;
+ bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
+ trans->can_flush_pending_bgs = false;
list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
if (ret)
goto next;
@@ -9584,6 +9844,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
next:
list_del_init(&block_group->bg_list);
}
+ trans->can_flush_pending_bgs = can_flush_pending_bgs;
}
int btrfs_make_block_group(struct btrfs_trans_handle *trans,
@@ -9626,6 +9887,14 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
free_excluded_extents(root, cache);
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_should_fragment_free_space(root, cache)) {
+ u64 new_bytes_used = size - bytes_used;
+
+ bytes_used += new_bytes_used >> 1;
+ fragment_free_space(root, cache);
+ }
+#endif
/*
* Call to ensure the corresponding space_info object is created and
* assigned to our block group, but don't update its counters just yet.
@@ -10010,22 +10279,25 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
block_group = list_first_entry(&fs_info->unused_bgs,
struct btrfs_block_group_cache,
bg_list);
- space_info = block_group->space_info;
list_del_init(&block_group->bg_list);
+
+ space_info = block_group->space_info;
+
if (ret || btrfs_mixed_space_info(space_info)) {
btrfs_put_block_group(block_group);
continue;
}
spin_unlock(&fs_info->unused_bgs_lock);
- mutex_lock(&root->fs_info->delete_unused_bgs_mutex);
+ mutex_lock(&fs_info->delete_unused_bgs_mutex);
/* Don't want to race with allocators so take the groups_sem */
down_write(&space_info->groups_sem);
spin_lock(&block_group->lock);
if (block_group->reserved ||
btrfs_block_group_used(&block_group->item) ||
- block_group->ro) {
+ block_group->ro ||
+ list_is_singular(&block_group->list)) {
/*
* We want to bail if we made new allocations or have
* outstanding allocations in this block group. We do
@@ -10141,7 +10413,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
end_trans:
btrfs_end_transaction(trans, root);
next:
- mutex_unlock(&root->fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
btrfs_put_block_group(block_group);
spin_lock(&fs_info->unused_bgs_lock);
}
@@ -10366,8 +10638,7 @@ void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
{
percpu_counter_dec(&root->subv_writers->counter);
/*
- * Make sure counter is updated before we wake up
- * waiters.
+ * Make sure counter is updated before we wake up waiters.
*/
smp_mb();
if (waitqueue_active(&root->subv_writers->wait))
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index f1018cfbfefa..9abe18763a7f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -96,8 +96,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
inode = tree->mapping->host;
isize = i_size_read(inode);
if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
- printk_ratelimited(KERN_DEBUG
- "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
+ btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
+ "%s: ino %llu isize %llu odd range [%llu,%llu]",
caller, btrfs_ino(inode), isize, start, end);
}
}
@@ -131,6 +131,25 @@ struct extent_page_data {
unsigned int sync_io:1;
};
+static void add_extent_changeset(struct extent_state *state, unsigned bits,
+ struct extent_changeset *changeset,
+ int set)
+{
+ int ret;
+
+ if (!changeset)
+ return;
+ if (set && (state->state & bits) == bits)
+ return;
+ if (!set && (state->state & bits) == 0)
+ return;
+ changeset->bytes_changed += state->end - state->start + 1;
+ ret = ulist_add(changeset->range_changed, state->start, state->end,
+ GFP_ATOMIC);
+ /* ENOMEM */
+ BUG_ON(ret < 0);
+}
+
static noinline void flush_write_bio(void *data);
static inline struct btrfs_fs_info *
tree_fs_info(struct extent_io_tree *tree)
@@ -410,7 +429,8 @@ static void clear_state_cb(struct extent_io_tree *tree,
}
static void set_state_bits(struct extent_io_tree *tree,
- struct extent_state *state, unsigned *bits);
+ struct extent_state *state, unsigned *bits,
+ struct extent_changeset *changeset);
/*
* insert an extent_state struct into the tree. 'bits' are set on the
@@ -426,7 +446,7 @@ static int insert_state(struct extent_io_tree *tree,
struct extent_state *state, u64 start, u64 end,
struct rb_node ***p,
struct rb_node **parent,
- unsigned *bits)
+ unsigned *bits, struct extent_changeset *changeset)
{
struct rb_node *node;
@@ -436,7 +456,7 @@ static int insert_state(struct extent_io_tree *tree,
state->start = start;
state->end = end;
- set_state_bits(tree, state, bits);
+ set_state_bits(tree, state, bits, changeset);
node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
if (node) {
@@ -511,7 +531,8 @@ static struct extent_state *next_state(struct extent_state *state)
*/
static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
struct extent_state *state,
- unsigned *bits, int wake)
+ unsigned *bits, int wake,
+ struct extent_changeset *changeset)
{
struct extent_state *next;
unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
@@ -522,6 +543,7 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
tree->dirty_bytes -= range;
}
clear_state_cb(tree, state, bits);
+ add_extent_changeset(state, bits_to_clear, changeset, 0);
state->state &= ~bits_to_clear;
if (wake)
wake_up(&state->wq);
@@ -569,10 +591,10 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
*
* This takes the tree lock, and returns 0 on success and < 0 on error.
*/
-int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
- unsigned bits, int wake, int delete,
- struct extent_state **cached_state,
- gfp_t mask)
+static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, int wake, int delete,
+ struct extent_state **cached_state,
+ gfp_t mask, struct extent_changeset *changeset)
{
struct extent_state *state;
struct extent_state *cached;
@@ -594,7 +616,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
clear = 1;
again:
- if (!prealloc && (mask & __GFP_WAIT)) {
+ if (!prealloc && gfpflags_allow_blocking(mask)) {
/*
* Don't care for allocation failure here because we might end
* up not needing the pre-allocated extent state at all, which
@@ -671,7 +693,8 @@ hit_next:
if (err)
goto out;
if (state->end <= end) {
- state = clear_state_bit(tree, state, &bits, wake);
+ state = clear_state_bit(tree, state, &bits, wake,
+ changeset);
goto next;
}
goto search_again;
@@ -692,13 +715,13 @@ hit_next:
if (wake)
wake_up(&state->wq);
- clear_state_bit(tree, prealloc, &bits, wake);
+ clear_state_bit(tree, prealloc, &bits, wake, changeset);
prealloc = NULL;
goto out;
}
- state = clear_state_bit(tree, state, &bits, wake);
+ state = clear_state_bit(tree, state, &bits, wake, changeset);
next:
if (last_end == (u64)-1)
goto out;
@@ -718,7 +741,7 @@ search_again:
if (start > end)
goto out;
spin_unlock(&tree->lock);
- if (mask & __GFP_WAIT)
+ if (gfpflags_allow_blocking(mask))
cond_resched();
goto again;
}
@@ -789,7 +812,7 @@ out:
static void set_state_bits(struct extent_io_tree *tree,
struct extent_state *state,
- unsigned *bits)
+ unsigned *bits, struct extent_changeset *changeset)
{
unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
@@ -798,6 +821,7 @@ static void set_state_bits(struct extent_io_tree *tree,
u64 range = state->end - state->start + 1;
tree->dirty_bytes += range;
}
+ add_extent_changeset(state, bits_to_set, changeset, 1);
state->state |= bits_to_set;
}
@@ -835,7 +859,7 @@ static int __must_check
__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, unsigned exclusive_bits,
u64 *failed_start, struct extent_state **cached_state,
- gfp_t mask)
+ gfp_t mask, struct extent_changeset *changeset)
{
struct extent_state *state;
struct extent_state *prealloc = NULL;
@@ -850,7 +874,7 @@ __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
bits |= EXTENT_FIRST_DELALLOC;
again:
- if (!prealloc && (mask & __GFP_WAIT)) {
+ if (!prealloc && gfpflags_allow_blocking(mask)) {
prealloc = alloc_extent_state(mask);
BUG_ON(!prealloc);
}
@@ -873,7 +897,7 @@ again:
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
err = insert_state(tree, prealloc, start, end,
- &p, &parent, &bits);
+ &p, &parent, &bits, changeset);
if (err)
extent_io_tree_panic(tree, err);
@@ -899,7 +923,7 @@ hit_next:
goto out;
}
- set_state_bits(tree, state, &bits);
+ set_state_bits(tree, state, &bits, changeset);
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
@@ -945,7 +969,7 @@ hit_next:
if (err)
goto out;
if (state->end <= end) {
- set_state_bits(tree, state, &bits);
+ set_state_bits(tree, state, &bits, changeset);
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
@@ -980,7 +1004,7 @@ hit_next:
* the later extent.
*/
err = insert_state(tree, prealloc, start, this_end,
- NULL, NULL, &bits);
+ NULL, NULL, &bits, changeset);
if (err)
extent_io_tree_panic(tree, err);
@@ -1008,7 +1032,7 @@ hit_next:
if (err)
extent_io_tree_panic(tree, err);
- set_state_bits(tree, prealloc, &bits);
+ set_state_bits(tree, prealloc, &bits, changeset);
cache_state(prealloc, cached_state);
merge_state(tree, prealloc);
prealloc = NULL;
@@ -1028,7 +1052,7 @@ search_again:
if (start > end)
goto out;
spin_unlock(&tree->lock);
- if (mask & __GFP_WAIT)
+ if (gfpflags_allow_blocking(mask))
cond_resched();
goto again;
}
@@ -1038,7 +1062,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask)
{
return __set_extent_bit(tree, start, end, bits, 0, failed_start,
- cached_state, mask);
+ cached_state, mask, NULL);
}
@@ -1076,7 +1100,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
btrfs_debug_check_extent_io_range(tree, start, end);
again:
- if (!prealloc && (mask & __GFP_WAIT)) {
+ if (!prealloc && gfpflags_allow_blocking(mask)) {
/*
* Best effort, don't worry if extent state allocation fails
* here for the first iteration. We might have a cached state
@@ -1111,7 +1135,7 @@ again:
goto out;
}
err = insert_state(tree, prealloc, start, end,
- &p, &parent, &bits);
+ &p, &parent, &bits, NULL);
if (err)
extent_io_tree_panic(tree, err);
cache_state(prealloc, cached_state);
@@ -1130,9 +1154,9 @@ hit_next:
* Just lock what we found and keep going
*/
if (state->start == start && state->end <= end) {
- set_state_bits(tree, state, &bits);
+ set_state_bits(tree, state, &bits, NULL);
cache_state(state, cached_state);
- state = clear_state_bit(tree, state, &clear_bits, 0);
+ state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
@@ -1171,9 +1195,10 @@ hit_next:
if (err)
goto out;
if (state->end <= end) {
- set_state_bits(tree, state, &bits);
+ set_state_bits(tree, state, &bits, NULL);
cache_state(state, cached_state);
- state = clear_state_bit(tree, state, &clear_bits, 0);
+ state = clear_state_bit(tree, state, &clear_bits, 0,
+ NULL);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
@@ -1208,7 +1233,7 @@ hit_next:
* the later extent.
*/
err = insert_state(tree, prealloc, start, this_end,
- NULL, NULL, &bits);
+ NULL, NULL, &bits, NULL);
if (err)
extent_io_tree_panic(tree, err);
cache_state(prealloc, cached_state);
@@ -1233,9 +1258,9 @@ hit_next:
if (err)
extent_io_tree_panic(tree, err);
- set_state_bits(tree, prealloc, &bits);
+ set_state_bits(tree, prealloc, &bits, NULL);
cache_state(prealloc, cached_state);
- clear_state_bit(tree, prealloc, &clear_bits, 0);
+ clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
prealloc = NULL;
goto out;
}
@@ -1253,7 +1278,7 @@ search_again:
if (start > end)
goto out;
spin_unlock(&tree->lock);
- if (mask & __GFP_WAIT)
+ if (gfpflags_allow_blocking(mask))
cond_resched();
first_iteration = false;
goto again;
@@ -1274,6 +1299,30 @@ int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
NULL, mask);
}
+int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, gfp_t mask,
+ struct extent_changeset *changeset)
+{
+ /*
+ * We don't support EXTENT_LOCKED yet, as current changeset will
+ * record any bits changed, so for EXTENT_LOCKED case, it will
+ * either fail with -EEXIST or changeset will record the whole
+ * range.
+ */
+ BUG_ON(bits & EXTENT_LOCKED);
+
+ return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, mask,
+ changeset);
+}
+
+int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, int wake, int delete,
+ struct extent_state **cached, gfp_t mask)
+{
+ return __clear_extent_bit(tree, start, end, bits, wake, delete,
+ cached, mask, NULL);
+}
+
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, gfp_t mask)
{
@@ -1285,6 +1334,20 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);
}
+int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, gfp_t mask,
+ struct extent_changeset *changeset)
+{
+ /*
+ * Don't support EXTENT_LOCKED case, same reason as
+ * set_record_extent_bits().
+ */
+ BUG_ON(bits & EXTENT_LOCKED);
+
+ return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask,
+ changeset);
+}
+
int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state, gfp_t mask)
{
@@ -1343,7 +1406,7 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
while (1) {
err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
EXTENT_LOCKED, &failed_start,
- cached_state, GFP_NOFS);
+ cached_state, GFP_NOFS, NULL);
if (err == -EEXIST) {
wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
start = failed_start;
@@ -1365,7 +1428,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
u64 failed_start;
err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
- &failed_start, NULL, GFP_NOFS);
+ &failed_start, NULL, GFP_NOFS, NULL);
if (err == -EEXIST) {
if (failed_start > start)
clear_extent_bit(tree, start, failed_start - 1,
@@ -2078,8 +2141,8 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
return -EIO;
}
- printk_ratelimited_in_rcu(KERN_INFO
- "BTRFS: read error corrected: ino %llu off %llu (dev %s sector %llu)\n",
+ btrfs_info_rl_in_rcu(fs_info,
+ "read error corrected: ino %llu off %llu (dev %s sector %llu)",
btrfs_ino(inode), start,
rcu_str_deref(dev->name), sector);
bio_put(bio);
@@ -2798,7 +2861,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
bio_end_io_t end_io_func,
int mirror_num,
unsigned long prev_bio_flags,
- unsigned long bio_flags)
+ unsigned long bio_flags,
+ bool force_bio_submit)
{
int ret = 0;
struct bio *bio;
@@ -2814,6 +2878,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
contig = bio_end_sector(bio) == sector;
if (prev_bio_flags != bio_flags || !contig ||
+ force_bio_submit ||
merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
bio_add_page(bio, page, page_size, offset) < page_size) {
ret = submit_one_bio(rw, bio, mirror_num,
@@ -2910,7 +2975,8 @@ static int __do_readpage(struct extent_io_tree *tree,
get_extent_t *get_extent,
struct extent_map **em_cached,
struct bio **bio, int mirror_num,
- unsigned long *bio_flags, int rw)
+ unsigned long *bio_flags, int rw,
+ u64 *prev_em_start)
{
struct inode *inode = page->mapping->host;
u64 start = page_offset(page);
@@ -2958,6 +3024,7 @@ static int __do_readpage(struct extent_io_tree *tree,
}
while (cur <= end) {
unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
+ bool force_bio_submit = false;
if (cur >= last_byte) {
char *userpage;
@@ -3008,6 +3075,49 @@ static int __do_readpage(struct extent_io_tree *tree,
block_start = em->block_start;
if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
block_start = EXTENT_MAP_HOLE;
+
+ /*
+ * If we have a file range that points to a compressed extent
+ * and it's followed by a consecutive file range that points to
+ * to the same compressed extent (possibly with a different
+ * offset and/or length, so it either points to the whole extent
+ * or only part of it), we must make sure we do not submit a
+ * single bio to populate the pages for the 2 ranges because
+ * this makes the compressed extent read zero out the pages
+ * belonging to the 2nd range. Imagine the following scenario:
+ *
+ * File layout
+ * [0 - 8K] [8K - 24K]
+ * | |
+ * | |
+ * points to extent X, points to extent X,
+ * offset 4K, length of 8K offset 0, length 16K
+ *
+ * [extent X, compressed length = 4K uncompressed length = 16K]
+ *
+ * If the bio to read the compressed extent covers both ranges,
+ * it will decompress extent X into the pages belonging to the
+ * first range and then it will stop, zeroing out the remaining
+ * pages that belong to the other range that points to extent X.
+ * So here we make sure we submit 2 bios, one for the first
+ * range and another one for the third range. Both will target
+ * the same physical extent from disk, but we can't currently
+ * make the compressed bio endio callback populate the pages
+ * for both ranges because each compressed bio is tightly
+ * coupled with a single extent map, and each range can have
+ * an extent map with a different offset value relative to the
+ * uncompressed data of our extent and different lengths. This
+ * is a corner case so we prioritize correctness over
+ * non-optimal behavior (submitting 2 bios for the same extent).
+ */
+ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
+ prev_em_start && *prev_em_start != (u64)-1 &&
+ *prev_em_start != em->orig_start)
+ force_bio_submit = true;
+
+ if (prev_em_start)
+ *prev_em_start = em->orig_start;
+
free_extent_map(em);
em = NULL;
@@ -3023,8 +3133,12 @@ static int __do_readpage(struct extent_io_tree *tree,
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
- unlock_extent_cached(tree, cur, cur + iosize - 1,
- &cached, GFP_NOFS);
+ if (parent_locked)
+ free_extent_state(cached);
+ else
+ unlock_extent_cached(tree, cur,
+ cur + iosize - 1,
+ &cached, GFP_NOFS);
cur = cur + iosize;
pg_offset += iosize;
continue;
@@ -3057,7 +3171,8 @@ static int __do_readpage(struct extent_io_tree *tree,
bdev, bio, pnr,
end_bio_extent_readpage, mirror_num,
*bio_flags,
- this_bio_flag);
+ this_bio_flag,
+ force_bio_submit);
if (!ret) {
nr++;
*bio_flags = this_bio_flag;
@@ -3084,7 +3199,8 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
get_extent_t *get_extent,
struct extent_map **em_cached,
struct bio **bio, int mirror_num,
- unsigned long *bio_flags, int rw)
+ unsigned long *bio_flags, int rw,
+ u64 *prev_em_start)
{
struct inode *inode;
struct btrfs_ordered_extent *ordered;
@@ -3104,7 +3220,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
for (index = 0; index < nr_pages; index++) {
__do_readpage(tree, pages[index], get_extent, em_cached, bio,
- mirror_num, bio_flags, rw);
+ mirror_num, bio_flags, rw, prev_em_start);
page_cache_release(pages[index]);
}
}
@@ -3114,7 +3230,8 @@ static void __extent_readpages(struct extent_io_tree *tree,
int nr_pages, get_extent_t *get_extent,
struct extent_map **em_cached,
struct bio **bio, int mirror_num,
- unsigned long *bio_flags, int rw)
+ unsigned long *bio_flags, int rw,
+ u64 *prev_em_start)
{
u64 start = 0;
u64 end = 0;
@@ -3135,7 +3252,7 @@ static void __extent_readpages(struct extent_io_tree *tree,
index - first_index, start,
end, get_extent, em_cached,
bio, mirror_num, bio_flags,
- rw);
+ rw, prev_em_start);
start = page_start;
end = start + PAGE_CACHE_SIZE - 1;
first_index = index;
@@ -3146,7 +3263,8 @@ static void __extent_readpages(struct extent_io_tree *tree,
__do_contiguous_readpages(tree, &pages[first_index],
index - first_index, start,
end, get_extent, em_cached, bio,
- mirror_num, bio_flags, rw);
+ mirror_num, bio_flags, rw,
+ prev_em_start);
}
static int __extent_read_full_page(struct extent_io_tree *tree,
@@ -3172,7 +3290,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
}
ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
- bio_flags, rw);
+ bio_flags, rw, NULL);
return ret;
}
@@ -3198,7 +3316,7 @@ int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
int ret;
ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num,
- &bio_flags, READ);
+ &bio_flags, READ, NULL);
if (bio)
ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
return ret;
@@ -3451,7 +3569,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
sector, iosize, pg_offset,
bdev, &epd->bio, max_nr,
end_bio_extent_writepage,
- 0, 0, 0);
+ 0, 0, 0, false);
if (ret)
SetPageError(page);
}
@@ -3754,7 +3872,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
ret = submit_extent_page(rw, tree, wbc, p, offset >> 9,
PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
-1, end_bio_extent_buffer_writepage,
- 0, epd->bio_flags, bio_flags);
+ 0, epd->bio_flags, bio_flags, false);
epd->bio_flags = bio_flags;
if (ret) {
set_btree_ioerr(p);
@@ -4158,6 +4276,7 @@ int extent_readpages(struct extent_io_tree *tree,
struct page *page;
struct extent_map *em_cached = NULL;
int nr = 0;
+ u64 prev_em_start = (u64)-1;
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
page = list_entry(pages->prev, struct page, lru);
@@ -4174,12 +4293,12 @@ int extent_readpages(struct extent_io_tree *tree,
if (nr < ARRAY_SIZE(pagepool))
continue;
__extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
- &bio, 0, &bio_flags, READ);
+ &bio, 0, &bio_flags, READ, &prev_em_start);
nr = 0;
}
if (nr)
__extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
- &bio, 0, &bio_flags, READ);
+ &bio, 0, &bio_flags, READ, &prev_em_start);
if (em_cached)
free_extent_map(em_cached);
@@ -4267,7 +4386,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
u64 start = page_offset(page);
u64 end = start + PAGE_CACHE_SIZE - 1;
- if ((mask & __GFP_WAIT) &&
+ if (gfpflags_allow_blocking(mask) &&
page->mapping->host->i_size > 16 * 1024 * 1024) {
u64 len;
while (start <= end) {
@@ -5514,13 +5633,15 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
unsigned long src_i;
if (src_offset + len > dst->len) {
- printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
- "len %lu dst len %lu\n", src_offset, len, dst->len);
+ btrfs_err(dst->fs_info,
+ "memmove bogus src_offset %lu move "
+ "len %lu dst len %lu", src_offset, len, dst->len);
BUG_ON(1);
}
if (dst_offset + len > dst->len) {
- printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
- "len %lu dst len %lu\n", dst_offset, len, dst->len);
+ btrfs_err(dst->fs_info,
+ "memmove bogus dst_offset %lu move "
+ "len %lu dst len %lu", dst_offset, len, dst->len);
BUG_ON(1);
}
@@ -5560,13 +5681,13 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
unsigned long src_i;
if (src_offset + len > dst->len) {
- printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move "
- "len %lu len %lu\n", src_offset, len, dst->len);
+ btrfs_err(dst->fs_info, "memmove bogus src_offset %lu move "
+ "len %lu len %lu", src_offset, len, dst->len);
BUG_ON(1);
}
if (dst_offset + len > dst->len) {
- printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move "
- "len %lu len %lu\n", dst_offset, len, dst->len);
+ btrfs_err(dst->fs_info, "memmove bogus dst_offset %lu move "
+ "len %lu len %lu", dst_offset, len, dst->len);
BUG_ON(1);
}
if (dst_offset < src_offset) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c668f36898d3..f4c1ae11855f 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -2,6 +2,7 @@
#define __EXTENTIO__
#include <linux/rbtree.h>
+#include "ulist.h"
/* bits for the extent state */
#define EXTENT_DIRTY (1U << 0)
@@ -18,6 +19,7 @@
#define EXTENT_NEED_WAIT (1U << 13)
#define EXTENT_DAMAGED (1U << 14)
#define EXTENT_NORESERVE (1U << 15)
+#define EXTENT_QGROUP_RESERVED (1U << 16)
#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
@@ -161,6 +163,17 @@ struct extent_buffer {
#endif
};
+/*
+ * Structure to record how many bytes and which ranges are set/cleared
+ */
+struct extent_changeset {
+ /* How many bytes are set/cleared in this operation */
+ u64 bytes_changed;
+
+ /* Changed ranges */
+ struct ulist *range_changed;
+};
+
static inline void extent_set_compress_type(unsigned long *bio_flags,
int compress_type)
{
@@ -210,11 +223,17 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state *cached_state);
int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, gfp_t mask);
+int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, gfp_t mask,
+ struct extent_changeset *changeset);
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, int wake, int delete,
struct extent_state **cached, gfp_t mask);
int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, gfp_t mask);
+int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
+ unsigned bits, gfp_t mask,
+ struct extent_changeset *changeset);
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
unsigned bits, u64 *failed_start,
struct extent_state **cached_state, gfp_t mask);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index b823fac91c92..977e715f0bf2 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -756,8 +756,16 @@ next_slot:
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.objectid > ino ||
- key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
+
+ if (key.objectid > ino)
+ break;
+ if (WARN_ON_ONCE(key.objectid < ino) ||
+ key.type < BTRFS_EXTENT_DATA_KEY) {
+ ASSERT(del_nr == 0);
+ path->slots[0]++;
+ goto next_slot;
+ }
+ if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
break;
fi = btrfs_item_ptr(leaf, path->slots[0],
@@ -776,8 +784,8 @@ next_slot:
btrfs_file_extent_inline_len(leaf,
path->slots[0], fi);
} else {
- WARN_ON(1);
- extent_end = search_start;
+ /* can't happen */
+ BUG();
}
/*
@@ -847,7 +855,7 @@ next_slot:
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
new_key.objectid,
- start - extent_offset, 1);
+ start - extent_offset);
BUG_ON(ret); /* -ENOMEM */
}
key.offset = start;
@@ -925,7 +933,7 @@ delete_extent_item:
disk_bytenr, num_bytes, 0,
root->root_key.objectid,
key.objectid, key.offset -
- extent_offset, 0);
+ extent_offset);
BUG_ON(ret); /* -ENOMEM */
inode_sub_bytes(inode,
extent_end - key.offset);
@@ -1204,7 +1212,7 @@ again:
ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
root->root_key.objectid,
- ino, orig_offset, 1);
+ ino, orig_offset);
BUG_ON(ret); /* -ENOMEM */
if (split == start) {
@@ -1231,7 +1239,7 @@ again:
del_nr++;
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
0, root->root_key.objectid,
- ino, orig_offset, 0);
+ ino, orig_offset);
BUG_ON(ret); /* -ENOMEM */
}
other_start = 0;
@@ -1248,7 +1256,7 @@ again:
del_nr++;
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
0, root->root_key.objectid,
- ino, orig_offset, 0);
+ ino, orig_offset);
BUG_ON(ret); /* -ENOMEM */
}
if (del_nr == 0) {
@@ -1469,7 +1477,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
u64 release_bytes = 0;
u64 lockstart;
u64 lockend;
- unsigned long first_index;
size_t num_written = 0;
int nrptrs;
int ret = 0;
@@ -1485,8 +1492,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
if (!pages)
return -ENOMEM;
- first_index = pos >> PAGE_CACHE_SHIFT;
-
while (iov_iter_count(i) > 0) {
size_t offset = pos & (PAGE_CACHE_SIZE - 1);
size_t write_bytes = min(iov_iter_count(i),
@@ -1510,12 +1515,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
}
reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
- ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes);
- if (ret == -ENOSPC &&
- (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
- BTRFS_INODE_PREALLOC))) {
+
+ if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_PREALLOC)) {
ret = check_can_nocow(inode, pos, &write_bytes);
+ if (ret < 0)
+ break;
if (ret > 0) {
+ /*
+ * For nodata cow case, no need to reserve
+ * data space.
+ */
only_release_metadata = true;
/*
* our prealloc extent may be smaller than
@@ -1524,20 +1534,19 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
num_pages = DIV_ROUND_UP(write_bytes + offset,
PAGE_CACHE_SIZE);
reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
- ret = 0;
- } else {
- ret = -ENOSPC;
+ goto reserve_metadata;
}
}
-
- if (ret)
+ ret = btrfs_check_data_free_space(inode, pos, write_bytes);
+ if (ret < 0)
break;
+reserve_metadata:
ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
if (ret) {
if (!only_release_metadata)
- btrfs_free_reserved_data_space(inode,
- reserve_bytes);
+ btrfs_free_reserved_data_space(inode, pos,
+ write_bytes);
else
btrfs_end_write_no_snapshoting(root);
break;
@@ -1603,12 +1612,17 @@ again:
BTRFS_I(inode)->outstanding_extents++;
spin_unlock(&BTRFS_I(inode)->lock);
}
- if (only_release_metadata)
+ if (only_release_metadata) {
btrfs_delalloc_release_metadata(inode,
release_bytes);
- else
- btrfs_delalloc_release_space(inode,
+ } else {
+ u64 __pos;
+
+ __pos = round_down(pos, root->sectorsize) +
+ (dirty_pages << PAGE_CACHE_SHIFT);
+ btrfs_delalloc_release_space(inode, __pos,
release_bytes);
+ }
}
release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
@@ -1660,7 +1674,7 @@ again:
btrfs_end_write_no_snapshoting(root);
btrfs_delalloc_release_metadata(inode, release_bytes);
} else {
- btrfs_delalloc_release_space(inode, release_bytes);
+ btrfs_delalloc_release_space(inode, pos, release_bytes);
}
}
@@ -2266,7 +2280,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
u64 drop_end;
int ret = 0;
int err = 0;
- int rsv_count;
+ unsigned int rsv_count;
bool same_page;
bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
u64 ino_size;
@@ -2488,6 +2502,19 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
trans->block_rsv = &root->fs_info->trans_block_rsv;
/*
+ * If we are using the NO_HOLES feature we might have had already an
+ * hole that overlaps a part of the region [lockstart, lockend] and
+ * ends at (or beyond) lockend. Since we have no file extent items to
+ * represent holes, drop_end can be less than lockend and so we must
+ * make sure we have an extent map representing the existing hole (the
+ * call to __btrfs_drop_extents() might have dropped the existing extent
+ * map representing the existing hole), otherwise the fast fsync path
+ * will not record the existence of the hole region
+ * [existing_hole_start, lockend].
+ */
+ if (drop_end <= lockend)
+ drop_end = lockend + 1;
+ /*
* Don't insert file hole extent item if it's for a range beyond eof
* (because it's useless) or if it represents a 0 bytes range (when
* cur_offset == drop_end).
@@ -2541,17 +2568,61 @@ out_only_mutex:
return err;
}
+/* Helper structure to record which range is already reserved */
+struct falloc_range {
+ struct list_head list;
+ u64 start;
+ u64 len;
+};
+
+/*
+ * Helper function to add falloc range
+ *
+ * Caller should have locked the larger range of extent containing
+ * [start, len)
+ */
+static int add_falloc_range(struct list_head *head, u64 start, u64 len)
+{
+ struct falloc_range *prev = NULL;
+ struct falloc_range *range = NULL;
+
+ if (list_empty(head))
+ goto insert;
+
+ /*
+ * As fallocate iterate by bytenr order, we only need to check
+ * the last range.
+ */
+ prev = list_entry(head->prev, struct falloc_range, list);
+ if (prev->start + prev->len == start) {
+ prev->len += len;
+ return 0;
+ }
+insert:
+ range = kmalloc(sizeof(*range), GFP_NOFS);
+ if (!range)
+ return -ENOMEM;
+ range->start = start;
+ range->len = len;
+ list_add_tail(&range->list, head);
+ return 0;
+}
+
static long btrfs_fallocate(struct file *file, int mode,
loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
struct extent_state *cached_state = NULL;
+ struct falloc_range *range;
+ struct falloc_range *tmp;
+ struct list_head reserve_list;
u64 cur_offset;
u64 last_byte;
u64 alloc_start;
u64 alloc_end;
u64 alloc_hint = 0;
u64 locked_end;
+ u64 actual_end = 0;
struct extent_map *em;
int blocksize = BTRFS_I(inode)->root->sectorsize;
int ret;
@@ -2567,11 +2638,12 @@ static long btrfs_fallocate(struct file *file, int mode,
return btrfs_punch_hole(inode, offset, len);
/*
- * Make sure we have enough space before we do the
- * allocation.
+ * Only trigger disk allocation, don't trigger qgroup reserve
+ *
+ * For qgroup space, it will be checked later.
*/
- ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start);
- if (ret)
+ ret = btrfs_alloc_data_chunk_ondemand(inode, alloc_end - alloc_start);
+ if (ret < 0)
return ret;
mutex_lock(&inode->i_mutex);
@@ -2579,12 +2651,19 @@ static long btrfs_fallocate(struct file *file, int mode,
if (ret)
goto out;
+ /*
+ * TODO: Move these two operations after we have checked
+ * accurate reserved space, or fallocate can still fail but
+ * with page truncated or size expanded.
+ *
+ * But that's a minor problem and won't do much harm BTW.
+ */
if (alloc_start > inode->i_size) {
ret = btrfs_cont_expand(inode, i_size_read(inode),
alloc_start);
if (ret)
goto out;
- } else {
+ } else if (offset + len > inode->i_size) {
/*
* If we are fallocating from the end of the file onward we
* need to zero out the end of the page if i_size lands in the
@@ -2637,10 +2716,10 @@ static long btrfs_fallocate(struct file *file, int mode,
}
}
+ /* First, check if we exceed the qgroup limit */
+ INIT_LIST_HEAD(&reserve_list);
cur_offset = alloc_start;
while (1) {
- u64 actual_end;
-
em = btrfs_get_extent(inode, NULL, 0, cur_offset,
alloc_end - cur_offset, 0);
if (IS_ERR_OR_NULL(em)) {
@@ -2653,57 +2732,82 @@ static long btrfs_fallocate(struct file *file, int mode,
last_byte = min(extent_map_end(em), alloc_end);
actual_end = min_t(u64, extent_map_end(em), offset + len);
last_byte = ALIGN(last_byte, blocksize);
-
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
- ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
- last_byte - cur_offset,
- 1 << inode->i_blkbits,
- offset + len,
- &alloc_hint);
- } else if (actual_end > inode->i_size &&
- !(mode & FALLOC_FL_KEEP_SIZE)) {
- struct btrfs_trans_handle *trans;
- struct btrfs_root *root = BTRFS_I(inode)->root;
-
- /*
- * We didn't need to allocate any more space, but we
- * still extended the size of the file so we need to
- * update i_size and the inode item.
- */
- trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- } else {
- inode->i_ctime = CURRENT_TIME;
- i_size_write(inode, actual_end);
- btrfs_ordered_update_i_size(inode, actual_end,
- NULL);
- ret = btrfs_update_inode(trans, root, inode);
- if (ret)
- btrfs_end_transaction(trans, root);
- else
- ret = btrfs_end_transaction(trans,
- root);
+ ret = add_falloc_range(&reserve_list, cur_offset,
+ last_byte - cur_offset);
+ if (ret < 0) {
+ free_extent_map(em);
+ break;
}
+ ret = btrfs_qgroup_reserve_data(inode, cur_offset,
+ last_byte - cur_offset);
+ if (ret < 0)
+ break;
}
free_extent_map(em);
- if (ret < 0)
- break;
-
cur_offset = last_byte;
- if (cur_offset >= alloc_end) {
- ret = 0;
+ if (cur_offset >= alloc_end)
break;
+ }
+
+ /*
+ * If ret is still 0, means we're OK to fallocate.
+ * Or just cleanup the list and exit.
+ */
+ list_for_each_entry_safe(range, tmp, &reserve_list, list) {
+ if (!ret)
+ ret = btrfs_prealloc_file_range(inode, mode,
+ range->start,
+ range->len, 1 << inode->i_blkbits,
+ offset + len, &alloc_hint);
+ list_del(&range->list);
+ kfree(range);
+ }
+ if (ret < 0)
+ goto out_unlock;
+
+ if (actual_end > inode->i_size &&
+ !(mode & FALLOC_FL_KEEP_SIZE)) {
+ struct btrfs_trans_handle *trans;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ /*
+ * We didn't need to allocate any more space, but we
+ * still extended the size of the file so we need to
+ * update i_size and the inode item.
+ */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ } else {
+ inode->i_ctime = CURRENT_TIME;
+ i_size_write(inode, actual_end);
+ btrfs_ordered_update_i_size(inode, actual_end, NULL);
+ ret = btrfs_update_inode(trans, root, inode);
+ if (ret)
+ btrfs_end_transaction(trans, root);
+ else
+ ret = btrfs_end_transaction(trans, root);
}
}
+out_unlock:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
&cached_state, GFP_NOFS);
out:
+ /*
+ * As we waited the extent range, the data_rsv_map must be empty
+ * in the range, as written data range will be released from it.
+ * And for prealloacted extent, it will also be released when
+ * its metadata is written.
+ * So this is completely used as cleanup.
+ */
+ btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start);
mutex_unlock(&inode->i_mutex);
/* Let go of our reservation. */
- btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
+ btrfs_free_reserved_data_space(inode, alloc_start,
+ alloc_end - alloc_start);
return ret;
}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index abe3a66bd3ba..85a1f8621b51 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -85,8 +85,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
}
mapping_set_gfp_mask(inode->i_mapping,
- mapping_gfp_mask(inode->i_mapping) &
- ~(__GFP_FS | __GFP_HIGHMEM));
+ mapping_gfp_constraint(inode->i_mapping,
+ ~(__GFP_FS | __GFP_HIGHMEM)));
return inode;
}
@@ -450,9 +450,9 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
gen = io_ctl->cur;
if (le64_to_cpu(*gen) != generation) {
- printk_ratelimited(KERN_ERR "BTRFS: space cache generation "
- "(%Lu) does not match inode (%Lu)\n", *gen,
- generation);
+ btrfs_err_rl(io_ctl->root->fs_info,
+ "space cache generation (%llu) does not match inode (%llu)",
+ *gen, generation);
io_ctl_unmap_page(io_ctl);
return -EIO;
}
@@ -506,8 +506,8 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
PAGE_CACHE_SIZE - offset);
btrfs_csum_final(crc, (char *)&crc);
if (val != crc) {
- printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free "
- "space cache\n");
+ btrfs_err_rl(io_ctl->root->fs_info,
+ "csum mismatch on free space cache");
io_ctl_unmap_page(io_ctl);
return -EIO;
}
@@ -1215,7 +1215,7 @@ out:
* @offset - the offset for the key we'll insert
*
* This function writes out a free space cache struct to disk for quick recovery
- * on mount. This will return 0 if it was successfull in writing the cache out,
+ * on mount. This will return 0 if it was successful in writing the cache out,
* or an errno if it was not.
*/
static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
@@ -1730,7 +1730,7 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
*/
static int search_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *bitmap_info, u64 *offset,
- u64 *bytes)
+ u64 *bytes, bool for_alloc)
{
unsigned long found_bits = 0;
unsigned long max_bits = 0;
@@ -1738,11 +1738,26 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
unsigned long next_zero;
unsigned long extent_bits;
+ /*
+ * Skip searching the bitmap if we don't have a contiguous section that
+ * is large enough for this allocation.
+ */
+ if (for_alloc &&
+ bitmap_info->max_extent_size &&
+ bitmap_info->max_extent_size < *bytes) {
+ *bytes = bitmap_info->max_extent_size;
+ return -1;
+ }
+
i = offset_to_bit(bitmap_info->offset, ctl->unit,
max_t(u64, *offset, bitmap_info->offset));
bits = bytes_to_bits(*bytes, ctl->unit);
for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
+ if (for_alloc && bits == 1) {
+ found_bits = 1;
+ break;
+ }
next_zero = find_next_zero_bit(bitmap_info->bitmap,
BITS_PER_BITMAP, i);
extent_bits = next_zero - i;
@@ -1762,6 +1777,7 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
}
*bytes = (u64)(max_bits) * ctl->unit;
+ bitmap_info->max_extent_size = *bytes;
return -1;
}
@@ -1813,7 +1829,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
if (entry->bitmap) {
u64 size = *bytes;
- ret = search_bitmap(ctl, entry, &tmp, &size);
+ ret = search_bitmap(ctl, entry, &tmp, &size, true);
if (!ret) {
*offset = tmp;
*bytes = size;
@@ -1874,7 +1890,8 @@ again:
search_start = *offset;
search_bytes = ctl->unit;
search_bytes = min(search_bytes, end - search_start + 1);
- ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes);
+ ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
+ false);
if (ret < 0 || search_start != *offset)
return -EINVAL;
@@ -1919,7 +1936,7 @@ again:
search_start = *offset;
search_bytes = ctl->unit;
ret = search_bitmap(ctl, bitmap_info, &search_start,
- &search_bytes);
+ &search_bytes, false);
if (ret < 0 || search_start != *offset)
return -EAGAIN;
@@ -1943,6 +1960,12 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
bitmap_set_bits(ctl, info, offset, bytes_to_set);
+ /*
+ * We set some bytes, we have no idea what the max extent size is
+ * anymore.
+ */
+ info->max_extent_size = 0;
+
return bytes_to_set;
}
@@ -1951,12 +1974,19 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info)
{
struct btrfs_block_group_cache *block_group = ctl->private;
+ bool forced = false;
+
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_should_fragment_free_space(block_group->fs_info->extent_root,
+ block_group))
+ forced = true;
+#endif
/*
* If we are below the extents threshold then we can add this as an
* extent, and don't have to deal with the bitmap
*/
- if (ctl->free_extents < ctl->extents_thresh) {
+ if (!forced && ctl->free_extents < ctl->extents_thresh) {
/*
* If this block group has some small extents we don't want to
* use up all of our free slots in the cache with them, we want
@@ -2661,7 +2691,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
search_start = min_start;
search_bytes = bytes;
- err = search_bitmap(ctl, entry, &search_start, &search_bytes);
+ err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
if (err) {
if (search_bytes > *max_extent_size)
*max_extent_size = search_bytes;
@@ -2775,6 +2805,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
unsigned long want_bits;
unsigned long min_bits;
unsigned long found_bits;
+ unsigned long max_bits = 0;
unsigned long start = 0;
unsigned long total_found = 0;
int ret;
@@ -2784,6 +2815,13 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
want_bits = bytes_to_bits(bytes, ctl->unit);
min_bits = bytes_to_bits(min_bytes, ctl->unit);
+ /*
+ * Don't bother looking for a cluster in this bitmap if it's heavily
+ * fragmented.
+ */
+ if (entry->max_extent_size &&
+ entry->max_extent_size < cont1_bytes)
+ return -ENOSPC;
again:
found_bits = 0;
for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
@@ -2791,13 +2829,19 @@ again:
BITS_PER_BITMAP, i);
if (next_zero - i >= min_bits) {
found_bits = next_zero - i;
+ if (found_bits > max_bits)
+ max_bits = found_bits;
break;
}
+ if (next_zero - i > max_bits)
+ max_bits = next_zero - i;
i = next_zero;
}
- if (!found_bits)
+ if (!found_bits) {
+ entry->max_extent_size = (u64)max_bits * ctl->unit;
return -ENOSPC;
+ }
if (!total_found) {
start = i;
@@ -3056,6 +3100,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
spin_lock_init(&cluster->refill_lock);
cluster->root = RB_ROOT;
cluster->max_size = 0;
+ cluster->fragmented = false;
INIT_LIST_HEAD(&cluster->block_group_list);
cluster->block_group = NULL;
}
@@ -3223,7 +3268,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
}
bytes = minlen;
- ret2 = search_bitmap(ctl, entry, &start, &bytes);
+ ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
if (ret2 || start >= end) {
spin_unlock(&ctl->tree_lock);
mutex_unlock(&ctl->cache_writeout_mutex);
@@ -3376,7 +3421,7 @@ u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
u64 count = 1;
int ret;
- ret = search_bitmap(ctl, entry, &offset, &count);
+ ret = search_bitmap(ctl, entry, &offset, &count, true);
/* Logic error; Should be empty if it can't find anything */
ASSERT(!ret);
@@ -3532,6 +3577,7 @@ again:
spin_lock(&ctl->tree_lock);
info->offset = offset;
info->bytes = bytes;
+ info->max_extent_size = 0;
ret = link_free_space(ctl, info);
spin_unlock(&ctl->tree_lock);
if (ret)
@@ -3559,6 +3605,7 @@ again:
}
bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
+
bytes -= bytes_added;
offset += bytes_added;
spin_unlock(&ctl->tree_lock);
@@ -3602,7 +3649,7 @@ have_info:
bit_off = offset;
bit_bytes = ctl->unit;
- ret = search_bitmap(ctl, info, &bit_off, &bit_bytes);
+ ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
if (!ret) {
if (bit_off == offset) {
ret = 1;
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index a16a029ad3b1..f251865eb6f3 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -23,6 +23,7 @@ struct btrfs_free_space {
struct rb_node offset_index;
u64 offset;
u64 bytes;
+ u64 max_extent_size;
unsigned long *bitmap;
struct list_head list;
};
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 265e03c73f4d..be4d22a5022f 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -157,7 +157,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
*/
if (!btrfs_find_name_in_ext_backref(path, ref_objectid,
name, name_len, &extref)) {
- btrfs_std_error(root->fs_info, -ENOENT);
+ btrfs_std_error(root->fs_info, -ENOENT, NULL);
ret = -EROFS;
goto out;
}
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index d4a582ac3f73..767a6056ac45 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -488,17 +488,17 @@ again:
/* Just to make sure we have enough space */
prealloc += 8 * PAGE_CACHE_SIZE;
- ret = btrfs_delalloc_reserve_space(inode, prealloc);
+ ret = btrfs_delalloc_reserve_space(inode, 0, prealloc);
if (ret)
goto out_put;
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
prealloc, prealloc, &alloc_hint);
if (ret) {
- btrfs_delalloc_release_space(inode, prealloc);
+ btrfs_delalloc_release_space(inode, 0, prealloc);
goto out_put;
}
- btrfs_free_reserved_data_space(inode, prealloc);
+ btrfs_free_reserved_data_space(inode, 0, prealloc);
ret = btrfs_write_out_ino_cache(root, trans, path, inode);
out_put:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a0fa7253a2d7..994490d5fa64 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -310,6 +310,13 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
btrfs_delalloc_release_metadata(inode, end + 1 - start);
btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
out:
+ /*
+ * Don't forget to free the reserved space, as for inlined extent
+ * it won't count as data extent, free them directly here.
+ * And at reserve time, it's always aligned to page size, so
+ * just free one page here.
+ */
+ btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE);
btrfs_free_path(path);
btrfs_end_transaction(trans, root);
return ret;
@@ -1096,6 +1103,9 @@ static noinline void async_cow_submit(struct btrfs_work *work)
nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
PAGE_CACHE_SHIFT;
+ /*
+ * atomic_sub_return implies a barrier for waitqueue_active
+ */
if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
5 * 1024 * 1024 &&
waitqueue_active(&root->fs_info->async_submit_wait))
@@ -1294,8 +1304,14 @@ next_slot:
num_bytes = 0;
btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
- if (found_key.objectid > ino ||
- found_key.type > BTRFS_EXTENT_DATA_KEY ||
+ if (found_key.objectid > ino)
+ break;
+ if (WARN_ON_ONCE(found_key.objectid < ino) ||
+ found_key.type < BTRFS_EXTENT_DATA_KEY) {
+ path->slots[0]++;
+ goto next_slot;
+ }
+ if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
found_key.offset > end)
break;
@@ -1766,7 +1782,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
&& do_list && !(state->state & EXTENT_NORESERVE))
- btrfs_free_reserved_data_space(inode, len);
+ btrfs_free_reserved_data_space_noquota(inode,
+ state->start, len);
__percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
root->fs_info->delalloc_batch);
@@ -1861,15 +1878,15 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
u64 bio_offset)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
+ enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
int ret = 0;
int skip_sum;
- int metadata = 0;
int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
if (btrfs_is_free_space_inode(inode))
- metadata = 2;
+ metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
if (!(rw & REQ_WRITE)) {
ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
@@ -1989,7 +2006,8 @@ again:
goto again;
}
- ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+ ret = btrfs_delalloc_reserve_space(inode, page_start,
+ PAGE_CACHE_SIZE);
if (ret) {
mapping_set_error(page->mapping, ret);
end_extent_writepage(page, ret, page_start, page_end);
@@ -2115,7 +2133,13 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
ins.type = BTRFS_EXTENT_ITEM_KEY;
ret = btrfs_alloc_reserved_file_extent(trans, root,
root->root_key.objectid,
- btrfs_ino(inode), file_pos, &ins);
+ btrfs_ino(inode), file_pos,
+ ram_bytes, &ins);
+ /*
+ * Release the reserved range from inode dirty range map, as it is
+ * already moved into delayed_ref_head
+ */
+ btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
out:
btrfs_free_path(path);
@@ -2573,7 +2597,7 @@ again:
ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
new->disk_len, 0,
backref->root_id, backref->inum,
- new->file_pos, 0); /* start - extent_offset */
+ new->file_pos); /* start - extent_offset */
if (ret) {
btrfs_abort_transaction(trans, root, ret);
goto out_free_path;
@@ -2599,7 +2623,6 @@ static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
return;
list_for_each_entry_safe(old, tmp, &new->head, list) {
- list_del(&old->list);
kfree(old);
}
kfree(new);
@@ -2824,6 +2847,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
+
+ /*
+ * For mwrite(mmap + memset to write) case, we still reserve
+ * space for NOCOW range.
+ * As NOCOW won't cause a new delayed ref, just free the space
+ */
+ btrfs_qgroup_free_data(inode, ordered_extent->file_offset,
+ ordered_extent->len);
btrfs_ordered_update_i_size(inode, 0, ordered_extent);
if (nolock)
trans = btrfs_join_transaction_nolock(root);
@@ -3018,8 +3049,6 @@ static int __readpage_endio_check(struct inode *inode,
char *kaddr;
u32 csum_expected;
u32 csum = ~(u32)0;
- static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
csum_expected = *(((u32 *)io_bio->csum) + icsum);
@@ -3032,9 +3061,8 @@ static int __readpage_endio_check(struct inode *inode,
kunmap_atomic(kaddr);
return 0;
zeroit:
- if (__ratelimit(&_rs))
- btrfs_warn(BTRFS_I(inode)->root->fs_info,
- "csum failed ino %llu off %llu csum %u expected csum %u",
+ btrfs_warn_rl(BTRFS_I(inode)->root->fs_info,
+ "csum failed ino %llu off %llu csum %u expected csum %u",
btrfs_ino(inode), start, csum, csum_expected);
memset(kaddr + pgoff, 1, len);
flush_dcache_page(page);
@@ -4217,6 +4245,47 @@ static int truncate_space_check(struct btrfs_trans_handle *trans,
}
+static int truncate_inline_extent(struct inode *inode,
+ struct btrfs_path *path,
+ struct btrfs_key *found_key,
+ const u64 item_end,
+ const u64 new_size)
+{
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ struct btrfs_file_extent_item *fi;
+ u32 size = (u32)(new_size - found_key->offset);
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
+ fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) {
+ loff_t offset = new_size;
+ loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE);
+
+ /*
+ * Zero out the remaining of the last page of our inline extent,
+ * instead of directly truncating our inline extent here - that
+ * would be much more complex (decompressing all the data, then
+ * compressing the truncated data, which might be bigger than
+ * the size of the inline extent, resize the extent, etc).
+ * We release the path because to get the page we might need to
+ * read the extent item from disk (data not in the page cache).
+ */
+ btrfs_release_path(path);
+ return btrfs_truncate_page(inode, offset, page_end - offset, 0);
+ }
+
+ btrfs_set_file_extent_ram_bytes(leaf, fi, size);
+ size = btrfs_file_extent_calc_inline_size(size);
+ btrfs_truncate_item(root, path, size, 1);
+
+ if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
+ inode_sub_bytes(inode, item_end + 1 - new_size);
+
+ return 0;
+}
+
/*
* this can truncate away extent items, csum items and directory items.
* It starts at a high offset and removes keys until it can't find
@@ -4411,27 +4480,40 @@ search_again:
* special encodings
*/
if (!del_item &&
- btrfs_file_extent_compression(leaf, fi) == 0 &&
btrfs_file_extent_encryption(leaf, fi) == 0 &&
btrfs_file_extent_other_encoding(leaf, fi) == 0) {
- u32 size = new_size - found_key.offset;
-
- if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
- inode_sub_bytes(inode, item_end + 1 -
- new_size);
/*
- * update the ram bytes to properly reflect
- * the new size of our item
+ * Need to release path in order to truncate a
+ * compressed extent. So delete any accumulated
+ * extent items so far.
*/
- btrfs_set_file_extent_ram_bytes(leaf, fi, size);
- size =
- btrfs_file_extent_calc_inline_size(size);
- btrfs_truncate_item(root, path, size, 1);
+ if (btrfs_file_extent_compression(leaf, fi) !=
+ BTRFS_COMPRESS_NONE && pending_del_nr) {
+ err = btrfs_del_items(trans, root, path,
+ pending_del_slot,
+ pending_del_nr);
+ if (err) {
+ btrfs_abort_transaction(trans,
+ root,
+ err);
+ goto error;
+ }
+ pending_del_nr = 0;
+ }
+
+ err = truncate_inline_extent(inode, path,
+ &found_key,
+ item_end,
+ new_size);
+ if (err) {
+ btrfs_abort_transaction(trans,
+ root, err);
+ goto error;
+ }
} else if (test_bit(BTRFS_ROOT_REF_COWS,
&root->state)) {
- inode_sub_bytes(inode, item_end + 1 -
- found_key.offset);
+ inode_sub_bytes(inode, item_end + 1 - new_size);
}
}
delete:
@@ -4461,7 +4543,7 @@ delete:
ret = btrfs_free_extent(trans, root, extent_start,
extent_num_bytes, 0,
btrfs_header_owner(leaf),
- ino, extent_offset, 0);
+ ino, extent_offset);
BUG_ON(ret);
if (btrfs_should_throttle_delayed_refs(trans, root))
btrfs_async_run_delayed_refs(root,
@@ -4575,14 +4657,17 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
if ((offset & (blocksize - 1)) == 0 &&
(!len || ((len & (blocksize - 1)) == 0)))
goto out;
- ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+ ret = btrfs_delalloc_reserve_space(inode,
+ round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE);
if (ret)
goto out;
again:
page = find_or_create_page(mapping, index, mask);
if (!page) {
- btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+ btrfs_delalloc_release_space(inode,
+ round_down(from, PAGE_CACHE_SIZE),
+ PAGE_CACHE_SIZE);
ret = -ENOMEM;
goto out;
}
@@ -4650,7 +4735,8 @@ again:
out_unlock:
if (ret)
- btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+ btrfs_delalloc_release_space(inode, page_start,
+ PAGE_CACHE_SIZE);
unlock_page(page);
page_cache_release(page);
out:
@@ -5048,6 +5134,18 @@ static void evict_inode_truncate_pages(struct inode *inode)
spin_unlock(&io_tree->lock);
lock_extent_bits(io_tree, start, end, 0, &cached_state);
+
+ /*
+ * If still has DELALLOC flag, the extent didn't reach disk,
+ * and its reserved space won't be freed by delayed_ref.
+ * So we need to free its reserved space here.
+ * (Refer to comment in btrfs_invalidatepage, case 2)
+ *
+ * Note, end is the bytenr of last byte, so we need + 1 here.
+ */
+ if (state->state & EXTENT_DELALLOC)
+ btrfs_qgroup_free_data(inode, start, end - start + 1);
+
clear_extent_bit(io_tree, start, end,
EXTENT_LOCKED | EXTENT_DIRTY |
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
@@ -5084,7 +5182,8 @@ void btrfs_evict_inode(struct inode *inode)
goto no_delete;
}
/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ if (!special_file(inode->i_mode))
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
btrfs_free_io_failure_record(inode, 0, (u64)-1);
@@ -6267,9 +6366,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
u64 objectid;
u64 index = 0;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
/*
* 2 for inode item and ref
* 2 for dir items
@@ -7408,6 +7504,32 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
return em;
}
+struct btrfs_dio_data {
+ u64 outstanding_extents;
+ u64 reserve;
+};
+
+static void adjust_dio_outstanding_extents(struct inode *inode,
+ struct btrfs_dio_data *dio_data,
+ const u64 len)
+{
+ unsigned num_extents;
+
+ num_extents = (unsigned) div64_u64(len + BTRFS_MAX_EXTENT_SIZE - 1,
+ BTRFS_MAX_EXTENT_SIZE);
+ /*
+ * If we have an outstanding_extents count still set then we're
+ * within our reservation, otherwise we need to adjust our inode
+ * counter appropriately.
+ */
+ if (dio_data->outstanding_extents) {
+ dio_data->outstanding_extents -= num_extents;
+ } else {
+ spin_lock(&BTRFS_I(inode)->lock);
+ BTRFS_I(inode)->outstanding_extents += num_extents;
+ spin_unlock(&BTRFS_I(inode)->lock);
+ }
+}
static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
struct buffer_head *bh_result, int create)
@@ -7415,10 +7537,10 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
struct extent_map *em;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_state *cached_state = NULL;
+ struct btrfs_dio_data *dio_data = NULL;
u64 start = iblock << inode->i_blkbits;
u64 lockstart, lockend;
u64 len = bh_result->b_size;
- u64 *outstanding_extents = NULL;
int unlock_bits = EXTENT_LOCKED;
int ret = 0;
@@ -7436,7 +7558,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
* that anything that needs to check if there's a transction doesn't get
* confused.
*/
- outstanding_extents = current->journal_info;
+ dio_data = current->journal_info;
current->journal_info = NULL;
}
@@ -7444,8 +7566,11 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
* If this errors out it's because we couldn't invalidate pagecache for
* this range and we need to fallback to buffered.
*/
- if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
- return -ENOTBLK;
+ if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
+ create)) {
+ ret = -ENOTBLK;
+ goto err;
+ }
em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
if (IS_ERR(em)) {
@@ -7563,22 +7688,11 @@ unlock:
if (start + len > i_size_read(inode))
i_size_write(inode, start + len);
- /*
- * If we have an outstanding_extents count still set then we're
- * within our reservation, otherwise we need to adjust our inode
- * counter appropriately.
- */
- if (*outstanding_extents) {
- (*outstanding_extents)--;
- } else {
- spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->outstanding_extents++;
- spin_unlock(&BTRFS_I(inode)->lock);
- }
-
- current->journal_info = outstanding_extents;
- btrfs_free_reserved_data_space(inode, len);
- set_bit(BTRFS_INODE_DIO_READY, &BTRFS_I(inode)->runtime_flags);
+ adjust_dio_outstanding_extents(inode, dio_data, len);
+ btrfs_free_reserved_data_space(inode, start, len);
+ WARN_ON(dio_data->reserve < len);
+ dio_data->reserve -= len;
+ current->journal_info = dio_data;
}
/*
@@ -7601,8 +7715,17 @@ unlock:
unlock_err:
clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
unlock_bits, 1, 0, &cached_state, GFP_NOFS);
- if (outstanding_extents)
- current->journal_info = outstanding_extents;
+err:
+ if (dio_data)
+ current->journal_info = dio_data;
+ /*
+ * Compensate the delalloc release we do in btrfs_direct_IO() when we
+ * write less data then expected, so that we don't underflow our inode's
+ * outstanding extents counter.
+ */
+ if (create && dio_data)
+ adjust_dio_outstanding_extents(inode, dio_data, len);
+
return ret;
}
@@ -8329,7 +8452,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
- u64 outstanding_extents = 0;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_dio_data dio_data = { 0 };
size_t count = 0;
int flags = 0;
bool wakeup = true;
@@ -8364,10 +8488,10 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
mutex_unlock(&inode->i_mutex);
relock = true;
}
- ret = btrfs_delalloc_reserve_space(inode, count);
+ ret = btrfs_delalloc_reserve_space(inode, offset, count);
if (ret)
goto out;
- outstanding_extents = div64_u64(count +
+ dio_data.outstanding_extents = div64_u64(count +
BTRFS_MAX_EXTENT_SIZE - 1,
BTRFS_MAX_EXTENT_SIZE);
@@ -8376,7 +8500,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
* do the accounting properly if we go over the number we
* originally calculated. Abuse current->journal_info for this.
*/
- current->journal_info = &outstanding_extents;
+ dio_data.reserve = round_up(count, root->sectorsize);
+ current->journal_info = &dio_data;
} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
&BTRFS_I(inode)->runtime_flags)) {
inode_dio_end(inode);
@@ -8391,18 +8516,11 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
if (iov_iter_rw(iter) == WRITE) {
current->journal_info = NULL;
if (ret < 0 && ret != -EIOCBQUEUED) {
- /*
- * If the error comes from submitting stage,
- * btrfs_get_blocsk_direct() has free'd data space,
- * and metadata space will be handled by
- * finish_ordered_fn, don't do that again to make
- * sure bytes_may_use is correct.
- */
- if (!test_and_clear_bit(BTRFS_INODE_DIO_READY,
- &BTRFS_I(inode)->runtime_flags))
- btrfs_delalloc_release_space(inode, count);
+ if (dio_data.reserve)
+ btrfs_delalloc_release_space(inode, offset,
+ dio_data.reserve);
} else if (ret >= 0 && (size_t)ret < count)
- btrfs_delalloc_release_space(inode,
+ btrfs_delalloc_release_space(inode, offset,
count - (size_t)ret);
}
out:
@@ -8561,6 +8679,18 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
}
}
+ /*
+ * Qgroup reserved space handler
+ * Page here will be either
+ * 1) Already written to disk
+ * In this case, its reserved space is released from data rsv map
+ * and will be freed by delayed_ref handler finally.
+ * So even we call qgroup_free_data(), it won't decrease reserved
+ * space.
+ * 2) Not written to disk
+ * This means the reserved space should be freed here.
+ */
+ btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE);
if (!inode_evicting) {
clear_extent_bit(tree, page_start, page_end,
EXTENT_LOCKED | EXTENT_DIRTY |
@@ -8611,7 +8741,11 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
u64 page_end;
sb_start_pagefault(inode->i_sb);
- ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+ page_start = page_offset(page);
+ page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+ ret = btrfs_delalloc_reserve_space(inode, page_start,
+ PAGE_CACHE_SIZE);
if (!ret) {
ret = file_update_time(vma->vm_file);
reserved = 1;
@@ -8630,8 +8764,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
again:
lock_page(page);
size = i_size_read(inode);
- page_start = page_offset(page);
- page_end = page_start + PAGE_CACHE_SIZE - 1;
if ((page->mapping != inode->i_mapping) ||
(page_start >= size)) {
@@ -8708,7 +8840,7 @@ out_unlock:
}
unlock_page(page);
out:
- btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+ btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE);
out_noreserve:
sb_end_pagefault(inode->i_sb);
return ret;
@@ -8997,6 +9129,7 @@ void btrfs_destroy_inode(struct inode *inode)
btrfs_put_ordered_extent(ordered);
}
}
+ btrfs_qgroup_check_reserved_leak(inode);
inode_tree_del(inode);
btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
free:
@@ -9633,6 +9766,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
u64 cur_offset = start;
u64 i_size;
u64 cur_bytes;
+ u64 last_alloc = (u64)-1;
int ret = 0;
bool own_trans = true;
@@ -9649,6 +9783,13 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
cur_bytes = max(cur_bytes, min_size);
+ /*
+ * If we are severely fragmented we could end up with really
+ * small allocations, so if the allocator is returning small
+ * chunks lets make its job easier by only searching for those
+ * sized chunks.
+ */
+ cur_bytes = min(cur_bytes, last_alloc);
ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
*alloc_hint, &ins, 1, 0);
if (ret) {
@@ -9657,6 +9798,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
break;
}
+ last_alloc = ins.offset;
ret = insert_reserved_file_extent(trans, inode,
cur_offset, ins.objectid,
ins.offset, ins.offset,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0adf5422fce9..da94138eb85e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1120,7 +1120,8 @@ static int cluster_pages_for_defrag(struct inode *inode,
page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
ret = btrfs_delalloc_reserve_space(inode,
- page_cnt << PAGE_CACHE_SHIFT);
+ start_index << PAGE_CACHE_SHIFT,
+ page_cnt << PAGE_CACHE_SHIFT);
if (ret)
return ret;
i_done = 0;
@@ -1210,7 +1211,8 @@ again:
BTRFS_I(inode)->outstanding_extents++;
spin_unlock(&BTRFS_I(inode)->lock);
btrfs_delalloc_release_space(inode,
- (page_cnt - i_done) << PAGE_CACHE_SHIFT);
+ start_index << PAGE_CACHE_SHIFT,
+ (page_cnt - i_done) << PAGE_CACHE_SHIFT);
}
@@ -1235,7 +1237,9 @@ out:
unlock_page(pages[i]);
page_cache_release(pages[i]);
}
- btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT);
+ btrfs_delalloc_release_space(inode,
+ start_index << PAGE_CACHE_SHIFT,
+ page_cnt << PAGE_CACHE_SHIFT);
return ret;
}
@@ -1342,7 +1346,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
break;
if (btrfs_defrag_cancelled(root->fs_info)) {
- printk(KERN_DEBUG "BTRFS: defrag_file cancelled\n");
+ btrfs_debug(root->fs_info, "defrag_file cancelled");
ret = -EAGAIN;
break;
}
@@ -1579,7 +1583,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
new_size = div_u64(new_size, root->sectorsize);
new_size *= root->sectorsize;
- printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n",
+ btrfs_info_in_rcu(root->fs_info, "new size for %s is %llu",
rcu_str_deref(device->name), new_size);
if (new_size > old_size) {
@@ -2081,7 +2085,7 @@ static noinline int search_ioctl(struct inode *inode,
key.offset = (u64)-1;
root = btrfs_read_fs_root_no_name(info, &key);
if (IS_ERR(root)) {
- printk(KERN_ERR "BTRFS: could not find root %llu\n",
+ btrfs_err(info, "could not find root %llu",
sk->tree_id);
btrfs_free_path(path);
return -ENOENT;
@@ -2221,7 +2225,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
key.offset = (u64)-1;
root = btrfs_read_fs_root_no_name(info, &key);
if (IS_ERR(root)) {
- printk(KERN_ERR "BTRFS: could not find root %llu\n", tree_id);
+ btrfs_err(info, "could not find root %llu", tree_id);
ret = -ENOENT;
goto out;
}
@@ -2699,7 +2703,6 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
{
struct btrfs_ioctl_fs_info_args *fi_args;
struct btrfs_device *device;
- struct btrfs_device *next;
struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
int ret = 0;
@@ -2711,7 +2714,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
fi_args->num_devices = fs_devices->num_devices;
memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid));
- list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
+ list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (device->devid > fi_args->max_id)
fi_args->max_id = device->devid;
}
@@ -3203,41 +3206,6 @@ out:
return ret;
}
-/* Helper to check and see if this root currently has a ref on the given disk
- * bytenr. If it does then we need to update the quota for this root. This
- * doesn't do anything if quotas aren't enabled.
- */
-static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
- u64 disko)
-{
- struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
- struct ulist *roots;
- struct ulist_iterator uiter;
- struct ulist_node *root_node = NULL;
- int ret;
-
- if (!root->fs_info->quota_enabled)
- return 1;
-
- btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
- ret = btrfs_find_all_roots(trans, root->fs_info, disko,
- tree_mod_seq_elem.seq, &roots);
- if (ret < 0)
- goto out;
- ret = 0;
- ULIST_ITER_INIT(&uiter);
- while ((root_node = ulist_next(roots, &uiter))) {
- if (root_node->val == root->objectid) {
- ret = 1;
- break;
- }
- }
- ulist_free(roots);
-out:
- btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
- return ret;
-}
-
static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
struct inode *inode,
u64 endoff,
@@ -3328,6 +3296,150 @@ static void clone_update_extent_map(struct inode *inode,
&BTRFS_I(inode)->runtime_flags);
}
+/*
+ * Make sure we do not end up inserting an inline extent into a file that has
+ * already other (non-inline) extents. If a file has an inline extent it can
+ * not have any other extents and the (single) inline extent must start at the
+ * file offset 0. Failing to respect these rules will lead to file corruption,
+ * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc
+ *
+ * We can have extents that have been already written to disk or we can have
+ * dirty ranges still in delalloc, in which case the extent maps and items are
+ * created only when we run delalloc, and the delalloc ranges might fall outside
+ * the range we are currently locking in the inode's io tree. So we check the
+ * inode's i_size because of that (i_size updates are done while holding the
+ * i_mutex, which we are holding here).
+ * We also check to see if the inode has a size not greater than "datal" but has
+ * extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are
+ * protected against such concurrent fallocate calls by the i_mutex).
+ *
+ * If the file has no extents but a size greater than datal, do not allow the
+ * copy because we would need turn the inline extent into a non-inline one (even
+ * with NO_HOLES enabled). If we find our destination inode only has one inline
+ * extent, just overwrite it with the source inline extent if its size is less
+ * than the source extent's size, or we could copy the source inline extent's
+ * data into the destination inode's inline extent if the later is greater then
+ * the former.
+ */
+static int clone_copy_inline_extent(struct inode *src,
+ struct inode *dst,
+ struct btrfs_trans_handle *trans,
+ struct btrfs_path *path,
+ struct btrfs_key *new_key,
+ const u64 drop_start,
+ const u64 datal,
+ const u64 skip,
+ const u64 size,
+ char *inline_data)
+{
+ struct btrfs_root *root = BTRFS_I(dst)->root;
+ const u64 aligned_end = ALIGN(new_key->offset + datal,
+ root->sectorsize);
+ int ret;
+ struct btrfs_key key;
+
+ if (new_key->offset > 0)
+ return -EOPNOTSUPP;
+
+ key.objectid = btrfs_ino(dst);
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ else if (ret > 0)
+ goto copy_inline_extent;
+ }
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ if (key.objectid == btrfs_ino(dst) &&
+ key.type == BTRFS_EXTENT_DATA_KEY) {
+ ASSERT(key.offset > 0);
+ return -EOPNOTSUPP;
+ }
+ } else if (i_size_read(dst) <= datal) {
+ struct btrfs_file_extent_item *ei;
+ u64 ext_len;
+
+ /*
+ * If the file size is <= datal, make sure there are no other
+ * extents following (can happen do to an fallocate call with
+ * the flag FALLOC_FL_KEEP_SIZE).
+ */
+ ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+ struct btrfs_file_extent_item);
+ /*
+ * If it's an inline extent, it can not have other extents
+ * following it.
+ */
+ if (btrfs_file_extent_type(path->nodes[0], ei) ==
+ BTRFS_FILE_EXTENT_INLINE)
+ goto copy_inline_extent;
+
+ ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
+ if (ext_len > aligned_end)
+ return -EOPNOTSUPP;
+
+ ret = btrfs_next_item(root, path);
+ if (ret < 0) {
+ return ret;
+ } else if (ret == 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key,
+ path->slots[0]);
+ if (key.objectid == btrfs_ino(dst) &&
+ key.type == BTRFS_EXTENT_DATA_KEY)
+ return -EOPNOTSUPP;
+ }
+ }
+
+copy_inline_extent:
+ /*
+ * We have no extent items, or we have an extent at offset 0 which may
+ * or may not be inlined. All these cases are dealt the same way.
+ */
+ if (i_size_read(dst) > datal) {
+ /*
+ * If the destination inode has an inline extent...
+ * This would require copying the data from the source inline
+ * extent into the beginning of the destination's inline extent.
+ * But this is really complex, both extents can be compressed
+ * or just one of them, which would require decompressing and
+ * re-compressing data (which could increase the new compressed
+ * size, not allowing the compressed data to fit anymore in an
+ * inline extent).
+ * So just don't support this case for now (it should be rare,
+ * we are not really saving space when cloning inline extents).
+ */
+ return -EOPNOTSUPP;
+ }
+
+ btrfs_release_path(path);
+ ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
+ if (ret)
+ return ret;
+ ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
+ if (ret)
+ return ret;
+
+ if (skip) {
+ const u32 start = btrfs_file_extent_calc_inline_size(0);
+
+ memmove(inline_data + start, inline_data + start + skip, datal);
+ }
+
+ write_extent_buffer(path->nodes[0], inline_data,
+ btrfs_item_ptr_offset(path->nodes[0],
+ path->slots[0]),
+ size);
+ inode_add_bytes(dst, datal);
+
+ return 0;
+}
+
/**
* btrfs_clone() - clone a range from inode file to another
*
@@ -3352,9 +3464,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
u32 nritems;
int slot;
int ret;
- int no_quota;
const u64 len = olen_aligned;
- u64 last_disko = 0;
u64 last_dest_end = destoff;
ret = -ENOMEM;
@@ -3400,7 +3510,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
nritems = btrfs_header_nritems(path->nodes[0]);
process_slot:
- no_quota = 1;
if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
if (ret < 0)
@@ -3552,35 +3661,13 @@ process_slot:
btrfs_set_file_extent_num_bytes(leaf, extent,
datal);
- /*
- * We need to look up the roots that point at
- * this bytenr and see if the new root does. If
- * it does not we need to make sure we update
- * quotas appropriately.
- */
- if (disko && root != BTRFS_I(src)->root &&
- disko != last_disko) {
- no_quota = check_ref(trans, root,
- disko);
- if (no_quota < 0) {
- btrfs_abort_transaction(trans,
- root,
- ret);
- btrfs_end_transaction(trans,
- root);
- ret = no_quota;
- goto out;
- }
- }
-
if (disko) {
inode_add_bytes(inode, datal);
ret = btrfs_inc_extent_ref(trans, root,
disko, diskl, 0,
root->root_key.objectid,
btrfs_ino(inode),
- new_key.offset - datao,
- no_quota);
+ new_key.offset - datao);
if (ret) {
btrfs_abort_transaction(trans,
root,
@@ -3594,21 +3681,6 @@ process_slot:
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
u64 skip = 0;
u64 trim = 0;
- u64 aligned_end = 0;
-
- /*
- * Don't copy an inline extent into an offset
- * greater than zero. Having an inline extent
- * at such an offset results in chaos as btrfs
- * isn't prepared for such cases. Just skip
- * this case for the same reasons as commented
- * at btrfs_ioctl_clone().
- */
- if (last_dest_end > 0) {
- ret = -EOPNOTSUPP;
- btrfs_end_transaction(trans, root);
- goto out;
- }
if (off > key.offset) {
skip = off - key.offset;
@@ -3626,42 +3698,22 @@ process_slot:
size -= skip + trim;
datal -= skip + trim;
- aligned_end = ALIGN(new_key.offset + datal,
- root->sectorsize);
- ret = btrfs_drop_extents(trans, root, inode,
- drop_start,
- aligned_end,
- 1);
+ ret = clone_copy_inline_extent(src, inode,
+ trans, path,
+ &new_key,
+ drop_start,
+ datal,
+ skip, size, buf);
if (ret) {
if (ret != -EOPNOTSUPP)
btrfs_abort_transaction(trans,
- root, ret);
- btrfs_end_transaction(trans, root);
- goto out;
- }
-
- ret = btrfs_insert_empty_item(trans, root, path,
- &new_key, size);
- if (ret) {
- btrfs_abort_transaction(trans, root,
- ret);
+ root,
+ ret);
btrfs_end_transaction(trans, root);
goto out;
}
-
- if (skip) {
- u32 start =
- btrfs_file_extent_calc_inline_size(0);
- memmove(buf+start, buf+start+skip,
- datal);
- }
-
leaf = path->nodes[0];
slot = path->slots[0];
- write_extent_buffer(leaf, buf,
- btrfs_item_ptr_offset(leaf, slot),
- size);
- inode_add_bytes(inode, datal);
}
/* If we have an implicit hole (NO_HOLES feature). */
@@ -4639,6 +4691,11 @@ locked:
bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
}
+ if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
+ ret = -EINVAL;
+ goto out_bctl;
+ }
+
do_balance:
/*
* Ownership of bctl and mutually_exclusive_operation_running
@@ -4650,12 +4707,15 @@ do_balance:
need_unlock = false;
ret = btrfs_balance(bctl, bargs);
+ bctl = NULL;
if (arg) {
if (copy_to_user(arg, bargs, sizeof(*bargs)))
ret = -EFAULT;
}
+out_bctl:
+ kfree(bctl);
out_bargs:
kfree(bargs);
out_unlock:
@@ -4806,7 +4866,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
/* update qgroup status and info */
err = btrfs_run_qgroups(trans, root->fs_info);
if (err < 0)
- btrfs_error(root->fs_info, ret,
+ btrfs_std_error(root->fs_info, ret,
"failed to update qgroup status and info\n");
err = btrfs_end_transaction(trans, root);
if (err && !ret)
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index d7e6baf1b205..8077461fc56a 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -79,6 +79,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
write_lock(&eb->lock);
WARN_ON(atomic_read(&eb->spinning_writers));
atomic_inc(&eb->spinning_writers);
+ /*
+ * atomic_dec_and_test implies a barrier for waitqueue_active
+ */
if (atomic_dec_and_test(&eb->blocking_writers) &&
waitqueue_active(&eb->write_lock_wq))
wake_up(&eb->write_lock_wq);
@@ -86,6 +89,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
BUG_ON(atomic_read(&eb->blocking_readers) == 0);
read_lock(&eb->lock);
atomic_inc(&eb->spinning_readers);
+ /*
+ * atomic_dec_and_test implies a barrier for waitqueue_active
+ */
if (atomic_dec_and_test(&eb->blocking_readers) &&
waitqueue_active(&eb->read_lock_wq))
wake_up(&eb->read_lock_wq);
@@ -229,6 +235,9 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
}
btrfs_assert_tree_read_locked(eb);
WARN_ON(atomic_read(&eb->blocking_readers) == 0);
+ /*
+ * atomic_dec_and_test implies a barrier for waitqueue_active
+ */
if (atomic_dec_and_test(&eb->blocking_readers) &&
waitqueue_active(&eb->read_lock_wq))
wake_up(&eb->read_lock_wq);
@@ -280,6 +289,9 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
if (blockers) {
WARN_ON(atomic_read(&eb->spinning_writers));
atomic_dec(&eb->blocking_writers);
+ /*
+ * Make sure counter is updated before we wake up waiters.
+ */
smp_mb();
if (waitqueue_active(&eb->write_lock_wq))
wake_up(&eb->write_lock_wq);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 52170cf1757e..8c27292ea9ea 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -345,6 +345,9 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
if (entry->bytes_left == 0) {
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+ /*
+ * Implicit memory barrier after test_and_set_bit
+ */
if (waitqueue_active(&entry->wait))
wake_up(&entry->wait);
} else {
@@ -409,6 +412,9 @@ have_entry:
if (entry->bytes_left == 0) {
ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+ /*
+ * Implicit memory barrier after test_and_set_bit
+ */
if (waitqueue_active(&entry->wait))
wake_up(&entry->wait);
} else {
@@ -484,15 +490,16 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
spin_lock_irq(&log->log_extents_lock[index]);
while (!list_empty(&log->logged_list[index])) {
+ struct inode *inode;
ordered = list_first_entry(&log->logged_list[index],
struct btrfs_ordered_extent,
log_list);
list_del_init(&ordered->log_list);
+ inode = ordered->inode;
spin_unlock_irq(&log->log_extents_lock[index]);
if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
!test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
- struct inode *inode = ordered->inode;
u64 start = ordered->file_offset;
u64 end = ordered->file_offset + ordered->len - 1;
@@ -503,20 +510,25 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
&ordered->flags));
/*
- * If our ordered extent completed it means it updated the
- * fs/subvol and csum trees already, so no need to make the
- * current transaction's commit wait for it, as we end up
- * holding memory unnecessarily and delaying the inode's iput
- * until the transaction commit (we schedule an iput for the
- * inode when the ordered extent's refcount drops to 0), which
- * prevents it from being evictable until the transaction
- * commits.
+ * In order to keep us from losing our ordered extent
+ * information when committing the transaction we have to make
+ * sure that any logged extents are completed when we go to
+ * commit the transaction. To do this we simply increase the
+ * current transactions pending_ordered counter and decrement it
+ * when the ordered extent completes.
*/
- if (test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags))
- btrfs_put_ordered_extent(ordered);
- else
- list_add_tail(&ordered->trans_list, &trans->ordered);
-
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+ struct btrfs_ordered_inode_tree *tree;
+
+ tree = &BTRFS_I(inode)->ordered_tree;
+ spin_lock_irq(&tree->lock);
+ if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
+ set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
+ atomic_inc(&trans->transaction->pending_ordered);
+ }
+ spin_unlock_irq(&tree->lock);
+ }
+ btrfs_put_ordered_extent(ordered);
spin_lock_irq(&log->log_extents_lock[index]);
}
spin_unlock_irq(&log->log_extents_lock[index]);
@@ -578,6 +590,7 @@ void btrfs_remove_ordered_extent(struct inode *inode,
struct btrfs_ordered_inode_tree *tree;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct rb_node *node;
+ bool dec_pending_ordered = false;
tree = &BTRFS_I(inode)->ordered_tree;
spin_lock_irq(&tree->lock);
@@ -587,8 +600,37 @@ void btrfs_remove_ordered_extent(struct inode *inode,
if (tree->last == node)
tree->last = NULL;
set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+ if (test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags))
+ dec_pending_ordered = true;
spin_unlock_irq(&tree->lock);
+ /*
+ * The current running transaction is waiting on us, we need to let it
+ * know that we're complete and wake it up.
+ */
+ if (dec_pending_ordered) {
+ struct btrfs_transaction *trans;
+
+ /*
+ * The checks for trans are just a formality, it should be set,
+ * but if it isn't we don't want to deref/assert under the spin
+ * lock, so be nice and check if trans is set, but ASSERT() so
+ * if it isn't set a developer will notice.
+ */
+ spin_lock(&root->fs_info->trans_lock);
+ trans = root->fs_info->running_transaction;
+ if (trans)
+ atomic_inc(&trans->use_count);
+ spin_unlock(&root->fs_info->trans_lock);
+
+ ASSERT(trans);
+ if (trans) {
+ if (atomic_dec_and_test(&trans->pending_ordered))
+ wake_up(&trans->pending_wait);
+ btrfs_put_transaction(trans);
+ }
+ }
+
spin_lock(&root->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
root->nr_ordered_extents--;
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 7176cc0fe43f..23c96059cef2 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -73,6 +73,8 @@ struct btrfs_ordered_sum {
#define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent
* in the logging code. */
+#define BTRFS_ORDERED_PENDING 11 /* We are waiting for this ordered extent to
+ * complete in the current transaction. */
struct btrfs_ordered_extent {
/* logical offset in the file */
u64 file_offset;
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index dca137b04095..f9e60231f685 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -49,18 +49,16 @@ static struct prop_handler prop_handlers[] = {
.extract = prop_compression_extract,
.inheritable = 1
},
- {
- .xattr_name = NULL
- }
};
void __init btrfs_props_init(void)
{
- struct prop_handler *p;
+ int i;
hash_init(prop_handlers_ht);
- for (p = &prop_handlers[0]; p->xattr_name; p++) {
+ for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
+ struct prop_handler *p = &prop_handlers[i];
u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name));
hash_add(prop_handlers_ht, &p->node, h);
@@ -301,15 +299,16 @@ static int inherit_props(struct btrfs_trans_handle *trans,
struct inode *inode,
struct inode *parent)
{
- const struct prop_handler *h;
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
+ int i;
if (!test_bit(BTRFS_INODE_HAS_PROPS,
&BTRFS_I(parent)->runtime_flags))
return 0;
- for (h = &prop_handlers[0]; h->xattr_name; h++) {
+ for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
+ const struct prop_handler *h = &prop_handlers[i];
const char *value;
u64 num_bytes;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index d904ee1c5349..93e12c18ffd7 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1652,10 +1652,6 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
}
}
- /* For exclusive extent, free its reserved bytes too */
- if (nr_old_roots == 0 && nr_new_roots == 1 &&
- cur_new_count == nr_new_roots)
- qg->reserved -= num_bytes;
if (dirty)
qgroup_dirty(fs_info, qg);
}
@@ -2035,7 +2031,7 @@ out:
return ret;
}
-int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
+static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
{
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
@@ -2116,14 +2112,13 @@ out:
return ret;
}
-void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes)
+void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
+ u64 ref_root, u64 num_bytes)
{
struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
- struct btrfs_fs_info *fs_info = root->fs_info;
struct ulist_node *unode;
struct ulist_iterator uiter;
- u64 ref_root = root->root_key.objectid;
int ret = 0;
if (!is_fstree(ref_root))
@@ -2169,6 +2164,11 @@ out:
spin_unlock(&fs_info->qgroup_lock);
}
+static inline void qgroup_free(struct btrfs_root *root, u64 num_bytes)
+{
+ return btrfs_qgroup_free_refroot(root->fs_info, root->objectid,
+ num_bytes);
+}
void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
{
if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
@@ -2188,17 +2188,16 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
*/
static int
qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
- struct btrfs_trans_handle *trans,
- struct extent_buffer *scratch_leaf)
+ struct btrfs_trans_handle *trans)
{
struct btrfs_key found;
+ struct extent_buffer *scratch_leaf = NULL;
struct ulist *roots = NULL;
struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
u64 num_bytes;
int slot;
int ret;
- path->leave_spinning = 1;
mutex_lock(&fs_info->qgroup_rescan_lock);
ret = btrfs_search_slot_for_read(fs_info->extent_root,
&fs_info->qgroup_rescan_progress,
@@ -2229,7 +2228,15 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
- memcpy(scratch_leaf, path->nodes[0], sizeof(*scratch_leaf));
+ scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
+ if (!scratch_leaf) {
+ ret = -ENOMEM;
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ goto out;
+ }
+ extent_buffer_get(scratch_leaf);
+ btrfs_tree_read_lock(scratch_leaf);
+ btrfs_set_lock_blocking_rw(scratch_leaf, BTRFS_READ_LOCK);
slot = path->slots[0];
btrfs_release_path(path);
mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -2255,6 +2262,10 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
goto out;
}
out:
+ if (scratch_leaf) {
+ btrfs_tree_read_unlock_blocking(scratch_leaf);
+ free_extent_buffer(scratch_leaf);
+ }
btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
return ret;
@@ -2266,19 +2277,15 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
qgroup_rescan_work);
struct btrfs_path *path;
struct btrfs_trans_handle *trans = NULL;
- struct extent_buffer *scratch_leaf = NULL;
int err = -ENOMEM;
int ret = 0;
path = btrfs_alloc_path();
if (!path)
goto out;
- scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS);
- if (!scratch_leaf)
- goto out;
err = 0;
- while (!err) {
+ while (!err && !btrfs_fs_closing(fs_info)) {
trans = btrfs_start_transaction(fs_info->fs_root, 0);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
@@ -2287,8 +2294,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
if (!fs_info->quota_enabled) {
err = -EINTR;
} else {
- err = qgroup_rescan_leaf(fs_info, path, trans,
- scratch_leaf);
+ err = qgroup_rescan_leaf(fs_info, path, trans);
}
if (err > 0)
btrfs_commit_transaction(trans, fs_info->fs_root);
@@ -2297,11 +2303,11 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
}
out:
- kfree(scratch_leaf);
btrfs_free_path(path);
mutex_lock(&fs_info->qgroup_rescan_lock);
- fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ if (!btrfs_fs_closing(fs_info))
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
if (err > 0 &&
fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
@@ -2330,7 +2336,9 @@ out:
}
btrfs_end_transaction(trans, fs_info->quota_root);
- if (err >= 0) {
+ if (btrfs_fs_closing(fs_info)) {
+ btrfs_info(fs_info, "qgroup scan paused");
+ } else if (err >= 0) {
btrfs_info(fs_info, "qgroup scan completed%s",
err > 0 ? " (inconsistency flag cleared)" : "");
} else {
@@ -2378,12 +2386,11 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
memset(&fs_info->qgroup_rescan_progress, 0,
sizeof(fs_info->qgroup_rescan_progress));
fs_info->qgroup_rescan_progress.objectid = progress_objectid;
+ init_completion(&fs_info->qgroup_rescan_completion);
spin_unlock(&fs_info->qgroup_lock);
mutex_unlock(&fs_info->qgroup_rescan_lock);
- init_completion(&fs_info->qgroup_rescan_completion);
-
memset(&fs_info->qgroup_rescan_work, 0,
sizeof(fs_info->qgroup_rescan_work));
btrfs_init_work(&fs_info->qgroup_rescan_work,
@@ -2486,3 +2493,190 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
btrfs_queue_work(fs_info->qgroup_rescan_workers,
&fs_info->qgroup_rescan_work);
}
+
+/*
+ * Reserve qgroup space for range [start, start + len).
+ *
+ * This function will either reserve space from related qgroups or doing
+ * nothing if the range is already reserved.
+ *
+ * Return 0 for successful reserve
+ * Return <0 for error (including -EQUOT)
+ *
+ * NOTE: this function may sleep for memory allocation.
+ */
+int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len)
+{
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct extent_changeset changeset;
+ struct ulist_node *unode;
+ struct ulist_iterator uiter;
+ int ret;
+
+ if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) ||
+ len == 0)
+ return 0;
+
+ changeset.bytes_changed = 0;
+ changeset.range_changed = ulist_alloc(GFP_NOFS);
+ ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
+ start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS,
+ &changeset);
+ trace_btrfs_qgroup_reserve_data(inode, start, len,
+ changeset.bytes_changed,
+ QGROUP_RESERVE);
+ if (ret < 0)
+ goto cleanup;
+ ret = qgroup_reserve(root, changeset.bytes_changed);
+ if (ret < 0)
+ goto cleanup;
+
+ ulist_free(changeset.range_changed);
+ return ret;
+
+cleanup:
+ /* cleanup already reserved ranges */
+ ULIST_ITER_INIT(&uiter);
+ while ((unode = ulist_next(changeset.range_changed, &uiter)))
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
+ unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL,
+ GFP_NOFS);
+ ulist_free(changeset.range_changed);
+ return ret;
+}
+
+static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len,
+ int free)
+{
+ struct extent_changeset changeset;
+ int trace_op = QGROUP_RELEASE;
+ int ret;
+
+ changeset.bytes_changed = 0;
+ changeset.range_changed = ulist_alloc(GFP_NOFS);
+ if (!changeset.range_changed)
+ return -ENOMEM;
+
+ ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
+ start + len -1, EXTENT_QGROUP_RESERVED, GFP_NOFS,
+ &changeset);
+ if (ret < 0)
+ goto out;
+
+ if (free) {
+ qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed);
+ trace_op = QGROUP_FREE;
+ }
+ trace_btrfs_qgroup_release_data(inode, start, len,
+ changeset.bytes_changed, trace_op);
+out:
+ ulist_free(changeset.range_changed);
+ return ret;
+}
+
+/*
+ * Free a reserved space range from io_tree and related qgroups
+ *
+ * Should be called when a range of pages get invalidated before reaching disk.
+ * Or for error cleanup case.
+ *
+ * For data written to disk, use btrfs_qgroup_release_data().
+ *
+ * NOTE: This function may sleep for memory allocation.
+ */
+int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len)
+{
+ return __btrfs_qgroup_release_data(inode, start, len, 1);
+}
+
+/*
+ * Release a reserved space range from io_tree only.
+ *
+ * Should be called when a range of pages get written to disk and corresponding
+ * FILE_EXTENT is inserted into corresponding root.
+ *
+ * Since new qgroup accounting framework will only update qgroup numbers at
+ * commit_transaction() time, its reserved space shouldn't be freed from
+ * related qgroups.
+ *
+ * But we should release the range from io_tree, to allow further write to be
+ * COWed.
+ *
+ * NOTE: This function may sleep for memory allocation.
+ */
+int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
+{
+ return __btrfs_qgroup_release_data(inode, start, len, 0);
+}
+
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes)
+{
+ int ret;
+
+ if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) ||
+ num_bytes == 0)
+ return 0;
+
+ BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
+ ret = qgroup_reserve(root, num_bytes);
+ if (ret < 0)
+ return ret;
+ atomic_add(num_bytes, &root->qgroup_meta_rsv);
+ return ret;
+}
+
+void btrfs_qgroup_free_meta_all(struct btrfs_root *root)
+{
+ int reserved;
+
+ if (!root->fs_info->quota_enabled || !is_fstree(root->objectid))
+ return;
+
+ reserved = atomic_xchg(&root->qgroup_meta_rsv, 0);
+ if (reserved == 0)
+ return;
+ qgroup_free(root, reserved);
+}
+
+void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
+{
+ if (!root->fs_info->quota_enabled || !is_fstree(root->objectid))
+ return;
+
+ BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
+ WARN_ON(atomic_read(&root->qgroup_meta_rsv) < num_bytes);
+ atomic_sub(num_bytes, &root->qgroup_meta_rsv);
+ qgroup_free(root, num_bytes);
+}
+
+/*
+ * Check qgroup reserved space leaking, normally at destory inode
+ * time
+ */
+void btrfs_qgroup_check_reserved_leak(struct inode *inode)
+{
+ struct extent_changeset changeset;
+ struct ulist_node *unode;
+ struct ulist_iterator iter;
+ int ret;
+
+ changeset.bytes_changed = 0;
+ changeset.range_changed = ulist_alloc(GFP_NOFS);
+ if (WARN_ON(!changeset.range_changed))
+ return;
+
+ ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+ EXTENT_QGROUP_RESERVED, GFP_NOFS, &changeset);
+
+ WARN_ON(ret < 0);
+ if (WARN_ON(changeset.bytes_changed)) {
+ ULIST_ITER_INIT(&iter);
+ while ((unode = ulist_next(changeset.range_changed, &iter))) {
+ btrfs_warn(BTRFS_I(inode)->root->fs_info,
+ "leaking qgroup reserved space, ino: %lu, start: %llu, end: %llu",
+ inode->i_ino, unode->val, unode->aux);
+ }
+ qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed);
+ }
+ ulist_free(changeset.range_changed);
+}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 6387dcfa354c..ecb2c143ef75 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -33,6 +33,13 @@ struct btrfs_qgroup_extent_record {
struct ulist *old_roots;
};
+/*
+ * For qgroup event trace points only
+ */
+#define QGROUP_RESERVE (1<<0)
+#define QGROUP_RELEASE (1<<1)
+#define QGROUP_FREE (1<<2)
+
int btrfs_quota_enable(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
int btrfs_quota_disable(struct btrfs_trans_handle *trans,
@@ -71,9 +78,18 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
struct btrfs_qgroup_inherit *inherit);
-int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
-void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
-
+void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
+ u64 ref_root, u64 num_bytes);
+/*
+ * TODO: Add proper trace point for it, as btrfs_qgroup_free() is
+ * called by everywhere, can't provide good trace for delayed ref case.
+ */
+static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
+ u64 ref_root, u64 num_bytes)
+{
+ btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes);
+ trace_btrfs_qgroup_free_delayed_ref(ref_root, num_bytes);
+}
void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -81,4 +97,13 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
u64 rfer, u64 excl);
#endif
+/* New io_tree based accurate qgroup reserve API */
+int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len);
+int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
+int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len);
+
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes);
+void btrfs_qgroup_free_meta_all(struct btrfs_root *root);
+void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes);
+void btrfs_qgroup_check_reserved_leak(struct inode *inode);
#endif /* __BTRFS_QGROUP__ */
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index fcf7265ca46f..1a33d3eb36de 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -810,7 +810,11 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
}
goto done_nolock;
- } else if (waitqueue_active(&h->wait)) {
+ /*
+ * The barrier for this waitqueue_active is not needed,
+ * we're protected by h->lock and can't miss a wakeup.
+ */
+ } else if (waitqueue_active(&h->wait)) {
spin_unlock(&rbio->bio_list_lock);
spin_unlock_irqrestore(&h->lock, flags);
wake_up(&h->wait);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 4645cd16d5ba..619f92963e27 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -569,7 +569,7 @@ static int reada_add_block(struct reada_control *rc, u64 logical,
rec = kzalloc(sizeof(*rec), GFP_NOFS);
if (!rec) {
reada_extent_put(root->fs_info, re);
- return -1;
+ return -ENOMEM;
}
rec->rc = rc;
@@ -918,6 +918,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
u64 start;
u64 generation;
int level;
+ int ret;
struct extent_buffer *node;
static struct btrfs_key max_key = {
.objectid = (u64)-1,
@@ -943,9 +944,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
generation = btrfs_header_generation(node);
free_extent_buffer(node);
- if (reada_add_block(rc, start, &max_key, level, generation)) {
+ ret = reada_add_block(rc, start, &max_key, level, generation);
+ if (ret) {
kfree(rc);
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(ret);
}
reada_start_machine(root->fs_info);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 303babeef505..b4ca5454ef1a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1716,7 +1716,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
num_bytes, parent,
btrfs_header_owner(leaf),
- key.objectid, key.offset, 1);
+ key.objectid, key.offset);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
break;
@@ -1724,7 +1724,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
parent, btrfs_header_owner(leaf),
- key.objectid, key.offset, 1);
+ key.objectid, key.offset);
if (ret) {
btrfs_abort_transaction(trans, root, ret);
break;
@@ -1900,23 +1900,21 @@ again:
ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
path->nodes[level]->start,
- src->root_key.objectid, level - 1, 0,
- 1);
+ src->root_key.objectid, level - 1, 0);
BUG_ON(ret);
ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
0, dest->root_key.objectid, level - 1,
- 0, 1);
+ 0);
BUG_ON(ret);
ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
path->nodes[level]->start,
- src->root_key.objectid, level - 1, 0,
- 1);
+ src->root_key.objectid, level - 1, 0);
BUG_ON(ret);
ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
0, dest->root_key.objectid, level - 1,
- 0, 1);
+ 0);
BUG_ON(ret);
btrfs_unlock_up_safe(path, 0);
@@ -2418,7 +2416,7 @@ again:
}
out:
if (ret) {
- btrfs_std_error(root->fs_info, ret);
+ btrfs_std_error(root->fs_info, ret, NULL);
if (!list_empty(&reloc_roots))
free_reloc_roots(&reloc_roots);
@@ -2745,7 +2743,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
node->eb->start, blocksize,
upper->eb->start,
btrfs_header_owner(upper->eb),
- node->level, 0, 1);
+ node->level, 0);
BUG_ON(ret);
ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
@@ -3034,8 +3032,8 @@ int prealloc_file_extent_cluster(struct inode *inode,
BUG_ON(cluster->start != cluster->boundary[0]);
mutex_lock(&inode->i_mutex);
- ret = btrfs_check_data_free_space(inode, cluster->end +
- 1 - cluster->start, 0);
+ ret = btrfs_check_data_free_space(inode, cluster->start,
+ cluster->end + 1 - cluster->start);
if (ret)
goto out;
@@ -3056,8 +3054,8 @@ int prealloc_file_extent_cluster(struct inode *inode,
break;
nr++;
}
- btrfs_free_reserved_data_space(inode, cluster->end +
- 1 - cluster->start);
+ btrfs_free_reserved_data_space(inode, cluster->start,
+ cluster->end + 1 - cluster->start);
out:
mutex_unlock(&inode->i_mutex);
return ret;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 360a728a639f..7cf8509deda7 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -45,12 +45,13 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
if (!need_reset && btrfs_root_generation(item)
!= btrfs_root_generation_v2(item)) {
if (btrfs_root_generation_v2(item) != 0) {
- printk(KERN_WARNING "BTRFS: mismatching "
+ btrfs_warn(eb->fs_info,
+ "mismatching "
"generation and generation_v2 "
"found in root item. This root "
"was probably mounted with an "
"older kernel. Resetting all "
- "new fields.\n");
+ "new fields.");
}
need_reset = 1;
}
@@ -141,7 +142,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
int ret;
int slot;
unsigned long ptr;
- int old_len;
+ u32 old_len;
path = btrfs_alloc_path();
if (!path)
@@ -283,7 +284,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
trans = btrfs_join_transaction(tree_root);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
- btrfs_error(tree_root->fs_info, err,
+ btrfs_std_error(tree_root->fs_info, err,
"Failed to start trans to delete "
"orphan item");
break;
@@ -292,7 +293,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
root_key.objectid);
btrfs_end_transaction(trans, tree_root);
if (err) {
- btrfs_error(tree_root->fs_info, err,
+ btrfs_std_error(tree_root->fs_info, err,
"Failed to delete root orphan "
"item");
break;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a39f5d1144e8..2907a77fb1f6 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -248,14 +248,9 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
struct scrub_block *sblocks_for_recheck);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
- struct scrub_block *sblock, int is_metadata,
- int have_csum, u8 *csum, u64 generation,
- u16 csum_size, int retry_failed_mirror);
-static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
- struct scrub_block *sblock,
- int is_metadata, int have_csum,
- const u8 *csum, u64 generation,
- u16 csum_size);
+ struct scrub_block *sblock,
+ int retry_failed_mirror);
+static void scrub_recheck_block_checksum(struct scrub_block *sblock);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
struct scrub_block *sblock_good);
static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
@@ -580,9 +575,9 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
* hold all of the paths here
*/
for (i = 0; i < ipath->fspath->elem_cnt; ++i)
- printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
+ btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev "
"%s, sector %llu, root %llu, inode %llu, offset %llu, "
- "length %llu, links %u (path: %s)\n", swarn->errstr,
+ "length %llu, links %u (path: %s)", swarn->errstr,
swarn->logical, rcu_str_deref(swarn->dev->name),
(unsigned long long)swarn->sector, root, inum, offset,
min(isize - offset, (u64)PAGE_SIZE), nlink,
@@ -592,9 +587,9 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
return 0;
err:
- printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
+ btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev "
"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
- "resolving failed with ret=%d\n", swarn->errstr,
+ "resolving failed with ret=%d", swarn->errstr,
swarn->logical, rcu_str_deref(swarn->dev->name),
(unsigned long long)swarn->sector, root, inum, offset, ret);
@@ -649,10 +644,10 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
item_size, &ref_root,
&ref_level);
- printk_in_rcu(KERN_WARNING
- "BTRFS: %s at logical %llu on dev %s, "
+ btrfs_warn_in_rcu(fs_info,
+ "%s at logical %llu on dev %s, "
"sector %llu: metadata %s (level %d) in tree "
- "%llu\n", errstr, swarn.logical,
+ "%llu", errstr, swarn.logical,
rcu_str_deref(dev->name),
(unsigned long long)swarn.sector,
ref_level ? "node" : "leaf",
@@ -850,8 +845,8 @@ out:
btrfs_dev_replace_stats_inc(
&sctx->dev_root->fs_info->dev_replace.
num_uncorrectable_read_errors);
- printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
- "unable to fixup (nodatasum) error at logical %llu on dev %s\n",
+ btrfs_err_rl_in_rcu(sctx->dev_root->fs_info,
+ "unable to fixup (nodatasum) error at logical %llu on dev %s",
fixup->logical, rcu_str_deref(fixup->dev->name));
}
@@ -889,11 +884,9 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
struct btrfs_fs_info *fs_info;
u64 length;
u64 logical;
- u64 generation;
unsigned int failed_mirror_index;
unsigned int is_metadata;
unsigned int have_csum;
- u8 *csum;
struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
struct scrub_block *sblock_bad;
int ret;
@@ -918,13 +911,11 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
}
length = sblock_to_check->page_count * PAGE_SIZE;
logical = sblock_to_check->pagev[0]->logical;
- generation = sblock_to_check->pagev[0]->generation;
BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
is_metadata = !(sblock_to_check->pagev[0]->flags &
BTRFS_EXTENT_FLAG_DATA);
have_csum = sblock_to_check->pagev[0]->have_csum;
- csum = sblock_to_check->pagev[0]->csum;
dev = sblock_to_check->pagev[0]->dev;
if (sctx->is_dev_replace && !is_metadata && !have_csum) {
@@ -987,8 +978,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
sblock_bad = sblocks_for_recheck + failed_mirror_index;
/* build and submit the bios for the failed mirror, check checksums */
- scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
- csum, generation, sctx->csum_size, 1);
+ scrub_recheck_block(fs_info, sblock_bad, 1);
if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen) {
@@ -1101,9 +1091,7 @@ nodatasum_case:
sblock_other = sblocks_for_recheck + mirror_index;
/* build and submit the bios, check checksums */
- scrub_recheck_block(fs_info, sblock_other, is_metadata,
- have_csum, csum, generation,
- sctx->csum_size, 0);
+ scrub_recheck_block(fs_info, sblock_other, 0);
if (!sblock_other->header_error &&
!sblock_other->checksum_error &&
@@ -1215,9 +1203,7 @@ nodatasum_case:
* is verified, but most likely the data comes out
* of the page cache.
*/
- scrub_recheck_block(fs_info, sblock_bad,
- is_metadata, have_csum, csum,
- generation, sctx->csum_size, 1);
+ scrub_recheck_block(fs_info, sblock_bad, 1);
if (!sblock_bad->header_error &&
!sblock_bad->checksum_error &&
sblock_bad->no_io_error_seen)
@@ -1230,8 +1216,8 @@ corrected_error:
sctx->stat.corrected_errors++;
sblock_to_check->data_corrected = 1;
spin_unlock(&sctx->stat_lock);
- printk_ratelimited_in_rcu(KERN_ERR
- "BTRFS: fixed up error at logical %llu on dev %s\n",
+ btrfs_err_rl_in_rcu(fs_info,
+ "fixed up error at logical %llu on dev %s",
logical, rcu_str_deref(dev->name));
}
} else {
@@ -1239,8 +1225,8 @@ did_not_correct_error:
spin_lock(&sctx->stat_lock);
sctx->stat.uncorrectable_errors++;
spin_unlock(&sctx->stat_lock);
- printk_ratelimited_in_rcu(KERN_ERR
- "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",
+ btrfs_err_rl_in_rcu(fs_info,
+ "unable to fixup (regular) error at logical %llu on dev %s",
logical, rcu_str_deref(dev->name));
}
@@ -1318,6 +1304,9 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
u64 length = original_sblock->page_count * PAGE_SIZE;
u64 logical = original_sblock->pagev[0]->logical;
+ u64 generation = original_sblock->pagev[0]->generation;
+ u64 flags = original_sblock->pagev[0]->flags;
+ u64 have_csum = original_sblock->pagev[0]->have_csum;
struct scrub_recover *recover;
struct btrfs_bio *bbio;
u64 sublen;
@@ -1372,6 +1361,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
sblock = sblocks_for_recheck + mirror_index;
sblock->sctx = sctx;
+
page = kzalloc(sizeof(*page), GFP_NOFS);
if (!page) {
leave_nomem:
@@ -1383,7 +1373,15 @@ leave_nomem:
}
scrub_page_get(page);
sblock->pagev[page_index] = page;
+ page->sblock = sblock;
+ page->flags = flags;
+ page->generation = generation;
page->logical = logical;
+ page->have_csum = have_csum;
+ if (have_csum)
+ memcpy(page->csum,
+ original_sblock->pagev[0]->csum,
+ sctx->csum_size);
scrub_stripe_index_and_offset(logical,
bbio->map_type,
@@ -1474,15 +1472,12 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
* the pages that are errored in the just handled mirror can be repaired.
*/
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
- struct scrub_block *sblock, int is_metadata,
- int have_csum, u8 *csum, u64 generation,
- u16 csum_size, int retry_failed_mirror)
+ struct scrub_block *sblock,
+ int retry_failed_mirror)
{
int page_num;
sblock->no_io_error_seen = 1;
- sblock->header_error = 0;
- sblock->checksum_error = 0;
for (page_num = 0; page_num < sblock->page_count; page_num++) {
struct bio *bio;
@@ -1518,9 +1513,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
}
if (sblock->no_io_error_seen)
- scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
- have_csum, csum, generation,
- csum_size);
+ scrub_recheck_block_checksum(sblock);
return;
}
@@ -1535,61 +1528,16 @@ static inline int scrub_check_fsid(u8 fsid[],
return !ret;
}
-static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
- struct scrub_block *sblock,
- int is_metadata, int have_csum,
- const u8 *csum, u64 generation,
- u16 csum_size)
+static void scrub_recheck_block_checksum(struct scrub_block *sblock)
{
- int page_num;
- u8 calculated_csum[BTRFS_CSUM_SIZE];
- u32 crc = ~(u32)0;
- void *mapped_buffer;
-
- WARN_ON(!sblock->pagev[0]->page);
- if (is_metadata) {
- struct btrfs_header *h;
-
- mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
- h = (struct btrfs_header *)mapped_buffer;
-
- if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
- !scrub_check_fsid(h->fsid, sblock->pagev[0]) ||
- memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
- BTRFS_UUID_SIZE)) {
- sblock->header_error = 1;
- } else if (generation != btrfs_stack_header_generation(h)) {
- sblock->header_error = 1;
- sblock->generation_error = 1;
- }
- csum = h->csum;
- } else {
- if (!have_csum)
- return;
-
- mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
- }
-
- for (page_num = 0;;) {
- if (page_num == 0 && is_metadata)
- crc = btrfs_csum_data(
- ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
- crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
- else
- crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);
-
- kunmap_atomic(mapped_buffer);
- page_num++;
- if (page_num >= sblock->page_count)
- break;
- WARN_ON(!sblock->pagev[page_num]->page);
-
- mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
- }
+ sblock->header_error = 0;
+ sblock->checksum_error = 0;
+ sblock->generation_error = 0;
- btrfs_csum_final(crc, calculated_csum);
- if (memcmp(calculated_csum, csum, csum_size))
- sblock->checksum_error = 1;
+ if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
+ scrub_checksum_data(sblock);
+ else
+ scrub_checksum_tree_block(sblock);
}
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
@@ -1626,9 +1574,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
int ret;
if (!page_bad->dev->bdev) {
- printk_ratelimited(KERN_WARNING "BTRFS: "
+ btrfs_warn_rl(sblock_bad->sctx->dev_root->fs_info,
"scrub_repair_page_from_good_copy(bdev == NULL) "
- "is unexpected!\n");
+ "is unexpected");
return -EIO;
}
@@ -1833,6 +1781,18 @@ static int scrub_checksum(struct scrub_block *sblock)
u64 flags;
int ret;
+ /*
+ * No need to initialize these stats currently,
+ * because this function only use return value
+ * instead of these stats value.
+ *
+ * Todo:
+ * always use stats
+ */
+ sblock->header_error = 0;
+ sblock->generation_error = 0;
+ sblock->checksum_error = 0;
+
WARN_ON(sblock->page_count < 1);
flags = sblock->pagev[0]->flags;
ret = 0;
@@ -1858,7 +1818,6 @@ static int scrub_checksum_data(struct scrub_block *sblock)
struct page *page;
void *buffer;
u32 crc = ~(u32)0;
- int fail = 0;
u64 len;
int index;
@@ -1889,9 +1848,9 @@ static int scrub_checksum_data(struct scrub_block *sblock)
btrfs_csum_final(crc, csum);
if (memcmp(csum, on_disk_csum, sctx->csum_size))
- fail = 1;
+ sblock->checksum_error = 1;
- return fail;
+ return sblock->checksum_error;
}
static int scrub_checksum_tree_block(struct scrub_block *sblock)
@@ -1907,8 +1866,6 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
u64 mapped_size;
void *p;
u32 crc = ~(u32)0;
- int fail = 0;
- int crc_fail = 0;
u64 len;
int index;
@@ -1923,19 +1880,20 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
* a) don't have an extent buffer and
* b) the page is already kmapped
*/
-
if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
- ++fail;
+ sblock->header_error = 1;
- if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
- ++fail;
+ if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) {
+ sblock->header_error = 1;
+ sblock->generation_error = 1;
+ }
if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
- ++fail;
+ sblock->header_error = 1;
if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
BTRFS_UUID_SIZE))
- ++fail;
+ sblock->header_error = 1;
len = sctx->nodesize - BTRFS_CSUM_SIZE;
mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
@@ -1960,9 +1918,9 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
btrfs_csum_final(crc, calculated_csum);
if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
- ++crc_fail;
+ sblock->checksum_error = 1;
- return fail || crc_fail;
+ return sblock->header_error || sblock->checksum_error;
}
static int scrub_checksum_super(struct scrub_block *sblock)
@@ -2176,40 +2134,28 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work)
{
struct scrub_block *sblock = container_of(work, struct scrub_block, work);
struct scrub_ctx *sctx = sblock->sctx;
- struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
- unsigned int is_metadata;
- unsigned int have_csum;
- u8 *csum;
- u64 generation;
u64 logical;
struct btrfs_device *dev;
- is_metadata = !(sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA);
- have_csum = sblock->pagev[0]->have_csum;
- csum = sblock->pagev[0]->csum;
- generation = sblock->pagev[0]->generation;
logical = sblock->pagev[0]->logical;
dev = sblock->pagev[0]->dev;
- if (sblock->no_io_error_seen) {
- scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
- have_csum, csum, generation,
- sctx->csum_size);
- }
+ if (sblock->no_io_error_seen)
+ scrub_recheck_block_checksum(sblock);
if (!sblock->no_io_error_seen) {
spin_lock(&sctx->stat_lock);
sctx->stat.read_errors++;
spin_unlock(&sctx->stat_lock);
- printk_ratelimited_in_rcu(KERN_ERR
- "BTRFS: I/O error rebulding logical %llu for dev %s\n",
+ btrfs_err_rl_in_rcu(sctx->dev_root->fs_info,
+ "IO error rebuilding logical %llu for dev %s",
logical, rcu_str_deref(dev->name));
} else if (sblock->header_error || sblock->checksum_error) {
spin_lock(&sctx->stat_lock);
sctx->stat.uncorrectable_errors++;
spin_unlock(&sctx->stat_lock);
- printk_ratelimited_in_rcu(KERN_ERR
- "BTRFS: failed to rebuild valid logical %llu for dev %s\n",
+ btrfs_err_rl_in_rcu(sctx->dev_root->fs_info,
+ "failed to rebuild valid logical %llu for dev %s",
logical, rcu_str_deref(dev->name));
} else {
scrub_write_block_to_dev_replace(sblock);
@@ -2500,8 +2446,7 @@ static void scrub_block_complete(struct scrub_block *sblock)
}
}
-static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
- u8 *csum)
+static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
{
struct btrfs_ordered_sum *sum = NULL;
unsigned long index;
@@ -2565,7 +2510,7 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
if (flags & BTRFS_EXTENT_FLAG_DATA) {
/* push csums to sbio */
- have_csum = scrub_find_csum(sctx, logical, l, csum);
+ have_csum = scrub_find_csum(sctx, logical, csum);
if (have_csum == 0)
++sctx->stat.no_csum;
if (sctx->is_dev_replace && !have_csum) {
@@ -2703,7 +2648,7 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
if (flags & BTRFS_EXTENT_FLAG_DATA) {
/* push csums to sbio */
- have_csum = scrub_find_csum(sctx, logical, l, csum);
+ have_csum = scrub_find_csum(sctx, logical, csum);
if (have_csum == 0)
goto skip;
}
@@ -3012,6 +2957,9 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
logic_start + map->stripe_len)) {
btrfs_err(fs_info, "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
key.objectid, logic_start);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
goto next;
}
again:
@@ -3361,6 +3309,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
"scrub: tree block %llu spanning "
"stripes, ignored. logical=%llu",
key.objectid, logical);
+ spin_lock(&sctx->stat_lock);
+ sctx->stat.uncorrectable_errors++;
+ spin_unlock(&sctx->stat_lock);
goto next;
}
@@ -4375,8 +4326,8 @@ static int write_page_nocow(struct scrub_ctx *sctx,
if (!dev)
return -EIO;
if (!dev->bdev) {
- printk_ratelimited(KERN_WARNING
- "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
+ btrfs_warn_rl(dev->dev_root->fs_info,
+ "scrub write_page_nocow(bdev == NULL) is unexpected");
return -EIO;
}
bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index aa72bfd28f7d..355a458cba1a 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1434,16 +1434,6 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
}
if (cur_clone_root) {
- if (compressed != BTRFS_COMPRESS_NONE) {
- /*
- * Offsets given by iterate_extent_inodes() are relative
- * to the start of the extent, we need to add logical
- * offset from the file extent item.
- * (See why at backref.c:check_extent_in_eb())
- */
- cur_clone_root->offset += btrfs_file_extent_offset(eb,
- fi);
- }
*found = cur_clone_root;
ret = 0;
} else {
@@ -1920,10 +1910,12 @@ static int did_overwrite_ref(struct send_ctx *sctx,
/*
* We know that it is or will be overwritten. Check this now.
* The current inode being processed might have been the one that caused
- * inode 'ino' to be orphanized, therefore ow_inode can actually be the
- * same as sctx->send_progress.
+ * inode 'ino' to be orphanized, therefore check if ow_inode matches
+ * the current inode being processed.
*/
- if (ow_inode <= sctx->send_progress)
+ if ((ow_inode < sctx->send_progress) ||
+ (ino != sctx->cur_ino && ow_inode == sctx->cur_ino &&
+ gen == sctx->cur_inode_gen))
ret = 1;
else
ret = 0;
@@ -2351,8 +2343,14 @@ static int send_subvol_begin(struct send_ctx *sctx)
}
TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
- TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
- sctx->send_root->root_item.uuid);
+
+ if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid))
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
+ sctx->send_root->root_item.received_uuid);
+ else
+ TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
+ sctx->send_root->root_item.uuid);
+
TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
le64_to_cpu(sctx->send_root->root_item.ctransid));
if (parent_root) {
@@ -2562,7 +2560,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
} else if (S_ISSOCK(mode)) {
cmd = BTRFS_SEND_C_MKSOCK;
} else {
- printk(KERN_WARNING "btrfs: unexpected inode type %o",
+ btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o",
(int)(mode & S_IFMT));
ret = -ENOTSUPP;
goto out;
@@ -4685,6 +4683,171 @@ tlv_put_failure:
return ret;
}
+static int send_extent_data(struct send_ctx *sctx,
+ const u64 offset,
+ const u64 len)
+{
+ u64 sent = 0;
+
+ if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
+ return send_update_extent(sctx, offset, len);
+
+ while (sent < len) {
+ u64 size = len - sent;
+ int ret;
+
+ if (size > BTRFS_SEND_READ_SIZE)
+ size = BTRFS_SEND_READ_SIZE;
+ ret = send_write(sctx, offset + sent, size);
+ if (ret < 0)
+ return ret;
+ if (!ret)
+ break;
+ sent += ret;
+ }
+ return 0;
+}
+
+static int clone_range(struct send_ctx *sctx,
+ struct clone_root *clone_root,
+ const u64 disk_byte,
+ u64 data_offset,
+ u64 offset,
+ u64 len)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ /*
+ * We can't send a clone operation for the entire range if we find
+ * extent items in the respective range in the source file that
+ * refer to different extents or if we find holes.
+ * So check for that and do a mix of clone and regular write/copy
+ * operations if needed.
+ *
+ * Example:
+ *
+ * mkfs.btrfs -f /dev/sda
+ * mount /dev/sda /mnt
+ * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo
+ * cp --reflink=always /mnt/foo /mnt/bar
+ * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo
+ * btrfs subvolume snapshot -r /mnt /mnt/snap
+ *
+ * If when we send the snapshot and we are processing file bar (which
+ * has a higher inode number than foo) we blindly send a clone operation
+ * for the [0, 100K[ range from foo to bar, the receiver ends up getting
+ * a file bar that matches the content of file foo - iow, doesn't match
+ * the content from bar in the original filesystem.
+ */
+ key.objectid = clone_root->ino;
+ key.type = BTRFS_EXTENT_DATA_KEY;
+ key.offset = clone_root->offset;
+ ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+ if (ret > 0 && path->slots[0] > 0) {
+ btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
+ if (key.objectid == clone_root->ino &&
+ key.type == BTRFS_EXTENT_DATA_KEY)
+ path->slots[0]--;
+ }
+
+ while (true) {
+ struct extent_buffer *leaf = path->nodes[0];
+ int slot = path->slots[0];
+ struct btrfs_file_extent_item *ei;
+ u8 type;
+ u64 ext_len;
+ u64 clone_len;
+
+ if (slot >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(clone_root->root, path);
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
+ break;
+ continue;
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+
+ /*
+ * We might have an implicit trailing hole (NO_HOLES feature
+ * enabled). We deal with it after leaving this loop.
+ */
+ if (key.objectid != clone_root->ino ||
+ key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(leaf, ei);
+ if (type == BTRFS_FILE_EXTENT_INLINE) {
+ ext_len = btrfs_file_extent_inline_len(leaf, slot, ei);
+ ext_len = PAGE_CACHE_ALIGN(ext_len);
+ } else {
+ ext_len = btrfs_file_extent_num_bytes(leaf, ei);
+ }
+
+ if (key.offset + ext_len <= clone_root->offset)
+ goto next;
+
+ if (key.offset > clone_root->offset) {
+ /* Implicit hole, NO_HOLES feature enabled. */
+ u64 hole_len = key.offset - clone_root->offset;
+
+ if (hole_len > len)
+ hole_len = len;
+ ret = send_extent_data(sctx, offset, hole_len);
+ if (ret < 0)
+ goto out;
+
+ len -= hole_len;
+ if (len == 0)
+ break;
+ offset += hole_len;
+ clone_root->offset += hole_len;
+ data_offset += hole_len;
+ }
+
+ if (key.offset >= clone_root->offset + len)
+ break;
+
+ clone_len = min_t(u64, ext_len, len);
+
+ if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte &&
+ btrfs_file_extent_offset(leaf, ei) == data_offset)
+ ret = send_clone(sctx, offset, clone_len, clone_root);
+ else
+ ret = send_extent_data(sctx, offset, clone_len);
+
+ if (ret < 0)
+ goto out;
+
+ len -= clone_len;
+ if (len == 0)
+ break;
+ offset += clone_len;
+ clone_root->offset += clone_len;
+ data_offset += clone_len;
+next:
+ path->slots[0]++;
+ }
+
+ if (len > 0)
+ ret = send_extent_data(sctx, offset, len);
+ else
+ ret = 0;
+out:
+ btrfs_free_path(path);
+ return ret;
+}
+
static int send_write_or_clone(struct send_ctx *sctx,
struct btrfs_path *path,
struct btrfs_key *key,
@@ -4693,9 +4856,7 @@ static int send_write_or_clone(struct send_ctx *sctx,
int ret = 0;
struct btrfs_file_extent_item *ei;
u64 offset = key->offset;
- u64 pos = 0;
u64 len;
- u32 l;
u8 type;
u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
@@ -4723,22 +4884,15 @@ static int send_write_or_clone(struct send_ctx *sctx,
}
if (clone_root && IS_ALIGNED(offset + len, bs)) {
- ret = send_clone(sctx, offset, len, clone_root);
- } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) {
- ret = send_update_extent(sctx, offset, len);
+ u64 disk_byte;
+ u64 data_offset;
+
+ disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
+ data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
+ ret = clone_range(sctx, clone_root, disk_byte, data_offset,
+ offset, len);
} else {
- while (pos < len) {
- l = len - pos;
- if (l > BTRFS_SEND_READ_SIZE)
- l = BTRFS_SEND_READ_SIZE;
- ret = send_write(sctx, pos + offset, l);
- if (ret < 0)
- goto out;
- if (!ret)
- break;
- pos += ret;
- }
- ret = 0;
+ ret = send_extent_data(sctx, offset, len);
}
out:
return ret;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2b07b3581781..24154e422945 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -130,7 +130,6 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
}
}
-#ifdef CONFIG_PRINTK
/*
* __btrfs_std_error decodes expected errors from the caller and
* invokes the approciate error response.
@@ -140,7 +139,9 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...)
{
struct super_block *sb = fs_info->sb;
+#ifdef CONFIG_PRINTK
const char *errstr;
+#endif
/*
* Special case: if the error is EROFS, and we're already
@@ -149,6 +150,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
return;
+#ifdef CONFIG_PRINTK
errstr = btrfs_decode_error(errno);
if (fmt) {
struct va_format vaf;
@@ -166,6 +168,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
printk(KERN_CRIT "BTRFS: error (device %s) in %s:%d: errno=%d %s\n",
sb->s_id, function, line, errno, errstr);
}
+#endif
/* Don't go through full error handling during mount */
save_error_info(fs_info);
@@ -173,6 +176,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
btrfs_handle_error(fs_info);
}
+#ifdef CONFIG_PRINTK
static const char * const logtypes[] = {
"emergency",
"alert",
@@ -212,27 +216,6 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
va_end(args);
}
-
-#else
-
-void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
- unsigned int line, int errno, const char *fmt, ...)
-{
- struct super_block *sb = fs_info->sb;
-
- /*
- * Special case: if the error is EROFS, and we're already
- * under MS_RDONLY, then it is safe here.
- */
- if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
- return;
-
- /* Don't go through full error handling during mount */
- if (sb->s_flags & MS_BORN) {
- save_error_info(fs_info);
- btrfs_handle_error(fs_info);
- }
-}
#endif
/*
@@ -320,6 +303,9 @@ enum {
Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
Opt_datasum, Opt_treelog, Opt_noinode_cache,
+#ifdef CONFIG_BTRFS_DEBUG
+ Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
+#endif
Opt_err,
};
@@ -372,6 +358,11 @@ static match_table_t tokens = {
{Opt_rescan_uuid_tree, "rescan_uuid_tree"},
{Opt_fatal_errors, "fatal_errors=%s"},
{Opt_commit_interval, "commit=%d"},
+#ifdef CONFIG_BTRFS_DEBUG
+ {Opt_fragment_data, "fragment=data"},
+ {Opt_fragment_metadata, "fragment=metadata"},
+ {Opt_fragment_all, "fragment=all"},
+#endif
{Opt_err, NULL},
};
@@ -738,6 +729,22 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
}
break;
+#ifdef CONFIG_BTRFS_DEBUG
+ case Opt_fragment_all:
+ btrfs_info(root->fs_info, "fragmenting all space");
+ btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
+ btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA);
+ break;
+ case Opt_fragment_metadata:
+ btrfs_info(root->fs_info, "fragmenting metadata");
+ btrfs_set_opt(info->mount_opt,
+ FRAGMENT_METADATA);
+ break;
+ case Opt_fragment_data:
+ btrfs_info(root->fs_info, "fragmenting data");
+ btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
+ break;
+#endif
case Opt_err:
btrfs_info(root->fs_info, "unrecognized mount option '%s'", p);
ret = -EINVAL;
@@ -1189,6 +1196,12 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",fatal_errors=panic");
if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
seq_printf(seq, ",commit=%d", info->commit_interval);
+#ifdef CONFIG_BTRFS_DEBUG
+ if (btrfs_test_opt(root, FRAGMENT_DATA))
+ seq_puts(seq, ",fragment=data");
+ if (btrfs_test_opt(root, FRAGMENT_METADATA))
+ seq_puts(seq, ",fragment=metadata");
+#endif
seq_printf(seq, ",subvolid=%llu",
BTRFS_I(d_inode(dentry))->root->root_key.objectid);
seq_puts(seq, ",subvol=");
@@ -1658,9 +1671,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
* groups on disk until we're mounted read-write again
* unless we clean them up here.
*/
- mutex_lock(&root->fs_info->cleaner_mutex);
btrfs_delete_unused_bgs(fs_info);
- mutex_unlock(&root->fs_info->cleaner_mutex);
btrfs_dev_replace_suspend_for_unmount(fs_info);
btrfs_scrub_cancel(fs_info);
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 603b0cc2b9bb..e0ac85949067 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -437,24 +437,24 @@ static const struct attribute *btrfs_attrs[] = {
NULL,
};
-static void btrfs_release_super_kobj(struct kobject *kobj)
+static void btrfs_release_fsid_kobj(struct kobject *kobj)
{
struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj);
- memset(&fs_devs->super_kobj, 0, sizeof(struct kobject));
+ memset(&fs_devs->fsid_kobj, 0, sizeof(struct kobject));
complete(&fs_devs->kobj_unregister);
}
static struct kobj_type btrfs_ktype = {
.sysfs_ops = &kobj_sysfs_ops,
- .release = btrfs_release_super_kobj,
+ .release = btrfs_release_fsid_kobj,
};
static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj)
{
if (kobj->ktype != &btrfs_ktype)
return NULL;
- return container_of(kobj, struct btrfs_fs_devices, super_kobj);
+ return container_of(kobj, struct btrfs_fs_devices, fsid_kobj);
}
static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
@@ -502,12 +502,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
attrs[0] = &fa->kobj_attr.attr;
if (add) {
int ret;
- ret = sysfs_merge_group(&fs_info->fs_devices->super_kobj,
+ ret = sysfs_merge_group(&fs_info->fs_devices->fsid_kobj,
&agroup);
if (ret)
return ret;
} else
- sysfs_unmerge_group(&fs_info->fs_devices->super_kobj,
+ sysfs_unmerge_group(&fs_info->fs_devices->fsid_kobj,
&agroup);
}
@@ -523,9 +523,9 @@ static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
fs_devs->device_dir_kobj = NULL;
}
- if (fs_devs->super_kobj.state_initialized) {
- kobject_del(&fs_devs->super_kobj);
- kobject_put(&fs_devs->super_kobj);
+ if (fs_devs->fsid_kobj.state_initialized) {
+ kobject_del(&fs_devs->fsid_kobj);
+ kobject_put(&fs_devs->fsid_kobj);
wait_for_completion(&fs_devs->kobj_unregister);
}
}
@@ -545,7 +545,7 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
}
}
-void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
+void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
{
btrfs_reset_fs_info_ptr(fs_info);
@@ -555,9 +555,9 @@ void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
kobject_put(fs_info->space_info_kobj);
}
addrm_unknown_feature_attrs(fs_info, false);
- sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group);
- sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs);
- btrfs_kobj_rm_device(fs_info->fs_devices, NULL);
+ sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group);
+ sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs);
+ btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL);
}
const char * const btrfs_feature_set_names[3] = {
@@ -637,7 +637,7 @@ static void init_feature_attrs(void)
/* when one_device is NULL, it removes all device links */
-int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices,
+int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device)
{
struct hd_struct *disk;
@@ -675,7 +675,7 @@ int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs)
{
if (!fs_devs->device_dir_kobj)
fs_devs->device_dir_kobj = kobject_create_and_add("devices",
- &fs_devs->super_kobj);
+ &fs_devs->fsid_kobj);
if (!fs_devs->device_dir_kobj)
return -ENOMEM;
@@ -683,7 +683,7 @@ int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs)
return 0;
}
-int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices,
+int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device)
{
int error = 0;
@@ -730,31 +730,31 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
int error;
init_completion(&fs_devs->kobj_unregister);
- fs_devs->super_kobj.kset = btrfs_kset;
- error = kobject_init_and_add(&fs_devs->super_kobj,
+ fs_devs->fsid_kobj.kset = btrfs_kset;
+ error = kobject_init_and_add(&fs_devs->fsid_kobj,
&btrfs_ktype, parent, "%pU", fs_devs->fsid);
return error;
}
-int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
+int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
{
int error;
struct btrfs_fs_devices *fs_devs = fs_info->fs_devices;
- struct kobject *super_kobj = &fs_devs->super_kobj;
+ struct kobject *fsid_kobj = &fs_devs->fsid_kobj;
btrfs_set_fs_info_ptr(fs_info);
- error = btrfs_kobj_add_device(fs_devs, NULL);
+ error = btrfs_sysfs_add_device_link(fs_devs, NULL);
if (error)
return error;
- error = sysfs_create_files(super_kobj, btrfs_attrs);
+ error = sysfs_create_files(fsid_kobj, btrfs_attrs);
if (error) {
- btrfs_kobj_rm_device(fs_devs, NULL);
+ btrfs_sysfs_rm_device_link(fs_devs, NULL);
return error;
}
- error = sysfs_create_group(super_kobj,
+ error = sysfs_create_group(fsid_kobj,
&btrfs_feature_attr_group);
if (error)
goto failure;
@@ -764,7 +764,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
goto failure;
fs_info->space_info_kobj = kobject_create_and_add("allocation",
- super_kobj);
+ fsid_kobj);
if (!fs_info->space_info_kobj) {
error = -ENOMEM;
goto failure;
@@ -776,7 +776,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
return 0;
failure:
- btrfs_sysfs_remove_one(fs_info);
+ btrfs_sysfs_remove_mounted(fs_info);
return error;
}
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 6392527bcc15..9c09522125a6 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -82,9 +82,9 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
extern const char * const btrfs_feature_set_names[3];
extern struct kobj_type space_info_ktype;
extern struct kobj_type btrfs_raid_ktype;
-int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices,
+int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device);
-int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices,
+int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *one_device);
int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
struct kobject *parent);
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index 2299bfde39ee..c8c3d70c31ff 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -19,6 +19,7 @@
#include <linux/slab.h>
#include "btrfs-tests.h"
#include "../ctree.h"
+#include "../disk-io.h"
#include "../free-space-cache.h"
#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
@@ -35,6 +36,12 @@ static struct btrfs_block_group_cache *init_test_block_group(void)
kfree(cache);
return NULL;
}
+ cache->fs_info = btrfs_alloc_dummy_fs_info();
+ if (!cache->fs_info) {
+ kfree(cache->free_space_ctl);
+ kfree(cache);
+ return NULL;
+ }
cache->key.objectid = 0;
cache->key.offset = 1024 * 1024 * 1024;
@@ -879,7 +886,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
int btrfs_test_free_space_cache(void)
{
struct btrfs_block_group_cache *cache;
- int ret;
+ struct btrfs_root *root = NULL;
+ int ret = -ENOMEM;
test_msg("Running btrfs free space cache tests\n");
@@ -889,6 +897,17 @@ int btrfs_test_free_space_cache(void)
return 0;
}
+ root = btrfs_alloc_dummy_root();
+ if (!root)
+ goto out;
+
+ root->fs_info = btrfs_alloc_dummy_fs_info();
+ if (!root->fs_info)
+ goto out;
+
+ root->fs_info->extent_root = root;
+ cache->fs_info = root->fs_info;
+
ret = test_extents(cache);
if (ret)
goto out;
@@ -904,6 +923,7 @@ out:
__btrfs_remove_free_space_cache(cache->free_space_ctl);
kfree(cache->free_space_ctl);
kfree(cache);
+ btrfs_free_dummy_root(root);
test_msg("Free space cache tests finished\n");
return ret;
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8f259b3a66b3..418c6a2ad7d8 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -82,6 +82,12 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
static void clear_btree_io_tree(struct extent_io_tree *tree)
{
spin_lock(&tree->lock);
+ /*
+ * Do a single barrier for the waitqueue_active check here, the state
+ * of the waitqueue should not change once clear_btree_io_tree is
+ * called.
+ */
+ smp_mb();
while (!RB_EMPTY_ROOT(&tree->state)) {
struct rb_node *node;
struct extent_state *state;
@@ -117,6 +123,18 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans,
btrfs_unpin_free_ino(root);
clear_btree_io_tree(&root->dirty_log_pages);
}
+
+ /* We can free old roots now. */
+ spin_lock(&trans->dropped_roots_lock);
+ while (!list_empty(&trans->dropped_roots)) {
+ root = list_first_entry(&trans->dropped_roots,
+ struct btrfs_root, root_list);
+ list_del_init(&root->root_list);
+ spin_unlock(&trans->dropped_roots_lock);
+ btrfs_drop_and_free_fs_root(fs_info, root);
+ spin_lock(&trans->dropped_roots_lock);
+ }
+ spin_unlock(&trans->dropped_roots_lock);
up_write(&fs_info->commit_root_sem);
}
@@ -214,25 +232,22 @@ loop:
extwriter_counter_init(cur_trans, type);
init_waitqueue_head(&cur_trans->writer_wait);
init_waitqueue_head(&cur_trans->commit_wait);
+ init_waitqueue_head(&cur_trans->pending_wait);
cur_trans->state = TRANS_STATE_RUNNING;
/*
* One for this trans handle, one so it will live on until we
* commit the transaction.
*/
atomic_set(&cur_trans->use_count, 2);
- cur_trans->have_free_bgs = 0;
+ atomic_set(&cur_trans->pending_ordered, 0);
+ cur_trans->flags = 0;
cur_trans->start_time = get_seconds();
- cur_trans->dirty_bg_run = 0;
+
+ memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
cur_trans->delayed_refs.href_root = RB_ROOT;
cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
atomic_set(&cur_trans->delayed_refs.num_entries, 0);
- cur_trans->delayed_refs.num_heads_ready = 0;
- cur_trans->delayed_refs.pending_csums = 0;
- cur_trans->delayed_refs.num_heads = 0;
- cur_trans->delayed_refs.flushing = 0;
- cur_trans->delayed_refs.run_delayed_start = 0;
- cur_trans->delayed_refs.qgroup_to_skip = 0;
/*
* although the tree mod log is per file system and not per transaction,
@@ -252,14 +267,15 @@ loop:
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
INIT_LIST_HEAD(&cur_trans->pending_chunks);
INIT_LIST_HEAD(&cur_trans->switch_commits);
- INIT_LIST_HEAD(&cur_trans->pending_ordered);
INIT_LIST_HEAD(&cur_trans->dirty_bgs);
INIT_LIST_HEAD(&cur_trans->io_bgs);
+ INIT_LIST_HEAD(&cur_trans->dropped_roots);
mutex_init(&cur_trans->cache_write_mutex);
cur_trans->num_dirty_bgs = 0;
spin_lock_init(&cur_trans->dirty_bgs_lock);
INIT_LIST_HEAD(&cur_trans->deleted_bgs);
spin_lock_init(&cur_trans->deleted_bgs_lock);
+ spin_lock_init(&cur_trans->dropped_roots_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
fs_info->btree_inode->i_mapping);
@@ -336,6 +352,24 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
}
+void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
+{
+ struct btrfs_transaction *cur_trans = trans->transaction;
+
+ /* Add ourselves to the transaction dropped list */
+ spin_lock(&cur_trans->dropped_roots_lock);
+ list_add_tail(&root->root_list, &cur_trans->dropped_roots);
+ spin_unlock(&cur_trans->dropped_roots_lock);
+
+ /* Make sure we don't try to update the root at commit time */
+ spin_lock(&root->fs_info->fs_roots_radix_lock);
+ radix_tree_tag_clear(&root->fs_info->fs_roots_radix,
+ (unsigned long)root->root_key.objectid,
+ BTRFS_ROOT_TRANS_TAG);
+ spin_unlock(&root->fs_info->fs_roots_radix_lock);
+}
+
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
@@ -415,8 +449,8 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root)
}
static struct btrfs_trans_handle *
-start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
- enum btrfs_reserve_flush_enum flush)
+start_transaction(struct btrfs_root *root, unsigned int num_items,
+ unsigned int type, enum btrfs_reserve_flush_enum flush)
{
struct btrfs_trans_handle *h;
struct btrfs_transaction *cur_trans;
@@ -446,13 +480,10 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
* the appropriate flushing if need be.
*/
if (num_items > 0 && root != root->fs_info->chunk_root) {
- if (root->fs_info->quota_enabled &&
- is_fstree(root->root_key.objectid)) {
- qgroup_reserved = num_items * root->nodesize;
- ret = btrfs_qgroup_reserve(root, qgroup_reserved);
- if (ret)
- return ERR_PTR(ret);
- }
+ qgroup_reserved = num_items * root->nodesize;
+ ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved);
+ if (ret)
+ return ERR_PTR(ret);
num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
/*
@@ -470,7 +501,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
goto reserve_fail;
}
again:
- h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
+ h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS);
if (!h) {
ret = -ENOMEM;
goto alloc_fail;
@@ -511,25 +542,13 @@ again:
h->transid = cur_trans->transid;
h->transaction = cur_trans;
- h->blocks_used = 0;
- h->bytes_reserved = 0;
- h->chunk_bytes_reserved = 0;
h->root = root;
- h->delayed_ref_updates = 0;
h->use_count = 1;
- h->adding_csums = 0;
- h->block_rsv = NULL;
- h->orig_rsv = NULL;
- h->aborted = 0;
- h->qgroup_reserved = 0;
- h->delayed_ref_elem.seq = 0;
+
h->type = type;
- h->allocating_chunk = false;
- h->reloc_reserved = false;
- h->sync = false;
+ h->can_flush_pending_bgs = true;
INIT_LIST_HEAD(&h->qgroup_ref_list);
INIT_LIST_HEAD(&h->new_bgs);
- INIT_LIST_HEAD(&h->ordered);
smp_mb();
if (cur_trans->state >= TRANS_STATE_BLOCKED &&
@@ -546,7 +565,6 @@ again:
h->bytes_reserved = num_bytes;
h->reloc_reserved = reloc_reserved;
}
- h->qgroup_reserved = qgroup_reserved;
got_it:
btrfs_record_root_in_trans(h, root);
@@ -564,20 +582,20 @@ alloc_fail:
btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
num_bytes);
reserve_fail:
- if (qgroup_reserved)
- btrfs_qgroup_free(root, qgroup_reserved);
+ btrfs_qgroup_free_meta(root, qgroup_reserved);
return ERR_PTR(ret);
}
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
- int num_items)
+ unsigned int num_items)
{
return start_transaction(root, num_items, TRANS_START,
BTRFS_RESERVE_FLUSH_ALL);
}
struct btrfs_trans_handle *btrfs_start_transaction_lflush(
- struct btrfs_root *root, int num_items)
+ struct btrfs_root *root,
+ unsigned int num_items)
{
return start_transaction(root, num_items, TRANS_START,
BTRFS_RESERVE_FLUSH_LIMIT);
@@ -761,12 +779,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (!list_empty(&trans->new_bgs))
btrfs_create_pending_block_groups(trans, root);
- if (!list_empty(&trans->ordered)) {
- spin_lock(&info->trans_lock);
- list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
- spin_unlock(&info->trans_lock);
- }
-
trans->delayed_ref_updates = 0;
if (!trans->sync) {
must_run_delayed_refs =
@@ -782,15 +794,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
must_run_delayed_refs = 2;
}
- if (trans->qgroup_reserved) {
- /*
- * the same root has to be passed here between start_transaction
- * and end_transaction. Subvolume quota depends on this.
- */
- btrfs_qgroup_free(trans->root, trans->qgroup_reserved);
- trans->qgroup_reserved = 0;
- }
-
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
@@ -823,6 +826,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
atomic_dec(&cur_trans->num_writers);
extwriter_counter_dec(cur_trans, trans->type);
+ /*
+ * Make sure counter is updated before we wake up waiters.
+ */
smp_mb();
if (waitqueue_active(&cur_trans->writer_wait))
wake_up(&cur_trans->writer_wait);
@@ -1205,6 +1211,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
spin_lock(&fs_info->fs_roots_radix_lock);
if (err)
break;
+ btrfs_qgroup_free_meta_all(root);
}
}
spin_unlock(&fs_info->fs_roots_radix_lock);
@@ -1762,25 +1769,10 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
}
static inline void
-btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans,
- struct btrfs_fs_info *fs_info)
+btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans)
{
- struct btrfs_ordered_extent *ordered;
-
- spin_lock(&fs_info->trans_lock);
- while (!list_empty(&cur_trans->pending_ordered)) {
- ordered = list_first_entry(&cur_trans->pending_ordered,
- struct btrfs_ordered_extent,
- trans_list);
- list_del_init(&ordered->trans_list);
- spin_unlock(&fs_info->trans_lock);
-
- wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE,
- &ordered->flags));
- btrfs_put_ordered_extent(ordered);
- spin_lock(&fs_info->trans_lock);
- }
- spin_unlock(&fs_info->trans_lock);
+ wait_event(cur_trans->pending_wait,
+ atomic_read(&cur_trans->pending_ordered) == 0);
}
int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
@@ -1809,10 +1801,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_trans_release_metadata(trans, root);
trans->block_rsv = NULL;
- if (trans->qgroup_reserved) {
- btrfs_qgroup_free(root, trans->qgroup_reserved);
- trans->qgroup_reserved = 0;
- }
cur_trans = trans->transaction;
@@ -1832,7 +1820,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
return ret;
}
- if (!cur_trans->dirty_bg_run) {
+ if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
int run_it = 0;
/* this mutex is also taken before trying to set
@@ -1841,18 +1829,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
* after a extents from that block group have been
* allocated for cache files. btrfs_set_block_group_ro
* will wait for the transaction to commit if it
- * finds dirty_bg_run = 1
+ * finds BTRFS_TRANS_DIRTY_BG_RUN set.
*
- * The dirty_bg_run flag is also used to make sure only
- * one process starts all the block group IO. It wouldn't
+ * The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure
+ * only one process starts all the block group IO. It wouldn't
* hurt to have more than one go through, but there's no
* real advantage to it either.
*/
mutex_lock(&root->fs_info->ro_block_group_mutex);
- if (!cur_trans->dirty_bg_run) {
+ if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN,
+ &cur_trans->flags))
run_it = 1;
- cur_trans->dirty_bg_run = 1;
- }
mutex_unlock(&root->fs_info->ro_block_group_mutex);
if (run_it)
@@ -1864,7 +1851,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
}
spin_lock(&root->fs_info->trans_lock);
- list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
spin_unlock(&root->fs_info->trans_lock);
atomic_inc(&cur_trans->use_count);
@@ -1923,7 +1909,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_wait_delalloc_flush(root->fs_info);
- btrfs_wait_pending_ordered(cur_trans, root->fs_info);
+ btrfs_wait_pending_ordered(cur_trans);
btrfs_scrub_pause(root);
/*
@@ -2103,7 +2089,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
ret = btrfs_write_and_wait_transaction(trans, root);
if (ret) {
- btrfs_error(root->fs_info, ret,
+ btrfs_std_error(root->fs_info, ret,
"Error while writing out transaction");
mutex_unlock(&root->fs_info->tree_log_mutex);
goto scrub_continue;
@@ -2123,7 +2109,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_finish_extent_commit(trans, root);
- if (cur_trans->have_free_bgs)
+ if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
btrfs_clear_space_info_full(root->fs_info);
root->fs_info->last_trans_committed = cur_trans->transid;
@@ -2165,10 +2151,6 @@ cleanup_transaction:
btrfs_trans_release_metadata(trans, root);
btrfs_trans_release_chunk_metadata(trans);
trans->block_rsv = NULL;
- if (trans->qgroup_reserved) {
- btrfs_qgroup_free(root, trans->qgroup_reserved);
- trans->qgroup_reserved = 0;
- }
btrfs_warn(root->fs_info, "Skipping commit of aborted transaction.");
if (current->journal_info == trans)
current->journal_info = NULL;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index edc2fbc262d7..b05b2f64d913 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -32,6 +32,10 @@ enum btrfs_trans_state {
TRANS_STATE_MAX = 6,
};
+#define BTRFS_TRANS_HAVE_FREE_BGS 0
+#define BTRFS_TRANS_DIRTY_BG_RUN 1
+#define BTRFS_TRANS_CACHE_ENOSPC 2
+
struct btrfs_transaction {
u64 transid;
/*
@@ -46,11 +50,9 @@ struct btrfs_transaction {
*/
atomic_t num_writers;
atomic_t use_count;
+ atomic_t pending_ordered;
- /*
- * true if there is free bgs operations in this transaction
- */
- int have_free_bgs;
+ unsigned long flags;
/* Be protected by fs_info->trans_lock when we want to change it. */
enum btrfs_trans_state state;
@@ -59,12 +61,13 @@ struct btrfs_transaction {
unsigned long start_time;
wait_queue_head_t writer_wait;
wait_queue_head_t commit_wait;
+ wait_queue_head_t pending_wait;
struct list_head pending_snapshots;
struct list_head pending_chunks;
- struct list_head pending_ordered;
struct list_head switch_commits;
struct list_head dirty_bgs;
struct list_head io_bgs;
+ struct list_head dropped_roots;
u64 num_dirty_bgs;
/*
@@ -76,9 +79,9 @@ struct btrfs_transaction {
spinlock_t dirty_bgs_lock;
struct list_head deleted_bgs;
spinlock_t deleted_bgs_lock;
+ spinlock_t dropped_roots_lock;
struct btrfs_delayed_ref_root delayed_refs;
int aborted;
- int dirty_bg_run;
};
#define __TRANS_FREEZABLE (1U << 0)
@@ -105,7 +108,6 @@ struct btrfs_trans_handle {
u64 transid;
u64 bytes_reserved;
u64 chunk_bytes_reserved;
- u64 qgroup_reserved;
unsigned long use_count;
unsigned long blocks_reserved;
unsigned long blocks_used;
@@ -116,6 +118,7 @@ struct btrfs_trans_handle {
short aborted;
short adding_csums;
bool allocating_chunk;
+ bool can_flush_pending_bgs;
bool reloc_reserved;
bool sync;
unsigned int type;
@@ -126,7 +129,6 @@ struct btrfs_trans_handle {
*/
struct btrfs_root *root;
struct seq_list delayed_ref_elem;
- struct list_head ordered;
struct list_head qgroup_ref_list;
struct list_head new_bgs;
};
@@ -182,9 +184,10 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)
int btrfs_end_transaction(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
- int num_items);
+ unsigned int num_items);
struct btrfs_trans_handle *btrfs_start_transaction_lflush(
- struct btrfs_root *root, int num_items);
+ struct btrfs_root *root,
+ unsigned int num_items);
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
@@ -216,5 +219,6 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info);
int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
void btrfs_put_transaction(struct btrfs_transaction *transaction);
void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info);
-
+void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root);
#endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1bbaace73383..323e12cc9d2f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -229,7 +229,9 @@ int btrfs_pin_log_trans(struct btrfs_root *root)
void btrfs_end_log_trans(struct btrfs_root *root)
{
if (atomic_dec_and_test(&root->log_writers)) {
- smp_mb();
+ /*
+ * Implicit memory barrier after atomic_dec_and_test
+ */
if (waitqueue_active(&root->log_writer_wait))
wake_up(&root->log_writer_wait);
}
@@ -691,7 +693,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
ret = btrfs_inc_extent_ref(trans, root,
ins.objectid, ins.offset,
0, root->root_key.objectid,
- key->objectid, offset, 0);
+ key->objectid, offset);
if (ret)
goto out;
} else {
@@ -2820,7 +2822,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_lock(&log_root_tree->log_mutex);
if (atomic_dec_and_test(&log_root_tree->log_writers)) {
- smp_mb();
+ /*
+ * Implicit memory barrier after atomic_dec_and_test
+ */
if (waitqueue_active(&log_root_tree->log_writer_wait))
wake_up(&log_root_tree->log_writer_wait);
}
@@ -2950,6 +2954,9 @@ out_wake_log_root:
atomic_set(&log_root_tree->log_commit[index2], 0);
mutex_unlock(&log_root_tree->log_mutex);
+ /*
+ * The barrier before waitqueue_active is implied by mutex_unlock
+ */
if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
wake_up(&log_root_tree->log_commit_wait[index2]);
out:
@@ -2961,6 +2968,9 @@ out:
atomic_set(&root->log_commit[index1], 0);
mutex_unlock(&root->log_mutex);
+ /*
+ * The barrier before waitqueue_active is implied by mutex_unlock
+ */
if (waitqueue_active(&root->log_commit_wait[index1]))
wake_up(&root->log_commit_wait[index1]);
return ret;
@@ -5314,7 +5324,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
ret = walk_log_tree(trans, log_root_tree, &wc);
if (ret) {
- btrfs_error(fs_info, ret, "Failed to pin buffers while "
+ btrfs_std_error(fs_info, ret, "Failed to pin buffers while "
"recovering log root tree.");
goto error;
}
@@ -5328,7 +5338,7 @@ again:
ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
if (ret < 0) {
- btrfs_error(fs_info, ret,
+ btrfs_std_error(fs_info, ret,
"Couldn't find tree log root.");
goto error;
}
@@ -5346,7 +5356,7 @@ again:
log = btrfs_read_fs_root(log_root_tree, &found_key);
if (IS_ERR(log)) {
ret = PTR_ERR(log);
- btrfs_error(fs_info, ret,
+ btrfs_std_error(fs_info, ret,
"Couldn't read tree log root.");
goto error;
}
@@ -5361,7 +5371,7 @@ again:
free_extent_buffer(log->node);
free_extent_buffer(log->commit_root);
kfree(log);
- btrfs_error(fs_info, ret, "Couldn't read target root "
+ btrfs_std_error(fs_info, ret, "Couldn't read target root "
"for tree log recovery.");
goto error;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6fc735869c18..a6df8fdc1312 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -42,6 +42,82 @@
#include "dev-replace.h"
#include "sysfs.h"
+const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
+ [BTRFS_RAID_RAID10] = {
+ .sub_stripes = 2,
+ .dev_stripes = 1,
+ .devs_max = 0, /* 0 == as many as possible */
+ .devs_min = 4,
+ .tolerated_failures = 1,
+ .devs_increment = 2,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_RAID1] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 2,
+ .devs_min = 2,
+ .tolerated_failures = 1,
+ .devs_increment = 2,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_DUP] = {
+ .sub_stripes = 1,
+ .dev_stripes = 2,
+ .devs_max = 1,
+ .devs_min = 1,
+ .tolerated_failures = 0,
+ .devs_increment = 1,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_RAID0] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 2,
+ .tolerated_failures = 0,
+ .devs_increment = 1,
+ .ncopies = 1,
+ },
+ [BTRFS_RAID_SINGLE] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 1,
+ .devs_min = 1,
+ .tolerated_failures = 0,
+ .devs_increment = 1,
+ .ncopies = 1,
+ },
+ [BTRFS_RAID_RAID5] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 2,
+ .tolerated_failures = 1,
+ .devs_increment = 1,
+ .ncopies = 2,
+ },
+ [BTRFS_RAID_RAID6] = {
+ .sub_stripes = 1,
+ .dev_stripes = 1,
+ .devs_max = 0,
+ .devs_min = 3,
+ .tolerated_failures = 2,
+ .devs_increment = 1,
+ .ncopies = 3,
+ },
+};
+
+const u64 const btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
+ [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
+ [BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1,
+ [BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP,
+ [BTRFS_RAID_RAID0] = BTRFS_BLOCK_GROUP_RAID0,
+ [BTRFS_RAID_SINGLE] = 0,
+ [BTRFS_RAID_RAID5] = BTRFS_BLOCK_GROUP_RAID5,
+ [BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6,
+};
+
static int init_first_rw_device(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_device *device);
@@ -156,8 +232,8 @@ static struct btrfs_device *__alloc_device(void)
spin_lock_init(&dev->reada_lock);
atomic_set(&dev->reada_in_flight, 0);
atomic_set(&dev->dev_stats_ccnt, 0);
- INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
- INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);
+ INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
+ INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
return dev;
}
@@ -198,7 +274,6 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
if (IS_ERR(*bdev)) {
ret = PTR_ERR(*bdev);
- printk(KERN_INFO "BTRFS: open %s failed\n", device_path);
goto error;
}
@@ -211,8 +286,8 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
}
invalidate_bdev(*bdev);
*bh = btrfs_read_dev_super(*bdev);
- if (!*bh) {
- ret = -EINVAL;
+ if (IS_ERR(*bh)) {
+ ret = PTR_ERR(*bh);
blkdev_put(*bdev, flags);
goto error;
}
@@ -345,6 +420,9 @@ loop_lock:
pending = pending->bi_next;
cur->bi_next = NULL;
+ /*
+ * atomic_dec_return implies a barrier for waitqueue_active
+ */
if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
waitqueue_active(&fs_info->async_submit_wait))
wake_up(&fs_info->async_submit_wait);
@@ -765,36 +843,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
- struct btrfs_device *new_device;
- struct rcu_string *name;
-
- if (device->bdev)
- fs_devices->open_devices--;
-
- if (device->writeable &&
- device->devid != BTRFS_DEV_REPLACE_DEVID) {
- list_del_init(&device->dev_alloc_list);
- fs_devices->rw_devices--;
- }
-
- if (device->missing)
- fs_devices->missing_devices--;
-
- new_device = btrfs_alloc_device(NULL, &device->devid,
- device->uuid);
- BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
-
- /* Safe because we are under uuid_mutex */
- if (device->name) {
- name = rcu_string_strdup(device->name->str, GFP_NOFS);
- BUG_ON(!name); /* -ENOMEM */
- rcu_assign_pointer(new_device->name, name);
- }
-
- list_replace_rcu(&device->dev_list, &new_device->dev_list);
- new_device->fs_devices = device->fs_devices;
-
- call_rcu(&device->rcu, free_device);
+ btrfs_close_one_device(device);
}
mutex_unlock(&fs_devices->device_list_mutex);
@@ -1402,7 +1451,7 @@ again:
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_dev_extent);
} else {
- btrfs_error(root->fs_info, ret, "Slot search failed");
+ btrfs_std_error(root->fs_info, ret, "Slot search failed");
goto out;
}
@@ -1410,10 +1459,10 @@ again:
ret = btrfs_del_item(trans, root, path);
if (ret) {
- btrfs_error(root->fs_info, ret,
+ btrfs_std_error(root->fs_info, ret,
"Failed to remove dev extent item");
} else {
- trans->transaction->have_free_bgs = 1;
+ set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
}
out:
btrfs_free_path(path);
@@ -1801,7 +1850,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
if (device->bdev) {
device->fs_devices->open_devices--;
/* remove sysfs entry */
- btrfs_kobj_rm_device(root->fs_info->fs_devices, device);
+ btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device);
}
call_rcu(&device->rcu, free_device);
@@ -1924,7 +1973,8 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
if (srcdev->writeable) {
fs_devices->rw_devices--;
/* zero out the old super if it is writable */
- btrfs_scratch_superblock(srcdev);
+ btrfs_scratch_superblocks(srcdev->bdev,
+ rcu_str_deref(srcdev->name));
}
if (srcdev->bdev)
@@ -1971,10 +2021,11 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
WARN_ON(!tgtdev);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- btrfs_kobj_rm_device(fs_info->fs_devices, tgtdev);
+ btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
if (tgtdev->bdev) {
- btrfs_scratch_superblock(tgtdev);
+ btrfs_scratch_superblocks(tgtdev->bdev,
+ rcu_str_deref(tgtdev->name));
fs_info->fs_devices->open_devices--;
}
fs_info->fs_devices->num_devices--;
@@ -2041,10 +2092,8 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
}
}
- if (!*device) {
- btrfs_err(root->fs_info, "no missing device found");
- return -ENOENT;
- }
+ if (!*device)
+ return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
return 0;
} else {
@@ -2309,7 +2358,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
tmp + 1);
/* add sysfs device entry */
- btrfs_kobj_add_device(root->fs_info->fs_devices, device);
+ btrfs_sysfs_add_device_link(root->fs_info->fs_devices, device);
/*
* we've got more storage, clear any full flags on the space
@@ -2350,9 +2399,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
*/
snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
root->fs_info->fsid);
- if (kobject_rename(&root->fs_info->fs_devices->super_kobj,
+ if (kobject_rename(&root->fs_info->fs_devices->fsid_kobj,
fsid_buf))
- pr_warn("BTRFS: sysfs: failed to create fsid for sprout\n");
+ btrfs_warn(root->fs_info,
+ "sysfs: failed to create fsid for sprout");
}
root->fs_info->num_tolerated_disk_barrier_failures =
@@ -2368,7 +2418,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
ret = btrfs_relocate_sys_chunks(root);
if (ret < 0)
- btrfs_error(root->fs_info, ret,
+ btrfs_std_error(root->fs_info, ret,
"Failed to relocate sys chunks after "
"device initialization. This can be fixed "
"using the \"btrfs balance\" command.");
@@ -2388,7 +2438,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
error_trans:
btrfs_end_transaction(trans, root);
rcu_string_free(device->name);
- btrfs_kobj_rm_device(root->fs_info->fs_devices, device);
+ btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device);
kfree(device);
error:
blkdev_put(bdev, FMODE_EXCL);
@@ -2613,7 +2663,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
if (ret < 0)
goto out;
else if (ret > 0) { /* Logic error or corruption */
- btrfs_error(root->fs_info, -ENOENT,
+ btrfs_std_error(root->fs_info, -ENOENT,
"Failed lookup while freeing chunk.");
ret = -ENOENT;
goto out;
@@ -2621,7 +2671,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
ret = btrfs_del_item(trans, root, path);
if (ret < 0)
- btrfs_error(root->fs_info, ret,
+ btrfs_std_error(root->fs_info, ret,
"Failed to delete chunk item.");
out:
btrfs_free_path(path);
@@ -2806,7 +2856,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
- btrfs_std_error(root->fs_info, ret);
+ btrfs_std_error(root->fs_info, ret, NULL);
return ret;
}
@@ -3009,16 +3059,19 @@ static void update_balance_args(struct btrfs_balance_control *bctl)
* (albeit full) chunks.
*/
if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
!(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
bctl->data.usage = 90;
}
if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
!(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
bctl->sys.usage = 90;
}
if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+ !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
!(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
bctl->meta.usage = 90;
@@ -3074,13 +3127,46 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
struct btrfs_balance_args *bargs)
{
struct btrfs_block_group_cache *cache;
+ u64 chunk_used;
+ u64 user_thresh_min;
+ u64 user_thresh_max;
+ int ret = 1;
+
+ cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+ chunk_used = btrfs_block_group_used(&cache->item);
+
+ if (bargs->usage_min == 0)
+ user_thresh_min = 0;
+ else
+ user_thresh_min = div_factor_fine(cache->key.offset,
+ bargs->usage_min);
+
+ if (bargs->usage_max == 0)
+ user_thresh_max = 1;
+ else if (bargs->usage_max > 100)
+ user_thresh_max = cache->key.offset;
+ else
+ user_thresh_max = div_factor_fine(cache->key.offset,
+ bargs->usage_max);
+
+ if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
+ ret = 0;
+
+ btrfs_put_block_group(cache);
+ return ret;
+}
+
+static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info,
+ u64 chunk_offset, struct btrfs_balance_args *bargs)
+{
+ struct btrfs_block_group_cache *cache;
u64 chunk_used, user_thresh;
int ret = 1;
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
chunk_used = btrfs_block_group_used(&cache->item);
- if (bargs->usage == 0)
+ if (bargs->usage_min == 0)
user_thresh = 1;
else if (bargs->usage > 100)
user_thresh = cache->key.offset;
@@ -3170,6 +3256,19 @@ static int chunk_vrange_filter(struct extent_buffer *leaf,
return 1;
}
+static int chunk_stripes_range_filter(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ struct btrfs_balance_args *bargs)
+{
+ int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+
+ if (bargs->stripes_min <= num_stripes
+ && num_stripes <= bargs->stripes_max)
+ return 0;
+
+ return 1;
+}
+
static int chunk_soft_convert_filter(u64 chunk_type,
struct btrfs_balance_args *bargs)
{
@@ -3216,6 +3315,9 @@ static int should_balance_chunk(struct btrfs_root *root,
if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
return 0;
+ } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
+ chunk_usage_range_filter(bctl->fs_info, chunk_offset, bargs)) {
+ return 0;
}
/* devid filter */
@@ -3236,6 +3338,12 @@ static int should_balance_chunk(struct btrfs_root *root,
return 0;
}
+ /* stripes filter */
+ if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
+ chunk_stripes_range_filter(leaf, chunk, bargs)) {
+ return 0;
+ }
+
/* soft profile changing mode */
if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
chunk_soft_convert_filter(chunk_type, bargs)) {
@@ -3250,6 +3358,16 @@ static int should_balance_chunk(struct btrfs_root *root,
return 0;
else
bargs->limit--;
+ } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
+ /*
+ * Same logic as the 'limit' filter; the minimum cannot be
+ * determined here because we do not have the global informatoin
+ * about the count of all chunks that satisfy the filters.
+ */
+ if (bargs->limit_max == 0)
+ return 0;
+ else
+ bargs->limit_max--;
}
return 1;
@@ -3264,6 +3382,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
struct btrfs_device *device;
u64 old_size;
u64 size_to_free;
+ u64 chunk_type;
struct btrfs_chunk *chunk;
struct btrfs_path *path;
struct btrfs_key key;
@@ -3274,9 +3393,14 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
int ret;
int enospc_errors = 0;
bool counting = true;
+ /* The single value limit and min/max limits use the same bytes in the */
u64 limit_data = bctl->data.limit;
u64 limit_meta = bctl->meta.limit;
u64 limit_sys = bctl->sys.limit;
+ u32 count_data = 0;
+ u32 count_meta = 0;
+ u32 count_sys = 0;
+ int chunk_reserved = 0;
/* step one make some room on all the devices */
devices = &fs_info->fs_devices->devices;
@@ -3317,6 +3441,10 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
spin_unlock(&fs_info->balance_lock);
again:
if (!counting) {
+ /*
+ * The single value limit and min/max limits use the same bytes
+ * in the
+ */
bctl->data.limit = limit_data;
bctl->meta.limit = limit_meta;
bctl->sys.limit = limit_sys;
@@ -3364,6 +3492,7 @@ again:
}
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+ chunk_type = btrfs_chunk_type(leaf, chunk);
if (!counting) {
spin_lock(&fs_info->balance_lock);
@@ -3373,6 +3502,7 @@ again:
ret = should_balance_chunk(chunk_root, leaf, chunk,
found_key.offset);
+
btrfs_release_path(path);
if (!ret) {
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
@@ -3384,9 +3514,50 @@ again:
spin_lock(&fs_info->balance_lock);
bctl->stat.expected++;
spin_unlock(&fs_info->balance_lock);
+
+ if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
+ count_data++;
+ else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
+ count_sys++;
+ else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
+ count_meta++;
+
goto loop;
}
+ /*
+ * Apply limit_min filter, no need to check if the LIMITS
+ * filter is used, limit_min is 0 by default
+ */
+ if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
+ count_data < bctl->data.limit_min)
+ || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
+ count_meta < bctl->meta.limit_min)
+ || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
+ count_sys < bctl->sys.limit_min)) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ goto loop;
+ }
+
+ if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && !chunk_reserved) {
+ trans = btrfs_start_transaction(chunk_root, 0);
+ if (IS_ERR(trans)) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ ret = PTR_ERR(trans);
+ goto error;
+ }
+
+ ret = btrfs_force_chunk_alloc(trans, chunk_root,
+ BTRFS_BLOCK_GROUP_DATA);
+ if (ret < 0) {
+ mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ goto error;
+ }
+
+ btrfs_end_transaction(trans, chunk_root);
+ chunk_reserved = 1;
+ }
+
ret = btrfs_relocate_chunk(chunk_root,
found_key.offset);
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
@@ -3461,11 +3632,20 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
unset_balance_control(fs_info);
ret = del_balance_item(fs_info->tree_root);
if (ret)
- btrfs_std_error(fs_info, ret);
+ btrfs_std_error(fs_info, ret, NULL);
atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
}
+/* Non-zero return value signifies invalidity */
+static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
+ u64 allowed)
+{
+ return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+ (!alloc_profile_is_valid(bctl_arg->target, 1) ||
+ (bctl_arg->target & ~allowed)));
+}
+
/*
* Should be called with both balance and volume mutexes held
*/
@@ -3523,27 +3703,21 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
if (num_devices > 3)
allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
BTRFS_BLOCK_GROUP_RAID6);
- if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
- (!alloc_profile_is_valid(bctl->data.target, 1) ||
- (bctl->data.target & ~allowed))) {
+ if (validate_convert_profile(&bctl->data, allowed)) {
btrfs_err(fs_info, "unable to start balance with target "
"data profile %llu",
bctl->data.target);
ret = -EINVAL;
goto out;
}
- if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
- (!alloc_profile_is_valid(bctl->meta.target, 1) ||
- (bctl->meta.target & ~allowed))) {
+ if (validate_convert_profile(&bctl->meta, allowed)) {
btrfs_err(fs_info,
"unable to start balance with target metadata profile %llu",
bctl->meta.target);
ret = -EINVAL;
goto out;
}
- if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
- (!alloc_profile_is_valid(bctl->sys.target, 1) ||
- (bctl->sys.target & ~allowed))) {
+ if (validate_convert_profile(&bctl->sys, allowed)) {
btrfs_err(fs_info,
"unable to start balance with target system profile %llu",
bctl->sys.target);
@@ -4285,65 +4459,6 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
return 0;
}
-static const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
- [BTRFS_RAID_RAID10] = {
- .sub_stripes = 2,
- .dev_stripes = 1,
- .devs_max = 0, /* 0 == as many as possible */
- .devs_min = 4,
- .devs_increment = 2,
- .ncopies = 2,
- },
- [BTRFS_RAID_RAID1] = {
- .sub_stripes = 1,
- .dev_stripes = 1,
- .devs_max = 2,
- .devs_min = 2,
- .devs_increment = 2,
- .ncopies = 2,
- },
- [BTRFS_RAID_DUP] = {
- .sub_stripes = 1,
- .dev_stripes = 2,
- .devs_max = 1,
- .devs_min = 1,
- .devs_increment = 1,
- .ncopies = 2,
- },
- [BTRFS_RAID_RAID0] = {
- .sub_stripes = 1,
- .dev_stripes = 1,
- .devs_max = 0,
- .devs_min = 2,
- .devs_increment = 1,
- .ncopies = 1,
- },
- [BTRFS_RAID_SINGLE] = {
- .sub_stripes = 1,
- .dev_stripes = 1,
- .devs_max = 1,
- .devs_min = 1,
- .devs_increment = 1,
- .ncopies = 1,
- },
- [BTRFS_RAID_RAID5] = {
- .sub_stripes = 1,
- .dev_stripes = 1,
- .devs_max = 0,
- .devs_min = 2,
- .devs_increment = 1,
- .ncopies = 2,
- },
- [BTRFS_RAID_RAID6] = {
- .sub_stripes = 1,
- .dev_stripes = 1,
- .devs_max = 0,
- .devs_min = 3,
- .devs_increment = 1,
- .ncopies = 3,
- },
-};
-
static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
{
/* TODO allow them to set a preferred stripe size */
@@ -6594,8 +6709,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
BUG_ON(!path);
ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
if (ret < 0) {
- printk_in_rcu(KERN_WARNING "BTRFS: "
- "error %d while searching for dev_stats item for device %s!\n",
+ btrfs_warn_in_rcu(dev_root->fs_info,
+ "error %d while searching for dev_stats item for device %s",
ret, rcu_str_deref(device->name));
goto out;
}
@@ -6605,8 +6720,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
/* need to delete old one and insert a new one */
ret = btrfs_del_item(trans, dev_root, path);
if (ret != 0) {
- printk_in_rcu(KERN_WARNING "BTRFS: "
- "delete too small dev_stats item for device %s failed %d!\n",
+ btrfs_warn_in_rcu(dev_root->fs_info,
+ "delete too small dev_stats item for device %s failed %d",
rcu_str_deref(device->name), ret);
goto out;
}
@@ -6619,9 +6734,9 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
ret = btrfs_insert_empty_item(trans, dev_root, path,
&key, sizeof(*ptr));
if (ret < 0) {
- printk_in_rcu(KERN_WARNING "BTRFS: "
- "insert dev_stats item for device %s failed %d!\n",
- rcu_str_deref(device->name), ret);
+ btrfs_warn_in_rcu(dev_root->fs_info,
+ "insert dev_stats item for device %s failed %d",
+ rcu_str_deref(device->name), ret);
goto out;
}
}
@@ -6675,8 +6790,8 @@ static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
{
if (!dev->dev_stats_valid)
return;
- printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
- "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+ btrfs_err_rl_in_rcu(dev->dev_root->fs_info,
+ "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
rcu_str_deref(dev->name),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
@@ -6695,8 +6810,8 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
if (i == BTRFS_DEV_STAT_VALUES_MAX)
return; /* all values == 0, suppress message */
- printk_in_rcu(KERN_INFO "BTRFS: "
- "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+ btrfs_info_in_rcu(dev->dev_root->fs_info,
+ "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
rcu_str_deref(dev->name),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
@@ -6740,22 +6855,34 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
return 0;
}
-int btrfs_scratch_superblock(struct btrfs_device *device)
+void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path)
{
struct buffer_head *bh;
struct btrfs_super_block *disk_super;
+ int copy_num;
- bh = btrfs_read_dev_super(device->bdev);
- if (!bh)
- return -EINVAL;
- disk_super = (struct btrfs_super_block *)bh->b_data;
+ if (!bdev)
+ return;
- memset(&disk_super->magic, 0, sizeof(disk_super->magic));
- set_buffer_dirty(bh);
- sync_dirty_buffer(bh);
- brelse(bh);
+ for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
+ copy_num++) {
- return 0;
+ if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
+ continue;
+
+ disk_super = (struct btrfs_super_block *)bh->b_data;
+
+ memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+ set_buffer_dirty(bh);
+ sync_dirty_buffer(bh);
+ brelse(bh);
+ }
+
+ /* Notify udev that device has changed */
+ btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+
+ /* Update ctime/mtime for device path for libblkid */
+ update_dev_time(device_path);
}
/*
@@ -6823,3 +6950,38 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
fs_devices = fs_devices->seed;
}
}
+
+void btrfs_close_one_device(struct btrfs_device *device)
+{
+ struct btrfs_fs_devices *fs_devices = device->fs_devices;
+ struct btrfs_device *new_device;
+ struct rcu_string *name;
+
+ if (device->bdev)
+ fs_devices->open_devices--;
+
+ if (device->writeable &&
+ device->devid != BTRFS_DEV_REPLACE_DEVID) {
+ list_del_init(&device->dev_alloc_list);
+ fs_devices->rw_devices--;
+ }
+
+ if (device->missing)
+ fs_devices->missing_devices--;
+
+ new_device = btrfs_alloc_device(NULL, &device->devid,
+ device->uuid);
+ BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
+
+ /* Safe because we are under uuid_mutex */
+ if (device->name) {
+ name = rcu_string_strdup(device->name->str, GFP_NOFS);
+ BUG_ON(!name); /* -ENOMEM */
+ rcu_assign_pointer(new_device->name, name);
+ }
+
+ list_replace_rcu(&device->dev_list, &new_device->dev_list);
+ new_device->fs_devices = device->fs_devices;
+
+ call_rcu(&device->rcu, free_device);
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2ca784a14e84..ec5712372732 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -256,7 +256,7 @@ struct btrfs_fs_devices {
struct btrfs_fs_info *fs_info;
/* sysfs kobjects */
- struct kobject super_kobj;
+ struct kobject fsid_kobj;
struct kobject *device_dir_kobj;
struct completion kobj_unregister;
};
@@ -334,10 +334,15 @@ struct btrfs_raid_attr {
int dev_stripes; /* stripes per dev */
int devs_max; /* max devs to use */
int devs_min; /* min devs needed */
+ int tolerated_failures; /* max tolerated fail devs */
int devs_increment; /* ndevs has to be a multiple of this */
int ncopies; /* how many copies to data has */
};
+extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES];
+
+extern const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES];
+
struct map_lookup {
u64 type;
int io_align;
@@ -375,6 +380,20 @@ struct map_lookup {
#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3)
#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4)
#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5)
+#define BTRFS_BALANCE_ARGS_LIMIT_RANGE (1ULL << 6)
+#define BTRFS_BALANCE_ARGS_STRIPES_RANGE (1ULL << 7)
+#define BTRFS_BALANCE_ARGS_USAGE_RANGE (1ULL << 8)
+
+#define BTRFS_BALANCE_ARGS_MASK \
+ (BTRFS_BALANCE_ARGS_PROFILES | \
+ BTRFS_BALANCE_ARGS_USAGE | \
+ BTRFS_BALANCE_ARGS_DEVID | \
+ BTRFS_BALANCE_ARGS_DRANGE | \
+ BTRFS_BALANCE_ARGS_VRANGE | \
+ BTRFS_BALANCE_ARGS_LIMIT | \
+ BTRFS_BALANCE_ARGS_LIMIT_RANGE | \
+ BTRFS_BALANCE_ARGS_STRIPES_RANGE | \
+ BTRFS_BALANCE_ARGS_USAGE_RANGE)
/*
* Profile changing flags. When SOFT is set we won't relocate chunk if
@@ -474,7 +493,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev);
void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
struct btrfs_device *tgtdev);
-int btrfs_scratch_superblock(struct btrfs_device *device);
+void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path);
int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
u64 logical, u64 len, int mirror_num);
unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
@@ -547,5 +566,6 @@ static inline void unlock_chunks(struct btrfs_root *root)
struct list_head *btrfs_get_fs_uuids(void);
void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
+void btrfs_close_one_device(struct btrfs_device *device);
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 6f518c90e1c1..1fcd7b6e7564 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -313,8 +313,10 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
/* check to make sure this item is what we want */
if (found_key.objectid != key.objectid)
break;
- if (found_key.type != BTRFS_XATTR_ITEM_KEY)
+ if (found_key.type > BTRFS_XATTR_ITEM_KEY)
break;
+ if (found_key.type < BTRFS_XATTR_ITEM_KEY)
+ goto next;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
if (verify_dir_item(root, leaf, di))
diff --git a/fs/buffer.c b/fs/buffer.c
index 82283abb2795..4f4cd959da7c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -999,7 +999,7 @@ grow_dev_page(struct block_device *bdev, sector_t block,
int ret = 0; /* Will call free_more_memory() */
gfp_t gfp_mask;
- gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp;
+ gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;
/*
* XXX: __getblk_slow() can not really deal with failure and
@@ -2420,9 +2420,9 @@ EXPORT_SYMBOL(block_commit_write);
* unlock the page.
*
* Direct callers of this function should protect against filesystem freezing
- * using sb_start_write() - sb_end_write() functions.
+ * using sb_start_pagefault() - sb_end_pagefault() functions.
*/
-int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
+int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
get_block_t get_block)
{
struct page *page = vmf->page;
@@ -2459,26 +2459,6 @@ out_unlock:
unlock_page(page);
return ret;
}
-EXPORT_SYMBOL(__block_page_mkwrite);
-
-int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
- get_block_t get_block)
-{
- int ret;
- struct super_block *sb = file_inode(vma->vm_file)->i_sb;
-
- sb_start_pagefault(sb);
-
- /*
- * Update file times before taking page lock. We may end up failing the
- * fault so this update may be superfluous but who really cares...
- */
- file_update_time(vma->vm_file);
-
- ret = __block_page_mkwrite(vma, vmf, get_block);
- sb_end_pagefault(sb);
- return block_page_mkwrite_return(ret);
-}
EXPORT_SYMBOL(block_page_mkwrite);
/*
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index aecd0859eacb..9c4b737a54df 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -30,7 +30,7 @@ extern unsigned cachefiles_debug;
#define CACHEFILES_DEBUG_KLEAVE 2
#define CACHEFILES_DEBUG_KDEBUG 4
-#define cachefiles_gfp (__GFP_WAIT | __GFP_NORETRY | __GFP_NOMEMALLOC)
+#define cachefiles_gfp (__GFP_RECLAIM | __GFP_NORETRY | __GFP_NOMEMALLOC)
/*
* node records
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index fc1056f5c96a..c4b893453e0e 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -655,6 +655,8 @@ lookup_again:
aops = d_backing_inode(object->dentry)->i_mapping->a_ops;
if (!aops->bmap)
goto check_error;
+ if (object->dentry->d_sb->s_blocksize > PAGE_SIZE)
+ goto check_error;
object->backer = object->dentry;
} else {
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 3cbb0e834694..7a6b02f72787 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -414,9 +414,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
ASSERT(inode->i_mapping->a_ops->readpages);
/* calculate the shift required to use bmap */
- if (inode->i_sb->s_blocksize > PAGE_SIZE)
- goto enobufs;
-
shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
op->op.flags &= FSCACHE_OP_KEEP_FLAGS;
@@ -711,9 +708,6 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op,
ASSERT(inode->i_mapping->a_ops->readpages);
/* calculate the shift required to use bmap */
- if (inode->i_sb->s_blocksize > PAGE_SIZE)
- goto all_enobufs;
-
shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits;
pagevec_init(&pagevec, 0);
@@ -905,6 +899,15 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
cache = container_of(object->fscache.cache,
struct cachefiles_cache, cache);
+ pos = (loff_t)page->index << PAGE_SHIFT;
+
+ /* We mustn't write more data than we have, so we have to beware of a
+ * partial page at EOF.
+ */
+ eof = object->fscache.store_limit_l;
+ if (pos >= eof)
+ goto error;
+
/* write the page to the backing filesystem and let it store it in its
* own time */
path.mnt = cache->mnt;
@@ -912,40 +915,38 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page)
file = dentry_open(&path, O_RDWR | O_LARGEFILE, cache->cache_cred);
if (IS_ERR(file)) {
ret = PTR_ERR(file);
- } else {
- pos = (loff_t) page->index << PAGE_SHIFT;
-
- /* we mustn't write more data than we have, so we have
- * to beware of a partial page at EOF */
- eof = object->fscache.store_limit_l;
- len = PAGE_SIZE;
- if (eof & ~PAGE_MASK) {
- ASSERTCMP(pos, <, eof);
- if (eof - pos < PAGE_SIZE) {
- _debug("cut short %llx to %llx",
- pos, eof);
- len = eof - pos;
- ASSERTCMP(pos + len, ==, eof);
- }
- }
-
- data = kmap(page);
- ret = __kernel_write(file, data, len, &pos);
- kunmap(page);
- if (ret != len)
- ret = -EIO;
- fput(file);
+ goto error_2;
}
- if (ret < 0) {
- if (ret == -EIO)
- cachefiles_io_error_obj(
- object, "Write page to backing file failed");
- ret = -ENOBUFS;
+ len = PAGE_SIZE;
+ if (eof & ~PAGE_MASK) {
+ if (eof - pos < PAGE_SIZE) {
+ _debug("cut short %llx to %llx",
+ pos, eof);
+ len = eof - pos;
+ ASSERTCMP(pos + len, ==, eof);
+ }
}
- _leave(" = %d", ret);
- return ret;
+ data = kmap(page);
+ ret = __kernel_write(file, data, len, &pos);
+ kunmap(page);
+ fput(file);
+ if (ret != len)
+ goto error_eio;
+
+ _leave(" = 0");
+ return 0;
+
+error_eio:
+ ret = -EIO;
+error_2:
+ if (ret == -EIO)
+ cachefiles_io_error_obj(object,
+ "Write page to backing file failed");
+error:
+ _leave(" = -ENOBUFS [%d]", ret);
+ return -ENOBUFS;
}
/*
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 9d23e788d1df..b7d218a168fb 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1283,8 +1283,8 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
int ret1;
struct address_space *mapping = inode->i_mapping;
struct page *page = find_or_create_page(mapping, 0,
- mapping_gfp_mask(mapping) &
- ~__GFP_FS);
+ mapping_gfp_constraint(mapping,
+ ~__GFP_FS));
if (!page) {
ret = VM_FAULT_OOM;
goto out;
@@ -1428,7 +1428,8 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
if (i_size_read(inode) == 0)
return;
page = find_or_create_page(mapping, 0,
- mapping_gfp_mask(mapping) & ~__GFP_FS);
+ mapping_gfp_constraint(mapping,
+ ~__GFP_FS));
if (!page)
return;
if (PageUptodate(page)) {
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 834f9f3723fb..a4766ded1ba7 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -88,7 +88,7 @@ static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data,
const struct ceph_inode_info* ci = cookie_netfs_data;
uint16_t klen;
- /* use ceph virtual inode (id + snaphot) */
+ /* use ceph virtual inode (id + snapshot) */
klen = sizeof(ci->i_vino);
if (klen > maxbuf)
return 0;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 27b566874bc1..c69e1253b47b 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1655,9 +1655,8 @@ retry_locked:
!S_ISDIR(inode->i_mode) && /* ignore readdir cache */
ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
inode->i_data.nrpages && /* have cached pages */
- (file_wanted == 0 || /* no open files */
- (revoking & (CEPH_CAP_FILE_CACHE|
- CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */
+ (revoking & (CEPH_CAP_FILE_CACHE|
+ CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */
!tried_invalidate) {
dout("check_caps trying to invalidate on %p\n", inode);
if (try_nonblocking_invalidate(inode) < 0) {
@@ -1971,49 +1970,46 @@ out:
}
/*
- * wait for any uncommitted directory operations to commit.
+ * wait for any unsafe requests to complete.
*/
-static int unsafe_dirop_wait(struct inode *inode)
+static int unsafe_request_wait(struct inode *inode)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct list_head *head = &ci->i_unsafe_dirops;
- struct ceph_mds_request *req;
- u64 last_tid;
- int ret = 0;
-
- if (!S_ISDIR(inode->i_mode))
- return 0;
+ struct ceph_mds_request *req1 = NULL, *req2 = NULL;
+ int ret, err = 0;
spin_lock(&ci->i_unsafe_lock);
- if (list_empty(head))
- goto out;
-
- req = list_last_entry(head, struct ceph_mds_request,
- r_unsafe_dir_item);
- last_tid = req->r_tid;
-
- do {
- ceph_mdsc_get_request(req);
- spin_unlock(&ci->i_unsafe_lock);
+ if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) {
+ req1 = list_last_entry(&ci->i_unsafe_dirops,
+ struct ceph_mds_request,
+ r_unsafe_dir_item);
+ ceph_mdsc_get_request(req1);
+ }
+ if (!list_empty(&ci->i_unsafe_iops)) {
+ req2 = list_last_entry(&ci->i_unsafe_iops,
+ struct ceph_mds_request,
+ r_unsafe_target_item);
+ ceph_mdsc_get_request(req2);
+ }
+ spin_unlock(&ci->i_unsafe_lock);
- dout("unsafe_dirop_wait %p wait on tid %llu (until %llu)\n",
- inode, req->r_tid, last_tid);
- ret = !wait_for_completion_timeout(&req->r_safe_completion,
- ceph_timeout_jiffies(req->r_timeout));
+ dout("unsafe_requeset_wait %p wait on tid %llu %llu\n",
+ inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
+ if (req1) {
+ ret = !wait_for_completion_timeout(&req1->r_safe_completion,
+ ceph_timeout_jiffies(req1->r_timeout));
if (ret)
- ret = -EIO; /* timed out */
-
- ceph_mdsc_put_request(req);
-
- spin_lock(&ci->i_unsafe_lock);
- if (ret || list_empty(head))
- break;
- req = list_first_entry(head, struct ceph_mds_request,
- r_unsafe_dir_item);
- } while (req->r_tid < last_tid);
-out:
- spin_unlock(&ci->i_unsafe_lock);
- return ret;
+ err = -EIO;
+ ceph_mdsc_put_request(req1);
+ }
+ if (req2) {
+ ret = !wait_for_completion_timeout(&req2->r_safe_completion,
+ ceph_timeout_jiffies(req2->r_timeout));
+ if (ret)
+ err = -EIO;
+ ceph_mdsc_put_request(req2);
+ }
+ return err;
}
int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
@@ -2039,7 +2035,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
dirty = try_flush_caps(inode, &flush_tid);
dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
- ret = unsafe_dirop_wait(inode);
+ ret = unsafe_request_wait(inode);
/*
* only wait on non-file metadata writeback (the mds
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 0c62868b5c56..3c68e6aee2f0 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -34,6 +34,74 @@
* need to wait for MDS acknowledgement.
*/
+/*
+ * Calculate the length sum of direct io vectors that can
+ * be combined into one page vector.
+ */
+static size_t dio_get_pagev_size(const struct iov_iter *it)
+{
+ const struct iovec *iov = it->iov;
+ const struct iovec *iovend = iov + it->nr_segs;
+ size_t size;
+
+ size = iov->iov_len - it->iov_offset;
+ /*
+ * An iov can be page vectored when both the current tail
+ * and the next base are page aligned.
+ */
+ while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) &&
+ (++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) {
+ size += iov->iov_len;
+ }
+ dout("dio_get_pagevlen len = %zu\n", size);
+ return size;
+}
+
+/*
+ * Allocate a page vector based on (@it, @nbytes).
+ * The return value is the tuple describing a page vector,
+ * that is (@pages, @page_align, @num_pages).
+ */
+static struct page **
+dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
+ size_t *page_align, int *num_pages)
+{
+ struct iov_iter tmp_it = *it;
+ size_t align;
+ struct page **pages;
+ int ret = 0, idx, npages;
+
+ align = (unsigned long)(it->iov->iov_base + it->iov_offset) &
+ (PAGE_SIZE - 1);
+ npages = calc_pages_for(align, nbytes);
+ pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL);
+ if (!pages) {
+ pages = vmalloc(sizeof(*pages) * npages);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ for (idx = 0; idx < npages; ) {
+ size_t start;
+ ret = iov_iter_get_pages(&tmp_it, pages + idx, nbytes,
+ npages - idx, &start);
+ if (ret < 0)
+ goto fail;
+
+ iov_iter_advance(&tmp_it, ret);
+ nbytes -= ret;
+ idx += (ret + start + PAGE_SIZE - 1) / PAGE_SIZE;
+ }
+
+ BUG_ON(nbytes != 0);
+ *num_pages = npages;
+ *page_align = align;
+ dout("dio_get_pages_alloc: got %d pages align %zu\n", npages, align);
+ return pages;
+fail:
+ ceph_put_page_vector(pages, idx, false);
+ return ERR_PTR(ret);
+}
/*
* Prepare an open request. Preallocate ceph_cap to avoid an
@@ -458,11 +526,10 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
size_t start;
ssize_t n;
- n = iov_iter_get_pages_alloc(i, &pages, INT_MAX, &start);
- if (n < 0)
- return n;
-
- num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
+ n = dio_get_pagev_size(i);
+ pages = dio_get_pages_alloc(i, n, &start, &num_pages);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
ret = striped_read(inode, off, n,
pages, num_pages, checkeof,
@@ -592,7 +659,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
CEPH_OSD_FLAG_WRITE;
while (iov_iter_count(from) > 0) {
- u64 len = iov_iter_single_seg_count(from);
+ u64 len = dio_get_pagev_size(from);
size_t start;
ssize_t n;
@@ -611,14 +678,14 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
- n = iov_iter_get_pages_alloc(from, &pages, len, &start);
- if (unlikely(n < 0)) {
- ret = n;
+ n = len;
+ pages = dio_get_pages_alloc(from, len, &start, &num_pages);
+ if (IS_ERR(pages)) {
ceph_osdc_put_request(req);
+ ret = PTR_ERR(pages);
break;
}
- num_pages = (n + start + PAGE_SIZE - 1) / PAGE_SIZE;
/*
* throw out any page cache pages in this range. this
* may block.
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 96d2bd829902..498dcfa2dcdb 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -452,6 +452,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
INIT_LIST_HEAD(&ci->i_unsafe_writes);
INIT_LIST_HEAD(&ci->i_unsafe_dirops);
+ INIT_LIST_HEAD(&ci->i_unsafe_iops);
spin_lock_init(&ci->i_unsafe_lock);
ci->i_snap_realm = NULL;
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index 6706bde9ad1b..a2cb0c254060 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -228,12 +228,12 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
file, lock_cmd, wait, fl);
if (!err) {
- err = flock_lock_file_wait(file, fl);
+ err = locks_lock_file_wait(file, fl);
if (err) {
ceph_lock_message(CEPH_LOCK_FLOCK,
CEPH_MDS_OP_SETFILELOCK,
file, CEPH_LOCK_UNLOCK, 0, fl);
- dout("got %d on flock_lock_file_wait, undid lock", err);
+ dout("got %d on locks_lock_file_wait, undid lock", err);
}
}
return err;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 51cb02da75d9..e7b130a637f9 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -633,13 +633,8 @@ static void __register_request(struct ceph_mds_client *mdsc,
mdsc->oldest_tid = req->r_tid;
if (dir) {
- struct ceph_inode_info *ci = ceph_inode(dir);
-
ihold(dir);
- spin_lock(&ci->i_unsafe_lock);
req->r_unsafe_dir = dir;
- list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
- spin_unlock(&ci->i_unsafe_lock);
}
}
@@ -665,13 +660,20 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
rb_erase(&req->r_node, &mdsc->request_tree);
RB_CLEAR_NODE(&req->r_node);
- if (req->r_unsafe_dir) {
+ if (req->r_unsafe_dir && req->r_got_unsafe) {
struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
-
spin_lock(&ci->i_unsafe_lock);
list_del_init(&req->r_unsafe_dir_item);
spin_unlock(&ci->i_unsafe_lock);
+ }
+ if (req->r_target_inode && req->r_got_unsafe) {
+ struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
+ spin_lock(&ci->i_unsafe_lock);
+ list_del_init(&req->r_unsafe_target_item);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
+ if (req->r_unsafe_dir) {
iput(req->r_unsafe_dir);
req->r_unsafe_dir = NULL;
}
@@ -1430,6 +1432,13 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
if ((used | wanted) & CEPH_CAP_ANY_WR)
goto out;
}
+ /* The inode has cached pages, but it's no longer used.
+ * we can safely drop it */
+ if (wanted == 0 && used == CEPH_CAP_FILE_CACHE &&
+ !(oissued & CEPH_CAP_FILE_CACHE)) {
+ used = 0;
+ oissued = 0;
+ }
if ((used | wanted) & ~oissued & mine)
goto out; /* we need these caps */
@@ -1438,7 +1447,7 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
/* we aren't the only cap.. just remove us */
__ceph_remove_cap(cap, true);
} else {
- /* try to drop referring dentries */
+ /* try dropping referring dentries */
spin_unlock(&ci->i_ceph_lock);
d_prune_aliases(inode);
dout("trim_caps_cb %p cap %p pruned, count now %d\n",
@@ -1704,6 +1713,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
req->r_started = jiffies;
req->r_resend_mds = -1;
INIT_LIST_HEAD(&req->r_unsafe_dir_item);
+ INIT_LIST_HEAD(&req->r_unsafe_target_item);
req->r_fmode = -1;
kref_init(&req->r_kref);
INIT_LIST_HEAD(&req->r_wait);
@@ -1935,7 +1945,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
len = sizeof(*head) +
pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64)) +
- sizeof(struct timespec);
+ sizeof(struct ceph_timespec);
/* calculate (max) length for cap releases */
len += sizeof(struct ceph_mds_request_release) *
@@ -2477,6 +2487,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
} else {
req->r_got_unsafe = true;
list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
+ if (req->r_unsafe_dir) {
+ struct ceph_inode_info *ci =
+ ceph_inode(req->r_unsafe_dir);
+ spin_lock(&ci->i_unsafe_lock);
+ list_add_tail(&req->r_unsafe_dir_item,
+ &ci->i_unsafe_dirops);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
}
dout("handle_reply tid %lld result %d\n", tid, result);
@@ -2518,6 +2536,13 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
up_read(&mdsc->snap_rwsem);
if (realm)
ceph_put_snap_realm(mdsc, realm);
+
+ if (err == 0 && req->r_got_unsafe && req->r_target_inode) {
+ struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
+ spin_lock(&ci->i_unsafe_lock);
+ list_add_tail(&req->r_unsafe_target_item, &ci->i_unsafe_iops);
+ spin_unlock(&ci->i_unsafe_lock);
+ }
out_err:
mutex_lock(&mdsc->mutex);
if (!req->r_aborted) {
@@ -3917,17 +3942,19 @@ static struct ceph_msg *mds_alloc_msg(struct ceph_connection *con,
return msg;
}
-static int sign_message(struct ceph_connection *con, struct ceph_msg *msg)
+static int mds_sign_message(struct ceph_msg *msg)
{
- struct ceph_mds_session *s = con->private;
+ struct ceph_mds_session *s = msg->con->private;
struct ceph_auth_handshake *auth = &s->s_auth;
+
return ceph_auth_sign_message(auth, msg);
}
-static int check_message_signature(struct ceph_connection *con, struct ceph_msg *msg)
+static int mds_check_message_signature(struct ceph_msg *msg)
{
- struct ceph_mds_session *s = con->private;
+ struct ceph_mds_session *s = msg->con->private;
struct ceph_auth_handshake *auth = &s->s_auth;
+
return ceph_auth_check_message_signature(auth, msg);
}
@@ -3940,8 +3967,8 @@ static const struct ceph_connection_operations mds_con_ops = {
.invalidate_authorizer = invalidate_authorizer,
.peer_reset = peer_reset,
.alloc_msg = mds_alloc_msg,
- .sign_message = sign_message,
- .check_message_signature = check_message_signature,
+ .sign_message = mds_sign_message,
+ .check_message_signature = mds_check_message_signature,
};
/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index f575eafe2261..ccf11ef0ca87 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -236,6 +236,9 @@ struct ceph_mds_request {
struct inode *r_unsafe_dir;
struct list_head r_unsafe_dir_item;
+ /* unsafe requests that modify the target inode */
+ struct list_head r_unsafe_target_item;
+
struct ceph_mds_session *r_session;
int r_attempts; /* resend attempts */
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 2f2460d23a06..75b7d125ce66 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -342,6 +342,7 @@ struct ceph_inode_info {
struct list_head i_unsafe_writes; /* uncommitted sync writes */
struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
+ struct list_head i_unsafe_iops; /* uncommitted mds inode ops */
spinlock_t i_unsafe_lock;
struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index f4cf200b3c76..6908080e9b6d 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -42,7 +42,7 @@ cifs_spnego_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
goto error;
/* attach the data */
- key->payload.data = payload;
+ key->payload.data[0] = payload;
ret = 0;
error:
@@ -52,7 +52,7 @@ error:
static void
cifs_spnego_key_destroy(struct key *key)
{
- kfree(key->payload.data);
+ kfree(key->payload.data[0]);
}
@@ -167,7 +167,7 @@ cifs_get_spnego_key(struct cifs_ses *sesInfo)
#ifdef CONFIG_CIFS_DEBUG2
if (cifsFYI && !IS_ERR(spnego_key)) {
- struct cifs_spnego_msg *msg = spnego_key->payload.data;
+ struct cifs_spnego_msg *msg = spnego_key->payload.data[0];
cifs_dump_mem("SPNEGO reply blob:", msg->data, min(1024U,
msg->secblob_len + msg->sesskey_len));
}
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 1ea780bc6376..3f93125916bf 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -58,16 +58,15 @@ cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
* dereference payload.data!
*/
if (prep->datalen <= sizeof(key->payload)) {
- key->payload.value = 0;
- memcpy(&key->payload.value, prep->data, prep->datalen);
- key->datalen = prep->datalen;
- return 0;
+ key->payload.data[0] = NULL;
+ memcpy(&key->payload, prep->data, prep->datalen);
+ } else {
+ payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL);
+ if (!payload)
+ return -ENOMEM;
+ key->payload.data[0] = payload;
}
- payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL);
- if (!payload)
- return -ENOMEM;
- key->payload.data = payload;
key->datalen = prep->datalen;
return 0;
}
@@ -76,7 +75,7 @@ static inline void
cifs_idmap_key_destroy(struct key *key)
{
if (key->datalen > sizeof(key->payload))
- kfree(key->payload.data);
+ kfree(key->payload.data[0]);
}
static struct key_type cifs_idmap_key_type = {
@@ -233,8 +232,8 @@ id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid)
* it could be.
*/
ksid = sidkey->datalen <= sizeof(sidkey->payload) ?
- (struct cifs_sid *)&sidkey->payload.value :
- (struct cifs_sid *)sidkey->payload.data;
+ (struct cifs_sid *)&sidkey->payload :
+ (struct cifs_sid *)sidkey->payload.data[0];
ksid_size = CIFS_SID_BASE_SIZE + (ksid->num_subauth * sizeof(__le32));
if (ksid_size > sidkey->datalen) {
@@ -307,14 +306,14 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
if (sidtype == SIDOWNER) {
kuid_t uid;
uid_t id;
- memcpy(&id, &sidkey->payload.value, sizeof(uid_t));
+ memcpy(&id, &sidkey->payload.data[0], sizeof(uid_t));
uid = make_kuid(&init_user_ns, id);
if (uid_valid(uid))
fuid = uid;
} else {
kgid_t gid;
gid_t id;
- memcpy(&id, &sidkey->payload.value, sizeof(gid_t));
+ memcpy(&id, &sidkey->payload.data[0], sizeof(gid_t));
gid = make_kgid(&init_user_ns, id);
if (gid_valid(gid))
fgid = gid;
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index aa0dc2573374..afa09fce8151 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -444,6 +444,48 @@ find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp)
return 0;
}
+/* Server has provided av pairs/target info in the type 2 challenge
+ * packet and we have plucked it and stored within smb session.
+ * We parse that blob here to find the server given timestamp
+ * as part of ntlmv2 authentication (or local current time as
+ * default in case of failure)
+ */
+static __le64
+find_timestamp(struct cifs_ses *ses)
+{
+ unsigned int attrsize;
+ unsigned int type;
+ unsigned int onesize = sizeof(struct ntlmssp2_name);
+ unsigned char *blobptr;
+ unsigned char *blobend;
+ struct ntlmssp2_name *attrptr;
+
+ if (!ses->auth_key.len || !ses->auth_key.response)
+ return 0;
+
+ blobptr = ses->auth_key.response;
+ blobend = blobptr + ses->auth_key.len;
+
+ while (blobptr + onesize < blobend) {
+ attrptr = (struct ntlmssp2_name *) blobptr;
+ type = le16_to_cpu(attrptr->type);
+ if (type == NTLMSSP_AV_EOL)
+ break;
+ blobptr += 2; /* advance attr type */
+ attrsize = le16_to_cpu(attrptr->length);
+ blobptr += 2; /* advance attr size */
+ if (blobptr + attrsize > blobend)
+ break;
+ if (type == NTLMSSP_AV_TIMESTAMP) {
+ if (attrsize == sizeof(u64))
+ return *((__le64 *)blobptr);
+ }
+ blobptr += attrsize; /* advance attr value */
+ }
+
+ return cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+}
+
static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
const struct nls_table *nls_cp)
{
@@ -641,6 +683,7 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
struct ntlmv2_resp *ntlmv2;
char ntlmv2_hash[16];
unsigned char *tiblob = NULL; /* target info blob */
+ __le64 rsp_timestamp;
if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED) {
if (!ses->domainName) {
@@ -659,6 +702,12 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
}
}
+ /* Must be within 5 minutes of the server (or in range +/-2h
+ * in case of Mac OS X), so simply carry over server timestamp
+ * (as Windows 7 does)
+ */
+ rsp_timestamp = find_timestamp(ses);
+
baselen = CIFS_SESS_KEY_SIZE + sizeof(struct ntlmv2_resp);
tilen = ses->auth_key.len;
tiblob = ses->auth_key.response;
@@ -675,8 +724,8 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp)
(ses->auth_key.response + CIFS_SESS_KEY_SIZE);
ntlmv2->blob_signature = cpu_to_le32(0x00000101);
ntlmv2->reserved = 0;
- /* Must be within 5 minutes of the server */
- ntlmv2->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
+ ntlmv2->time = rsp_timestamp;
+
get_random_bytes(&ntlmv2->client_chal, sizeof(ntlmv2->client_chal));
ntlmv2->reserved2 = 0;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 6a1119e87fbb..cbc0f4bca0c0 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -325,8 +325,11 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server)
static void
cifs_show_security(struct seq_file *s, struct cifs_ses *ses)
{
- if (ses->sectype == Unspecified)
+ if (ses->sectype == Unspecified) {
+ if (ses->user_name == NULL)
+ seq_puts(s, ",sec=none");
return;
+ }
seq_puts(s, ",sec=");
@@ -451,6 +454,10 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_puts(s, ",nocase");
if (tcon->retry)
seq_puts(s, ",hard");
+ if (tcon->use_persistent)
+ seq_puts(s, ",persistenthandles");
+ else if (tcon->use_resilient)
+ seq_puts(s, ",resilienthandles");
if (tcon->unix_ext)
seq_puts(s, ",unix");
else
@@ -918,9 +925,7 @@ const struct file_operations cifs_file_ops = {
.mmap = cifs_file_mmap,
.splice_read = generic_file_splice_read,
.llseek = cifs_llseek,
-#ifdef CONFIG_CIFS_POSIX
.unlocked_ioctl = cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
@@ -936,9 +941,7 @@ const struct file_operations cifs_file_strict_ops = {
.mmap = cifs_file_strict_mmap,
.splice_read = generic_file_splice_read,
.llseek = cifs_llseek,
-#ifdef CONFIG_CIFS_POSIX
.unlocked_ioctl = cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
@@ -954,9 +957,7 @@ const struct file_operations cifs_file_direct_ops = {
.flush = cifs_flush,
.mmap = cifs_file_mmap,
.splice_read = generic_file_splice_read,
-#ifdef CONFIG_CIFS_POSIX
.unlocked_ioctl = cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
.llseek = cifs_llseek,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
@@ -972,9 +973,7 @@ const struct file_operations cifs_file_nobrl_ops = {
.mmap = cifs_file_mmap,
.splice_read = generic_file_splice_read,
.llseek = cifs_llseek,
-#ifdef CONFIG_CIFS_POSIX
.unlocked_ioctl = cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
@@ -989,9 +988,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
.mmap = cifs_file_strict_mmap,
.splice_read = generic_file_splice_read,
.llseek = cifs_llseek,
-#ifdef CONFIG_CIFS_POSIX
.unlocked_ioctl = cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
};
@@ -1006,9 +1003,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
.flush = cifs_flush,
.mmap = cifs_file_mmap,
.splice_read = generic_file_splice_read,
-#ifdef CONFIG_CIFS_POSIX
.unlocked_ioctl = cifs_ioctl,
-#endif /* CONFIG_CIFS_POSIX */
.llseek = cifs_llseek,
.setlease = cifs_setlease,
.fallocate = cifs_fallocate,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 27aea110e923..c3cc1609025f 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -136,5 +136,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
-#define CIFS_VERSION "2.07"
+#define CIFS_VERSION "2.08"
#endif /* _CIFSFS_H */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index b406a32deb1f..2b510c537a0d 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -493,7 +493,10 @@ struct smb_vol {
bool mfsymlinks:1; /* use Minshall+French Symlinks */
bool multiuser:1;
bool rwpidforward:1; /* pid forward for read/write operations */
- bool nosharesock;
+ bool nosharesock:1;
+ bool persistent:1;
+ bool nopersistent:1;
+ bool resilient:1; /* noresilient not required since not fored for CA */
unsigned int rsize;
unsigned int wsize;
bool sockopt_tcp_nodelay:1;
@@ -895,6 +898,8 @@ struct cifs_tcon {
bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */
bool broken_sparse_sup; /* if server or share does not support sparse */
bool need_reconnect:1; /* connection reset, tid now invalid */
+ bool use_resilient:1; /* use resilient instead of durable handles */
+ bool use_persistent:1; /* use persistent instead of durable handles */
#ifdef CONFIG_CIFS_SMB2
bool print:1; /* set if connection to printer share */
bool bad_network_name:1; /* set if ret status STATUS_BAD_NETWORK_NAME */
@@ -1015,6 +1020,7 @@ struct cifs_fid {
__u64 persistent_fid; /* persist file id for smb2 */
__u64 volatile_fid; /* volatile file id for smb2 */
__u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for smb2 */
+ __u8 create_guid[16];
#endif
struct cifs_pending_open *pending_open;
unsigned int epoch;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 773f4dc77630..ecb0803bdb0e 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -87,6 +87,8 @@ enum {
Opt_sign, Opt_seal, Opt_noac,
Opt_fsc, Opt_mfsymlinks,
Opt_multiuser, Opt_sloppy, Opt_nosharesock,
+ Opt_persistent, Opt_nopersistent,
+ Opt_resilient, Opt_noresilient,
/* Mount options which take numeric value */
Opt_backupuid, Opt_backupgid, Opt_uid,
@@ -169,6 +171,10 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_multiuser, "multiuser" },
{ Opt_sloppy, "sloppy" },
{ Opt_nosharesock, "nosharesock" },
+ { Opt_persistent, "persistenthandles"},
+ { Opt_nopersistent, "nopersistenthandles"},
+ { Opt_resilient, "resilienthandles"},
+ { Opt_noresilient, "noresilienthandles"},
{ Opt_backupuid, "backupuid=%s" },
{ Opt_backupgid, "backupgid=%s" },
@@ -1497,6 +1503,33 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
case Opt_nosharesock:
vol->nosharesock = true;
break;
+ case Opt_nopersistent:
+ vol->nopersistent = true;
+ if (vol->persistent) {
+ cifs_dbg(VFS,
+ "persistenthandles mount options conflict\n");
+ goto cifs_parse_mount_err;
+ }
+ break;
+ case Opt_persistent:
+ vol->persistent = true;
+ if ((vol->nopersistent) || (vol->resilient)) {
+ cifs_dbg(VFS,
+ "persistenthandles mount options conflict\n");
+ goto cifs_parse_mount_err;
+ }
+ break;
+ case Opt_resilient:
+ vol->resilient = true;
+ if (vol->persistent) {
+ cifs_dbg(VFS,
+ "persistenthandles mount options conflict\n");
+ goto cifs_parse_mount_err;
+ }
+ break;
+ case Opt_noresilient:
+ vol->resilient = false; /* already the default */
+ break;
/* Numeric Values */
case Opt_backupuid:
@@ -2325,13 +2358,14 @@ static int
cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
{
int rc = 0;
- char *desc, *delim, *payload;
+ const char *delim, *payload;
+ char *desc;
ssize_t len;
struct key *key;
struct TCP_Server_Info *server = ses->server;
struct sockaddr_in *sa;
struct sockaddr_in6 *sa6;
- struct user_key_payload *upayload;
+ const struct user_key_payload *upayload;
desc = kmalloc(CIFSCREDS_DESC_SIZE, GFP_KERNEL);
if (!desc)
@@ -2374,14 +2408,14 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
}
down_read(&key->sem);
- upayload = key->payload.data;
+ upayload = user_key_payload(key);
if (IS_ERR_OR_NULL(upayload)) {
rc = upayload ? PTR_ERR(upayload) : -EINVAL;
goto out_key_put;
}
/* find first : in payload */
- payload = (char *)upayload->data;
+ payload = upayload->data;
delim = strnchr(payload, upayload->datalen, ':');
cifs_dbg(FYI, "payload=%s\n", payload);
if (!delim) {
@@ -2654,6 +2688,42 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
cifs_dbg(FYI, "DFS disabled (%d)\n", tcon->Flags);
}
tcon->seal = volume_info->seal;
+ tcon->use_persistent = false;
+ /* check if SMB2 or later, CIFS does not support persistent handles */
+ if (volume_info->persistent) {
+ if (ses->server->vals->protocol_id == 0) {
+ cifs_dbg(VFS,
+ "SMB3 or later required for persistent handles\n");
+ rc = -EOPNOTSUPP;
+ goto out_fail;
+#ifdef CONFIG_CIFS_SMB2
+ } else if (ses->server->capabilities &
+ SMB2_GLOBAL_CAP_PERSISTENT_HANDLES)
+ tcon->use_persistent = true;
+ else /* persistent handles requested but not supported */ {
+ cifs_dbg(VFS,
+ "Persistent handles not supported on share\n");
+ rc = -EOPNOTSUPP;
+ goto out_fail;
+#endif /* CONFIG_CIFS_SMB2 */
+ }
+#ifdef CONFIG_CIFS_SMB2
+ } else if ((tcon->capabilities & SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY)
+ && (ses->server->capabilities & SMB2_GLOBAL_CAP_PERSISTENT_HANDLES)
+ && (volume_info->nopersistent == false)) {
+ cifs_dbg(FYI, "enabling persistent handles\n");
+ tcon->use_persistent = true;
+#endif /* CONFIG_CIFS_SMB2 */
+ } else if (volume_info->resilient) {
+ if (ses->server->vals->protocol_id == 0) {
+ cifs_dbg(VFS,
+ "SMB2.1 or later required for resilient handles\n");
+ rc = -EOPNOTSUPP;
+ goto out_fail;
+ }
+ tcon->use_resilient = true;
+ }
+
/*
* We can have only one retry value for a connection to a share so for
* resources mounted more than once to the same server share the last
@@ -3502,6 +3572,15 @@ try_mount_again:
goto mount_fail_check;
}
+#ifdef CONFIG_CIFS_SMB2
+ if ((volume_info->persistent == true) && ((ses->server->capabilities &
+ SMB2_GLOBAL_CAP_PERSISTENT_HANDLES) == 0)) {
+ cifs_dbg(VFS, "persistent handles not supported by server\n");
+ rc = -EOPNOTSUPP;
+ goto mount_fail_check;
+ }
+#endif /* CONFIG_CIFS_SMB2*/
+
/* search for existing tcon to this server share */
tcon = cifs_get_tcon(ses, volume_info);
if (IS_ERR(tcon)) {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e2a6af1508af..0068e82217c3 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1553,7 +1553,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
out:
if (flock->fl_flags & FL_POSIX && !rc)
- rc = posix_lock_file_wait(file, flock);
+ rc = locks_lock_file_wait(file, flock);
return rc;
}
@@ -3380,6 +3380,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
struct page *page, *tpage;
unsigned int expected_index;
int rc;
+ gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
INIT_LIST_HEAD(tmplist);
@@ -3392,7 +3393,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
*/
__set_page_locked(page);
rc = add_to_page_cache_locked(page, mapping,
- page->index, GFP_KERNEL);
+ page->index, gfp);
/* give up if we can't stick it in the cache */
if (rc) {
@@ -3418,8 +3419,7 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
break;
__set_page_locked(page);
- if (add_to_page_cache_locked(page, mapping, page->index,
- GFP_KERNEL)) {
+ if (add_to_page_cache_locked(page, mapping, page->index, gfp)) {
__clear_page_locked(page);
break;
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index f621b44cb800..6b66dd5d1540 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -2034,7 +2034,6 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
struct tcon_link *tlink = NULL;
struct cifs_tcon *tcon = NULL;
struct TCP_Server_Info *server;
- struct cifs_io_parms io_parms;
/*
* To avoid spurious oplock breaks from server, in the case of
@@ -2056,18 +2055,6 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
rc = -ENOSYS;
cifsFileInfo_put(open_file);
cifs_dbg(FYI, "SetFSize for attrs rc = %d\n", rc);
- if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
- unsigned int bytes_written;
-
- io_parms.netfid = open_file->fid.netfid;
- io_parms.pid = open_file->pid;
- io_parms.tcon = tcon;
- io_parms.offset = 0;
- io_parms.length = attrs->ia_size;
- rc = CIFSSMBWrite(xid, &io_parms, &bytes_written,
- NULL, NULL, 1);
- cifs_dbg(FYI, "Wrt seteof rc %d\n", rc);
- }
} else
rc = -EINVAL;
@@ -2093,28 +2080,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
else
rc = -ENOSYS;
cifs_dbg(FYI, "SetEOF by path (setattrs) rc = %d\n", rc);
- if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) {
- __u16 netfid;
- int oplock = 0;
- rc = SMBLegacyOpen(xid, tcon, full_path, FILE_OPEN,
- GENERIC_WRITE, CREATE_NOT_DIR, &netfid,
- &oplock, NULL, cifs_sb->local_nls,
- cifs_remap(cifs_sb));
- if (rc == 0) {
- unsigned int bytes_written;
-
- io_parms.netfid = netfid;
- io_parms.pid = current->tgid;
- io_parms.tcon = tcon;
- io_parms.offset = 0;
- io_parms.length = attrs->ia_size;
- rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, NULL,
- NULL, 1);
- cifs_dbg(FYI, "wrt seteof rc %d\n", rc);
- CIFSSMBClose(xid, tcon, netfid);
- }
- }
if (tlink)
cifs_put_tlink(tlink);
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index c63f5227b681..35cf990f87d3 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -67,6 +67,12 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
goto out_drop_write;
}
+ if (src_file.file->f_op->unlocked_ioctl != cifs_ioctl) {
+ rc = -EBADF;
+ cifs_dbg(VFS, "src file seems to be from a different filesystem type\n");
+ goto out_fput;
+ }
+
if ((!src_file.file->private_data) || (!dst_file->private_data)) {
rc = -EBADF;
cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n");
@@ -79,9 +85,14 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
src_tcon = tlink_tcon(smb_file_src->tlink);
target_tcon = tlink_tcon(smb_file_target->tlink);
- /* check if source and target are on same tree connection */
- if (src_tcon != target_tcon) {
- cifs_dbg(VFS, "file copy src and target on different volume\n");
+ /* check source and target on same server (or volume if dup_extents) */
+ if (dup_extents && (src_tcon != target_tcon)) {
+ cifs_dbg(VFS, "source and target of copy not on same share\n");
+ goto out_fput;
+ }
+
+ if (!dup_extents && (src_tcon->ses != target_tcon->ses)) {
+ cifs_dbg(VFS, "source and target of copy not on same server\n");
goto out_fput;
}
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index b1eede3678a9..0557c45e9c33 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -84,7 +84,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
dentry = d_hash_and_lookup(parent, name);
- if (unlikely(IS_ERR(dentry)))
+ if (IS_ERR(dentry))
return;
if (dentry) {
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index bce6fdcd5d48..59727e32ed0f 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -988,7 +988,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
goto out;
}
- msg = spnego_key->payload.data;
+ msg = spnego_key->payload.data[0];
/*
* check version field to make sure that cifs.upcall is
* sending us a response in an expected form
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 2ab297dae5a7..f9e766f464be 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -43,6 +43,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
struct smb2_file_all_info *smb2_data = NULL;
__u8 smb2_oplock[17];
struct cifs_fid *fid = oparms->fid;
+ struct network_resiliency_req nr_ioctl_req;
smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb);
if (smb2_path == NULL) {
@@ -67,6 +68,24 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
if (rc)
goto out;
+
+ if (oparms->tcon->use_resilient) {
+ nr_ioctl_req.Timeout = 0; /* use server default (120 seconds) */
+ nr_ioctl_req.Reserved = 0;
+ rc = SMB2_ioctl(xid, oparms->tcon, fid->persistent_fid,
+ fid->volatile_fid, FSCTL_LMR_REQUEST_RESILIENCY, true,
+ (char *)&nr_ioctl_req, sizeof(nr_ioctl_req),
+ NULL, NULL /* no return info */);
+ if (rc == -EOPNOTSUPP) {
+ cifs_dbg(VFS,
+ "resiliency not supported by server, disabling\n");
+ oparms->tcon->use_resilient = false;
+ } else if (rc)
+ cifs_dbg(FYI, "error %d setting resiliency\n", rc);
+
+ rc = 0;
+ }
+
if (buf) {
/* open response does not have IndexNumber field - get it */
rc = SMB2_get_srv_num(xid, oparms->tcon, fid->persistent_fid,
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index df91bcf56d67..53ccdde6ff18 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -50,9 +50,13 @@ change_conf(struct TCP_Server_Info *server)
break;
default:
server->echoes = true;
- server->oplocks = true;
+ if (enable_oplocks) {
+ server->oplocks = true;
+ server->oplock_credits = 1;
+ } else
+ server->oplocks = false;
+
server->echo_credits = 1;
- server->oplock_credits = 1;
}
server->credits -= server->echo_credits + server->oplock_credits;
return 0;
@@ -806,7 +810,6 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon,
cfile->fid.volatile_fid, cfile->pid, &eof, false);
}
-#ifdef CONFIG_CIFS_SMB311
static int
smb2_duplicate_extents(const unsigned int xid,
struct cifsFileInfo *srcfile,
@@ -850,8 +853,6 @@ smb2_duplicate_extents(const unsigned int xid,
duplicate_extents_out:
return rc;
}
-#endif /* CONFIG_CIFS_SMB311 */
-
static int
smb2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
@@ -1699,6 +1700,7 @@ struct smb_version_operations smb30_operations = {
.create_lease_buf = smb3_create_lease_buf,
.parse_lease_buf = smb3_parse_lease_buf,
.clone_range = smb2_clone_range,
+ .duplicate_extents = smb2_duplicate_extents,
.validate_negotiate = smb3_validate_negotiate,
.wp_retry_size = smb2_wp_retry_size,
.dir_needs_close = smb2_dir_needs_close,
@@ -1836,7 +1838,7 @@ struct smb_version_values smb21_values = {
struct smb_version_values smb30_values = {
.version_string = SMB30_VERSION_STRING,
.protocol_id = SMB30_PROT_ID,
- .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU,
+ .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES,
.large_lock_type = 0,
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
@@ -1856,7 +1858,7 @@ struct smb_version_values smb30_values = {
struct smb_version_values smb302_values = {
.version_string = SMB302_VERSION_STRING,
.protocol_id = SMB302_PROT_ID,
- .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU,
+ .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES,
.large_lock_type = 0,
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
@@ -1877,7 +1879,7 @@ struct smb_version_values smb302_values = {
struct smb_version_values smb311_values = {
.version_string = SMB311_VERSION_STRING,
.protocol_id = SMB311_PROT_ID,
- .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU,
+ .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES,
.large_lock_type = 0,
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 070fb2ad85ce..767555518d40 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -46,6 +46,7 @@
#include "smb2status.h"
#include "smb2glob.h"
#include "cifspdu.h"
+#include "cifs_spnego.h"
/*
* The following table defines the expected "StructureSize" of SMB2 requests
@@ -486,19 +487,15 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
cifs_dbg(FYI, "missing security blob on negprot\n");
rc = cifs_enable_signing(server, ses->sign);
-#ifdef CONFIG_SMB2_ASN1 /* BB REMOVEME when updated asn1.c ready */
if (rc)
goto neg_exit;
- if (blob_length)
+ if (blob_length) {
rc = decode_negTokenInit(security_blob, blob_length, server);
- if (rc == 1)
- rc = 0;
- else if (rc == 0) {
- rc = -EIO;
- goto neg_exit;
+ if (rc == 1)
+ rc = 0;
+ else if (rc == 0)
+ rc = -EIO;
}
-#endif
-
neg_exit:
free_rsp_buf(resp_buftype, rsp);
return rc;
@@ -592,7 +589,8 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
__le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
struct TCP_Server_Info *server = ses->server;
u16 blob_length = 0;
- char *security_blob;
+ struct key *spnego_key = NULL;
+ char *security_blob = NULL;
char *ntlmssp_blob = NULL;
bool use_spnego = false; /* else use raw ntlmssp */
@@ -620,7 +618,8 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
ses->ntlmssp->sesskey_per_smbsess = true;
/* FIXME: allow for other auth types besides NTLMSSP (e.g. krb5) */
- ses->sectype = RawNTLMSSP;
+ if (ses->sectype != Kerberos && ses->sectype != RawNTLMSSP)
+ ses->sectype = RawNTLMSSP;
ssetup_ntlmssp_authenticate:
if (phase == NtLmChallenge)
@@ -649,7 +648,48 @@ ssetup_ntlmssp_authenticate:
iov[0].iov_base = (char *)req;
/* 4 for rfc1002 length field and 1 for pad */
iov[0].iov_len = get_rfc1002_length(req) + 4 - 1;
- if (phase == NtLmNegotiate) {
+
+ if (ses->sectype == Kerberos) {
+#ifdef CONFIG_CIFS_UPCALL
+ struct cifs_spnego_msg *msg;
+
+ spnego_key = cifs_get_spnego_key(ses);
+ if (IS_ERR(spnego_key)) {
+ rc = PTR_ERR(spnego_key);
+ spnego_key = NULL;
+ goto ssetup_exit;
+ }
+
+ msg = spnego_key->payload.data[0];
+ /*
+ * check version field to make sure that cifs.upcall is
+ * sending us a response in an expected form
+ */
+ if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) {
+ cifs_dbg(VFS,
+ "bad cifs.upcall version. Expected %d got %d",
+ CIFS_SPNEGO_UPCALL_VERSION, msg->version);
+ rc = -EKEYREJECTED;
+ goto ssetup_exit;
+ }
+ ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len,
+ GFP_KERNEL);
+ if (!ses->auth_key.response) {
+ cifs_dbg(VFS,
+ "Kerberos can't allocate (%u bytes) memory",
+ msg->sesskey_len);
+ rc = -ENOMEM;
+ goto ssetup_exit;
+ }
+ ses->auth_key.len = msg->sesskey_len;
+ blob_length = msg->secblob_len;
+ iov[1].iov_base = msg->data + msg->sesskey_len;
+ iov[1].iov_len = blob_length;
+#else
+ rc = -EOPNOTSUPP;
+ goto ssetup_exit;
+#endif /* CONFIG_CIFS_UPCALL */
+ } else if (phase == NtLmNegotiate) { /* if not krb5 must be ntlmssp */
ntlmssp_blob = kmalloc(sizeof(struct _NEGOTIATE_MESSAGE),
GFP_KERNEL);
if (ntlmssp_blob == NULL) {
@@ -672,6 +712,8 @@ ssetup_ntlmssp_authenticate:
/* with raw NTLMSSP we don't encapsulate in SPNEGO */
security_blob = ntlmssp_blob;
}
+ iov[1].iov_base = security_blob;
+ iov[1].iov_len = blob_length;
} else if (phase == NtLmAuthenticate) {
req->hdr.SessionId = ses->Suid;
ntlmssp_blob = kzalloc(sizeof(struct _NEGOTIATE_MESSAGE) + 500,
@@ -699,6 +741,8 @@ ssetup_ntlmssp_authenticate:
} else {
security_blob = ntlmssp_blob;
}
+ iov[1].iov_base = security_blob;
+ iov[1].iov_len = blob_length;
} else {
cifs_dbg(VFS, "illegal ntlmssp phase\n");
rc = -EIO;
@@ -710,8 +754,6 @@ ssetup_ntlmssp_authenticate:
cpu_to_le16(sizeof(struct smb2_sess_setup_req) -
1 /* pad */ - 4 /* rfc1001 len */);
req->SecurityBufferLength = cpu_to_le16(blob_length);
- iov[1].iov_base = security_blob;
- iov[1].iov_len = blob_length;
inc_rfc1001_len(req, blob_length - 1 /* pad */);
@@ -722,6 +764,7 @@ ssetup_ntlmssp_authenticate:
kfree(security_blob);
rsp = (struct smb2_sess_setup_rsp *)iov[0].iov_base;
+ ses->Suid = rsp->hdr.SessionId;
if (resp_buftype != CIFS_NO_BUFFER &&
rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) {
if (phase != NtLmNegotiate) {
@@ -739,7 +782,6 @@ ssetup_ntlmssp_authenticate:
/* NTLMSSP Negotiate sent now processing challenge (response) */
phase = NtLmChallenge; /* process ntlmssp challenge */
rc = 0; /* MORE_PROCESSING is not an error here but expected */
- ses->Suid = rsp->hdr.SessionId;
rc = decode_ntlmssp_challenge(rsp->Buffer,
le16_to_cpu(rsp->SecurityBufferLength), ses);
}
@@ -796,6 +838,10 @@ keygen_exit:
kfree(ses->auth_key.response);
ses->auth_key.response = NULL;
}
+ if (spnego_key) {
+ key_invalidate(spnego_key);
+ key_put(spnego_key);
+ }
kfree(ses->ntlmssp);
return rc;
@@ -876,6 +922,12 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
if (tcon && tcon->bad_network_name)
return -ENOENT;
+ if ((tcon && tcon->seal) &&
+ ((ses->server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION) == 0)) {
+ cifs_dbg(VFS, "encryption requested but no server support");
+ return -EOPNOTSUPP;
+ }
+
unc_path = kmalloc(MAX_SHARENAME_LENGTH * 2, GFP_KERNEL);
if (unc_path == NULL)
return -ENOMEM;
@@ -955,6 +1007,8 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
((tcon->share_flags & SHI1005_FLAGS_DFS) == 0))
cifs_dbg(VFS, "DFS capability contradicts DFS flag\n");
init_copy_chunk_defaults(tcon);
+ if (tcon->share_flags & SHI1005_FLAGS_ENCRYPT_DATA)
+ cifs_dbg(VFS, "Encrypted shares not supported");
if (tcon->ses->server->ops->validate_negotiate)
rc = tcon->ses->server->ops->validate_negotiate(xid, tcon);
tcon_exit:
@@ -1097,13 +1151,130 @@ add_lease_context(struct TCP_Server_Info *server, struct kvec *iov,
return 0;
}
+static struct create_durable_v2 *
+create_durable_v2_buf(struct cifs_fid *pfid)
+{
+ struct create_durable_v2 *buf;
+
+ buf = kzalloc(sizeof(struct create_durable_v2), GFP_KERNEL);
+ if (!buf)
+ return NULL;
+
+ buf->ccontext.DataOffset = cpu_to_le16(offsetof
+ (struct create_durable_v2, dcontext));
+ buf->ccontext.DataLength = cpu_to_le32(sizeof(struct durable_context_v2));
+ buf->ccontext.NameOffset = cpu_to_le16(offsetof
+ (struct create_durable_v2, Name));
+ buf->ccontext.NameLength = cpu_to_le16(4);
+
+ buf->dcontext.Timeout = 0; /* Should this be configurable by workload */
+ buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT);
+ get_random_bytes(buf->dcontext.CreateGuid, 16);
+ memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16);
+
+ /* SMB2_CREATE_DURABLE_HANDLE_REQUEST is "DH2Q" */
+ buf->Name[0] = 'D';
+ buf->Name[1] = 'H';
+ buf->Name[2] = '2';
+ buf->Name[3] = 'Q';
+ return buf;
+}
+
+static struct create_durable_handle_reconnect_v2 *
+create_reconnect_durable_v2_buf(struct cifs_fid *fid)
+{
+ struct create_durable_handle_reconnect_v2 *buf;
+
+ buf = kzalloc(sizeof(struct create_durable_handle_reconnect_v2),
+ GFP_KERNEL);
+ if (!buf)
+ return NULL;
+
+ buf->ccontext.DataOffset =
+ cpu_to_le16(offsetof(struct create_durable_handle_reconnect_v2,
+ dcontext));
+ buf->ccontext.DataLength =
+ cpu_to_le32(sizeof(struct durable_reconnect_context_v2));
+ buf->ccontext.NameOffset =
+ cpu_to_le16(offsetof(struct create_durable_handle_reconnect_v2,
+ Name));
+ buf->ccontext.NameLength = cpu_to_le16(4);
+
+ buf->dcontext.Fid.PersistentFileId = fid->persistent_fid;
+ buf->dcontext.Fid.VolatileFileId = fid->volatile_fid;
+ buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT);
+ memcpy(buf->dcontext.CreateGuid, fid->create_guid, 16);
+
+ /* SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 is "DH2C" */
+ buf->Name[0] = 'D';
+ buf->Name[1] = 'H';
+ buf->Name[2] = '2';
+ buf->Name[3] = 'C';
+ return buf;
+}
+
+static int
+add_durable_v2_context(struct kvec *iov, unsigned int *num_iovec,
+ struct cifs_open_parms *oparms)
+{
+ struct smb2_create_req *req = iov[0].iov_base;
+ unsigned int num = *num_iovec;
+
+ iov[num].iov_base = create_durable_v2_buf(oparms->fid);
+ if (iov[num].iov_base == NULL)
+ return -ENOMEM;
+ iov[num].iov_len = sizeof(struct create_durable_v2);
+ if (!req->CreateContextsOffset)
+ req->CreateContextsOffset =
+ cpu_to_le32(sizeof(struct smb2_create_req) - 4 +
+ iov[1].iov_len);
+ le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable_v2));
+ inc_rfc1001_len(&req->hdr, sizeof(struct create_durable_v2));
+ *num_iovec = num + 1;
+ return 0;
+}
+
static int
-add_durable_context(struct kvec *iov, unsigned int *num_iovec,
+add_durable_reconnect_v2_context(struct kvec *iov, unsigned int *num_iovec,
struct cifs_open_parms *oparms)
{
struct smb2_create_req *req = iov[0].iov_base;
unsigned int num = *num_iovec;
+ /* indicate that we don't need to relock the file */
+ oparms->reconnect = false;
+
+ iov[num].iov_base = create_reconnect_durable_v2_buf(oparms->fid);
+ if (iov[num].iov_base == NULL)
+ return -ENOMEM;
+ iov[num].iov_len = sizeof(struct create_durable_handle_reconnect_v2);
+ if (!req->CreateContextsOffset)
+ req->CreateContextsOffset =
+ cpu_to_le32(sizeof(struct smb2_create_req) - 4 +
+ iov[1].iov_len);
+ le32_add_cpu(&req->CreateContextsLength,
+ sizeof(struct create_durable_handle_reconnect_v2));
+ inc_rfc1001_len(&req->hdr,
+ sizeof(struct create_durable_handle_reconnect_v2));
+ *num_iovec = num + 1;
+ return 0;
+}
+
+static int
+add_durable_context(struct kvec *iov, unsigned int *num_iovec,
+ struct cifs_open_parms *oparms, bool use_persistent)
+{
+ struct smb2_create_req *req = iov[0].iov_base;
+ unsigned int num = *num_iovec;
+
+ if (use_persistent) {
+ if (oparms->reconnect)
+ return add_durable_reconnect_v2_context(iov, num_iovec,
+ oparms);
+ else
+ return add_durable_v2_context(iov, num_iovec, oparms);
+ }
+
if (oparms->reconnect) {
iov[num].iov_base = create_reconnect_durable_buf(oparms->fid);
/* indicate that we don't need to relock the file */
@@ -1221,7 +1392,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
ccontext->Next =
cpu_to_le32(server->vals->create_lease_size);
}
- rc = add_durable_context(iov, &num_iovecs, oparms);
+
+ rc = add_durable_context(iov, &num_iovecs, oparms,
+ tcon->use_persistent);
if (rc) {
cifs_small_buf_release(req);
kfree(copy_path);
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 451108284a2f..4af52780ec35 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -590,6 +590,44 @@ struct create_durable {
} Data;
} __packed;
+/* See MS-SMB2 2.2.13.2.11 */
+/* Flags */
+#define SMB2_DHANDLE_FLAG_PERSISTENT 0x00000002
+struct durable_context_v2 {
+ __le32 Timeout;
+ __le32 Flags;
+ __u64 Reserved;
+ __u8 CreateGuid[16];
+} __packed;
+
+struct create_durable_v2 {
+ struct create_context ccontext;
+ __u8 Name[8];
+ struct durable_context_v2 dcontext;
+} __packed;
+
+/* See MS-SMB2 2.2.13.2.12 */
+struct durable_reconnect_context_v2 {
+ struct {
+ __u64 PersistentFileId;
+ __u64 VolatileFileId;
+ } Fid;
+ __u8 CreateGuid[16];
+ __le32 Flags; /* see above DHANDLE_FLAG_PERSISTENT */
+} __packed;
+
+/* See MS-SMB2 2.2.14.2.12 */
+struct durable_reconnect_context_v2_rsp {
+ __le32 Timeout;
+ __le32 Flags; /* see above DHANDLE_FLAG_PERSISTENT */
+} __packed;
+
+struct create_durable_handle_reconnect_v2 {
+ struct create_context ccontext;
+ __u8 Name[8];
+ struct durable_reconnect_context_v2 dcontext;
+} __packed;
+
#define COPY_CHUNK_RES_KEY_SIZE 24
struct resume_key_req {
char ResumeKey[COPY_CHUNK_RES_KEY_SIZE];
@@ -643,6 +681,13 @@ struct fsctl_get_integrity_information_rsp {
/* Integrity flags for above */
#define FSCTL_INTEGRITY_FLAG_CHECKSUM_ENFORCEMENT_OFF 0x00000001
+/* See MS-SMB2 2.2.31.3 */
+struct network_resiliency_req {
+ __le32 Timeout;
+ __le32 Reserved;
+} __packed;
+/* There is no buffer for the response ie no struct network_resiliency_rsp */
+
struct validate_negotiate_info_req {
__le32 Capabilities;
diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h
index a639d0dab453..f996daeea271 100644
--- a/fs/cifs/smbfsctl.h
+++ b/fs/cifs/smbfsctl.h
@@ -90,7 +90,7 @@
#define FSCTL_SRV_ENUMERATE_SNAPSHOTS 0x00144064
/* Retrieve an opaque file reference for server-side data movement ie copy */
#define FSCTL_SRV_REQUEST_RESUME_KEY 0x00140078
-#define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4 /* BB add struct */
+#define FSCTL_LMR_REQUEST_RESILIENCY 0x001401D4
#define FSCTL_LMR_GET_LINK_TRACK_INF 0x001400E8 /* BB add struct */
#define FSCTL_LMR_SET_LINK_TRACK_INF 0x001400EC /* BB add struct */
#define FSCTL_VALIDATE_NEGOTIATE_INFO 0x00140204
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 48851f6ea6ec..dcf26537c935 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -686,7 +686,7 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd,
if (get_user(nmsgs, &udata->nmsgs))
return -EFAULT;
- if (nmsgs > I2C_RDRW_IOCTL_MAX_MSGS)
+ if (nmsgs > I2C_RDWR_IOCTL_MAX_MSGS)
return -EINVAL;
if (get_user(datap, &udata->msgs))
diff --git a/fs/coredump.c b/fs/coredump.c
index a8f75640ac86..1777331eee76 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -280,23 +280,24 @@ out:
return ispipe;
}
-static int zap_process(struct task_struct *start, int exit_code)
+static int zap_process(struct task_struct *start, int exit_code, int flags)
{
struct task_struct *t;
int nr = 0;
+ /* ignore all signals except SIGKILL, see prepare_signal() */
+ start->signal->flags = SIGNAL_GROUP_COREDUMP | flags;
start->signal->group_exit_code = exit_code;
start->signal->group_stop_count = 0;
- t = start;
- do {
+ for_each_thread(start, t) {
task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
if (t != current && t->mm) {
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
nr++;
}
- } while_each_thread(start, t);
+ }
return nr;
}
@@ -311,10 +312,8 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
spin_lock_irq(&tsk->sighand->siglock);
if (!signal_group_exit(tsk->signal)) {
mm->core_state = core_state;
- nr = zap_process(tsk, exit_code);
tsk->signal->group_exit_task = tsk;
- /* ignore all signals except SIGKILL, see prepare_signal() */
- tsk->signal->flags = SIGNAL_GROUP_COREDUMP;
+ nr = zap_process(tsk, exit_code, 0);
clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
}
spin_unlock_irq(&tsk->sighand->siglock);
@@ -360,18 +359,18 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
continue;
if (g->flags & PF_KTHREAD)
continue;
- p = g;
- do {
- if (p->mm) {
- if (unlikely(p->mm == mm)) {
- lock_task_sighand(p, &flags);
- nr += zap_process(p, exit_code);
- p->signal->flags = SIGNAL_GROUP_EXIT;
- unlock_task_sighand(p, &flags);
- }
- break;
+
+ for_each_thread(g, p) {
+ if (unlikely(!p->mm))
+ continue;
+ if (unlikely(p->mm == mm)) {
+ lock_task_sighand(p, &flags);
+ nr += zap_process(p, exit_code,
+ SIGNAL_GROUP_EXIT);
+ unlock_task_sighand(p, &flags);
}
- } while_each_thread(g, p);
+ break;
+ }
}
rcu_read_unlock();
done:
diff --git a/fs/dax.c b/fs/dax.c
index 93bf2f990ace..d1e5cb7311a1 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -29,6 +29,11 @@
#include <linux/uio.h>
#include <linux/vmstat.h>
+/*
+ * dax_clear_blocks() is called from within transaction context from XFS,
+ * and hence this means the stack from this point must follow GFP_NOFS
+ * semantics for all operations.
+ */
int dax_clear_blocks(struct inode *inode, sector_t block, long size)
{
struct block_device *bdev = inode->i_sb->s_bdev;
@@ -119,7 +124,8 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
size_t len;
if (pos == max) {
unsigned blkbits = inode->i_blkbits;
- sector_t block = pos >> blkbits;
+ long page = pos >> PAGE_SHIFT;
+ sector_t block = page << (PAGE_SHIFT - blkbits);
unsigned first = pos - (block << blkbits);
long size;
@@ -168,8 +174,10 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
else
len = iov_iter_zero(max - pos, iter);
- if (!len)
+ if (!len) {
+ retval = -EFAULT;
break;
+ }
pos += len;
addr += len;
@@ -284,6 +292,7 @@ static int copy_user_bh(struct page *to, struct buffer_head *bh,
static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
struct vm_area_struct *vma, struct vm_fault *vmf)
{
+ struct address_space *mapping = inode->i_mapping;
sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
unsigned long vaddr = (unsigned long)vmf->virtual_address;
void __pmem *addr;
@@ -291,6 +300,8 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
pgoff_t size;
int error;
+ i_mmap_lock_read(mapping);
+
/*
* Check truncate didn't happen while we were allocating a block.
* If it did, this block may or may not be still allocated to the
@@ -320,6 +331,8 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
error = vm_insert_mixed(vma, vaddr, pfn);
out:
+ i_mmap_unlock_read(mapping);
+
return error;
}
@@ -381,17 +394,15 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
* from a read fault and we've raced with a truncate
*/
error = -EIO;
- goto unlock;
+ goto unlock_page;
}
- } else {
- i_mmap_lock_write(mapping);
}
error = get_block(inode, block, &bh, 0);
if (!error && (bh.b_size < PAGE_SIZE))
error = -EIO; /* fs corruption? */
if (error)
- goto unlock;
+ goto unlock_page;
if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
if (vmf->flags & FAULT_FLAG_WRITE) {
@@ -402,9 +413,8 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
if (!error && (bh.b_size < PAGE_SIZE))
error = -EIO;
if (error)
- goto unlock;
+ goto unlock_page;
} else {
- i_mmap_unlock_write(mapping);
return dax_load_hole(mapping, page, vmf);
}
}
@@ -416,15 +426,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
else
clear_user_highpage(new_page, vaddr);
if (error)
- goto unlock;
+ goto unlock_page;
vmf->page = page;
if (!page) {
+ i_mmap_lock_read(mapping);
/* Check we didn't race with truncate */
size = (i_size_read(inode) + PAGE_SIZE - 1) >>
PAGE_SHIFT;
if (vmf->pgoff >= size) {
+ i_mmap_unlock_read(mapping);
error = -EIO;
- goto unlock;
+ goto out;
}
}
return VM_FAULT_LOCKED;
@@ -460,8 +472,6 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
}
- if (!page)
- i_mmap_unlock_write(mapping);
out:
if (error == -ENOMEM)
return VM_FAULT_OOM | major;
@@ -470,14 +480,11 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
return VM_FAULT_SIGBUS | major;
return VM_FAULT_NOPAGE | major;
- unlock:
+ unlock_page:
if (page) {
unlock_page(page);
page_cache_release(page);
- } else {
- i_mmap_unlock_write(mapping);
}
-
goto out;
}
EXPORT_SYMBOL(__dax_fault);
@@ -555,10 +562,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
bh.b_size = PMD_SIZE;
- i_mmap_lock_write(mapping);
length = get_block(inode, block, &bh, write);
if (length)
return VM_FAULT_SIGBUS;
+ i_mmap_lock_read(mapping);
/*
* If the filesystem isn't willing to tell us the length of a hole,
@@ -568,24 +575,14 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
goto fallback;
- if (buffer_unwritten(&bh) || buffer_new(&bh)) {
- int i;
- for (i = 0; i < PTRS_PER_PMD; i++)
- clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE);
- wmb_pmem();
- count_vm_event(PGMAJFAULT);
- mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
- result |= VM_FAULT_MAJOR;
- }
-
/*
* If we allocated new storage, make sure no process has any
* zero pages covering this hole
*/
if (buffer_new(&bh)) {
- i_mmap_unlock_write(mapping);
+ i_mmap_unlock_read(mapping);
unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
- i_mmap_lock_write(mapping);
+ i_mmap_lock_read(mapping);
}
/*
@@ -632,15 +629,32 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
goto fallback;
+ /*
+ * TODO: teach vmf_insert_pfn_pmd() to support
+ * 'pte_special' for pmds
+ */
+ if (pfn_valid(pfn))
+ goto fallback;
+
+ if (buffer_unwritten(&bh) || buffer_new(&bh)) {
+ int i;
+ for (i = 0; i < PTRS_PER_PMD; i++)
+ clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE);
+ wmb_pmem();
+ count_vm_event(PGMAJFAULT);
+ mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
+ result |= VM_FAULT_MAJOR;
+ }
+
result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write);
}
out:
+ i_mmap_unlock_read(mapping);
+
if (buffer_unwritten(&bh))
complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
- i_mmap_unlock_write(mapping);
-
return result;
fallback:
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 6c55ade071c3..d2ba12e23ed9 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -42,6 +42,22 @@ const struct file_operations debugfs_file_operations = {
.llseek = noop_llseek,
};
+static struct dentry *debugfs_create_mode(const char *name, umode_t mode,
+ struct dentry *parent, void *value,
+ const struct file_operations *fops,
+ const struct file_operations *fops_ro,
+ const struct file_operations *fops_wo)
+{
+ /* if there are no write bits set, make read only */
+ if (!(mode & S_IWUGO))
+ return debugfs_create_file(name, mode, parent, value, fops_ro);
+ /* if there are no read bits set, make write only */
+ if (!(mode & S_IRUGO))
+ return debugfs_create_file(name, mode, parent, value, fops_wo);
+
+ return debugfs_create_file(name, mode, parent, value, fops);
+}
+
static int debugfs_u8_set(void *data, u64 val)
{
*(u8 *)data = val;
@@ -83,14 +99,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n");
struct dentry *debugfs_create_u8(const char *name, umode_t mode,
struct dentry *parent, u8 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u8_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u8_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_u8);
+ return debugfs_create_mode(name, mode, parent, value, &fops_u8,
+ &fops_u8_ro, &fops_u8_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u8);
@@ -135,14 +145,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n");
struct dentry *debugfs_create_u16(const char *name, umode_t mode,
struct dentry *parent, u16 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u16_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u16_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_u16);
+ return debugfs_create_mode(name, mode, parent, value, &fops_u16,
+ &fops_u16_ro, &fops_u16_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u16);
@@ -187,14 +191,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n");
struct dentry *debugfs_create_u32(const char *name, umode_t mode,
struct dentry *parent, u32 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u32_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u32_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_u32);
+ return debugfs_create_mode(name, mode, parent, value, &fops_u32,
+ &fops_u32_ro, &fops_u32_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u32);
@@ -240,17 +238,59 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n");
struct dentry *debugfs_create_u64(const char *name, umode_t mode,
struct dentry *parent, u64 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u64_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_u64_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_u64);
+ return debugfs_create_mode(name, mode, parent, value, &fops_u64,
+ &fops_u64_ro, &fops_u64_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_u64);
+static int debugfs_ulong_set(void *data, u64 val)
+{
+ *(unsigned long *)data = val;
+ return 0;
+}
+
+static int debugfs_ulong_get(void *data, u64 *val)
+{
+ *val = *(unsigned long *)data;
+ return 0;
+}
+DEFINE_SIMPLE_ATTRIBUTE(fops_ulong, debugfs_ulong_get, debugfs_ulong_set, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_ro, debugfs_ulong_get, NULL, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n");
+
+/**
+ * debugfs_create_ulong - create a debugfs file that is used to read and write
+ * an unsigned long value.
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ * directory dentry if set. If this parameter is %NULL, then the
+ * file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read to and write
+ * from.
+ *
+ * This function creates a file in debugfs with the given name that
+ * contains the value of the variable @value. If the @mode variable is so
+ * set, it can be read from, and written to.
+ *
+ * This function will return a pointer to a dentry if it succeeds. This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.) If an error occurs, %NULL will be returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned. It is not wise to check for this value, but rather, check for
+ * %NULL or !%NULL instead as to eliminate the need for #ifdef in the calling
+ * code.
+ */
+struct dentry *debugfs_create_ulong(const char *name, umode_t mode,
+ struct dentry *parent, unsigned long *value)
+{
+ return debugfs_create_mode(name, mode, parent, value, &fops_ulong,
+ &fops_ulong_ro, &fops_ulong_wo);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_ulong);
+
DEFINE_SIMPLE_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n");
DEFINE_SIMPLE_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n");
DEFINE_SIMPLE_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n");
@@ -264,6 +304,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n");
DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n");
DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x64_ro, debugfs_u64_get, NULL, "0x%016llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x64_wo, NULL, debugfs_u64_set, "0x%016llx\n");
/*
* debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value
@@ -286,14 +328,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n
struct dentry *debugfs_create_x8(const char *name, umode_t mode,
struct dentry *parent, u8 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x8_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x8_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_x8);
+ return debugfs_create_mode(name, mode, parent, value, &fops_x8,
+ &fops_x8_ro, &fops_x8_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x8);
@@ -310,14 +346,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_x8);
struct dentry *debugfs_create_x16(const char *name, umode_t mode,
struct dentry *parent, u16 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x16_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x16_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_x16);
+ return debugfs_create_mode(name, mode, parent, value, &fops_x16,
+ &fops_x16_ro, &fops_x16_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x16);
@@ -334,14 +364,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_x16);
struct dentry *debugfs_create_x32(const char *name, umode_t mode,
struct dentry *parent, u32 *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x32_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value, &fops_x32_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_x32);
+ return debugfs_create_mode(name, mode, parent, value, &fops_x32,
+ &fops_x32_ro, &fops_x32_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x32);
@@ -358,7 +382,8 @@ EXPORT_SYMBOL_GPL(debugfs_create_x32);
struct dentry *debugfs_create_x64(const char *name, umode_t mode,
struct dentry *parent, u64 *value)
{
- return debugfs_create_file(name, mode, parent, value, &fops_x64);
+ return debugfs_create_mode(name, mode, parent, value, &fops_x64,
+ &fops_x64_ro, &fops_x64_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_x64);
@@ -375,6 +400,8 @@ static int debugfs_size_t_get(void *data, u64 *val)
}
DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
"%llu\n"); /* %llu and %zu are more or less the same */
+DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_ro, debugfs_size_t_get, NULL, "%llu\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_size_t_wo, NULL, debugfs_size_t_set, "%llu\n");
/**
* debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value
@@ -389,7 +416,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set,
struct dentry *debugfs_create_size_t(const char *name, umode_t mode,
struct dentry *parent, size_t *value)
{
- return debugfs_create_file(name, mode, parent, value, &fops_size_t);
+ return debugfs_create_mode(name, mode, parent, value, &fops_size_t,
+ &fops_size_t_ro, &fops_size_t_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_size_t);
@@ -422,16 +450,8 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set, "%lld\n");
struct dentry *debugfs_create_atomic_t(const char *name, umode_t mode,
struct dentry *parent, atomic_t *value)
{
- /* if there are no write bits set, make read only */
- if (!(mode & S_IWUGO))
- return debugfs_create_file(name, mode, parent, value,
- &fops_atomic_t_ro);
- /* if there are no read bits set, make write only */
- if (!(mode & S_IRUGO))
- return debugfs_create_file(name, mode, parent, value,
- &fops_atomic_t_wo);
-
- return debugfs_create_file(name, mode, parent, value, &fops_atomic_t);
+ return debugfs_create_mode(name, mode, parent, value, &fops_atomic_t,
+ &fops_atomic_t_ro, &fops_atomic_t_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_atomic_t);
@@ -439,7 +459,7 @@ ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf,
size_t count, loff_t *ppos)
{
char buf[3];
- u32 *val = file->private_data;
+ bool *val = file->private_data;
if (*val)
buf[0] = 'Y';
@@ -457,7 +477,7 @@ ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
char buf[32];
size_t buf_size;
bool bv;
- u32 *val = file->private_data;
+ bool *val = file->private_data;
buf_size = min(count, (sizeof(buf)-1));
if (copy_from_user(buf, user_buf, buf_size))
@@ -478,6 +498,18 @@ static const struct file_operations fops_bool = {
.llseek = default_llseek,
};
+static const struct file_operations fops_bool_ro = {
+ .read = debugfs_read_file_bool,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
+static const struct file_operations fops_bool_wo = {
+ .write = debugfs_write_file_bool,
+ .open = simple_open,
+ .llseek = default_llseek,
+};
+
/**
* debugfs_create_bool - create a debugfs file that is used to read and write a boolean value
* @name: a pointer to a string containing the name of the file to create.
@@ -503,9 +535,10 @@ static const struct file_operations fops_bool = {
* code.
*/
struct dentry *debugfs_create_bool(const char *name, umode_t mode,
- struct dentry *parent, u32 *value)
+ struct dentry *parent, bool *value)
{
- return debugfs_create_file(name, mode, parent, value, &fops_bool);
+ return debugfs_create_mode(name, mode, parent, value, &fops_bool,
+ &fops_bool_ro, &fops_bool_wo);
}
EXPORT_SYMBOL_GPL(debugfs_create_bool);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index c711be8d6a3c..b7fcc0de0b2f 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -271,8 +271,12 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
dput(dentry);
dentry = ERR_PTR(-EEXIST);
}
- if (IS_ERR(dentry))
+
+ if (IS_ERR(dentry)) {
mutex_unlock(&d_inode(parent)->i_mutex);
+ simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+ }
+
return dentry;
}
@@ -533,7 +537,8 @@ static int __debugfs_remove(struct dentry *dentry, struct dentry *parent)
/**
* debugfs_remove - removes a file or directory from the debugfs filesystem
* @dentry: a pointer to a the dentry of the file or directory to be
- * removed.
+ * removed. If this parameter is NULL or an error value, nothing
+ * will be done.
*
* This function removes a file or directory in debugfs that was previously
* created with a call to another debugfs function (like
@@ -565,7 +570,8 @@ EXPORT_SYMBOL_GPL(debugfs_remove);
/**
* debugfs_remove_recursive - recursively removes a directory
- * @dentry: a pointer to a the dentry of the directory to be removed.
+ * @dentry: a pointer to a the dentry of the directory to be removed. If this
+ * parameter is NULL or an error value, nothing will be done.
*
* This function recursively removes a directory tree in debugfs that
* was previously created with a call to another debugfs function
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 11256291642e..cb5337d8c273 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -109,6 +109,8 @@ struct dio_submit {
struct dio {
int flags; /* doesn't change */
int rw;
+ blk_qc_t bio_cookie;
+ struct block_device *bio_bdev;
struct inode *inode;
loff_t i_size; /* i_size when submitted */
dio_iodone_t *end_io; /* IO completion function */
@@ -120,6 +122,7 @@ struct dio {
int page_errors; /* errno from get_user_pages() */
int is_async; /* is IO async ? */
bool defer_completion; /* defer AIO completion to workqueue? */
+ bool should_dirty; /* if pages should be dirtied */
int io_error; /* IO error in completion path */
unsigned long refcount; /* direct_io_worker() and bios */
struct bio *bio_list; /* singly linked via bi_private */
@@ -360,7 +363,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
/*
* bio_alloc() is guaranteed to return a bio when called with
- * __GFP_WAIT and we request a valid number of vectors.
+ * __GFP_RECLAIM and we request a valid number of vectors.
*/
bio = bio_alloc(GFP_KERNEL, nr_vecs);
@@ -393,14 +396,17 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio)
dio->refcount++;
spin_unlock_irqrestore(&dio->bio_lock, flags);
- if (dio->is_async && dio->rw == READ)
+ if (dio->is_async && dio->rw == READ && dio->should_dirty)
bio_set_pages_dirty(bio);
- if (sdio->submit_io)
+ dio->bio_bdev = bio->bi_bdev;
+
+ if (sdio->submit_io) {
sdio->submit_io(dio->rw, bio, dio->inode,
sdio->logical_offset_in_bio);
- else
- submit_bio(dio->rw, bio);
+ dio->bio_cookie = BLK_QC_T_NONE;
+ } else
+ dio->bio_cookie = submit_bio(dio->rw, bio);
sdio->bio = NULL;
sdio->boundary = 0;
@@ -439,7 +445,8 @@ static struct bio *dio_await_one(struct dio *dio)
__set_current_state(TASK_UNINTERRUPTIBLE);
dio->waiter = current;
spin_unlock_irqrestore(&dio->bio_lock, flags);
- io_schedule();
+ if (!blk_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie))
+ io_schedule();
/* wake up sets us TASK_RUNNING */
spin_lock_irqsave(&dio->bio_lock, flags);
dio->waiter = NULL;
@@ -464,14 +471,15 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
if (bio->bi_error)
dio->io_error = -EIO;
- if (dio->is_async && dio->rw == READ) {
+ if (dio->is_async && dio->rw == READ && dio->should_dirty) {
bio_check_pages_dirty(bio); /* transfers ownership */
err = bio->bi_error;
} else {
bio_for_each_segment_all(bvec, bio, i) {
struct page *page = bvec->bv_page;
- if (dio->rw == READ && !PageCompound(page))
+ if (dio->rw == READ && !PageCompound(page) &&
+ dio->should_dirty)
set_page_dirty_lock(page);
page_cache_release(page);
}
@@ -1219,6 +1227,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
spin_lock_init(&dio->bio_lock);
dio->refcount = 1;
+ dio->should_dirty = (iter->type == ITER_IOVEC);
sdio.iter = iter;
sdio.final_block_in_request =
(offset + iov_iter_count(iter)) >> blkbits;
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 5532f097f6da..d401425f602a 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -145,7 +145,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
send_op(op);
if (xop->callback == NULL) {
- rv = wait_event_killable(recv_wq, (op->done != 0));
+ rv = wait_event_interruptible(recv_wq, (op->done != 0));
if (rv == -ERESTARTSYS) {
log_debug(ls, "dlm_posix_lock: wait killed %llx",
(unsigned long long)number);
@@ -172,7 +172,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
rv = op->info.rv;
if (!rv) {
- if (posix_lock_file_wait(file, fl) < 0)
+ if (locks_lock_file_wait(file, fl) < 0)
log_error(ls, "dlm_posix_lock: vfs lock error %llx",
(unsigned long long)number);
}
@@ -262,7 +262,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
/* cause the vfs unlock to return ENOENT if lock is not found */
fl->fl_flags |= FL_EXISTS;
- rv = posix_lock_file_wait(file, fl);
+ rv = locks_lock_file_wait(file, fl);
if (rv == -ENOENT) {
rv = 0;
goto out_free;
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index 5ba029e627cc..7b39260c7bba 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -86,7 +86,7 @@ ecryptfs_get_encrypted_key_payload_data(struct key *key)
{
if (key->type == &key_type_encrypted)
return (struct ecryptfs_auth_tok *)
- (&((struct encrypted_key_payload *)key->payload.data)->payload_data);
+ (&((struct encrypted_key_payload *)key->payload.data[0])->payload_data);
else
return NULL;
}
@@ -117,8 +117,7 @@ ecryptfs_get_key_payload_data(struct key *key)
auth_tok = ecryptfs_get_encrypted_key_payload_data(key);
if (!auth_tok)
- return (struct ecryptfs_auth_tok *)
- (((struct user_key_payload *)key->payload.data)->data);
+ return (struct ecryptfs_auth_tok *)user_key_payload(key)->data;
else
return auth_tok;
}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 3c4db1172d22..e2e47ba5d313 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -270,7 +270,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
ecryptfs_inode = ecryptfs_do_create(directory_inode, ecryptfs_dentry,
mode);
- if (unlikely(IS_ERR(ecryptfs_inode))) {
+ if (IS_ERR(ecryptfs_inode)) {
ecryptfs_printk(KERN_WARNING, "Failed to create file in"
"lower filesystem\n");
rc = PTR_ERR(ecryptfs_inode);
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index 09a6bb1ad63c..994e078da4bb 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -80,9 +80,6 @@ static int exofs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
struct inode *inode;
int err;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
inode = exofs_new_inode(dir, mode);
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 8d15febd0aa3..4c69c94cafd8 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -684,6 +684,9 @@ struct ext2_inode_info {
struct rw_semaphore xattr_sem;
#endif
rwlock_t i_meta_lock;
+#ifdef CONFIG_FS_DAX
+ struct rw_semaphore dax_sem;
+#endif
/*
* truncate_mutex is for serialising ext2_truncate() against
@@ -699,6 +702,14 @@ struct ext2_inode_info {
#endif
};
+#ifdef CONFIG_FS_DAX
+#define dax_sem_down_write(ext2_inode) down_write(&(ext2_inode)->dax_sem)
+#define dax_sem_up_write(ext2_inode) up_write(&(ext2_inode)->dax_sem)
+#else
+#define dax_sem_down_write(ext2_inode)
+#define dax_sem_up_write(ext2_inode)
+#endif
+
/*
* Inode dynamic state flags
*/
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 1982c3f11aec..11a42c5a09ae 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -27,27 +27,103 @@
#include "acl.h"
#ifdef CONFIG_FS_DAX
+/*
+ * The lock ordering for ext2 DAX fault paths is:
+ *
+ * mmap_sem (MM)
+ * sb_start_pagefault (vfs, freeze)
+ * ext2_inode_info->dax_sem
+ * address_space->i_mmap_rwsem or page_lock (mutually exclusive in DAX)
+ * ext2_inode_info->truncate_mutex
+ *
+ * The default page_lock and i_size verification done by non-DAX fault paths
+ * is sufficient because ext2 doesn't support hole punching.
+ */
static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_fault(vma, vmf, ext2_get_block, NULL);
+ struct inode *inode = file_inode(vma->vm_file);
+ struct ext2_inode_info *ei = EXT2_I(inode);
+ int ret;
+
+ if (vmf->flags & FAULT_FLAG_WRITE) {
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ }
+ down_read(&ei->dax_sem);
+
+ ret = __dax_fault(vma, vmf, ext2_get_block, NULL);
+
+ up_read(&ei->dax_sem);
+ if (vmf->flags & FAULT_FLAG_WRITE)
+ sb_end_pagefault(inode->i_sb);
+ return ret;
}
static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, unsigned int flags)
{
- return dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL);
+ struct inode *inode = file_inode(vma->vm_file);
+ struct ext2_inode_info *ei = EXT2_I(inode);
+ int ret;
+
+ if (flags & FAULT_FLAG_WRITE) {
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ }
+ down_read(&ei->dax_sem);
+
+ ret = __dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block, NULL);
+
+ up_read(&ei->dax_sem);
+ if (flags & FAULT_FLAG_WRITE)
+ sb_end_pagefault(inode->i_sb);
+ return ret;
}
static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
- return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
+ struct inode *inode = file_inode(vma->vm_file);
+ struct ext2_inode_info *ei = EXT2_I(inode);
+ int ret;
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ down_read(&ei->dax_sem);
+
+ ret = __dax_mkwrite(vma, vmf, ext2_get_block, NULL);
+
+ up_read(&ei->dax_sem);
+ sb_end_pagefault(inode->i_sb);
+ return ret;
+}
+
+static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+ struct ext2_inode_info *ei = EXT2_I(inode);
+ int ret = VM_FAULT_NOPAGE;
+ loff_t size;
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ down_read(&ei->dax_sem);
+
+ /* check that the faulting page hasn't raced with truncate */
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (vmf->pgoff >= size)
+ ret = VM_FAULT_SIGBUS;
+
+ up_read(&ei->dax_sem);
+ sb_end_pagefault(inode->i_sb);
+ return ret;
}
static const struct vm_operations_struct ext2_dax_vm_ops = {
.fault = ext2_dax_fault,
.pmd_fault = ext2_dax_pmd_fault,
.page_mkwrite = ext2_dax_mkwrite,
- .pfn_mkwrite = dax_pfn_mkwrite,
+ .pfn_mkwrite = ext2_dax_pfn_mkwrite,
};
static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index c60a248c640c..0aa9bf6e6e53 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1085,6 +1085,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de
ext2_free_data(inode, p, q);
}
+/* dax_sem must be held when calling this function */
static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
{
__le32 *i_data = EXT2_I(inode)->i_data;
@@ -1100,6 +1101,10 @@ static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
blocksize = inode->i_sb->s_blocksize;
iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
+#ifdef CONFIG_FS_DAX
+ WARN_ON(!rwsem_is_locked(&ei->dax_sem));
+#endif
+
n = ext2_block_to_path(inode, iblock, offsets, NULL);
if (n == 0)
return;
@@ -1185,7 +1190,10 @@ static void ext2_truncate_blocks(struct inode *inode, loff_t offset)
return;
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
return;
+
+ dax_sem_down_write(EXT2_I(inode));
__ext2_truncate_blocks(inode, offset);
+ dax_sem_up_write(EXT2_I(inode));
}
static int ext2_setsize(struct inode *inode, loff_t newsize)
@@ -1213,8 +1221,10 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
if (error)
return error;
+ dax_sem_down_write(EXT2_I(inode));
truncate_setsize(inode, newsize);
__ext2_truncate_blocks(inode, newsize);
+ dax_sem_up_write(EXT2_I(inode));
inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
if (inode_needs_sync(inode)) {
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index b4841e3066a5..3267a80dbbe2 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -143,9 +143,6 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode,
struct inode * inode;
int err;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
err = dquot_initialize(dir);
if (err)
return err;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 900e19cf9ef6..3a71cea68420 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -192,6 +192,9 @@ static void init_once(void *foo)
init_rwsem(&ei->xattr_sem);
#endif
mutex_init(&ei->truncate_mutex);
+#ifdef CONFIG_FS_DAX
+ init_rwsem(&ei->dax_sem);
+#endif
inode_init_once(&ei->vfs_inode);
}
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 0b6bfd3a398b..fa70848afa8f 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -293,10 +293,9 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list",
ext2_xattr_handler(entry->e_name_index);
if (handler) {
- size_t size = handler->list(dentry, buffer, rest,
- entry->e_name,
- entry->e_name_len,
- handler->flags);
+ size_t size = handler->list(handler, dentry, buffer,
+ rest, entry->e_name,
+ entry->e_name_len);
if (buffer) {
if (size > rest) {
error = -ERANGE;
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index 702fc6840246..dfb08750370d 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -8,8 +8,9 @@
#include "xattr.h"
static size_t
-ext2_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
+ext2_xattr_security_list(const struct xattr_handler *handler,
+ struct dentry *dentry, char *list, size_t list_size,
+ const char *name, size_t name_len)
{
const int prefix_len = XATTR_SECURITY_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
@@ -23,8 +24,9 @@ ext2_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
}
static int
-ext2_xattr_security_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+ext2_xattr_security_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
if (strcmp(name, "") == 0)
return -EINVAL;
@@ -33,8 +35,9 @@ ext2_xattr_security_get(struct dentry *dentry, const char *name,
}
static int
-ext2_xattr_security_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+ext2_xattr_security_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
if (strcmp(name, "") == 0)
return -EINVAL;
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 42b6e9874bcc..3150dd3a7859 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -9,8 +9,9 @@
#include "xattr.h"
static size_t
-ext2_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
+ext2_xattr_trusted_list(const struct xattr_handler *handler,
+ struct dentry *dentry, char *list, size_t list_size,
+ const char *name, size_t name_len)
{
const int prefix_len = XATTR_TRUSTED_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
@@ -27,8 +28,9 @@ ext2_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
}
static int
-ext2_xattr_trusted_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+ext2_xattr_trusted_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
if (strcmp(name, "") == 0)
return -EINVAL;
@@ -37,8 +39,9 @@ ext2_xattr_trusted_get(struct dentry *dentry, const char *name,
}
static int
-ext2_xattr_trusted_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+ext2_xattr_trusted_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
if (strcmp(name, "") == 0)
return -EINVAL;
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index ecdc4605192c..339a49bbb8ef 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -11,8 +11,9 @@
#include "xattr.h"
static size_t
-ext2_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
+ext2_xattr_user_list(const struct xattr_handler *handler,
+ struct dentry *dentry, char *list, size_t list_size,
+ const char *name, size_t name_len)
{
const size_t prefix_len = XATTR_USER_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
@@ -29,8 +30,9 @@ ext2_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
}
static int
-ext2_xattr_user_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+ext2_xattr_user_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
if (strcmp(name, "") == 0)
return -EINVAL;
@@ -41,8 +43,9 @@ ext2_xattr_user_get(struct dentry *dentry, const char *name,
}
static int
-ext2_xattr_user_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+ext2_xattr_user_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
if (strcmp(name, "") == 0)
return -EINVAL;
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 47728da7702c..b46e9fc64196 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -63,7 +63,7 @@ config EXT4_FS
If unsure, say N.
config EXT4_USE_FOR_EXT2
- bool "Use ext4 for ext2/ext3 file systems"
+ bool "Use ext4 for ext2 file systems"
depends on EXT4_FS
depends on EXT2_FS=n
default y
diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index 75285ea9aa05..f52cf54f0cbc 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -8,7 +8,7 @@ ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
- xattr_trusted.o inline.o readpage.o
+ xattr_trusted.o inline.o readpage.o sysfs.o
ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o
ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index cd6ea29be645..ec0668a60678 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -191,6 +191,7 @@ static int ext4_init_block_bitmap(struct super_block *sb,
/* If checksum is bad mark all blocks used to prevent allocation
* essentially implementing a per-group read-only flag. */
if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
+ ext4_error(sb, "Checksum bad for group %u", block_group);
grp = ext4_get_group_info(sb, block_group);
if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
percpu_counter_sub(&sbi->s_freeclusters_counter,
@@ -203,7 +204,7 @@ static int ext4_init_block_bitmap(struct super_block *sb,
count);
}
set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
- return -EIO;
+ return -EFSBADCRC;
}
memset(bh->b_data, 0, sb->s_blocksize);
@@ -213,7 +214,7 @@ static int ext4_init_block_bitmap(struct super_block *sb,
start = ext4_group_first_block_no(sb, block_group);
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+ if (ext4_has_feature_flex_bg(sb))
flex_bg = 1;
/* Set bits for block and inode bitmaps, and inode table */
@@ -322,7 +323,7 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
ext4_fsblk_t blk;
ext4_fsblk_t group_first_block;
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+ if (ext4_has_feature_flex_bg(sb)) {
/* with FLEX_BG, the inode/block bitmaps and itable
* blocks may not be in the group at all
* so the bitmap validation will be skipped for those groups
@@ -360,42 +361,45 @@ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb,
return 0;
}
-static void ext4_validate_block_bitmap(struct super_block *sb,
- struct ext4_group_desc *desc,
- ext4_group_t block_group,
- struct buffer_head *bh)
+static int ext4_validate_block_bitmap(struct super_block *sb,
+ struct ext4_group_desc *desc,
+ ext4_group_t block_group,
+ struct buffer_head *bh)
{
ext4_fsblk_t blk;
struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
struct ext4_sb_info *sbi = EXT4_SB(sb);
- if (buffer_verified(bh) || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
- return;
+ if (buffer_verified(bh))
+ return 0;
+ if (EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
+ return -EFSCORRUPTED;
ext4_lock_group(sb, block_group);
- blk = ext4_valid_block_bitmap(sb, desc, block_group, bh);
- if (unlikely(blk != 0)) {
+ if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
+ desc, bh))) {
ext4_unlock_group(sb, block_group);
- ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
- block_group, blk);
+ ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
percpu_counter_sub(&sbi->s_freeclusters_counter,
grp->bb_free);
set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
- return;
+ return -EFSBADCRC;
}
- if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group,
- desc, bh))) {
+ blk = ext4_valid_block_bitmap(sb, desc, block_group, bh);
+ if (unlikely(blk != 0)) {
ext4_unlock_group(sb, block_group);
- ext4_error(sb, "bg %u: bad block bitmap checksum", block_group);
+ ext4_error(sb, "bg %u: block %llu: invalid block bitmap",
+ block_group, blk);
if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
percpu_counter_sub(&sbi->s_freeclusters_counter,
grp->bb_free);
set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
- return;
+ return -EFSCORRUPTED;
}
set_buffer_verified(bh);
ext4_unlock_group(sb, block_group);
+ return 0;
}
/**
@@ -414,17 +418,18 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
struct ext4_group_desc *desc;
struct buffer_head *bh;
ext4_fsblk_t bitmap_blk;
+ int err;
desc = ext4_get_group_desc(sb, block_group, NULL);
if (!desc)
- return NULL;
+ return ERR_PTR(-EFSCORRUPTED);
bitmap_blk = ext4_block_bitmap(sb, desc);
bh = sb_getblk(sb, bitmap_blk);
if (unlikely(!bh)) {
ext4_error(sb, "Cannot get buffer for block bitmap - "
"block_group = %u, block_bitmap = %llu",
block_group, bitmap_blk);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
if (bitmap_uptodate(bh))
@@ -437,7 +442,6 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
}
ext4_lock_group(sb, block_group);
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
- int err;
err = ext4_init_block_bitmap(sb, bh, block_group, desc);
set_bitmap_uptodate(bh);
@@ -445,7 +449,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
ext4_unlock_group(sb, block_group);
unlock_buffer(bh);
if (err)
- ext4_error(sb, "Checksum bad for grp %u", block_group);
+ goto out;
goto verify;
}
ext4_unlock_group(sb, block_group);
@@ -468,11 +472,13 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
submit_bh(READ | REQ_META | REQ_PRIO, bh);
return bh;
verify:
- ext4_validate_block_bitmap(sb, desc, block_group, bh);
- if (buffer_verified(bh))
- return bh;
+ err = ext4_validate_block_bitmap(sb, desc, block_group, bh);
+ if (err)
+ goto out;
+ return bh;
+out:
put_bh(bh);
- return NULL;
+ return ERR_PTR(err);
}
/* Returns 0 on success, 1 on error */
@@ -485,32 +491,32 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
return 0;
desc = ext4_get_group_desc(sb, block_group, NULL);
if (!desc)
- return 1;
+ return -EFSCORRUPTED;
wait_on_buffer(bh);
if (!buffer_uptodate(bh)) {
ext4_error(sb, "Cannot read block bitmap - "
"block_group = %u, block_bitmap = %llu",
block_group, (unsigned long long) bh->b_blocknr);
- return 1;
+ return -EIO;
}
clear_buffer_new(bh);
/* Panic or remount fs read-only if block bitmap is invalid */
- ext4_validate_block_bitmap(sb, desc, block_group, bh);
- /* ...but check for error just in case errors=continue. */
- return !buffer_verified(bh);
+ return ext4_validate_block_bitmap(sb, desc, block_group, bh);
}
struct buffer_head *
ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
{
struct buffer_head *bh;
+ int err;
bh = ext4_read_block_bitmap_nowait(sb, block_group);
- if (!bh)
- return NULL;
- if (ext4_wait_block_bitmap(sb, block_group, bh)) {
+ if (IS_ERR(bh))
+ return bh;
+ err = ext4_wait_block_bitmap(sb, block_group, bh);
+ if (err) {
put_bh(bh);
- return NULL;
+ return ERR_PTR(err);
}
return bh;
}
@@ -681,8 +687,10 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
desc_count += ext4_free_group_clusters(sb, gdp);
brelse(bitmap_bh);
bitmap_bh = ext4_read_block_bitmap(sb, i);
- if (bitmap_bh == NULL)
+ if (IS_ERR(bitmap_bh)) {
+ bitmap_bh = NULL;
continue;
+ }
x = ext4_count_free(bitmap_bh->b_data,
EXT4_CLUSTERS_PER_GROUP(sb) / 8);
@@ -740,14 +748,13 @@ int ext4_bg_has_super(struct super_block *sb, ext4_group_t group)
if (group == 0)
return 1;
- if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_SPARSE_SUPER2)) {
+ if (ext4_has_feature_sparse_super2(sb)) {
if (group == le32_to_cpu(es->s_backup_bgs[0]) ||
group == le32_to_cpu(es->s_backup_bgs[1]))
return 1;
return 0;
}
- if ((group <= 1) || !EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER))
+ if ((group <= 1) || !ext4_has_feature_sparse_super(sb))
return 1;
if (!(group & 1))
return 0;
@@ -776,7 +783,7 @@ static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
if (!ext4_bg_has_super(sb, group))
return 0;
- if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG))
+ if (ext4_has_feature_meta_bg(sb))
return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
else
return EXT4_SB(sb)->s_gdb_count;
@@ -797,8 +804,7 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group)
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb);
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) ||
- metagroup < first_meta_bg)
+ if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg)
return ext4_bg_num_gdb_nometa(sb, group);
return ext4_bg_num_gdb_meta(sb,group);
@@ -818,7 +824,7 @@ static unsigned ext4_num_base_meta_clusters(struct super_block *sb,
/* Check for superblock and gdt backups in this group */
num = ext4_bg_has_super(sb, block_group);
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
+ if (!ext4_has_feature_meta_bg(sb) ||
block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) *
sbi->s_desc_per_block) {
if (num) {
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 3522340c7a99..02ddec6d8a7d 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -234,7 +234,7 @@ int ext4_check_blockref(const char *function, unsigned int line,
es->s_last_error_block = cpu_to_le64(blk);
ext4_error_inode(inode, function, line, blk,
"invalid block");
- return -EIO;
+ return -EFSCORRUPTED;
}
}
return 0;
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 45731558138c..af06830bfc00 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -253,8 +253,7 @@ typedef enum {
EXT4_ENCRYPT,
} ext4_direction_t;
-static int ext4_page_crypto(struct ext4_crypto_ctx *ctx,
- struct inode *inode,
+static int ext4_page_crypto(struct inode *inode,
ext4_direction_t rw,
pgoff_t index,
struct page *src_page,
@@ -296,7 +295,6 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx,
else
res = crypto_ablkcipher_encrypt(req);
if (res == -EINPROGRESS || res == -EBUSY) {
- BUG_ON(req->base.data != &ecr);
wait_for_completion(&ecr.completion);
res = ecr.res;
}
@@ -353,7 +351,7 @@ struct page *ext4_encrypt(struct inode *inode,
if (IS_ERR(ciphertext_page))
goto errout;
ctx->w.control_page = plaintext_page;
- err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, plaintext_page->index,
+ err = ext4_page_crypto(inode, EXT4_ENCRYPT, plaintext_page->index,
plaintext_page, ciphertext_page);
if (err) {
ciphertext_page = ERR_PTR(err);
@@ -378,31 +376,14 @@ struct page *ext4_encrypt(struct inode *inode,
*
* Return: Zero on success, non-zero otherwise.
*/
-int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page)
+int ext4_decrypt(struct page *page)
{
BUG_ON(!PageLocked(page));
- return ext4_page_crypto(ctx, page->mapping->host,
+ return ext4_page_crypto(page->mapping->host,
EXT4_DECRYPT, page->index, page, page);
}
-/*
- * Convenience function which takes care of allocating and
- * deallocating the encryption context
- */
-int ext4_decrypt_one(struct inode *inode, struct page *page)
-{
- int ret;
-
- struct ext4_crypto_ctx *ctx = ext4_get_crypto_ctx(inode);
-
- if (IS_ERR(ctx))
- return PTR_ERR(ctx);
- ret = ext4_decrypt(ctx, page);
- ext4_release_crypto_ctx(ctx);
- return ret;
-}
-
int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
{
struct ext4_crypto_ctx *ctx;
@@ -411,7 +392,13 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
ext4_lblk_t lblk = ex->ee_block;
ext4_fsblk_t pblk = ext4_ext_pblock(ex);
unsigned int len = ext4_ext_get_actual_len(ex);
- int err = 0;
+ int ret, err = 0;
+
+#if 0
+ ext4_msg(inode->i_sb, KERN_CRIT,
+ "ext4_encrypted_zeroout ino %lu lblk %u len %u",
+ (unsigned long) inode->i_ino, lblk, len);
+#endif
BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE);
@@ -426,7 +413,7 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
}
while (len--) {
- err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, lblk,
+ err = ext4_page_crypto(inode, EXT4_ENCRYPT, lblk,
ZERO_PAGE(0), ciphertext_page);
if (err)
goto errout;
@@ -437,17 +424,26 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
goto errout;
}
bio->bi_bdev = inode->i_sb->s_bdev;
- bio->bi_iter.bi_sector = pblk;
- err = bio_add_page(bio, ciphertext_page,
+ bio->bi_iter.bi_sector =
+ pblk << (inode->i_sb->s_blocksize_bits - 9);
+ ret = bio_add_page(bio, ciphertext_page,
inode->i_sb->s_blocksize, 0);
- if (err) {
+ if (ret != inode->i_sb->s_blocksize) {
+ /* should never happen! */
+ ext4_msg(inode->i_sb, KERN_ERR,
+ "bio_add_page failed: %d", ret);
+ WARN_ON(1);
bio_put(bio);
+ err = -EIO;
goto errout;
}
err = submit_bio_wait(WRITE, bio);
+ if ((err == 0) && bio->bi_error)
+ err = -EIO;
bio_put(bio);
if (err)
goto errout;
+ lblk++; pblk++;
}
err = 0;
errout:
diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c
index 847f919c84d9..2fbef8a14760 100644
--- a/fs/ext4/crypto_fname.c
+++ b/fs/ext4/crypto_fname.c
@@ -120,7 +120,6 @@ static int ext4_fname_encrypt(struct inode *inode,
ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv);
res = crypto_ablkcipher_encrypt(req);
if (res == -EINPROGRESS || res == -EBUSY) {
- BUG_ON(req->base.data != &ecr);
wait_for_completion(&ecr.completion);
res = ecr.res;
}
@@ -182,7 +181,6 @@ static int ext4_fname_decrypt(struct inode *inode,
ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
res = crypto_ablkcipher_decrypt(req);
if (res == -EINPROGRESS || res == -EBUSY) {
- BUG_ON(req->base.data != &ecr);
wait_for_completion(&ecr.completion);
res = ecr.res;
}
diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c
index 1d510c11b100..c5882b36e558 100644
--- a/fs/ext4/crypto_key.c
+++ b/fs/ext4/crypto_key.c
@@ -71,7 +71,6 @@ static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE],
EXT4_AES_256_XTS_KEY_SIZE, NULL);
res = crypto_ablkcipher_encrypt(req);
if (res == -EINPROGRESS || res == -EBUSY) {
- BUG_ON(req->base.data != &ecr);
wait_for_completion(&ecr.completion);
res = ecr.res;
}
@@ -121,7 +120,7 @@ int _ext4_get_encryption_info(struct inode *inode)
struct key *keyring_key = NULL;
struct ext4_encryption_key *master_key;
struct ext4_encryption_context ctx;
- struct user_key_payload *ukp;
+ const struct user_key_payload *ukp;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct crypto_ablkcipher *ctfm;
const char *cipher_str;
@@ -208,8 +207,13 @@ retry:
goto out;
}
crypt_info->ci_keyring_key = keyring_key;
- BUG_ON(keyring_key->type != &key_type_logon);
- ukp = ((struct user_key_payload *)keyring_key->payload.data);
+ if (keyring_key->type != &key_type_logon) {
+ printk_once(KERN_WARNING
+ "ext4: key type must be logon\n");
+ res = -ENOKEY;
+ goto out;
+ }
+ ukp = user_key_payload(keyring_key);
if (ukp->datalen != sizeof(struct ext4_encryption_key)) {
res = -EINVAL;
goto out;
@@ -217,7 +221,13 @@ retry:
master_key = (struct ext4_encryption_key *)ukp->data;
BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE !=
EXT4_KEY_DERIVATION_NONCE_SIZE);
- BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE);
+ if (master_key->size != EXT4_AES_256_XTS_KEY_SIZE) {
+ printk_once(KERN_WARNING
+ "ext4: key size incorrect: %d\n",
+ master_key->size);
+ res = -ENOKEY;
+ goto out;
+ }
res = ext4_derive_key_aes(ctx.nonce, master_key->raw,
raw_key);
if (res)
diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c
index a640ec2c4b13..ad050698143f 100644
--- a/fs/ext4/crypto_policy.c
+++ b/fs/ext4/crypto_policy.c
@@ -150,7 +150,8 @@ int ext4_is_child_context_consistent_with_parent(struct inode *parent,
if ((parent == NULL) || (child == NULL)) {
pr_err("parent %p child %p\n", parent, child);
- BUG_ON(1);
+ WARN_ON(1); /* Should never happen */
+ return 0;
}
/* no restrictions if the parent directory is not encrypted */
if (!ext4_encrypted_inode(parent))
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index f9e14911918c..1d1bca74f844 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -40,8 +40,7 @@ static int is_dx_dir(struct inode *inode)
{
struct super_block *sb = inode->i_sb;
- if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_COMPAT_DIR_INDEX) &&
+ if (ext4_has_feature_dir_index(inode->i_sb) &&
((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
((inode->i_size >> sb->s_blocksize_bits) == 1) ||
ext4_has_inline_data(inode)))
@@ -621,14 +620,14 @@ int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf,
while ((char *) de < top) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
buf, buf_size, offset))
- return -EIO;
+ return -EFSCORRUPTED;
nlen = EXT4_DIR_REC_LEN(de->name_len);
rlen = ext4_rec_len_from_disk(de->rec_len, buf_size);
de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
offset += rlen;
}
if ((char *) de > top)
- return -EIO;
+ return -EFSCORRUPTED;
return 0;
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index fd1f28be5296..750063f7a50c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -374,6 +374,7 @@ struct flex_groups {
#define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */
#define EXT4_EOFBLOCKS_FL 0x00400000 /* Blocks allocated beyond EOF */
#define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */
+#define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */
#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */
#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */
@@ -431,6 +432,7 @@ enum {
EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */
+ EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */
EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */
};
@@ -475,6 +477,7 @@ static inline void ext4_check_flag_values(void)
CHECK_FLAG_VALUE(EA_INODE);
CHECK_FLAG_VALUE(EOFBLOCKS);
CHECK_FLAG_VALUE(INLINE_DATA);
+ CHECK_FLAG_VALUE(PROJINHERIT);
CHECK_FLAG_VALUE(RESERVED);
}
@@ -692,6 +695,7 @@ struct ext4_inode {
__le32 i_crtime; /* File Creation time */
__le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */
__le32 i_version_hi; /* high 32 bits for 64-bit version */
+ __le32 i_projid; /* Project ID */
};
struct move_extent {
@@ -1019,6 +1023,9 @@ struct ext4_inode_info {
#define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated
file systems */
+#define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly
+ specified journal checksum */
+
#define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \
~EXT4_MOUNT_##opt
#define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \
@@ -1179,7 +1186,9 @@ struct ext4_super_block {
__u8 s_encrypt_algos[4]; /* Encryption algorithms in use */
__u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */
__le32 s_lpf_ino; /* Location of the lost+found inode */
- __le32 s_reserved[100]; /* Padding to the end of the block */
+ __le32 s_prj_quota_inum; /* inode for tracking project quota */
+ __le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */
+ __le32 s_reserved[98]; /* Padding to the end of the block */
__le32 s_checksum; /* crc32c(superblock) */
};
@@ -1522,6 +1531,7 @@ static inline int ext4_encrypted_inode(struct inode *inode)
* Feature set definitions
*/
+/* Use the ext4_{has,set,clear}_feature_* helpers; these will be removed */
#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \
((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0)
#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \
@@ -1566,6 +1576,7 @@ static inline int ext4_encrypted_inode(struct inode *inode)
*/
#define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400
#define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000
+#define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000
#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001
#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002
@@ -1578,11 +1589,99 @@ static inline int ext4_encrypted_inode(struct inode *inode)
#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200
#define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */
#define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */
-#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */
+#define EXT4_FEATURE_INCOMPAT_CSUM_SEED 0x2000
#define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */
#define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */
#define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000
+#define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \
+static inline bool ext4_has_feature_##name(struct super_block *sb) \
+{ \
+ return ((EXT4_SB(sb)->s_es->s_feature_compat & \
+ cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname)) != 0); \
+} \
+static inline void ext4_set_feature_##name(struct super_block *sb) \
+{ \
+ EXT4_SB(sb)->s_es->s_feature_compat |= \
+ cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \
+} \
+static inline void ext4_clear_feature_##name(struct super_block *sb) \
+{ \
+ EXT4_SB(sb)->s_es->s_feature_compat &= \
+ ~cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \
+}
+
+#define EXT4_FEATURE_RO_COMPAT_FUNCS(name, flagname) \
+static inline bool ext4_has_feature_##name(struct super_block *sb) \
+{ \
+ return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \
+ cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname)) != 0); \
+} \
+static inline void ext4_set_feature_##name(struct super_block *sb) \
+{ \
+ EXT4_SB(sb)->s_es->s_feature_ro_compat |= \
+ cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \
+} \
+static inline void ext4_clear_feature_##name(struct super_block *sb) \
+{ \
+ EXT4_SB(sb)->s_es->s_feature_ro_compat &= \
+ ~cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \
+}
+
+#define EXT4_FEATURE_INCOMPAT_FUNCS(name, flagname) \
+static inline bool ext4_has_feature_##name(struct super_block *sb) \
+{ \
+ return ((EXT4_SB(sb)->s_es->s_feature_incompat & \
+ cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname)) != 0); \
+} \
+static inline void ext4_set_feature_##name(struct super_block *sb) \
+{ \
+ EXT4_SB(sb)->s_es->s_feature_incompat |= \
+ cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \
+} \
+static inline void ext4_clear_feature_##name(struct super_block *sb) \
+{ \
+ EXT4_SB(sb)->s_es->s_feature_incompat &= \
+ ~cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \
+}
+
+EXT4_FEATURE_COMPAT_FUNCS(dir_prealloc, DIR_PREALLOC)
+EXT4_FEATURE_COMPAT_FUNCS(imagic_inodes, IMAGIC_INODES)
+EXT4_FEATURE_COMPAT_FUNCS(journal, HAS_JOURNAL)
+EXT4_FEATURE_COMPAT_FUNCS(xattr, EXT_ATTR)
+EXT4_FEATURE_COMPAT_FUNCS(resize_inode, RESIZE_INODE)
+EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX)
+EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2)
+
+EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER)
+EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE)
+EXT4_FEATURE_RO_COMPAT_FUNCS(btree_dir, BTREE_DIR)
+EXT4_FEATURE_RO_COMPAT_FUNCS(huge_file, HUGE_FILE)
+EXT4_FEATURE_RO_COMPAT_FUNCS(gdt_csum, GDT_CSUM)
+EXT4_FEATURE_RO_COMPAT_FUNCS(dir_nlink, DIR_NLINK)
+EXT4_FEATURE_RO_COMPAT_FUNCS(extra_isize, EXTRA_ISIZE)
+EXT4_FEATURE_RO_COMPAT_FUNCS(quota, QUOTA)
+EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc, BIGALLOC)
+EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM)
+EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY)
+EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT)
+
+EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION)
+EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE)
+EXT4_FEATURE_INCOMPAT_FUNCS(journal_needs_recovery, RECOVER)
+EXT4_FEATURE_INCOMPAT_FUNCS(journal_dev, JOURNAL_DEV)
+EXT4_FEATURE_INCOMPAT_FUNCS(meta_bg, META_BG)
+EXT4_FEATURE_INCOMPAT_FUNCS(extents, EXTENTS)
+EXT4_FEATURE_INCOMPAT_FUNCS(64bit, 64BIT)
+EXT4_FEATURE_INCOMPAT_FUNCS(mmp, MMP)
+EXT4_FEATURE_INCOMPAT_FUNCS(flex_bg, FLEX_BG)
+EXT4_FEATURE_INCOMPAT_FUNCS(ea_inode, EA_INODE)
+EXT4_FEATURE_INCOMPAT_FUNCS(dirdata, DIRDATA)
+EXT4_FEATURE_INCOMPAT_FUNCS(csum_seed, CSUM_SEED)
+EXT4_FEATURE_INCOMPAT_FUNCS(largedir, LARGEDIR)
+EXT4_FEATURE_INCOMPAT_FUNCS(inline_data, INLINE_DATA)
+EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT)
+
#define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
EXT4_FEATURE_INCOMPAT_META_BG)
@@ -1598,7 +1697,7 @@ static inline int ext4_encrypted_inode(struct inode *inode)
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
-#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR
+#define EXT4_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR
#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \
EXT4_FEATURE_INCOMPAT_RECOVER| \
EXT4_FEATURE_INCOMPAT_META_BG| \
@@ -1607,7 +1706,8 @@ static inline int ext4_encrypted_inode(struct inode *inode)
EXT4_FEATURE_INCOMPAT_FLEX_BG| \
EXT4_FEATURE_INCOMPAT_MMP | \
EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
- EXT4_FEATURE_INCOMPAT_ENCRYPT)
+ EXT4_FEATURE_INCOMPAT_ENCRYPT | \
+ EXT4_FEATURE_INCOMPAT_CSUM_SEED)
#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@@ -1619,6 +1719,40 @@ static inline int ext4_encrypted_inode(struct inode *inode)
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
EXT4_FEATURE_RO_COMPAT_QUOTA)
+#define EXTN_FEATURE_FUNCS(ver) \
+static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \
+{ \
+ return ((EXT4_SB(sb)->s_es->s_feature_compat & \
+ cpu_to_le32(~EXT##ver##_FEATURE_COMPAT_SUPP)) != 0); \
+} \
+static inline bool ext4_has_unknown_ext##ver##_ro_compat_features(struct super_block *sb) \
+{ \
+ return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \
+ cpu_to_le32(~EXT##ver##_FEATURE_RO_COMPAT_SUPP)) != 0); \
+} \
+static inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_block *sb) \
+{ \
+ return ((EXT4_SB(sb)->s_es->s_feature_incompat & \
+ cpu_to_le32(~EXT##ver##_FEATURE_INCOMPAT_SUPP)) != 0); \
+}
+
+EXTN_FEATURE_FUNCS(2)
+EXTN_FEATURE_FUNCS(3)
+EXTN_FEATURE_FUNCS(4)
+
+static inline bool ext4_has_compat_features(struct super_block *sb)
+{
+ return (EXT4_SB(sb)->s_es->s_feature_compat != 0);
+}
+static inline bool ext4_has_ro_compat_features(struct super_block *sb)
+{
+ return (EXT4_SB(sb)->s_es->s_feature_ro_compat != 0);
+}
+static inline bool ext4_has_incompat_features(struct super_block *sb)
+{
+ return (EXT4_SB(sb)->s_es->s_feature_incompat != 0);
+}
+
/*
* Default values for user and/or group using reserved blocks
*/
@@ -1769,8 +1903,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
* (c) Daniel Phillips, 2001
*/
-#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \
- EXT4_FEATURE_COMPAT_DIR_INDEX) && \
+#define is_dx(dir) (ext4_has_feature_dir_index((dir)->i_sb) && \
ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
@@ -2063,8 +2196,7 @@ void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx);
void ext4_restore_control_page(struct page *data_page);
struct page *ext4_encrypt(struct inode *inode,
struct page *plaintext_page);
-int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page);
-int ext4_decrypt_one(struct inode *inode, struct page *page);
+int ext4_decrypt(struct page *page);
int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex);
#ifdef CONFIG_EXT4_FS_ENCRYPTION
@@ -2072,7 +2204,7 @@ int ext4_init_crypto(void);
void ext4_exit_crypto(void);
static inline int ext4_sb_has_crypto(struct super_block *sb)
{
- return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT);
+ return ext4_has_feature_encrypt(sb);
}
#else
static inline int ext4_init_crypto(void) { return 0; }
@@ -2193,8 +2325,7 @@ int ext4_insert_dentry(struct inode *dir,
struct ext4_filename *fname);
static inline void ext4_update_dx_flag(struct inode *inode)
{
- if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_COMPAT_DIR_INDEX))
+ if (!ext4_has_feature_dir_index(inode->i_sb))
ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
}
static unsigned char ext4_filetype_table[] = {
@@ -2203,8 +2334,7 @@ static unsigned char ext4_filetype_table[] = {
static inline unsigned char get_dtype(struct super_block *sb, int filetype)
{
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE) ||
- (filetype >= EXT4_FT_MAX))
+ if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX)
return DT_UNKNOWN;
return ext4_filetype_table[filetype];
@@ -2245,6 +2375,7 @@ extern int ext4_init_inode_table(struct super_block *sb,
extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
/* mballoc.c */
+extern const struct file_operations ext4_seq_mb_groups_fops;
extern long ext4_mb_stats;
extern long ext4_mb_max_to_scan;
extern int ext4_mb_init(struct super_block *);
@@ -2372,6 +2503,7 @@ extern int ext4_group_extend(struct super_block *sb,
extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
/* super.c */
+extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
extern int ext4_calculate_overhead(struct super_block *sb);
extern void ext4_superblock_csum_set(struct super_block *sb);
extern void *ext4_kvmalloc(size_t size, gfp_t flags);
@@ -2534,15 +2666,13 @@ extern int ext4_register_li_request(struct super_block *sb,
static inline int ext4_has_group_desc_csum(struct super_block *sb)
{
- return EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_GDT_CSUM) ||
- (EXT4_SB(sb)->s_chksum_driver != NULL);
+ return ext4_has_feature_gdt_csum(sb) ||
+ EXT4_SB(sb)->s_chksum_driver != NULL;
}
static inline int ext4_has_metadata_csum(struct super_block *sb)
{
- WARN_ON_ONCE(EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
+ WARN_ON_ONCE(ext4_has_feature_metadata_csum(sb) &&
!EXT4_SB(sb)->s_chksum_driver);
return (EXT4_SB(sb)->s_chksum_driver != NULL);
@@ -2889,7 +3019,7 @@ static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = {
static inline void ext4_set_de_type(struct super_block *sb,
struct ext4_dir_entry_2 *de,
umode_t mode) {
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FILETYPE))
+ if (ext4_has_feature_filetype(sb))
de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
}
@@ -2903,6 +3033,12 @@ extern const struct inode_operations ext4_encrypted_symlink_inode_operations;
extern const struct inode_operations ext4_symlink_inode_operations;
extern const struct inode_operations ext4_fast_symlink_inode_operations;
+/* sysfs.c */
+extern int ext4_register_sysfs(struct super_block *sb);
+extern void ext4_unregister_sysfs(struct super_block *sb);
+extern int __init ext4_init_sysfs(void);
+extern void ext4_exit_sysfs(void);
+
/* block_validity */
extern void ext4_release_system_zone(struct super_block *sb);
extern int ext4_setup_system_zone(struct super_block *sb);
@@ -3049,4 +3185,7 @@ extern void ext4_resize_end(struct super_block *sb);
#endif /* __KERNEL__ */
+#define EFSBADCRC EBADMSG /* Bad CRC detected */
+#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
+
#endif /* _EXT4_H */
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index d41843181818..e770c1ee4613 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -88,13 +88,13 @@ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
return 0;
}
+ err = handle->h_err;
if (!handle->h_transaction) {
- err = jbd2_journal_stop(handle);
- return handle->h_err ? handle->h_err : err;
+ rc = jbd2_journal_stop(handle);
+ return err ? err : rc;
}
sb = handle->h_transaction->t_journal->j_private;
- err = handle->h_err;
rc = jbd2_journal_stop(handle);
if (!err)
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 9c5b49fb281e..5f5846211095 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -34,8 +34,7 @@
*/
#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \
- (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \
- ? 20U : 8U)
+ (ext4_has_feature_extents(sb) ? 20U : 8U)
/* Extended attribute operations touch at most two data buffers,
* two bitmap buffers, and two group summaries, in addition to the inode
@@ -84,17 +83,16 @@
/* Amount of blocks needed for quota update - we know that the structure was
* allocated so we need to update only data block */
#define EXT4_QUOTA_TRANS_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
- EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\
- 1 : 0)
+ ext4_has_feature_quota(sb)) ? 1 : 0)
/* Amount of blocks needed for quota insert/delete - we do some block writes
* but inode, sb and group updates are done only once */
#define EXT4_QUOTA_INIT_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
- EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\
+ ext4_has_feature_quota(sb)) ?\
(DQUOT_INIT_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
+3+DQUOT_INIT_REWRITE) : 0)
#define EXT4_QUOTA_DEL_BLOCKS(sb) ((test_opt(sb, QUOTA) ||\
- EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) ?\
+ ext4_has_feature_quota(sb)) ?\
(DQUOT_DEL_ALLOC*(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)\
+3+DQUOT_DEL_REWRITE) : 0)
#else
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 2553aa8b608d..551353b1b17a 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -442,7 +442,7 @@ static int __ext4_ext_check(const char *function, unsigned int line,
int depth, ext4_fsblk_t pblk)
{
const char *error_msg;
- int max = 0;
+ int max = 0, err = -EFSCORRUPTED;
if (unlikely(eh->eh_magic != EXT4_EXT_MAGIC)) {
error_msg = "invalid magic";
@@ -473,6 +473,7 @@ static int __ext4_ext_check(const char *function, unsigned int line,
if (ext_depth(inode) != depth &&
!ext4_extent_block_csum_verify(inode, eh)) {
error_msg = "extent tree corrupted";
+ err = -EFSBADCRC;
goto corrupted;
}
return 0;
@@ -485,7 +486,7 @@ corrupted:
le16_to_cpu(eh->eh_magic),
le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
max, le16_to_cpu(eh->eh_depth), depth);
- return -EIO;
+ return err;
}
#define ext4_ext_check(inode, eh, depth, pblk) \
@@ -899,7 +900,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
bh = read_extent_tree_block(inode, path[ppos].p_block, --i,
flags);
- if (unlikely(IS_ERR(bh))) {
+ if (IS_ERR(bh)) {
ret = PTR_ERR(bh);
goto err;
}
@@ -910,7 +911,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
put_bh(bh);
EXT4_ERROR_INODE(inode,
"ppos %d > depth %d", ppos, depth);
- ret = -EIO;
+ ret = -EFSCORRUPTED;
goto err;
}
path[ppos].p_bh = bh;
@@ -959,7 +960,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
EXT4_ERROR_INODE(inode,
"logical %d == ei_block %d!",
logical, le32_to_cpu(curp->p_idx->ei_block));
- return -EIO;
+ return -EFSCORRUPTED;
}
if (unlikely(le16_to_cpu(curp->p_hdr->eh_entries)
@@ -968,7 +969,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
"eh_entries %d >= eh_max %d!",
le16_to_cpu(curp->p_hdr->eh_entries),
le16_to_cpu(curp->p_hdr->eh_max));
- return -EIO;
+ return -EFSCORRUPTED;
}
if (logical > le32_to_cpu(curp->p_idx->ei_block)) {
@@ -992,7 +993,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
- return -EIO;
+ return -EFSCORRUPTED;
}
ix->ei_block = cpu_to_le32(logical);
@@ -1001,7 +1002,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
if (unlikely(ix > EXT_LAST_INDEX(curp->p_hdr))) {
EXT4_ERROR_INODE(inode, "ix > EXT_LAST_INDEX!");
- return -EIO;
+ return -EFSCORRUPTED;
}
err = ext4_ext_dirty(handle, inode, curp);
@@ -1042,7 +1043,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
* border from split point */
if (unlikely(path[depth].p_ext > EXT_MAX_EXTENT(path[depth].p_hdr))) {
EXT4_ERROR_INODE(inode, "p_ext > EXT_MAX_EXTENT!");
- return -EIO;
+ return -EFSCORRUPTED;
}
if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) {
border = path[depth].p_ext[1].ee_block;
@@ -1086,7 +1087,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
newblock = ablocks[--a];
if (unlikely(newblock == 0)) {
EXT4_ERROR_INODE(inode, "newblock == 0!");
- err = -EIO;
+ err = -EFSCORRUPTED;
goto cleanup;
}
bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS);
@@ -1112,7 +1113,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
EXT4_ERROR_INODE(inode, "eh_entries %d != eh_max %d!",
path[depth].p_hdr->eh_entries,
path[depth].p_hdr->eh_max);
- err = -EIO;
+ err = -EFSCORRUPTED;
goto cleanup;
}
/* start copy from next extent */
@@ -1151,7 +1152,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
k = depth - at - 1;
if (unlikely(k < 0)) {
EXT4_ERROR_INODE(inode, "k %d < 0!", k);
- err = -EIO;
+ err = -EFSCORRUPTED;
goto cleanup;
}
if (k)
@@ -1191,7 +1192,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
EXT4_ERROR_INODE(inode,
"EXT_MAX_INDEX != EXT_LAST_INDEX ee_block %d!",
le32_to_cpu(path[i].p_ext->ee_block));
- err = -EIO;
+ err = -EFSCORRUPTED;
goto cleanup;
}
/* start copy indexes */
@@ -1425,7 +1426,7 @@ static int ext4_ext_search_left(struct inode *inode,
if (unlikely(path == NULL)) {
EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
- return -EIO;
+ return -EFSCORRUPTED;
}
depth = path->p_depth;
*phys = 0;
@@ -1444,7 +1445,7 @@ static int ext4_ext_search_left(struct inode *inode,
EXT4_ERROR_INODE(inode,
"EXT_FIRST_EXTENT != ex *logical %d ee_block %d!",
*logical, le32_to_cpu(ex->ee_block));
- return -EIO;
+ return -EFSCORRUPTED;
}
while (--depth >= 0) {
ix = path[depth].p_idx;
@@ -1455,7 +1456,7 @@ static int ext4_ext_search_left(struct inode *inode,
EXT_FIRST_INDEX(path[depth].p_hdr) != NULL ?
le32_to_cpu(EXT_FIRST_INDEX(path[depth].p_hdr)->ei_block) : 0,
depth);
- return -EIO;
+ return -EFSCORRUPTED;
}
}
return 0;
@@ -1465,7 +1466,7 @@ static int ext4_ext_search_left(struct inode *inode,
EXT4_ERROR_INODE(inode,
"logical %d < ee_block %d + ee_len %d!",
*logical, le32_to_cpu(ex->ee_block), ee_len);
- return -EIO;
+ return -EFSCORRUPTED;
}
*logical = le32_to_cpu(ex->ee_block) + ee_len - 1;
@@ -1495,7 +1496,7 @@ static int ext4_ext_search_right(struct inode *inode,
if (unlikely(path == NULL)) {
EXT4_ERROR_INODE(inode, "path == NULL *logical %d!", *logical);
- return -EIO;
+ return -EFSCORRUPTED;
}
depth = path->p_depth;
*phys = 0;
@@ -1514,7 +1515,7 @@ static int ext4_ext_search_right(struct inode *inode,
EXT4_ERROR_INODE(inode,
"first_extent(path[%d].p_hdr) != ex",
depth);
- return -EIO;
+ return -EFSCORRUPTED;
}
while (--depth >= 0) {
ix = path[depth].p_idx;
@@ -1522,7 +1523,7 @@ static int ext4_ext_search_right(struct inode *inode,
EXT4_ERROR_INODE(inode,
"ix != EXT_FIRST_INDEX *logical %d!",
*logical);
- return -EIO;
+ return -EFSCORRUPTED;
}
}
goto found_extent;
@@ -1532,7 +1533,7 @@ static int ext4_ext_search_right(struct inode *inode,
EXT4_ERROR_INODE(inode,
"logical %d < ee_block %d + ee_len %d!",
*logical, le32_to_cpu(ex->ee_block), ee_len);
- return -EIO;
+ return -EFSCORRUPTED;
}
if (ex != EXT_LAST_EXTENT(path[depth].p_hdr)) {
@@ -1670,7 +1671,7 @@ static int ext4_ext_correct_indexes(handle_t *handle, struct inode *inode,
if (unlikely(ex == NULL || eh == NULL)) {
EXT4_ERROR_INODE(inode,
"ex %p == NULL or eh %p == NULL", ex, eh);
- return -EIO;
+ return -EFSCORRUPTED;
}
if (depth == 0) {
@@ -1938,14 +1939,14 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode,
mb_flags |= EXT4_MB_DELALLOC_RESERVED;
if (unlikely(ext4_ext_get_actual_len(newext) == 0)) {
EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0");
- return -EIO;
+ return -EFSCORRUPTED;
}
depth = ext_depth(inode);
ex = path[depth].p_ext;
eh = path[depth].p_hdr;
if (unlikely(path[depth].p_hdr == NULL)) {
EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
- return -EIO;
+ return -EFSCORRUPTED;
}
/* try to insert block into found extent and return */
@@ -2172,7 +2173,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
if (unlikely(path[depth].p_hdr == NULL)) {
up_read(&EXT4_I(inode)->i_data_sem);
EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
- err = -EIO;
+ err = -EFSCORRUPTED;
break;
}
ex = path[depth].p_ext;
@@ -2241,7 +2242,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
if (unlikely(es.es_len == 0)) {
EXT4_ERROR_INODE(inode, "es.es_len == 0");
- err = -EIO;
+ err = -EFSCORRUPTED;
break;
}
@@ -2264,7 +2265,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
"next extent == %u, next "
"delalloc extent = %u",
next, next_del);
- err = -EIO;
+ err = -EFSCORRUPTED;
break;
}
}
@@ -2363,7 +2364,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
leaf = ext4_idx_pblock(path->p_idx);
if (unlikely(path->p_hdr->eh_entries == 0)) {
EXT4_ERROR_INODE(inode, "path->p_hdr->eh_entries == 0");
- return -EIO;
+ return -EFSCORRUPTED;
}
err = ext4_ext_get_access(handle, inode, path);
if (err)
@@ -2612,7 +2613,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
eh = path[depth].p_hdr;
if (unlikely(path[depth].p_hdr == NULL)) {
EXT4_ERROR_INODE(inode, "path[%d].p_hdr == NULL", depth);
- return -EIO;
+ return -EFSCORRUPTED;
}
/* find where to start removing */
ex = path[depth].p_ext;
@@ -2666,7 +2667,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
"on extent %u:%u",
start, end, ex_ee_block,
ex_ee_block + ex_ee_len - 1);
- err = -EIO;
+ err = -EFSCORRUPTED;
goto out;
} else if (a != ex_ee_block) {
/* remove tail of the extent */
@@ -2841,7 +2842,7 @@ again:
EXT4_ERROR_INODE(inode,
"path[%d].p_hdr == NULL",
depth);
- err = -EIO;
+ err = -EFSCORRUPTED;
}
goto out;
}
@@ -2920,7 +2921,7 @@ again:
i = 0;
if (ext4_ext_check(inode, path[0].p_hdr, depth, 0)) {
- err = -EIO;
+ err = -EFSCORRUPTED;
goto out;
}
}
@@ -2978,7 +2979,7 @@ again:
* Should be a no-op if we did IO above. */
cond_resched();
if (WARN_ON(i + 1 > depth)) {
- err = -EIO;
+ err = -EFSCORRUPTED;
break;
}
path[i + 1].p_bh = bh;
@@ -3054,7 +3055,7 @@ void ext4_ext_init(struct super_block *sb)
* possible initialization would be here
*/
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ if (ext4_has_feature_extents(sb)) {
#if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
printk(KERN_INFO "EXT4-fs: file extents enabled"
#ifdef AGGRESSIVE_TEST
@@ -3081,7 +3082,7 @@ void ext4_ext_init(struct super_block *sb)
*/
void ext4_ext_release(struct super_block *sb)
{
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
+ if (!ext4_has_feature_extents(sb))
return;
#ifdef EXTENTS_STATS
@@ -3345,7 +3346,7 @@ static int ext4_split_extent(handle_t *handle,
if (!ex) {
EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
(unsigned long) map->m_lblk);
- return -EIO;
+ return -EFSCORRUPTED;
}
unwritten = ext4_ext_is_unwritten(ex);
split_flag1 = 0;
@@ -3558,6 +3559,9 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
max_zeroout = sbi->s_extent_max_zeroout_kb >>
(inode->i_sb->s_blocksize_bits - 10);
+ if (ext4_encrypted_inode(inode))
+ max_zeroout = 0;
+
/* If extent is less than s_max_zeroout_kb, zeroout directly */
if (max_zeroout && (ee_len <= max_zeroout)) {
err = ext4_ext_zeroout(inode, ex);
@@ -3970,7 +3974,7 @@ convert_initialized_extent(handle_t *handle, struct inode *inode,
if (!ex) {
EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
(unsigned long) map->m_lblk);
- return -EIO;
+ return -EFSCORRUPTED;
}
}
@@ -4308,7 +4312,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
"lblock: %lu, depth: %d pblock %lld",
(unsigned long) map->m_lblk, depth,
path[depth].p_block);
- err = -EIO;
+ err = -EFSCORRUPTED;
goto out2;
}
@@ -5271,7 +5275,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
if (depth == path->p_depth) {
ex_start = path[depth].p_ext;
if (!ex_start)
- return -EIO;
+ return -EFSCORRUPTED;
ex_last = EXT_LAST_EXTENT(path[depth].p_hdr);
@@ -5411,7 +5415,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
if (!extent) {
EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
(unsigned long) *iterator);
- return -EIO;
+ return -EFSCORRUPTED;
}
if (SHIFT == SHIFT_LEFT && *iterator >
le32_to_cpu(extent->ee_block)) {
@@ -5792,7 +5796,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
int split = 0;
path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
- if (unlikely(IS_ERR(path1))) {
+ if (IS_ERR(path1)) {
*erp = PTR_ERR(path1);
path1 = NULL;
finish:
@@ -5800,7 +5804,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
goto repeat;
}
path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
- if (unlikely(IS_ERR(path2))) {
+ if (IS_ERR(path2)) {
*erp = PTR_ERR(path2);
path2 = NULL;
goto finish;
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 26724aeece73..ac748b3af1c1 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -1089,20 +1089,9 @@ static unsigned long ext4_es_scan(struct shrinker *shrink,
return nr_shrunk;
}
-static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos)
+int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v)
{
- return *pos ? NULL : SEQ_START_TOKEN;
-}
-
-static void *
-ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos)
-{
- return NULL;
-}
-
-static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
-{
- struct ext4_sb_info *sbi = seq->private;
+ struct ext4_sb_info *sbi = EXT4_SB((struct super_block *) seq->private);
struct ext4_es_stats *es_stats = &sbi->s_es_stats;
struct ext4_inode_info *ei, *max = NULL;
unsigned int inode_cnt = 0;
@@ -1143,45 +1132,6 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v)
return 0;
}
-static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v)
-{
-}
-
-static const struct seq_operations ext4_es_seq_shrinker_info_ops = {
- .start = ext4_es_seq_shrinker_info_start,
- .next = ext4_es_seq_shrinker_info_next,
- .stop = ext4_es_seq_shrinker_info_stop,
- .show = ext4_es_seq_shrinker_info_show,
-};
-
-static int
-ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file)
-{
- int ret;
-
- ret = seq_open(file, &ext4_es_seq_shrinker_info_ops);
- if (!ret) {
- struct seq_file *m = file->private_data;
- m->private = PDE_DATA(inode);
- }
-
- return ret;
-}
-
-static int
-ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file)
-{
- return seq_release(inode, file);
-}
-
-static const struct file_operations ext4_es_seq_shrinker_info_fops = {
- .owner = THIS_MODULE,
- .open = ext4_es_seq_shrinker_info_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = ext4_es_seq_shrinker_info_release,
-};
-
int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
{
int err;
@@ -1210,10 +1160,6 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi)
if (err)
goto err2;
- if (sbi->s_proc)
- proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc,
- &ext4_es_seq_shrinker_info_fops, sbi);
-
return 0;
err2:
@@ -1225,8 +1171,6 @@ err1:
void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
{
- if (sbi->s_proc)
- remove_proc_entry("es_shrinker_info", sbi->s_proc);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt);
percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt);
unregister_shrinker(&sbi->s_es_shrinker);
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 691b52613ce4..f7aa24f4642d 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -172,4 +172,6 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es,
extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi);
extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
+extern int ext4_seq_es_shrinker_info_show(struct seq_file *seq, void *v);
+
#endif /* _EXT4_EXTENTS_STATUS_H */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 619bfc1fda8c..1b8024d26f65 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -64,7 +64,7 @@ void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap)
}
/* Initializes an uninitialized inode bitmap */
-static unsigned ext4_init_inode_bitmap(struct super_block *sb,
+static int ext4_init_inode_bitmap(struct super_block *sb,
struct buffer_head *bh,
ext4_group_t block_group,
struct ext4_group_desc *gdp)
@@ -89,7 +89,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
count);
}
set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
- return 0;
+ return -EFSBADCRC;
}
memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
@@ -99,7 +99,7 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
EXT4_INODES_PER_GROUP(sb) / 8);
ext4_group_desc_csum_set(sb, block_group, gdp);
- return EXT4_INODES_PER_GROUP(sb);
+ return 0;
}
void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
@@ -112,6 +112,42 @@ void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
put_bh(bh);
}
+static int ext4_validate_inode_bitmap(struct super_block *sb,
+ struct ext4_group_desc *desc,
+ ext4_group_t block_group,
+ struct buffer_head *bh)
+{
+ ext4_fsblk_t blk;
+ struct ext4_group_info *grp = ext4_get_group_info(sb, block_group);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+ if (buffer_verified(bh))
+ return 0;
+ if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
+ return -EFSCORRUPTED;
+
+ ext4_lock_group(sb, block_group);
+ blk = ext4_inode_bitmap(sb, desc);
+ if (!ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
+ EXT4_INODES_PER_GROUP(sb) / 8)) {
+ ext4_unlock_group(sb, block_group);
+ ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
+ "inode_bitmap = %llu", block_group, blk);
+ grp = ext4_get_group_info(sb, block_group);
+ if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
+ int count;
+ count = ext4_free_inodes_count(sb, desc);
+ percpu_counter_sub(&sbi->s_freeinodes_counter,
+ count);
+ }
+ set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
+ return -EFSBADCRC;
+ }
+ set_buffer_verified(bh);
+ ext4_unlock_group(sb, block_group);
+ return 0;
+}
+
/*
* Read the inode allocation bitmap for a given block_group, reading
* into the specified slot in the superblock's bitmap cache.
@@ -124,12 +160,11 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
struct ext4_group_desc *desc;
struct buffer_head *bh = NULL;
ext4_fsblk_t bitmap_blk;
- struct ext4_group_info *grp;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int err;
desc = ext4_get_group_desc(sb, block_group, NULL);
if (!desc)
- return NULL;
+ return ERR_PTR(-EFSCORRUPTED);
bitmap_blk = ext4_inode_bitmap(sb, desc);
bh = sb_getblk(sb, bitmap_blk);
@@ -137,7 +172,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
ext4_error(sb, "Cannot read inode bitmap - "
"block_group = %u, inode_bitmap = %llu",
block_group, bitmap_blk);
- return NULL;
+ return ERR_PTR(-EIO);
}
if (bitmap_uptodate(bh))
goto verify;
@@ -150,12 +185,14 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
ext4_lock_group(sb, block_group);
if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
- ext4_init_inode_bitmap(sb, bh, block_group, desc);
+ err = ext4_init_inode_bitmap(sb, bh, block_group, desc);
set_bitmap_uptodate(bh);
set_buffer_uptodate(bh);
set_buffer_verified(bh);
ext4_unlock_group(sb, block_group);
unlock_buffer(bh);
+ if (err)
+ goto out;
return bh;
}
ext4_unlock_group(sb, block_group);
@@ -182,31 +219,17 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
ext4_error(sb, "Cannot read inode bitmap - "
"block_group = %u, inode_bitmap = %llu",
block_group, bitmap_blk);
- return NULL;
+ return ERR_PTR(-EIO);
}
verify:
- ext4_lock_group(sb, block_group);
- if (!buffer_verified(bh) &&
- !ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
- EXT4_INODES_PER_GROUP(sb) / 8)) {
- ext4_unlock_group(sb, block_group);
- put_bh(bh);
- ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
- "inode_bitmap = %llu", block_group, bitmap_blk);
- grp = ext4_get_group_info(sb, block_group);
- if (!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
- int count;
- count = ext4_free_inodes_count(sb, desc);
- percpu_counter_sub(&sbi->s_freeinodes_counter,
- count);
- }
- set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state);
- return NULL;
- }
- ext4_unlock_group(sb, block_group);
- set_buffer_verified(bh);
+ err = ext4_validate_inode_bitmap(sb, desc, block_group, bh);
+ if (err)
+ goto out;
return bh;
+out:
+ put_bh(bh);
+ return ERR_PTR(err);
}
/*
@@ -286,8 +309,15 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
/* Don't bother if the inode bitmap is corrupt. */
grp = ext4_get_group_info(sb, block_group);
- if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) || !bitmap_bh)
+ if (IS_ERR(bitmap_bh)) {
+ fatal = PTR_ERR(bitmap_bh);
+ bitmap_bh = NULL;
+ goto error_return;
+ }
+ if (unlikely(EXT4_MB_GRP_IBITMAP_CORRUPT(grp))) {
+ fatal = -EFSCORRUPTED;
goto error_return;
+ }
BUFFER_TRACE(bitmap_bh, "get_write_access");
fatal = ext4_journal_get_write_access(handle, bitmap_bh);
@@ -826,7 +856,9 @@ got_group:
brelse(inode_bitmap_bh);
inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
/* Skip groups with suspicious inode tables */
- if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || !inode_bitmap_bh) {
+ if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) ||
+ IS_ERR(inode_bitmap_bh)) {
+ inode_bitmap_bh = NULL;
if (++group == ngroups)
group = 0;
continue;
@@ -902,8 +934,8 @@ got:
struct buffer_head *block_bitmap_bh;
block_bitmap_bh = ext4_read_block_bitmap(sb, group);
- if (!block_bitmap_bh) {
- err = -EIO;
+ if (IS_ERR(block_bitmap_bh)) {
+ err = PTR_ERR(block_bitmap_bh);
goto out;
}
BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
@@ -1045,7 +1077,7 @@ got:
ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize;
ei->i_inline_off = 0;
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA))
+ if (ext4_has_feature_inline_data(sb))
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
ret = inode;
err = dquot_alloc_inode(inode);
@@ -1060,7 +1092,7 @@ got:
if (err)
goto fail_free_drop;
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ if (ext4_has_feature_extents(sb)) {
/* set extent flag only for directory, file and normal symlink*/
if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
@@ -1116,14 +1148,17 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino)
/* Error cases - e2fsck has already cleaned up for us */
if (ino > max_ino) {
ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino);
+ err = -EFSCORRUPTED;
goto error;
}
block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb);
bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb);
bitmap_bh = ext4_read_inode_bitmap(sb, block_group);
- if (!bitmap_bh) {
- ext4_warning(sb, "inode bitmap error for orphan %lu", ino);
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ ext4_warning(sb, "inode bitmap error %ld for orphan %lu",
+ ino, err);
goto error;
}
@@ -1198,8 +1233,10 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
desc_count += ext4_free_inodes_count(sb, gdp);
brelse(bitmap_bh);
bitmap_bh = ext4_read_inode_bitmap(sb, i);
- if (!bitmap_bh)
+ if (IS_ERR(bitmap_bh)) {
+ bitmap_bh = NULL;
continue;
+ }
x = ext4_count_free(bitmap_bh->b_data,
EXT4_INODES_PER_GROUP(sb) / 8);
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 2468261748b2..355ef9c36c87 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -562,11 +562,10 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
/*
* Okay, we need to do block allocation.
*/
- if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ if (ext4_has_feature_bigalloc(inode->i_sb)) {
EXT4_ERROR_INODE(inode, "Can't allocate blocks for "
"non-extent mapped inodes with bigalloc");
- return -EUCLEAN;
+ return -EFSCORRUPTED;
}
/* Set up for the direct block allocation */
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index cd944a7a99cd..d884989cc83d 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -434,8 +434,7 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle,
memset((void *)ext4_raw_inode(&is.iloc)->i_block,
0, EXT4_MIN_INLINE_DATA_SIZE);
- if (EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ if (ext4_has_feature_extents(inode->i_sb)) {
if (S_ISDIR(inode->i_mode) ||
S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) {
ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 612fbcf76b5c..ea433a7f4bca 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -378,7 +378,7 @@ static int __check_block_validity(struct inode *inode, const char *func,
"lblock %lu mapped to illegal pblock "
"(length %d)", (unsigned long) map->m_lblk,
map->m_len);
- return -EIO;
+ return -EFSCORRUPTED;
}
return 0;
}
@@ -480,7 +480,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
/* We can handle the block number less than EXT_MAX_BLOCKS */
if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
- return -EIO;
+ return -EFSCORRUPTED;
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
@@ -965,7 +965,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
if (unlikely(err))
page_zero_new_buffers(page, from, to);
else if (decrypt)
- err = ext4_decrypt_one(inode, page);
+ err = ext4_decrypt(page);
return err;
}
#endif
@@ -1181,6 +1181,38 @@ errout:
return ret ? ret : copied;
}
+/*
+ * This is a private version of page_zero_new_buffers() which doesn't
+ * set the buffer to be dirty, since in data=journalled mode we need
+ * to call ext4_handle_dirty_metadata() instead.
+ */
+static void zero_new_buffers(struct page *page, unsigned from, unsigned to)
+{
+ unsigned int block_start = 0, block_end;
+ struct buffer_head *head, *bh;
+
+ bh = head = page_buffers(page);
+ do {
+ block_end = block_start + bh->b_size;
+ if (buffer_new(bh)) {
+ if (block_end > from && block_start < to) {
+ if (!PageUptodate(page)) {
+ unsigned start, size;
+
+ start = max(from, block_start);
+ size = min(to, block_end) - start;
+
+ zero_user(page, start, size);
+ set_buffer_uptodate(bh);
+ }
+ clear_buffer_new(bh);
+ }
+ }
+ block_start = block_end;
+ bh = bh->b_this_page;
+ } while (bh != head);
+}
+
static int ext4_journalled_write_end(struct file *file,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
@@ -1207,7 +1239,7 @@ static int ext4_journalled_write_end(struct file *file,
if (copied < len) {
if (!PageUptodate(page))
copied = 0;
- page_zero_new_buffers(page, from+copied, to);
+ zero_new_buffers(page, from+copied, to);
}
ret = ext4_walk_page_buffers(handle, page_buffers(page), from,
@@ -1815,11 +1847,22 @@ static int ext4_writepage(struct page *page,
* the page. But we may reach here when we do a journal commit via
* journal_submit_inode_data_buffers() and in that case we must write
* allocated buffers to achieve data=ordered mode guarantees.
+ *
+ * Also, if there is only one buffer per page (the fs block
+ * size == the page size), if one buffer needs block
+ * allocation or needs to modify the extent tree to clear the
+ * unwritten flag, we know that the page can't be written at
+ * all, so we might as well refuse the write immediately.
+ * Unfortunately if the block size != page size, we can't as
+ * easily detect this case using ext4_walk_page_buffers(), but
+ * for the extremely common case, this is an optimization that
+ * skips a useless round trip through ext4_bio_write_page().
*/
if (ext4_walk_page_buffers(NULL, page_bufs, 0, len, NULL,
ext4_bh_delay_or_unwritten)) {
redirty_page_for_writepage(wbc, page);
- if (current->flags & PF_MEMALLOC) {
+ if ((current->flags & PF_MEMALLOC) ||
+ (inode->i_sb->s_blocksize == PAGE_CACHE_SIZE)) {
/*
* For memory cleaning there's no point in writing only
* some buffers. So just bail out. Warn if we came here
@@ -2599,8 +2642,7 @@ static int ext4_nonda_switch(struct super_block *sb)
/* We always reserve for an inode update; the superblock could be there too */
static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len)
{
- if (likely(EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_LARGE_FILE)))
+ if (likely(ext4_has_feature_large_file(inode->i_sb)))
return 1;
if (pos + len <= 0x7fffffffULL)
@@ -3344,7 +3386,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
int err = 0;
page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
- mapping_gfp_mask(mapping) & ~__GFP_FS);
+ mapping_gfp_constraint(mapping, ~__GFP_FS));
if (!page)
return -ENOMEM;
@@ -3393,7 +3435,7 @@ static int __ext4_block_zero_page_range(handle_t *handle,
/* We expect the key to be set. */
BUG_ON(!ext4_has_encryption_key(inode));
BUG_ON(blocksize != PAGE_CACHE_SIZE);
- WARN_ON_ONCE(ext4_decrypt_one(inode, page));
+ WARN_ON_ONCE(ext4_decrypt(page));
}
}
if (ext4_should_journal_data(inode)) {
@@ -3820,7 +3862,7 @@ static int __ext4_get_inode_loc(struct inode *inode,
iloc->bh = NULL;
if (!ext4_valid_inum(sb, inode->i_ino))
- return -EIO;
+ return -EFSCORRUPTED;
iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb);
gdp = ext4_get_group_desc(sb, iloc->block_group, NULL);
@@ -4006,8 +4048,7 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
struct inode *inode = &(ei->vfs_inode);
struct super_block *sb = inode->i_sb;
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+ if (ext4_has_feature_huge_file(sb)) {
/* we are using combined 48 bit field */
i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 |
le32_to_cpu(raw_inode->i_blocks_lo);
@@ -4068,7 +4109,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)",
EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize,
EXT4_INODE_SIZE(inode->i_sb));
- ret = -EIO;
+ ret = -EFSCORRUPTED;
goto bad_inode;
}
} else
@@ -4088,7 +4129,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
EXT4_ERROR_INODE(inode, "checksum invalid");
- ret = -EIO;
+ ret = -EFSBADCRC;
goto bad_inode;
}
@@ -4130,7 +4171,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_flags = le32_to_cpu(raw_inode->i_flags);
inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT))
+ if (ext4_has_feature_64bit(sb))
ei->i_file_acl |=
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
inode->i_size = ext4_isize(raw_inode);
@@ -4203,7 +4244,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
!ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
ei->i_file_acl);
- ret = -EIO;
+ ret = -EFSCORRUPTED;
goto bad_inode;
} else if (!ext4_has_inline_data(inode)) {
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
@@ -4254,7 +4295,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
} else if (ino == EXT4_BOOT_LOADER_INO) {
make_bad_inode(inode);
} else {
- ret = -EIO;
+ ret = -EFSCORRUPTED;
EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
goto bad_inode;
}
@@ -4272,7 +4313,7 @@ bad_inode:
struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino)
{
if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO)
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EFSCORRUPTED);
return ext4_iget(sb, ino);
}
@@ -4294,7 +4335,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
return 0;
}
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
+ if (!ext4_has_feature_huge_file(sb))
return -EFBIG;
if (i_blocks <= 0xffffffffffffULL) {
@@ -4455,8 +4496,7 @@ static int ext4_do_update_inode(handle_t *handle,
need_datasync = 1;
}
if (ei->i_disksize > 0x7fffffffULL) {
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_LARGE_FILE) ||
+ if (!ext4_has_feature_large_file(sb) ||
EXT4_SB(sb)->s_es->s_rev_level ==
cpu_to_le32(EXT4_GOOD_OLD_REV))
set_large_file = 1;
@@ -4505,8 +4545,7 @@ static int ext4_do_update_inode(handle_t *handle,
if (err)
goto out_brelse;
ext4_update_dynamic_rev(sb);
- EXT4_SET_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
+ ext4_set_feature_large_file(sb);
ext4_handle_sync(handle);
err = ext4_handle_dirty_super(handle, sb);
}
@@ -5244,7 +5283,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
!ext4_should_journal_data(inode) &&
!ext4_nonda_switch(inode->i_sb)) {
do {
- ret = __block_page_mkwrite(vma, vmf,
+ ret = block_page_mkwrite(vma, vmf,
ext4_da_get_block_prep);
} while (ret == -ENOSPC &&
ext4_should_retry_alloc(inode->i_sb, &retries));
@@ -5291,7 +5330,7 @@ retry_alloc:
ret = VM_FAULT_SIGBUS;
goto out;
}
- ret = __block_page_mkwrite(vma, vmf, get_block);
+ ret = block_page_mkwrite(vma, vmf, get_block);
if (!ret && ext4_should_journal_data(inode)) {
if (ext4_walk_page_buffers(handle, page_buffers(page), 0,
PAGE_CACHE_SIZE, NULL, do_journal_get_write_access)) {
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 1346cfa355d0..5e872fd40e5e 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -145,8 +145,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
inode_bl->i_version = 1;
i_size_write(inode_bl, 0);
inode_bl->i_mode = S_IFREG;
- if (EXT4_HAS_INCOMPAT_FEATURE(sb,
- EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ if (ext4_has_feature_extents(sb)) {
ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS);
ext4_ext_tree_init(handle, inode_bl);
} else
@@ -383,8 +382,7 @@ setversion_out:
goto group_extend_out;
}
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ if (ext4_has_feature_bigalloc(sb)) {
ext4_msg(sb, KERN_ERR,
"Online resizing not supported with bigalloc");
err = -EOPNOTSUPP;
@@ -432,8 +430,7 @@ group_extend_out:
goto mext_out;
}
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ if (ext4_has_feature_bigalloc(sb)) {
ext4_msg(sb, KERN_ERR,
"Online defrag not supported with bigalloc");
err = -EOPNOTSUPP;
@@ -470,8 +467,7 @@ mext_out:
goto group_add_out;
}
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ if (ext4_has_feature_bigalloc(sb)) {
ext4_msg(sb, KERN_ERR,
"Online resizing not supported with bigalloc");
err = -EOPNOTSUPP;
@@ -553,8 +549,7 @@ group_add_out:
int err = 0, err2 = 0;
ext4_group_t o_group = EXT4_SB(sb)->s_groups_count;
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_BIGALLOC)) {
+ if (ext4_has_feature_bigalloc(sb)) {
ext4_msg(sb, KERN_ERR,
"Online resizing not (yet) supported with bigalloc");
return -EOPNOTSUPP;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 34b610ea5030..61eaf74dca37 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -874,8 +874,10 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
bh[i] = NULL;
continue;
}
- if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) {
- err = -ENOMEM;
+ bh[i] = ext4_read_block_bitmap_nowait(sb, group);
+ if (IS_ERR(bh[i])) {
+ err = PTR_ERR(bh[i]);
+ bh[i] = NULL;
goto out;
}
mb_debug(1, "read bitmap for group %u\n", group);
@@ -883,8 +885,13 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
/* wait for I/O completion */
for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
- if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i]))
- err = -EIO;
+ int err2;
+
+ if (!bh[i])
+ continue;
+ err2 = ext4_wait_block_bitmap(sb, group, bh[i]);
+ if (!err)
+ err = err2;
}
first_block = page->index * blocks_per_page;
@@ -2333,7 +2340,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
}
-static const struct file_operations ext4_mb_seq_groups_fops = {
+const struct file_operations ext4_seq_mb_groups_fops = {
.owner = THIS_MODULE,
.open = ext4_mb_seq_groups_open,
.read = seq_read,
@@ -2447,7 +2454,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
kmalloc(sb->s_blocksize, GFP_NOFS);
BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
bh = ext4_read_block_bitmap(sb, group);
- BUG_ON(bh == NULL);
+ BUG_ON(IS_ERR_OR_NULL(bh));
memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
sb->s_blocksize);
put_bh(bh);
@@ -2661,10 +2668,6 @@ int ext4_mb_init(struct super_block *sb)
if (ret != 0)
goto out_free_locality_groups;
- if (sbi->s_proc)
- proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
- &ext4_mb_seq_groups_fops, sb);
-
return 0;
out_free_locality_groups:
@@ -2705,9 +2708,6 @@ int ext4_mb_release(struct super_block *sb)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
- if (sbi->s_proc)
- remove_proc_entry("mb_groups", sbi->s_proc);
-
if (sbi->s_group_info) {
for (i = 0; i < ngroups; i++) {
grinfo = ext4_get_group_info(sb, i);
@@ -2896,10 +2896,12 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
sb = ac->ac_sb;
sbi = EXT4_SB(sb);
- err = -EIO;
bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
- if (!bitmap_bh)
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ bitmap_bh = NULL;
goto out_err;
+ }
BUFFER_TRACE(bitmap_bh, "getting write access");
err = ext4_journal_get_write_access(handle, bitmap_bh);
@@ -3843,8 +3845,10 @@ ext4_mb_discard_group_preallocations(struct super_block *sb,
return 0;
bitmap_bh = ext4_read_block_bitmap(sb, group);
- if (bitmap_bh == NULL) {
- ext4_error(sb, "Error reading block bitmap for %u", group);
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ ext4_error(sb, "Error %d reading block bitmap for %u",
+ err, group);
return 0;
}
@@ -4015,9 +4019,10 @@ repeat:
}
bitmap_bh = ext4_read_block_bitmap(sb, group);
- if (bitmap_bh == NULL) {
- ext4_error(sb, "Error reading block bitmap for %u",
- group);
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ ext4_error(sb, "Error %d reading block bitmap for %u",
+ err, group);
ext4_mb_unload_buddy(&e4b);
continue;
}
@@ -4682,22 +4687,11 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
ext4_debug("freeing block %llu\n", block);
trace_ext4_free_blocks(inode, block, count, flags);
- if (flags & EXT4_FREE_BLOCKS_FORGET) {
- struct buffer_head *tbh = bh;
- int i;
-
- BUG_ON(bh && (count > 1));
+ if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
+ BUG_ON(count > 1);
- for (i = 0; i < count; i++) {
- cond_resched();
- if (!bh)
- tbh = sb_find_get_block(inode->i_sb,
- block + i);
- if (!tbh)
- continue;
- ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
- inode, tbh, block + i);
- }
+ ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
+ inode, bh, block);
}
/*
@@ -4742,6 +4736,19 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
count += sbi->s_cluster_ratio - overflow;
}
+ if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
+ int i;
+
+ for (i = 0; i < count; i++) {
+ cond_resched();
+ bh = sb_find_get_block(inode->i_sb, block + i);
+ if (!bh)
+ continue;
+ ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
+ inode, bh, block + i);
+ }
+ }
+
do_more:
overflow = 0;
ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
@@ -4761,8 +4768,9 @@ do_more:
}
count_clusters = EXT4_NUM_B2C(sbi, count);
bitmap_bh = ext4_read_block_bitmap(sb, block_group);
- if (!bitmap_bh) {
- err = -EIO;
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ bitmap_bh = NULL;
goto error_return;
}
gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
@@ -4931,8 +4939,9 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
}
bitmap_bh = ext4_read_block_bitmap(sb, block_group);
- if (!bitmap_bh) {
- err = -EIO;
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ bitmap_bh = NULL;
goto error_return;
}
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 6163ad21cb0e..a4651894cc33 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -448,8 +448,7 @@ int ext4_ext_migrate(struct inode *inode)
* If the filesystem does not support extents, or the inode
* already is extent-based, error out.
*/
- if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_INCOMPAT_EXTENTS) ||
+ if (!ext4_has_feature_extents(inode->i_sb) ||
(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EINVAL;
@@ -625,13 +624,11 @@ int ext4_ind_migrate(struct inode *inode)
handle_t *handle;
int ret;
- if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_INCOMPAT_EXTENTS) ||
+ if (!ext4_has_feature_extents(inode->i_sb) ||
(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
return -EINVAL;
- if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_BIGALLOC))
+ if (ext4_has_feature_bigalloc(inode->i_sb))
return -EOPNOTSUPP;
/*
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 6eb1a619890c..0a512aa81bf7 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -98,10 +98,12 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
}
mmp = (struct mmp_struct *)((*bh)->b_data);
- if (le32_to_cpu(mmp->mmp_magic) == EXT4_MMP_MAGIC &&
- ext4_mmp_csum_verify(sb, mmp))
+ if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
+ ret = -EFSCORRUPTED;
+ else if (!ext4_mmp_csum_verify(sb, mmp))
+ ret = -EFSBADCRC;
+ else
return 0;
- ret = -EINVAL;
warn_exit:
ext4_warning(sb, "Error %d while reading MMP block %llu",
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 9f61e7679a6d..a969ab39f302 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -109,7 +109,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
if (!bh) {
ext4_error_inode(inode, func, line, block,
"Directory hole found");
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EFSCORRUPTED);
}
dirent = (struct ext4_dir_entry *) bh->b_data;
/* Determine whether or not we have an index block */
@@ -124,7 +124,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
if (!is_dx_block && type == INDEX) {
ext4_error_inode(inode, func, line, block,
"directory leaf block found instead of index block");
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EFSCORRUPTED);
}
if (!ext4_has_metadata_csum(inode->i_sb) ||
buffer_verified(bh))
@@ -142,7 +142,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
ext4_error_inode(inode, func, line, block,
"Directory index failed checksum");
brelse(bh);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EFSBADCRC);
}
}
if (!is_dx_block) {
@@ -152,7 +152,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
ext4_error_inode(inode, func, line, block,
"Directory block failed checksum");
brelse(bh);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EFSBADCRC);
}
}
return bh;
@@ -1429,7 +1429,7 @@ restart:
}
num++;
bh = ext4_getblk(NULL, dir, b++, 0);
- if (unlikely(IS_ERR(bh))) {
+ if (IS_ERR(bh)) {
if (ra_max == 0) {
ret = bh;
goto cleanup_and_exit;
@@ -1570,19 +1570,19 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
brelse(bh);
if (!ext4_valid_inum(dir->i_sb, ino)) {
EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EFSCORRUPTED);
}
if (unlikely(ino == dir->i_ino)) {
EXT4_ERROR_INODE(dir, "'%pd' linked to parent dir",
dentry);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EFSCORRUPTED);
}
inode = ext4_iget_normal(dir->i_sb, ino);
if (inode == ERR_PTR(-ESTALE)) {
EXT4_ERROR_INODE(dir,
"deleted inode referenced: %u",
ino);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EFSCORRUPTED);
}
if (!IS_ERR(inode) && ext4_encrypted_inode(dir) &&
(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
@@ -1619,7 +1619,7 @@ struct dentry *ext4_get_parent(struct dentry *child)
if (!ext4_valid_inum(d_inode(child)->i_sb, ino)) {
EXT4_ERROR_INODE(d_inode(child),
"bad parent inode number: %u", ino);
- return ERR_PTR(-EIO);
+ return ERR_PTR(-EFSCORRUPTED);
}
return d_obtain_alias(ext4_iget_normal(d_inode(child)->i_sb, ino));
@@ -1807,7 +1807,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode,
while ((char *) de <= top) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
buf, buf_size, offset)) {
- res = -EIO;
+ res = -EFSCORRUPTED;
goto return_result;
}
/* Provide crypto context and crypto buffer to ext4 match */
@@ -1967,7 +1967,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
if ((char *) de >= (((char *) root) + blocksize)) {
EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
brelse(bh);
- return -EIO;
+ return -EFSCORRUPTED;
}
len = ((char *) root) + (blocksize - csum_size) - (char *) de;
@@ -2118,7 +2118,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
goto out;
if (blocks == 1 && !dx_fallback &&
- EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+ ext4_has_feature_dir_index(sb)) {
retval = make_indexed_dir(handle, &fname, dentry,
inode, bh);
bh = NULL; /* make_indexed_dir releases bh */
@@ -2315,7 +2315,7 @@ int ext4_generic_delete_entry(handle_t *handle,
while (i < buf_size - csum_size) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
bh->b_data, bh->b_size, i))
- return -EIO;
+ return -EFSCORRUPTED;
if (de == de_del) {
if (pde)
pde->rec_len = ext4_rec_len_to_disk(
@@ -2388,8 +2388,7 @@ static void ext4_inc_count(handle_t *handle, struct inode *inode)
/* limit is 16-bit i_links_count */
if (inode->i_nlink >= EXT4_LINK_MAX || inode->i_nlink == 2) {
set_nlink(inode, 1);
- EXT4_SET_RO_COMPAT_FEATURE(inode->i_sb,
- EXT4_FEATURE_RO_COMPAT_DIR_NLINK);
+ ext4_set_feature_dir_nlink(inode->i_sb);
}
}
}
@@ -2469,9 +2468,6 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
struct inode *inode;
int err, credits, retries = 0;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
err = dquot_initialize(dir);
if (err)
return err;
@@ -2934,7 +2930,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry)
inode = d_inode(dentry);
- retval = -EIO;
+ retval = -EFSCORRUPTED;
if (le32_to_cpu(de->inode) != inode->i_ino)
goto end_rmdir;
@@ -3008,7 +3004,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
inode = d_inode(dentry);
- retval = -EIO;
+ retval = -EFSCORRUPTED;
if (le32_to_cpu(de->inode) != inode->i_ino)
goto end_unlink;
@@ -3310,7 +3306,7 @@ static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent)
if (!ent->dir_bh)
return retval;
if (le32_to_cpu(ent->parent_de->inode) != ent->dir->i_ino)
- return -EIO;
+ return -EFSCORRUPTED;
BUFFER_TRACE(ent->dir_bh, "get_write_access");
return ext4_journal_get_write_access(handle, ent->dir_bh);
}
@@ -3352,8 +3348,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
if (retval)
return retval;
ent->de->inode = cpu_to_le32(ino);
- if (EXT4_HAS_INCOMPAT_FEATURE(ent->dir->i_sb,
- EXT4_FEATURE_INCOMPAT_FILETYPE))
+ if (ext4_has_feature_filetype(ent->dir->i_sb))
ent->de->file_type = file_type;
ent->dir->i_version++;
ent->dir->i_ctime = ent->dir->i_mtime =
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 84ba4d2b3a35..17fbe3882b8e 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -425,6 +425,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
struct buffer_head *bh, *head;
int ret = 0;
int nr_submitted = 0;
+ int nr_to_submit = 0;
blocksize = 1 << inode->i_blkbits;
@@ -477,11 +478,13 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
}
set_buffer_async_write(bh);
+ nr_to_submit++;
} while ((bh = bh->b_this_page) != head);
bh = head = page_buffers(page);
- if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
+ if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode) &&
+ nr_to_submit) {
data_page = ext4_encrypt(inode, page);
if (IS_ERR(data_page)) {
ret = PTR_ERR(data_page);
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index e26803fb210d..5dc5e95063de 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -62,7 +62,7 @@ static void completion_pages(struct work_struct *work)
bio_for_each_segment_all(bv, bio, i) {
struct page *page = bv->bv_page;
- int ret = ext4_decrypt(ctx, page);
+ int ret = ext4_decrypt(page);
if (ret) {
WARN_ON_ONCE(1);
SetPageError(page);
@@ -165,8 +165,8 @@ int ext4_mpage_readpages(struct address_space *mapping,
if (pages) {
page = list_entry(pages->prev, struct page, lru);
list_del(&page->lru);
- if (add_to_page_cache_lru(page, mapping,
- page->index, GFP_KERNEL))
+ if (add_to_page_cache_lru(page, mapping, page->index,
+ mapping_gfp_constraint(mapping, GFP_KERNEL)))
goto next_page;
}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index cf0c472047e3..ad62d7acc315 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -490,7 +490,7 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
group_data[0].group != sbi->s_groups_count);
reserved_gdb = le16_to_cpu(es->s_reserved_gdt_blocks);
- meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+ meta_bg = ext4_has_feature_meta_bg(sb);
/* This transaction may be extended/restarted along the way */
handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, EXT4_MAX_TRANS_DATA);
@@ -680,8 +680,7 @@ static unsigned ext4_list_backups(struct super_block *sb, unsigned *three,
int mult = 3;
unsigned ret;
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
+ if (!ext4_has_feature_sparse_super(sb)) {
ret = *min;
*min += 1;
return ret;
@@ -1040,7 +1039,7 @@ exit_free:
* do not copy the full number of backups at this time. The resize
* which changed s_groups_count will backup again.
*/
-static void update_backups(struct super_block *sb, int blk_off, char *data,
+static void update_backups(struct super_block *sb, sector_t blk_off, char *data,
int size, int meta_bg)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -1065,7 +1064,7 @@ static void update_backups(struct super_block *sb, int blk_off, char *data,
group = ext4_list_backups(sb, &three, &five, &seven);
last = sbi->s_groups_count;
} else {
- group = ext4_meta_bg_first_group(sb, group) + 1;
+ group = ext4_get_group_number(sb, blk_off) + 1;
last = (ext4_group_t)(group + EXT4_DESC_PER_BLOCK(sb) - 2);
}
@@ -1158,7 +1157,7 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
int i, gdb_off, gdb_num, err = 0;
int meta_bg;
- meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+ meta_bg = ext4_has_feature_meta_bg(sb);
for (i = 0; i < count; i++, group++) {
int reserved_gdb = ext4_bg_has_super(sb, group) ?
le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
@@ -1381,9 +1380,7 @@ static void ext4_update_super(struct super_block *sb,
ext4_debug("free blocks count %llu",
percpu_counter_read(&sbi->s_freeclusters_counter));
- if (EXT4_HAS_INCOMPAT_FEATURE(sb,
- EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
- sbi->s_log_groups_per_flex) {
+ if (ext4_has_feature_flex_bg(sb) && sbi->s_log_groups_per_flex) {
ext4_group_t flex_group;
flex_group = ext4_flex_group(sbi, group_data[0].group);
atomic64_add(EXT4_NUM_B2C(sbi, free_blocks),
@@ -1476,8 +1473,7 @@ exit_journal:
int gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
int gdb_num_end = ((group + flex_gd->count - 1) /
EXT4_DESC_PER_BLOCK(sb));
- int meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb,
- EXT4_FEATURE_INCOMPAT_META_BG);
+ int meta_bg = ext4_has_feature_meta_bg(sb);
sector_t old_gdb = 0;
update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
@@ -1585,8 +1581,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
- if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) {
+ if (gdb_off == 0 && !ext4_has_feature_sparse_super(sb)) {
ext4_warning(sb, "Can't resize non-sparse filesystem further");
return -EPERM;
}
@@ -1604,9 +1599,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
}
if (reserved_gdb || gdb_off == 0) {
- if (!EXT4_HAS_COMPAT_FEATURE(sb,
- EXT4_FEATURE_COMPAT_RESIZE_INODE)
- || !le16_to_cpu(es->s_reserved_gdt_blocks)) {
+ if (ext4_has_feature_resize_inode(sb) ||
+ !le16_to_cpu(es->s_reserved_gdt_blocks)) {
ext4_warning(sb,
"No reserved GDT blocks, can't resize");
return -EPERM;
@@ -1825,8 +1819,8 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)
if (err)
goto errout;
- EXT4_CLEAR_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE);
- EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+ ext4_clear_feature_resize_inode(sb);
+ ext4_set_feature_meta_bg(sb);
sbi->s_es->s_first_meta_bg =
cpu_to_le32(num_desc_blocks(sb, sbi->s_groups_count));
@@ -1918,9 +1912,9 @@ retry:
n_desc_blocks = num_desc_blocks(sb, n_group + 1);
o_desc_blocks = num_desc_blocks(sb, sbi->s_groups_count);
- meta_bg = EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG);
+ meta_bg = ext4_has_feature_meta_bg(sb);
- if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE)) {
+ if (ext4_has_feature_resize_inode(sb)) {
if (meta_bg) {
ext4_error(sb, "resize_inode and meta_bg enabled "
"simultaneously");
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a63c7b0a10cf..753f4e68b820 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -34,7 +34,6 @@
#include <linux/namei.h>
#include <linux/quotaops.h>
#include <linux/seq_file.h>
-#include <linux/proc_fs.h>
#include <linux/ctype.h>
#include <linux/log2.h>
#include <linux/crc16.h>
@@ -54,11 +53,8 @@
#define CREATE_TRACE_POINTS
#include <trace/events/ext4.h>
-static struct proc_dir_entry *ext4_proc_root;
-static struct kset *ext4_kset;
static struct ext4_lazy_init *ext4_li_info;
static struct mutex ext4_li_mtx;
-static struct ext4_features *ext4_feat;
static int ext4_mballoc_ready;
static struct ratelimit_state ext4_mount_msg_ratelimit;
@@ -83,7 +79,6 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly);
static void ext4_destroy_lazyinit_thread(void);
static void ext4_unregister_li_request(struct super_block *sb);
static void ext4_clear_request_list(void);
-static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t);
#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
static struct file_system_type ext2_fs_type = {
@@ -115,8 +110,7 @@ MODULE_ALIAS("ext3");
static int ext4_verify_csum_type(struct super_block *sb,
struct ext4_super_block *es)
{
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+ if (!ext4_has_feature_metadata_csum(sb))
return 1;
return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
@@ -394,9 +388,13 @@ static void ext4_handle_error(struct super_block *sb)
smp_wmb();
sb->s_flags |= MS_RDONLY;
}
- if (test_opt(sb, ERRORS_PANIC))
+ if (test_opt(sb, ERRORS_PANIC)) {
+ if (EXT4_SB(sb)->s_journal &&
+ !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
+ return;
panic("EXT4-fs (device %s): panic forced after error\n",
sb->s_id);
+ }
}
#define ext4_error_ratelimit(sb) \
@@ -495,6 +493,12 @@ const char *ext4_decode_error(struct super_block *sb, int errno,
char *errstr = NULL;
switch (errno) {
+ case -EFSCORRUPTED:
+ errstr = "Corrupt filesystem";
+ break;
+ case -EFSBADCRC:
+ errstr = "Filesystem failed CRC";
+ break;
case -EIO:
errstr = "IO failure";
break;
@@ -585,8 +589,12 @@ void __ext4_abort(struct super_block *sb, const char *function,
jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
save_error_info(sb, function, line);
}
- if (test_opt(sb, ERRORS_PANIC))
+ if (test_opt(sb, ERRORS_PANIC)) {
+ if (EXT4_SB(sb)->s_journal &&
+ !(EXT4_SB(sb)->s_journal->j_flags & JBD2_REC_ERR))
+ return;
panic("EXT4-fs panic from previous error\n");
+ }
}
void __ext4_msg(struct super_block *sb,
@@ -800,6 +808,7 @@ static void ext4_put_super(struct super_block *sb)
ext4_abort(sb, "Couldn't clean up the journal");
}
+ ext4_unregister_sysfs(sb);
ext4_es_unregister_shrinker(sbi);
del_timer_sync(&sbi->s_err_report);
ext4_release_system_zone(sb);
@@ -808,18 +817,12 @@ static void ext4_put_super(struct super_block *sb)
ext4_xattr_put_super(sb);
if (!(sb->s_flags & MS_RDONLY)) {
- EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ ext4_clear_feature_journal_needs_recovery(sb);
es->s_state = cpu_to_le16(sbi->s_mount_state);
}
if (!(sb->s_flags & MS_RDONLY))
ext4_commit_super(sb, 1);
- if (sbi->s_proc) {
- remove_proc_entry("options", sbi->s_proc);
- remove_proc_entry(sb->s_id, ext4_proc_root);
- }
- kobject_del(&sbi->s_kobj);
-
for (i = 0; i < sbi->s_gdb_count; i++)
brelse(sbi->s_group_desc[i]);
kvfree(sbi->s_group_desc);
@@ -1058,7 +1061,7 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
return 0;
if (journal)
return jbd2_journal_try_to_free_buffers(journal, page,
- wait & ~__GFP_WAIT);
+ wait & ~__GFP_DIRECT_RECLAIM);
return try_to_free_buffers(page);
}
@@ -1288,7 +1291,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
"quota options when quota turned on");
return -1;
}
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+ if (ext4_has_feature_quota(sb)) {
ext4_msg(sb, KERN_ERR, "Cannot set journaled quota options "
"when QUOTA feature is enabled");
return -1;
@@ -1381,10 +1384,10 @@ static const struct mount_opts {
{Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
MOPT_EXT4_ONLY | MOPT_CLEAR},
{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
- MOPT_EXT4_ONLY | MOPT_SET},
+ MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
{Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
EXT4_MOUNT_JOURNAL_CHECKSUM),
- MOPT_EXT4_ONLY | MOPT_SET},
+ MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
{Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
{Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
{Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
@@ -1513,8 +1516,14 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
return -1;
if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
return -1;
- if (m->flags & MOPT_EXPLICIT)
- set_opt2(sb, EXPLICIT_DELALLOC);
+ if (m->flags & MOPT_EXPLICIT) {
+ if (m->mount_opt & EXT4_MOUNT_DELALLOC) {
+ set_opt2(sb, EXPLICIT_DELALLOC);
+ } else if (m->mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) {
+ set_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM);
+ } else
+ return -1;
+ }
if (m->flags & MOPT_CLEAR_ERR)
clear_opt(sb, ERRORS_MASK);
if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
@@ -1647,8 +1656,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
"quota options when quota turned on");
return -1;
}
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+ if (ext4_has_feature_quota(sb)) {
ext4_msg(sb, KERN_ERR,
"Cannot set journaled quota options "
"when QUOTA feature is enabled");
@@ -1707,7 +1715,7 @@ static int parse_options(char *options, struct super_block *sb,
return 0;
}
#ifdef CONFIG_QUOTA
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
+ if (ext4_has_feature_quota(sb) &&
(test_opt(sb, USRQUOTA) || test_opt(sb, GRPQUOTA))) {
ext4_msg(sb, KERN_ERR, "Cannot set quota options when QUOTA "
"feature is enabled");
@@ -1880,7 +1888,7 @@ static int ext4_show_options(struct seq_file *seq, struct dentry *root)
return _ext4_show_options(seq, root->d_sb, 0);
}
-static int options_seq_show(struct seq_file *seq, void *offset)
+int ext4_seq_options_show(struct seq_file *seq, void *offset)
{
struct super_block *sb = seq->private;
int rc;
@@ -1891,19 +1899,6 @@ static int options_seq_show(struct seq_file *seq, void *offset)
return rc;
}
-static int options_open_fs(struct inode *inode, struct file *file)
-{
- return single_open(file, options_seq_show, PDE_DATA(inode));
-}
-
-static const struct file_operations ext4_seq_options_fops = {
- .owner = THIS_MODULE,
- .open = options_open_fs,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
int read_only)
{
@@ -1944,7 +1939,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
es->s_mtime = cpu_to_le32(get_seconds());
ext4_update_dynamic_rev(sb);
if (sbi->s_journal)
- EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ ext4_set_feature_journal_needs_recovery(sb);
ext4_commit_super(sb, 1);
done:
@@ -2027,12 +2022,13 @@ failed:
return 0;
}
-static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
+static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
struct ext4_group_desc *gdp)
{
int offset;
__u16 crc = 0;
__le32 le_group = cpu_to_le32(block_group);
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
if (ext4_has_metadata_csum(sbi->s_sb)) {
/* Use new metadata_csum algorithm */
@@ -2052,8 +2048,7 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
}
/* old crc16 code */
- if (!(sbi->s_es->s_feature_ro_compat &
- cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)))
+ if (!ext4_has_feature_gdt_csum(sb))
return 0;
offset = offsetof(struct ext4_group_desc, bg_checksum);
@@ -2063,8 +2058,7 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
crc = crc16(crc, (__u8 *)gdp, offset);
offset += sizeof(gdp->bg_checksum); /* skip checksum */
/* for checksum of struct ext4_group_desc do the rest...*/
- if ((sbi->s_es->s_feature_incompat &
- cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) &&
+ if (ext4_has_feature_64bit(sb) &&
offset < le16_to_cpu(sbi->s_es->s_desc_size))
crc = crc16(crc, (__u8 *)gdp + offset,
le16_to_cpu(sbi->s_es->s_desc_size) -
@@ -2078,8 +2072,7 @@ int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group,
struct ext4_group_desc *gdp)
{
if (ext4_has_group_desc_csum(sb) &&
- (gdp->bg_checksum != ext4_group_desc_csum(EXT4_SB(sb),
- block_group, gdp)))
+ (gdp->bg_checksum != ext4_group_desc_csum(sb, block_group, gdp)))
return 0;
return 1;
@@ -2090,7 +2083,7 @@ void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group,
{
if (!ext4_has_group_desc_csum(sb))
return;
- gdp->bg_checksum = ext4_group_desc_csum(EXT4_SB(sb), block_group, gdp);
+ gdp->bg_checksum = ext4_group_desc_csum(sb, block_group, gdp);
}
/* Called at mount-time, super-block is locked */
@@ -2106,7 +2099,7 @@ static int ext4_check_descriptors(struct super_block *sb,
int flexbg_flag = 0;
ext4_group_t i, grp = sbi->s_groups_count;
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+ if (ext4_has_feature_flex_bg(sb))
flexbg_flag = 1;
ext4_debug("Checking group descriptors");
@@ -2150,7 +2143,7 @@ static int ext4_check_descriptors(struct super_block *sb,
if (!ext4_group_desc_csum_verify(sb, i, gdp)) {
ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
"Checksum for group %u failed (%u!=%u)",
- i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
+ i, le16_to_cpu(ext4_group_desc_csum(sb, i,
gdp)), le16_to_cpu(gdp->bg_checksum));
if (!(sb->s_flags & MS_RDONLY)) {
ext4_unlock_group(sb, i);
@@ -2413,8 +2406,7 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg);
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_META_BG) ||
- nr < first_meta_bg)
+ if (!ext4_has_feature_meta_bg(sb) || nr < first_meta_bg)
return logical_sb_block + nr + 1;
bg = sbi->s_desc_per_block * nr;
if (ext4_bg_has_super(sb, bg))
@@ -2470,335 +2462,6 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
return ret;
}
-/* sysfs supprt */
-
-struct ext4_attr {
- struct attribute attr;
- ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
- ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
- const char *, size_t);
- union {
- int offset;
- int deprecated_val;
- } u;
-};
-
-static int parse_strtoull(const char *buf,
- unsigned long long max, unsigned long long *value)
-{
- int ret;
-
- ret = kstrtoull(skip_spaces(buf), 0, value);
- if (!ret && *value > max)
- ret = -EINVAL;
- return ret;
-}
-
-static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
- struct ext4_sb_info *sbi,
- char *buf)
-{
- return snprintf(buf, PAGE_SIZE, "%llu\n",
- (s64) EXT4_C2B(sbi,
- percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
-}
-
-static ssize_t session_write_kbytes_show(struct ext4_attr *a,
- struct ext4_sb_info *sbi, char *buf)
-{
- struct super_block *sb = sbi->s_buddy_cache->i_sb;
-
- if (!sb->s_bdev->bd_part)
- return snprintf(buf, PAGE_SIZE, "0\n");
- return snprintf(buf, PAGE_SIZE, "%lu\n",
- (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
- sbi->s_sectors_written_start) >> 1);
-}
-
-static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
- struct ext4_sb_info *sbi, char *buf)
-{
- struct super_block *sb = sbi->s_buddy_cache->i_sb;
-
- if (!sb->s_bdev->bd_part)
- return snprintf(buf, PAGE_SIZE, "0\n");
- return snprintf(buf, PAGE_SIZE, "%llu\n",
- (unsigned long long)(sbi->s_kbytes_written +
- ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
- EXT4_SB(sb)->s_sectors_written_start) >> 1)));
-}
-
-static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
- struct ext4_sb_info *sbi,
- const char *buf, size_t count)
-{
- unsigned long t;
- int ret;
-
- ret = kstrtoul(skip_spaces(buf), 0, &t);
- if (ret)
- return ret;
-
- if (t && (!is_power_of_2(t) || t > 0x40000000))
- return -EINVAL;
-
- sbi->s_inode_readahead_blks = t;
- return count;
-}
-
-static ssize_t sbi_ui_show(struct ext4_attr *a,
- struct ext4_sb_info *sbi, char *buf)
-{
- unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
-
- return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
-}
-
-static ssize_t sbi_ui_store(struct ext4_attr *a,
- struct ext4_sb_info *sbi,
- const char *buf, size_t count)
-{
- unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
- unsigned long t;
- int ret;
-
- ret = kstrtoul(skip_spaces(buf), 0, &t);
- if (ret)
- return ret;
- *ui = t;
- return count;
-}
-
-static ssize_t es_ui_show(struct ext4_attr *a,
- struct ext4_sb_info *sbi, char *buf)
-{
-
- unsigned int *ui = (unsigned int *) (((char *) sbi->s_es) +
- a->u.offset);
-
- return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
-}
-
-static ssize_t reserved_clusters_show(struct ext4_attr *a,
- struct ext4_sb_info *sbi, char *buf)
-{
- return snprintf(buf, PAGE_SIZE, "%llu\n",
- (unsigned long long) atomic64_read(&sbi->s_resv_clusters));
-}
-
-static ssize_t reserved_clusters_store(struct ext4_attr *a,
- struct ext4_sb_info *sbi,
- const char *buf, size_t count)
-{
- unsigned long long val;
- int ret;
-
- if (parse_strtoull(buf, -1ULL, &val))
- return -EINVAL;
- ret = ext4_reserve_clusters(sbi, val);
-
- return ret ? ret : count;
-}
-
-static ssize_t trigger_test_error(struct ext4_attr *a,
- struct ext4_sb_info *sbi,
- const char *buf, size_t count)
-{
- int len = count;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (len && buf[len-1] == '\n')
- len--;
-
- if (len)
- ext4_error(sbi->s_sb, "%.*s", len, buf);
- return count;
-}
-
-static ssize_t sbi_deprecated_show(struct ext4_attr *a,
- struct ext4_sb_info *sbi, char *buf)
-{
- return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
-}
-
-#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
-static struct ext4_attr ext4_attr_##_name = { \
- .attr = {.name = __stringify(_name), .mode = _mode }, \
- .show = _show, \
- .store = _store, \
- .u = { \
- .offset = offsetof(struct ext4_sb_info, _elname),\
- }, \
-}
-
-#define EXT4_ATTR_OFFSET_ES(_name,_mode,_show,_store,_elname) \
-static struct ext4_attr ext4_attr_##_name = { \
- .attr = {.name = __stringify(_name), .mode = _mode }, \
- .show = _show, \
- .store = _store, \
- .u = { \
- .offset = offsetof(struct ext4_super_block, _elname), \
- }, \
-}
-
-#define EXT4_ATTR(name, mode, show, store) \
-static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
-
-#define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL)
-#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
-#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
-
-#define EXT4_RO_ATTR_ES_UI(name, elname) \
- EXT4_ATTR_OFFSET_ES(name, 0444, es_ui_show, NULL, elname)
-#define EXT4_RW_ATTR_SBI_UI(name, elname) \
- EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
-
-#define ATTR_LIST(name) &ext4_attr_##name.attr
-#define EXT4_DEPRECATED_ATTR(_name, _val) \
-static struct ext4_attr ext4_attr_##_name = { \
- .attr = {.name = __stringify(_name), .mode = 0444 }, \
- .show = sbi_deprecated_show, \
- .u = { \
- .deprecated_val = _val, \
- }, \
-}
-
-EXT4_RO_ATTR(delayed_allocation_blocks);
-EXT4_RO_ATTR(session_write_kbytes);
-EXT4_RO_ATTR(lifetime_write_kbytes);
-EXT4_RW_ATTR(reserved_clusters);
-EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
- inode_readahead_blks_store, s_inode_readahead_blks);
-EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
-EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
-EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
-EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
-EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
-EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
-EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
-EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
-EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
-EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
-EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
-EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst);
-EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
-EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
-EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
-EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
-EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
-EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time);
-EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time);
-
-static struct attribute *ext4_attrs[] = {
- ATTR_LIST(delayed_allocation_blocks),
- ATTR_LIST(session_write_kbytes),
- ATTR_LIST(lifetime_write_kbytes),
- ATTR_LIST(reserved_clusters),
- ATTR_LIST(inode_readahead_blks),
- ATTR_LIST(inode_goal),
- ATTR_LIST(mb_stats),
- ATTR_LIST(mb_max_to_scan),
- ATTR_LIST(mb_min_to_scan),
- ATTR_LIST(mb_order2_req),
- ATTR_LIST(mb_stream_req),
- ATTR_LIST(mb_group_prealloc),
- ATTR_LIST(max_writeback_mb_bump),
- ATTR_LIST(extent_max_zeroout_kb),
- ATTR_LIST(trigger_fs_error),
- ATTR_LIST(err_ratelimit_interval_ms),
- ATTR_LIST(err_ratelimit_burst),
- ATTR_LIST(warning_ratelimit_interval_ms),
- ATTR_LIST(warning_ratelimit_burst),
- ATTR_LIST(msg_ratelimit_interval_ms),
- ATTR_LIST(msg_ratelimit_burst),
- ATTR_LIST(errors_count),
- ATTR_LIST(first_error_time),
- ATTR_LIST(last_error_time),
- NULL,
-};
-
-/* Features this copy of ext4 supports */
-EXT4_INFO_ATTR(lazy_itable_init);
-EXT4_INFO_ATTR(batched_discard);
-EXT4_INFO_ATTR(meta_bg_resize);
-EXT4_INFO_ATTR(encryption);
-
-static struct attribute *ext4_feat_attrs[] = {
- ATTR_LIST(lazy_itable_init),
- ATTR_LIST(batched_discard),
- ATTR_LIST(meta_bg_resize),
- ATTR_LIST(encryption),
- NULL,
-};
-
-static ssize_t ext4_attr_show(struct kobject *kobj,
- struct attribute *attr, char *buf)
-{
- struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
- s_kobj);
- struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
-
- return a->show ? a->show(a, sbi, buf) : 0;
-}
-
-static ssize_t ext4_attr_store(struct kobject *kobj,
- struct attribute *attr,
- const char *buf, size_t len)
-{
- struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
- s_kobj);
- struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
-
- return a->store ? a->store(a, sbi, buf, len) : 0;
-}
-
-static void ext4_sb_release(struct kobject *kobj)
-{
- struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
- s_kobj);
- complete(&sbi->s_kobj_unregister);
-}
-
-static const struct sysfs_ops ext4_attr_ops = {
- .show = ext4_attr_show,
- .store = ext4_attr_store,
-};
-
-static struct kobj_type ext4_ktype = {
- .default_attrs = ext4_attrs,
- .sysfs_ops = &ext4_attr_ops,
- .release = ext4_sb_release,
-};
-
-static void ext4_feat_release(struct kobject *kobj)
-{
- complete(&ext4_feat->f_kobj_unregister);
-}
-
-static ssize_t ext4_feat_show(struct kobject *kobj,
- struct attribute *attr, char *buf)
-{
- return snprintf(buf, PAGE_SIZE, "supported\n");
-}
-
-/*
- * We can not use ext4_attr_show/store because it relies on the kobject
- * being embedded in the ext4_sb_info structure which is definitely not
- * true in this case.
- */
-static const struct sysfs_ops ext4_feat_ops = {
- .show = ext4_feat_show,
- .store = NULL,
-};
-
-static struct kobj_type ext4_feat_ktype = {
- .default_attrs = ext4_feat_attrs,
- .sysfs_ops = &ext4_feat_ops,
- .release = ext4_feat_release,
-};
-
/*
* Check whether this filesystem can be mounted based on
* the features present and the RDONLY/RDWR mount requested.
@@ -2807,7 +2470,7 @@ static struct kobj_type ext4_feat_ktype = {
*/
static int ext4_feature_set_ok(struct super_block *sb, int readonly)
{
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) {
+ if (ext4_has_unknown_ext4_incompat_features(sb)) {
ext4_msg(sb, KERN_ERR,
"Couldn't mount because of "
"unsupported optional features (%x)",
@@ -2819,14 +2482,14 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
if (readonly)
return 1;
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_READONLY)) {
+ if (ext4_has_feature_readonly(sb)) {
ext4_msg(sb, KERN_INFO, "filesystem is read-only");
sb->s_flags |= MS_RDONLY;
return 1;
}
/* Check that feature set is OK for a read-write mount */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) {
+ if (ext4_has_unknown_ext4_ro_compat_features(sb)) {
ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of "
"unsupported optional features (%x)",
(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
@@ -2837,7 +2500,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
* Large file size enabled file system can only be mounted
* read-write on 32-bit systems if kernel is built with CONFIG_LBDAF
*/
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) {
+ if (ext4_has_feature_huge_file(sb)) {
if (sizeof(blkcnt_t) < sizeof(u64)) {
ext4_msg(sb, KERN_ERR, "Filesystem with huge files "
"cannot be mounted RDWR without "
@@ -2845,8 +2508,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
return 0;
}
}
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC) &&
- !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
+ if (ext4_has_feature_bigalloc(sb) && !ext4_has_feature_extents(sb)) {
ext4_msg(sb, KERN_ERR,
"Can't support bigalloc feature without "
"extents feature\n");
@@ -2854,8 +2516,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
}
#ifndef CONFIG_QUOTA
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
- !readonly) {
+ if (ext4_has_feature_quota(sb) && !readonly) {
ext4_msg(sb, KERN_ERR,
"Filesystem with quota feature cannot be mounted RDWR "
"without CONFIG_QUOTA");
@@ -3312,7 +2973,7 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp,
ext4_group_t i, ngroups = ext4_get_groups_count(sb);
int s, j, count = 0;
- if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_BIGALLOC))
+ if (!ext4_has_feature_bigalloc(sb))
return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
sbi->s_itb_per_group + 2);
@@ -3403,10 +3064,10 @@ int ext4_calculate_overhead(struct super_block *sb)
return 0;
}
-
-static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb)
+static void ext4_set_resv_clusters(struct super_block *sb)
{
ext4_fsblk_t resv_clusters;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
/*
* There's no need to reserve anything when we aren't using extents.
@@ -3414,8 +3075,8 @@ static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb)
* hole punching doesn't need new metadata... This is needed especially
* to keep ext2/3 backward compatibility.
*/
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS))
- return 0;
+ if (!ext4_has_feature_extents(sb))
+ return;
/*
* By default we reserve 2% or 4096 clusters, whichever is smaller.
* This should cover the situations where we can not afford to run
@@ -3424,26 +3085,13 @@ static ext4_fsblk_t ext4_calculate_resv_clusters(struct super_block *sb)
* allocation would require 1, or 2 blocks, higher numbers are
* very rare.
*/
- resv_clusters = ext4_blocks_count(EXT4_SB(sb)->s_es) >>
- EXT4_SB(sb)->s_cluster_bits;
+ resv_clusters = (ext4_blocks_count(sbi->s_es) >>
+ sbi->s_cluster_bits);
do_div(resv_clusters, 50);
resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
- return resv_clusters;
-}
-
-
-static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count)
-{
- ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >>
- sbi->s_cluster_bits;
-
- if (count >= clusters)
- return -EINVAL;
-
- atomic64_set(&sbi->s_resv_clusters, count);
- return 0;
+ atomic64_set(&sbi->s_resv_clusters, resv_clusters);
}
static int ext4_fill_super(struct super_block *sb, void *data, int silent)
@@ -3526,9 +3174,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
/* Warn if metadata_csum and gdt_csum are both set. */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
- EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+ if (ext4_has_feature_metadata_csum(sb) &&
+ ext4_has_feature_gdt_csum(sb))
ext4_warning(sb, "metadata_csum and uninit_bg are "
"redundant flags; please run fsck.");
@@ -3541,8 +3188,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
/* Load the checksum driver */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+ if (ext4_has_feature_metadata_csum(sb)) {
sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
if (IS_ERR(sbi->s_chksum_driver)) {
ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
@@ -3557,11 +3203,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
"invalid superblock checksum. Run e2fsck?");
silent = 1;
+ ret = -EFSBADCRC;
goto cantfind_ext4;
}
/* Precompute checksum seed for all metadata */
- if (ext4_has_metadata_csum(sb))
+ if (ext4_has_feature_csum_seed(sb))
+ sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
+ else if (ext4_has_metadata_csum(sb))
sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
sizeof(es->s_uuid));
@@ -3664,17 +3313,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV &&
- (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
- EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
- EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U)))
+ (ext4_has_compat_features(sb) ||
+ ext4_has_ro_compat_features(sb) ||
+ ext4_has_incompat_features(sb)))
ext4_msg(sb, KERN_WARNING,
"feature flags set on rev 0 fs, "
"running e2fsck is recommended");
if (es->s_creator_os == cpu_to_le32(EXT4_OS_HURD)) {
set_opt2(sb, HURD_COMPAT);
- if (EXT4_HAS_INCOMPAT_FEATURE(sb,
- EXT4_FEATURE_INCOMPAT_64BIT)) {
+ if (ext4_has_feature_64bit(sb)) {
ext4_msg(sb, KERN_ERR,
"The Hurd can't support 64-bit file systems");
goto failed_mount;
@@ -3732,8 +3380,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
}
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT) &&
- es->s_encryption_level) {
+ if (ext4_has_feature_encrypt(sb) && es->s_encryption_level) {
ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d",
es->s_encryption_level);
goto failed_mount;
@@ -3765,8 +3412,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
}
- has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_HUGE_FILE);
+ has_huge_files = ext4_has_feature_huge_file(sb);
sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits,
has_huge_files);
sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
@@ -3790,7 +3436,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) {
+ if (ext4_has_feature_64bit(sb)) {
if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
!is_power_of_2(sbi->s_desc_size)) {
@@ -3821,7 +3467,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
for (i = 0; i < 4; i++)
sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
sbi->s_def_hash_version = es->s_def_hash_version;
- if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) {
+ if (ext4_has_feature_dir_index(sb)) {
i = le32_to_cpu(es->s_flags);
if (i & EXT2_FLAGS_UNSIGNED_HASH)
sbi->s_hash_unsigned = 3;
@@ -3841,8 +3487,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
/* Handle clustersize */
clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size);
- has_bigalloc = EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_BIGALLOC);
+ has_bigalloc = ext4_has_feature_bigalloc(sb);
if (has_bigalloc) {
if (clustersize < blocksize) {
ext4_msg(sb, KERN_ERR,
@@ -3961,13 +3606,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
- if (ext4_proc_root)
- sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
-
- if (sbi->s_proc)
- proc_create_data("options", S_IRUGO, sbi->s_proc,
- &ext4_seq_options_fops, sb);
-
bgl_lock_init(sbi->s_blockgroup_lock);
for (i = 0; i < db_count; i++) {
@@ -3982,6 +3620,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
}
if (!ext4_check_descriptors(sb, &first_not_zeroed)) {
ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
+ ret = -EFSCORRUPTED;
goto failed_mount2;
}
@@ -4007,7 +3646,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_xattr = ext4_xattr_handlers;
#ifdef CONFIG_QUOTA
sb->dq_op = &ext4_quota_operations;
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
+ if (ext4_has_feature_quota(sb))
sb->s_qcop = &dquot_quotactl_sysfile_ops;
else
sb->s_qcop = &ext4_qctl_operations;
@@ -4021,11 +3660,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_root = NULL;
needs_recovery = (es->s_last_orphan != 0 ||
- EXT4_HAS_INCOMPAT_FEATURE(sb,
- EXT4_FEATURE_INCOMPAT_RECOVER));
+ ext4_has_feature_journal_needs_recovery(sb));
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_MMP) &&
- !(sb->s_flags & MS_RDONLY))
+ if (ext4_has_feature_mmp(sb) && !(sb->s_flags & MS_RDONLY))
if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block)))
goto failed_mount3a;
@@ -4033,23 +3670,47 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
* The first inode we look at is the journal inode. Don't try
* root first: it may be modified in the journal!
*/
- if (!test_opt(sb, NOLOAD) &&
- EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
+ if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) {
if (ext4_load_journal(sb, es, journal_devnum))
goto failed_mount3a;
} else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
- EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
+ ext4_has_feature_journal_needs_recovery(sb)) {
ext4_msg(sb, KERN_ERR, "required journal recovery "
"suppressed and not mounted read-only");
goto failed_mount_wq;
} else {
+ /* Nojournal mode, all journal mount options are illegal */
+ if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "journal_checksum, fs mounted w/o journal");
+ goto failed_mount_wq;
+ }
+ if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "journal_async_commit, fs mounted w/o journal");
+ goto failed_mount_wq;
+ }
+ if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "commit=%lu, fs mounted w/o journal",
+ sbi->s_commit_interval / HZ);
+ goto failed_mount_wq;
+ }
+ if (EXT4_MOUNT_DATA_FLAGS &
+ (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "data=, fs mounted w/o journal");
+ goto failed_mount_wq;
+ }
+ sbi->s_def_mount_opt &= EXT4_MOUNT_JOURNAL_CHECKSUM;
+ clear_opt(sb, JOURNAL_CHECKSUM);
clear_opt(sb, DATA_FLAGS);
sbi->s_journal = NULL;
needs_recovery = 0;
goto no_journal;
}
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT) &&
+ if (ext4_has_feature_64bit(sb) &&
!jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
JBD2_FEATURE_INCOMPAT_64BIT)) {
ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
@@ -4101,18 +3762,16 @@ no_journal:
}
}
- if ((DUMMY_ENCRYPTION_ENABLED(sbi) ||
- EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) &&
+ if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
(blocksize != PAGE_CACHE_SIZE)) {
ext4_msg(sb, KERN_ERR,
"Unsupported blocksize for fs encryption");
goto failed_mount_wq;
}
- if (DUMMY_ENCRYPTION_ENABLED(sbi) &&
- !(sb->s_flags & MS_RDONLY) &&
- !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) {
- EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT);
+ if (DUMMY_ENCRYPTION_ENABLED(sbi) && !(sb->s_flags & MS_RDONLY) &&
+ !ext4_has_feature_encrypt(sb)) {
+ ext4_set_feature_encrypt(sb);
ext4_commit_super(sb, 1);
}
@@ -4171,8 +3830,7 @@ no_journal:
if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
EXT4_GOOD_OLD_INODE_SIZE;
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE)) {
+ if (ext4_has_feature_extra_isize(sb)) {
if (sbi->s_want_extra_isize <
le16_to_cpu(es->s_want_extra_isize))
sbi->s_want_extra_isize =
@@ -4192,12 +3850,7 @@ no_journal:
"available");
}
- err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sb));
- if (err) {
- ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
- "reserved pool", ext4_calculate_resv_clusters(sb));
- goto failed_mount4a;
- }
+ ext4_set_resv_clusters(sb);
err = ext4_setup_system_zone(sb);
if (err) {
@@ -4236,7 +3889,7 @@ no_journal:
goto failed_mount6;
}
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
+ if (ext4_has_feature_flex_bg(sb))
if (!ext4_fill_flex_info(sb)) {
ext4_msg(sb, KERN_ERR,
"unable to initialize "
@@ -4248,17 +3901,13 @@ no_journal:
if (err)
goto failed_mount6;
- sbi->s_kobj.kset = ext4_kset;
- init_completion(&sbi->s_kobj_unregister);
- err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
- "%s", sb->s_id);
+ err = ext4_register_sysfs(sb);
if (err)
goto failed_mount7;
#ifdef CONFIG_QUOTA
/* Enable quota usage during mount. */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) &&
- !(sb->s_flags & MS_RDONLY)) {
+ if (ext4_has_feature_quota(sb) && !(sb->s_flags & MS_RDONLY)) {
err = ext4_enable_quotas(sb);
if (err)
goto failed_mount8;
@@ -4313,7 +3962,7 @@ cantfind_ext4:
#ifdef CONFIG_QUOTA
failed_mount8:
- kobject_del(&sbi->s_kobj);
+ ext4_unregister_sysfs(sb);
#endif
failed_mount7:
ext4_unregister_li_request(sb);
@@ -4353,10 +4002,6 @@ failed_mount2:
failed_mount:
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
- if (sbi->s_proc) {
- remove_proc_entry("options", sbi->s_proc);
- remove_proc_entry(sb->s_id, ext4_proc_root);
- }
#ifdef CONFIG_QUOTA
for (i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(sbi->s_qf_names[i]);
@@ -4403,7 +4048,7 @@ static journal_t *ext4_get_journal(struct super_block *sb,
struct inode *journal_inode;
journal_t *journal;
- BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+ BUG_ON(!ext4_has_feature_journal(sb));
/* First, test for the existence of a valid inode on disk. Bad
* things happen if we iget() an unused inode, as the subsequent
@@ -4453,7 +4098,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
struct ext4_super_block *es;
struct block_device *bdev;
- BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+ BUG_ON(!ext4_has_feature_journal(sb));
bdev = ext4_blkdev_get(j_dev, sb);
if (bdev == NULL)
@@ -4545,7 +4190,7 @@ static int ext4_load_journal(struct super_block *sb,
int err = 0;
int really_read_only;
- BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+ BUG_ON(!ext4_has_feature_journal(sb));
if (journal_devnum &&
journal_devnum != le32_to_cpu(es->s_journal_dev)) {
@@ -4562,7 +4207,7 @@ static int ext4_load_journal(struct super_block *sb,
* crash? For recovery, we need to check in advance whether we
* can get read-write access to the device.
*/
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
+ if (ext4_has_feature_journal_needs_recovery(sb)) {
if (sb->s_flags & MS_RDONLY) {
ext4_msg(sb, KERN_INFO, "INFO: recovery "
"required on readonly filesystem");
@@ -4593,7 +4238,7 @@ static int ext4_load_journal(struct super_block *sb,
if (!(journal->j_flags & JBD2_BARRIER))
ext4_msg(sb, KERN_INFO, "barriers disabled");
- if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
+ if (!ext4_has_feature_journal_needs_recovery(sb))
err = jbd2_journal_wipe(journal, !really_read_only);
if (!err) {
char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
@@ -4707,7 +4352,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
{
journal_t *journal = EXT4_SB(sb)->s_journal;
- if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
+ if (!ext4_has_feature_journal(sb)) {
BUG_ON(journal != NULL);
return;
}
@@ -4715,9 +4360,9 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
if (jbd2_journal_flush(journal) < 0)
goto out;
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
+ if (ext4_has_feature_journal_needs_recovery(sb) &&
sb->s_flags & MS_RDONLY) {
- EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ ext4_clear_feature_journal_needs_recovery(sb);
ext4_commit_super(sb, 1);
}
@@ -4737,7 +4382,7 @@ static void ext4_clear_journal_err(struct super_block *sb,
int j_errno;
const char *errstr;
- BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
+ BUG_ON(!ext4_has_feature_journal(sb));
journal = EXT4_SB(sb)->s_journal;
@@ -4852,7 +4497,7 @@ static int ext4_freeze(struct super_block *sb)
goto out;
/* Journal blocked and flushed, clear needs_recovery flag. */
- EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ ext4_clear_feature_journal_needs_recovery(sb);
}
error = ext4_commit_super(sb, 1);
@@ -4874,7 +4519,7 @@ static int ext4_unfreeze(struct super_block *sb)
if (EXT4_SB(sb)->s_journal) {
/* Reset the needs_recovery flag before the fs is unlocked. */
- EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+ ext4_set_feature_journal_needs_recovery(sb);
}
ext4_commit_super(sb, 1);
@@ -5027,8 +4672,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
ext4_mark_recovery_complete(sb, es);
} else {
/* Make sure we can mount this feature set readwrite */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_READONLY) ||
+ if (ext4_has_feature_readonly(sb) ||
!ext4_feature_set_ok(sb, 0)) {
err = -EROFS;
goto restore_opts;
@@ -5044,9 +4688,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (!ext4_group_desc_csum_verify(sb, g, gdp)) {
ext4_msg(sb, KERN_ERR,
"ext4_remount: Checksum for group %u failed (%u!=%u)",
- g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
+ g, le16_to_cpu(ext4_group_desc_csum(sb, g, gdp)),
le16_to_cpu(gdp->bg_checksum));
- err = -EINVAL;
+ err = -EFSBADCRC;
goto restore_opts;
}
}
@@ -5076,8 +4720,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
sbi->s_mount_state = le16_to_cpu(es->s_state);
if (!ext4_setup_super(sb, es, 0))
sb->s_flags &= ~MS_RDONLY;
- if (EXT4_HAS_INCOMPAT_FEATURE(sb,
- EXT4_FEATURE_INCOMPAT_MMP))
+ if (ext4_has_feature_mmp(sb))
if (ext4_multi_mount_protect(sb,
le64_to_cpu(es->s_mmp_block))) {
err = -EROFS;
@@ -5110,8 +4753,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
if (enable_quota) {
if (sb_any_quota_suspended(sb))
dquot_resume(sb, -1);
- else if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
- EXT4_FEATURE_RO_COMPAT_QUOTA)) {
+ else if (ext4_has_feature_quota(sb)) {
err = ext4_enable_quotas(sb);
if (err)
goto restore_opts;
@@ -5255,7 +4897,7 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot)
struct ext4_sb_info *sbi = EXT4_SB(sb);
/* Are we journaling quotas? */
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA) ||
+ if (ext4_has_feature_quota(sb) ||
sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
dquot_mark_dquot_dirty(dquot);
return ext4_write_dquot(dquot);
@@ -5343,7 +4985,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
};
- BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA));
+ BUG_ON(!ext4_has_feature_quota(sb));
if (!qf_inums[type])
return -EPERM;
@@ -5537,11 +5179,11 @@ static inline void unregister_as_ext2(void)
static inline int ext2_feature_set_ok(struct super_block *sb)
{
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT2_FEATURE_INCOMPAT_SUPP))
+ if (ext4_has_unknown_ext2_incompat_features(sb))
return 0;
if (sb->s_flags & MS_RDONLY)
return 1;
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT2_FEATURE_RO_COMPAT_SUPP))
+ if (ext4_has_unknown_ext2_ro_compat_features(sb))
return 0;
return 1;
}
@@ -5566,13 +5208,13 @@ static inline void unregister_as_ext3(void)
static inline int ext3_feature_set_ok(struct super_block *sb)
{
- if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT3_FEATURE_INCOMPAT_SUPP))
+ if (ext4_has_unknown_ext3_incompat_features(sb))
return 0;
- if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
+ if (!ext4_has_feature_journal(sb))
return 0;
if (sb->s_flags & MS_RDONLY)
return 1;
- if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT3_FEATURE_RO_COMPAT_SUPP))
+ if (ext4_has_unknown_ext3_ro_compat_features(sb))
return 0;
return 1;
}
@@ -5586,37 +5228,6 @@ static struct file_system_type ext4_fs_type = {
};
MODULE_ALIAS_FS("ext4");
-static int __init ext4_init_feat_adverts(void)
-{
- struct ext4_features *ef;
- int ret = -ENOMEM;
-
- ef = kzalloc(sizeof(struct ext4_features), GFP_KERNEL);
- if (!ef)
- goto out;
-
- ef->f_kobj.kset = ext4_kset;
- init_completion(&ef->f_kobj_unregister);
- ret = kobject_init_and_add(&ef->f_kobj, &ext4_feat_ktype, NULL,
- "features");
- if (ret) {
- kfree(ef);
- goto out;
- }
-
- ext4_feat = ef;
- ret = 0;
-out:
- return ret;
-}
-
-static void ext4_exit_feat_adverts(void)
-{
- kobject_put(&ext4_feat->f_kobj);
- wait_for_completion(&ext4_feat->f_kobj_unregister);
- kfree(ext4_feat);
-}
-
/* Shared across all ext4 file systems */
wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ];
struct mutex ext4__aio_mutex[EXT4_WQ_HASH_SZ];
@@ -5643,21 +5254,15 @@ static int __init ext4_init_fs(void)
err = ext4_init_pageio();
if (err)
- goto out7;
+ goto out5;
err = ext4_init_system_zone();
if (err)
- goto out6;
- ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
- if (!ext4_kset) {
- err = -ENOMEM;
- goto out5;
- }
- ext4_proc_root = proc_mkdir("fs/ext4", NULL);
+ goto out4;
- err = ext4_init_feat_adverts();
+ err = ext4_init_sysfs();
if (err)
- goto out4;
+ goto out3;
err = ext4_init_mballoc();
if (err)
@@ -5682,16 +5287,12 @@ out1:
ext4_mballoc_ready = 0;
ext4_exit_mballoc();
out2:
- ext4_exit_feat_adverts();
-out4:
- if (ext4_proc_root)
- remove_proc_entry("fs/ext4", NULL);
- kset_unregister(ext4_kset);
-out5:
+ ext4_exit_sysfs();
+out3:
ext4_exit_system_zone();
-out6:
+out4:
ext4_exit_pageio();
-out7:
+out5:
ext4_exit_es();
return err;
@@ -5706,9 +5307,7 @@ static void __exit ext4_exit_fs(void)
unregister_filesystem(&ext4_fs_type);
destroy_inodecache();
ext4_exit_mballoc();
- ext4_exit_feat_adverts();
- remove_proc_entry("fs/ext4", NULL);
- kset_unregister(ext4_kset);
+ ext4_exit_sysfs();
ext4_exit_system_zone();
ext4_exit_pageio();
ext4_exit_es();
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index c677f2c1044b..abe2401ce405 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -57,7 +57,7 @@ static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cook
sizeof(struct ext4_encrypted_symlink_data) - 1) >
max_size) {
/* Symlink data on the disk is corrupted */
- res = -EIO;
+ res = -EFSCORRUPTED;
goto errout;
}
plen = (cstr.len < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) ?
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
new file mode 100644
index 000000000000..1b57c72f4a00
--- /dev/null
+++ b/fs/ext4/sysfs.c
@@ -0,0 +1,448 @@
+/*
+ * linux/fs/ext4/sysfs.c
+ *
+ * Copyright (C) 1992, 1993, 1994, 1995
+ * Remy Card (card@masi.ibp.fr)
+ * Theodore Ts'o (tytso@mit.edu)
+ *
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <linux/proc_fs.h>
+
+#include "ext4.h"
+#include "ext4_jbd2.h"
+
+typedef enum {
+ attr_noop,
+ attr_delayed_allocation_blocks,
+ attr_session_write_kbytes,
+ attr_lifetime_write_kbytes,
+ attr_reserved_clusters,
+ attr_inode_readahead,
+ attr_trigger_test_error,
+ attr_feature,
+ attr_pointer_ui,
+ attr_pointer_atomic,
+} attr_id_t;
+
+typedef enum {
+ ptr_explicit,
+ ptr_ext4_sb_info_offset,
+ ptr_ext4_super_block_offset,
+} attr_ptr_t;
+
+static const char *proc_dirname = "fs/ext4";
+static struct proc_dir_entry *ext4_proc_root;
+
+struct ext4_attr {
+ struct attribute attr;
+ short attr_id;
+ short attr_ptr;
+ union {
+ int offset;
+ void *explicit_ptr;
+ } u;
+};
+
+static ssize_t session_write_kbytes_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ struct super_block *sb = sbi->s_buddy_cache->i_sb;
+
+ if (!sb->s_bdev->bd_part)
+ return snprintf(buf, PAGE_SIZE, "0\n");
+ return snprintf(buf, PAGE_SIZE, "%lu\n",
+ (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+ sbi->s_sectors_written_start) >> 1);
+}
+
+static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ struct super_block *sb = sbi->s_buddy_cache->i_sb;
+
+ if (!sb->s_bdev->bd_part)
+ return snprintf(buf, PAGE_SIZE, "0\n");
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)(sbi->s_kbytes_written +
+ ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+ EXT4_SB(sb)->s_sectors_written_start) >> 1)));
+}
+
+static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
+ struct ext4_sb_info *sbi,
+ const char *buf, size_t count)
+{
+ unsigned long t;
+ int ret;
+
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+
+ if (t && (!is_power_of_2(t) || t > 0x40000000))
+ return -EINVAL;
+
+ sbi->s_inode_readahead_blks = t;
+ return count;
+}
+
+static ssize_t reserved_clusters_store(struct ext4_attr *a,
+ struct ext4_sb_info *sbi,
+ const char *buf, size_t count)
+{
+ unsigned long long val;
+ ext4_fsblk_t clusters = (ext4_blocks_count(sbi->s_es) >>
+ sbi->s_cluster_bits);
+ int ret;
+
+ ret = kstrtoull(skip_spaces(buf), 0, &val);
+ if (!ret || val >= clusters)
+ return -EINVAL;
+
+ atomic64_set(&sbi->s_resv_clusters, val);
+ return count;
+}
+
+static ssize_t trigger_test_error(struct ext4_attr *a,
+ struct ext4_sb_info *sbi,
+ const char *buf, size_t count)
+{
+ int len = count;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (len && buf[len-1] == '\n')
+ len--;
+
+ if (len)
+ ext4_error(sbi->s_sb, "%.*s", len, buf);
+ return count;
+}
+
+#define EXT4_ATTR(_name,_mode,_id) \
+static struct ext4_attr ext4_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .attr_id = attr_##_id, \
+}
+
+#define EXT4_ATTR_FUNC(_name,_mode) EXT4_ATTR(_name,_mode,_name)
+
+#define EXT4_ATTR_FEATURE(_name) EXT4_ATTR(_name, 0444, feature)
+
+#define EXT4_ATTR_OFFSET(_name,_mode,_id,_struct,_elname) \
+static struct ext4_attr ext4_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .attr_id = attr_##_id, \
+ .attr_ptr = ptr_##_struct##_offset, \
+ .u = { \
+ .offset = offsetof(struct _struct, _elname),\
+ }, \
+}
+
+#define EXT4_RO_ATTR_ES_UI(_name,_elname) \
+ EXT4_ATTR_OFFSET(_name, 0444, pointer_ui, ext4_super_block, _elname)
+
+#define EXT4_RW_ATTR_SBI_UI(_name,_elname) \
+ EXT4_ATTR_OFFSET(_name, 0644, pointer_ui, ext4_sb_info, _elname)
+
+#define EXT4_ATTR_PTR(_name,_mode,_id,_ptr) \
+static struct ext4_attr ext4_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .attr_id = attr_##_id, \
+ .attr_ptr = ptr_explicit, \
+ .u = { \
+ .explicit_ptr = _ptr, \
+ }, \
+}
+
+#define ATTR_LIST(name) &ext4_attr_##name.attr
+
+EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444);
+EXT4_ATTR_FUNC(session_write_kbytes, 0444);
+EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
+EXT4_ATTR_FUNC(reserved_clusters, 0644);
+
+EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
+ ext4_sb_info, s_inode_readahead_blks);
+EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
+EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
+EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
+EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
+EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
+EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
+EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
+EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error);
+EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(err_ratelimit_burst, s_err_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
+EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
+EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
+EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time);
+EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time);
+
+static unsigned int old_bump_val = 128;
+EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
+
+static struct attribute *ext4_attrs[] = {
+ ATTR_LIST(delayed_allocation_blocks),
+ ATTR_LIST(session_write_kbytes),
+ ATTR_LIST(lifetime_write_kbytes),
+ ATTR_LIST(reserved_clusters),
+ ATTR_LIST(inode_readahead_blks),
+ ATTR_LIST(inode_goal),
+ ATTR_LIST(mb_stats),
+ ATTR_LIST(mb_max_to_scan),
+ ATTR_LIST(mb_min_to_scan),
+ ATTR_LIST(mb_order2_req),
+ ATTR_LIST(mb_stream_req),
+ ATTR_LIST(mb_group_prealloc),
+ ATTR_LIST(max_writeback_mb_bump),
+ ATTR_LIST(extent_max_zeroout_kb),
+ ATTR_LIST(trigger_fs_error),
+ ATTR_LIST(err_ratelimit_interval_ms),
+ ATTR_LIST(err_ratelimit_burst),
+ ATTR_LIST(warning_ratelimit_interval_ms),
+ ATTR_LIST(warning_ratelimit_burst),
+ ATTR_LIST(msg_ratelimit_interval_ms),
+ ATTR_LIST(msg_ratelimit_burst),
+ ATTR_LIST(errors_count),
+ ATTR_LIST(first_error_time),
+ ATTR_LIST(last_error_time),
+ NULL,
+};
+
+/* Features this copy of ext4 supports */
+EXT4_ATTR_FEATURE(lazy_itable_init);
+EXT4_ATTR_FEATURE(batched_discard);
+EXT4_ATTR_FEATURE(meta_bg_resize);
+EXT4_ATTR_FEATURE(encryption);
+EXT4_ATTR_FEATURE(metadata_csum_seed);
+
+static struct attribute *ext4_feat_attrs[] = {
+ ATTR_LIST(lazy_itable_init),
+ ATTR_LIST(batched_discard),
+ ATTR_LIST(meta_bg_resize),
+ ATTR_LIST(encryption),
+ ATTR_LIST(metadata_csum_seed),
+ NULL,
+};
+
+static void *calc_ptr(struct ext4_attr *a, struct ext4_sb_info *sbi)
+{
+ switch (a->attr_ptr) {
+ case ptr_explicit:
+ return a->u.explicit_ptr;
+ case ptr_ext4_sb_info_offset:
+ return (void *) (((char *) sbi) + a->u.offset);
+ case ptr_ext4_super_block_offset:
+ return (void *) (((char *) sbi->s_es) + a->u.offset);
+ }
+ return NULL;
+}
+
+static ssize_t ext4_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+ s_kobj);
+ struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+ void *ptr = calc_ptr(a, sbi);
+
+ switch (a->attr_id) {
+ case attr_delayed_allocation_blocks:
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (s64) EXT4_C2B(sbi,
+ percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
+ case attr_session_write_kbytes:
+ return session_write_kbytes_show(a, sbi, buf);
+ case attr_lifetime_write_kbytes:
+ return lifetime_write_kbytes_show(a, sbi, buf);
+ case attr_reserved_clusters:
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (unsigned long long)
+ atomic64_read(&sbi->s_resv_clusters));
+ case attr_inode_readahead:
+ case attr_pointer_ui:
+ if (!ptr)
+ return 0;
+ return snprintf(buf, PAGE_SIZE, "%u\n",
+ *((unsigned int *) ptr));
+ case attr_pointer_atomic:
+ if (!ptr)
+ return 0;
+ return snprintf(buf, PAGE_SIZE, "%d\n",
+ atomic_read((atomic_t *) ptr));
+ case attr_feature:
+ return snprintf(buf, PAGE_SIZE, "supported\n");
+ }
+
+ return 0;
+}
+
+static ssize_t ext4_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+ s_kobj);
+ struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+ void *ptr = calc_ptr(a, sbi);
+ unsigned long t;
+ int ret;
+
+ switch (a->attr_id) {
+ case attr_reserved_clusters:
+ return reserved_clusters_store(a, sbi, buf, len);
+ case attr_pointer_ui:
+ if (!ptr)
+ return 0;
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+ *((unsigned int *) ptr) = t;
+ return len;
+ case attr_inode_readahead:
+ return inode_readahead_blks_store(a, sbi, buf, len);
+ case attr_trigger_test_error:
+ return trigger_test_error(a, sbi, buf, len);
+ }
+ return 0;
+}
+
+static void ext4_sb_release(struct kobject *kobj)
+{
+ struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+ s_kobj);
+ complete(&sbi->s_kobj_unregister);
+}
+
+static const struct sysfs_ops ext4_attr_ops = {
+ .show = ext4_attr_show,
+ .store = ext4_attr_store,
+};
+
+static struct kobj_type ext4_sb_ktype = {
+ .default_attrs = ext4_attrs,
+ .sysfs_ops = &ext4_attr_ops,
+ .release = ext4_sb_release,
+};
+
+static struct kobj_type ext4_ktype = {
+ .sysfs_ops = &ext4_attr_ops,
+};
+
+static struct kset ext4_kset = {
+ .kobj = {.ktype = &ext4_ktype},
+};
+
+static struct kobj_type ext4_feat_ktype = {
+ .default_attrs = ext4_feat_attrs,
+ .sysfs_ops = &ext4_attr_ops,
+};
+
+static struct kobject ext4_feat = {
+ .kset = &ext4_kset,
+};
+
+#define PROC_FILE_SHOW_DEFN(name) \
+static int name##_open(struct inode *inode, struct file *file) \
+{ \
+ return single_open(file, ext4_seq_##name##_show, PDE_DATA(inode)); \
+} \
+\
+const struct file_operations ext4_seq_##name##_fops = { \
+ .owner = THIS_MODULE, \
+ .open = name##_open, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = single_release, \
+}
+
+#define PROC_FILE_LIST(name) \
+ { __stringify(name), &ext4_seq_##name##_fops }
+
+PROC_FILE_SHOW_DEFN(es_shrinker_info);
+PROC_FILE_SHOW_DEFN(options);
+
+static struct ext4_proc_files {
+ const char *name;
+ const struct file_operations *fops;
+} proc_files[] = {
+ PROC_FILE_LIST(options),
+ PROC_FILE_LIST(es_shrinker_info),
+ PROC_FILE_LIST(mb_groups),
+ { NULL, NULL },
+};
+
+int ext4_register_sysfs(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_proc_files *p;
+ int err;
+
+ sbi->s_kobj.kset = &ext4_kset;
+ init_completion(&sbi->s_kobj_unregister);
+ err = kobject_init_and_add(&sbi->s_kobj, &ext4_sb_ktype, NULL,
+ "%s", sb->s_id);
+ if (err)
+ return err;
+
+ if (ext4_proc_root)
+ sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
+
+ if (sbi->s_proc) {
+ for (p = proc_files; p->name; p++)
+ proc_create_data(p->name, S_IRUGO, sbi->s_proc,
+ p->fops, sb);
+ }
+ return 0;
+}
+
+void ext4_unregister_sysfs(struct super_block *sb)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct ext4_proc_files *p;
+
+ if (sbi->s_proc) {
+ for (p = proc_files; p->name; p++)
+ remove_proc_entry(p->name, sbi->s_proc);
+ remove_proc_entry(sb->s_id, ext4_proc_root);
+ }
+ kobject_del(&sbi->s_kobj);
+}
+
+int __init ext4_init_sysfs(void)
+{
+ int ret;
+
+ kobject_set_name(&ext4_kset.kobj, "ext4");
+ ext4_kset.kobj.parent = fs_kobj;
+ ret = kset_register(&ext4_kset);
+ if (ret)
+ return ret;
+
+ ret = kobject_init_and_add(&ext4_feat, &ext4_feat_ktype,
+ NULL, "features");
+ if (ret)
+ kset_unregister(&ext4_kset);
+ else
+ ext4_proc_root = proc_mkdir(proc_dirname, NULL);
+ return ret;
+}
+
+void ext4_exit_sysfs(void)
+{
+ kobject_put(&ext4_feat);
+ kset_unregister(&ext4_kset);
+ remove_proc_entry(proc_dirname, NULL);
+ ext4_proc_root = NULL;
+}
+
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 16e28c08d1e8..6b6b3e751f8c 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -195,7 +195,7 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end,
while (!IS_LAST_ENTRY(e)) {
struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e);
if ((void *)next >= end)
- return -EIO;
+ return -EFSCORRUPTED;
e = next;
}
@@ -205,7 +205,7 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end,
(void *)e + sizeof(__u32) ||
value_start + le16_to_cpu(entry->e_value_offs) +
le32_to_cpu(entry->e_value_size) > end))
- return -EIO;
+ return -EFSCORRUPTED;
entry = EXT4_XATTR_NEXT(entry);
}
@@ -222,9 +222,9 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh)
if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
BHDR(bh)->h_blocks != cpu_to_le32(1))
- return -EIO;
+ return -EFSCORRUPTED;
if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh)))
- return -EIO;
+ return -EFSBADCRC;
error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size,
bh->b_data);
if (!error)
@@ -239,7 +239,7 @@ ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size)
if (entry->e_value_block != 0 || value_size > size ||
le16_to_cpu(entry->e_value_offs) + value_size > size)
- return -EIO;
+ return -EFSCORRUPTED;
return 0;
}
@@ -266,7 +266,7 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index,
}
*pentry = entry;
if (!cmp && ext4_xattr_check_entry(entry, size))
- return -EIO;
+ return -EFSCORRUPTED;
return cmp ? -ENODATA : 0;
}
@@ -297,13 +297,13 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
bad_block:
EXT4_ERROR_INODE(inode, "bad block %llu",
EXT4_I(inode)->i_file_acl);
- error = -EIO;
+ error = -EFSCORRUPTED;
goto cleanup;
}
ext4_xattr_cache_insert(ext4_mb_cache, bh);
entry = BFIRST(bh);
error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
- if (error == -EIO)
+ if (error == -EFSCORRUPTED)
goto bad_block;
if (error)
goto cleanup;
@@ -405,10 +405,9 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
ext4_xattr_handler(entry->e_name_index);
if (handler) {
- size_t size = handler->list(dentry, buffer, rest,
- entry->e_name,
- entry->e_name_len,
- handler->flags);
+ size_t size = handler->list(handler, dentry, buffer,
+ rest, entry->e_name,
+ entry->e_name_len);
if (buffer) {
if (size > rest)
return -ERANGE;
@@ -445,7 +444,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
if (ext4_xattr_check_block(inode, bh)) {
EXT4_ERROR_INODE(inode, "bad block %llu",
EXT4_I(inode)->i_file_acl);
- error = -EIO;
+ error = -EFSCORRUPTED;
goto cleanup;
}
ext4_xattr_cache_insert(ext4_mb_cache, bh);
@@ -525,12 +524,12 @@ errout:
static void ext4_xattr_update_super_block(handle_t *handle,
struct super_block *sb)
{
- if (EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR))
+ if (ext4_has_feature_xattr(sb))
return;
BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access");
if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) {
- EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR);
+ ext4_set_feature_xattr(sb);
ext4_handle_dirty_super(handle, sb);
}
}
@@ -751,7 +750,7 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
if (ext4_xattr_check_block(inode, bs->bh)) {
EXT4_ERROR_INODE(inode, "bad block %llu",
EXT4_I(inode)->i_file_acl);
- error = -EIO;
+ error = -EFSCORRUPTED;
goto cleanup;
}
/* Find the named attribute. */
@@ -811,7 +810,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
bs->bh);
}
unlock_buffer(bs->bh);
- if (error == -EIO)
+ if (error == -EFSCORRUPTED)
goto bad_block;
if (!error)
error = ext4_handle_dirty_xattr_block(handle,
@@ -855,7 +854,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
}
error = ext4_xattr_set_entry(i, s);
- if (error == -EIO)
+ if (error == -EFSCORRUPTED)
goto bad_block;
if (error)
goto cleanup;
@@ -1314,7 +1313,7 @@ retry:
if (ext4_xattr_check_block(inode, bh)) {
EXT4_ERROR_INODE(inode, "bad block %llu",
EXT4_I(inode)->i_file_acl);
- error = -EIO;
+ error = -EFSCORRUPTED;
goto cleanup;
}
base = BHDR(bh);
@@ -1579,7 +1578,7 @@ ext4_xattr_cmp(struct ext4_xattr_header *header1,
memcmp(entry1->e_name, entry2->e_name, entry1->e_name_len))
return 1;
if (entry1->e_value_block != 0 || entry2->e_value_block != 0)
- return -EIO;
+ return -EFSCORRUPTED;
if (memcmp((char *)header1 + le16_to_cpu(entry1->e_value_offs),
(char *)header2 + le16_to_cpu(entry2->e_value_offs),
le32_to_cpu(entry1->e_value_size)))
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 95d90e0560f0..36f4c1a84c21 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -12,8 +12,9 @@
#include "xattr.h"
static size_t
-ext4_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
+ext4_xattr_security_list(const struct xattr_handler *handler,
+ struct dentry *dentry, char *list, size_t list_size,
+ const char *name, size_t name_len)
{
const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
const size_t total_len = prefix_len + name_len + 1;
@@ -28,8 +29,9 @@ ext4_xattr_security_list(struct dentry *dentry, char *list, size_t list_size,
}
static int
-ext4_xattr_security_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+ext4_xattr_security_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
if (strcmp(name, "") == 0)
return -EINVAL;
@@ -38,8 +40,9 @@ ext4_xattr_security_get(struct dentry *dentry, const char *name,
}
static int
-ext4_xattr_security_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+ext4_xattr_security_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
if (strcmp(name, "") == 0)
return -EINVAL;
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index 891ee2ddfbd6..488089053342 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -13,8 +13,9 @@
#include "xattr.h"
static size_t
-ext4_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
+ext4_xattr_trusted_list(const struct xattr_handler *handler,
+ struct dentry *dentry, char *list, size_t list_size,
+ const char *name, size_t name_len)
{
const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
@@ -31,8 +32,9 @@ ext4_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
}
static int
-ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer,
- size_t size, int type)
+ext4_xattr_trusted_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name, void *buffer,
+ size_t size)
{
if (strcmp(name, "") == 0)
return -EINVAL;
@@ -41,8 +43,9 @@ ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer,
}
static int
-ext4_xattr_trusted_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+ext4_xattr_trusted_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
if (strcmp(name, "") == 0)
return -EINVAL;
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index 6ed932b3c043..d2dec3364062 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -12,8 +12,9 @@
#include "xattr.h"
static size_t
-ext4_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
+ext4_xattr_user_list(const struct xattr_handler *handler,
+ struct dentry *dentry, char *list, size_t list_size,
+ const char *name, size_t name_len)
{
const size_t prefix_len = XATTR_USER_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
@@ -30,8 +31,9 @@ ext4_xattr_user_list(struct dentry *dentry, char *list, size_t list_size,
}
static int
-ext4_xattr_user_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+ext4_xattr_user_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
if (strcmp(name, "") == 0)
return -EINVAL;
@@ -42,8 +44,9 @@ ext4_xattr_user_get(struct dentry *dentry, const char *name,
}
static int
-ext4_xattr_user_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+ext4_xattr_user_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
if (strcmp(name, "") == 0)
return -EINVAL;
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index c5a38e352a80..f661d80474be 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -47,7 +47,8 @@ repeat:
/*
* We guarantee no failure on the returned page.
*/
-struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
+static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index,
+ bool is_meta)
{
struct address_space *mapping = META_MAPPING(sbi);
struct page *page;
@@ -58,6 +59,9 @@ struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
.blk_addr = index,
.encrypted_page = NULL,
};
+
+ if (unlikely(!is_meta))
+ fio.rw &= ~REQ_META;
repeat:
page = grab_cache_page(mapping, index);
if (!page) {
@@ -91,6 +95,17 @@ out:
return page;
}
+struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+ return __get_meta_page(sbi, index, true);
+}
+
+/* for POR only */
+struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index)
+{
+ return __get_meta_page(sbi, index, false);
+}
+
bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type)
{
switch (type) {
@@ -125,7 +140,8 @@ bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type)
/*
* Readahead CP/NAT/SIT/SSA pages
*/
-int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type)
+int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
+ int type, bool sync)
{
block_t prev_blk_addr = 0;
struct page *page;
@@ -133,10 +149,13 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type
struct f2fs_io_info fio = {
.sbi = sbi,
.type = META,
- .rw = READ_SYNC | REQ_META | REQ_PRIO,
+ .rw = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : READA,
.encrypted_page = NULL,
};
+ if (unlikely(type == META_POR))
+ fio.rw &= ~REQ_META;
+
for (; nrpages-- > 0; blkno++) {
if (!is_valid_blkaddr(sbi, blkno, type))
@@ -196,7 +215,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index)
f2fs_put_page(page, 0);
if (readahead)
- ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR);
+ ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR, true);
}
static int f2fs_write_meta_page(struct page *page,
@@ -257,7 +276,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
long nr_to_write)
{
struct address_space *mapping = META_MAPPING(sbi);
- pgoff_t index = 0, end = LONG_MAX;
+ pgoff_t index = 0, end = LONG_MAX, prev = LONG_MAX;
struct pagevec pvec;
long nwritten = 0;
struct writeback_control wbc = {
@@ -277,6 +296,13 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type,
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
+ if (prev == LONG_MAX)
+ prev = page->index - 1;
+ if (nr_to_write != LONG_MAX && page->index != prev + 1) {
+ pagevec_release(&pvec);
+ goto stop;
+ }
+
lock_page(page);
if (unlikely(page->mapping != mapping)) {
@@ -297,13 +323,14 @@ continue_unlock:
break;
}
nwritten++;
+ prev = page->index;
if (unlikely(nwritten >= nr_to_write))
break;
}
pagevec_release(&pvec);
cond_resched();
}
-
+stop:
if (nwritten)
f2fs_submit_merged_bio(sbi, type, WRITE);
@@ -495,7 +522,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi)
start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi);
orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi);
- ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP);
+ ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true);
for (i = 0; i < orphan_blocks; i++) {
struct page *page = get_meta_page(sbi, start_blk + i);
@@ -1000,6 +1027,11 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
start_blk = __start_cp_addr(sbi);
+ /* need to wait for end_io results */
+ wait_on_all_pages_writeback(sbi);
+ if (unlikely(f2fs_cp_error(sbi)))
+ return;
+
/* write out checkpoint buffer at block 0 */
update_meta_page(sbi, ckpt, start_blk++);
@@ -1109,6 +1141,9 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
if (cpc->reason == CP_RECOVERY)
f2fs_msg(sbi->sb, KERN_NOTICE,
"checkpoint: version = %llx", ckpt_ver);
+
+ /* do checkpoint periodically */
+ sbi->cp_expires = round_jiffies_up(jiffies + HZ * sbi->cp_interval);
out:
mutex_unlock(&sbi->cp_mutex);
trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c
index 9f77de2ef317..5de2d866a25c 100644
--- a/fs/f2fs/crypto_key.c
+++ b/fs/f2fs/crypto_key.c
@@ -122,7 +122,7 @@ int _f2fs_get_encryption_info(struct inode *inode)
struct key *keyring_key = NULL;
struct f2fs_encryption_key *master_key;
struct f2fs_encryption_context ctx;
- struct user_key_payload *ukp;
+ const struct user_key_payload *ukp;
struct crypto_ablkcipher *ctfm;
const char *cipher_str;
char raw_key[F2FS_MAX_KEY_SIZE];
@@ -199,7 +199,7 @@ retry:
}
crypt_info->ci_keyring_key = keyring_key;
BUG_ON(keyring_key->type != &key_type_logon);
- ukp = ((struct user_key_payload *)keyring_key->payload.data);
+ ukp = user_key_payload(keyring_key);
if (ukp->datalen != sizeof(struct f2fs_encryption_key)) {
res = -EINVAL;
goto out;
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index a82abe921b89..972eab7ac071 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -275,7 +275,8 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index)
return f2fs_reserve_block(dn, index);
}
-struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw)
+struct page *get_read_data_page(struct inode *inode, pgoff_t index,
+ int rw, bool for_write)
{
struct address_space *mapping = inode->i_mapping;
struct dnode_of_data dn;
@@ -292,7 +293,7 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw)
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
return read_mapping_page(mapping, index, NULL);
- page = grab_cache_page(mapping, index);
+ page = f2fs_grab_cache_page(mapping, index, for_write);
if (!page)
return ERR_PTR(-ENOMEM);
@@ -352,7 +353,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index)
return page;
f2fs_put_page(page, 0);
- page = get_read_data_page(inode, index, READ_SYNC);
+ page = get_read_data_page(inode, index, READ_SYNC, false);
if (IS_ERR(page))
return page;
@@ -372,12 +373,13 @@ struct page *find_data_page(struct inode *inode, pgoff_t index)
* Because, the callers, functions in dir.c and GC, should be able to know
* whether this page exists or not.
*/
-struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
+struct page *get_lock_data_page(struct inode *inode, pgoff_t index,
+ bool for_write)
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
repeat:
- page = get_read_data_page(inode, index, READ_SYNC);
+ page = get_read_data_page(inode, index, READ_SYNC, for_write);
if (IS_ERR(page))
return page;
@@ -411,7 +413,7 @@ struct page *get_new_data_page(struct inode *inode,
struct dnode_of_data dn;
int err;
repeat:
- page = grab_cache_page(mapping, index);
+ page = f2fs_grab_cache_page(mapping, index, true);
if (!page) {
/*
* before exiting, we should make sure ipage will be released
@@ -439,7 +441,7 @@ repeat:
} else {
f2fs_put_page(page, 1);
- page = get_read_data_page(inode, index, READ_SYNC);
+ page = get_read_data_page(inode, index, READ_SYNC, true);
if (IS_ERR(page))
goto repeat;
@@ -447,9 +449,9 @@ repeat:
lock_page(page);
}
got_it:
- if (new_i_size &&
- i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) {
- i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT));
+ if (new_i_size && i_size_read(inode) <
+ ((loff_t)(index + 1) << PAGE_CACHE_SHIFT)) {
+ i_size_write(inode, ((loff_t)(index + 1) << PAGE_CACHE_SHIFT));
/* Only the directory inode sets new_i_size */
set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR);
}
@@ -489,8 +491,9 @@ alloc:
/* update i_size */
fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
dn->ofs_in_node;
- if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT))
- i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT));
+ if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT))
+ i_size_write(dn->inode,
+ ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT));
/* direct IO doesn't use extent cache to maximize the performance */
f2fs_drop_largest_extent(dn->inode, fofs);
@@ -523,6 +526,9 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset,
while (dn.ofs_in_node < end_offset && len) {
block_t blkaddr;
+ if (unlikely(f2fs_cp_error(sbi)))
+ goto sync_out;
+
blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
if (blkaddr == NULL_ADDR || blkaddr == NEW_ADDR) {
if (__allocate_data_block(&dn))
@@ -565,6 +571,7 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
{
unsigned int maxblocks = map->m_len;
struct dnode_of_data dn;
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA;
pgoff_t pgofs, end_offset;
int err = 0, ofs = 1;
@@ -595,40 +602,40 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
err = 0;
goto unlock_out;
}
- if (dn.data_blkaddr == NEW_ADDR) {
- if (flag == F2FS_GET_BLOCK_BMAP) {
- err = -ENOENT;
- goto put_out;
- } else if (flag == F2FS_GET_BLOCK_READ ||
- flag == F2FS_GET_BLOCK_DIO) {
- goto put_out;
+
+ if (dn.data_blkaddr == NEW_ADDR || dn.data_blkaddr == NULL_ADDR) {
+ if (create) {
+ if (unlikely(f2fs_cp_error(sbi))) {
+ err = -EIO;
+ goto put_out;
+ }
+ err = __allocate_data_block(&dn);
+ if (err)
+ goto put_out;
+ allocated = true;
+ map->m_flags = F2FS_MAP_NEW;
+ } else {
+ if (flag != F2FS_GET_BLOCK_FIEMAP ||
+ dn.data_blkaddr != NEW_ADDR) {
+ if (flag == F2FS_GET_BLOCK_BMAP)
+ err = -ENOENT;
+ goto put_out;
+ }
+
+ /*
+ * preallocated unwritten block should be mapped
+ * for fiemap.
+ */
+ if (dn.data_blkaddr == NEW_ADDR)
+ map->m_flags = F2FS_MAP_UNWRITTEN;
}
- /*
- * if it is in fiemap call path (flag = F2FS_GET_BLOCK_FIEMAP),
- * mark it as mapped and unwritten block.
- */
}
- if (dn.data_blkaddr != NULL_ADDR) {
- map->m_flags = F2FS_MAP_MAPPED;
- map->m_pblk = dn.data_blkaddr;
- if (dn.data_blkaddr == NEW_ADDR)
- map->m_flags |= F2FS_MAP_UNWRITTEN;
- } else if (create) {
- err = __allocate_data_block(&dn);
- if (err)
- goto put_out;
- allocated = true;
- map->m_flags = F2FS_MAP_NEW | F2FS_MAP_MAPPED;
- map->m_pblk = dn.data_blkaddr;
- } else {
- if (flag == F2FS_GET_BLOCK_BMAP)
- err = -ENOENT;
- goto put_out;
- }
+ map->m_flags |= F2FS_MAP_MAPPED;
+ map->m_pblk = dn.data_blkaddr;
+ map->m_len = 1;
end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
- map->m_len = 1;
dn.ofs_in_node++;
pgofs++;
@@ -647,23 +654,35 @@ get_next:
goto unlock_out;
}
- if (dn.data_blkaddr == NEW_ADDR &&
- flag != F2FS_GET_BLOCK_FIEMAP)
- goto put_out;
-
end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
}
if (maxblocks > map->m_len) {
block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
- if (blkaddr == NULL_ADDR && create) {
- err = __allocate_data_block(&dn);
- if (err)
- goto sync_out;
- allocated = true;
- map->m_flags |= F2FS_MAP_NEW;
- blkaddr = dn.data_blkaddr;
+
+ if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) {
+ if (create) {
+ if (unlikely(f2fs_cp_error(sbi))) {
+ err = -EIO;
+ goto sync_out;
+ }
+ err = __allocate_data_block(&dn);
+ if (err)
+ goto sync_out;
+ allocated = true;
+ map->m_flags |= F2FS_MAP_NEW;
+ blkaddr = dn.data_blkaddr;
+ } else {
+ /*
+ * we only merge preallocated unwritten blocks
+ * for fiemap.
+ */
+ if (flag != F2FS_GET_BLOCK_FIEMAP ||
+ blkaddr != NEW_ADDR)
+ goto sync_out;
+ }
}
+
/* Give more consecutive addresses for the readahead */
if ((map->m_pblk != NEW_ADDR &&
blkaddr == (map->m_pblk + ofs)) ||
@@ -752,6 +771,12 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (ret)
return ret;
+ if (f2fs_has_inline_data(inode)) {
+ ret = f2fs_inline_data_fiemap(inode, fieinfo, start, len);
+ if (ret != -EAGAIN)
+ return ret;
+ }
+
mutex_lock(&inode->i_mutex);
if (len >= isize) {
@@ -903,7 +928,8 @@ static int f2fs_mpage_readpages(struct address_space *mapping,
map.m_lblk = block_in_file;
map.m_len = last_block - block_in_file;
- if (f2fs_map_blocks(inode, &map, 0, false))
+ if (f2fs_map_blocks(inode, &map, 0,
+ F2FS_GET_BLOCK_READ))
goto set_error_page;
}
got_it:
@@ -936,21 +962,14 @@ submit_and_realloc:
if (f2fs_encrypted_inode(inode) &&
S_ISREG(inode->i_mode)) {
- struct page *cpage;
ctx = f2fs_get_crypto_ctx(inode);
if (IS_ERR(ctx))
goto set_error_page;
/* wait the page to be moved by cleaning */
- cpage = find_lock_page(
- META_MAPPING(F2FS_I_SB(inode)),
- block_nr);
- if (cpage) {
- f2fs_wait_on_page_writeback(cpage,
- DATA);
- f2fs_put_page(cpage, 1);
- }
+ f2fs_wait_on_encrypted_page_writeback(
+ F2FS_I_SB(inode), block_nr);
}
bio = bio_alloc(GFP_KERNEL,
@@ -1012,6 +1031,9 @@ static int f2fs_read_data_pages(struct file *file,
struct list_head *pages, unsigned nr_pages)
{
struct inode *inode = file->f_mapping->host;
+ struct page *page = list_entry(pages->prev, struct page, lru);
+
+ trace_f2fs_readpages(inode, page, nr_pages);
/* If the file has inline data, skip readpages */
if (f2fs_has_inline_data(inode))
@@ -1041,6 +1063,11 @@ int do_write_data_page(struct f2fs_io_info *fio)
}
if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) {
+
+ /* wait for GCed encrypted page writeback */
+ f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode),
+ fio->blk_addr);
+
fio->encrypted_page = f2fs_encrypt(inode, fio->page);
if (IS_ERR(fio->encrypted_page)) {
err = PTR_ERR(fio->encrypted_page);
@@ -1429,6 +1456,10 @@ put_next:
f2fs_wait_on_page_writeback(page, DATA);
+ /* wait for GCed encrypted page writeback */
+ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+ f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr);
+
if (len == PAGE_CACHE_SIZE)
goto out_update;
if (PageUptodate(page))
@@ -1551,10 +1582,16 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
- if (iov_iter_rw(iter) == WRITE)
+ if (iov_iter_rw(iter) == WRITE) {
__allocate_data_blocks(inode, offset, count);
+ if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) {
+ err = -EIO;
+ goto out;
+ }
+ }
err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio);
+out:
if (err < 0 && iov_iter_rw(iter) == WRITE)
f2fs_write_failed(mapping, offset + count);
@@ -1636,12 +1673,13 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
{
struct inode *inode = mapping->host;
- /* we don't need to use inline_data strictly */
- if (f2fs_has_inline_data(inode)) {
- int err = f2fs_convert_inline_inode(inode);
- if (err)
- return err;
- }
+ if (f2fs_has_inline_data(inode))
+ return 0;
+
+ /* make sure allocating whole blocks */
+ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+ filemap_write_and_wait(mapping);
+
return generic_block_bmap(mapping, block, get_data_block_bmap);
}
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index d013d8479753..478e5d54154f 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -33,11 +33,11 @@ static void update_general_status(struct f2fs_sb_info *sbi)
int i;
/* validation check of the segment numbers */
- si->hit_largest = atomic_read(&sbi->read_hit_largest);
- si->hit_cached = atomic_read(&sbi->read_hit_cached);
- si->hit_rbtree = atomic_read(&sbi->read_hit_rbtree);
+ si->hit_largest = atomic64_read(&sbi->read_hit_largest);
+ si->hit_cached = atomic64_read(&sbi->read_hit_cached);
+ si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree);
si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree;
- si->total_ext = atomic_read(&sbi->total_hit_ext);
+ si->total_ext = atomic64_read(&sbi->total_hit_ext);
si->ext_tree = sbi->total_ext_tree;
si->ext_node = atomic_read(&sbi->total_ext_node);
si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
@@ -118,7 +118,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
}
}
dist = div_u64(MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec, 100);
- si->bimodal = div_u64(bimodal, dist);
+ si->bimodal = div64_u64(bimodal, dist);
if (si->dirty_count)
si->avg_vblocks = div_u64(total_vblocks, ndirty);
else
@@ -198,9 +198,9 @@ get_cache:
si->page_mem = 0;
npages = NODE_MAPPING(sbi)->nrpages;
- si->page_mem += npages << PAGE_CACHE_SHIFT;
+ si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT;
npages = META_MAPPING(sbi)->nrpages;
- si->page_mem += npages << PAGE_CACHE_SHIFT;
+ si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT;
}
static int stat_show(struct seq_file *s, void *v)
@@ -283,12 +283,12 @@ static int stat_show(struct seq_file *s, void *v)
seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks,
si->bg_node_blks);
seq_puts(s, "\nExtent Cache:\n");
- seq_printf(s, " - Hit Count: L1-1:%d L1-2:%d L2:%d\n",
+ seq_printf(s, " - Hit Count: L1-1:%llu L1-2:%llu L2:%llu\n",
si->hit_largest, si->hit_cached,
si->hit_rbtree);
- seq_printf(s, " - Hit Ratio: %d%% (%d / %d)\n",
+ seq_printf(s, " - Hit Ratio: %llu%% (%llu / %llu)\n",
!si->total_ext ? 0 :
- (si->hit_total * 100) / si->total_ext,
+ div64_u64(si->hit_total * 100, si->total_ext),
si->hit_total, si->total_ext);
seq_printf(s, " - Inner Struct Count: tree: %d, node: %d\n",
si->ext_tree, si->ext_node);
@@ -333,13 +333,13 @@ static int stat_show(struct seq_file *s, void *v)
/* memory footprint */
update_mem_info(si->sbi);
- seq_printf(s, "\nMemory: %u KB\n",
+ seq_printf(s, "\nMemory: %llu KB\n",
(si->base_mem + si->cache_mem + si->page_mem) >> 10);
- seq_printf(s, " - static: %u KB\n",
+ seq_printf(s, " - static: %llu KB\n",
si->base_mem >> 10);
- seq_printf(s, " - cached: %u KB\n",
+ seq_printf(s, " - cached: %llu KB\n",
si->cache_mem >> 10);
- seq_printf(s, " - paged : %u KB\n",
+ seq_printf(s, " - paged : %llu KB\n",
si->page_mem >> 10);
}
mutex_unlock(&f2fs_stat_mutex);
@@ -378,10 +378,10 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
si->sbi = sbi;
sbi->stat_info = si;
- atomic_set(&sbi->total_hit_ext, 0);
- atomic_set(&sbi->read_hit_rbtree, 0);
- atomic_set(&sbi->read_hit_largest, 0);
- atomic_set(&sbi->read_hit_cached, 0);
+ atomic64_set(&sbi->total_hit_ext, 0);
+ atomic64_set(&sbi->read_hit_rbtree, 0);
+ atomic64_set(&sbi->read_hit_largest, 0);
+ atomic64_set(&sbi->read_hit_cached, 0);
atomic_set(&sbi->inline_xattr, 0);
atomic_set(&sbi->inline_inode, 0);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 8f15fc134040..7c1678ba8f92 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -258,7 +258,7 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p)
if (f2fs_has_inline_dentry(dir))
return f2fs_parent_inline_dir(dir, p);
- page = get_lock_data_page(dir, 0);
+ page = get_lock_data_page(dir, 0, false);
if (IS_ERR(page))
return NULL;
@@ -740,7 +740,7 @@ bool f2fs_empty_dir(struct inode *dir)
return f2fs_empty_inline_dir(dir);
for (bidx = 0; bidx < nblock; bidx++) {
- dentry_page = get_lock_data_page(dir, bidx);
+ dentry_page = get_lock_data_page(dir, bidx, false);
if (IS_ERR(dentry_page)) {
if (PTR_ERR(dentry_page) == -ENOENT)
continue;
@@ -787,7 +787,6 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
else
d_type = DT_UNKNOWN;
- /* encrypted case */
de_name.name = d->filename[bit_pos];
de_name.len = le16_to_cpu(de->name_len);
@@ -795,12 +794,20 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
int save_len = fstr->len;
int ret;
+ de_name.name = kmalloc(de_name.len, GFP_NOFS);
+ if (!de_name.name)
+ return false;
+
+ memcpy(de_name.name, d->filename[bit_pos], de_name.len);
+
ret = f2fs_fname_disk_to_usr(d->inode, &de->hash_code,
&de_name, fstr);
- de_name = *fstr;
- fstr->len = save_len;
+ kfree(de_name.name);
if (ret < 0)
return true;
+
+ de_name = *fstr;
+ fstr->len = save_len;
}
if (!dir_emit(ctx, de_name.name, de_name.len,
@@ -847,7 +854,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES));
for (; n < npages; n++) {
- dentry_page = get_lock_data_page(inode, n);
+ dentry_page = get_lock_data_page(inode, n, false);
if (IS_ERR(dentry_page))
continue;
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 997ac86f2a1d..7ddba812e11b 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -155,11 +155,12 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
return count - et->count;
}
-static void __drop_largest_extent(struct inode *inode, pgoff_t fofs)
+static void __drop_largest_extent(struct inode *inode,
+ pgoff_t fofs, unsigned int len)
{
struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest;
- if (largest->fofs <= fofs && largest->fofs + largest->len > fofs)
+ if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs)
largest->len = 0;
}
@@ -168,7 +169,7 @@ void f2fs_drop_largest_extent(struct inode *inode, pgoff_t fofs)
if (!f2fs_may_extent_tree(inode))
return;
- __drop_largest_extent(inode, fofs);
+ __drop_largest_extent(inode, fofs, 1);
}
void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext)
@@ -350,8 +351,7 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi,
}
if (en) {
- if (en->ei.len > et->largest.len)
- et->largest = en->ei;
+ __try_update_largest_extent(et, en);
et->cached_en = en;
}
return en;
@@ -388,18 +388,17 @@ do_insert:
if (!en)
return NULL;
- if (en->ei.len > et->largest.len)
- et->largest = en->ei;
+ __try_update_largest_extent(et, en);
et->cached_en = en;
return en;
}
-unsigned int f2fs_update_extent_tree_range(struct inode *inode,
+static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
pgoff_t fofs, block_t blkaddr, unsigned int len)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct extent_tree *et = F2FS_I(inode)->extent_tree;
- struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL;
+ struct extent_node *en = NULL, *en1 = NULL;
struct extent_node *prev_en = NULL, *next_en = NULL;
struct extent_info ei, dei, prev;
struct rb_node **insert_p = NULL, *insert_parent = NULL;
@@ -409,6 +408,8 @@ unsigned int f2fs_update_extent_tree_range(struct inode *inode,
if (!et)
return false;
+ trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len);
+
write_lock(&et->lock);
if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) {
@@ -419,148 +420,99 @@ unsigned int f2fs_update_extent_tree_range(struct inode *inode,
prev = et->largest;
dei.len = 0;
- /* we do not guarantee that the largest extent is cached all the time */
- __drop_largest_extent(inode, fofs);
+ /*
+ * drop largest extent before lookup, in case it's already
+ * been shrunk from extent tree
+ */
+ __drop_largest_extent(inode, fofs, len);
/* 1. lookup first extent node in range [fofs, fofs + len - 1] */
en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en,
&insert_p, &insert_parent);
- if (!en) {
- if (next_en) {
- en = next_en;
- f2fs_bug_on(sbi, en->ei.fofs <= pos);
- pos = en->ei.fofs;
- } else {
- /*
- * skip searching in the tree since there is no
- * larger extent node in the cache.
- */
- goto update_extent;
- }
- }
+ if (!en)
+ en = next_en;
/* 2. invlidate all extent nodes in range [fofs, fofs + len - 1] */
- while (en) {
- struct rb_node *node;
+ while (en && en->ei.fofs < end) {
+ unsigned int org_end;
+ int parts = 0; /* # of parts current extent split into */
- if (pos >= end)
- break;
+ next_en = en1 = NULL;
dei = en->ei;
- en1 = en2 = NULL;
+ org_end = dei.fofs + dei.len;
+ f2fs_bug_on(sbi, pos >= org_end);
- node = rb_next(&en->rb_node);
+ if (pos > dei.fofs && pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) {
+ en->ei.len = pos - en->ei.fofs;
+ prev_en = en;
+ parts = 1;
+ }
- /*
- * 2.1 there are four cases when we invalidate blkaddr in extent
- * node, |V: valid address, X: will be invalidated|
- */
- /* case#1, invalidate right part of extent node |VVVVVXXXXX| */
- if (pos > dei.fofs && end >= dei.fofs + dei.len) {
- en->ei.len = pos - dei.fofs;
-
- if (en->ei.len < F2FS_MIN_EXTENT_LEN) {
- __detach_extent_node(sbi, et, en);
- insert_p = NULL;
- insert_parent = NULL;
- goto update;
+ if (end < org_end && org_end - end >= F2FS_MIN_EXTENT_LEN) {
+ if (parts) {
+ set_extent_info(&ei, end,
+ end - dei.fofs + dei.blk,
+ org_end - end);
+ en1 = __insert_extent_tree(sbi, et, &ei,
+ NULL, NULL);
+ next_en = en1;
+ } else {
+ en->ei.fofs = end;
+ en->ei.blk += end - dei.fofs;
+ en->ei.len -= end - dei.fofs;
+ next_en = en;
}
-
- if (__is_extent_same(&dei, &et->largest))
- et->largest = en->ei;
- goto next;
+ parts++;
}
- /* case#2, invalidate left part of extent node |XXXXXVVVVV| */
- if (pos <= dei.fofs && end < dei.fofs + dei.len) {
- en->ei.fofs = end;
- en->ei.blk += end - dei.fofs;
- en->ei.len -= end - dei.fofs;
-
- if (en->ei.len < F2FS_MIN_EXTENT_LEN) {
- __detach_extent_node(sbi, et, en);
- insert_p = NULL;
- insert_parent = NULL;
- goto update;
- }
+ if (!next_en) {
+ struct rb_node *node = rb_next(&en->rb_node);
- if (__is_extent_same(&dei, &et->largest))
- et->largest = en->ei;
- goto next;
+ next_en = node ?
+ rb_entry(node, struct extent_node, rb_node)
+ : NULL;
}
- __detach_extent_node(sbi, et, en);
+ if (parts)
+ __try_update_largest_extent(et, en);
+ else
+ __detach_extent_node(sbi, et, en);
/*
- * if we remove node in rb-tree, our parent node pointer may
- * point the wrong place, discard them.
+ * if original extent is split into zero or two parts, extent
+ * tree has been altered by deletion or insertion, therefore
+ * invalidate pointers regard to tree.
*/
- insert_p = NULL;
- insert_parent = NULL;
-
- /* case#3, invalidate entire extent node |XXXXXXXXXX| */
- if (pos <= dei.fofs && end >= dei.fofs + dei.len) {
- if (__is_extent_same(&dei, &et->largest))
- et->largest.len = 0;
- goto update;
+ if (parts != 1) {
+ insert_p = NULL;
+ insert_parent = NULL;
}
- /*
- * case#4, invalidate data in the middle of extent node
- * |VVVXXXXVVV|
- */
- if (dei.len > F2FS_MIN_EXTENT_LEN) {
- unsigned int endofs;
-
- /* insert left part of split extent into cache */
- if (pos - dei.fofs >= F2FS_MIN_EXTENT_LEN) {
- set_extent_info(&ei, dei.fofs, dei.blk,
- pos - dei.fofs);
- en1 = __insert_extent_tree(sbi, et, &ei,
- NULL, NULL);
- }
-
- /* insert right part of split extent into cache */
- endofs = dei.fofs + dei.len;
- if (endofs - end >= F2FS_MIN_EXTENT_LEN) {
- set_extent_info(&ei, end,
- end - dei.fofs + dei.blk,
- endofs - end);
- en2 = __insert_extent_tree(sbi, et, &ei,
- NULL, NULL);
- }
- }
-update:
- /* 2.2 update in global extent list */
+ /* update in global extent list */
spin_lock(&sbi->extent_lock);
- if (en && !list_empty(&en->list))
+ if (!parts && !list_empty(&en->list))
list_del(&en->list);
if (en1)
list_add_tail(&en1->list, &sbi->extent_list);
- if (en2)
- list_add_tail(&en2->list, &sbi->extent_list);
spin_unlock(&sbi->extent_lock);
- /* 2.3 release extent node */
- if (en)
+ /* release extent node */
+ if (!parts)
kmem_cache_free(extent_node_slab, en);
-next:
- en = node ? rb_entry(node, struct extent_node, rb_node) : NULL;
- next_en = en;
- if (en)
- pos = en->ei.fofs;
+
+ en = next_en;
}
-update_extent:
/* 3. update extent in extent cache */
if (blkaddr) {
struct extent_node *den = NULL;
set_extent_info(&ei, fofs, blkaddr, len);
- en3 = __try_merge_extent_node(sbi, et, &ei, &den,
+ en1 = __try_merge_extent_node(sbi, et, &ei, &den,
prev_en, next_en);
- if (!en3)
- en3 = __insert_extent_tree(sbi, et, &ei,
+ if (!en1)
+ en1 = __insert_extent_tree(sbi, et, &ei,
insert_p, insert_parent);
/* give up extent_cache, if split and small updates happen */
@@ -572,11 +524,11 @@ update_extent:
}
spin_lock(&sbi->extent_lock);
- if (en3) {
- if (list_empty(&en3->list))
- list_add_tail(&en3->list, &sbi->extent_list);
+ if (en1) {
+ if (list_empty(&en1->list))
+ list_add_tail(&en1->list, &sbi->extent_list);
else
- list_move_tail(&en3->list, &sbi->extent_list);
+ list_move_tail(&en1->list, &sbi->extent_list);
}
if (den && !list_empty(&den->list))
list_del(&den->list);
@@ -650,6 +602,11 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
}
spin_unlock(&sbi->extent_lock);
+ /*
+ * reset ino for searching victims from beginning of global extent tree.
+ */
+ ino = F2FS_ROOT_INO(sbi);
+
while ((found = radix_tree_gang_lookup(root,
(void **)treevec, ino, EXT_TREE_VEC_SIZE))) {
unsigned i;
@@ -663,7 +620,7 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
write_unlock(&et->lock);
if (node_cnt + tree_cnt >= nr_shrink)
- break;
+ goto unlock_out;
}
}
unlock_out:
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index f1a90ffd7cad..9db5500d63d9 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -19,6 +19,7 @@
#include <linux/magic.h>
#include <linux/kobject.h>
#include <linux/sched.h>
+#include <linux/vmalloc.h>
#include <linux/bio.h>
#ifdef CONFIG_F2FS_CHECK_FS
@@ -52,6 +53,7 @@
#define F2FS_MOUNT_NOBARRIER 0x00000800
#define F2FS_MOUNT_FASTBOOT 0x00001000
#define F2FS_MOUNT_EXTENT_CACHE 0x00002000
+#define F2FS_MOUNT_FORCE_FG_GC 0x00004000
#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -122,6 +124,7 @@ enum {
(SM_I(sbi)->trim_sections * (sbi)->segs_per_sec)
#define BATCHED_TRIM_BLOCKS(sbi) \
(BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg)
+#define DEF_CP_INTERVAL 60 /* 60 secs */
struct cp_control {
int reason;
@@ -230,6 +233,7 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
#define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4)
#define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5)
#define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6)
+#define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7)
#define F2FS_IOC_SET_ENCRYPTION_POLICY \
_IOR('f', 19, struct f2fs_encryption_policy)
@@ -246,6 +250,7 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
#define F2FS_GOING_DOWN_FULLSYNC 0x0 /* going down with full sync */
#define F2FS_GOING_DOWN_METASYNC 0x1 /* going down with metadata */
#define F2FS_GOING_DOWN_NOSYNC 0x2 /* going down */
+#define F2FS_GOING_DOWN_METAFLUSH 0x3 /* going down with meta flush */
#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
/*
@@ -492,12 +497,20 @@ static inline bool __is_front_mergeable(struct extent_info *cur,
return __is_extent_mergeable(cur, front);
}
+static inline void __try_update_largest_extent(struct extent_tree *et,
+ struct extent_node *en)
+{
+ if (en->ei.len > et->largest.len)
+ et->largest = en->ei;
+}
+
struct f2fs_nm_info {
block_t nat_blkaddr; /* base disk address of NAT */
nid_t max_nid; /* maximum possible node ids */
nid_t available_nids; /* maximum available node ids */
nid_t next_scan_nid; /* the next nid to be scanned */
unsigned int ram_thresh; /* control the memory footprint */
+ unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */
/* NAT cache management */
struct radix_tree_root nat_root;/* root of the nat entry cache */
@@ -724,6 +737,7 @@ struct f2fs_sb_info {
struct rw_semaphore node_write; /* locking node writes */
struct mutex writepages; /* mutex for writepages() */
wait_queue_head_t cp_wait;
+ long cp_expires, cp_interval; /* next expected periodic cp */
struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */
@@ -787,10 +801,10 @@ struct f2fs_sb_info {
unsigned int segment_count[2]; /* # of allocated segments */
unsigned int block_count[2]; /* # of allocated blocks */
atomic_t inplace_count; /* # of inplace update */
- atomic_t total_hit_ext; /* # of lookup extent cache */
- atomic_t read_hit_rbtree; /* # of hit rbtree extent node */
- atomic_t read_hit_largest; /* # of hit largest extent node */
- atomic_t read_hit_cached; /* # of hit cached extent node */
+ atomic64_t total_hit_ext; /* # of lookup extent cache */
+ atomic64_t read_hit_rbtree; /* # of hit rbtree extent node */
+ atomic64_t read_hit_largest; /* # of hit largest extent node */
+ atomic64_t read_hit_cached; /* # of hit cached extent node */
atomic_t inline_xattr; /* # of inline_xattr inodes */
atomic_t inline_inode; /* # of inline_data inodes */
atomic_t inline_dir; /* # of inline_dentry inodes */
@@ -1220,6 +1234,24 @@ static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi)
return sbi->total_valid_inode_count;
}
+static inline struct page *f2fs_grab_cache_page(struct address_space *mapping,
+ pgoff_t index, bool for_write)
+{
+ if (!for_write)
+ return grab_cache_page(mapping, index);
+ return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
+}
+
+static inline void f2fs_copy_page(struct page *src, struct page *dst)
+{
+ char *src_kaddr = kmap(src);
+ char *dst_kaddr = kmap(dst);
+
+ memcpy(dst_kaddr, src_kaddr, PAGE_SIZE);
+ kunmap(dst);
+ kunmap(src);
+}
+
static inline void f2fs_put_page(struct page *page, int unlock)
{
if (!page)
@@ -1579,6 +1611,26 @@ static inline bool f2fs_may_extent_tree(struct inode *inode)
return S_ISREG(mode);
}
+static inline void *f2fs_kvmalloc(size_t size, gfp_t flags)
+{
+ void *ret;
+
+ ret = kmalloc(size, flags | __GFP_NOWARN);
+ if (!ret)
+ ret = __vmalloc(size, flags, PAGE_KERNEL);
+ return ret;
+}
+
+static inline void *f2fs_kvzalloc(size_t size, gfp_t flags)
+{
+ void *ret;
+
+ ret = kzalloc(size, flags | __GFP_NOWARN);
+ if (!ret)
+ ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL);
+ return ret;
+}
+
#define get_inode_mode(i) \
((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \
(F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
@@ -1721,6 +1773,7 @@ int f2fs_issue_flush(struct f2fs_sb_info *);
int create_flush_cmd_control(struct f2fs_sb_info *);
void destroy_flush_cmd_control(struct f2fs_sb_info *);
void invalidate_blocks(struct f2fs_sb_info *, block_t);
+bool is_checkpointed_data(struct f2fs_sb_info *, block_t);
void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *);
void release_discard_addrs(struct f2fs_sb_info *);
@@ -1739,6 +1792,7 @@ void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *,
void allocate_data_block(struct f2fs_sb_info *, struct page *,
block_t, block_t *, struct f2fs_summary *, int);
void f2fs_wait_on_page_writeback(struct page *, enum page_type);
+void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *, block_t);
void write_data_summaries(struct f2fs_sb_info *, block_t);
void write_node_summaries(struct f2fs_sb_info *, block_t);
int lookup_journal_in_cursum(struct f2fs_summary_block *,
@@ -1754,8 +1808,9 @@ void destroy_segment_manager_caches(void);
*/
struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
+struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t);
bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int);
-int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int);
+int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int, bool);
void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t);
long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
@@ -1787,9 +1842,9 @@ void set_data_blkaddr(struct dnode_of_data *);
int reserve_new_block(struct dnode_of_data *);
int f2fs_get_block(struct dnode_of_data *, pgoff_t);
int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
-struct page *get_read_data_page(struct inode *, pgoff_t, int);
+struct page *get_read_data_page(struct inode *, pgoff_t, int, bool);
struct page *find_data_page(struct inode *, pgoff_t);
-struct page *get_lock_data_page(struct inode *, pgoff_t);
+struct page *get_lock_data_page(struct inode *, pgoff_t, bool);
struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
int do_write_data_page(struct f2fs_io_info *);
int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
@@ -1802,7 +1857,7 @@ int f2fs_release_page(struct page *, gfp_t);
int start_gc_thread(struct f2fs_sb_info *);
void stop_gc_thread(struct f2fs_sb_info *);
block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *);
-int f2fs_gc(struct f2fs_sb_info *);
+int f2fs_gc(struct f2fs_sb_info *, bool);
void build_gc_manager(struct f2fs_sb_info *);
/*
@@ -1820,7 +1875,8 @@ struct f2fs_stat_info {
struct f2fs_sb_info *sbi;
int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs;
int main_area_segs, main_area_sections, main_area_zones;
- int hit_largest, hit_cached, hit_rbtree, hit_total, total_ext;
+ unsigned long long hit_largest, hit_cached, hit_rbtree;
+ unsigned long long hit_total, total_ext;
int ext_tree, ext_node;
int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
int nats, dirty_nats, sits, dirty_sits, fnids;
@@ -1844,7 +1900,7 @@ struct f2fs_stat_info {
unsigned int segment_count[2];
unsigned int block_count[2];
unsigned int inplace_count;
- unsigned base_mem, cache_mem, page_mem;
+ unsigned long long base_mem, cache_mem, page_mem;
};
static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
@@ -1857,10 +1913,10 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
#define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++)
#define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++)
#define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--)
-#define stat_inc_total_hit(sbi) (atomic_inc(&(sbi)->total_hit_ext))
-#define stat_inc_rbtree_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_rbtree))
-#define stat_inc_largest_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_largest))
-#define stat_inc_cached_node_hit(sbi) (atomic_inc(&(sbi)->read_hit_cached))
+#define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext))
+#define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree))
+#define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest))
+#define stat_inc_cached_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_cached))
#define stat_inc_inline_xattr(inode) \
do { \
if (f2fs_has_inline_xattr(inode)) \
@@ -1998,6 +2054,8 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *,
bool f2fs_empty_inline_dir(struct inode *);
int f2fs_read_inline_dir(struct file *, struct dir_context *,
struct f2fs_str *);
+int f2fs_inline_data_fiemap(struct inode *,
+ struct fiemap_extent_info *, __u64, __u64);
/*
* shrinker.c
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 8120f8685141..a197215ad52b 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -74,7 +74,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
goto mapped;
/* page is wholly or partially inside EOF */
- if (((page->index + 1) << PAGE_CACHE_SHIFT) > i_size_read(inode)) {
+ if (((loff_t)(page->index + 1) << PAGE_CACHE_SHIFT) >
+ i_size_read(inode)) {
unsigned offset;
offset = i_size_read(inode) & ~PAGE_CACHE_MASK;
zero_user_segment(page, offset, PAGE_CACHE_SIZE);
@@ -86,6 +87,11 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
mapped:
/* fill the page */
f2fs_wait_on_page_writeback(page, DATA);
+
+ /* wait for GCed encrypted page writeback */
+ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
+ f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr);
+
/* if gced page is attached, don't write to cold segment */
clear_cold_data(page);
out:
@@ -343,7 +349,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence);
- for (; data_ofs < isize; data_ofs = pgofs << PAGE_CACHE_SHIFT) {
+ for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) {
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA);
if (err && err != -ENOENT) {
@@ -504,14 +510,14 @@ static int truncate_partial_data_page(struct inode *inode, u64 from,
return 0;
if (cache_only) {
- page = grab_cache_page(mapping, index);
+ page = f2fs_grab_cache_page(mapping, index, false);
if (page && PageUptodate(page))
goto truncate_out;
f2fs_put_page(page, 1);
return 0;
}
- page = get_lock_data_page(inode, index);
+ page = get_lock_data_page(inode, index, true);
if (IS_ERR(page))
return 0;
truncate_out:
@@ -680,6 +686,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
* larger than i_size.
*/
truncate_setsize(inode, attr->ia_size);
+ inode->i_mtime = inode->i_ctime = CURRENT_TIME;
}
}
@@ -738,23 +745,31 @@ static int fill_zero(struct inode *inode, pgoff_t index,
int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
{
- pgoff_t index;
int err;
- for (index = pg_start; index < pg_end; index++) {
+ while (pg_start < pg_end) {
struct dnode_of_data dn;
+ pgoff_t end_offset, count;
set_new_dnode(&dn, inode, NULL, NULL, 0);
- err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
+ err = get_dnode_of_data(&dn, pg_start, LOOKUP_NODE);
if (err) {
- if (err == -ENOENT)
+ if (err == -ENOENT) {
+ pg_start++;
continue;
+ }
return err;
}
- if (dn.data_blkaddr != NULL_ADDR)
- truncate_data_blocks_range(&dn, 1);
+ end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
+ count = min(end_offset - dn.ofs_in_node, pg_end - pg_start);
+
+ f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset);
+
+ truncate_data_blocks_range(&dn, count);
f2fs_put_dnode(&dn);
+
+ pg_start += count;
}
return 0;
}
@@ -765,9 +780,6 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
loff_t off_start, off_end;
int ret = 0;
- if (!S_ISREG(inode->i_mode))
- return -EOPNOTSUPP;
-
if (f2fs_has_inline_data(inode)) {
ret = f2fs_convert_inline_inode(inode);
if (ret)
@@ -805,8 +817,8 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
f2fs_balance_fs(sbi);
- blk_start = pg_start << PAGE_CACHE_SHIFT;
- blk_end = pg_end << PAGE_CACHE_SHIFT;
+ blk_start = (loff_t)pg_start << PAGE_CACHE_SHIFT;
+ blk_end = (loff_t)pg_end << PAGE_CACHE_SHIFT;
truncate_inode_pages_range(mapping, blk_start,
blk_end - 1);
@@ -819,86 +831,100 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
return ret;
}
-static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end)
+static int __exchange_data_block(struct inode *inode, pgoff_t src,
+ pgoff_t dst, bool full)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct dnode_of_data dn;
- pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
- int ret = 0;
-
- for (; end < nrpages; start++, end++) {
- block_t new_addr, old_addr;
-
- f2fs_lock_op(sbi);
+ block_t new_addr;
+ bool do_replace = false;
+ int ret;
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- ret = get_dnode_of_data(&dn, end, LOOKUP_NODE_RA);
- if (ret && ret != -ENOENT) {
- goto out;
- } else if (ret == -ENOENT) {
- new_addr = NULL_ADDR;
- } else {
- new_addr = dn.data_blkaddr;
- truncate_data_blocks_range(&dn, 1);
- f2fs_put_dnode(&dn);
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ ret = get_dnode_of_data(&dn, src, LOOKUP_NODE_RA);
+ if (ret && ret != -ENOENT) {
+ return ret;
+ } else if (ret == -ENOENT) {
+ new_addr = NULL_ADDR;
+ } else {
+ new_addr = dn.data_blkaddr;
+ if (!is_checkpointed_data(sbi, new_addr)) {
+ dn.data_blkaddr = NULL_ADDR;
+ /* do not invalidate this block address */
+ set_data_blkaddr(&dn);
+ f2fs_update_extent_cache(&dn);
+ do_replace = true;
}
+ f2fs_put_dnode(&dn);
+ }
- if (new_addr == NULL_ADDR) {
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- ret = get_dnode_of_data(&dn, start, LOOKUP_NODE_RA);
- if (ret && ret != -ENOENT) {
- goto out;
- } else if (ret == -ENOENT) {
- f2fs_unlock_op(sbi);
- continue;
- }
+ if (new_addr == NULL_ADDR)
+ return full ? truncate_hole(inode, dst, dst + 1) : 0;
- if (dn.data_blkaddr == NULL_ADDR) {
- f2fs_put_dnode(&dn);
- f2fs_unlock_op(sbi);
- continue;
- } else {
- truncate_data_blocks_range(&dn, 1);
- }
+ if (do_replace) {
+ struct page *ipage = get_node_page(sbi, inode->i_ino);
+ struct node_info ni;
- f2fs_put_dnode(&dn);
- } else {
- struct page *ipage;
+ if (IS_ERR(ipage)) {
+ ret = PTR_ERR(ipage);
+ goto err_out;
+ }
- ipage = get_node_page(sbi, inode->i_ino);
- if (IS_ERR(ipage)) {
- ret = PTR_ERR(ipage);
- goto out;
- }
+ set_new_dnode(&dn, inode, ipage, NULL, 0);
+ ret = f2fs_reserve_block(&dn, dst);
+ if (ret)
+ goto err_out;
- set_new_dnode(&dn, inode, ipage, NULL, 0);
- ret = f2fs_reserve_block(&dn, start);
- if (ret)
- goto out;
+ truncate_data_blocks_range(&dn, 1);
- old_addr = dn.data_blkaddr;
- if (old_addr != NEW_ADDR && new_addr == NEW_ADDR) {
- dn.data_blkaddr = NULL_ADDR;
- f2fs_update_extent_cache(&dn);
- invalidate_blocks(sbi, old_addr);
+ get_node_info(sbi, dn.nid, &ni);
+ f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr,
+ ni.version, true);
+ f2fs_put_dnode(&dn);
+ } else {
+ struct page *psrc, *pdst;
+
+ psrc = get_lock_data_page(inode, src, true);
+ if (IS_ERR(psrc))
+ return PTR_ERR(psrc);
+ pdst = get_new_data_page(inode, NULL, dst, false);
+ if (IS_ERR(pdst)) {
+ f2fs_put_page(psrc, 1);
+ return PTR_ERR(pdst);
+ }
+ f2fs_copy_page(psrc, pdst);
+ set_page_dirty(pdst);
+ f2fs_put_page(pdst, 1);
+ f2fs_put_page(psrc, 1);
- dn.data_blkaddr = new_addr;
- set_data_blkaddr(&dn);
- } else if (new_addr != NEW_ADDR) {
- struct node_info ni;
+ return truncate_hole(inode, src, src + 1);
+ }
+ return 0;
- get_node_info(sbi, dn.nid, &ni);
- f2fs_replace_block(sbi, &dn, old_addr, new_addr,
- ni.version, true);
- }
+err_out:
+ if (!get_dnode_of_data(&dn, src, LOOKUP_NODE)) {
+ dn.data_blkaddr = new_addr;
+ set_data_blkaddr(&dn);
+ f2fs_update_extent_cache(&dn);
+ f2fs_put_dnode(&dn);
+ }
+ return ret;
+}
- f2fs_put_dnode(&dn);
- }
+static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end)
+{
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
+ int ret = 0;
+
+ for (; end < nrpages; start++, end++) {
+ f2fs_balance_fs(sbi);
+ f2fs_lock_op(sbi);
+ ret = __exchange_data_block(inode, end, start, true);
f2fs_unlock_op(sbi);
+ if (ret)
+ break;
}
- return 0;
-out:
- f2fs_unlock_op(sbi);
return ret;
}
@@ -908,9 +934,6 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
loff_t new_size;
int ret;
- if (!S_ISREG(inode->i_mode))
- return -EINVAL;
-
if (offset + len >= i_size_read(inode))
return -EINVAL;
@@ -940,7 +963,12 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
if (ret)
return ret;
+ /* write out all moved pages, if possible */
+ filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
+ truncate_pagecache(inode, offset);
+
new_size = i_size_read(inode) - len;
+ truncate_pagecache(inode, new_size);
ret = truncate_blocks(inode, new_size, true);
if (!ret)
@@ -959,9 +987,6 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
loff_t off_start, off_end;
int ret = 0;
- if (!S_ISREG(inode->i_mode))
- return -EINVAL;
-
ret = inode_newsize_ok(inode, (len + offset));
if (ret)
return ret;
@@ -1003,7 +1028,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
return ret;
new_size = max_t(loff_t, new_size,
- pg_start << PAGE_CACHE_SHIFT);
+ (loff_t)pg_start << PAGE_CACHE_SHIFT);
}
for (index = pg_start; index < pg_end; index++) {
@@ -1039,7 +1064,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
f2fs_unlock_op(sbi);
new_size = max_t(loff_t, new_size,
- (index + 1) << PAGE_CACHE_SHIFT);
+ (loff_t)(index + 1) << PAGE_CACHE_SHIFT);
}
if (off_end) {
@@ -1066,10 +1091,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
pgoff_t pg_start, pg_end, delta, nrpages, idx;
loff_t new_size;
- int ret;
-
- if (!S_ISREG(inode->i_mode))
- return -EINVAL;
+ int ret = 0;
new_size = i_size_read(inode) + len;
if (new_size > inode->i_sb->s_maxbytes)
@@ -1107,57 +1129,19 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE;
for (idx = nrpages - 1; idx >= pg_start && idx != -1; idx--) {
- struct dnode_of_data dn;
- struct page *ipage;
- block_t new_addr, old_addr;
-
f2fs_lock_op(sbi);
-
- set_new_dnode(&dn, inode, NULL, NULL, 0);
- ret = get_dnode_of_data(&dn, idx, LOOKUP_NODE_RA);
- if (ret && ret != -ENOENT) {
- goto out;
- } else if (ret == -ENOENT) {
- goto next;
- } else if (dn.data_blkaddr == NULL_ADDR) {
- f2fs_put_dnode(&dn);
- goto next;
- } else {
- new_addr = dn.data_blkaddr;
- truncate_data_blocks_range(&dn, 1);
- f2fs_put_dnode(&dn);
- }
-
- ipage = get_node_page(sbi, inode->i_ino);
- if (IS_ERR(ipage)) {
- ret = PTR_ERR(ipage);
- goto out;
- }
-
- set_new_dnode(&dn, inode, ipage, NULL, 0);
- ret = f2fs_reserve_block(&dn, idx + delta);
- if (ret)
- goto out;
-
- old_addr = dn.data_blkaddr;
- f2fs_bug_on(sbi, old_addr != NEW_ADDR);
-
- if (new_addr != NEW_ADDR) {
- struct node_info ni;
-
- get_node_info(sbi, dn.nid, &ni);
- f2fs_replace_block(sbi, &dn, old_addr, new_addr,
- ni.version, true);
- }
- f2fs_put_dnode(&dn);
-next:
+ ret = __exchange_data_block(inode, idx, idx + delta, false);
f2fs_unlock_op(sbi);
+ if (ret)
+ break;
}
- i_size_write(inode, new_size);
- return 0;
-out:
- f2fs_unlock_op(sbi);
+ /* write out all moved pages, if possible */
+ filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
+ truncate_pagecache(inode, offset);
+
+ if (!ret)
+ i_size_write(inode, new_size);
return ret;
}
@@ -1204,9 +1188,10 @@ noalloc:
if (pg_start == pg_end)
new_size = offset + len;
else if (index == pg_start && off_start)
- new_size = (index + 1) << PAGE_CACHE_SHIFT;
+ new_size = (loff_t)(index + 1) << PAGE_CACHE_SHIFT;
else if (index == pg_end)
- new_size = (index << PAGE_CACHE_SHIFT) + off_end;
+ new_size = ((loff_t)index << PAGE_CACHE_SHIFT) +
+ off_end;
else
new_size += PAGE_CACHE_SIZE;
}
@@ -1228,6 +1213,10 @@ static long f2fs_fallocate(struct file *file, int mode,
struct inode *inode = file_inode(file);
long ret = 0;
+ /* f2fs only support ->fallocate for regular file */
+ if (!S_ISREG(inode->i_mode))
+ return -EINVAL;
+
if (f2fs_encrypted_inode(inode) &&
(mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE)))
return -EOPNOTSUPP;
@@ -1437,8 +1426,7 @@ static int f2fs_ioc_release_volatile_write(struct file *filp)
if (!f2fs_is_first_block_written(inode))
return truncate_partial_data_page(inode, 0, true);
- punch_hole(inode, 0, F2FS_BLKSIZE);
- return 0;
+ return punch_hole(inode, 0, F2FS_BLKSIZE);
}
static int f2fs_ioc_abort_volatile_write(struct file *filp)
@@ -1455,13 +1443,9 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
f2fs_balance_fs(F2FS_I_SB(inode));
- if (f2fs_is_atomic_file(inode)) {
- clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
- commit_inmem_pages(inode, true);
- }
-
- if (f2fs_is_volatile_file(inode))
- clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+ clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+ clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+ commit_inmem_pages(inode, true);
mnt_drop_write_file(filp);
return ret;
@@ -1496,6 +1480,10 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
case F2FS_GOING_DOWN_NOSYNC:
f2fs_stop_checkpoint(sbi);
break;
+ case F2FS_GOING_DOWN_METAFLUSH:
+ sync_meta_pages(sbi, META, LONG_MAX);
+ f2fs_stop_checkpoint(sbi);
+ break;
default:
return -EINVAL;
}
@@ -1616,27 +1604,44 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg)
{
struct inode *inode = file_inode(filp);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- __u32 i, count;
+ __u32 sync;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (get_user(count, (__u32 __user *)arg))
+ if (get_user(sync, (__u32 __user *)arg))
return -EFAULT;
- if (!count || count > F2FS_BATCH_GC_MAX_NUM)
- return -EINVAL;
+ if (f2fs_readonly(sbi->sb))
+ return -EROFS;
- for (i = 0; i < count; i++) {
+ if (!sync) {
if (!mutex_trylock(&sbi->gc_mutex))
- break;
-
- if (f2fs_gc(sbi))
- break;
+ return -EBUSY;
+ } else {
+ mutex_lock(&sbi->gc_mutex);
}
- if (put_user(i, (__u32 __user *)arg))
- return -EFAULT;
+ return f2fs_gc(sbi, sync);
+}
+
+static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+ struct cp_control cpc;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (f2fs_readonly(sbi->sb))
+ return -EROFS;
+
+ cpc.reason = __get_cp_reason(sbi);
+
+ mutex_lock(&sbi->gc_mutex);
+ write_checkpoint(sbi, &cpc);
+ mutex_unlock(&sbi->gc_mutex);
return 0;
}
@@ -1672,6 +1677,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_ioc_get_encryption_pwsalt(filp, arg);
case F2FS_IOC_GARBAGE_COLLECT:
return f2fs_ioc_gc(filp, arg);
+ case F2FS_IOC_WRITE_CHECKPOINT:
+ return f2fs_ioc_write_checkpoint(filp, arg);
default:
return -ENOTTY;
}
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 782b8e72c094..fedbf67a0842 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -78,9 +78,12 @@ static int gc_thread_func(void *data)
stat_inc_bggc_count(sbi);
/* if return value is not zero, no victim was selected */
- if (f2fs_gc(sbi))
+ if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC)))
wait_ms = gc_th->no_gc_sleep_time;
+ trace_f2fs_background_gc(sbi->sb, wait_ms,
+ prefree_segments(sbi), free_segments(sbi));
+
/* balancing f2fs's metadata periodically */
f2fs_balance_fs_bg(sbi);
@@ -257,6 +260,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
struct victim_sel_policy p;
unsigned int secno, max_cost;
+ unsigned int last_segment = MAIN_SEGS(sbi);
int nsearched = 0;
mutex_lock(&dirty_i->seglist_lock);
@@ -267,6 +271,9 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
p.min_segno = NULL_SEGNO;
p.min_cost = max_cost = get_max_cost(sbi, &p);
+ if (p.max_search == 0)
+ goto out;
+
if (p.alloc_mode == LFS && gc_type == FG_GC) {
p.min_segno = check_bg_victims(sbi);
if (p.min_segno != NULL_SEGNO)
@@ -277,9 +284,10 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi,
unsigned long cost;
unsigned int segno;
- segno = find_next_bit(p.dirty_segmap, MAIN_SEGS(sbi), p.offset);
- if (segno >= MAIN_SEGS(sbi)) {
+ segno = find_next_bit(p.dirty_segmap, last_segment, p.offset);
+ if (segno >= last_segment) {
if (sbi->last_victim[p.gc_mode]) {
+ last_segment = sbi->last_victim[p.gc_mode];
sbi->last_victim[p.gc_mode] = 0;
p.offset = 0;
continue;
@@ -327,6 +335,7 @@ got_it:
sbi->cur_victim_sec,
prefree_segments(sbi), free_segments(sbi));
}
+out:
mutex_unlock(&dirty_i->seglist_lock);
return (p.min_segno == NULL_SEGNO) ? 0 : 1;
@@ -541,7 +550,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
int err;
/* do not read out */
- page = grab_cache_page(inode->i_mapping, bidx);
+ page = f2fs_grab_cache_page(inode->i_mapping, bidx, false);
if (!page)
return;
@@ -550,8 +559,16 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
if (err)
goto out;
- if (unlikely(dn.data_blkaddr == NULL_ADDR))
+ if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
+ ClearPageUptodate(page);
goto put_out;
+ }
+
+ /*
+ * don't cache encrypted data into meta inode until previous dirty
+ * data were writebacked to avoid racing between GC and flush.
+ */
+ f2fs_wait_on_page_writeback(page, DATA);
get_node_info(fio.sbi, dn.nid, &ni);
set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
@@ -580,7 +597,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx)
goto put_page_out;
set_page_dirty(fio.encrypted_page);
- f2fs_wait_on_page_writeback(fio.encrypted_page, META);
+ f2fs_wait_on_page_writeback(fio.encrypted_page, DATA);
if (clear_page_dirty_for_io(fio.encrypted_page))
dec_page_count(fio.sbi, F2FS_DIRTY_META);
@@ -611,7 +628,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type)
{
struct page *page;
- page = get_lock_data_page(inode, bidx);
+ page = get_lock_data_page(inode, bidx, true);
if (IS_ERR(page))
return;
@@ -705,7 +722,7 @@ next_step:
start_bidx = start_bidx_of_node(nofs, F2FS_I(inode));
data_page = get_read_data_page(inode,
- start_bidx + ofs_in_node, READA);
+ start_bidx + ofs_in_node, READA, true);
if (IS_ERR(data_page)) {
iput(inode);
continue;
@@ -797,13 +814,12 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno,
return nfree;
}
-int f2fs_gc(struct f2fs_sb_info *sbi)
+int f2fs_gc(struct f2fs_sb_info *sbi, bool sync)
{
- unsigned int segno = NULL_SEGNO;
- unsigned int i;
- int gc_type = BG_GC;
- int nfree = 0;
- int ret = -1;
+ unsigned int segno, i;
+ int gc_type = sync ? FG_GC : BG_GC;
+ int sec_freed = 0;
+ int ret = -EINVAL;
struct cp_control cpc;
struct gc_inode_list gc_list = {
.ilist = LIST_HEAD_INIT(gc_list.ilist),
@@ -812,12 +828,14 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
cpc.reason = __get_cp_reason(sbi);
gc_more:
+ segno = NULL_SEGNO;
+
if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
goto stop;
if (unlikely(f2fs_cp_error(sbi)))
goto stop;
- if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) {
+ if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed)) {
gc_type = FG_GC;
if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi))
write_checkpoint(sbi, &cpc);
@@ -830,23 +848,38 @@ gc_more:
/* readahead multi ssa blocks those have contiguous address */
if (sbi->segs_per_sec > 1)
ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec,
- META_SSA);
+ META_SSA, true);
- for (i = 0; i < sbi->segs_per_sec; i++)
- nfree += do_garbage_collect(sbi, segno + i, &gc_list, gc_type);
+ for (i = 0; i < sbi->segs_per_sec; i++) {
+ /*
+ * for FG_GC case, halt gcing left segments once failed one
+ * of segments in selected section to avoid long latency.
+ */
+ if (!do_garbage_collect(sbi, segno + i, &gc_list, gc_type) &&
+ gc_type == FG_GC)
+ break;
+ }
+
+ if (i == sbi->segs_per_sec && gc_type == FG_GC)
+ sec_freed++;
if (gc_type == FG_GC)
sbi->cur_victim_sec = NULL_SEGNO;
- if (has_not_enough_free_secs(sbi, nfree))
- goto gc_more;
+ if (!sync) {
+ if (has_not_enough_free_secs(sbi, sec_freed))
+ goto gc_more;
- if (gc_type == FG_GC)
- write_checkpoint(sbi, &cpc);
+ if (gc_type == FG_GC)
+ write_checkpoint(sbi, &cpc);
+ }
stop:
mutex_unlock(&sbi->gc_mutex);
put_gc_inode(&gc_list);
+
+ if (sync)
+ ret = sec_freed ? 0 : -EAGAIN;
return ret;
}
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index c5a055b3376e..b4a65be9f7d3 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -19,12 +19,6 @@
#define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */
#define LIMIT_FREE_BLOCK 40 /* percentage over invalid + free space */
-/*
- * with this macro, we can control the max time we do garbage collection,
- * when user triggers batch mode gc by ioctl.
- */
-#define F2FS_BATCH_GC_MAX_NUM 16
-
/* Search max. number of dirty segments to select a victim segment */
#define DEF_MAX_VICTIM_SEARCH 4096 /* covers 8GB */
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 3d143be42895..bda7126466c0 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -12,6 +12,7 @@
#include <linux/f2fs_fs.h>
#include "f2fs.h"
+#include "node.h"
bool f2fs_may_inline_data(struct inode *inode)
{
@@ -274,12 +275,14 @@ process_inline:
if (f2fs_has_inline_data(inode)) {
ipage = get_node_page(sbi, inode->i_ino);
f2fs_bug_on(sbi, IS_ERR(ipage));
- truncate_inline_inode(ipage, 0);
+ if (!truncate_inline_inode(ipage, 0))
+ return false;
f2fs_clear_inline_inode(inode);
update_inode(inode, ipage);
f2fs_put_page(ipage, 1);
} else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) {
- truncate_blocks(inode, 0, false);
+ if (truncate_blocks(inode, 0, false))
+ return false;
goto process_inline;
}
return false;
@@ -568,3 +571,38 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
f2fs_put_page(ipage, 1);
return 0;
}
+
+int f2fs_inline_data_fiemap(struct inode *inode,
+ struct fiemap_extent_info *fieinfo, __u64 start, __u64 len)
+{
+ __u64 byteaddr, ilen;
+ __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED |
+ FIEMAP_EXTENT_LAST;
+ struct node_info ni;
+ struct page *ipage;
+ int err = 0;
+
+ ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
+ if (IS_ERR(ipage))
+ return PTR_ERR(ipage);
+
+ if (!f2fs_has_inline_data(inode)) {
+ err = -EAGAIN;
+ goto out;
+ }
+
+ ilen = min_t(size_t, MAX_INLINE_DATA, i_size_read(inode));
+ if (start >= ilen)
+ goto out;
+ if (start + len < ilen)
+ ilen = start + len;
+ ilen -= start;
+
+ get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni);
+ byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits;
+ byteaddr += (char *)inline_data_addr(ipage) - (char *)F2FS_INODE(ipage);
+ err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags);
+out:
+ f2fs_put_page(ipage, 1);
+ return err;
+}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 35aae65b3e5d..97e20decacb4 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -296,16 +296,12 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
return 0;
/*
- * We need to lock here to prevent from producing dirty node pages
+ * We need to balance fs here to prevent from producing dirty node pages
* during the urgent cleaning time when runing out of free sections.
*/
- f2fs_lock_op(sbi);
update_inode_page(inode);
- f2fs_unlock_op(sbi);
-
- if (wbc)
- f2fs_balance_fs(sbi);
+ f2fs_balance_fs(sbi);
return 0;
}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index a680bf38e4f0..2c32110f9fc0 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -410,11 +410,14 @@ err_out:
* If the symlink path is stored into inline_data, there is no
* performance regression.
*/
- if (!err)
+ if (!err) {
filemap_write_and_wait_range(inode->i_mapping, 0, p_len - 1);
- if (IS_DIRSYNC(dir))
- f2fs_sync_fs(sbi->sb, 1);
+ if (IS_DIRSYNC(dir))
+ f2fs_sync_fs(sbi->sb, 1);
+ } else {
+ f2fs_unlink(dir, dentry);
+ }
kfree(sd);
f2fs_fname_crypto_free_buffer(&disk_link);
@@ -478,9 +481,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
struct inode *inode;
int err = 0;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
f2fs_balance_fs(sbi);
inode = f2fs_new_inode(dir, mode);
@@ -947,8 +947,13 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook
/* Symlink is encrypted */
sd = (struct f2fs_encrypted_symlink_data *)caddr;
- cstr.name = sd->encrypted_path;
cstr.len = le16_to_cpu(sd->len);
+ cstr.name = kmalloc(cstr.len, GFP_NOFS);
+ if (!cstr.name) {
+ res = -ENOMEM;
+ goto errout;
+ }
+ memcpy(cstr.name, sd->encrypted_path, cstr.len);
/* this is broken symlink case */
if (cstr.name[0] == 0 && cstr.len == 0) {
@@ -970,6 +975,8 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook
if (res < 0)
goto errout;
+ kfree(cstr.name);
+
paddr = pstr.name;
/* Null-terminate the name */
@@ -979,6 +986,7 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook
page_cache_release(cpage);
return *cookie = paddr;
errout:
+ kfree(cstr.name);
f2fs_fname_crypto_free_buffer(&pstr);
kunmap(cpage);
page_cache_release(cpage);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 27d1a74dd6f3..7bcbc6e9c40d 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1323,23 +1323,24 @@ static int f2fs_write_node_page(struct page *page,
nid = nid_of_node(page);
f2fs_bug_on(sbi, page->index != nid);
+ if (wbc->for_reclaim) {
+ if (!down_read_trylock(&sbi->node_write))
+ goto redirty_out;
+ } else {
+ down_read(&sbi->node_write);
+ }
+
get_node_info(sbi, nid, &ni);
/* This page is already truncated */
if (unlikely(ni.blk_addr == NULL_ADDR)) {
ClearPageUptodate(page);
dec_page_count(sbi, F2FS_DIRTY_NODES);
+ up_read(&sbi->node_write);
unlock_page(page);
return 0;
}
- if (wbc->for_reclaim) {
- if (!down_read_trylock(&sbi->node_write))
- goto redirty_out;
- } else {
- down_read(&sbi->node_write);
- }
-
set_page_writeback(page);
fio.blk_addr = ni.blk_addr;
write_node_page(nid, &fio);
@@ -1528,7 +1529,8 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
return;
/* readahead nat pages to be scanned */
- ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT);
+ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
+ META_NAT, true);
while (1) {
struct page *page = get_current_nat_page(sbi, nid);
@@ -1558,6 +1560,9 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
remove_free_nid(nm_i, nid);
}
mutex_unlock(&curseg->curseg_mutex);
+
+ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
+ nm_i->ra_nid_pages, META_NAT, false);
}
/*
@@ -1803,10 +1808,10 @@ int restore_node_summary(struct f2fs_sb_info *sbi,
nrpages = min(last_offset - i, bio_blocks);
/* readahead node pages */
- ra_meta_pages(sbi, addr, nrpages, META_POR);
+ ra_meta_pages(sbi, addr, nrpages, META_POR, true);
for (idx = addr; idx < addr + nrpages; idx++) {
- struct page *page = get_meta_page(sbi, idx);
+ struct page *page = get_tmp_page(sbi, idx);
rn = F2FS_NODE(page);
sum_entry->nid = rn->footer.nid;
@@ -2000,6 +2005,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
nm_i->fcnt = 0;
nm_i->nat_cnt = 0;
nm_i->ram_thresh = DEF_RAM_THRESHOLD;
+ nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
INIT_LIST_HEAD(&nm_i->free_nid_list);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 7427e956ad81..e4fffd2d98c4 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -14,9 +14,11 @@
/* node block offset on the NAT area dedicated to the given start node id */
#define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK)
-/* # of pages to perform readahead before building free nids */
+/* # of pages to perform synchronous readahead before building free nids */
#define FREE_NID_PAGES 4
+#define DEF_RA_NID_PAGES 4 /* # of nid pages to be readaheaded */
+
/* maximum readahead size for node during getting data blocks */
#define MAX_RA_NODE 128
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index faec2ca004b9..cbf74f47cce8 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -180,7 +180,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
- ra_meta_pages(sbi, blkaddr, 1, META_POR);
+ ra_meta_pages(sbi, blkaddr, 1, META_POR, true);
while (1) {
struct fsync_inode_entry *entry;
@@ -188,7 +188,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
if (!is_valid_blkaddr(sbi, blkaddr, META_POR))
return 0;
- page = get_meta_page(sbi, blkaddr);
+ page = get_tmp_page(sbi, blkaddr);
if (cp_ver != cpver_of_node(page))
break;
@@ -383,15 +383,11 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
start = start_bidx_of_node(ofs_of_node(page), fi);
end = start + ADDRS_PER_PAGE(page, fi);
- f2fs_lock_op(sbi);
-
set_new_dnode(&dn, inode, NULL, NULL, 0);
err = get_dnode_of_data(&dn, start, ALLOC_NODE);
- if (err) {
- f2fs_unlock_op(sbi);
+ if (err)
goto out;
- }
f2fs_wait_on_page_writeback(dn.node_page, NODE);
@@ -456,7 +452,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
set_page_dirty(dn.node_page);
err:
f2fs_put_dnode(&dn);
- f2fs_unlock_op(sbi);
out:
f2fs_msg(sbi->sb, KERN_NOTICE,
"recover_data: ino = %lx, recovered = %d blocks, err = %d",
@@ -485,7 +480,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
ra_meta_pages_cond(sbi, blkaddr);
- page = get_meta_page(sbi, blkaddr);
+ page = get_tmp_page(sbi, blkaddr);
if (cp_ver != cpver_of_node(page)) {
f2fs_put_page(page, 1);
@@ -570,7 +565,7 @@ out:
/* truncate meta pages to be used by the recovery */
truncate_inode_pages_range(META_MAPPING(sbi),
- MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1);
+ (loff_t)MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1);
if (err) {
truncate_inode_pages_final(NODE_MAPPING(sbi));
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 78e6d0696847..f77b3258454a 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -14,8 +14,8 @@
#include <linux/blkdev.h>
#include <linux/prefetch.h>
#include <linux/kthread.h>
-#include <linux/vmalloc.h>
#include <linux/swap.h>
+#include <linux/timer.h>
#include "f2fs.h"
#include "segment.h"
@@ -29,6 +29,21 @@ static struct kmem_cache *discard_entry_slab;
static struct kmem_cache *sit_entry_set_slab;
static struct kmem_cache *inmem_entry_slab;
+static unsigned long __reverse_ulong(unsigned char *str)
+{
+ unsigned long tmp = 0;
+ int shift = 24, idx = 0;
+
+#if BITS_PER_LONG == 64
+ shift = 56;
+#endif
+ while (shift >= 0) {
+ tmp |= (unsigned long)str[idx++] << shift;
+ shift -= BITS_PER_BYTE;
+ }
+ return tmp;
+}
+
/*
* __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since
* MSB and LSB are reversed in a byte by f2fs_set_bit.
@@ -38,27 +53,31 @@ static inline unsigned long __reverse_ffs(unsigned long word)
int num = 0;
#if BITS_PER_LONG == 64
- if ((word & 0xffffffff) == 0) {
+ if ((word & 0xffffffff00000000UL) == 0)
num += 32;
+ else
word >>= 32;
- }
#endif
- if ((word & 0xffff) == 0) {
+ if ((word & 0xffff0000) == 0)
num += 16;
+ else
word >>= 16;
- }
- if ((word & 0xff) == 0) {
+
+ if ((word & 0xff00) == 0)
num += 8;
+ else
word >>= 8;
- }
+
if ((word & 0xf0) == 0)
num += 4;
else
word >>= 4;
+
if ((word & 0xc) == 0)
num += 2;
else
word >>= 2;
+
if ((word & 0x2) == 0)
num += 1;
return num;
@@ -68,26 +87,16 @@ static inline unsigned long __reverse_ffs(unsigned long word)
* __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because
* f2fs_set_bit makes MSB and LSB reversed in a byte.
* Example:
- * LSB <--> MSB
- * f2fs_set_bit(0, bitmap) => 0000 0001
- * f2fs_set_bit(7, bitmap) => 1000 0000
+ * MSB <--> LSB
+ * f2fs_set_bit(0, bitmap) => 1000 0000
+ * f2fs_set_bit(7, bitmap) => 0000 0001
*/
static unsigned long __find_rev_next_bit(const unsigned long *addr,
unsigned long size, unsigned long offset)
{
- while (!f2fs_test_bit(offset, (unsigned char *)addr))
- offset++;
-
- if (offset > size)
- offset = size;
-
- return offset;
-#if 0
const unsigned long *p = addr + BIT_WORD(offset);
unsigned long result = offset & ~(BITS_PER_LONG - 1);
unsigned long tmp;
- unsigned long mask, submask;
- unsigned long quot, rest;
if (offset >= size)
return size;
@@ -97,14 +106,9 @@ static unsigned long __find_rev_next_bit(const unsigned long *addr,
if (!offset)
goto aligned;
- tmp = *(p++);
- quot = (offset >> 3) << 3;
- rest = offset & 0x7;
- mask = ~0UL << quot;
- submask = (unsigned char)(0xff << rest) >> rest;
- submask <<= quot;
- mask &= submask;
- tmp &= mask;
+ tmp = __reverse_ulong((unsigned char *)p);
+ tmp &= ~0UL >> offset;
+
if (size < BITS_PER_LONG)
goto found_first;
if (tmp)
@@ -112,42 +116,34 @@ static unsigned long __find_rev_next_bit(const unsigned long *addr,
size -= BITS_PER_LONG;
result += BITS_PER_LONG;
+ p++;
aligned:
while (size & ~(BITS_PER_LONG-1)) {
- tmp = *(p++);
+ tmp = __reverse_ulong((unsigned char *)p);
if (tmp)
goto found_middle;
result += BITS_PER_LONG;
size -= BITS_PER_LONG;
+ p++;
}
if (!size)
return result;
- tmp = *p;
+
+ tmp = __reverse_ulong((unsigned char *)p);
found_first:
- tmp &= (~0UL >> (BITS_PER_LONG - size));
- if (tmp == 0UL) /* Are any bits set? */
+ tmp &= (~0UL << (BITS_PER_LONG - size));
+ if (!tmp) /* Are any bits set? */
return result + size; /* Nope. */
found_middle:
return result + __reverse_ffs(tmp);
-#endif
}
static unsigned long __find_rev_next_zero_bit(const unsigned long *addr,
unsigned long size, unsigned long offset)
{
- while (f2fs_test_bit(offset, (unsigned char *)addr))
- offset++;
-
- if (offset > size)
- offset = size;
-
- return offset;
-#if 0
const unsigned long *p = addr + BIT_WORD(offset);
unsigned long result = offset & ~(BITS_PER_LONG - 1);
unsigned long tmp;
- unsigned long mask, submask;
- unsigned long quot, rest;
if (offset >= size)
return size;
@@ -157,40 +153,36 @@ static unsigned long __find_rev_next_zero_bit(const unsigned long *addr,
if (!offset)
goto aligned;
- tmp = *(p++);
- quot = (offset >> 3) << 3;
- rest = offset & 0x7;
- mask = ~(~0UL << quot);
- submask = (unsigned char)~((unsigned char)(0xff << rest) >> rest);
- submask <<= quot;
- mask += submask;
- tmp |= mask;
+ tmp = __reverse_ulong((unsigned char *)p);
+ tmp |= ~((~0UL << offset) >> offset);
+
if (size < BITS_PER_LONG)
goto found_first;
- if (~tmp)
+ if (tmp != ~0UL)
goto found_middle;
size -= BITS_PER_LONG;
result += BITS_PER_LONG;
+ p++;
aligned:
while (size & ~(BITS_PER_LONG - 1)) {
- tmp = *(p++);
- if (~tmp)
+ tmp = __reverse_ulong((unsigned char *)p);
+ if (tmp != ~0UL)
goto found_middle;
result += BITS_PER_LONG;
size -= BITS_PER_LONG;
+ p++;
}
if (!size)
return result;
- tmp = *p;
+ tmp = __reverse_ulong((unsigned char *)p);
found_first:
- tmp |= ~0UL << size;
- if (tmp == ~0UL) /* Are any bits zero? */
+ tmp |= ~(~0UL << (BITS_PER_LONG - size));
+ if (tmp == ~0UL) /* Are any bits zero? */
return result + size; /* Nope. */
found_middle:
return result + __reverse_ffz(tmp);
-#endif
}
void register_inmem_page(struct inode *inode, struct page *page)
@@ -257,11 +249,12 @@ int commit_inmem_pages(struct inode *inode, bool abort)
trace_f2fs_commit_inmem_page(cur->page, INMEM);
fio.page = cur->page;
err = do_write_data_page(&fio);
- submit_bio = true;
if (err) {
unlock_page(cur->page);
break;
}
+ clear_cold_data(cur->page);
+ submit_bio = true;
}
} else {
trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP);
@@ -296,7 +289,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi)
*/
if (has_not_enough_free_secs(sbi, 0)) {
mutex_lock(&sbi->gc_mutex);
- f2fs_gc(sbi);
+ f2fs_gc(sbi, false);
}
}
@@ -316,7 +309,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
/* checkpoint is the only way to shrink partial cached entries */
if (!available_free_memory(sbi, NAT_ENTRIES) ||
excess_prefree_segs(sbi) ||
- !available_free_memory(sbi, INO_ENTRIES))
+ !available_free_memory(sbi, INO_ENTRIES) ||
+ jiffies > sbi->cp_expires)
f2fs_sync_fs(sbi->sb, true);
}
@@ -767,6 +761,30 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
mutex_unlock(&sit_i->sentry_lock);
}
+bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr)
+{
+ struct sit_info *sit_i = SIT_I(sbi);
+ unsigned int segno, offset;
+ struct seg_entry *se;
+ bool is_cp = false;
+
+ if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR)
+ return true;
+
+ mutex_lock(&sit_i->sentry_lock);
+
+ segno = GET_SEGNO(sbi, blkaddr);
+ se = get_seg_entry(sbi, segno);
+ offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
+
+ if (f2fs_test_bit(offset, se->ckpt_valid_map))
+ is_cp = true;
+
+ mutex_unlock(&sit_i->sentry_lock);
+
+ return is_cp;
+}
+
/*
* This function should be resided under the curseg_mutex lock
*/
@@ -1292,6 +1310,9 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
.encrypted_page = NULL,
};
+ if (unlikely(page->index >= MAIN_BLKADDR(sbi)))
+ fio.rw &= ~REQ_META;
+
set_page_writeback(page);
f2fs_submit_page_mbio(&fio);
}
@@ -1369,7 +1390,14 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi,
curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
__add_sum_entry(sbi, type, sum);
- refresh_sit_entry(sbi, old_blkaddr, new_blkaddr);
+ if (!recover_curseg)
+ update_sit_entry(sbi, new_blkaddr, 1);
+ if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
+ update_sit_entry(sbi, old_blkaddr, -1);
+
+ locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
+ locate_dirty_segment(sbi, GET_SEGNO(sbi, new_blkaddr));
+
locate_dirty_segment(sbi, old_cursegno);
if (recover_curseg) {
@@ -1449,6 +1477,23 @@ void f2fs_wait_on_page_writeback(struct page *page,
}
}
+void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi,
+ block_t blkaddr)
+{
+ struct page *cpage;
+
+ if (blkaddr == NEW_ADDR)
+ return;
+
+ f2fs_bug_on(sbi, blkaddr == NULL_ADDR);
+
+ cpage = find_lock_page(META_MAPPING(sbi), blkaddr);
+ if (cpage) {
+ f2fs_wait_on_page_writeback(cpage, DATA);
+ f2fs_put_page(cpage, 1);
+ }
+}
+
static int read_compacted_summaries(struct f2fs_sb_info *sbi)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
@@ -1586,7 +1631,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
if (npages >= 2)
ra_meta_pages(sbi, start_sum_block(sbi), npages,
- META_CP);
+ META_CP, true);
/* restore for compacted data summary */
if (read_compacted_summaries(sbi))
@@ -1596,7 +1641,7 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
if (__exist_node_summaries(sbi))
ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type),
- NR_CURSEG_TYPE - type, META_CP);
+ NR_CURSEG_TYPE - type, META_CP, true);
for (; type <= CURSEG_COLD_NODE; type++) {
err = read_normal_summaries(sbi, type);
@@ -1955,12 +2000,13 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
SM_I(sbi)->sit_info = sit_i;
- sit_i->sentries = vzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry));
+ sit_i->sentries = f2fs_kvzalloc(MAIN_SEGS(sbi) *
+ sizeof(struct seg_entry), GFP_KERNEL);
if (!sit_i->sentries)
return -ENOMEM;
bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
- sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL);
+ sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL);
if (!sit_i->dirty_sentries_bitmap)
return -ENOMEM;
@@ -1982,8 +2028,8 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
return -ENOMEM;
if (sbi->segs_per_sec > 1) {
- sit_i->sec_entries = vzalloc(MAIN_SECS(sbi) *
- sizeof(struct sec_entry));
+ sit_i->sec_entries = f2fs_kvzalloc(MAIN_SECS(sbi) *
+ sizeof(struct sec_entry), GFP_KERNEL);
if (!sit_i->sec_entries)
return -ENOMEM;
}
@@ -2028,12 +2074,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
SM_I(sbi)->free_info = free_i;
bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
- free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL);
+ free_i->free_segmap = f2fs_kvmalloc(bitmap_size, GFP_KERNEL);
if (!free_i->free_segmap)
return -ENOMEM;
sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
- free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL);
+ free_i->free_secmap = f2fs_kvmalloc(sec_bitmap_size, GFP_KERNEL);
if (!free_i->free_secmap)
return -ENOMEM;
@@ -2082,7 +2128,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi)
int nrpages = MAX_BIO_BLOCKS(sbi);
do {
- readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT);
+ readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true);
start = start_blk * sit_i->sents_per_block;
end = (start_blk + readed) * sit_i->sents_per_block;
@@ -2174,7 +2220,7 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi)
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi));
- dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL);
+ dirty_i->victim_secmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL);
if (!dirty_i->victim_secmap)
return -ENOMEM;
return 0;
@@ -2196,7 +2242,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi)
bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi));
for (i = 0; i < NR_DIRTY_TYPE; i++) {
- dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL);
+ dirty_i->dirty_segmap[i] = f2fs_kvzalloc(bitmap_size, GFP_KERNEL);
if (!dirty_i->dirty_segmap[i])
return -ENOMEM;
}
@@ -2301,7 +2347,7 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi,
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
mutex_lock(&dirty_i->seglist_lock);
- kfree(dirty_i->dirty_segmap[dirty_type]);
+ kvfree(dirty_i->dirty_segmap[dirty_type]);
dirty_i->nr_dirty[dirty_type] = 0;
mutex_unlock(&dirty_i->seglist_lock);
}
@@ -2309,7 +2355,7 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi,
static void destroy_victim_secmap(struct f2fs_sb_info *sbi)
{
struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
- kfree(dirty_i->victim_secmap);
+ kvfree(dirty_i->victim_secmap);
}
static void destroy_dirty_segmap(struct f2fs_sb_info *sbi)
@@ -2348,8 +2394,8 @@ static void destroy_free_segmap(struct f2fs_sb_info *sbi)
if (!free_i)
return;
SM_I(sbi)->free_info = NULL;
- kfree(free_i->free_segmap);
- kfree(free_i->free_secmap);
+ kvfree(free_i->free_segmap);
+ kvfree(free_i->free_secmap);
kfree(free_i);
}
@@ -2370,9 +2416,9 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi)
}
kfree(sit_i->tmp_map);
- vfree(sit_i->sentries);
- vfree(sit_i->sec_entries);
- kfree(sit_i->dirty_sentries_bitmap);
+ kvfree(sit_i->sentries);
+ kvfree(sit_i->sec_entries);
+ kvfree(sit_i->dirty_sentries_bitmap);
SM_I(sbi)->sit_info = NULL;
kfree(sit_i->sit_bitmap);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index b6e4ed15c698..ee44d346ea44 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -137,10 +137,12 @@ enum {
/*
* BG_GC means the background cleaning job.
* FG_GC means the on-demand cleaning job.
+ * FORCE_FG_GC means on-demand cleaning job in background.
*/
enum {
BG_GC = 0,
- FG_GC
+ FG_GC,
+ FORCE_FG_GC,
};
/* for a function parameter to select a victim segment */
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index f79478115d37..3a65e0132352 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -213,8 +213,10 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
+F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, cp_interval);
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
@@ -231,6 +233,8 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(max_victim_search),
ATTR_LIST(dir_level),
ATTR_LIST(ram_thresh),
+ ATTR_LIST(ra_nid_pages),
+ ATTR_LIST(cp_interval),
NULL,
};
@@ -292,11 +296,16 @@ static int parse_options(struct super_block *sb, char *options)
if (!name)
return -ENOMEM;
- if (strlen(name) == 2 && !strncmp(name, "on", 2))
+ if (strlen(name) == 2 && !strncmp(name, "on", 2)) {
set_opt(sbi, BG_GC);
- else if (strlen(name) == 3 && !strncmp(name, "off", 3))
+ clear_opt(sbi, FORCE_FG_GC);
+ } else if (strlen(name) == 3 && !strncmp(name, "off", 3)) {
clear_opt(sbi, BG_GC);
- else {
+ clear_opt(sbi, FORCE_FG_GC);
+ } else if (strlen(name) == 4 && !strncmp(name, "sync", 4)) {
+ set_opt(sbi, BG_GC);
+ set_opt(sbi, FORCE_FG_GC);
+ } else {
kfree(name);
return -EINVAL;
}
@@ -631,10 +640,14 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
{
struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
- if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC))
- seq_printf(seq, ",background_gc=%s", "on");
- else
+ if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC)) {
+ if (test_opt(sbi, FORCE_FG_GC))
+ seq_printf(seq, ",background_gc=%s", "sync");
+ else
+ seq_printf(seq, ",background_gc=%s", "on");
+ } else {
seq_printf(seq, ",background_gc=%s", "off");
+ }
if (test_opt(sbi, DISABLE_ROLL_FORWARD))
seq_puts(seq, ",disable_roll_forward");
if (test_opt(sbi, DISCARD))
@@ -742,6 +755,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
int err, active_logs;
bool need_restart_gc = false;
bool need_stop_gc = false;
+ bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE);
sync_filesystem(sb);
@@ -767,6 +781,14 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
if (f2fs_readonly(sb) && (*flags & MS_RDONLY))
goto skip;
+ /* disallow enable/disable extent_cache dynamically */
+ if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) {
+ err = -EINVAL;
+ f2fs_msg(sbi->sb, KERN_WARNING,
+ "switch extent_cache option is not allowed");
+ goto restore_opts;
+ }
+
/*
* We stop the GC thread if FS is mounted as RO
* or if background_gc = off is passed in mount
@@ -996,6 +1018,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
atomic_set(&sbi->nr_pages[i], 0);
sbi->dir_level = DEF_DIR_LEVEL;
+ sbi->cp_interval = DEF_CP_INTERVAL;
clear_sbi_flag(sbi, SBI_NEED_FSCK);
INIT_LIST_HEAD(&sbi->s_list);
@@ -1332,6 +1355,8 @@ try_onemore:
f2fs_commit_super(sbi, true);
}
+ sbi->cp_expires = round_jiffies_up(jiffies);
+
return 0;
free_kobj:
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 4de2286c0e4d..862368a32e53 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -25,49 +25,45 @@
#include "f2fs.h"
#include "xattr.h"
-static size_t f2fs_xattr_generic_list(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t len, int type)
+static size_t f2fs_xattr_generic_list(const struct xattr_handler *handler,
+ struct dentry *dentry, char *list, size_t list_size,
+ const char *name, size_t len)
{
struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
- int total_len, prefix_len = 0;
- const char *prefix = NULL;
+ int total_len, prefix_len;
- switch (type) {
+ switch (handler->flags) {
case F2FS_XATTR_INDEX_USER:
if (!test_opt(sbi, XATTR_USER))
return -EOPNOTSUPP;
- prefix = XATTR_USER_PREFIX;
- prefix_len = XATTR_USER_PREFIX_LEN;
break;
case F2FS_XATTR_INDEX_TRUSTED:
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- prefix = XATTR_TRUSTED_PREFIX;
- prefix_len = XATTR_TRUSTED_PREFIX_LEN;
break;
case F2FS_XATTR_INDEX_SECURITY:
- prefix = XATTR_SECURITY_PREFIX;
- prefix_len = XATTR_SECURITY_PREFIX_LEN;
break;
default:
return -EINVAL;
}
+ prefix_len = strlen(handler->prefix);
total_len = prefix_len + len + 1;
if (list && total_len <= list_size) {
- memcpy(list, prefix, prefix_len);
+ memcpy(list, handler->prefix, prefix_len);
memcpy(list + prefix_len, name, len);
list[prefix_len + len] = '\0';
}
return total_len;
}
-static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name, void *buffer,
+ size_t size)
{
struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
- switch (type) {
+ switch (handler->flags) {
case F2FS_XATTR_INDEX_USER:
if (!test_opt(sbi, XATTR_USER))
return -EOPNOTSUPP;
@@ -83,15 +79,17 @@ static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name,
}
if (strcmp(name, "") == 0)
return -EINVAL;
- return f2fs_getxattr(d_inode(dentry), type, name, buffer, size, NULL);
+ return f2fs_getxattr(d_inode(dentry), handler->flags, name,
+ buffer, size, NULL);
}
-static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+static int f2fs_xattr_generic_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name, const void *value,
+ size_t size, int flags)
{
struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
- switch (type) {
+ switch (handler->flags) {
case F2FS_XATTR_INDEX_USER:
if (!test_opt(sbi, XATTR_USER))
return -EOPNOTSUPP;
@@ -108,27 +106,26 @@ static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name,
if (strcmp(name, "") == 0)
return -EINVAL;
- return f2fs_setxattr(d_inode(dentry), type, name,
+ return f2fs_setxattr(d_inode(dentry), handler->flags, name,
value, size, NULL, flags);
}
-static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t len, int type)
+static size_t f2fs_xattr_advise_list(const struct xattr_handler *handler,
+ struct dentry *dentry, char *list, size_t list_size,
+ const char *name, size_t len)
{
const char *xname = F2FS_SYSTEM_ADVISE_PREFIX;
size_t size;
- if (type != F2FS_XATTR_INDEX_ADVISE)
- return 0;
-
size = strlen(xname) + 1;
if (list && size <= list_size)
memcpy(list, xname, size);
return size;
}
-static int f2fs_xattr_advise_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int f2fs_xattr_advise_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name, void *buffer,
+ size_t size)
{
struct inode *inode = d_inode(dentry);
@@ -140,8 +137,9 @@ static int f2fs_xattr_advise_get(struct dentry *dentry, const char *name,
return sizeof(char);
}
-static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+static int f2fs_xattr_advise_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name, const void *value,
+ size_t size, int flags)
{
struct inode *inode = d_inode(dentry);
@@ -462,8 +460,8 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
if (!handler)
continue;
- size = handler->list(dentry, buffer, rest, entry->e_name,
- entry->e_name_len, handler->flags);
+ size = handler->list(handler, dentry, buffer, rest,
+ entry->e_name, entry->e_name_len);
if (buffer && size > rest) {
error = -ERANGE;
goto cleanup;
diff --git a/fs/file.c b/fs/file.c
index 6c672ad329e9..39f8f15921da 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -56,9 +56,35 @@ static void free_fdtable_rcu(struct rcu_head *rcu)
__free_fdtable(container_of(rcu, struct fdtable, rcu));
}
+#define BITBIT_NR(nr) BITS_TO_LONGS(BITS_TO_LONGS(nr))
+#define BITBIT_SIZE(nr) (BITBIT_NR(nr) * sizeof(long))
+
+/*
+ * Copy 'count' fd bits from the old table to the new table and clear the extra
+ * space if any. This does not copy the file pointers. Called with the files
+ * spinlock held for write.
+ */
+static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
+ unsigned int count)
+{
+ unsigned int cpy, set;
+
+ cpy = count / BITS_PER_BYTE;
+ set = (nfdt->max_fds - count) / BITS_PER_BYTE;
+ memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
+ memset((char *)nfdt->open_fds + cpy, 0, set);
+ memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
+ memset((char *)nfdt->close_on_exec + cpy, 0, set);
+
+ cpy = BITBIT_SIZE(count);
+ set = BITBIT_SIZE(nfdt->max_fds) - cpy;
+ memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy);
+ memset((char *)nfdt->full_fds_bits + cpy, 0, set);
+}
+
/*
- * Expand the fdset in the files_struct. Called with the files spinlock
- * held for write.
+ * Copy all file descriptors from the old table to the new, expanded table and
+ * clear the extra space. Called with the files spinlock held for write.
*/
static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
{
@@ -69,14 +95,9 @@ static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
cpy = ofdt->max_fds * sizeof(struct file *);
set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
memcpy(nfdt->fd, ofdt->fd, cpy);
- memset((char *)(nfdt->fd) + cpy, 0, set);
+ memset((char *)nfdt->fd + cpy, 0, set);
- cpy = ofdt->max_fds / BITS_PER_BYTE;
- set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE;
- memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
- memset((char *)(nfdt->open_fds) + cpy, 0, set);
- memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
- memset((char *)(nfdt->close_on_exec) + cpy, 0, set);
+ copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
}
static struct fdtable * alloc_fdtable(unsigned int nr)
@@ -115,12 +136,14 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
fdt->fd = data;
data = alloc_fdmem(max_t(size_t,
- 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES));
+ 2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES));
if (!data)
goto out_arr;
fdt->open_fds = data;
data += nr / BITS_PER_BYTE;
fdt->close_on_exec = data;
+ data += nr / BITS_PER_BYTE;
+ fdt->full_fds_bits = data;
return fdt;
@@ -226,17 +249,22 @@ static inline void __set_close_on_exec(int fd, struct fdtable *fdt)
static inline void __clear_close_on_exec(int fd, struct fdtable *fdt)
{
- __clear_bit(fd, fdt->close_on_exec);
+ if (test_bit(fd, fdt->close_on_exec))
+ __clear_bit(fd, fdt->close_on_exec);
}
-static inline void __set_open_fd(int fd, struct fdtable *fdt)
+static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
{
__set_bit(fd, fdt->open_fds);
+ fd /= BITS_PER_LONG;
+ if (!~fdt->open_fds[fd])
+ __set_bit(fd, fdt->full_fds_bits);
}
-static inline void __clear_open_fd(int fd, struct fdtable *fdt)
+static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
{
__clear_bit(fd, fdt->open_fds);
+ __clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits);
}
static int count_open_files(struct fdtable *fdt)
@@ -262,7 +290,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
{
struct files_struct *newf;
struct file **old_fds, **new_fds;
- int open_files, size, i;
+ int open_files, i;
struct fdtable *old_fdt, *new_fdt;
*errorp = -ENOMEM;
@@ -280,6 +308,7 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
new_fdt->max_fds = NR_OPEN_DEFAULT;
new_fdt->close_on_exec = newf->close_on_exec_init;
new_fdt->open_fds = newf->open_fds_init;
+ new_fdt->full_fds_bits = newf->full_fds_bits_init;
new_fdt->fd = &newf->fd_array[0];
spin_lock(&oldf->file_lock);
@@ -318,12 +347,11 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
open_files = count_open_files(old_fdt);
}
+ copy_fd_bitmaps(new_fdt, old_fdt, open_files);
+
old_fds = old_fdt->fd;
new_fds = new_fdt->fd;
- memcpy(new_fdt->open_fds, old_fdt->open_fds, open_files / 8);
- memcpy(new_fdt->close_on_exec, old_fdt->close_on_exec, open_files / 8);
-
for (i = open_files; i != 0; i--) {
struct file *f = *old_fds++;
if (f) {
@@ -341,19 +369,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
}
spin_unlock(&oldf->file_lock);
- /* compute the remainder to be cleared */
- size = (new_fdt->max_fds - open_files) * sizeof(struct file *);
-
- /* This is long word aligned thus could use a optimized version */
- memset(new_fds, 0, size);
-
- if (new_fdt->max_fds > open_files) {
- int left = (new_fdt->max_fds - open_files) / 8;
- int start = open_files / BITS_PER_LONG;
-
- memset(&new_fdt->open_fds[start], 0, left);
- memset(&new_fdt->close_on_exec[start], 0, left);
- }
+ /* clear the remainder */
+ memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));
rcu_assign_pointer(newf->fdt, new_fdt);
@@ -454,10 +471,25 @@ struct files_struct init_files = {
.fd = &init_files.fd_array[0],
.close_on_exec = init_files.close_on_exec_init,
.open_fds = init_files.open_fds_init,
+ .full_fds_bits = init_files.full_fds_bits_init,
},
.file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
};
+static unsigned long find_next_fd(struct fdtable *fdt, unsigned long start)
+{
+ unsigned long maxfd = fdt->max_fds;
+ unsigned long maxbit = maxfd / BITS_PER_LONG;
+ unsigned long bitbit = start / BITS_PER_LONG;
+
+ bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
+ if (bitbit > maxfd)
+ return maxfd;
+ if (bitbit > start)
+ start = bitbit;
+ return find_next_zero_bit(fdt->open_fds, maxfd, start);
+}
+
/*
* allocate a file descriptor, mark it busy.
*/
@@ -476,7 +508,7 @@ repeat:
fd = files->next_fd;
if (fd < fdt->max_fds)
- fd = find_next_zero_bit(fdt->open_fds, fdt->max_fds, fd);
+ fd = find_next_fd(fdt, fd);
/*
* N.B. For clone tasks sharing a files structure, this test
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 587ac08eabb6..023f6a1f23cd 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -778,19 +778,24 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
struct wb_writeback_work *base_work,
bool skip_if_busy)
{
- int next_memcg_id = 0;
- struct bdi_writeback *wb;
- struct wb_iter iter;
+ struct bdi_writeback *last_wb = NULL;
+ struct bdi_writeback *wb = list_entry(&bdi->wb_list,
+ struct bdi_writeback, bdi_node);
might_sleep();
restart:
rcu_read_lock();
- bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) {
+ list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
struct wb_writeback_work fallback_work;
struct wb_writeback_work *work;
long nr_pages;
+ if (last_wb) {
+ wb_put(last_wb);
+ last_wb = NULL;
+ }
+
/* SYNC_ALL writes out I_DIRTY_TIME too */
if (!wb_has_dirty_io(wb) &&
(base_work->sync_mode == WB_SYNC_NONE ||
@@ -819,12 +824,22 @@ restart:
wb_queue_work(wb, work);
- next_memcg_id = wb->memcg_css->id + 1;
+ /*
+ * Pin @wb so that it stays on @bdi->wb_list. This allows
+ * continuing iteration from @wb after dropping and
+ * regrabbing rcu read lock.
+ */
+ wb_get(wb);
+ last_wb = wb;
+
rcu_read_unlock();
wb_wait_for_completion(bdi, &fallback_work_done);
goto restart;
}
rcu_read_unlock();
+
+ if (last_wb)
+ wb_put(last_wb);
}
#else /* CONFIG_CGROUP_WRITEBACK */
@@ -1481,6 +1496,21 @@ static long writeback_sb_inodes(struct super_block *sb,
wbc_detach_inode(&wbc);
work->nr_pages -= write_chunk - wbc.nr_to_write;
wrote += write_chunk - wbc.nr_to_write;
+
+ if (need_resched()) {
+ /*
+ * We're trying to balance between building up a nice
+ * long list of IOs to improve our merge rate, and
+ * getting those IOs out quickly for anyone throttling
+ * in balance_dirty_pages(). cond_resched() doesn't
+ * unplug, so get our IOs out the door before we
+ * give up the CPU.
+ */
+ blk_flush_plug(current);
+ cond_resched();
+ }
+
+
spin_lock(&wb->list_lock);
spin_lock(&inode->i_lock);
if (!(inode->i_state & I_DIRTY_ALL))
@@ -1488,7 +1518,7 @@ static long writeback_sb_inodes(struct super_block *sb,
requeue_inode(inode, wb, &wbc);
inode_sync_complete(inode);
spin_unlock(&inode->i_lock);
- cond_resched_lock(&wb->list_lock);
+
/*
* bail out to wb_writeback() often enough to check
* background threshold and other termination conditions.
@@ -1842,12 +1872,11 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
struct bdi_writeback *wb;
- struct wb_iter iter;
if (!bdi_has_dirty_io(bdi))
continue;
- bdi_for_each_wb(wb, bdi, &iter, 0)
+ list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
false, reason);
}
@@ -1879,11 +1908,10 @@ static void wakeup_dirtytime_writeback(struct work_struct *w)
rcu_read_lock();
list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
struct bdi_writeback *wb;
- struct wb_iter iter;
- bdi_for_each_wb(wb, bdi, &iter, 0)
- if (!list_empty(&bdi->wb.b_dirty_time))
- wb_wakeup(&bdi->wb);
+ list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
+ if (!list_empty(&wb->b_dirty_time))
+ wb_wakeup(wb);
}
rcu_read_unlock();
schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
@@ -1953,9 +1981,9 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
* page->mapping->host, so the page-dirtying time is recorded in the internal
* blockdev inode.
*/
-#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
void __mark_inode_dirty(struct inode *inode, int flags)
{
+#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
struct super_block *sb = inode->i_sb;
int dirtytime;
@@ -2065,6 +2093,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
out_unlock_inode:
spin_unlock(&inode->i_lock);
+#undef I_DIRTY_INODE
}
EXPORT_SYMBOL(__mark_inode_dirty);
@@ -2121,7 +2150,12 @@ static void wait_sb_inodes(struct super_block *sb)
iput(old_inode);
old_inode = inode;
- filemap_fdatawait(mapping);
+ /*
+ * We keep the error status of individual mapping so that
+ * applications can catch the writeback error using fsync(2).
+ * See filemap_fdatawait_keep_errors() for details.
+ */
+ filemap_fdatawait_keep_errors(mapping);
cond_resched();
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index d403c69bee08..4304072161aa 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -111,7 +111,7 @@ struct fscache_cookie *__fscache_acquire_cookie(
/* radix tree insertion won't use the preallocation pool unless it's
* told it may not wait */
- INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_WAIT);
+ INIT_RADIX_TREE(&cookie->stores, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
switch (cookie->def->type) {
case FSCACHE_COOKIE_TYPE_INDEX:
diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c
index 6d941f56faf4..9b28649df3a1 100644
--- a/fs/fscache/netfs.c
+++ b/fs/fscache/netfs.c
@@ -22,6 +22,7 @@ static LIST_HEAD(fscache_netfs_list);
int __fscache_register_netfs(struct fscache_netfs *netfs)
{
struct fscache_netfs *ptr;
+ struct fscache_cookie *cookie;
int ret;
_enter("{%s}", netfs->name);
@@ -29,29 +30,25 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
INIT_LIST_HEAD(&netfs->link);
/* allocate a cookie for the primary index */
- netfs->primary_index =
- kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL);
+ cookie = kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL);
- if (!netfs->primary_index) {
+ if (!cookie) {
_leave(" = -ENOMEM");
return -ENOMEM;
}
/* initialise the primary index cookie */
- atomic_set(&netfs->primary_index->usage, 1);
- atomic_set(&netfs->primary_index->n_children, 0);
- atomic_set(&netfs->primary_index->n_active, 1);
+ atomic_set(&cookie->usage, 1);
+ atomic_set(&cookie->n_children, 0);
+ atomic_set(&cookie->n_active, 1);
- netfs->primary_index->def = &fscache_fsdef_netfs_def;
- netfs->primary_index->parent = &fscache_fsdef_index;
- netfs->primary_index->netfs_data = netfs;
- netfs->primary_index->flags = 1 << FSCACHE_COOKIE_ENABLED;
+ cookie->def = &fscache_fsdef_netfs_def;
+ cookie->parent = &fscache_fsdef_index;
+ cookie->netfs_data = netfs;
+ cookie->flags = 1 << FSCACHE_COOKIE_ENABLED;
- atomic_inc(&netfs->primary_index->parent->usage);
- atomic_inc(&netfs->primary_index->parent->n_children);
-
- spin_lock_init(&netfs->primary_index->lock);
- INIT_HLIST_HEAD(&netfs->primary_index->backing_objects);
+ spin_lock_init(&cookie->lock);
+ INIT_HLIST_HEAD(&cookie->backing_objects);
/* check the netfs type is not already present */
down_write(&fscache_addremove_sem);
@@ -62,6 +59,10 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
goto already_registered;
}
+ atomic_inc(&cookie->parent->usage);
+ atomic_inc(&cookie->parent->n_children);
+
+ netfs->primary_index = cookie;
list_add(&netfs->link, &fscache_netfs_list);
ret = 0;
@@ -70,11 +71,8 @@ int __fscache_register_netfs(struct fscache_netfs *netfs)
already_registered:
up_write(&fscache_addremove_sem);
- if (ret < 0) {
- netfs->primary_index->parent = NULL;
- __fscache_cookie_put(netfs->primary_index);
- netfs->primary_index = NULL;
- }
+ if (ret < 0)
+ kmem_cache_free(fscache_cookie_jar, cookie);
_leave(" = %d", ret);
return ret;
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 51dde817e1f2..6b028b7c4250 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -316,7 +316,7 @@ static const struct seq_operations fscache_objlist_ops = {
static void fscache_objlist_config(struct fscache_objlist_data *data)
{
#ifdef CONFIG_KEYS
- struct user_key_payload *confkey;
+ const struct user_key_payload *confkey;
unsigned long config;
struct key *key;
const char *buf;
@@ -329,7 +329,7 @@ static void fscache_objlist_config(struct fscache_objlist_data *data)
config = 0;
rcu_read_lock();
- confkey = key->payload.data;
+ confkey = user_key_payload(key);
buf = confkey->data;
for (len = confkey->datalen - 1; len >= 0; len--) {
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 483bbc613bf0..6b35fc4860a0 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -58,7 +58,7 @@ bool release_page_wait_timeout(struct fscache_cookie *cookie, struct page *page)
/*
* decide whether a page can be released, possibly by cancelling a store to it
- * - we're allowed to sleep if __GFP_WAIT is flagged
+ * - we're allowed to sleep if __GFP_DIRECT_RECLAIM is flagged
*/
bool __fscache_maybe_release_page(struct fscache_cookie *cookie,
struct page *page,
@@ -122,7 +122,7 @@ page_busy:
* allocator as the work threads writing to the cache may all end up
* sleeping on memory allocation, so we may need to impose a timeout
* too. */
- if (!(gfp & __GFP_WAIT) || !(gfp & __GFP_FS)) {
+ if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS)) {
fscache_stat(&fscache_n_store_vmscan_busy);
return false;
}
@@ -132,7 +132,7 @@ page_busy:
_debug("fscache writeout timeout page: %p{%lx}",
page, page->index);
- gfp &= ~__GFP_WAIT;
+ gfp &= ~__GFP_DIRECT_RECLAIM;
goto try_again;
}
EXPORT_SYMBOL(__fscache_maybe_release_page);
@@ -816,7 +816,7 @@ static void fscache_write_op(struct fscache_operation *_op)
goto superseded;
page = results[0];
_debug("gang %d [%lx]", n, page->index);
- if (page->index > op->store_limit) {
+ if (page->index >= op->store_limit) {
fscache_stat(&fscache_n_store_pages_over_limit);
goto superseded;
}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index f523f2f04c19..e0faf8f2c868 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2189,7 +2189,7 @@ static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
int err;
if (fc->no_flock) {
- err = flock_lock_file_wait(file, fl);
+ err = locks_lock_file_wait(file, fl);
} else {
struct fuse_file *ff = file->private_data;
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 487527b42d94..ad8a5b757cc7 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -388,8 +388,13 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip)
*/
void gfs2_dir_hash_inval(struct gfs2_inode *ip)
{
- __be64 *hc = ip->i_hash_cache;
+ __be64 *hc;
+
+ spin_lock(&ip->i_inode.i_lock);
+ hc = ip->i_hash_cache;
ip->i_hash_cache = NULL;
+ spin_unlock(&ip->i_inode.i_lock);
+
kvfree(hc);
}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index cf4ab89159f4..5e425469f0c2 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -897,8 +897,8 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t
if (!(mode & FALLOC_FL_KEEP_SIZE) && (pos + count) > inode->i_size) {
i_size_write(inode, pos + count);
- /* Marks the inode as dirty */
file_update_time(file);
+ mark_inode_dirty(inode);
}
return generic_write_sync(file, pos, count);
@@ -1000,7 +1000,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
}
if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
if (fl->fl_type == F_UNLCK)
- posix_lock_file_wait(file, fl);
+ locks_lock_file_wait(file, fl);
return -EIO;
}
if (IS_GETLK(cmd))
@@ -1031,7 +1031,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
if (gl) {
if (fl_gh->gh_state == state)
goto out;
- flock_lock_file_wait(file,
+ locks_lock_file_wait(file,
&(struct file_lock){.fl_type = F_UNLCK});
gfs2_glock_dq(fl_gh);
gfs2_holder_reinit(state, flags, fl_gh);
@@ -1056,7 +1056,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
if (error == GLR_TRYFAILED)
error = -EAGAIN;
} else {
- error = flock_lock_file_wait(file, fl);
+ error = locks_lock_file_wait(file, fl);
gfs2_assert_warn(GFS2_SB(&ip->i_inode), !error);
}
@@ -1071,7 +1071,7 @@ static void do_unflock(struct file *file, struct file_lock *fl)
struct gfs2_holder *fl_gh = &fp->f_fl_gh;
mutex_lock(&fp->f_fl_mutex);
- flock_lock_file_wait(file, fl);
+ locks_lock_file_wait(file, fl);
if (fl_gh->gh_gl) {
gfs2_glock_dq(fl_gh);
gfs2_holder_uninit(fl_gh);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 9bd1244caf38..32e74710b1aa 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -246,8 +246,8 @@ static inline void do_error(struct gfs2_glock *gl, const int ret)
*/
static int do_promote(struct gfs2_glock *gl)
-__releases(&gl->gl_spin)
-__acquires(&gl->gl_spin)
+__releases(&gl->gl_lockref.lock)
+__acquires(&gl->gl_lockref.lock)
{
const struct gfs2_glock_operations *glops = gl->gl_ops;
struct gfs2_holder *gh, *tmp;
@@ -260,10 +260,10 @@ restart:
if (may_grant(gl, gh)) {
if (gh->gh_list.prev == &gl->gl_holders &&
glops->go_lock) {
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
/* FIXME: eliminate this eventually */
ret = glops->go_lock(gh);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
if (ret) {
if (ret == 1)
return 2;
@@ -361,7 +361,7 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret)
unsigned state = ret & LM_OUT_ST_MASK;
int rv;
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
trace_gfs2_glock_state_change(gl, state);
state_change(gl, state);
gh = find_first_waiter(gl);
@@ -405,7 +405,7 @@ retry:
pr_err("wanted %u got %u\n", gl->gl_target, state);
GLOCK_BUG_ON(gl, 1);
}
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
return;
}
@@ -414,9 +414,9 @@ retry:
gfs2_demote_wake(gl);
if (state != LM_ST_UNLOCKED) {
if (glops->go_xmote_bh) {
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
rv = glops->go_xmote_bh(gl, gh);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
if (rv) {
do_error(gl, rv);
goto out;
@@ -429,7 +429,7 @@ retry:
out:
clear_bit(GLF_LOCK, &gl->gl_flags);
out_locked:
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
}
/**
@@ -441,8 +441,8 @@ out_locked:
*/
static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target)
-__releases(&gl->gl_spin)
-__acquires(&gl->gl_spin)
+__releases(&gl->gl_lockref.lock)
+__acquires(&gl->gl_lockref.lock)
{
const struct gfs2_glock_operations *glops = gl->gl_ops;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
@@ -464,7 +464,7 @@ __acquires(&gl->gl_spin)
(gl->gl_state == LM_ST_EXCLUSIVE) ||
(lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB)))
clear_bit(GLF_BLOCKING, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (glops->go_sync)
glops->go_sync(gl);
if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags))
@@ -485,7 +485,7 @@ __acquires(&gl->gl_spin)
gfs2_glock_put(gl);
}
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
}
/**
@@ -513,8 +513,8 @@ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl)
*/
static void run_queue(struct gfs2_glock *gl, const int nonblock)
-__releases(&gl->gl_spin)
-__acquires(&gl->gl_spin)
+__releases(&gl->gl_lockref.lock)
+__acquires(&gl->gl_lockref.lock)
{
struct gfs2_holder *gh = NULL;
int ret;
@@ -596,7 +596,7 @@ static void glock_work_func(struct work_struct *work)
finish_xmote(gl, gl->gl_reply);
drop_ref = 1;
}
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) &&
gl->gl_state != LM_ST_UNLOCKED &&
gl->gl_demote_state != LM_ST_EXCLUSIVE) {
@@ -612,7 +612,7 @@ static void glock_work_func(struct work_struct *work)
}
}
run_queue(gl, 0);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (!delay)
gfs2_glock_put(gl);
else {
@@ -876,8 +876,8 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
*/
static inline void add_to_queue(struct gfs2_holder *gh)
-__releases(&gl->gl_spin)
-__acquires(&gl->gl_spin)
+__releases(&gl->gl_lockref.lock)
+__acquires(&gl->gl_lockref.lock)
{
struct gfs2_glock *gl = gh->gh_gl;
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
@@ -926,10 +926,10 @@ fail:
do_cancel:
gh = list_entry(gl->gl_holders.next, struct gfs2_holder, gh_list);
if (!(gh->gh_flags & LM_FLAG_PRIORITY)) {
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (sdp->sd_lockstruct.ls_ops->lm_cancel)
sdp->sd_lockstruct.ls_ops->lm_cancel(gl);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
}
return;
@@ -967,7 +967,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
if (test_bit(GLF_LRU, &gl->gl_flags))
gfs2_glock_remove_from_lru(gl);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
add_to_queue(gh);
if (unlikely((LM_FLAG_NOEXP & gh->gh_flags) &&
test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))) {
@@ -977,7 +977,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
gl->gl_lockref.count--;
}
run_queue(gl, 1);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (!(gh->gh_flags & GL_ASYNC))
error = gfs2_glock_wait(gh);
@@ -1010,7 +1010,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
unsigned delay = 0;
int fast_path = 0;
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
if (gh->gh_flags & GL_NOCACHE)
handle_callback(gl, LM_ST_UNLOCKED, 0, false);
@@ -1018,9 +1018,9 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
if (find_first_holder(gl) == NULL) {
if (glops->go_unlock) {
GLOCK_BUG_ON(gl, test_and_set_bit(GLF_LOCK, &gl->gl_flags));
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
glops->go_unlock(gh);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
clear_bit(GLF_LOCK, &gl->gl_flags);
}
if (list_empty(&gl->gl_holders) &&
@@ -1033,7 +1033,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh)
gfs2_glock_add_to_lru(gl);
trace_gfs2_glock_queue(gh, 0);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (likely(fast_path))
return;
@@ -1217,9 +1217,9 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state)
delay = gl->gl_hold_time;
}
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
handle_callback(gl, state, delay, true);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0)
gfs2_glock_put(gl);
}
@@ -1259,7 +1259,7 @@ static int gfs2_should_freeze(const struct gfs2_glock *gl)
* @gl: Pointer to the glock
* @ret: The return value from the dlm
*
- * The gl_reply field is under the gl_spin lock so that it is ok
+ * The gl_reply field is under the gl_lockref.lock lock so that it is ok
* to use a bitfield shared with other glock state fields.
*/
@@ -1267,20 +1267,20 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
{
struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct;
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
gl->gl_reply = ret;
if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) {
if (gfs2_should_freeze(gl)) {
set_bit(GLF_FROZEN, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
return;
}
}
gl->gl_lockref.count++;
set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gfs2_glock_put(gl);
@@ -1326,14 +1326,14 @@ __acquires(&lru_lock)
while(!list_empty(list)) {
gl = list_entry(list->next, struct gfs2_glock, gl_lru);
list_del_init(&gl->gl_lru);
- if (!spin_trylock(&gl->gl_spin)) {
+ if (!spin_trylock(&gl->gl_lockref.lock)) {
add_back_to_lru:
list_add(&gl->gl_lru, &lru_list);
atomic_inc(&lru_count);
continue;
}
if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
goto add_back_to_lru;
}
clear_bit(GLF_LRU, &gl->gl_flags);
@@ -1343,7 +1343,7 @@ add_back_to_lru:
WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags));
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gl->gl_lockref.count--;
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
cond_resched_lock(&lru_lock);
}
}
@@ -1461,10 +1461,10 @@ static void clear_glock(struct gfs2_glock *gl)
{
gfs2_glock_remove_from_lru(gl);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
if (gl->gl_state != LM_ST_UNLOCKED)
handle_callback(gl, LM_ST_UNLOCKED, 0, false);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
gfs2_glock_put(gl);
}
@@ -1482,9 +1482,9 @@ void gfs2_glock_thaw(struct gfs2_sbd *sdp)
static void dump_glock(struct seq_file *seq, struct gfs2_glock *gl)
{
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
gfs2_dump_glock(seq, gl);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
}
static void dump_glock_func(struct gfs2_glock *gl)
@@ -1518,10 +1518,10 @@ void gfs2_glock_finish_truncate(struct gfs2_inode *ip)
ret = gfs2_truncatei_resume(ip);
gfs2_assert_withdraw(gl->gl_name.ln_sbd, ret == 0);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
clear_bit(GLF_LOCK, &gl->gl_flags);
run_queue(gl, 1);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
}
static const char *state2str(unsigned state)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 32572f71f027..f7cdaa8b4c83 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -141,7 +141,7 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
struct pid *pid;
/* Look in glock's list of holders for one with current task as owner */
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
pid = task_pid(current);
list_for_each_entry(gh, &gl->gl_holders, gh_list) {
if (!test_bit(HIF_HOLDER, &gh->gh_iflags))
@@ -151,7 +151,7 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *
}
gh = NULL;
out:
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
return gh;
}
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 1f6c9c3fe5cb..f348cfb6b69a 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -146,11 +146,11 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
struct gfs2_rgrpd *rgd;
int error;
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
rgd = gl->gl_object;
if (rgd)
gfs2_rgrp_brelse(rgd);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags))
return;
@@ -162,11 +162,11 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
mapping_set_error(mapping, error);
gfs2_ail_empty_gl(gl);
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
rgd = gl->gl_object;
if (rgd)
gfs2_free_clones(rgd);
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
}
/**
@@ -542,7 +542,7 @@ static int freeze_go_demote_ok(const struct gfs2_glock *gl)
* iopen_go_callback - schedule the dcache entry for the inode to be deleted
* @gl: the glock
*
- * gl_spin lock is held while calling this
+ * gl_lockref.lock lock is held while calling this
*/
static void iopen_go_callback(struct gfs2_glock *gl, bool remote)
{
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 121ed08d9d9f..de7b4f97ac75 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -334,9 +334,8 @@ struct gfs2_glock {
struct lm_lockname gl_name;
struct lockref gl_lockref;
-#define gl_spin gl_lockref.lock
- /* State fields protected by gl_spin */
+ /* State fields protected by gl_lockref.lock */
unsigned int gl_state:2, /* Current state */
gl_target:2, /* Target state */
gl_demote_state:2, /* State requested by remote node */
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 284c1542783e..8b907c5cc913 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -50,7 +50,7 @@ static inline void gfs2_update_stats(struct gfs2_lkstats *s, unsigned index,
s64 delta = sample - s->stats[index];
s->stats[index] += (delta >> 3);
index++;
- s->stats[index] += ((abs64(delta) - s->stats[index]) >> 2);
+ s->stats[index] += ((abs(delta) - s->stats[index]) >> 2);
}
/**
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 241a399bf83d..fb2b42cf46b5 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -50,7 +50,7 @@ static void gfs2_init_glock_once(void *foo)
struct gfs2_glock *gl = foo;
INIT_HLIST_BL_NODE(&gl->gl_list);
- spin_lock_init(&gl->gl_spin);
+ spin_lock_init(&gl->gl_lockref.lock);
INIT_LIST_HEAD(&gl->gl_holders);
INIT_LIST_HEAD(&gl->gl_lru);
INIT_LIST_HEAD(&gl->gl_ail_list);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 02586e7eb964..baab99b69d8a 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1291,6 +1291,9 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
up_write(&s->s_umount);
blkdev_put(bdev, mode);
down_write(&s->s_umount);
+ } else {
+ /* s_mode must be set before deactivate_locked_super calls */
+ s->s_mode = mode;
}
memset(&args, 0, sizeof(args));
@@ -1314,7 +1317,6 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
} else {
char b[BDEVNAME_SIZE];
- s->s_mode = mode;
strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
sb_set_blocksize(s, block_size(bdev));
error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 475985d14758..c134c0462cee 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -729,9 +729,9 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
rb_erase(n, &sdp->sd_rindex_tree);
if (gl) {
- spin_lock(&gl->gl_spin);
+ spin_lock(&gl->gl_lockref.lock);
gl->gl_object = NULL;
- spin_unlock(&gl->gl_spin);
+ spin_unlock(&gl->gl_lockref.lock);
gfs2_glock_add_to_lru(gl);
gfs2_glock_put(gl);
}
@@ -933,8 +933,9 @@ static int read_rindex_entry(struct gfs2_inode *ip)
goto fail;
rgd->rd_gl->gl_object = rgd;
- rgd->rd_gl->gl_vm.start = rgd->rd_addr * bsize;
- rgd->rd_gl->gl_vm.end = rgd->rd_gl->gl_vm.start + (rgd->rd_length * bsize) - 1;
+ rgd->rd_gl->gl_vm.start = (rgd->rd_addr * bsize) & PAGE_CACHE_MASK;
+ rgd->rd_gl->gl_vm.end = PAGE_CACHE_ALIGN((rgd->rd_addr +
+ rgd->rd_length) * bsize) - 1;
rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
if (rgd->rd_data > sdp->sd_max_rg_data)
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index b95d0d625f32..0c1bde395062 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -176,6 +176,8 @@ void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh)
unlock_buffer(bh);
if (bh->b_private == NULL)
bd = gfs2_alloc_bufdata(gl, bh, &gfs2_databuf_lops);
+ else
+ bd = bh->b_private;
lock_buffer(bh);
gfs2_log_lock(sdp);
}
@@ -236,6 +238,8 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
lock_page(bh->b_page);
if (bh->b_private == NULL)
bd = gfs2_alloc_bufdata(gl, bh, &gfs2_buf_lops);
+ else
+ bd = bh->b_private;
unlock_page(bh->b_page);
lock_buffer(bh);
gfs2_log_lock(sdp);
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 4c096fa9e2a1..53ce76a374fe 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -583,11 +583,13 @@ out:
*
* Returns: actual size of data on success, -errno on error
*/
-static int gfs2_xattr_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int gfs2_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
struct gfs2_inode *ip = GFS2_I(d_inode(dentry));
struct gfs2_ea_location el;
+ int type = handler->flags;
int error;
if (!ip->i_eattr)
@@ -1227,11 +1229,12 @@ int __gfs2_xattr_set(struct inode *inode, const char *name,
return error;
}
-static int gfs2_xattr_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+static int gfs2_xattr_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
return __gfs2_xattr_set(d_inode(dentry), name, value,
- size, flags, type);
+ size, flags, handler->flags);
}
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index 416b1dbafe51..e41a010cd89c 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -849,8 +849,9 @@ end_removexattr:
return err;
}
-static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int hfsplus_osx_getxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
if (!strcmp(name, ""))
return -EINVAL;
@@ -871,8 +872,9 @@ static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name,
return __hfsplus_getxattr(d_inode(dentry), name, buffer, size);
}
-static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags, int type)
+static int hfsplus_osx_setxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size, int flags)
{
if (!strcmp(name, ""))
return -EINVAL;
@@ -893,19 +895,8 @@ static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name,
return __hfsplus_setxattr(d_inode(dentry), name, buffer, size, flags);
}
-static size_t hfsplus_osx_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- /*
- * This method is not used.
- * It is used hfsplus_listxattr() instead of generic_listxattr().
- */
- return -EOPNOTSUPP;
-}
-
const struct xattr_handler hfsplus_xattr_osx_handler = {
.prefix = XATTR_MAC_OSX_PREFIX,
- .list = hfsplus_osx_listxattr,
.get = hfsplus_osx_getxattr,
.set = hfsplus_osx_setxattr,
};
diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c
index aacff00a9ff9..72a68a3a0c99 100644
--- a/fs/hfsplus/xattr_security.c
+++ b/fs/hfsplus/xattr_security.c
@@ -13,32 +13,24 @@
#include "xattr.h"
#include "acl.h"
-static int hfsplus_security_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int hfsplus_security_getxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
return hfsplus_getxattr(dentry, name, buffer, size,
XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN);
}
-static int hfsplus_security_setxattr(struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags, int type)
+static int hfsplus_security_setxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size, int flags)
{
return hfsplus_setxattr(dentry, name, buffer, size, flags,
XATTR_SECURITY_PREFIX,
XATTR_SECURITY_PREFIX_LEN);
}
-static size_t hfsplus_security_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- /*
- * This method is not used.
- * It is used hfsplus_listxattr() instead of generic_listxattr().
- */
- return -EOPNOTSUPP;
-}
-
static int hfsplus_initxattrs(struct inode *inode,
const struct xattr *xattr_array,
void *fs_info)
@@ -92,7 +84,6 @@ int hfsplus_init_inode_security(struct inode *inode,
const struct xattr_handler hfsplus_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = hfsplus_security_listxattr,
.get = hfsplus_security_getxattr,
.set = hfsplus_security_setxattr,
};
diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c
index bcf65089b7f7..95a7704c7abb 100644
--- a/fs/hfsplus/xattr_trusted.c
+++ b/fs/hfsplus/xattr_trusted.c
@@ -11,34 +11,25 @@
#include "hfsplus_fs.h"
#include "xattr.h"
-static int hfsplus_trusted_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int hfsplus_trusted_getxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
return hfsplus_getxattr(dentry, name, buffer, size,
XATTR_TRUSTED_PREFIX,
XATTR_TRUSTED_PREFIX_LEN);
}
-static int hfsplus_trusted_setxattr(struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags, int type)
+static int hfsplus_trusted_setxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size, int flags)
{
return hfsplus_setxattr(dentry, name, buffer, size, flags,
XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
}
-static size_t hfsplus_trusted_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- /*
- * This method is not used.
- * It is used hfsplus_listxattr() instead of generic_listxattr().
- */
- return -EOPNOTSUPP;
-}
-
const struct xattr_handler hfsplus_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
- .list = hfsplus_trusted_listxattr,
.get = hfsplus_trusted_getxattr,
.set = hfsplus_trusted_setxattr,
};
diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c
index 5aa0e6dc4a1e..6fc269baf959 100644
--- a/fs/hfsplus/xattr_user.c
+++ b/fs/hfsplus/xattr_user.c
@@ -11,34 +11,25 @@
#include "hfsplus_fs.h"
#include "xattr.h"
-static int hfsplus_user_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int hfsplus_user_getxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
return hfsplus_getxattr(dentry, name, buffer, size,
XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
}
-static int hfsplus_user_setxattr(struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags, int type)
+static int hfsplus_user_setxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size, int flags)
{
return hfsplus_setxattr(dentry, name, buffer, size, flags,
XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
}
-static size_t hfsplus_user_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- /*
- * This method is not used.
- * It is used hfsplus_listxattr() instead of generic_listxattr().
- */
- return -EOPNOTSUPP;
-}
-
const struct xattr_handler hfsplus_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
- .list = hfsplus_user_listxattr,
.get = hfsplus_user_getxattr,
.set = hfsplus_user_setxattr,
};
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index 9e92c9c2d319..ae4d5a1fa4c9 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -227,8 +227,6 @@ static int hpfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, de
int err;
if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err;
if (hpfs_sb(dir->i_sb)->sb_eas < 2) return -EPERM;
- if (!new_valid_dev(rdev))
- return -EINVAL;
hpfs_lock(dir->i_sb);
err = -ENOSPC;
fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh);
diff --git a/fs/inode.c b/fs/inode.c
index 78a17b8859e1..1be5f9003eb3 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1597,6 +1597,7 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
/**
* touch_atime - update the access time
* @path: the &struct path to update
+ * @inode: inode to update
*
* Update the accessed time on an inode and mark it for writeback.
* This function automatically handles read only file systems and media,
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 8c44654ce274..684996c8a3a4 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -427,7 +427,6 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy)
struct journal_head *last_jh;
struct journal_head *next_jh = jh;
int ret;
- int freed = 0;
if (!jh)
return 0;
@@ -441,10 +440,9 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy)
else
ret = __jbd2_journal_remove_checkpoint(jh) + 1;
if (!ret)
- return freed;
+ return 0;
if (ret == 2)
return 1;
- freed = 1;
/*
* This function only frees up some memory
* if possible so we dont have an obligation
@@ -452,10 +450,10 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy)
* requested:
*/
if (need_resched())
- return freed;
+ return 0;
} while (jh != last_jh);
- return freed;
+ return 0;
}
/*
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 362e5f614450..36345fefa3ff 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -142,8 +142,7 @@ static int journal_submit_commit_record(journal_t *journal,
tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
- if (JBD2_HAS_COMPAT_FEATURE(journal,
- JBD2_FEATURE_COMPAT_CHECKSUM)) {
+ if (jbd2_has_feature_checksum(journal)) {
tmp->h_chksum_type = JBD2_CRC32_CHKSUM;
tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
@@ -157,8 +156,7 @@ static int journal_submit_commit_record(journal_t *journal,
bh->b_end_io = journal_end_buffer_io_sync;
if (journal->j_flags & JBD2_BARRIER &&
- !JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
+ !jbd2_has_feature_async_commit(journal))
ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
else
ret = submit_bh(WRITE_SYNC, bh);
@@ -317,7 +315,7 @@ static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
unsigned long long block)
{
tag->t_blocknr = cpu_to_be32(block & (u32)~0);
- if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_64BIT))
+ if (jbd2_has_feature_64bit(j))
tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
@@ -356,7 +354,7 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
bh->b_size);
kunmap_atomic(addr);
- if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3))
+ if (jbd2_has_feature_csum3(j))
tag3->t_checksum = cpu_to_be32(csum32);
else
tag->t_checksum = cpu_to_be16(csum32);
@@ -730,8 +728,7 @@ start_journal_io:
/*
* Compute checksum.
*/
- if (JBD2_HAS_COMPAT_FEATURE(journal,
- JBD2_FEATURE_COMPAT_CHECKSUM)) {
+ if (jbd2_has_feature_checksum(journal)) {
crc32_sum =
jbd2_checksum_data(crc32_sum, bh);
}
@@ -797,8 +794,7 @@ start_journal_io:
blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
/* Done it all: now write the commit record asynchronously. */
- if (JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ if (jbd2_has_feature_async_commit(journal)) {
err = journal_submit_commit_record(journal, commit_transaction,
&cbh, crc32_sum);
if (err)
@@ -889,8 +885,7 @@ start_journal_io:
commit_transaction->t_state = T_COMMIT_JFLUSH;
write_unlock(&journal->j_state_lock);
- if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ if (!jbd2_has_feature_async_commit(journal)) {
err = journal_submit_commit_record(journal, commit_transaction,
&cbh, crc32_sum);
if (err)
@@ -898,8 +893,7 @@ start_journal_io:
}
if (cbh)
err = journal_wait_on_commit_record(journal, cbh);
- if (JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
+ if (jbd2_has_feature_async_commit(journal) &&
journal->j_flags & JBD2_BARRIER) {
blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
}
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 8270fe9e3641..81e622681c82 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -124,7 +124,7 @@ EXPORT_SYMBOL(__jbd2_debug);
/* Checksumming functions */
static int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
{
- if (!jbd2_journal_has_csum_v2or3(j))
+ if (!jbd2_journal_has_csum_v2or3_feature(j))
return 1;
return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
@@ -1523,16 +1523,16 @@ static int journal_get_superblock(journal_t *journal)
goto out;
}
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) &&
- JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) {
+ if (jbd2_has_feature_csum2(journal) &&
+ jbd2_has_feature_csum3(journal)) {
/* Can't have checksum v2 and v3 at the same time! */
printk(KERN_ERR "JBD2: Can't enable checksumming v2 and v3 "
"at the same time!\n");
goto out;
}
- if (jbd2_journal_has_csum_v2or3(journal) &&
- JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) {
+ if (jbd2_journal_has_csum_v2or3_feature(journal) &&
+ jbd2_has_feature_checksum(journal)) {
/* Can't have checksum v1 and v2 on at the same time! */
printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 "
"at the same time!\n");
@@ -1545,7 +1545,7 @@ static int journal_get_superblock(journal_t *journal)
}
/* Load the checksum driver */
- if (jbd2_journal_has_csum_v2or3(journal)) {
+ if (jbd2_journal_has_csum_v2or3_feature(journal)) {
journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
if (IS_ERR(journal->j_chksum_driver)) {
printk(KERN_ERR "JBD2: Cannot load crc32c driver.\n");
@@ -1558,6 +1558,7 @@ static int journal_get_superblock(journal_t *journal)
/* Check superblock checksum */
if (!jbd2_superblock_csum_verify(journal, sb)) {
printk(KERN_ERR "JBD2: journal checksum error\n");
+ err = -EFSBADCRC;
goto out;
}
@@ -1649,7 +1650,7 @@ int jbd2_journal_load(journal_t *journal)
printk(KERN_ERR "JBD2: journal transaction %u on %s "
"is corrupt.\n", journal->j_failed_commit,
journal->j_devname);
- return -EIO;
+ return -EFSCORRUPTED;
}
/* OK, we've finished with the dynamic journal bits:
@@ -2071,8 +2072,12 @@ static void __journal_abort_soft (journal_t *journal, int errno)
__jbd2_journal_abort_hard(journal);
- if (errno)
+ if (errno) {
jbd2_journal_update_sb_errno(journal);
+ write_lock(&journal->j_state_lock);
+ journal->j_flags |= JBD2_REC_ERR;
+ write_unlock(&journal->j_state_lock);
+ }
}
/**
@@ -2197,15 +2202,15 @@ size_t journal_tag_bytes(journal_t *journal)
{
size_t sz;
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3))
+ if (jbd2_has_feature_csum3(journal))
return sizeof(journal_block_tag3_t);
sz = sizeof(journal_block_tag_t);
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+ if (jbd2_has_feature_csum2(journal))
sz += sizeof(__u16);
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+ if (jbd2_has_feature_64bit(journal))
return sz;
else
return sz - sizeof(__u32);
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index a9079d035ae5..7f277e49fe88 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -140,7 +140,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
if (offset >= journal->j_maxlen) {
printk(KERN_ERR "JBD2: corrupted journal superblock\n");
- return -EIO;
+ return -EFSCORRUPTED;
}
err = jbd2_journal_bmap(journal, offset, &blocknr);
@@ -342,7 +342,7 @@ static inline unsigned long long read_tag_block(journal_t *journal,
journal_block_tag_t *tag)
{
unsigned long long block = be32_to_cpu(tag->t_blocknr);
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+ if (jbd2_has_feature_64bit(journal))
block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32;
return block;
}
@@ -411,7 +411,7 @@ static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
- if (JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V3))
+ if (jbd2_has_feature_csum3(j))
return tag3->t_checksum == cpu_to_be32(csum32);
else
return tag->t_checksum == cpu_to_be16(csum32);
@@ -527,7 +527,7 @@ static int do_one_pass(journal_t *journal,
printk(KERN_ERR "JBD2: Invalid checksum "
"recovering block %lu in log\n",
next_log_block);
- err = -EIO;
+ err = -EFSBADCRC;
brelse(bh);
goto failed;
}
@@ -538,8 +538,7 @@ static int do_one_pass(journal_t *journal,
* just skip over the blocks it describes. */
if (pass != PASS_REPLAY) {
if (pass == PASS_SCAN &&
- JBD2_HAS_COMPAT_FEATURE(journal,
- JBD2_FEATURE_COMPAT_CHECKSUM) &&
+ jbd2_has_feature_checksum(journal) &&
!info->end_transaction) {
if (calc_chksums(journal, bh,
&next_log_block,
@@ -602,7 +601,7 @@ static int do_one_pass(journal_t *journal,
journal, tag, obh->b_data,
be32_to_cpu(tmp->h_sequence))) {
brelse(obh);
- success = -EIO;
+ success = -EFSBADCRC;
printk(KERN_ERR "JBD2: Invalid "
"checksum recovering "
"block %llu in log\n",
@@ -694,8 +693,7 @@ static int do_one_pass(journal_t *journal,
* much to do other than move on to the next sequence
* number. */
if (pass == PASS_SCAN &&
- JBD2_HAS_COMPAT_FEATURE(journal,
- JBD2_FEATURE_COMPAT_CHECKSUM)) {
+ jbd2_has_feature_checksum(journal)) {
int chksum_err, chksum_seen;
struct commit_header *cbh =
(struct commit_header *)bh->b_data;
@@ -735,8 +733,7 @@ static int do_one_pass(journal_t *journal,
if (chksum_err) {
info->end_transaction = next_commit_ID;
- if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)){
+ if (!jbd2_has_feature_async_commit(journal)) {
journal->j_failed_commit =
next_commit_ID;
brelse(bh);
@@ -750,8 +747,7 @@ static int do_one_pass(journal_t *journal,
bh->b_data)) {
info->end_transaction = next_commit_ID;
- if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
- JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+ if (!jbd2_has_feature_async_commit(journal)) {
journal->j_failed_commit =
next_commit_ID;
brelse(bh);
@@ -851,7 +847,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
rcount = be32_to_cpu(header->r_count);
if (!jbd2_revoke_block_csum_verify(journal, header))
- return -EINVAL;
+ return -EFSBADCRC;
if (jbd2_journal_has_csum_v2or3(journal))
csum_size = sizeof(struct jbd2_journal_revoke_tail);
@@ -859,7 +855,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
return -EINVAL;
max = rcount;
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+ if (jbd2_has_feature_64bit(journal))
record_len = 8;
while (offset + record_len <= max) {
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 0abf2e7f725b..705ae577882b 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -589,7 +589,7 @@ static void write_one_revoke_record(journal_t *journal,
if (jbd2_journal_has_csum_v2or3(journal))
csum_size = sizeof(struct jbd2_journal_revoke_tail);
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+ if (jbd2_has_feature_64bit(journal))
sz = 8;
else
sz = 4;
@@ -619,7 +619,7 @@ static void write_one_revoke_record(journal_t *journal,
*descriptorp = descriptor;
}
- if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
+ if (jbd2_has_feature_64bit(journal))
* ((__be64 *)(&descriptor->b_data[offset])) =
cpu_to_be64(record->blocknr);
else
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 6b8338ec2464..89463eee6791 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1937,8 +1937,8 @@ out:
* @journal: journal for operation
* @page: to try and free
* @gfp_mask: we use the mask to detect how hard should we try to release
- * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
- * release the buffers.
+ * buffers. If __GFP_DIRECT_RECLAIM and __GFP_FS is set, we wait for commit
+ * code to release the buffers.
*
*
* For all the buffers on this page,
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index bb9cebc9ca8a..e5c1783ab64a 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -80,7 +80,6 @@ static int jffs2_garbage_collect_thread(void *_c)
siginitset(&hupmask, sigmask(SIGHUP));
allow_signal(SIGKILL);
allow_signal(SIGSTOP);
- allow_signal(SIGCONT);
allow_signal(SIGHUP);
c->gc_task = current;
@@ -121,20 +120,18 @@ static int jffs2_garbage_collect_thread(void *_c)
/* Put_super will send a SIGKILL and then wait on the sem.
*/
while (signal_pending(current) || freezing(current)) {
- siginfo_t info;
unsigned long signr;
if (try_to_freeze())
goto again;
- signr = dequeue_signal_lock(current, &current->blocked, &info);
+ signr = kernel_dequeue_signal(NULL);
switch(signr) {
case SIGSTOP:
jffs2_dbg(1, "%s(): SIGSTOP received\n",
__func__);
- set_current_state(TASK_STOPPED);
- schedule();
+ kernel_signal_stop();
break;
case SIGKILL:
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 81180022923f..d211b8e18566 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -621,9 +621,6 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode
uint32_t alloclen;
int ret;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
ri = jffs2_alloc_raw_inode();
if (!ri)
return -ENOMEM;
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index b8fd651307a4..ce1189793288 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -97,25 +97,16 @@ int __init jffs2_create_slab_caches(void)
void jffs2_destroy_slab_caches(void)
{
- if(full_dnode_slab)
- kmem_cache_destroy(full_dnode_slab);
- if(raw_dirent_slab)
- kmem_cache_destroy(raw_dirent_slab);
- if(raw_inode_slab)
- kmem_cache_destroy(raw_inode_slab);
- if(tmp_dnode_info_slab)
- kmem_cache_destroy(tmp_dnode_info_slab);
- if(raw_node_ref_slab)
- kmem_cache_destroy(raw_node_ref_slab);
- if(node_frag_slab)
- kmem_cache_destroy(node_frag_slab);
- if(inode_cache_slab)
- kmem_cache_destroy(inode_cache_slab);
+ kmem_cache_destroy(full_dnode_slab);
+ kmem_cache_destroy(raw_dirent_slab);
+ kmem_cache_destroy(raw_inode_slab);
+ kmem_cache_destroy(tmp_dnode_info_slab);
+ kmem_cache_destroy(raw_node_ref_slab);
+ kmem_cache_destroy(node_frag_slab);
+ kmem_cache_destroy(inode_cache_slab);
#ifdef CONFIG_JFFS2_FS_XATTR
- if (xattr_datum_cache)
- kmem_cache_destroy(xattr_datum_cache);
- if (xattr_ref_cache)
- kmem_cache_destroy(xattr_ref_cache);
+ kmem_cache_destroy(xattr_datum_cache);
+ kmem_cache_destroy(xattr_ref_cache);
#endif
}
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 28e0aab42bc3..bfebbf13698c 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -660,8 +660,12 @@ static inline int read_direntry(struct jffs2_sb_info *c, struct jffs2_raw_node_r
err = jffs2_flash_read(c, (ref_offset(ref)) + read,
rd->nsize - already, &read, &fd->name[already]);
- if (unlikely(read != rd->nsize - already) && likely(!err))
+ if (unlikely(read != rd->nsize - already) && likely(!err)) {
+ jffs2_free_full_dirent(fd);
+ JFFS2_ERROR("short read: wanted %d bytes, got %zd\n",
+ rd->nsize - already, read);
return -EIO;
+ }
if (unlikely(err)) {
JFFS2_ERROR("read remainder of name: error %d\n", err);
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index d4b43fb7adb1..bf12fe5f83d7 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -48,8 +48,9 @@ int jffs2_init_security(struct inode *inode, struct inode *dir,
}
/* ---- XATTR Handler for "security.*" ----------------- */
-static int jffs2_security_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int jffs2_security_getxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
if (!strcmp(name, ""))
return -EINVAL;
@@ -58,8 +59,9 @@ static int jffs2_security_getxattr(struct dentry *dentry, const char *name,
name, buffer, size);
}
-static int jffs2_security_setxattr(struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags, int type)
+static int jffs2_security_setxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size, int flags)
{
if (!strcmp(name, ""))
return -EINVAL;
@@ -68,8 +70,10 @@ static int jffs2_security_setxattr(struct dentry *dentry, const char *name,
name, buffer, size, flags);
}
-static size_t jffs2_security_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
+static size_t jffs2_security_listxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, char *list,
+ size_t list_size, const char *name,
+ size_t name_len)
{
size_t retlen = XATTR_SECURITY_PREFIX_LEN + name_len + 1;
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index 09ed55190ee2..f3a4857ff071 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1264,7 +1264,7 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) {
if ((c->flash_size % c->sector_size) != 0) {
c->flash_size = (c->flash_size / c->sector_size) * c->sector_size;
pr_warn("flash size adjusted to %dKiB\n", c->flash_size);
- };
+ }
c->wbuf_ofs = 0xFFFFFFFF;
c->wbuf = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
@@ -1274,7 +1274,6 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) {
#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY
c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
if (!c->wbuf_verify) {
- kfree(c->oobbuf);
kfree(c->wbuf);
return -ENOMEM;
}
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index f092fee5be50..4c2c03663533 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -1001,11 +1001,12 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
if (!xhandle)
continue;
if (buffer) {
- rc = xhandle->list(dentry, buffer+len, size-len,
- xd->xname, xd->name_len, xd->flags);
+ rc = xhandle->list(xhandle, dentry, buffer + len,
+ size - len, xd->xname,
+ xd->name_len);
} else {
- rc = xhandle->list(dentry, NULL, 0, xd->xname,
- xd->name_len, xd->flags);
+ rc = xhandle->list(xhandle, dentry, NULL, 0,
+ xd->xname, xd->name_len);
}
if (rc < 0)
goto out;
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index ceaf9c693225..a562da0d6a26 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -16,8 +16,9 @@
#include <linux/mtd/mtd.h>
#include "nodelist.h"
-static int jffs2_trusted_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int jffs2_trusted_getxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
if (!strcmp(name, ""))
return -EINVAL;
@@ -25,8 +26,9 @@ static int jffs2_trusted_getxattr(struct dentry *dentry, const char *name,
name, buffer, size);
}
-static int jffs2_trusted_setxattr(struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags, int type)
+static int jffs2_trusted_setxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size, int flags)
{
if (!strcmp(name, ""))
return -EINVAL;
@@ -34,11 +36,16 @@ static int jffs2_trusted_setxattr(struct dentry *dentry, const char *name,
name, buffer, size, flags);
}
-static size_t jffs2_trusted_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
+static size_t jffs2_trusted_listxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, char *list,
+ size_t list_size, const char *name,
+ size_t name_len)
{
size_t retlen = XATTR_TRUSTED_PREFIX_LEN + name_len + 1;
+ if (!capable(CAP_SYS_ADMIN))
+ return 0;
+
if (list && retlen<=list_size) {
strcpy(list, XATTR_TRUSTED_PREFIX);
strcpy(list + XATTR_TRUSTED_PREFIX_LEN, name);
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index a71391eba514..cbc0472e59a8 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -16,8 +16,9 @@
#include <linux/mtd/mtd.h>
#include "nodelist.h"
-static int jffs2_user_getxattr(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int jffs2_user_getxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
if (!strcmp(name, ""))
return -EINVAL;
@@ -25,8 +26,9 @@ static int jffs2_user_getxattr(struct dentry *dentry, const char *name,
name, buffer, size);
}
-static int jffs2_user_setxattr(struct dentry *dentry, const char *name,
- const void *buffer, size_t size, int flags, int type)
+static int jffs2_user_setxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *buffer, size_t size, int flags)
{
if (!strcmp(name, ""))
return -EINVAL;
@@ -34,8 +36,10 @@ static int jffs2_user_setxattr(struct dentry *dentry, const char *name,
name, buffer, size, flags);
}
-static size_t jffs2_user_listxattr(struct dentry *dentry, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
+static size_t jffs2_user_listxattr(const struct xattr_handler *handler,
+ struct dentry *dentry, char *list,
+ size_t list_size, const char *name,
+ size_t name_len)
{
size_t retlen = XATTR_USER_PREFIX_LEN + name_len + 1;
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 35976bdccafc..9d7551f5c32a 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1372,9 +1372,6 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
tid_t tid;
struct tblock *tblk;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
jfs_info("jfs_mknod: %pd", dentry);
rc = dquot_initialize(dir);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 4cd9798f4948..8f9176caf098 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -496,9 +496,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
jfs_info("In jfs_read_super: s_flags=0x%lx", sb->s_flags);
- if (!new_valid_dev(sb->s_bdev->bd_dev))
- return -EOVERFLOW;
-
sbi = kzalloc(sizeof(struct jfs_sb_info), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index acd394716349..112952037933 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -474,18 +474,7 @@ static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *ho
static int do_vfs_lock(struct file_lock *fl)
{
- int res = 0;
- switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
- case FL_POSIX:
- res = posix_lock_file_wait(fl->fl_file, fl);
- break;
- case FL_FLOCK:
- res = flock_lock_file_wait(fl->fl_file, fl);
- break;
- default:
- BUG();
- }
- return res;
+ return locks_lock_file_wait(fl->fl_file, fl);
}
/*
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index 969d589c848d..d716c9993a26 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -116,7 +116,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
atomic_inc(&nsm->sm_count);
else {
host = NULL;
- nsm = nsm_get_handle(ni->sap, ni->salen,
+ nsm = nsm_get_handle(ni->net, ni->sap, ni->salen,
ni->hostname, ni->hostname_len);
if (unlikely(nsm == NULL)) {
dprintk("lockd: %s failed; no nsm handle\n",
@@ -161,6 +161,7 @@ static struct nlm_host *nlm_alloc_host(struct nlm_lookup_host_info *ni,
host->h_nsmhandle = nsm;
host->h_addrbuf = nsm->sm_addrbuf;
host->net = ni->net;
+ strlcpy(host->nodename, utsname()->nodename, sizeof(host->nodename));
out:
return host;
@@ -534,17 +535,18 @@ static struct nlm_host *next_host_state(struct hlist_head *cache,
/**
* nlm_host_rebooted - Release all resources held by rebooted host
+ * @net: network namespace
* @info: pointer to decoded results of NLM_SM_NOTIFY call
*
* We were notified that the specified host has rebooted. Release
* all resources held by that peer.
*/
-void nlm_host_rebooted(const struct nlm_reboot *info)
+void nlm_host_rebooted(const struct net *net, const struct nlm_reboot *info)
{
struct nsm_handle *nsm;
struct nlm_host *host;
- nsm = nsm_reboot_lookup(info);
+ nsm = nsm_reboot_lookup(net, info);
if (unlikely(nsm == NULL))
return;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 47a32b6d9b90..19166d4a8d31 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -42,7 +42,7 @@ struct nsm_args {
u32 proc;
char *mon_name;
- char *nodename;
+ const char *nodename;
};
struct nsm_res {
@@ -51,7 +51,6 @@ struct nsm_res {
};
static const struct rpc_program nsm_program;
-static LIST_HEAD(nsm_handles);
static DEFINE_SPINLOCK(nsm_lock);
/*
@@ -87,69 +86,18 @@ static struct rpc_clnt *nsm_create(struct net *net, const char *nodename)
return rpc_create(&args);
}
-static struct rpc_clnt *nsm_client_set(struct lockd_net *ln,
- struct rpc_clnt *clnt)
-{
- spin_lock(&ln->nsm_clnt_lock);
- if (ln->nsm_users == 0) {
- if (clnt == NULL)
- goto out;
- ln->nsm_clnt = clnt;
- }
- clnt = ln->nsm_clnt;
- ln->nsm_users++;
-out:
- spin_unlock(&ln->nsm_clnt_lock);
- return clnt;
-}
-
-static struct rpc_clnt *nsm_client_get(struct net *net, const char *nodename)
-{
- struct rpc_clnt *clnt, *new;
- struct lockd_net *ln = net_generic(net, lockd_net_id);
-
- clnt = nsm_client_set(ln, NULL);
- if (clnt != NULL)
- goto out;
-
- clnt = new = nsm_create(net, nodename);
- if (IS_ERR(clnt))
- goto out;
-
- clnt = nsm_client_set(ln, new);
- if (clnt != new)
- rpc_shutdown_client(new);
-out:
- return clnt;
-}
-
-static void nsm_client_put(struct net *net)
-{
- struct lockd_net *ln = net_generic(net, lockd_net_id);
- struct rpc_clnt *clnt = NULL;
-
- spin_lock(&ln->nsm_clnt_lock);
- ln->nsm_users--;
- if (ln->nsm_users == 0) {
- clnt = ln->nsm_clnt;
- ln->nsm_clnt = NULL;
- }
- spin_unlock(&ln->nsm_clnt_lock);
- if (clnt != NULL)
- rpc_shutdown_client(clnt);
-}
-
static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
- struct rpc_clnt *clnt)
+ const struct nlm_host *host)
{
int status;
+ struct rpc_clnt *clnt;
struct nsm_args args = {
.priv = &nsm->sm_priv,
.prog = NLM_PROGRAM,
.vers = 3,
.proc = NLMPROC_NSM_NOTIFY,
.mon_name = nsm->sm_mon_name,
- .nodename = clnt->cl_nodename,
+ .nodename = host->nodename,
};
struct rpc_message msg = {
.rpc_argp = &args,
@@ -158,6 +106,13 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
memset(res, 0, sizeof(*res));
+ clnt = nsm_create(host->net, host->nodename);
+ if (IS_ERR(clnt)) {
+ dprintk("lockd: failed to create NSM upcall transport, "
+ "status=%ld, net=%p\n", PTR_ERR(clnt), host->net);
+ return PTR_ERR(clnt);
+ }
+
msg.rpc_proc = &clnt->cl_procinfo[proc];
status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN);
if (status == -ECONNREFUSED) {
@@ -171,6 +126,8 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res,
status);
else
status = 0;
+
+ rpc_shutdown_client(clnt);
return status;
}
@@ -190,32 +147,19 @@ int nsm_monitor(const struct nlm_host *host)
struct nsm_handle *nsm = host->h_nsmhandle;
struct nsm_res res;
int status;
- struct rpc_clnt *clnt;
- const char *nodename = NULL;
dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name);
if (nsm->sm_monitored)
return 0;
- if (host->h_rpcclnt)
- nodename = host->h_rpcclnt->cl_nodename;
-
/*
* Choose whether to record the caller_name or IP address of
* this peer in the local rpc.statd's database.
*/
nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
- clnt = nsm_client_get(host->net, nodename);
- if (IS_ERR(clnt)) {
- status = PTR_ERR(clnt);
- dprintk("lockd: failed to create NSM upcall transport, "
- "status=%d, net=%p\n", status, host->net);
- return status;
- }
-
- status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, clnt);
+ status = nsm_mon_unmon(nsm, NSMPROC_MON, &res, host);
if (unlikely(res.status != 0))
status = -EIO;
if (unlikely(status < 0)) {
@@ -247,11 +191,9 @@ void nsm_unmonitor(const struct nlm_host *host)
if (atomic_read(&nsm->sm_count) == 1
&& nsm->sm_monitored && !nsm->sm_sticky) {
- struct lockd_net *ln = net_generic(host->net, lockd_net_id);
-
dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name);
- status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, ln->nsm_clnt);
+ status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res, host);
if (res.status != 0)
status = -EIO;
if (status < 0)
@@ -259,38 +201,38 @@ void nsm_unmonitor(const struct nlm_host *host)
nsm->sm_name);
else
nsm->sm_monitored = 0;
-
- nsm_client_put(host->net);
}
}
-static struct nsm_handle *nsm_lookup_hostname(const char *hostname,
- const size_t len)
+static struct nsm_handle *nsm_lookup_hostname(const struct list_head *nsm_handles,
+ const char *hostname, const size_t len)
{
struct nsm_handle *nsm;
- list_for_each_entry(nsm, &nsm_handles, sm_link)
+ list_for_each_entry(nsm, nsm_handles, sm_link)
if (strlen(nsm->sm_name) == len &&
memcmp(nsm->sm_name, hostname, len) == 0)
return nsm;
return NULL;
}
-static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap)
+static struct nsm_handle *nsm_lookup_addr(const struct list_head *nsm_handles,
+ const struct sockaddr *sap)
{
struct nsm_handle *nsm;
- list_for_each_entry(nsm, &nsm_handles, sm_link)
+ list_for_each_entry(nsm, nsm_handles, sm_link)
if (rpc_cmp_addr(nsm_addr(nsm), sap))
return nsm;
return NULL;
}
-static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv)
+static struct nsm_handle *nsm_lookup_priv(const struct list_head *nsm_handles,
+ const struct nsm_private *priv)
{
struct nsm_handle *nsm;
- list_for_each_entry(nsm, &nsm_handles, sm_link)
+ list_for_each_entry(nsm, nsm_handles, sm_link)
if (memcmp(nsm->sm_priv.data, priv->data,
sizeof(priv->data)) == 0)
return nsm;
@@ -353,6 +295,7 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
/**
* nsm_get_handle - Find or create a cached nsm_handle
+ * @net: network namespace
* @sap: pointer to socket address of handle to find
* @salen: length of socket address
* @hostname: pointer to C string containing hostname to find
@@ -365,11 +308,13 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
* @hostname cannot be found in the handle cache. Returns NULL if
* an error occurs.
*/
-struct nsm_handle *nsm_get_handle(const struct sockaddr *sap,
+struct nsm_handle *nsm_get_handle(const struct net *net,
+ const struct sockaddr *sap,
const size_t salen, const char *hostname,
const size_t hostname_len)
{
struct nsm_handle *cached, *new = NULL;
+ struct lockd_net *ln = net_generic(net, lockd_net_id);
if (hostname && memchr(hostname, '/', hostname_len) != NULL) {
if (printk_ratelimit()) {
@@ -384,9 +329,10 @@ retry:
spin_lock(&nsm_lock);
if (nsm_use_hostnames && hostname != NULL)
- cached = nsm_lookup_hostname(hostname, hostname_len);
+ cached = nsm_lookup_hostname(&ln->nsm_handles,
+ hostname, hostname_len);
else
- cached = nsm_lookup_addr(sap);
+ cached = nsm_lookup_addr(&ln->nsm_handles, sap);
if (cached != NULL) {
atomic_inc(&cached->sm_count);
@@ -400,7 +346,7 @@ retry:
}
if (new != NULL) {
- list_add(&new->sm_link, &nsm_handles);
+ list_add(&new->sm_link, &ln->nsm_handles);
spin_unlock(&nsm_lock);
dprintk("lockd: created nsm_handle for %s (%s)\n",
new->sm_name, new->sm_addrbuf);
@@ -417,19 +363,22 @@ retry:
/**
* nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle
+ * @net: network namespace
* @info: pointer to NLMPROC_SM_NOTIFY arguments
*
* Returns a matching nsm_handle if found in the nsm cache. The returned
* nsm_handle's reference count is bumped. Otherwise returns NULL if some
* error occurred.
*/
-struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info)
+struct nsm_handle *nsm_reboot_lookup(const struct net *net,
+ const struct nlm_reboot *info)
{
struct nsm_handle *cached;
+ struct lockd_net *ln = net_generic(net, lockd_net_id);
spin_lock(&nsm_lock);
- cached = nsm_lookup_priv(&info->priv);
+ cached = nsm_lookup_priv(&ln->nsm_handles, &info->priv);
if (unlikely(cached == NULL)) {
spin_unlock(&nsm_lock);
dprintk("lockd: never saw rebooted peer '%.*s' before\n",
diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h
index 097bfa3adb1c..5426189406c1 100644
--- a/fs/lockd/netns.h
+++ b/fs/lockd/netns.h
@@ -12,9 +12,7 @@ struct lockd_net {
struct delayed_work grace_period_end;
struct lock_manager lockd_manager;
- spinlock_t nsm_clnt_lock;
- unsigned int nsm_users;
- struct rpc_clnt *nsm_clnt;
+ struct list_head nsm_handles;
};
extern int lockd_net_id;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index d678bcc3cbcb..5f31ebd96c06 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -592,7 +592,7 @@ static int lockd_init_net(struct net *net)
INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender);
INIT_LIST_HEAD(&ln->lockd_manager.list);
ln->lockd_manager.block_opens = false;
- spin_lock_init(&ln->nsm_clnt_lock);
+ INIT_LIST_HEAD(&ln->nsm_handles);
return 0;
}
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index b147d1ae71fd..09c576f26c7b 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -421,7 +421,7 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
return rpc_system_err;
}
- nlm_host_rebooted(argp);
+ nlm_host_rebooted(SVC_NET(rqstp), argp);
return rpc_success;
}
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 21171f0c6477..fb26b9f522e7 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -464,7 +464,7 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp,
return rpc_system_err;
}
- nlm_host_rebooted(argp);
+ nlm_host_rebooted(SVC_NET(rqstp), argp);
return rpc_success;
}
diff --git a/fs/locks.c b/fs/locks.c
index 2a54c800a223..0d2b3267e2a3 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -205,28 +205,32 @@ static struct kmem_cache *filelock_cache __read_mostly;
static struct file_lock_context *
locks_get_lock_context(struct inode *inode, int type)
{
- struct file_lock_context *new;
+ struct file_lock_context *ctx;
- if (likely(inode->i_flctx) || type == F_UNLCK)
+ /* paired with cmpxchg() below */
+ ctx = smp_load_acquire(&inode->i_flctx);
+ if (likely(ctx) || type == F_UNLCK)
goto out;
- new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
- if (!new)
+ ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
+ if (!ctx)
goto out;
- spin_lock_init(&new->flc_lock);
- INIT_LIST_HEAD(&new->flc_flock);
- INIT_LIST_HEAD(&new->flc_posix);
- INIT_LIST_HEAD(&new->flc_lease);
+ spin_lock_init(&ctx->flc_lock);
+ INIT_LIST_HEAD(&ctx->flc_flock);
+ INIT_LIST_HEAD(&ctx->flc_posix);
+ INIT_LIST_HEAD(&ctx->flc_lease);
/*
* Assign the pointer if it's not already assigned. If it is, then
* free the context we just allocated.
*/
- if (cmpxchg(&inode->i_flctx, NULL, new))
- kmem_cache_free(flctx_cache, new);
+ if (cmpxchg(&inode->i_flctx, NULL, ctx)) {
+ kmem_cache_free(flctx_cache, ctx);
+ ctx = smp_load_acquire(&inode->i_flctx);
+ }
out:
- return inode->i_flctx;
+ return ctx;
}
void
@@ -762,7 +766,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
struct file_lock_context *ctx;
struct inode *inode = file_inode(filp);
- ctx = inode->i_flctx;
+ ctx = smp_load_acquire(&inode->i_flctx);
if (!ctx || list_empty_careful(&ctx->flc_posix)) {
fl->fl_type = F_UNLCK;
return;
@@ -1167,10 +1171,9 @@ EXPORT_SYMBOL(posix_lock_file);
* @inode: inode of file to which lock request should be applied
* @fl: The lock to be applied
*
- * Variant of posix_lock_file_wait that does not take a filp, and so can be
- * used after the filp has already been torn down.
+ * Apply a POSIX style lock request to an inode.
*/
-int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
+static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
int error;
might_sleep ();
@@ -1187,7 +1190,6 @@ int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
}
return error;
}
-EXPORT_SYMBOL(posix_lock_inode_wait);
/**
* locks_mandatory_locked - Check for an active lock
@@ -1203,7 +1205,7 @@ int locks_mandatory_locked(struct file *file)
struct file_lock_context *ctx;
struct file_lock *fl;
- ctx = inode->i_flctx;
+ ctx = smp_load_acquire(&inode->i_flctx);
if (!ctx || list_empty_careful(&ctx->flc_posix))
return 0;
@@ -1388,7 +1390,7 @@ any_leases_conflict(struct inode *inode, struct file_lock *breaker)
int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
{
int error = 0;
- struct file_lock_context *ctx = inode->i_flctx;
+ struct file_lock_context *ctx;
struct file_lock *new_fl, *fl, *tmp;
unsigned long break_time;
int want_write = (mode & O_ACCMODE) != O_RDONLY;
@@ -1400,6 +1402,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
new_fl->fl_flags = type;
/* typically we will check that ctx is non-NULL before calling */
+ ctx = smp_load_acquire(&inode->i_flctx);
if (!ctx) {
WARN_ON_ONCE(1);
return error;
@@ -1494,9 +1497,10 @@ EXPORT_SYMBOL(__break_lease);
void lease_get_mtime(struct inode *inode, struct timespec *time)
{
bool has_lease = false;
- struct file_lock_context *ctx = inode->i_flctx;
+ struct file_lock_context *ctx;
struct file_lock *fl;
+ ctx = smp_load_acquire(&inode->i_flctx);
if (ctx && !list_empty_careful(&ctx->flc_lease)) {
spin_lock(&ctx->flc_lock);
if (!list_empty(&ctx->flc_lease)) {
@@ -1543,10 +1547,11 @@ int fcntl_getlease(struct file *filp)
{
struct file_lock *fl;
struct inode *inode = file_inode(filp);
- struct file_lock_context *ctx = inode->i_flctx;
+ struct file_lock_context *ctx;
int type = F_UNLCK;
LIST_HEAD(dispose);
+ ctx = smp_load_acquire(&inode->i_flctx);
if (ctx && !list_empty_careful(&ctx->flc_lease)) {
spin_lock(&ctx->flc_lock);
time_out_leases(file_inode(filp), &dispose);
@@ -1711,11 +1716,11 @@ static int generic_delete_lease(struct file *filp, void *owner)
{
int error = -EAGAIN;
struct file_lock *fl, *victim = NULL;
- struct dentry *dentry = filp->f_path.dentry;
- struct inode *inode = dentry->d_inode;
- struct file_lock_context *ctx = inode->i_flctx;
+ struct inode *inode = file_inode(filp);
+ struct file_lock_context *ctx;
LIST_HEAD(dispose);
+ ctx = smp_load_acquire(&inode->i_flctx);
if (!ctx) {
trace_generic_delete_lease(inode, NULL);
return error;
@@ -1751,8 +1756,7 @@ static int generic_delete_lease(struct file *filp, void *owner)
int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
void **priv)
{
- struct dentry *dentry = filp->f_path.dentry;
- struct inode *inode = dentry->d_inode;
+ struct inode *inode = file_inode(filp);
int error;
if ((!uid_eq(current_fsuid(), inode->i_uid)) && !capable(CAP_LEASE))
@@ -1856,7 +1860,7 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
*
* Apply a FLOCK style lock request to an inode.
*/
-int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
+static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
{
int error;
might_sleep();
@@ -1873,7 +1877,30 @@ int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
}
return error;
}
-EXPORT_SYMBOL(flock_lock_inode_wait);
+
+/**
+ * locks_lock_inode_wait - Apply a lock to an inode
+ * @inode: inode of the file to apply to
+ * @fl: The lock to be applied
+ *
+ * Apply a POSIX or FLOCK style lock request to an inode.
+ */
+int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl)
+{
+ int res = 0;
+ switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
+ case FL_POSIX:
+ res = posix_lock_inode_wait(inode, fl);
+ break;
+ case FL_FLOCK:
+ res = flock_lock_inode_wait(inode, fl);
+ break;
+ default:
+ BUG();
+ }
+ return res;
+}
+EXPORT_SYMBOL(locks_lock_inode_wait);
/**
* sys_flock: - flock() system call.
@@ -1931,7 +1958,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
(can_sleep) ? F_SETLKW : F_SETLK,
lock);
else
- error = flock_lock_file_wait(f.file, lock);
+ error = locks_lock_file_wait(f.file, lock);
out_free:
locks_free_lock(lock);
@@ -2107,7 +2134,7 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
return error;
}
-/* Ensure that fl->fl_filp has compatible f_mode for F_SETLK calls */
+/* Ensure that fl->fl_file has compatible f_mode for F_SETLK calls */
static int
check_fmode_for_setlk(struct file_lock *fl)
{
@@ -2359,13 +2386,14 @@ out:
void locks_remove_posix(struct file *filp, fl_owner_t owner)
{
struct file_lock lock;
- struct file_lock_context *ctx = file_inode(filp)->i_flctx;
+ struct file_lock_context *ctx;
/*
* If there are no locks held on this file, we don't need to call
* posix_lock_file(). Another process could be setting a lock on this
* file at the same time, but we wouldn't remove that lock anyway.
*/
+ ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
if (!ctx || list_empty(&ctx->flc_posix))
return;
@@ -2389,7 +2417,7 @@ EXPORT_SYMBOL(locks_remove_posix);
/* The i_flctx must be valid when calling into here */
static void
-locks_remove_flock(struct file *filp)
+locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
{
struct file_lock fl = {
.fl_owner = filp,
@@ -2400,7 +2428,6 @@ locks_remove_flock(struct file *filp)
.fl_end = OFFSET_MAX,
};
struct inode *inode = file_inode(filp);
- struct file_lock_context *flctx = inode->i_flctx;
if (list_empty(&flctx->flc_flock))
return;
@@ -2416,10 +2443,8 @@ locks_remove_flock(struct file *filp)
/* The i_flctx must be valid when calling into here */
static void
-locks_remove_lease(struct file *filp)
+locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
{
- struct inode *inode = file_inode(filp);
- struct file_lock_context *ctx = inode->i_flctx;
struct file_lock *fl, *tmp;
LIST_HEAD(dispose);
@@ -2439,17 +2464,20 @@ locks_remove_lease(struct file *filp)
*/
void locks_remove_file(struct file *filp)
{
- if (!file_inode(filp)->i_flctx)
+ struct file_lock_context *ctx;
+
+ ctx = smp_load_acquire(&file_inode(filp)->i_flctx);
+ if (!ctx)
return;
/* remove any OFD locks */
locks_remove_posix(filp, filp);
/* remove flock locks */
- locks_remove_flock(filp);
+ locks_remove_flock(filp, ctx);
/* remove any leases */
- locks_remove_lease(filp);
+ locks_remove_lease(filp, ctx);
}
/**
@@ -2616,7 +2644,7 @@ void show_fd_locks(struct seq_file *f,
struct file_lock_context *ctx;
int id = 0;
- ctx = inode->i_flctx;
+ ctx = smp_load_acquire(&inode->i_flctx);
if (!ctx)
return;
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
index a7fdbd868474..a709d80c8ebc 100644
--- a/fs/logfs/dev_bdev.c
+++ b/fs/logfs/dev_bdev.c
@@ -81,7 +81,7 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
unsigned int max_pages;
int i;
- max_pages = min(nr_pages, BIO_MAX_PAGES);
+ max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES);
bio = bio_alloc(GFP_NOFS, max_pages);
BUG_ON(!bio);
@@ -171,7 +171,7 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
unsigned int max_pages;
int i;
- max_pages = min(nr_pages, BIO_MAX_PAGES);
+ max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES);
bio = bio_alloc(GFP_NOFS, max_pages);
BUG_ON(!bio);
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 7f9b096d8d57..6de0fbfc6c00 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -57,7 +57,7 @@ static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
filler_t *filler = super->s_devops->readpage;
struct page *page;
- BUG_ON(mapping_gfp_mask(mapping) & __GFP_FS);
+ BUG_ON(mapping_gfp_constraint(mapping, __GFP_FS));
if (use_filler)
page = read_cache_page(mapping, index, filler, sb);
else {
diff --git a/fs/mpage.c b/fs/mpage.c
index 778a4ddef77a..1480d3a18037 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -139,7 +139,8 @@ map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block)
static struct bio *
do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
sector_t *last_block_in_bio, struct buffer_head *map_bh,
- unsigned long *first_logical_block, get_block_t get_block)
+ unsigned long *first_logical_block, get_block_t get_block,
+ gfp_t gfp)
{
struct inode *inode = page->mapping->host;
const unsigned blkbits = inode->i_blkbits;
@@ -277,8 +278,7 @@ alloc_new:
goto out;
}
bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
- min_t(int, nr_pages, BIO_MAX_PAGES),
- GFP_KERNEL);
+ min_t(int, nr_pages, BIO_MAX_PAGES), gfp);
if (bio == NULL)
goto confused;
}
@@ -361,6 +361,7 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
sector_t last_block_in_bio = 0;
struct buffer_head map_bh;
unsigned long first_logical_block = 0;
+ gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
map_bh.b_state = 0;
map_bh.b_size = 0;
@@ -370,12 +371,13 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
prefetchw(&page->flags);
list_del(&page->lru);
if (!add_to_page_cache_lru(page, mapping,
- page->index, GFP_KERNEL)) {
+ page->index,
+ gfp)) {
bio = do_mpage_readpage(bio, page,
nr_pages - page_idx,
&last_block_in_bio, &map_bh,
&first_logical_block,
- get_block);
+ get_block, gfp);
}
page_cache_release(page);
}
@@ -395,11 +397,12 @@ int mpage_readpage(struct page *page, get_block_t get_block)
sector_t last_block_in_bio = 0;
struct buffer_head map_bh;
unsigned long first_logical_block = 0;
+ gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL);
map_bh.b_state = 0;
map_bh.b_size = 0;
bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
- &map_bh, &first_logical_block, get_block);
+ &map_bh, &first_logical_block, get_block, gfp);
if (bio)
mpage_bio_submit(READ, bio);
return 0;
@@ -482,6 +485,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
struct buffer_head map_bh;
loff_t i_size = i_size_read(inode);
int ret = 0;
+ int wr = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
if (page_has_buffers(page)) {
struct buffer_head *head = page_buffers(page);
@@ -590,7 +594,7 @@ page_is_mapped:
* This page will go to BIO. Do we need to send this BIO off first?
*/
if (bio && mpd->last_block_in_bio != blocks[0] - 1)
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
alloc_new:
if (bio == NULL) {
@@ -617,7 +621,7 @@ alloc_new:
wbc_account_io(wbc, page, PAGE_SIZE);
length = first_unmapped << blkbits;
if (bio_add_page(bio, page, length, 0) < length) {
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
goto alloc_new;
}
@@ -627,7 +631,7 @@ alloc_new:
set_page_writeback(page);
unlock_page(page);
if (boundary || (first_unmapped != blocks_per_page)) {
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
if (boundary_block) {
write_boundary_block(boundary_bdev,
boundary_block, 1 << blkbits);
@@ -639,7 +643,7 @@ alloc_new:
confused:
if (bio)
- bio = mpage_bio_submit(WRITE, bio);
+ bio = mpage_bio_submit(wr, bio);
if (mpd->use_writepage) {
ret = mapping->a_ops->writepage(page, wbc);
@@ -695,8 +699,11 @@ mpage_writepages(struct address_space *mapping,
};
ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd);
- if (mpd.bio)
- mpage_bio_submit(WRITE, mpd.bio);
+ if (mpd.bio) {
+ int wr = (wbc->sync_mode == WB_SYNC_ALL ?
+ WRITE_SYNC : WRITE);
+ mpage_bio_submit(wr, mpd.bio);
+ }
}
blk_finish_plug(&plug);
return ret;
@@ -713,8 +720,11 @@ int mpage_writepage(struct page *page, get_block_t get_block,
.use_writepage = 0,
};
int ret = __mpage_writepage(page, wbc, &mpd);
- if (mpd.bio)
- mpage_bio_submit(WRITE, mpd.bio);
+ if (mpd.bio) {
+ int wr = (wbc->sync_mode == WB_SYNC_ALL ?
+ WRITE_SYNC : WRITE);
+ mpage_bio_submit(wr, mpd.bio);
+ }
return ret;
}
EXPORT_SYMBOL(mpage_writepage);
diff --git a/fs/namei.c b/fs/namei.c
index 726d211db484..d84d7c7515fc 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -955,26 +955,23 @@ static bool safe_hardlink_source(struct inode *inode)
* - sysctl_protected_hardlinks enabled
* - fsuid does not match inode
* - hardlink source is unsafe (see safe_hardlink_source() above)
- * - not CAP_FOWNER
+ * - not CAP_FOWNER in a namespace with the inode owner uid mapped
*
* Returns 0 if successful, -ve on error.
*/
static int may_linkat(struct path *link)
{
- const struct cred *cred;
struct inode *inode;
if (!sysctl_protected_hardlinks)
return 0;
- cred = current_cred();
inode = link->dentry->d_inode;
/* Source inode owner (or CAP_FOWNER) can hardlink all they like,
* otherwise, it must be a safe source.
*/
- if (uid_eq(cred->fsuid, inode->i_uid) || safe_hardlink_source(inode) ||
- capable(CAP_FOWNER))
+ if (inode_owner_or_capable(inode) || safe_hardlink_source(inode))
return 0;
audit_log_link_denied("linkat", link);
@@ -1558,8 +1555,6 @@ static int lookup_fast(struct nameidata *nd,
negative = d_is_negative(dentry);
if (read_seqcount_retry(&dentry->d_seq, seq))
return -ECHILD;
- if (negative)
- return -ENOENT;
/*
* This sequence count validates that the parent had no
@@ -1580,6 +1575,12 @@ static int lookup_fast(struct nameidata *nd,
goto unlazy;
}
}
+ /*
+ * Note: do negative dentry check after revalidation in
+ * case that drops it.
+ */
+ if (negative)
+ return -ENOENT;
path->mnt = mnt;
path->dentry = dentry;
if (likely(__follow_mount_rcu(nd, path, inode, seqp)))
@@ -1965,7 +1966,7 @@ OK:
if (err) {
const char *s = get_link(nd);
- if (unlikely(IS_ERR(s)))
+ if (IS_ERR(s))
return PTR_ERR(s);
err = 0;
if (unlikely(!s)) {
@@ -3379,7 +3380,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
return ERR_PTR(-ELOOP);
filename = getname_kernel(name);
- if (unlikely(IS_ERR(filename)))
+ if (IS_ERR(filename))
return ERR_CAST(filename);
set_nameidata(&nd, -1, filename);
@@ -4603,7 +4604,7 @@ EXPORT_SYMBOL(__page_symlink);
int page_symlink(struct inode *inode, const char *symname, int len)
{
return __page_symlink(inode, symname, len,
- !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
+ !mapping_gfp_constraint(inode->i_mapping, __GFP_FS));
}
EXPORT_SYMBOL(page_symlink);
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 93575e91a7aa..f0e3e9e747dd 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -597,7 +597,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
qname.name = __name;
newdent = d_hash_and_lookup(dentry, &qname);
- if (unlikely(IS_ERR(newdent)))
+ if (IS_ERR(newdent))
goto end_advance;
if (!newdent) {
newdent = d_alloc(dentry, &qname);
@@ -1165,8 +1165,6 @@ out:
static int ncp_mknod(struct inode * dir, struct dentry *dentry,
umode_t mode, dev_t rdev)
{
- if (!new_valid_dev(rdev))
- return -EINVAL;
if (ncp_is_nfs_extras(NCP_SERVER(dir), NCP_FINFO(dir)->volNumber)) {
ncp_dbg(1, "mode = 0%ho\n", mode);
return ncp_create_new(dir, dentry, mode, rdev, 0);
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 9cd4eb3a1e22..ddd0138f410c 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -229,7 +229,7 @@ bl_read_pagelist(struct nfs_pgio_header *header)
struct parallel_io *par;
loff_t f_offset = header->args.offset;
size_t bytes_left = header->args.count;
- unsigned int pg_offset, pg_len;
+ unsigned int pg_offset = header->args.pgbase, pg_len;
struct page **pages = header->args.pages;
int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
const bool is_dio = (header->dreq != NULL);
@@ -262,7 +262,6 @@ bl_read_pagelist(struct nfs_pgio_header *header)
extent_length = be.be_length - (isect - be.be_f_offset);
}
- pg_offset = f_offset & ~PAGE_CACHE_MASK;
if (is_dio) {
if (pg_offset + bytes_left > PAGE_CACHE_SIZE)
pg_len = PAGE_CACHE_SIZE - pg_offset;
@@ -273,9 +272,6 @@ bl_read_pagelist(struct nfs_pgio_header *header)
pg_len = PAGE_CACHE_SIZE;
}
- isect += (pg_offset >> SECTOR_SHIFT);
- extent_length -= (pg_offset >> SECTOR_SHIFT);
-
if (is_hole(&be)) {
bio = bl_submit_bio(READ, bio);
/* Fill hole w/ zeroes w/o accessing device */
@@ -301,6 +297,7 @@ bl_read_pagelist(struct nfs_pgio_header *header)
extent_length -= (pg_len >> SECTOR_SHIFT);
f_offset += pg_len;
bytes_left -= pg_len;
+ pg_offset = 0;
}
if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
header->res.eof = 1;
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 75f7c0a7538a..a7f2e6e33305 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -99,17 +99,6 @@ nfs4_callback_up(struct svc_serv *serv)
}
#if defined(CONFIG_NFS_V4_1)
-static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net)
-{
- /*
- * Create an svc_sock for the back channel service that shares the
- * fore channel connection.
- * Returns the input port (0) and sets the svc_serv bc_xprt on success
- */
- return svc_create_xprt(serv, "tcp-bc", net, PF_INET, 0,
- SVC_SOCK_ANONYMOUS);
-}
-
/*
* The callback service for NFSv4.1 callbacks
*/
@@ -184,11 +173,6 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
xprt->bc_serv = serv;
}
#else
-static int nfs41_callback_up_net(struct svc_serv *serv, struct net *net)
-{
- return 0;
-}
-
static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv,
struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
{
@@ -259,7 +243,8 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc
svc_shutdown_net(serv, net);
}
-static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct net *net)
+static int nfs_callback_up_net(int minorversion, struct svc_serv *serv,
+ struct net *net, struct rpc_xprt *xprt)
{
struct nfs_net *nn = net_generic(net, nfs_net_id);
int ret;
@@ -275,20 +260,11 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, struct n
goto err_bind;
}
- switch (minorversion) {
- case 0:
- ret = nfs4_callback_up_net(serv, net);
- break;
- case 1:
- case 2:
- ret = nfs41_callback_up_net(serv, net);
- break;
- default:
- printk(KERN_ERR "NFS: unknown callback version: %d\n",
- minorversion);
- ret = -EINVAL;
- break;
- }
+ ret = -EPROTONOSUPPORT;
+ if (minorversion == 0)
+ ret = nfs4_callback_up_net(serv, net);
+ else if (xprt->ops->bc_up)
+ ret = xprt->ops->bc_up(serv, net);
if (ret < 0) {
printk(KERN_ERR "NFS: callback service start failed\n");
@@ -364,7 +340,7 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
goto err_create;
}
- ret = nfs_callback_up_net(minorversion, serv, net);
+ ret = nfs_callback_up_net(minorversion, serv, net, xprt);
if (ret < 0)
goto err_net;
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 84326e9fb47a..ff8195bd75ea 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -61,7 +61,6 @@ struct cb_compound_hdr_res {
};
struct cb_getattrargs {
- struct sockaddr *addr;
struct nfs_fh fh;
uint32_t bitmap[2];
};
@@ -76,7 +75,6 @@ struct cb_getattrres {
};
struct cb_recallargs {
- struct sockaddr *addr;
struct nfs_fh fh;
nfs4_stateid stateid;
uint32_t truncate;
@@ -119,9 +117,6 @@ extern __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
struct cb_sequenceres *res,
struct cb_process_state *cps);
-extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,
- const nfs4_stateid *stateid);
-
#define RCA4_TYPE_MASK_RDATA_DLG 0
#define RCA4_TYPE_MASK_WDATA_DLG 1
#define RCA4_TYPE_MASK_DIR_DLG 2
@@ -134,7 +129,6 @@ extern int nfs41_validate_delegation_stateid(struct nfs_delegation *delegation,
#define RCA4_TYPE_MASK_ALL 0xf31f
struct cb_recallanyargs {
- struct sockaddr *craa_addr;
uint32_t craa_objs_to_keep;
uint32_t craa_type_mask;
};
@@ -144,7 +138,6 @@ extern __be32 nfs4_callback_recallany(struct cb_recallanyargs *args,
struct cb_process_state *cps);
struct cb_recallslotargs {
- struct sockaddr *crsa_addr;
uint32_t crsa_target_highest_slotid;
};
extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args,
@@ -152,7 +145,6 @@ extern __be32 nfs4_callback_recallslot(struct cb_recallslotargs *args,
struct cb_process_state *cps);
struct cb_layoutrecallargs {
- struct sockaddr *cbl_addr;
uint32_t cbl_recall_type;
uint32_t cbl_layout_type;
uint32_t cbl_layoutchanged;
@@ -196,9 +188,6 @@ extern __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
#if IS_ENABLED(CONFIG_NFS_V4)
extern int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt);
extern void nfs_callback_down(int minorversion, struct net *net);
-extern int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation,
- const nfs4_stateid *stateid);
-extern int nfs4_set_callback_sessionid(struct nfs_client *clp);
#endif /* CONFIG_NFS_V4 */
/*
* nfs41: Callbacks are expected to not cause substantial latency,
@@ -209,6 +198,5 @@ extern int nfs4_set_callback_sessionid(struct nfs_client *clp);
#define NFS41_BC_MAX_CALLBACKS 1
extern unsigned int nfs_callback_set_tcpport;
-extern unsigned short nfs_callback_tcpport;
#endif /* __LINUX_FS_NFS_CALLBACK_H */
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index b85cf7a30232..807eb6ef4f91 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -17,9 +17,7 @@
#include "nfs4session.h"
#include "nfs4trace.h"
-#ifdef NFS_DEBUG
#define NFSDBG_FACILITY NFSDBG_CALLBACK
-#endif
__be32 nfs4_callback_getattr(struct cb_getattrargs *args,
struct cb_getattrres *res,
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 6b1697a01dde..646cdac73488 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -18,19 +18,21 @@
#include "internal.h"
#include "nfs4session.h"
-#define CB_OP_TAGLEN_MAXSZ (512)
-#define CB_OP_HDR_RES_MAXSZ (2 + CB_OP_TAGLEN_MAXSZ)
-#define CB_OP_GETATTR_BITMAP_MAXSZ (4)
-#define CB_OP_GETATTR_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
- CB_OP_GETATTR_BITMAP_MAXSZ + \
- 2 + 2 + 3 + 3)
-#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_TAGLEN_MAXSZ (512)
+#define CB_OP_HDR_RES_MAXSZ (2 * 4) // opcode, status
+#define CB_OP_GETATTR_BITMAP_MAXSZ (4 * 4) // bitmap length, 3 bitmaps
+#define CB_OP_GETATTR_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
+ CB_OP_GETATTR_BITMAP_MAXSZ + \
+ /* change, size, ctime, mtime */\
+ (2 + 2 + 3 + 3) * 4)
+#define CB_OP_RECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#if defined(CONFIG_NFS_V4_1)
#define CB_OP_LAYOUTRECALL_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#define CB_OP_DEVICENOTIFY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#define CB_OP_SEQUENCE_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ + \
- 4 + 1 + 3)
+ NFS4_MAX_SESSIONID_LEN + \
+ (1 + 3) * 4) // seqid, 3 slotids
#define CB_OP_RECALLANY_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#define CB_OP_RECALLSLOT_RES_MAXSZ (CB_OP_HDR_RES_MAXSZ)
#endif /* CONFIG_NFS_V4_1 */
@@ -157,7 +159,7 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound
if (unlikely(status != 0))
return status;
/* We do not like overly long tags! */
- if (hdr->taglen > CB_OP_TAGLEN_MAXSZ - 12) {
+ if (hdr->taglen > CB_OP_TAGLEN_MAXSZ) {
printk("NFS: NFSv4 CALLBACK %s: client sent tag of length %u\n",
__func__, hdr->taglen);
return htonl(NFS4ERR_RESOURCE);
@@ -198,7 +200,6 @@ static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr
status = decode_fh(xdr, &args->fh);
if (unlikely(status != 0))
goto out;
- args->addr = svc_addr(rqstp);
status = decode_bitmap(xdr, args->bitmap);
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
@@ -210,7 +211,6 @@ static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr,
__be32 *p;
__be32 status;
- args->addr = svc_addr(rqstp);
status = decode_stateid(xdr, &args->stateid);
if (unlikely(status != 0))
goto out;
@@ -236,7 +236,6 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp,
__be32 status = 0;
uint32_t iomode;
- args->cbl_addr = svc_addr(rqstp);
p = read_buf(xdr, 4 * sizeof(uint32_t));
if (unlikely(p == NULL)) {
status = htonl(NFS4ERR_BADXDR);
@@ -383,13 +382,12 @@ static __be32 decode_sessionid(struct xdr_stream *xdr,
struct nfs4_sessionid *sid)
{
__be32 *p;
- int len = NFS4_MAX_SESSIONID_LEN;
- p = read_buf(xdr, len);
+ p = read_buf(xdr, NFS4_MAX_SESSIONID_LEN);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
- memcpy(sid->data, p, len);
+ memcpy(sid->data, p, NFS4_MAX_SESSIONID_LEN);
return 0;
}
@@ -500,7 +498,6 @@ static __be32 decode_recallany_args(struct svc_rqst *rqstp,
uint32_t bitmap[2];
__be32 *p, status;
- args->craa_addr = svc_addr(rqstp);
p = read_buf(xdr, 4);
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
@@ -519,7 +516,6 @@ static __be32 decode_recallslot_args(struct svc_rqst *rqstp,
{
__be32 *p;
- args->crsa_addr = svc_addr(rqstp);
p = read_buf(xdr, 4);
if (unlikely(p == NULL))
return htonl(NFS4ERR_BADXDR);
@@ -684,13 +680,12 @@ static __be32 encode_sessionid(struct xdr_stream *xdr,
const struct nfs4_sessionid *sid)
{
__be32 *p;
- int len = NFS4_MAX_SESSIONID_LEN;
- p = xdr_reserve_space(xdr, len);
+ p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN);
if (unlikely(p == NULL))
return htonl(NFS4ERR_RESOURCE);
- memcpy(p, sid, len);
+ memcpy(p, sid, NFS4_MAX_SESSIONID_LEN);
return 0;
}
@@ -704,7 +699,9 @@ static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp,
if (unlikely(status != 0))
goto out;
- encode_sessionid(xdr, &res->csr_sessionid);
+ status = encode_sessionid(xdr, &res->csr_sessionid);
+ if (status)
+ goto out;
p = xdr_reserve_space(xdr, 4 * sizeof(uint32_t));
if (unlikely(p == NULL))
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 57c5a02f6213..d6d5d2a48e83 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -764,6 +764,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server,
server->time_delta = fsinfo->time_delta;
+ server->clone_blksize = fsinfo->clone_blksize;
/* We're airborne Set socket buffersize */
rpc_setbufsize(server->client, server->wsize + 100, server->rsize + 100);
}
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 2714ef835bdd..5166adcfc0fb 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -113,7 +113,8 @@ out:
return status;
}
-static int nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *stateid)
+static int nfs_delegation_claim_opens(struct inode *inode,
+ const nfs4_stateid *stateid, fmode_t type)
{
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_open_context *ctx;
@@ -140,7 +141,7 @@ again:
/* Block nfs4_proc_unlck */
mutex_lock(&sp->so_delegreturn_mutex);
seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
- err = nfs4_open_delegation_recall(ctx, state, stateid);
+ err = nfs4_open_delegation_recall(ctx, state, stateid, type);
if (!err)
err = nfs_delegation_claim_locks(ctx, state, stateid);
if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
@@ -411,7 +412,8 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation
do {
if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
break;
- err = nfs_delegation_claim_opens(inode, &delegation->stateid);
+ err = nfs_delegation_claim_opens(inode, &delegation->stateid,
+ delegation->type);
if (!issync || err != -EAGAIN)
break;
/*
@@ -719,14 +721,12 @@ int nfs_async_inode_return_delegation(struct inode *inode,
struct nfs_client *clp = server->nfs_client;
struct nfs_delegation *delegation;
- filemap_flush(inode->i_mapping);
-
rcu_read_lock();
delegation = rcu_dereference(NFS_I(inode)->delegation);
if (delegation == NULL)
goto out_enoent;
-
- if (!clp->cl_mvops->match_stateid(&delegation->stateid, stateid))
+ if (stateid != NULL &&
+ !clp->cl_mvops->match_stateid(&delegation->stateid, stateid))
goto out_enoent;
nfs_mark_return_delegation(server, delegation);
rcu_read_unlock();
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index a44829173e57..333063e032f0 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -54,7 +54,7 @@ void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
/* NFSv4 delegation-related procedures */
int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync);
-int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid);
+int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type);
int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid);
bool nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode, fmode_t flags);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 3d8e4ffa0a33..ce5a21861074 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1714,9 +1714,6 @@ nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
dfprintk(VFS, "NFS: mknod(%s/%lu), %pd\n",
dir->i_sb->s_id, dir->i_ino, dentry);
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
attr.ia_mode = mode;
attr.ia_valid = ATTR_MODE;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 38678d9a5cc4..4b1d08f56aba 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -166,8 +166,11 @@ nfs_direct_select_verf(struct nfs_direct_req *dreq,
struct nfs_writeverf *verfp = &dreq->verf;
#ifdef CONFIG_NFS_V4_1
- if (ds_clp) {
- /* pNFS is in use, use the DS verf */
+ /*
+ * pNFS is in use, use the DS verf except commit_through_mds is set
+ * for layout segment where nbuckets is zero.
+ */
+ if (ds_clp && dreq->ds_cinfo.nbuckets > 0) {
if (commit_idx >= 0 && commit_idx < dreq->ds_cinfo.nbuckets)
verfp = &dreq->ds_cinfo.buckets[commit_idx].direct_verf;
else
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index c0f9b1ed12b9..93e236429c5d 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -473,8 +473,8 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page);
/* Always try to initiate a 'commit' if relevant, but only
- * wait for it if __GFP_WAIT is set. Even then, only wait 1
- * second and only if the 'bdi' is not congested.
+ * wait for it if the caller allows blocking. Even then,
+ * only wait 1 second and only if the 'bdi' is not congested.
* Waiting indefinitely can cause deadlocks when the NFS
* server is on this machine, when a new TCP connection is
* needed and in other rare cases. There is no particular
@@ -484,7 +484,7 @@ static int nfs_release_page(struct page *page, gfp_t gfp)
if (mapping) {
struct nfs_server *nfss = NFS_SERVER(mapping->host);
nfs_commit_inode(mapping->host, 0);
- if ((gfp & __GFP_WAIT) &&
+ if (gfpflags_allow_blocking(gfp) &&
!bdi_write_congested(&nfss->backing_dev_info)) {
wait_on_page_bit_killable_timeout(page, PG_private,
HZ);
@@ -738,18 +738,7 @@ out_noconflict:
static int do_vfs_lock(struct file *file, struct file_lock *fl)
{
- int res = 0;
- switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
- case FL_POSIX:
- res = posix_lock_file_wait(file, fl);
- break;
- case FL_FLOCK:
- res = flock_lock_file_wait(file, fl);
- break;
- default:
- BUG();
- }
- return res;
+ return locks_lock_file_wait(file, fl);
}
static int
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index b34f2e228601..02ec07973bc4 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -629,23 +629,18 @@ out_put:
goto out;
}
-static void filelayout_free_fh_array(struct nfs4_filelayout_segment *fl)
+static void _filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
{
int i;
- for (i = 0; i < fl->num_fh; i++) {
- if (!fl->fh_array[i])
- break;
- kfree(fl->fh_array[i]);
+ if (fl->fh_array) {
+ for (i = 0; i < fl->num_fh; i++) {
+ if (!fl->fh_array[i])
+ break;
+ kfree(fl->fh_array[i]);
+ }
+ kfree(fl->fh_array);
}
- kfree(fl->fh_array);
- fl->fh_array = NULL;
-}
-
-static void
-_filelayout_free_lseg(struct nfs4_filelayout_segment *fl)
-{
- filelayout_free_fh_array(fl);
kfree(fl);
}
@@ -716,21 +711,21 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
/* Do we want to use a mempool here? */
fl->fh_array[i] = kmalloc(sizeof(struct nfs_fh), gfp_flags);
if (!fl->fh_array[i])
- goto out_err_free;
+ goto out_err;
p = xdr_inline_decode(&stream, 4);
if (unlikely(!p))
- goto out_err_free;
+ goto out_err;
fl->fh_array[i]->size = be32_to_cpup(p++);
if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) {
printk(KERN_ERR "NFS: Too big fh %d received %d\n",
i, fl->fh_array[i]->size);
- goto out_err_free;
+ goto out_err;
}
p = xdr_inline_decode(&stream, fl->fh_array[i]->size);
if (unlikely(!p))
- goto out_err_free;
+ goto out_err;
memcpy(fl->fh_array[i]->data, p, fl->fh_array[i]->size);
dprintk("DEBUG: %s: fh len %d\n", __func__,
fl->fh_array[i]->size);
@@ -739,8 +734,6 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo,
__free_page(scratch);
return 0;
-out_err_free:
- filelayout_free_fh_array(fl);
out_err:
__free_page(scratch);
return -EIO;
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index fbc5a56de875..03516c80855a 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -339,6 +339,19 @@ static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
}
}
+static void ff_layout_mark_devices_valid(struct nfs4_ff_layout_segment *fls)
+{
+ struct nfs4_deviceid_node *node;
+ int i;
+
+ if (!(fls->flags & FF_FLAGS_NO_IO_THRU_MDS))
+ return;
+ for (i = 0; i < fls->mirror_array_cnt; i++) {
+ node = &fls->mirror_array[i]->mirror_ds->id_node;
+ clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags);
+ }
+}
+
static struct pnfs_layout_segment *
ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
struct nfs4_layoutget_res *lgr,
@@ -499,6 +512,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
rc = ff_layout_check_layout(lgr);
if (rc)
goto out_err_free;
+ ff_layout_mark_devices_valid(fls);
ret = &fls->generic_hdr;
dprintk("<-- %s (success)\n", __func__);
@@ -741,17 +755,17 @@ ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
}
static struct nfs4_pnfs_ds *
-ff_layout_choose_best_ds_for_read(struct nfs_pageio_descriptor *pgio,
+ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg,
+ int start_idx,
int *best_idx)
{
- struct nfs4_ff_layout_segment *fls;
+ struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
struct nfs4_pnfs_ds *ds;
int idx;
- fls = FF_LAYOUT_LSEG(pgio->pg_lseg);
/* mirrors are sorted by efficiency */
- for (idx = 0; idx < fls->mirror_array_cnt; idx++) {
- ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, idx, false);
+ for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) {
+ ds = nfs4_ff_layout_prepare_ds(lseg, idx, false);
if (ds) {
*best_idx = idx;
return ds;
@@ -782,7 +796,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
if (pgio->pg_lseg == NULL)
goto out_mds;
- ds = ff_layout_choose_best_ds_for_read(pgio, &ds_idx);
+ ds = ff_layout_choose_best_ds_for_read(pgio->pg_lseg, 0, &ds_idx);
if (!ds)
goto out_mds;
mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
@@ -1035,7 +1049,8 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
rpc_wake_up(&tbl->slot_tbl_waitq);
/* fall through */
default:
- if (ff_layout_has_available_ds(lseg))
+ if (ff_layout_no_fallback_to_mds(lseg) ||
+ ff_layout_has_available_ds(lseg))
return -NFS4ERR_RESET_TO_PNFS;
reset:
dprintk("%s Retry through MDS. Error %d\n", __func__,
@@ -1153,7 +1168,6 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
}
/* NFS_PROTO call done callback routines */
-
static int ff_layout_read_done_cb(struct rpc_task *task,
struct nfs_pgio_header *hdr)
{
@@ -1171,6 +1185,10 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
switch (err) {
case -NFS4ERR_RESET_TO_PNFS:
+ if (ff_layout_choose_best_ds_for_read(hdr->lseg,
+ hdr->pgio_mirror_idx + 1,
+ &hdr->pgio_mirror_idx))
+ goto out_eagain;
set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
&hdr->lseg->pls_layout->plh_flags);
pnfs_read_resend_pnfs(hdr);
@@ -1179,11 +1197,13 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
ff_layout_reset_read(hdr);
return task->tk_status;
case -EAGAIN:
- rpc_restart_call_prepare(task);
- return -EAGAIN;
+ goto out_eagain;
}
return 0;
+out_eagain:
+ rpc_restart_call_prepare(task);
+ return -EAGAIN;
}
static bool
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 68cc0d9828f9..2bb08bc6aaf0 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -10,6 +10,7 @@
#define FS_NFS_NFS4FLEXFILELAYOUT_H
#define FF_FLAGS_NO_LAYOUTCOMMIT 1
+#define FF_FLAGS_NO_IO_THRU_MDS 2
#include "../pnfs.h"
@@ -146,6 +147,12 @@ FF_LAYOUT_MIRROR_COUNT(struct pnfs_layout_segment *lseg)
}
static inline bool
+ff_layout_no_fallback_to_mds(struct pnfs_layout_segment *lseg)
+{
+ return FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_IO_THRU_MDS;
+}
+
+static inline bool
ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node)
{
return nfs4_test_deviceid_unavailable(node);
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 99a45283b9ee..09b190015df4 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -16,9 +16,7 @@
#include <linux/nfs_fs.h>
#include "internal.h"
-#ifdef NFS_DEBUG
-# define NFSDBG_FACILITY NFSDBG_MOUNT
-#endif
+#define NFSDBG_FACILITY NFSDBG_MOUNT
/*
* Defined by RFC 1094, section A.3; and RFC 1813, section 5.1.4
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index 814c1255f1d2..b587ccd31083 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -17,5 +17,6 @@ int nfs42_proc_deallocate(struct file *, loff_t, loff_t);
loff_t nfs42_proc_llseek(struct file *, loff_t, int);
int nfs42_proc_layoutstats_generic(struct nfs_server *,
struct nfs42_layoutstat_data *);
+int nfs42_proc_clone(struct file *, struct file *, loff_t, loff_t, loff_t);
#endif /* __LINUX_FS_NFS_NFS4_2_H */
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index d731bbf974aa..3e92a3cde15d 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -175,10 +175,12 @@ loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
{
struct nfs_server *server = NFS_SERVER(file_inode(filep));
struct nfs4_exception exception = { };
- int err;
+ loff_t err;
do {
err = _nfs42_proc_llseek(filep, offset, whence);
+ if (err >= 0)
+ break;
if (err == -ENOTSUPP)
return -EOPNOTSUPP;
err = nfs4_handle_exception(server, err, &exception);
@@ -269,3 +271,74 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server,
return PTR_ERR(task);
return 0;
}
+
+static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
+ struct file *dst_f, loff_t src_offset,
+ loff_t dst_offset, loff_t count)
+{
+ struct inode *src_inode = file_inode(src_f);
+ struct inode *dst_inode = file_inode(dst_f);
+ struct nfs_server *server = NFS_SERVER(dst_inode);
+ struct nfs42_clone_args args = {
+ .src_fh = NFS_FH(src_inode),
+ .dst_fh = NFS_FH(dst_inode),
+ .src_offset = src_offset,
+ .dst_offset = dst_offset,
+ .dst_bitmask = server->cache_consistency_bitmask,
+ };
+ struct nfs42_clone_res res = {
+ .server = server,
+ };
+ int status;
+
+ msg->rpc_argp = &args;
+ msg->rpc_resp = &res;
+
+ status = nfs42_set_rw_stateid(&args.src_stateid, src_f, FMODE_READ);
+ if (status)
+ return status;
+
+ status = nfs42_set_rw_stateid(&args.dst_stateid, dst_f, FMODE_WRITE);
+ if (status)
+ return status;
+
+ res.dst_fattr = nfs_alloc_fattr();
+ if (!res.dst_fattr)
+ return -ENOMEM;
+
+ status = nfs4_call_sync(server->client, server, msg,
+ &args.seq_args, &res.seq_res, 0);
+ if (status == 0)
+ status = nfs_post_op_update_inode(dst_inode, res.dst_fattr);
+
+ kfree(res.dst_fattr);
+ return status;
+}
+
+int nfs42_proc_clone(struct file *src_f, struct file *dst_f,
+ loff_t src_offset, loff_t dst_offset, loff_t count)
+{
+ struct rpc_message msg = {
+ .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLONE],
+ };
+ struct inode *inode = file_inode(src_f);
+ struct nfs_server *server = NFS_SERVER(file_inode(src_f));
+ struct nfs4_exception exception = { };
+ int err;
+
+ if (!nfs_server_capable(inode, NFS_CAP_CLONE))
+ return -EOPNOTSUPP;
+
+ do {
+ err = _nfs42_proc_clone(&msg, src_f, dst_f, src_offset,
+ dst_offset, count);
+ if (err == -ENOTSUPP || err == -EOPNOTSUPP) {
+ NFS_SERVER(inode)->caps &= ~NFS_CAP_CLONE;
+ return -EOPNOTSUPP;
+ }
+ err = nfs4_handle_exception(server, err, &exception);
+ } while (exception.retry);
+
+ return err;
+
+}
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 0eb29e14070d..0ca482a51e53 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -34,6 +34,12 @@
1 /* opaque devaddr4 length */ + \
XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE))
#define decode_layoutstats_maxsz (op_decode_hdr_maxsz)
+#define encode_clone_maxsz (encode_stateid_maxsz + \
+ encode_stateid_maxsz + \
+ 2 /* src offset */ + \
+ 2 /* dst offset */ + \
+ 2 /* count */)
+#define decode_clone_maxsz (op_decode_hdr_maxsz)
#define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \
encode_putfh_maxsz + \
@@ -65,7 +71,20 @@
decode_sequence_maxsz + \
decode_putfh_maxsz + \
PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz)
-
+#define NFS4_enc_clone_sz (compound_encode_hdr_maxsz + \
+ encode_sequence_maxsz + \
+ encode_putfh_maxsz + \
+ encode_savefh_maxsz + \
+ encode_putfh_maxsz + \
+ encode_clone_maxsz + \
+ encode_getattr_maxsz)
+#define NFS4_dec_clone_sz (compound_decode_hdr_maxsz + \
+ decode_sequence_maxsz + \
+ decode_putfh_maxsz + \
+ decode_savefh_maxsz + \
+ decode_putfh_maxsz + \
+ decode_clone_maxsz + \
+ decode_getattr_maxsz)
static void encode_fallocate(struct xdr_stream *xdr,
struct nfs42_falloc_args *args)
@@ -128,6 +147,21 @@ static void encode_layoutstats(struct xdr_stream *xdr,
encode_uint32(xdr, 0);
}
+static void encode_clone(struct xdr_stream *xdr,
+ struct nfs42_clone_args *args,
+ struct compound_hdr *hdr)
+{
+ __be32 *p;
+
+ encode_op_hdr(xdr, OP_CLONE, decode_clone_maxsz, hdr);
+ encode_nfs4_stateid(xdr, &args->src_stateid);
+ encode_nfs4_stateid(xdr, &args->dst_stateid);
+ p = reserve_space(xdr, 3*8);
+ p = xdr_encode_hyper(p, args->src_offset);
+ p = xdr_encode_hyper(p, args->dst_offset);
+ xdr_encode_hyper(p, args->count);
+}
+
/*
* Encode ALLOCATE request
*/
@@ -206,6 +240,27 @@ static void nfs4_xdr_enc_layoutstats(struct rpc_rqst *req,
encode_nops(&hdr);
}
+/*
+ * Encode CLONE request
+ */
+static void nfs4_xdr_enc_clone(struct rpc_rqst *req,
+ struct xdr_stream *xdr,
+ struct nfs42_clone_args *args)
+{
+ struct compound_hdr hdr = {
+ .minorversion = nfs4_xdr_minorversion(&args->seq_args),
+ };
+
+ encode_compound_hdr(xdr, req, &hdr);
+ encode_sequence(xdr, &args->seq_args, &hdr);
+ encode_putfh(xdr, args->src_fh, &hdr);
+ encode_savefh(xdr, &hdr);
+ encode_putfh(xdr, args->dst_fh, &hdr);
+ encode_clone(xdr, args, &hdr);
+ encode_getfattr(xdr, args->dst_bitmask, &hdr);
+ encode_nops(&hdr);
+}
+
static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res)
{
return decode_op_hdr(xdr, OP_ALLOCATE);
@@ -243,6 +298,11 @@ static int decode_layoutstats(struct xdr_stream *xdr)
return decode_op_hdr(xdr, OP_LAYOUTSTATS);
}
+static int decode_clone(struct xdr_stream *xdr)
+{
+ return decode_op_hdr(xdr, OP_CLONE);
+}
+
/*
* Decode ALLOCATE request
*/
@@ -351,4 +411,39 @@ out:
return status;
}
+/*
+ * Decode CLONE request
+ */
+static int nfs4_xdr_dec_clone(struct rpc_rqst *rqstp,
+ struct xdr_stream *xdr,
+ struct nfs42_clone_res *res)
+{
+ struct compound_hdr hdr;
+ int status;
+
+ status = decode_compound_hdr(xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_sequence(xdr, &res->seq_res, rqstp);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_savefh(xdr);
+ if (status)
+ goto out;
+ status = decode_putfh(xdr);
+ if (status)
+ goto out;
+ status = decode_clone(xdr);
+ if (status)
+ goto out;
+ status = decode_getfattr(xdr, res->dst_fattr, res->server);
+
+out:
+ res->rpc_status = status;
+ return status;
+}
+
#endif /* __LINUX_FS_NFS_NFS4_2XDR_H */
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 50cfc4ca7a02..4afdee420d25 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -183,10 +183,12 @@ struct nfs4_state {
struct nfs4_exception {
- long timeout;
- int retry;
struct nfs4_state *state;
struct inode *inode;
+ long timeout;
+ unsigned char delay : 1,
+ recovering : 1,
+ retry : 1;
};
struct nfs4_state_recovery_ops {
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index b0dbe0abed53..4aa571956cd6 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -4,6 +4,7 @@
* Copyright (C) 1992 Rick Sladkey
*/
#include <linux/fs.h>
+#include <linux/file.h>
#include <linux/falloc.h>
#include <linux/nfs_fs.h>
#include "delegation.h"
@@ -192,8 +193,138 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t
return nfs42_proc_deallocate(filep, offset, len);
return nfs42_proc_allocate(filep, offset, len);
}
+
+static noinline long
+nfs42_ioctl_clone(struct file *dst_file, unsigned long srcfd,
+ u64 src_off, u64 dst_off, u64 count)
+{
+ struct inode *dst_inode = file_inode(dst_file);
+ struct nfs_server *server = NFS_SERVER(dst_inode);
+ struct fd src_file;
+ struct inode *src_inode;
+ unsigned int bs = server->clone_blksize;
+ int ret;
+
+ /* dst file must be opened for writing */
+ if (!(dst_file->f_mode & FMODE_WRITE))
+ return -EINVAL;
+
+ ret = mnt_want_write_file(dst_file);
+ if (ret)
+ return ret;
+
+ src_file = fdget(srcfd);
+ if (!src_file.file) {
+ ret = -EBADF;
+ goto out_drop_write;
+ }
+
+ src_inode = file_inode(src_file.file);
+
+ /* src and dst must be different files */
+ ret = -EINVAL;
+ if (src_inode == dst_inode)
+ goto out_fput;
+
+ /* src file must be opened for reading */
+ if (!(src_file.file->f_mode & FMODE_READ))
+ goto out_fput;
+
+ /* src and dst must be regular files */
+ ret = -EISDIR;
+ if (!S_ISREG(src_inode->i_mode) || !S_ISREG(dst_inode->i_mode))
+ goto out_fput;
+
+ ret = -EXDEV;
+ if (src_file.file->f_path.mnt != dst_file->f_path.mnt ||
+ src_inode->i_sb != dst_inode->i_sb)
+ goto out_fput;
+
+ /* check alignment w.r.t. clone_blksize */
+ ret = -EINVAL;
+ if (bs) {
+ if (!IS_ALIGNED(src_off, bs) || !IS_ALIGNED(dst_off, bs))
+ goto out_fput;
+ if (!IS_ALIGNED(count, bs) && i_size_read(src_inode) != (src_off + count))
+ goto out_fput;
+ }
+
+ /* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */
+ if (dst_inode < src_inode) {
+ mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD);
+ } else {
+ mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_CHILD);
+ }
+
+ /* flush all pending writes on both src and dst so that server
+ * has the latest data */
+ ret = nfs_sync_inode(src_inode);
+ if (ret)
+ goto out_unlock;
+ ret = nfs_sync_inode(dst_inode);
+ if (ret)
+ goto out_unlock;
+
+ ret = nfs42_proc_clone(src_file.file, dst_file, src_off, dst_off, count);
+
+ /* truncate inode page cache of the dst range so that future reads can fetch
+ * new data from server */
+ if (!ret)
+ truncate_inode_pages_range(&dst_inode->i_data, dst_off, dst_off + count - 1);
+
+out_unlock:
+ if (dst_inode < src_inode) {
+ mutex_unlock(&src_inode->i_mutex);
+ mutex_unlock(&dst_inode->i_mutex);
+ } else {
+ mutex_unlock(&dst_inode->i_mutex);
+ mutex_unlock(&src_inode->i_mutex);
+ }
+out_fput:
+ fdput(src_file);
+out_drop_write:
+ mnt_drop_write_file(dst_file);
+ return ret;
+}
+
+static long nfs42_ioctl_clone_range(struct file *dst_file, void __user *argp)
+{
+ struct nfs_ioctl_clone_range_args args;
+
+ if (copy_from_user(&args, argp, sizeof(args)))
+ return -EFAULT;
+
+ return nfs42_ioctl_clone(dst_file, args.src_fd, args.src_off, args.dst_off, args.count);
+}
+#else
+static long nfs42_ioctl_clone(struct file *dst_file, unsigned long srcfd,
+ u64 src_off, u64 dst_off, u64 count)
+{
+ return -ENOTTY;
+}
+
+static long nfs42_ioctl_clone_range(struct file *dst_file, void __user *argp)
+{
+ return -ENOTTY;
+}
#endif /* CONFIG_NFS_V4_2 */
+long nfs4_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ void __user *argp = (void __user *)arg;
+
+ switch (cmd) {
+ case NFS_IOC_CLONE:
+ return nfs42_ioctl_clone(file, arg, 0, 0, 0);
+ case NFS_IOC_CLONE_RANGE:
+ return nfs42_ioctl_clone_range(file, argp);
+ }
+
+ return -ENOTTY;
+}
+
const struct file_operations nfs4_file_operations = {
#ifdef CONFIG_NFS_V4_2
.llseek = nfs4_file_llseek,
@@ -216,4 +347,9 @@ const struct file_operations nfs4_file_operations = {
#endif /* CONFIG_NFS_V4_2 */
.check_flags = nfs_check_flags,
.setlease = simple_nosetlease,
+#ifdef CONFIG_COMPAT
+ .unlocked_ioctl = nfs4_ioctl,
+#else
+ .compat_ioctl = nfs4_ioctl,
+#endif /* CONFIG_COMPAT */
};
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 2e4902203c35..5ba22c6b0ffa 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -297,7 +297,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
{
const struct cred *saved_cred;
struct key *rkey;
- struct user_key_payload *payload;
+ const struct user_key_payload *payload;
ssize_t ret;
saved_cred = override_creds(id_resolver_cache);
@@ -316,7 +316,7 @@ static ssize_t nfs_idmap_get_key(const char *name, size_t namelen,
if (ret < 0)
goto out_up;
- payload = rcu_dereference(rkey->payload.rcudata);
+ payload = user_key_payload(rkey);
if (IS_ERR_OR_NULL(payload)) {
ret = PTR_ERR(payload);
goto out_up;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 693b903b48bd..765a03559363 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -78,7 +78,6 @@ struct nfs4_opendata;
static int _nfs4_proc_open(struct nfs4_opendata *data);
static int _nfs4_recover_proc_open(struct nfs4_opendata *data);
static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
-static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, long *);
static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr);
static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label);
static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label);
@@ -239,6 +238,7 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE
FATTR4_WORD1_TIME_DELTA
| FATTR4_WORD1_FS_LAYOUT_TYPES,
FATTR4_WORD2_LAYOUT_BLKSIZE
+ | FATTR4_WORD2_CLONE_BLKSIZE
};
const u32 nfs4_fs_locations_bitmap[3] = {
@@ -344,13 +344,16 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
/* This is the error handling routine for processes that are allowed
* to sleep.
*/
-int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
+static int nfs4_do_handle_exception(struct nfs_server *server,
+ int errorcode, struct nfs4_exception *exception)
{
struct nfs_client *clp = server->nfs_client;
struct nfs4_state *state = exception->state;
struct inode *inode = exception->inode;
int ret = errorcode;
+ exception->delay = 0;
+ exception->recovering = 0;
exception->retry = 0;
switch(errorcode) {
case 0:
@@ -359,11 +362,9 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
case -NFS4ERR_DELEG_REVOKED:
case -NFS4ERR_ADMIN_REVOKED:
case -NFS4ERR_BAD_STATEID:
- if (inode && nfs4_have_delegation(inode, FMODE_READ)) {
- nfs4_inode_return_delegation(inode);
- exception->retry = 1;
- return 0;
- }
+ if (inode && nfs_async_inode_return_delegation(inode,
+ NULL) == 0)
+ goto wait_on_recovery;
if (state == NULL)
break;
ret = nfs4_schedule_stateid_recovery(server, state);
@@ -409,11 +410,12 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
ret = -EBUSY;
break;
}
- case -NFS4ERR_GRACE:
case -NFS4ERR_DELAY:
- ret = nfs4_delay(server->client, &exception->timeout);
- if (ret != 0)
- break;
+ nfs_inc_server_stats(server, NFSIOS_DELAY);
+ case -NFS4ERR_GRACE:
+ exception->delay = 1;
+ return 0;
+
case -NFS4ERR_RETRY_UNCACHED_REP:
case -NFS4ERR_OLD_STATEID:
exception->retry = 1;
@@ -434,14 +436,85 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_
/* We failed to handle the error */
return nfs4_map_errors(ret);
wait_on_recovery:
- ret = nfs4_wait_clnt_recover(clp);
+ exception->recovering = 1;
+ return 0;
+}
+
+/* This is the error handling routine for processes that are allowed
+ * to sleep.
+ */
+int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
+{
+ struct nfs_client *clp = server->nfs_client;
+ int ret;
+
+ ret = nfs4_do_handle_exception(server, errorcode, exception);
+ if (exception->delay) {
+ ret = nfs4_delay(server->client, &exception->timeout);
+ goto out_retry;
+ }
+ if (exception->recovering) {
+ ret = nfs4_wait_clnt_recover(clp);
+ if (test_bit(NFS_MIG_FAILED, &server->mig_status))
+ return -EIO;
+ goto out_retry;
+ }
+ return ret;
+out_retry:
+ if (ret == 0)
+ exception->retry = 1;
+ return ret;
+}
+
+static int
+nfs4_async_handle_exception(struct rpc_task *task, struct nfs_server *server,
+ int errorcode, struct nfs4_exception *exception)
+{
+ struct nfs_client *clp = server->nfs_client;
+ int ret;
+
+ ret = nfs4_do_handle_exception(server, errorcode, exception);
+ if (exception->delay) {
+ rpc_delay(task, nfs4_update_delay(&exception->timeout));
+ goto out_retry;
+ }
+ if (exception->recovering) {
+ rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
+ if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
+ rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
+ goto out_retry;
+ }
if (test_bit(NFS_MIG_FAILED, &server->mig_status))
- return -EIO;
+ ret = -EIO;
+ return ret;
+out_retry:
if (ret == 0)
exception->retry = 1;
return ret;
}
+static int
+nfs4_async_handle_error(struct rpc_task *task, struct nfs_server *server,
+ struct nfs4_state *state, long *timeout)
+{
+ struct nfs4_exception exception = {
+ .state = state,
+ };
+
+ if (task->tk_status >= 0)
+ return 0;
+ if (timeout)
+ exception.timeout = *timeout;
+ task->tk_status = nfs4_async_handle_exception(task, server,
+ task->tk_status,
+ &exception);
+ if (exception.delay && timeout)
+ *timeout = exception.timeout;
+ if (exception.retry)
+ return -EAGAIN;
+ return 0;
+}
+
/*
* Return 'true' if 'clp' is using an rpc_client that is integrity protected
* or 'false' otherwise.
@@ -1127,6 +1200,21 @@ static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task)
return ret;
}
+static bool nfs4_mode_match_open_stateid(struct nfs4_state *state,
+ fmode_t fmode)
+{
+ switch(fmode & (FMODE_READ|FMODE_WRITE)) {
+ case FMODE_READ|FMODE_WRITE:
+ return state->n_rdwr != 0;
+ case FMODE_WRITE:
+ return state->n_wronly != 0;
+ case FMODE_READ:
+ return state->n_rdonly != 0;
+ }
+ WARN_ON_ONCE(1);
+ return false;
+}
+
static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode)
{
int ret = 0;
@@ -1443,12 +1531,18 @@ nfs4_opendata_check_deleg(struct nfs4_opendata *data, struct nfs4_state *state)
if (delegation)
delegation_flags = delegation->flags;
rcu_read_unlock();
- if (data->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR) {
+ switch (data->o_arg.claim) {
+ default:
+ break;
+ case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+ case NFS4_OPEN_CLAIM_DELEG_CUR_FH:
pr_err_ratelimited("NFS: Broken NFSv4 server %s is "
"returning a delegation for "
"OPEN(CLAIM_DELEGATE_CUR)\n",
clp->cl_hostname);
- } else if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0)
+ return;
+ }
+ if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0)
nfs_inode_set_delegation(state->inode,
data->owner->so_cred,
&data->o_res);
@@ -1571,17 +1665,13 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
return opendata;
}
-static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmode, struct nfs4_state **res)
+static int nfs4_open_recover_helper(struct nfs4_opendata *opendata,
+ fmode_t fmode)
{
struct nfs4_state *newstate;
int ret;
- if ((opendata->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR ||
- opendata->o_arg.claim == NFS4_OPEN_CLAIM_DELEG_CUR_FH) &&
- (opendata->o_arg.u.delegation_type & fmode) != fmode)
- /* This mode can't have been delegated, so we must have
- * a valid open_stateid to cover it - not need to reclaim.
- */
+ if (!nfs4_mode_match_open_stateid(opendata->state, fmode))
return 0;
opendata->o_arg.open_flags = 0;
opendata->o_arg.fmode = fmode;
@@ -1597,14 +1687,14 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmod
newstate = nfs4_opendata_to_nfs4_state(opendata);
if (IS_ERR(newstate))
return PTR_ERR(newstate);
+ if (newstate != opendata->state)
+ ret = -ESTALE;
nfs4_close_state(newstate, fmode);
- *res = newstate;
- return 0;
+ return ret;
}
static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *state)
{
- struct nfs4_state *newstate;
int ret;
/* Don't trigger recovery in nfs_test_and_clear_all_open_stateid */
@@ -1615,27 +1705,15 @@ static int nfs4_open_recover(struct nfs4_opendata *opendata, struct nfs4_state *
clear_bit(NFS_DELEGATED_STATE, &state->flags);
clear_bit(NFS_OPEN_STATE, &state->flags);
smp_rmb();
- if (state->n_rdwr != 0) {
- ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE, &newstate);
- if (ret != 0)
- return ret;
- if (newstate != state)
- return -ESTALE;
- }
- if (state->n_wronly != 0) {
- ret = nfs4_open_recover_helper(opendata, FMODE_WRITE, &newstate);
- if (ret != 0)
- return ret;
- if (newstate != state)
- return -ESTALE;
- }
- if (state->n_rdonly != 0) {
- ret = nfs4_open_recover_helper(opendata, FMODE_READ, &newstate);
- if (ret != 0)
- return ret;
- if (newstate != state)
- return -ESTALE;
- }
+ ret = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE);
+ if (ret != 0)
+ return ret;
+ ret = nfs4_open_recover_helper(opendata, FMODE_WRITE);
+ if (ret != 0)
+ return ret;
+ ret = nfs4_open_recover_helper(opendata, FMODE_READ);
+ if (ret != 0)
+ return ret;
/*
* We may have performed cached opens for all three recoveries.
* Check if we need to update the current stateid.
@@ -1759,18 +1837,35 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
return err;
}
-int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid)
+int nfs4_open_delegation_recall(struct nfs_open_context *ctx,
+ struct nfs4_state *state, const nfs4_stateid *stateid,
+ fmode_t type)
{
struct nfs_server *server = NFS_SERVER(state->inode);
struct nfs4_opendata *opendata;
- int err;
+ int err = 0;
opendata = nfs4_open_recoverdata_alloc(ctx, state,
NFS4_OPEN_CLAIM_DELEG_CUR_FH);
if (IS_ERR(opendata))
return PTR_ERR(opendata);
nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid);
- err = nfs4_open_recover(opendata, state);
+ write_seqlock(&state->seqlock);
+ nfs4_stateid_copy(&state->stateid, &state->open_stateid);
+ write_sequnlock(&state->seqlock);
+ clear_bit(NFS_DELEGATED_STATE, &state->flags);
+ switch (type & (FMODE_READ|FMODE_WRITE)) {
+ case FMODE_READ|FMODE_WRITE:
+ case FMODE_WRITE:
+ err = nfs4_open_recover_helper(opendata, FMODE_READ|FMODE_WRITE);
+ if (err)
+ break;
+ err = nfs4_open_recover_helper(opendata, FMODE_WRITE);
+ if (err)
+ break;
+ case FMODE_READ:
+ err = nfs4_open_recover_helper(opendata, FMODE_READ);
+ }
nfs4_opendata_put(opendata);
return nfs4_handle_delegation_recall_error(server, state, stateid, err);
}
@@ -1850,6 +1945,8 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data)
data->rpc_done = 0;
data->rpc_status = 0;
data->timestamp = jiffies;
+ if (data->is_recover)
+ nfs4_set_sequence_privileged(&data->c_arg.seq_args);
task = rpc_run_task(&task_setup_data);
if (IS_ERR(task))
return PTR_ERR(task);
@@ -2645,6 +2742,15 @@ out:
return err;
}
+static bool
+nfs4_wait_on_layoutreturn(struct inode *inode, struct rpc_task *task)
+{
+ if (inode == NULL || !nfs_have_layout(inode))
+ return false;
+
+ return pnfs_wait_on_layoutreturn(inode, task);
+}
+
struct nfs4_closedata {
struct inode *inode;
struct nfs4_state *state;
@@ -2763,6 +2869,11 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
goto out_no_action;
}
+ if (nfs4_wait_on_layoutreturn(inode, task)) {
+ nfs_release_seqid(calldata->arg.seqid);
+ goto out_wait;
+ }
+
if (calldata->arg.fmode == 0)
task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE];
if (calldata->roc)
@@ -4492,7 +4603,7 @@ static inline int nfs4_server_supports_acls(struct nfs_server *server)
#define NFS4ACL_MAXPAGES DIV_ROUND_UP(XATTR_SIZE_MAX, PAGE_SIZE)
static int buf_to_pages_noslab(const void *buf, size_t buflen,
- struct page **pages, unsigned int *pgbase)
+ struct page **pages)
{
struct page *newpage, **spages;
int rc = 0;
@@ -4636,7 +4747,6 @@ static ssize_t __nfs4_get_acl_uncached(struct inode *inode, void *buf, size_t bu
goto out_free;
args.acl_len = npages * PAGE_SIZE;
- args.acl_pgbase = 0;
dprintk("%s buf %p buflen %zu npages %d args.acl_len %zu\n",
__func__, buf, buflen, npages, args.acl_len);
@@ -4728,7 +4838,7 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
return -EOPNOTSUPP;
if (npages > ARRAY_SIZE(pages))
return -ERANGE;
- i = buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
+ i = buf_to_pages_noslab(buf, buflen, arg.acl_pages);
if (i < 0)
return i;
nfs4_inode_return_delegation(inode);
@@ -4917,79 +5027,6 @@ out:
#endif /* CONFIG_NFS_V4_SECURITY_LABEL */
-static int
-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
- struct nfs4_state *state, long *timeout)
-{
- struct nfs_client *clp = server->nfs_client;
-
- if (task->tk_status >= 0)
- return 0;
- switch(task->tk_status) {
- case -NFS4ERR_DELEG_REVOKED:
- case -NFS4ERR_ADMIN_REVOKED:
- case -NFS4ERR_BAD_STATEID:
- case -NFS4ERR_OPENMODE:
- if (state == NULL)
- break;
- if (nfs4_schedule_stateid_recovery(server, state) < 0)
- goto recovery_failed;
- goto wait_on_recovery;
- case -NFS4ERR_EXPIRED:
- if (state != NULL) {
- if (nfs4_schedule_stateid_recovery(server, state) < 0)
- goto recovery_failed;
- }
- case -NFS4ERR_STALE_STATEID:
- case -NFS4ERR_STALE_CLIENTID:
- nfs4_schedule_lease_recovery(clp);
- goto wait_on_recovery;
- case -NFS4ERR_MOVED:
- if (nfs4_schedule_migration_recovery(server) < 0)
- goto recovery_failed;
- goto wait_on_recovery;
- case -NFS4ERR_LEASE_MOVED:
- nfs4_schedule_lease_moved_recovery(clp);
- goto wait_on_recovery;
-#if defined(CONFIG_NFS_V4_1)
- case -NFS4ERR_BADSESSION:
- case -NFS4ERR_BADSLOT:
- case -NFS4ERR_BAD_HIGH_SLOT:
- case -NFS4ERR_DEADSESSION:
- case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
- case -NFS4ERR_SEQ_FALSE_RETRY:
- case -NFS4ERR_SEQ_MISORDERED:
- dprintk("%s ERROR %d, Reset session\n", __func__,
- task->tk_status);
- nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
- goto wait_on_recovery;
-#endif /* CONFIG_NFS_V4_1 */
- case -NFS4ERR_DELAY:
- nfs_inc_server_stats(server, NFSIOS_DELAY);
- rpc_delay(task, nfs4_update_delay(timeout));
- goto restart_call;
- case -NFS4ERR_GRACE:
- rpc_delay(task, NFS4_POLL_RETRY_MAX);
- case -NFS4ERR_RETRY_UNCACHED_REP:
- case -NFS4ERR_OLD_STATEID:
- goto restart_call;
- }
- task->tk_status = nfs4_map_errors(task->tk_status);
- return 0;
-recovery_failed:
- task->tk_status = -EIO;
- return 0;
-wait_on_recovery:
- rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
- if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
- rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
- if (test_bit(NFS_MIG_FAILED, &server->mig_status))
- goto recovery_failed;
-restart_call:
- task->tk_status = 0;
- return -EAGAIN;
-}
-
static void nfs4_init_boot_verifier(const struct nfs_client *clp,
nfs4_verifier *bootverf)
{
@@ -5308,6 +5345,9 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
d_data = (struct nfs4_delegreturndata *)data;
+ if (nfs4_wait_on_layoutreturn(d_data->inode, task))
+ return;
+
if (d_data->roc)
pnfs_roc_get_barrier(d_data->inode, &d_data->roc_barrier);
@@ -5472,18 +5512,7 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *
static int do_vfs_lock(struct inode *inode, struct file_lock *fl)
{
- int res = 0;
- switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
- case FL_POSIX:
- res = posix_lock_inode_wait(inode, fl);
- break;
- case FL_FLOCK:
- res = flock_lock_inode_wait(inode, fl);
- break;
- default:
- BUG();
- }
- return res;
+ return locks_lock_inode_wait(inode, fl);
}
struct nfs4_unlockdata {
@@ -5492,7 +5521,7 @@ struct nfs4_unlockdata {
struct nfs4_lock_state *lsp;
struct nfs_open_context *ctx;
struct file_lock fl;
- const struct nfs_server *server;
+ struct nfs_server *server;
unsigned long timestamp;
};
@@ -6219,9 +6248,10 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp)
#define XATTR_NAME_NFSV4_ACL "system.nfs4_acl"
-static int nfs4_xattr_set_nfs4_acl(struct dentry *dentry, const char *key,
+static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *key,
const void *buf, size_t buflen,
- int flags, int type)
+ int flags)
{
if (strcmp(key, "") != 0)
return -EINVAL;
@@ -6229,8 +6259,9 @@ static int nfs4_xattr_set_nfs4_acl(struct dentry *dentry, const char *key,
return nfs4_proc_set_acl(d_inode(dentry), buf, buflen);
}
-static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key,
- void *buf, size_t buflen, int type)
+static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *key,
+ void *buf, size_t buflen)
{
if (strcmp(key, "") != 0)
return -EINVAL;
@@ -6238,9 +6269,10 @@ static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key,
return nfs4_proc_get_acl(d_inode(dentry), buf, buflen);
}
-static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list,
+static size_t nfs4_xattr_list_nfs4_acl(const struct xattr_handler *handler,
+ struct dentry *dentry, char *list,
size_t list_len, const char *name,
- size_t name_len, int type)
+ size_t name_len)
{
size_t len = sizeof(XATTR_NAME_NFSV4_ACL);
@@ -6258,9 +6290,10 @@ static inline int nfs4_server_supports_labels(struct nfs_server *server)
return server->caps & NFS_CAP_SECURITY_LABEL;
}
-static int nfs4_xattr_set_nfs4_label(struct dentry *dentry, const char *key,
- const void *buf, size_t buflen,
- int flags, int type)
+static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *key,
+ const void *buf, size_t buflen,
+ int flags)
{
if (security_ismaclabel(key))
return nfs4_set_security_label(dentry, buf, buflen);
@@ -6268,17 +6301,19 @@ static int nfs4_xattr_set_nfs4_label(struct dentry *dentry, const char *key,
return -EOPNOTSUPP;
}
-static int nfs4_xattr_get_nfs4_label(struct dentry *dentry, const char *key,
- void *buf, size_t buflen, int type)
+static int nfs4_xattr_get_nfs4_label(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *key,
+ void *buf, size_t buflen)
{
if (security_ismaclabel(key))
return nfs4_get_security_label(d_inode(dentry), buf, buflen);
return -EOPNOTSUPP;
}
-static size_t nfs4_xattr_list_nfs4_label(struct dentry *dentry, char *list,
- size_t list_len, const char *name,
- size_t name_len, int type)
+static size_t nfs4_xattr_list_nfs4_label(const struct xattr_handler *handler,
+ struct dentry *dentry, char *list,
+ size_t list_len, const char *name,
+ size_t name_len)
{
size_t len = 0;
@@ -7800,39 +7835,46 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
dprintk("%s: NFS4ERR_RECALLCONFLICT waiting %lu\n",
__func__, delay);
rpc_delay(task, delay);
- task->tk_status = 0;
- rpc_restart_call_prepare(task);
- goto out; /* Do not call nfs4_async_handle_error() */
+ /* Do not call nfs4_async_handle_error() */
+ goto out_restart;
}
break;
case -NFS4ERR_EXPIRED:
case -NFS4ERR_BAD_STATEID:
spin_lock(&inode->i_lock);
- lo = NFS_I(inode)->layout;
- if (!lo || list_empty(&lo->plh_segs)) {
+ if (nfs4_stateid_match(&lgp->args.stateid,
+ &lgp->args.ctx->state->stateid)) {
spin_unlock(&inode->i_lock);
/* If the open stateid was bad, then recover it. */
state = lgp->args.ctx->state;
- } else {
+ break;
+ }
+ lo = NFS_I(inode)->layout;
+ if (lo && nfs4_stateid_match(&lgp->args.stateid,
+ &lo->plh_stateid)) {
LIST_HEAD(head);
/*
* Mark the bad layout state as invalid, then retry
* with the current stateid.
*/
+ set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
spin_unlock(&inode->i_lock);
pnfs_free_lseg_list(&head);
-
- task->tk_status = 0;
- rpc_restart_call_prepare(task);
- }
+ } else
+ spin_unlock(&inode->i_lock);
+ goto out_restart;
}
if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN)
- rpc_restart_call_prepare(task);
+ goto out_restart;
out:
dprintk("<-- %s\n", __func__);
return;
+out_restart:
+ task->tk_status = 0;
+ rpc_restart_call_prepare(task);
+ return;
out_overflow:
task->tk_status = -EOVERFLOW;
goto out;
@@ -8681,7 +8723,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
| NFS_CAP_ALLOCATE
| NFS_CAP_DEALLOCATE
| NFS_CAP_SEEK
- | NFS_CAP_LAYOUTSTATS,
+ | NFS_CAP_LAYOUTSTATS
+ | NFS_CAP_CLONE,
.init_client = nfs41_init_client,
.shutdown_client = nfs41_shutdown_client,
.match_stateid = nfs41_match_stateid,
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index da73bc443238..d854693a15b0 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1481,7 +1481,7 @@ restart:
spin_unlock(&state->state_lock);
}
nfs4_put_open_state(state);
- clear_bit(NFS4CLNT_RECLAIM_NOGRACE,
+ clear_bit(NFS_STATE_RECLAIM_NOGRACE,
&state->flags);
spin_lock(&sp->so_lock);
goto restart;
@@ -1725,7 +1725,8 @@ restart:
if (!test_and_clear_bit(ops->owner_flag_bit,
&sp->so_flags))
continue;
- atomic_inc(&sp->so_count);
+ if (!atomic_inc_not_zero(&sp->so_count))
+ continue;
spin_unlock(&clp->cl_lock);
rcu_read_unlock();
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 28df12e525ba..671cf68fe56b 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -409,7 +409,7 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
__entry->flags = flags;
__entry->fmode = (__force unsigned int)ctx->mode;
__entry->dev = ctx->dentry->d_sb->s_dev;
- if (!IS_ERR(state))
+ if (!IS_ERR_OR_NULL(state))
inode = state->inode;
if (inode != NULL) {
__entry->fileid = NFS_FILEID(inode);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 788adf3897c7..dfed4f5c8fcc 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1659,7 +1659,7 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun
*p = cpu_to_be32(FATTR4_WORD0_ACL);
p = reserve_space(xdr, 4);
*p = cpu_to_be32(arg->acl_len);
- xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
+ xdr_write_pages(xdr, arg->acl_pages, 0, arg->acl_len);
}
static void
@@ -2491,7 +2491,7 @@ static void nfs4_xdr_enc_getacl(struct rpc_rqst *req, struct xdr_stream *xdr,
encode_getattr_two(xdr, FATTR4_WORD0_ACL, 0, &hdr);
xdr_inline_pages(&req->rq_rcv_buf, replen << 2,
- args->acl_pages, args->acl_pgbase, args->acl_len);
+ args->acl_pages, 0, args->acl_len);
encode_nops(&hdr);
}
@@ -4375,6 +4375,11 @@ static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
goto xdr_error;
if ((status = decode_attr_files_total(xdr, bitmap, &fsstat->tfiles)) != 0)
goto xdr_error;
+
+ status = -EIO;
+ if (unlikely(bitmap[0]))
+ goto xdr_error;
+
if ((status = decode_attr_space_avail(xdr, bitmap, &fsstat->abytes)) != 0)
goto xdr_error;
if ((status = decode_attr_space_free(xdr, bitmap, &fsstat->fbytes)) != 0)
@@ -4574,6 +4579,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
goto xdr_error;
fattr->valid |= status;
+ status = -EIO;
+ if (unlikely(bitmap[0]))
+ goto xdr_error;
+
status = decode_attr_mode(xdr, bitmap, &fmode);
if (status < 0)
goto xdr_error;
@@ -4627,6 +4636,10 @@ static int decode_getfattr_attrs(struct xdr_stream *xdr, uint32_t *bitmap,
goto xdr_error;
fattr->valid |= status;
+ status = -EIO;
+ if (unlikely(bitmap[1]))
+ goto xdr_error;
+
status = decode_attr_mdsthreshold(xdr, bitmap, fattr->mdsthreshold);
if (status < 0)
goto xdr_error;
@@ -4764,6 +4777,28 @@ static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
return 0;
}
+/*
+ * The granularity of a CLONE operation.
+ */
+static int decode_attr_clone_blksize(struct xdr_stream *xdr, uint32_t *bitmap,
+ uint32_t *res)
+{
+ __be32 *p;
+
+ dprintk("%s: bitmap is %x\n", __func__, bitmap[2]);
+ *res = 0;
+ if (bitmap[2] & FATTR4_WORD2_CLONE_BLKSIZE) {
+ p = xdr_inline_decode(xdr, 4);
+ if (unlikely(!p)) {
+ print_overflow_msg(__func__, xdr);
+ return -EIO;
+ }
+ *res = be32_to_cpup(p);
+ bitmap[2] &= ~FATTR4_WORD2_CLONE_BLKSIZE;
+ }
+ return 0;
+}
+
static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
{
unsigned int savep;
@@ -4789,15 +4824,28 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
if ((status = decode_attr_maxwrite(xdr, bitmap, &fsinfo->wtmax)) != 0)
goto xdr_error;
fsinfo->wtpref = fsinfo->wtmax;
+
+ status = -EIO;
+ if (unlikely(bitmap[0]))
+ goto xdr_error;
+
status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta);
if (status != 0)
goto xdr_error;
status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
if (status != 0)
goto xdr_error;
+
+ status = -EIO;
+ if (unlikely(bitmap[1]))
+ goto xdr_error;
+
status = decode_attr_layout_blksize(xdr, bitmap, &fsinfo->blksize);
if (status)
goto xdr_error;
+ status = decode_attr_clone_blksize(xdr, bitmap, &fsinfo->clone_blksize);
+ if (status)
+ goto xdr_error;
status = verify_attr_len(xdr, savep, attrlen);
xdr_error:
@@ -7465,6 +7513,7 @@ struct rpc_procinfo nfs4_procedures[] = {
PROC(ALLOCATE, enc_allocate, dec_allocate),
PROC(DEALLOCATE, enc_deallocate, dec_deallocate),
PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats),
+ PROC(CLONE, enc_clone, dec_clone),
#endif /* CONFIG_NFS_V4_2 */
};
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 9bc9f04fb7f6..89a15dbe5efc 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -90,7 +90,7 @@
#define NFS_DEF_OPTIONS "vers=2,udp,rsize=4096,wsize=4096"
/* Parameters passed from the kernel command line */
-static char nfs_root_parms[256] __initdata = "";
+static char nfs_root_parms[NFS_MAXPATHLEN + 1] __initdata = "";
/* Text-based mount options passed to super.c */
static char nfs_root_options[256] __initdata = NFS_DEF_OPTIONS;
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 5aaed363556a..5c0c6b58157f 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -124,7 +124,7 @@ objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
retry_lookup:
od = osduld_info_lookup(&odi);
- if (unlikely(IS_ERR(od))) {
+ if (IS_ERR(od)) {
err = PTR_ERR(od);
dprintk("%s: osduld_info_lookup => %d\n", __func__, err);
if (err == -ENODEV && retry_flag) {
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 7c5718ba625e..fe3ddd20ff89 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -508,7 +508,7 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
* for it without upsetting the slab allocator.
*/
if (((mirror->pg_count + req->wb_bytes) >> PAGE_SHIFT) *
- sizeof(struct page) > PAGE_SIZE)
+ sizeof(struct page *) > PAGE_SIZE)
return 0;
return min(mirror->pg_bsize - mirror->pg_count, (size_t)req->wb_bytes);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index ba1246433794..93496c059837 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1104,20 +1104,15 @@ bool pnfs_roc(struct inode *ino)
mark_lseg_invalid(lseg, &tmp_list);
found = true;
}
- /* pnfs_prepare_layoutreturn() grabs lo ref and it will be put
- * in pnfs_roc_release(). We don't really send a layoutreturn but
- * still want others to view us like we are sending one!
- *
- * If pnfs_prepare_layoutreturn() fails, it means someone else is doing
- * LAYOUTRETURN, so we proceed like there are no layouts to return.
- *
- * ROC in three conditions:
+ /* ROC in two conditions:
* 1. there are ROC lsegs
* 2. we don't send layoutreturn
- * 3. no others are sending layoutreturn
*/
- if (found && !layoutreturn && pnfs_prepare_layoutreturn(lo))
+ if (found && !layoutreturn) {
+ /* lo ref dropped in pnfs_roc_release() */
+ pnfs_get_layout_hdr(lo);
roc = true;
+ }
out_noroc:
spin_unlock(&ino->i_lock);
@@ -1172,6 +1167,26 @@ void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier)
spin_unlock(&ino->i_lock);
}
+bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task)
+{
+ struct nfs_inode *nfsi = NFS_I(ino);
+ struct pnfs_layout_hdr *lo;
+ bool sleep = false;
+
+ /* we might not have grabbed lo reference. so need to check under
+ * i_lock */
+ spin_lock(&ino->i_lock);
+ lo = nfsi->layout;
+ if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
+ sleep = true;
+ spin_unlock(&ino->i_lock);
+
+ if (sleep)
+ rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
+
+ return sleep;
+}
+
/*
* Compare two layout segments for sorting into layout cache.
* We want to preferentially return RW over RO layouts, so ensure those
@@ -1897,12 +1912,13 @@ static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr)
*/
void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
{
- trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
- if (!hdr->pnfs_error) {
+ if (likely(!hdr->pnfs_error)) {
pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
hdr->mds_offset + hdr->res.count);
hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
- } else
+ }
+ trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
+ if (unlikely(hdr->pnfs_error))
pnfs_ld_handle_write_error(hdr);
hdr->mds_ops->rpc_release(hdr);
}
@@ -2013,11 +2029,12 @@ static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
*/
void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
{
- trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
if (likely(!hdr->pnfs_error)) {
__nfs4_read_done_cb(hdr);
hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
- } else
+ }
+ trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
+ if (unlikely(hdr->pnfs_error))
pnfs_ld_handle_read_error(hdr);
hdr->mds_ops->rpc_release(hdr);
}
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 78c9351ff117..d1990e90e7a0 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -270,6 +270,7 @@ bool pnfs_roc(struct inode *ino);
void pnfs_roc_release(struct inode *ino);
void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier);
+bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task);
void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t);
void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data);
int pnfs_layoutcommit_inode(struct inode *inode, bool sync);
@@ -639,6 +640,12 @@ pnfs_roc_get_barrier(struct inode *ino, u32 *barrier)
{
}
+static inline bool
+pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task)
+{
+ return false;
+}
+
static inline void set_pnfs_layoutdriver(struct nfs_server *s,
const struct nfs_fh *mntfh, u32 id)
{
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index ae0ff7a11b40..0a5e33f33b5c 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -72,6 +72,9 @@ void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
{
struct nfs_pgio_mirror *mirror;
+ if (pgio->pg_ops && pgio->pg_ops->pg_cleanup)
+ pgio->pg_ops->pg_cleanup(pgio);
+
pgio->pg_ops = &nfs_pgio_rw_ops;
/* read path should never have more than one mirror */
@@ -243,6 +246,13 @@ static void nfs_readpage_retry(struct rpc_task *task,
nfs_set_pgio_error(hdr, -EIO, argp->offset);
return;
}
+
+ /* For non rpc-based layout drivers, retry-through-MDS */
+ if (!task->tk_ops) {
+ hdr->pnfs_error = -EAGAIN;
+ return;
+ }
+
/* Yes, so retry the read at the end of the hdr */
hdr->mds_offset += resp->count;
argp->offset += resp->count;
@@ -265,7 +275,7 @@ static void nfs_readpage_result(struct rpc_task *task,
hdr->good_bytes = bound - hdr->io_start;
}
spin_unlock(&hdr->lock);
- } else if (hdr->res.count != hdr->args.count)
+ } else if (hdr->res.count < hdr->args.count)
nfs_readpage_retry(task, hdr);
}
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 383a027de452..f1268280244e 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2816,7 +2816,6 @@ out_invalid_transport_udp:
* NFS client for backwards compatibility
*/
unsigned int nfs_callback_set_tcpport;
-unsigned short nfs_callback_tcpport;
/* Default cache timeout is 10 minutes */
unsigned int nfs_idmap_cache_timeout = 600;
/* Turn off NFSv4 uid/gid mapping when using AUTH_SYS */
@@ -2827,7 +2826,6 @@ char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = "";
bool recover_lost_locks = false;
EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport);
-EXPORT_SYMBOL_GPL(nfs_callback_tcpport);
EXPORT_SYMBOL_GPL(nfs_idmap_cache_timeout);
EXPORT_SYMBOL_GPL(nfs4_disable_idmapping);
EXPORT_SYMBOL_GPL(max_session_slots);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 388f48079c43..7b9316406930 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -569,19 +569,17 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
if (!nfs_pageio_add_request(pgio, req)) {
nfs_redirty_request(req);
ret = pgio->pg_error;
- }
+ } else
+ nfs_add_stats(page_file_mapping(page)->host,
+ NFSIOS_WRITEPAGES, 1);
out:
return ret;
}
static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
{
- struct inode *inode = page_file_mapping(page)->host;
int ret;
- nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
- nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1);
-
nfs_pageio_cond_complete(pgio, page_file_index(page));
ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
if (ret == -EAGAIN) {
@@ -597,9 +595,11 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
{
struct nfs_pageio_descriptor pgio;
+ struct inode *inode = page_file_mapping(page)->host;
int err;
- nfs_pageio_init_write(&pgio, page->mapping->host, wb_priority(wbc),
+ nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
+ nfs_pageio_init_write(&pgio, inode, wb_priority(wbc),
false, &nfs_async_write_completion_ops);
err = nfs_do_writepage(page, wbc, &pgio);
nfs_pageio_complete(&pgio);
@@ -1223,7 +1223,7 @@ static int nfs_can_extend_write(struct file *file, struct page *page, struct ino
return 1;
if (!flctx || (list_empty_careful(&flctx->flc_flock) &&
list_empty_careful(&flctx->flc_posix)))
- return 0;
+ return 1;
/* Check to see if there are whole file write locks */
ret = 0;
@@ -1351,6 +1351,9 @@ void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
{
struct nfs_pgio_mirror *mirror;
+ if (pgio->pg_ops && pgio->pg_ops->pg_cleanup)
+ pgio->pg_ops->pg_cleanup(pgio);
+
pgio->pg_ops = &nfs_pgio_rw_ops;
nfs_pageio_stop_mirroring(pgio);
@@ -1502,6 +1505,13 @@ static void nfs_writeback_result(struct rpc_task *task,
task->tk_status = -EIO;
return;
}
+
+ /* For non rpc-based layout drivers, retry-through-MDS */
+ if (!task->tk_ops) {
+ hdr->pnfs_error = -EAGAIN;
+ return;
+ }
+
/* Was this an NFSv2 write or an NFSv3 stable write? */
if (resp->verf->committed != NFS_UNSTABLE) {
/* Resend from where the server left off */
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index cdefaa331a07..c29d9421bd5e 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -56,14 +56,6 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
u32 device_generation = 0;
int error;
- /*
- * We do not attempt to support I/O smaller than the fs block size,
- * or not aligned to it.
- */
- if (args->lg_minlength < block_size) {
- dprintk("pnfsd: I/O too small\n");
- goto out_layoutunavailable;
- }
if (seg->offset & (block_size - 1)) {
dprintk("pnfsd: I/O misaligned\n");
goto out_layoutunavailable;
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index f6e7cbabac5a..00575d776d91 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -262,11 +262,11 @@ void fill_post_wcc(struct svc_fh *fhp)
err = fh_getattr(fhp, &fhp->fh_post_attr);
fhp->fh_post_change = d_inode(fhp->fh_dentry)->i_version;
if (err) {
- fhp->fh_post_saved = 0;
+ fhp->fh_post_saved = false;
/* Grab the ctime anyway - set_change_info might use it */
fhp->fh_post_attr.ctime = d_inode(fhp->fh_dentry)->i_ctime;
} else
- fhp->fh_post_saved = 1;
+ fhp->fh_post_saved = true;
}
/*
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index ebf90e487c75..9ffef06b30d5 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -201,6 +201,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
INIT_LIST_HEAD(&ls->ls_perfile);
spin_lock_init(&ls->ls_lock);
INIT_LIST_HEAD(&ls->ls_layouts);
+ mutex_init(&ls->ls_mutex);
ls->ls_layout_type = layout_type;
nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops,
NFSPROC4_CLNT_CB_LAYOUT);
@@ -262,19 +263,23 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
status = nfserr_jukebox;
if (!ls)
goto out;
+ mutex_lock(&ls->ls_mutex);
} else {
ls = container_of(stid, struct nfs4_layout_stateid, ls_stid);
status = nfserr_bad_stateid;
+ mutex_lock(&ls->ls_mutex);
if (stateid->si_generation > stid->sc_stateid.si_generation)
- goto out_put_stid;
+ goto out_unlock_stid;
if (layout_type != ls->ls_layout_type)
- goto out_put_stid;
+ goto out_unlock_stid;
}
*lsp = ls;
return 0;
+out_unlock_stid:
+ mutex_unlock(&ls->ls_mutex);
out_put_stid:
nfs4_put_stid(stid);
out:
@@ -296,8 +301,6 @@ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls)
trace_layout_recall(&ls->ls_stid.sc_stateid);
atomic_inc(&ls->ls_stid.sc_count);
- update_stateid(&ls->ls_stid.sc_stateid);
- memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
nfsd4_run_cb(&ls->ls_recall);
out_unlock:
@@ -406,8 +409,7 @@ nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls)
list_add_tail(&new->lo_perstate, &ls->ls_layouts);
new = NULL;
done:
- update_stateid(&ls->ls_stid.sc_stateid);
- memcpy(&lgp->lg_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
+ nfs4_inc_and_copy_stateid(&lgp->lg_sid, &ls->ls_stid);
spin_unlock(&ls->ls_lock);
out:
spin_unlock(&fp->fi_lock);
@@ -481,11 +483,8 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp,
}
}
if (!list_empty(&ls->ls_layouts)) {
- if (found) {
- update_stateid(&ls->ls_stid.sc_stateid);
- memcpy(&lrp->lr_sid, &ls->ls_stid.sc_stateid,
- sizeof(stateid_t));
- }
+ if (found)
+ nfs4_inc_and_copy_stateid(&lrp->lr_sid, &ls->ls_stid);
lrp->lrs_present = 1;
} else {
trace_layoutstate_unhash(&ls->ls_stid.sc_stateid);
@@ -494,6 +493,7 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp,
}
spin_unlock(&ls->ls_lock);
+ mutex_unlock(&ls->ls_mutex);
nfs4_put_stid(&ls->ls_stid);
nfsd4_free_layouts(&reaplist);
return nfs_ok;
@@ -608,6 +608,16 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
}
}
+static void
+nfsd4_cb_layout_prepare(struct nfsd4_callback *cb)
+{
+ struct nfs4_layout_stateid *ls =
+ container_of(cb, struct nfs4_layout_stateid, ls_recall);
+
+ mutex_lock(&ls->ls_mutex);
+ nfs4_inc_and_copy_stateid(&ls->ls_recall_sid, &ls->ls_stid);
+}
+
static int
nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
{
@@ -649,12 +659,14 @@ nfsd4_cb_layout_release(struct nfsd4_callback *cb)
trace_layout_recall_release(&ls->ls_stid.sc_stateid);
+ mutex_unlock(&ls->ls_mutex);
nfsd4_return_all_layouts(ls, &reaplist);
nfsd4_free_layouts(&reaplist);
nfs4_put_stid(&ls->ls_stid);
}
static struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
+ .prepare = nfsd4_cb_layout_prepare,
.done = nfsd4_cb_layout_done,
.release = nfsd4_cb_layout_release,
};
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 4ce6b97b31ad..a9f096c7e99f 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1309,6 +1309,7 @@ nfsd4_layoutget(struct svc_rqst *rqstp,
nfserr = nfsd4_insert_layout(lgp, ls);
out_put_stid:
+ mutex_unlock(&ls->ls_mutex);
nfs4_put_stid(&ls->ls_stid);
out:
return nfserr;
@@ -1362,6 +1363,9 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp,
goto out;
}
+ /* LAYOUTCOMMIT does not require any serialization */
+ mutex_unlock(&ls->ls_mutex);
+
if (new_size > i_size_read(inode)) {
lcp->lc_size_chg = 1;
lcp->lc_newsize = new_size;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 0f1d5691b795..6b800b5b8fed 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -575,6 +575,7 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid;
/* Will be incremented before return to client: */
atomic_set(&stid->sc_count, 1);
+ spin_lock_init(&stid->sc_lock);
/*
* It shouldn't be a problem to reuse an opaque stateid value.
@@ -745,6 +746,18 @@ nfs4_put_stid(struct nfs4_stid *s)
put_nfs4_file(fp);
}
+void
+nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid)
+{
+ stateid_t *src = &stid->sc_stateid;
+
+ spin_lock(&stid->sc_lock);
+ if (unlikely(++src->si_generation == 0))
+ src->si_generation = 1;
+ memcpy(dst, src, sizeof(*dst));
+ spin_unlock(&stid->sc_lock);
+}
+
static void nfs4_put_deleg_lease(struct nfs4_file *fp)
{
struct file *filp = NULL;
@@ -765,16 +778,68 @@ void nfs4_unhash_stid(struct nfs4_stid *s)
s->sc_type = 0;
}
-static void
+/**
+ * nfs4_get_existing_delegation - Discover if this delegation already exists
+ * @clp: a pointer to the nfs4_client we're granting a delegation to
+ * @fp: a pointer to the nfs4_file we're granting a delegation on
+ *
+ * Return:
+ * On success: NULL if an existing delegation was not found.
+ *
+ * On error: -EAGAIN if one was previously granted to this nfs4_client
+ * for this nfs4_file.
+ *
+ */
+
+static int
+nfs4_get_existing_delegation(struct nfs4_client *clp, struct nfs4_file *fp)
+{
+ struct nfs4_delegation *searchdp = NULL;
+ struct nfs4_client *searchclp = NULL;
+
+ lockdep_assert_held(&state_lock);
+ lockdep_assert_held(&fp->fi_lock);
+
+ list_for_each_entry(searchdp, &fp->fi_delegations, dl_perfile) {
+ searchclp = searchdp->dl_stid.sc_client;
+ if (clp == searchclp) {
+ return -EAGAIN;
+ }
+ }
+ return 0;
+}
+
+/**
+ * hash_delegation_locked - Add a delegation to the appropriate lists
+ * @dp: a pointer to the nfs4_delegation we are adding.
+ * @fp: a pointer to the nfs4_file we're granting a delegation on
+ *
+ * Return:
+ * On success: NULL if the delegation was successfully hashed.
+ *
+ * On error: -EAGAIN if one was previously granted to this
+ * nfs4_client for this nfs4_file. Delegation is not hashed.
+ *
+ */
+
+static int
hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp)
{
+ int status;
+ struct nfs4_client *clp = dp->dl_stid.sc_client;
+
lockdep_assert_held(&state_lock);
lockdep_assert_held(&fp->fi_lock);
+ status = nfs4_get_existing_delegation(clp, fp);
+ if (status)
+ return status;
+ ++fp->fi_delegees;
atomic_inc(&dp->dl_stid.sc_count);
dp->dl_stid.sc_type = NFS4_DELEG_STID;
list_add(&dp->dl_perfile, &fp->fi_delegations);
- list_add(&dp->dl_perclnt, &dp->dl_stid.sc_client->cl_delegations);
+ list_add(&dp->dl_perclnt, &clp->cl_delegations);
+ return 0;
}
static bool
@@ -2256,15 +2321,20 @@ nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
clid->flags = new->cl_exchange_flags;
}
+static bool client_has_openowners(struct nfs4_client *clp)
+{
+ struct nfs4_openowner *oo;
+
+ list_for_each_entry(oo, &clp->cl_openowners, oo_perclient) {
+ if (!list_empty(&oo->oo_owner.so_stateids))
+ return true;
+ }
+ return false;
+}
+
static bool client_has_state(struct nfs4_client *clp)
{
- /*
- * Note clp->cl_openowners check isn't quite right: there's no
- * need to count owners without stateid's.
- *
- * Also note we should probably be using this in 4.0 case too.
- */
- return !list_empty(&clp->cl_openowners)
+ return client_has_openowners(clp)
#ifdef CONFIG_NFSD_PNFS
|| !list_empty(&clp->cl_lo_states)
#endif
@@ -3049,7 +3119,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
/* Cases below refer to rfc 3530 section 14.2.33: */
spin_lock(&nn->client_lock);
conf = find_confirmed_client_by_name(&clname, nn);
- if (conf) {
+ if (conf && client_has_state(conf)) {
/* case 0: */
status = nfserr_clid_inuse;
if (clp_used_exchangeid(conf))
@@ -3136,6 +3206,11 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
} else { /* case 3: normal case; new or rebooted client */
old = find_confirmed_client_by_name(&unconf->cl_name, nn);
if (old) {
+ status = nfserr_clid_inuse;
+ if (client_has_state(old)
+ && !same_creds(&unconf->cl_cred,
+ &old->cl_cred))
+ goto out;
status = mark_client_expired_locked(old);
if (status) {
old = NULL;
@@ -3317,6 +3392,27 @@ static const struct nfs4_stateowner_operations openowner_ops = {
.so_free = nfs4_free_openowner,
};
+static struct nfs4_ol_stateid *
+nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
+{
+ struct nfs4_ol_stateid *local, *ret = NULL;
+ struct nfs4_openowner *oo = open->op_openowner;
+
+ lockdep_assert_held(&fp->fi_lock);
+
+ list_for_each_entry(local, &fp->fi_stateids, st_perfile) {
+ /* ignore lock owners */
+ if (local->st_stateowner->so_is_open_owner == 0)
+ continue;
+ if (local->st_stateowner == &oo->oo_owner) {
+ ret = local;
+ atomic_inc(&ret->st_stid.sc_count);
+ break;
+ }
+ }
+ return ret;
+}
+
static struct nfs4_openowner *
alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
struct nfsd4_compound_state *cstate)
@@ -3348,9 +3444,20 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open,
return ret;
}
-static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
+static struct nfs4_ol_stateid *
+init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
+ struct nfsd4_open *open)
+{
+
struct nfs4_openowner *oo = open->op_openowner;
+ struct nfs4_ol_stateid *retstp = NULL;
+ spin_lock(&oo->oo_owner.so_client->cl_lock);
+ spin_lock(&fp->fi_lock);
+
+ retstp = nfsd4_find_existing_open(fp, open);
+ if (retstp)
+ goto out_unlock;
atomic_inc(&stp->st_stid.sc_count);
stp->st_stid.sc_type = NFS4_OPEN_STID;
INIT_LIST_HEAD(&stp->st_locks);
@@ -3360,12 +3467,14 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
stp->st_access_bmap = 0;
stp->st_deny_bmap = 0;
stp->st_openstp = NULL;
- spin_lock(&oo->oo_owner.so_client->cl_lock);
+ init_rwsem(&stp->st_rwsem);
list_add(&stp->st_perstateowner, &oo->oo_owner.so_stateids);
- spin_lock(&fp->fi_lock);
list_add(&stp->st_perfile, &fp->fi_stateids);
+
+out_unlock:
spin_unlock(&fp->fi_lock);
spin_unlock(&oo->oo_owner.so_client->cl_lock);
+ return retstp;
}
/*
@@ -3776,27 +3885,6 @@ out:
return nfs_ok;
}
-static struct nfs4_ol_stateid *
-nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
-{
- struct nfs4_ol_stateid *local, *ret = NULL;
- struct nfs4_openowner *oo = open->op_openowner;
-
- spin_lock(&fp->fi_lock);
- list_for_each_entry(local, &fp->fi_stateids, st_perfile) {
- /* ignore lock owners */
- if (local->st_stateowner->so_is_open_owner == 0)
- continue;
- if (local->st_stateowner == &oo->oo_owner) {
- ret = local;
- atomic_inc(&ret->st_stid.sc_count);
- break;
- }
- }
- spin_unlock(&fp->fi_lock);
- return ret;
-}
-
static inline int nfs4_access_to_access(u32 nfs4_access)
{
int flags = 0;
@@ -3945,6 +4033,18 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag)
return fl;
}
+/**
+ * nfs4_setlease - Obtain a delegation by requesting lease from vfs layer
+ * @dp: a pointer to the nfs4_delegation we're adding.
+ *
+ * Return:
+ * On success: Return code will be 0 on success.
+ *
+ * On error: -EAGAIN if there was an existing delegation.
+ * nonzero if there is an error in other cases.
+ *
+ */
+
static int nfs4_setlease(struct nfs4_delegation *dp)
{
struct nfs4_file *fp = dp->dl_stid.sc_file;
@@ -3976,16 +4076,19 @@ static int nfs4_setlease(struct nfs4_delegation *dp)
goto out_unlock;
/* Race breaker */
if (fp->fi_deleg_file) {
- status = 0;
- ++fp->fi_delegees;
- hash_delegation_locked(dp, fp);
+ status = hash_delegation_locked(dp, fp);
goto out_unlock;
}
fp->fi_deleg_file = filp;
- fp->fi_delegees = 1;
- hash_delegation_locked(dp, fp);
+ fp->fi_delegees = 0;
+ status = hash_delegation_locked(dp, fp);
spin_unlock(&fp->fi_lock);
spin_unlock(&state_lock);
+ if (status) {
+ /* Should never happen, this is a new fi_deleg_file */
+ WARN_ON_ONCE(1);
+ goto out_fput;
+ }
return 0;
out_unlock:
spin_unlock(&fp->fi_lock);
@@ -4005,6 +4108,15 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
if (fp->fi_had_conflict)
return ERR_PTR(-EAGAIN);
+ spin_lock(&state_lock);
+ spin_lock(&fp->fi_lock);
+ status = nfs4_get_existing_delegation(clp, fp);
+ spin_unlock(&fp->fi_lock);
+ spin_unlock(&state_lock);
+
+ if (status)
+ return ERR_PTR(status);
+
dp = alloc_init_deleg(clp, fh, odstate);
if (!dp)
return ERR_PTR(-ENOMEM);
@@ -4023,9 +4135,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
status = -EAGAIN;
goto out_unlock;
}
- ++fp->fi_delegees;
- hash_delegation_locked(dp, fp);
- status = 0;
+ status = hash_delegation_locked(dp, fp);
out_unlock:
spin_unlock(&fp->fi_lock);
spin_unlock(&state_lock);
@@ -4160,6 +4270,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
struct nfs4_client *cl = open->op_openowner->oo_owner.so_client;
struct nfs4_file *fp = NULL;
struct nfs4_ol_stateid *stp = NULL;
+ struct nfs4_ol_stateid *swapstp = NULL;
struct nfs4_delegation *dp = NULL;
__be32 status;
@@ -4173,7 +4284,9 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
status = nfs4_check_deleg(cl, open, &dp);
if (status)
goto out;
+ spin_lock(&fp->fi_lock);
stp = nfsd4_find_existing_open(fp, open);
+ spin_unlock(&fp->fi_lock);
} else {
open->op_file = NULL;
status = nfserr_bad_stateid;
@@ -4187,15 +4300,32 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
*/
if (stp) {
/* Stateid was found, this is an OPEN upgrade */
+ down_read(&stp->st_rwsem);
status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open);
- if (status)
+ if (status) {
+ up_read(&stp->st_rwsem);
goto out;
+ }
} else {
stp = open->op_stp;
open->op_stp = NULL;
- init_open_stateid(stp, fp, open);
+ swapstp = init_open_stateid(stp, fp, open);
+ if (swapstp) {
+ nfs4_put_stid(&stp->st_stid);
+ stp = swapstp;
+ down_read(&stp->st_rwsem);
+ status = nfs4_upgrade_open(rqstp, fp, current_fh,
+ stp, open);
+ if (status) {
+ up_read(&stp->st_rwsem);
+ goto out;
+ }
+ goto upgrade_out;
+ }
+ down_read(&stp->st_rwsem);
status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open);
if (status) {
+ up_read(&stp->st_rwsem);
release_open_stateid(stp);
goto out;
}
@@ -4205,8 +4335,9 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
if (stp->st_clnt_odstate == open->op_odstate)
open->op_odstate = NULL;
}
- update_stateid(&stp->st_stid.sc_stateid);
- memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+upgrade_out:
+ nfs4_inc_and_copy_stateid(&open->op_stateid, &stp->st_stid);
+ up_read(&stp->st_rwsem);
if (nfsd4_has_session(&resp->cstate)) {
if (open->op_deleg_want & NFS4_SHARE_WANT_NO_DELEG) {
@@ -4819,10 +4950,13 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
* revoked delegations are kept only for free_stateid.
*/
return nfserr_bad_stateid;
+ down_write(&stp->st_rwsem);
status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate));
- if (status)
- return status;
- return nfs4_check_fh(current_fh, &stp->st_stid);
+ if (status == nfs_ok)
+ status = nfs4_check_fh(current_fh, &stp->st_stid);
+ if (status != nfs_ok)
+ up_write(&stp->st_rwsem);
+ return status;
}
/*
@@ -4869,6 +5003,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs
return status;
oo = openowner(stp->st_stateowner);
if (!(oo->oo_flags & NFS4_OO_CONFIRMED)) {
+ up_write(&stp->st_rwsem);
nfs4_put_stid(&stp->st_stid);
return nfserr_bad_stateid;
}
@@ -4899,11 +5034,13 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
oo = openowner(stp->st_stateowner);
status = nfserr_bad_stateid;
- if (oo->oo_flags & NFS4_OO_CONFIRMED)
+ if (oo->oo_flags & NFS4_OO_CONFIRMED) {
+ up_write(&stp->st_rwsem);
goto put_stateid;
+ }
oo->oo_flags |= NFS4_OO_CONFIRMED;
- update_stateid(&stp->st_stid.sc_stateid);
- memcpy(&oc->oc_resp_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+ nfs4_inc_and_copy_stateid(&oc->oc_resp_stateid, &stp->st_stid);
+ up_write(&stp->st_rwsem);
dprintk("NFSD: %s: success, seqid=%d stateid=" STATEID_FMT "\n",
__func__, oc->oc_seqid, STATEID_VAL(&stp->st_stid.sc_stateid));
@@ -4975,13 +5112,11 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
goto put_stateid;
}
nfs4_stateid_downgrade(stp, od->od_share_access);
-
reset_union_bmap_deny(od->od_share_deny, stp);
-
- update_stateid(&stp->st_stid.sc_stateid);
- memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+ nfs4_inc_and_copy_stateid(&od->od_stateid, &stp->st_stid);
status = nfs_ok;
put_stateid:
+ up_write(&stp->st_rwsem);
nfs4_put_stid(&stp->st_stid);
out:
nfsd4_bump_seqid(cstate, status);
@@ -5033,8 +5168,8 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfsd4_bump_seqid(cstate, status);
if (status)
goto out;
- update_stateid(&stp->st_stid.sc_stateid);
- memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+ nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid);
+ up_write(&stp->st_rwsem);
nfsd4_close_open_stateid(stp);
@@ -5260,6 +5395,7 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo,
stp->st_access_bmap = 0;
stp->st_deny_bmap = open_stp->st_deny_bmap;
stp->st_openstp = open_stp;
+ init_rwsem(&stp->st_rwsem);
list_add(&stp->st_locks, &open_stp->st_locks);
list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids);
spin_lock(&fp->fi_lock);
@@ -5428,6 +5564,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
&open_stp, nn);
if (status)
goto out;
+ up_write(&open_stp->st_rwsem);
open_sop = openowner(open_stp->st_stateowner);
status = nfserr_bad_stateid;
if (!same_clid(&open_sop->oo_owner.so_client->cl_clientid,
@@ -5435,6 +5572,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
status = lookup_or_create_lock_state(cstate, open_stp, lock,
&lock_stp, &new);
+ if (status == nfs_ok)
+ down_write(&lock_stp->st_rwsem);
} else {
status = nfs4_preprocess_seqid_op(cstate,
lock->lk_old_lock_seqid,
@@ -5512,9 +5651,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
err = vfs_lock_file(filp, F_SETLK, file_lock, conflock);
switch (-err) {
case 0: /* success! */
- update_stateid(&lock_stp->st_stid.sc_stateid);
- memcpy(&lock->lk_resp_stateid, &lock_stp->st_stid.sc_stateid,
- sizeof(stateid_t));
+ nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid);
status = 0;
break;
case (EAGAIN): /* conflock holds conflicting lock */
@@ -5540,6 +5677,8 @@ out:
seqid_mutating_err(ntohl(status)))
lock_sop->lo_owner.so_seqid++;
+ up_write(&lock_stp->st_rwsem);
+
/*
* If this is a new, never-before-used stateid, and we are
* returning an error, then just go ahead and release it.
@@ -5704,11 +5843,11 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
dprintk("NFSD: nfs4_locku: vfs_lock_file failed!\n");
goto out_nfserr;
}
- update_stateid(&stp->st_stid.sc_stateid);
- memcpy(&locku->lu_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
+ nfs4_inc_and_copy_stateid(&locku->lu_stateid, &stp->st_stid);
fput:
fput(filp);
put_stateid:
+ up_write(&stp->st_rwsem);
nfs4_put_stid(&stp->st_stid);
out:
nfsd4_bump_seqid(cstate, status);
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 46ec934f5dee..54cde9a5864e 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -63,7 +63,6 @@ static unsigned int longest_chain;
static unsigned int longest_chain_cachesize;
static int nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
-static void cache_cleaner_func(struct work_struct *unused);
static unsigned long nfsd_reply_cache_count(struct shrinker *shrink,
struct shrink_control *sc);
static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink,
@@ -76,13 +75,6 @@ static struct shrinker nfsd_reply_cache_shrinker = {
};
/*
- * locking for the reply cache:
- * A cache entry is "single use" if c_state == RC_INPROG
- * Otherwise, it when accessing _prev or _next, the lock must be held.
- */
-static DECLARE_DELAYED_WORK(cache_cleaner, cache_cleaner_func);
-
-/*
* Put a cap on the size of the DRC based on the amount of available
* low memory in the machine.
*
@@ -203,7 +195,6 @@ void nfsd_reply_cache_shutdown(void)
unsigned int i;
unregister_shrinker(&nfsd_reply_cache_shrinker);
- cancel_delayed_work_sync(&cache_cleaner);
for (i = 0; i < drc_hashsize; i++) {
struct list_head *head = &drc_hashtbl[i].lru_head;
@@ -217,10 +208,8 @@ void nfsd_reply_cache_shutdown(void)
drc_hashtbl = NULL;
drc_hashsize = 0;
- if (drc_slab) {
- kmem_cache_destroy(drc_slab);
- drc_slab = NULL;
- }
+ kmem_cache_destroy(drc_slab);
+ drc_slab = NULL;
}
/*
@@ -232,7 +221,6 @@ lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp)
{
rp->c_timestamp = jiffies;
list_move_tail(&rp->c_lru, &b->lru_head);
- schedule_delayed_work(&cache_cleaner, RC_EXPIRE);
}
static long
@@ -266,7 +254,6 @@ prune_cache_entries(void)
{
unsigned int i;
long freed = 0;
- bool cancel = true;
for (i = 0; i < drc_hashsize; i++) {
struct nfsd_drc_bucket *b = &drc_hashtbl[i];
@@ -275,26 +262,11 @@ prune_cache_entries(void)
continue;
spin_lock(&b->cache_lock);
freed += prune_bucket(b);
- if (!list_empty(&b->lru_head))
- cancel = false;
spin_unlock(&b->cache_lock);
}
-
- /*
- * Conditionally rearm the job to run in RC_EXPIRE since we just
- * ran the pruner.
- */
- if (!cancel)
- mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE);
return freed;
}
-static void
-cache_cleaner_func(struct work_struct *unused)
-{
- prune_cache_entries();
-}
-
static unsigned long
nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
{
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index 350041a40fe5..c1681ce894c5 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -631,10 +631,7 @@ fh_put(struct svc_fh *fhp)
fh_unlock(fhp);
fhp->fh_dentry = NULL;
dput(dentry);
-#ifdef CONFIG_NFSD_V3
- fhp->fh_pre_saved = 0;
- fhp->fh_post_saved = 0;
-#endif
+ fh_clear_wcc(fhp);
}
fh_drop_write(fhp);
if (exp) {
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 1e90dad4926b..2087bae17582 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -26,16 +26,16 @@ static inline ino_t u32_to_ino_t(__u32 uino)
*/
typedef struct svc_fh {
struct knfsd_fh fh_handle; /* FH data */
+ int fh_maxsize; /* max size for fh_handle */
struct dentry * fh_dentry; /* validated dentry */
struct svc_export * fh_export; /* export pointer */
- int fh_maxsize; /* max size for fh_handle */
- unsigned char fh_locked; /* inode locked by us */
- unsigned char fh_want_write; /* remount protection taken */
+ bool fh_locked; /* inode locked by us */
+ bool fh_want_write; /* remount protection taken */
#ifdef CONFIG_NFSD_V3
- unsigned char fh_post_saved; /* post-op attrs saved */
- unsigned char fh_pre_saved; /* pre-op attrs saved */
+ bool fh_post_saved; /* post-op attrs saved */
+ bool fh_pre_saved; /* pre-op attrs saved */
/* Pre-op attributes saved during fh_lock */
__u64 fh_pre_size; /* size before operation */
@@ -213,8 +213,8 @@ static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
static inline void
fh_clear_wcc(struct svc_fh *fhp)
{
- fhp->fh_post_saved = 0;
- fhp->fh_pre_saved = 0;
+ fhp->fh_post_saved = false;
+ fhp->fh_pre_saved = false;
}
/*
@@ -231,7 +231,7 @@ fill_pre_wcc(struct svc_fh *fhp)
fhp->fh_pre_ctime = inode->i_ctime;
fhp->fh_pre_size = inode->i_size;
fhp->fh_pre_change = inode->i_version;
- fhp->fh_pre_saved = 1;
+ fhp->fh_pre_saved = true;
}
}
@@ -267,7 +267,7 @@ fh_lock_nested(struct svc_fh *fhp, unsigned int subclass)
inode = d_inode(dentry);
mutex_lock_nested(&inode->i_mutex, subclass);
fill_pre_wcc(fhp);
- fhp->fh_locked = 1;
+ fhp->fh_locked = true;
}
static inline void
@@ -285,7 +285,7 @@ fh_unlock(struct svc_fh *fhp)
if (fhp->fh_locked) {
fill_post_wcc(fhp);
mutex_unlock(&d_inode(fhp->fh_dentry)->i_mutex);
- fhp->fh_locked = 0;
+ fhp->fh_locked = false;
}
}
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 583ffc13cae2..77fdf4de91ba 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -84,7 +84,7 @@ struct nfsd4_callback_ops {
* fields that are of general use to any stateid.
*/
struct nfs4_stid {
- atomic_t sc_count;
+ atomic_t sc_count;
#define NFS4_OPEN_STID 1
#define NFS4_LOCK_STID 2
#define NFS4_DELEG_STID 4
@@ -94,11 +94,12 @@ struct nfs4_stid {
#define NFS4_REVOKED_DELEG_STID 16
#define NFS4_CLOSED_DELEG_STID 32
#define NFS4_LAYOUT_STID 64
- unsigned char sc_type;
- stateid_t sc_stateid;
- struct nfs4_client *sc_client;
- struct nfs4_file *sc_file;
- void (*sc_free)(struct nfs4_stid *);
+ unsigned char sc_type;
+ stateid_t sc_stateid;
+ spinlock_t sc_lock;
+ struct nfs4_client *sc_client;
+ struct nfs4_file *sc_file;
+ void (*sc_free)(struct nfs4_stid *);
};
/*
@@ -364,15 +365,6 @@ struct nfs4_client_reclaim {
char cr_recdir[HEXDIR_LEN]; /* recover dir */
};
-static inline void
-update_stateid(stateid_t *stateid)
-{
- stateid->si_generation++;
- /* Wraparound recommendation from 3530bis-13 9.1.3.2: */
- if (stateid->si_generation == 0)
- stateid->si_generation = 1;
-}
-
/* A reasonable value for REPLAY_ISIZE was estimated as follows:
* The OPEN response, typically the largest, requires
* 4(status) + 8(stateid) + 20(changeinfo) + 4(rflags) + 8(verifier) +
@@ -534,15 +526,16 @@ struct nfs4_file {
* Better suggestions welcome.
*/
struct nfs4_ol_stateid {
- struct nfs4_stid st_stid; /* must be first field */
- struct list_head st_perfile;
- struct list_head st_perstateowner;
- struct list_head st_locks;
- struct nfs4_stateowner * st_stateowner;
- struct nfs4_clnt_odstate * st_clnt_odstate;
- unsigned char st_access_bmap;
- unsigned char st_deny_bmap;
- struct nfs4_ol_stateid * st_openstp;
+ struct nfs4_stid st_stid;
+ struct list_head st_perfile;
+ struct list_head st_perstateowner;
+ struct list_head st_locks;
+ struct nfs4_stateowner *st_stateowner;
+ struct nfs4_clnt_odstate *st_clnt_odstate;
+ unsigned char st_access_bmap;
+ unsigned char st_deny_bmap;
+ struct nfs4_ol_stateid *st_openstp;
+ struct rw_semaphore st_rwsem;
};
static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
@@ -561,6 +554,7 @@ struct nfs4_layout_stateid {
struct nfsd4_callback ls_recall;
stateid_t ls_recall_sid;
bool ls_recalled;
+ struct mutex ls_mutex;
};
static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
@@ -593,6 +587,7 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
struct kmem_cache *slab);
void nfs4_unhash_stid(struct nfs4_stid *s);
void nfs4_put_stid(struct nfs4_stid *s);
+void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid);
void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
extern void nfs4_release_reclaim(struct nfsd_net *);
extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir,
diff --git a/fs/nfsd/trace.c b/fs/nfsd/trace.c
index 82f89070594c..90967466a1e5 100644
--- a/fs/nfsd/trace.c
+++ b/fs/nfsd/trace.c
@@ -1,5 +1,3 @@
-#include "state.h"
-
#define CREATE_TRACE_POINTS
#include "trace.h"
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index c668520c344b..0befe762762b 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -9,6 +9,8 @@
#include <linux/tracepoint.h>
+#include "state.h"
+
DECLARE_EVENT_CLASS(nfsd_stateid_class,
TP_PROTO(stateid_t *stp),
TP_ARGS(stp),
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 45c04979e7b3..994d66fbb446 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1631,7 +1631,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
/* cannot use fh_lock as we need deadlock protective ordering
* so do it by hand */
trap = lock_rename(tdentry, fdentry);
- ffhp->fh_locked = tfhp->fh_locked = 1;
+ ffhp->fh_locked = tfhp->fh_locked = true;
fill_pre_wcc(ffhp);
fill_pre_wcc(tfhp);
@@ -1681,7 +1681,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
fill_post_wcc(ffhp);
fill_post_wcc(tfhp);
unlock_rename(tdentry, fdentry);
- ffhp->fh_locked = tfhp->fh_locked = 0;
+ ffhp->fh_locked = tfhp->fh_locked = false;
fh_drop_write(ffhp);
out:
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index fee2451ae248..fcfc48cbe136 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -112,14 +112,14 @@ static inline int fh_want_write(struct svc_fh *fh)
int ret = mnt_want_write(fh->fh_export->ex_path.mnt);
if (!ret)
- fh->fh_want_write = 1;
+ fh->fh_want_write = true;
return ret;
}
static inline void fh_drop_write(struct svc_fh *fh)
{
if (fh->fh_want_write) {
- fh->fh_want_write = 0;
+ fh->fh_want_write = false;
mnt_drop_write(fh->fh_export->ex_path.mnt);
}
}
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 9f991007a578..ce7362c88b48 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -632,7 +632,7 @@ static inline void
set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp)
{
BUG_ON(!fhp->fh_pre_saved);
- cinfo->atomic = fhp->fh_post_saved;
+ cinfo->atomic = (u32)fhp->fh_post_saved;
cinfo->change_supported = IS_I_VERSION(d_inode(fhp->fh_dentry));
cinfo->before_change = fhp->fh_pre_change;
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 8df0f3b7839b..2ccbf5531554 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -133,38 +133,38 @@ nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
/**
* nilfs_palloc_group_desc_nfrees - get the number of free entries in a group
- * @inode: inode of metadata file using this allocator
- * @group: group number
* @desc: pointer to descriptor structure for the group
+ * @lock: spin lock protecting @desc
*/
static unsigned long
-nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
- const struct nilfs_palloc_group_desc *desc)
+nilfs_palloc_group_desc_nfrees(const struct nilfs_palloc_group_desc *desc,
+ spinlock_t *lock)
{
unsigned long nfree;
- spin_lock(nilfs_mdt_bgl_lock(inode, group));
+ spin_lock(lock);
nfree = le32_to_cpu(desc->pg_nfrees);
- spin_unlock(nilfs_mdt_bgl_lock(inode, group));
+ spin_unlock(lock);
return nfree;
}
/**
* nilfs_palloc_group_desc_add_entries - adjust count of free entries
- * @inode: inode of metadata file using this allocator
- * @group: group number
* @desc: pointer to descriptor structure for the group
+ * @lock: spin lock protecting @desc
* @n: delta to be added
*/
-static void
-nilfs_palloc_group_desc_add_entries(struct inode *inode,
- unsigned long group,
- struct nilfs_palloc_group_desc *desc,
- u32 n)
+static u32
+nilfs_palloc_group_desc_add_entries(struct nilfs_palloc_group_desc *desc,
+ spinlock_t *lock, u32 n)
{
- spin_lock(nilfs_mdt_bgl_lock(inode, group));
+ u32 nfree;
+
+ spin_lock(lock);
le32_add_cpu(&desc->pg_nfrees, n);
- spin_unlock(nilfs_mdt_bgl_lock(inode, group));
+ nfree = le32_to_cpu(desc->pg_nfrees);
+ spin_unlock(lock);
+ return nfree;
}
/**
@@ -240,6 +240,26 @@ static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff,
}
/**
+ * nilfs_palloc_delete_block - delete a block on the persistent allocator file
+ * @inode: inode of metadata file using this allocator
+ * @blkoff: block offset
+ * @prev: nilfs_bh_assoc struct of the last used buffer
+ * @lock: spin lock protecting @prev
+ */
+static int nilfs_palloc_delete_block(struct inode *inode, unsigned long blkoff,
+ struct nilfs_bh_assoc *prev,
+ spinlock_t *lock)
+{
+ spin_lock(lock);
+ if (prev->bh && blkoff == prev->blkoff) {
+ brelse(prev->bh);
+ prev->bh = NULL;
+ }
+ spin_unlock(lock);
+ return nilfs_mdt_delete_block(inode, blkoff);
+}
+
+/**
* nilfs_palloc_get_desc_block - get buffer head of a group descriptor block
* @inode: inode of metadata file using this allocator
* @group: group number
@@ -278,6 +298,22 @@ static int nilfs_palloc_get_bitmap_block(struct inode *inode,
}
/**
+ * nilfs_palloc_delete_bitmap_block - delete a bitmap block
+ * @inode: inode of metadata file using this allocator
+ * @group: group number
+ */
+static int nilfs_palloc_delete_bitmap_block(struct inode *inode,
+ unsigned long group)
+{
+ struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+ return nilfs_palloc_delete_block(inode,
+ nilfs_palloc_bitmap_blkoff(inode,
+ group),
+ &cache->prev_bitmap, &cache->lock);
+}
+
+/**
* nilfs_palloc_get_entry_block - get buffer head of an entry block
* @inode: inode of metadata file using this allocator
* @nr: serial number of the entry (e.g. inode number)
@@ -296,6 +332,20 @@ int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
}
/**
+ * nilfs_palloc_delete_entry_block - delete an entry block
+ * @inode: inode of metadata file using this allocator
+ * @nr: serial number of the entry
+ */
+static int nilfs_palloc_delete_entry_block(struct inode *inode, __u64 nr)
+{
+ struct nilfs_palloc_cache *cache = NILFS_MDT(inode)->mi_palloc_cache;
+
+ return nilfs_palloc_delete_block(inode,
+ nilfs_palloc_entry_blkoff(inode, nr),
+ &cache->prev_entry, &cache->lock);
+}
+
+/**
* nilfs_palloc_block_get_group_desc - get kernel address of a group descriptor
* @inode: inode of metadata file using this allocator
* @group: group number
@@ -332,51 +382,40 @@ void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
/**
* nilfs_palloc_find_available_slot - find available slot in a group
- * @inode: inode of metadata file using this allocator
- * @group: group number
- * @target: offset number of an entry in the group (start point)
* @bitmap: bitmap of the group
+ * @target: offset number of an entry in the group (start point)
* @bsize: size in bits
+ * @lock: spin lock protecting @bitmap
*/
-static int nilfs_palloc_find_available_slot(struct inode *inode,
- unsigned long group,
+static int nilfs_palloc_find_available_slot(unsigned char *bitmap,
unsigned long target,
- unsigned char *bitmap,
- int bsize)
-{
- int curr, pos, end, i;
-
- if (target > 0) {
- end = (target + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
- if (end > bsize)
- end = bsize;
- pos = nilfs_find_next_zero_bit(bitmap, end, target);
- if (pos < end &&
- !nilfs_set_bit_atomic(
- nilfs_mdt_bgl_lock(inode, group), pos, bitmap))
- return pos;
- } else
- end = 0;
-
- for (i = 0, curr = end;
- i < bsize;
- i += BITS_PER_LONG, curr += BITS_PER_LONG) {
- /* wrap around */
- if (curr >= bsize)
- curr = 0;
- while (*((unsigned long *)bitmap + curr / BITS_PER_LONG)
- != ~0UL) {
- end = curr + BITS_PER_LONG;
- if (end > bsize)
- end = bsize;
- pos = nilfs_find_next_zero_bit(bitmap, end, curr);
- if ((pos < end) &&
- !nilfs_set_bit_atomic(
- nilfs_mdt_bgl_lock(inode, group), pos,
- bitmap))
+ unsigned bsize,
+ spinlock_t *lock)
+{
+ int pos, end = bsize;
+
+ if (likely(target < bsize)) {
+ pos = target;
+ do {
+ pos = nilfs_find_next_zero_bit(bitmap, end, pos);
+ if (pos >= end)
+ break;
+ if (!nilfs_set_bit_atomic(lock, pos, bitmap))
return pos;
- }
+ } while (++pos < end);
+
+ end = target;
+ }
+
+ /* wrap around */
+ for (pos = 0; pos < end; pos++) {
+ pos = nilfs_find_next_zero_bit(bitmap, end, pos);
+ if (pos >= end)
+ break;
+ if (!nilfs_set_bit_atomic(lock, pos, bitmap))
+ return pos;
}
+
return -ENOSPC;
}
@@ -475,15 +514,15 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
void *desc_kaddr, *bitmap_kaddr;
unsigned long group, maxgroup, ngroups;
unsigned long group_offset, maxgroup_offset;
- unsigned long n, entries_per_group, groups_per_desc_block;
+ unsigned long n, entries_per_group;
unsigned long i, j;
+ spinlock_t *lock;
int pos, ret;
ngroups = nilfs_palloc_groups_count(inode);
maxgroup = ngroups - 1;
group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
entries_per_group = nilfs_palloc_entries_per_group(inode);
- groups_per_desc_block = nilfs_palloc_groups_per_desc_block(inode);
for (i = 0; i < ngroups; i += n) {
if (group >= ngroups) {
@@ -501,8 +540,8 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
n = nilfs_palloc_rest_groups_in_desc_block(inode, group,
maxgroup);
for (j = 0; j < n; j++, desc++, group++) {
- if (nilfs_palloc_group_desc_nfrees(inode, group, desc)
- > 0) {
+ lock = nilfs_mdt_bgl_lock(inode, group);
+ if (nilfs_palloc_group_desc_nfrees(desc, lock) > 0) {
ret = nilfs_palloc_get_bitmap_block(
inode, group, 1, &bitmap_bh);
if (ret < 0)
@@ -510,12 +549,12 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
bitmap_kaddr = kmap(bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
pos = nilfs_palloc_find_available_slot(
- inode, group, group_offset, bitmap,
- entries_per_group);
+ bitmap, group_offset,
+ entries_per_group, lock);
if (pos >= 0) {
/* found a free entry */
nilfs_palloc_group_desc_add_entries(
- inode, group, desc, -1);
+ desc, lock, -1);
req->pr_entry_nr =
entries_per_group * group + pos;
kunmap(desc_bh->b_page);
@@ -573,6 +612,7 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
unsigned long group, group_offset;
unsigned char *bitmap;
void *desc_kaddr, *bitmap_kaddr;
+ spinlock_t *lock;
group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
desc_kaddr = kmap(req->pr_desc_bh->b_page);
@@ -580,13 +620,15 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
req->pr_desc_bh, desc_kaddr);
bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
+ lock = nilfs_mdt_bgl_lock(inode, group);
- if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
- group_offset, bitmap))
- printk(KERN_WARNING "%s: entry number %llu already freed\n",
- __func__, (unsigned long long)req->pr_entry_nr);
+ if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
+ nilfs_warning(inode->i_sb, __func__,
+ "entry number %llu already freed: ino=%lu\n",
+ (unsigned long long)req->pr_entry_nr,
+ (unsigned long)inode->i_ino);
else
- nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
+ nilfs_palloc_group_desc_add_entries(desc, lock, 1);
kunmap(req->pr_bitmap_bh->b_page);
kunmap(req->pr_desc_bh->b_page);
@@ -611,6 +653,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
void *desc_kaddr, *bitmap_kaddr;
unsigned char *bitmap;
unsigned long group, group_offset;
+ spinlock_t *lock;
group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
desc_kaddr = kmap(req->pr_desc_bh->b_page);
@@ -618,12 +661,15 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
req->pr_desc_bh, desc_kaddr);
bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
- if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
- group_offset, bitmap))
- printk(KERN_WARNING "%s: entry number %llu already freed\n",
- __func__, (unsigned long long)req->pr_entry_nr);
+ lock = nilfs_mdt_bgl_lock(inode, group);
+
+ if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
+ nilfs_warning(inode->i_sb, __func__,
+ "entry number %llu already freed: ino=%lu\n",
+ (unsigned long long)req->pr_entry_nr,
+ (unsigned long)inode->i_ino);
else
- nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
+ nilfs_palloc_group_desc_add_entries(desc, lock, 1);
kunmap(req->pr_bitmap_bh->b_page);
kunmap(req->pr_desc_bh->b_page);
@@ -680,22 +726,6 @@ void nilfs_palloc_abort_free_entry(struct inode *inode,
}
/**
- * nilfs_palloc_group_is_in - judge if an entry is in a group
- * @inode: inode of metadata file using this allocator
- * @group: group number
- * @nr: serial number of the entry (e.g. inode number)
- */
-static int
-nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
-{
- __u64 first, last;
-
- first = group * nilfs_palloc_entries_per_group(inode);
- last = first + nilfs_palloc_entries_per_group(inode) - 1;
- return (nr >= first) && (nr <= last);
-}
-
-/**
* nilfs_palloc_freev - deallocate a set of persistent objects
* @inode: inode of metadata file using this allocator
* @entry_nrs: array of entry numbers to be deallocated
@@ -708,9 +738,18 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
unsigned char *bitmap;
void *desc_kaddr, *bitmap_kaddr;
unsigned long group, group_offset;
- int i, j, n, ret;
+ __u64 group_min_nr, last_nrs[8];
+ const unsigned long epg = nilfs_palloc_entries_per_group(inode);
+ const unsigned epb = NILFS_MDT(inode)->mi_entries_per_block;
+ unsigned entry_start, end, pos;
+ spinlock_t *lock;
+ int i, j, k, ret;
+ u32 nfree;
for (i = 0; i < nitems; i = j) {
+ int change_group = false;
+ int nempties = 0, n = 0;
+
group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset);
ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh);
if (ret < 0)
@@ -721,38 +760,89 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
brelse(desc_bh);
return ret;
}
- desc_kaddr = kmap(desc_bh->b_page);
- desc = nilfs_palloc_block_get_group_desc(
- inode, group, desc_bh, desc_kaddr);
+
+ /* Get the first entry number of the group */
+ group_min_nr = (__u64)group * epg;
+
bitmap_kaddr = kmap(bitmap_bh->b_page);
bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
- for (j = i, n = 0;
- (j < nitems) && nilfs_palloc_group_is_in(inode, group,
- entry_nrs[j]);
- j++) {
- nilfs_palloc_group(inode, entry_nrs[j], &group_offset);
- if (!nilfs_clear_bit_atomic(
- nilfs_mdt_bgl_lock(inode, group),
- group_offset, bitmap)) {
- printk(KERN_WARNING
- "%s: entry number %llu already freed\n",
- __func__,
- (unsigned long long)entry_nrs[j]);
+ lock = nilfs_mdt_bgl_lock(inode, group);
+
+ j = i;
+ entry_start = rounddown(group_offset, epb);
+ do {
+ if (!nilfs_clear_bit_atomic(lock, group_offset,
+ bitmap)) {
+ nilfs_warning(inode->i_sb, __func__,
+ "entry number %llu already freed: ino=%lu\n",
+ (unsigned long long)entry_nrs[j],
+ (unsigned long)inode->i_ino);
} else {
n++;
}
- }
- nilfs_palloc_group_desc_add_entries(inode, group, desc, n);
+
+ j++;
+ if (j >= nitems || entry_nrs[j] < group_min_nr ||
+ entry_nrs[j] >= group_min_nr + epg) {
+ change_group = true;
+ } else {
+ group_offset = entry_nrs[j] - group_min_nr;
+ if (group_offset >= entry_start &&
+ group_offset < entry_start + epb) {
+ /* This entry is in the same block */
+ continue;
+ }
+ }
+
+ /* Test if the entry block is empty or not */
+ end = entry_start + epb;
+ pos = nilfs_find_next_bit(bitmap, end, entry_start);
+ if (pos >= end) {
+ last_nrs[nempties++] = entry_nrs[j - 1];
+ if (nempties >= ARRAY_SIZE(last_nrs))
+ break;
+ }
+
+ if (change_group)
+ break;
+
+ /* Go on to the next entry block */
+ entry_start = rounddown(group_offset, epb);
+ } while (true);
kunmap(bitmap_bh->b_page);
- kunmap(desc_bh->b_page);
+ mark_buffer_dirty(bitmap_bh);
+ brelse(bitmap_bh);
+ for (k = 0; k < nempties; k++) {
+ ret = nilfs_palloc_delete_entry_block(inode,
+ last_nrs[k]);
+ if (ret && ret != -ENOENT) {
+ nilfs_warning(inode->i_sb, __func__,
+ "failed to delete block of entry %llu: ino=%lu, err=%d\n",
+ (unsigned long long)last_nrs[k],
+ (unsigned long)inode->i_ino, ret);
+ }
+ }
+
+ desc_kaddr = kmap_atomic(desc_bh->b_page);
+ desc = nilfs_palloc_block_get_group_desc(
+ inode, group, desc_bh, desc_kaddr);
+ nfree = nilfs_palloc_group_desc_add_entries(desc, lock, n);
+ kunmap_atomic(desc_kaddr);
mark_buffer_dirty(desc_bh);
- mark_buffer_dirty(bitmap_bh);
nilfs_mdt_mark_dirty(inode);
-
- brelse(bitmap_bh);
brelse(desc_bh);
+
+ if (nfree == nilfs_palloc_entries_per_group(inode)) {
+ ret = nilfs_palloc_delete_bitmap_block(inode, group);
+ if (ret && ret != -ENOENT) {
+ nilfs_warning(inode->i_sb, __func__,
+ "failed to delete bitmap block of group %lu: ino=%lu, err=%d\n",
+ group,
+ (unsigned long)inode->i_ino, ret);
+ }
+ }
}
return 0;
}
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
index 4bd6451b5703..6e6f49aa53df 100644
--- a/fs/nilfs2/alloc.h
+++ b/fs/nilfs2/alloc.h
@@ -77,6 +77,7 @@ int nilfs_palloc_freev(struct inode *, __u64 *, size_t);
#define nilfs_set_bit_atomic ext2_set_bit_atomic
#define nilfs_clear_bit_atomic ext2_clear_bit_atomic
#define nilfs_find_next_zero_bit find_next_zero_bit_le
+#define nilfs_find_next_bit find_next_bit_le
/**
* struct nilfs_bh_assoc - block offset and buffer head association
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 919fd5bb14a8..3a3821b00486 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -919,8 +919,6 @@ static void nilfs_btree_split(struct nilfs_bmap *btree,
int level, __u64 *keyp, __u64 *ptrp)
{
struct nilfs_btree_node *node, *right;
- __u64 newkey;
- __u64 newptr;
int nchildren, n, move, ncblk;
node = nilfs_btree_get_nonroot_node(path, level);
@@ -942,9 +940,6 @@ static void nilfs_btree_split(struct nilfs_bmap *btree,
if (!buffer_dirty(path[level].bp_sib_bh))
mark_buffer_dirty(path[level].bp_sib_bh);
- newkey = nilfs_btree_node_get_key(right, 0);
- newptr = path[level].bp_newreq.bpr_ptr;
-
if (move) {
path[level].bp_index -= nilfs_btree_node_get_nchildren(node);
nilfs_btree_node_insert(right, path[level].bp_index,
@@ -1856,7 +1851,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree,
__u64 key, __u64 ptr,
const __u64 *keys, const __u64 *ptrs, int n)
{
- struct buffer_head *bh;
+ struct buffer_head *bh = NULL;
union nilfs_bmap_ptr_req dreq, nreq, *di, *ni;
struct nilfs_bmap_stats stats;
int ret;
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 0d5fada91191..7dc23f100e57 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -155,7 +155,6 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
{
struct nilfs_dat_entry *entry;
- __u64 start;
sector_t blocknr;
void *kaddr;
int ret;
@@ -169,7 +168,6 @@ int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
kaddr = kmap_atomic(req->pr_entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
req->pr_entry_bh, kaddr);
- start = le64_to_cpu(entry->de_start);
blocknr = le64_to_cpu(entry->de_blocknr);
kunmap_atomic(kaddr);
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 54575e3cc1a2..088ba001c6ef 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -109,7 +109,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
goto out;
file_update_time(vma->vm_file);
- ret = __block_page_mkwrite(vma, vmf, nilfs_get_block);
+ ret = block_page_mkwrite(vma, vmf, nilfs_get_block);
if (ret) {
nilfs_transaction_abort(inode->i_sb);
goto out;
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 4a73d6dffabf..ac2f64943ff4 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -356,7 +356,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
goto failed;
mapping_set_gfp_mask(inode->i_mapping,
- mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+ mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS));
root = NILFS_I(dir)->i_root;
ii = NILFS_I(inode);
@@ -522,7 +522,7 @@ static int __nilfs_read_inode(struct super_block *sb,
up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
nilfs_set_inode_flags(inode);
mapping_set_gfp_mask(inode->i_mapping,
- mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+ mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS));
return 0;
failed_unmap:
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index dee34d990281..1125f40233ff 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -33,6 +33,7 @@
#include "page.h"
#include "mdt.h"
+#include <trace/events/nilfs2.h>
#define NILFS_MDT_MAX_RA_BLOCKS (16 - 1)
@@ -68,6 +69,9 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
set_buffer_uptodate(bh);
mark_buffer_dirty(bh);
nilfs_mdt_mark_dirty(inode);
+
+ trace_nilfs2_mdt_insert_new_block(inode, inode->i_ino, block);
+
return 0;
}
@@ -158,6 +162,8 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
get_bh(bh);
submit_bh(mode, bh);
ret = 0;
+
+ trace_nilfs2_mdt_submit_block(inode, inode->i_ino, blkoff, mode);
out:
get_bh(bh);
*out_bh = bh;
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index fe529a87a208..03246cac3338 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -72,7 +72,7 @@ static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
}
/* Default GFP flags using highmem */
-#define NILFS_MDT_GFP (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM)
+#define NILFS_MDT_GFP (__GFP_RECLAIM | __GFP_IO | __GFP_HIGHMEM)
int nilfs_mdt_get_block(struct inode *, unsigned long, int,
void (*init_block)(struct inode *,
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 37dd6b05b1b5..c9a1a491aa91 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -120,9 +120,6 @@ nilfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev)
struct nilfs_transaction_info ti;
int err;
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
if (err)
return err;
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index ff00a0b7acb9..9b4f205d1173 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -582,7 +582,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
struct nilfs_recovery_info *ri)
{
struct buffer_head *bh_sum = NULL;
- struct nilfs_segment_summary *sum;
+ struct nilfs_segment_summary *sum = NULL;
sector_t pseg_start;
sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */
unsigned long nsalvaged_blocks = 0;
@@ -814,7 +814,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs,
struct nilfs_recovery_info *ri)
{
struct buffer_head *bh_sum = NULL;
- struct nilfs_segment_summary *sum;
+ struct nilfs_segment_summary *sum = NULL;
sector_t pseg_start, pseg_end, sr_pseg_start = 0;
sector_t seg_start, seg_end; /* range of full segment (block number) */
sector_t b, end;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index c6abbad9b8e3..3b65adaae7e4 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -77,6 +77,36 @@ enum {
NILFS_ST_DONE,
};
+#define CREATE_TRACE_POINTS
+#include <trace/events/nilfs2.h>
+
+/*
+ * nilfs_sc_cstage_inc(), nilfs_sc_cstage_set(), nilfs_sc_cstage_get() are
+ * wrapper functions of stage count (nilfs_sc_info->sc_stage.scnt). Users of
+ * the variable must use them because transition of stage count must involve
+ * trace events (trace_nilfs2_collection_stage_transition).
+ *
+ * nilfs_sc_cstage_get() isn't required for the above purpose because it doesn't
+ * produce tracepoint events. It is provided just for making the intention
+ * clear.
+ */
+static inline void nilfs_sc_cstage_inc(struct nilfs_sc_info *sci)
+{
+ sci->sc_stage.scnt++;
+ trace_nilfs2_collection_stage_transition(sci);
+}
+
+static inline void nilfs_sc_cstage_set(struct nilfs_sc_info *sci, int next_scnt)
+{
+ sci->sc_stage.scnt = next_scnt;
+ trace_nilfs2_collection_stage_transition(sci);
+}
+
+static inline int nilfs_sc_cstage_get(struct nilfs_sc_info *sci)
+{
+ return sci->sc_stage.scnt;
+}
+
/* State flags of collection */
#define NILFS_CF_NODE 0x0001 /* Collecting node blocks */
#define NILFS_CF_IFILE_STARTED 0x0002 /* IFILE stage has started */
@@ -184,11 +214,18 @@ int nilfs_transaction_begin(struct super_block *sb,
{
struct the_nilfs *nilfs;
int ret = nilfs_prepare_segment_lock(ti);
+ struct nilfs_transaction_info *trace_ti;
if (unlikely(ret < 0))
return ret;
- if (ret > 0)
+ if (ret > 0) {
+ trace_ti = current->journal_info;
+
+ trace_nilfs2_transaction_transition(sb, trace_ti,
+ trace_ti->ti_count, trace_ti->ti_flags,
+ TRACE_NILFS2_TRANSACTION_BEGIN);
return 0;
+ }
sb_start_intwrite(sb);
@@ -199,6 +236,11 @@ int nilfs_transaction_begin(struct super_block *sb,
ret = -ENOSPC;
goto failed;
}
+
+ trace_ti = current->journal_info;
+ trace_nilfs2_transaction_transition(sb, trace_ti, trace_ti->ti_count,
+ trace_ti->ti_flags,
+ TRACE_NILFS2_TRANSACTION_BEGIN);
return 0;
failed:
@@ -231,6 +273,8 @@ int nilfs_transaction_commit(struct super_block *sb)
ti->ti_flags |= NILFS_TI_COMMIT;
if (ti->ti_count > 0) {
ti->ti_count--;
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_COMMIT);
return 0;
}
if (nilfs->ns_writer) {
@@ -242,6 +286,9 @@ int nilfs_transaction_commit(struct super_block *sb)
nilfs_segctor_do_flush(sci, 0);
}
up_read(&nilfs->ns_segctor_sem);
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_COMMIT);
+
current->journal_info = ti->ti_save;
if (ti->ti_flags & NILFS_TI_SYNC)
@@ -260,10 +307,15 @@ void nilfs_transaction_abort(struct super_block *sb)
BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
if (ti->ti_count > 0) {
ti->ti_count--;
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_ABORT);
return;
}
up_read(&nilfs->ns_segctor_sem);
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_ABORT);
+
current->journal_info = ti->ti_save;
if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
kmem_cache_free(nilfs_transaction_cachep, ti);
@@ -309,6 +361,9 @@ static void nilfs_transaction_lock(struct super_block *sb,
current->journal_info = ti;
for (;;) {
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_TRYLOCK);
+
down_write(&nilfs->ns_segctor_sem);
if (!test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags))
break;
@@ -320,6 +375,9 @@ static void nilfs_transaction_lock(struct super_block *sb,
}
if (gcflag)
ti->ti_flags |= NILFS_TI_GC;
+
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_LOCK);
}
static void nilfs_transaction_unlock(struct super_block *sb)
@@ -332,6 +390,9 @@ static void nilfs_transaction_unlock(struct super_block *sb)
up_write(&nilfs->ns_segctor_sem);
current->journal_info = ti->ti_save;
+
+ trace_nilfs2_transaction_transition(sb, ti, ti->ti_count,
+ ti->ti_flags, TRACE_NILFS2_TRANSACTION_UNLOCK);
}
static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
@@ -1062,7 +1123,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
size_t ndone;
int err = 0;
- switch (sci->sc_stage.scnt) {
+ switch (nilfs_sc_cstage_get(sci)) {
case NILFS_ST_INIT:
/* Pre-processes */
sci->sc_stage.flags = 0;
@@ -1071,7 +1132,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
sci->sc_nblk_inc = 0;
sci->sc_curseg->sb_sum.flags = NILFS_SS_LOGBGN;
if (mode == SC_LSEG_DSYNC) {
- sci->sc_stage.scnt = NILFS_ST_DSYNC;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DSYNC);
goto dsync_mode;
}
}
@@ -1079,10 +1140,10 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
sci->sc_stage.dirty_file_ptr = NULL;
sci->sc_stage.gc_inode_ptr = NULL;
if (mode == SC_FLUSH_DAT) {
- sci->sc_stage.scnt = NILFS_ST_DAT;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DAT);
goto dat_stage;
}
- sci->sc_stage.scnt++; /* Fall through */
+ nilfs_sc_cstage_inc(sci); /* Fall through */
case NILFS_ST_GC:
if (nilfs_doing_gc()) {
head = &sci->sc_gc_inodes;
@@ -1103,7 +1164,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
}
sci->sc_stage.gc_inode_ptr = NULL;
}
- sci->sc_stage.scnt++; /* Fall through */
+ nilfs_sc_cstage_inc(sci); /* Fall through */
case NILFS_ST_FILE:
head = &sci->sc_dirty_files;
ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head,
@@ -1125,10 +1186,10 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
}
sci->sc_stage.dirty_file_ptr = NULL;
if (mode == SC_FLUSH_FILE) {
- sci->sc_stage.scnt = NILFS_ST_DONE;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
return 0;
}
- sci->sc_stage.scnt++;
+ nilfs_sc_cstage_inc(sci);
sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED;
/* Fall through */
case NILFS_ST_IFILE:
@@ -1136,7 +1197,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
&nilfs_sc_file_ops);
if (unlikely(err))
break;
- sci->sc_stage.scnt++;
+ nilfs_sc_cstage_inc(sci);
/* Creating a checkpoint */
err = nilfs_segctor_create_checkpoint(sci);
if (unlikely(err))
@@ -1147,7 +1208,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
&nilfs_sc_file_ops);
if (unlikely(err))
break;
- sci->sc_stage.scnt++; /* Fall through */
+ nilfs_sc_cstage_inc(sci); /* Fall through */
case NILFS_ST_SUFILE:
err = nilfs_sufile_freev(nilfs->ns_sufile, sci->sc_freesegs,
sci->sc_nfreesegs, &ndone);
@@ -1163,7 +1224,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
&nilfs_sc_file_ops);
if (unlikely(err))
break;
- sci->sc_stage.scnt++; /* Fall through */
+ nilfs_sc_cstage_inc(sci); /* Fall through */
case NILFS_ST_DAT:
dat_stage:
err = nilfs_segctor_scan_file(sci, nilfs->ns_dat,
@@ -1171,10 +1232,10 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
if (unlikely(err))
break;
if (mode == SC_FLUSH_DAT) {
- sci->sc_stage.scnt = NILFS_ST_DONE;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
return 0;
}
- sci->sc_stage.scnt++; /* Fall through */
+ nilfs_sc_cstage_inc(sci); /* Fall through */
case NILFS_ST_SR:
if (mode == SC_LSEG_SR) {
/* Appending a super root */
@@ -1184,7 +1245,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
}
/* End of a logical segment */
sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
- sci->sc_stage.scnt = NILFS_ST_DONE;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
return 0;
case NILFS_ST_DSYNC:
dsync_mode:
@@ -1197,7 +1258,7 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
if (unlikely(err))
break;
sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
- sci->sc_stage.scnt = NILFS_ST_DONE;
+ nilfs_sc_cstage_set(sci, NILFS_ST_DONE);
return 0;
case NILFS_ST_DONE:
return 0;
@@ -1442,7 +1503,8 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
goto failed;
/* The current segment is filled up */
- if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
+ if (mode != SC_LSEG_SR ||
+ nilfs_sc_cstage_get(sci) < NILFS_ST_CPFILE)
break;
nilfs_clear_logs(&sci->sc_segbufs);
@@ -1946,7 +2008,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
int err;
- sci->sc_stage.scnt = NILFS_ST_INIT;
+ nilfs_sc_cstage_set(sci, NILFS_ST_INIT);
sci->sc_cno = nilfs->ns_cno;
err = nilfs_segctor_collect_dirty_files(sci, nilfs);
@@ -1974,7 +2036,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
goto failed;
/* Avoid empty segment */
- if (sci->sc_stage.scnt == NILFS_ST_DONE &&
+ if (nilfs_sc_cstage_get(sci) == NILFS_ST_DONE &&
nilfs_segbuf_empty(sci->sc_curseg)) {
nilfs_segctor_abort_construction(sci, nilfs, 1);
goto out;
@@ -1988,7 +2050,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
nilfs_segctor_fill_in_file_bmap(sci);
if (mode == SC_LSEG_SR &&
- sci->sc_stage.scnt >= NILFS_ST_CPFILE) {
+ nilfs_sc_cstage_get(sci) >= NILFS_ST_CPFILE) {
err = nilfs_segctor_fill_in_checkpoint(sci);
if (unlikely(err))
goto failed_to_write;
@@ -2007,7 +2069,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
if (unlikely(err))
goto failed_to_write;
- if (sci->sc_stage.scnt == NILFS_ST_DONE ||
+ if (nilfs_sc_cstage_get(sci) == NILFS_ST_DONE ||
nilfs->ns_blocksize_bits != PAGE_CACHE_SHIFT) {
/*
* At this point, we avoid double buffering
@@ -2020,7 +2082,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
if (err)
goto failed_to_write;
}
- } while (sci->sc_stage.scnt != NILFS_ST_DONE);
+ } while (nilfs_sc_cstage_get(sci) != NILFS_ST_DONE);
out:
nilfs_segctor_drop_written_files(sci, nilfs);
@@ -2430,7 +2492,6 @@ static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci)
{
int mode = 0;
- int err;
spin_lock(&sci->sc_state_lock);
mode = (sci->sc_flush_request & FLUSH_DAT_BIT) ?
@@ -2438,7 +2499,7 @@ static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci)
spin_unlock(&sci->sc_state_lock);
if (mode) {
- err = nilfs_segctor_do_construct(sci, mode);
+ nilfs_segctor_do_construct(sci, mode);
spin_lock(&sci->sc_state_lock);
sci->sc_flush_request &= (mode == SC_FLUSH_FILE) ?
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index a48d6de1e02c..0408b9b2814b 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -67,7 +67,8 @@ struct nilfs_recovery_info {
/**
* struct nilfs_cstage - Context of collection stage
- * @scnt: Stage count
+ * @scnt: Stage count, must be accessed via wrappers:
+ * nilfs_sc_cstage_inc(), nilfs_sc_cstage_set(), nilfs_sc_cstage_get()
* @flags: State flags
* @dirty_file_ptr: Pointer on dirty_files list, or inode of a target file
* @gc_inode_ptr: Pointer on the list of gc-inodes
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 2a869c35c362..52821ffc11f4 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -30,6 +30,8 @@
#include "mdt.h"
#include "sufile.h"
+#include <trace/events/nilfs2.h>
+
/**
* struct nilfs_sufile_info - on-memory private data of sufile
* @mi: on-memory private data of metadata file
@@ -317,7 +319,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
size_t susz = NILFS_MDT(sufile)->mi_entry_size;
__u64 segnum, maxsegnum, last_alloc;
void *kaddr;
- unsigned long nsegments, ncleansegs, nsus, cnt;
+ unsigned long nsegments, nsus, cnt;
int ret, j;
down_write(&NILFS_MDT(sufile)->mi_sem);
@@ -327,7 +329,6 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
goto out_sem;
kaddr = kmap_atomic(header_bh->b_page);
header = kaddr + bh_offset(header_bh);
- ncleansegs = le64_to_cpu(header->sh_ncleansegs);
last_alloc = le64_to_cpu(header->sh_last_alloc);
kunmap_atomic(kaddr);
@@ -358,6 +359,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
break; /* never happens */
}
}
+ trace_nilfs2_segment_usage_check(sufile, segnum, cnt);
ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1,
&su_bh);
if (ret < 0)
@@ -388,6 +390,9 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
nilfs_mdt_mark_dirty(sufile);
brelse(su_bh);
*segnump = segnum;
+
+ trace_nilfs2_segment_usage_allocated(sufile, segnum);
+
goto out_header;
}
@@ -490,6 +495,8 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
NILFS_SUI(sufile)->ncleansegs++;
nilfs_mdt_mark_dirty(sufile);
+
+ trace_nilfs2_segment_usage_freed(sufile, segnum);
}
/**
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index f47585bfeb01..354013ea22ec 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -361,7 +361,7 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off)
struct nilfs_super_block *nsbp;
sector_t blocknr, newblocknr;
unsigned long offset;
- int sb2i = -1; /* array index of the secondary superblock */
+ int sb2i; /* array index of the secondary superblock */
int ret = 0;
/* nilfs->ns_sem must be locked by the caller. */
@@ -372,6 +372,9 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off)
} else if (nilfs->ns_sbh[0]->b_blocknr > nilfs->ns_first_data_block) {
sb2i = 0;
blocknr = nilfs->ns_sbh[0]->b_blocknr;
+ } else {
+ sb2i = -1;
+ blocknr = 0;
}
if (sb2i >= 0 && (u64)blocknr << nilfs->ns_blocksize_bits == sb2off)
goto out; /* super block location is unchanged */
@@ -1405,14 +1408,10 @@ static void nilfs_destroy_cachep(void)
*/
rcu_barrier();
- if (nilfs_inode_cachep)
- kmem_cache_destroy(nilfs_inode_cachep);
- if (nilfs_transaction_cachep)
- kmem_cache_destroy(nilfs_transaction_cachep);
- if (nilfs_segbuf_cachep)
- kmem_cache_destroy(nilfs_segbuf_cachep);
- if (nilfs_btree_path_cache)
- kmem_cache_destroy(nilfs_btree_path_cache);
+ kmem_cache_destroy(nilfs_inode_cachep);
+ kmem_cache_destroy(nilfs_transaction_cachep);
+ kmem_cache_destroy(nilfs_segbuf_cachep);
+ kmem_cache_destroy(nilfs_btree_path_cache);
}
static int __init nilfs_init_cachep(void)
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 6b6f0d472ae8..fd98e5100cab 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -83,9 +83,16 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
inode = igrab(mark->inode);
if (inode) {
+ /*
+ * IN_ALL_EVENTS represents all of the mask bits
+ * that we expose to userspace. There is at
+ * least one bit (FS_EVENT_ON_CHILD) which is
+ * used only internally to the kernel.
+ */
+ u32 mask = mark->mask & IN_ALL_EVENTS;
seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ",
inode_mark->wd, inode->i_ino, inode->i_sb->s_dev,
- mark->mask, mark->ignored_mask);
+ mask, mark->ignored_mask);
show_mark_fhandle(m, inode);
seq_putc(m, '\n');
iput(inode);
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 5b1e2a497e51..b8d08d0d0a4d 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -706,7 +706,19 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
int ret;
unsigned flags = 0;
- /* don't allow invalid bits: we don't want flags set */
+ /*
+ * We share a lot of code with fs/dnotify. We also share
+ * the bit layout between inotify's IN_* and the fsnotify
+ * FS_*. This check ensures that only the inotify IN_*
+ * bits get passed in and set in watches/events.
+ */
+ if (unlikely(mask & ~ALL_INOTIFY_BITS))
+ return -EINVAL;
+ /*
+ * Require at least one valid bit set in the mask.
+ * Without _something_ set, we would have no events to
+ * watch for.
+ */
if (unlikely(!(mask & ALL_INOTIFY_BITS)))
return -EINVAL;
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 262561fea923..9d383e5eff0e 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -525,8 +525,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
}
}
err = add_to_page_cache_lru(*cached_page, mapping,
- index,
- GFP_KERNEL & mapping_gfp_mask(mapping));
+ index,
+ mapping_gfp_constraint(mapping, GFP_KERNEL));
if (unlikely(err)) {
if (err == -EEXIST)
continue;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 64b11d90eca6..7f604727f487 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -589,6 +589,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
ret = -EIO;
goto bail;
}
+ set_buffer_new(bh_result);
up_write(&OCFS2_I(inode)->ip_alloc_sem);
}
@@ -864,6 +865,7 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
is_overwrite = ocfs2_is_overwrite(osb, inode, offset);
if (is_overwrite < 0) {
mlog_errno(is_overwrite);
+ ret = is_overwrite;
ocfs2_inode_unlock(inode, 1);
goto clean_orphan;
}
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index e404386bd93e..709fbbd44c65 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -219,7 +219,8 @@ struct o2hb_region {
unsigned hr_unclean_stop:1,
hr_aborted_start:1,
hr_item_pinned:1,
- hr_item_dropped:1;
+ hr_item_dropped:1,
+ hr_node_deleted:1;
/* protected by the hr_callback_sem */
struct task_struct *hr_task;
@@ -1078,7 +1079,13 @@ static int o2hb_thread(void *data)
set_user_nice(current, MIN_NICE);
/* Pin node */
- o2nm_depend_this_node();
+ ret = o2nm_depend_this_node();
+ if (ret) {
+ mlog(ML_ERROR, "Node has been deleted, ret = %d\n", ret);
+ reg->hr_node_deleted = 1;
+ wake_up(&o2hb_steady_queue);
+ return 0;
+ }
while (!kthread_should_stop() &&
!reg->hr_unclean_stop && !reg->hr_aborted_start) {
@@ -1789,7 +1796,8 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
spin_unlock(&o2hb_live_lock);
ret = wait_event_interruptible(o2hb_steady_queue,
- atomic_read(&reg->hr_steady_iterations) == 0);
+ atomic_read(&reg->hr_steady_iterations) == 0 ||
+ reg->hr_node_deleted);
if (ret) {
atomic_set(&reg->hr_steady_iterations, 0);
reg->hr_aborted_start = 1;
@@ -1800,6 +1808,11 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
goto out3;
}
+ if (reg->hr_node_deleted) {
+ ret = -EINVAL;
+ goto out3;
+ }
+
/* Ok, we were woken. Make sure it wasn't by drop_item() */
spin_lock(&o2hb_live_lock);
hb_task = reg->hr_task;
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 6918f30d02cd..2ee7fe747cea 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1866,6 +1866,7 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
int status;
unsigned int backoff;
unsigned int total_backoff = 0;
+ char wq_name[O2NM_MAX_NAME_LEN];
BUG_ON(!dlm);
@@ -1895,7 +1896,8 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
goto bail;
}
- dlm->dlm_worker = create_singlethread_workqueue("dlm_wq");
+ snprintf(wq_name, O2NM_MAX_NAME_LEN, "dlm_wq-%s", dlm->name);
+ dlm->dlm_worker = create_singlethread_workqueue(wq_name);
if (!dlm->dlm_worker) {
status = -ENOMEM;
mlog_errno(status);
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 46b8b2bbc95a..ce38b4ccc9ab 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1439,6 +1439,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data,
int found, ret;
int set_maybe;
int dispatch_assert = 0;
+ int dispatched = 0;
if (!dlm_grab(dlm))
return DLM_MASTER_RESP_NO;
@@ -1657,16 +1658,20 @@ send_response:
if (ret < 0) {
mlog(ML_ERROR, "failed to dispatch assert master work\n");
response = DLM_MASTER_RESP_ERROR;
+ spin_unlock(&res->spinlock);
dlm_lockres_put(res);
- } else
+ } else {
+ dispatched = 1;
__dlm_lockres_grab_inflight_worker(dlm, res);
- spin_unlock(&res->spinlock);
+ spin_unlock(&res->spinlock);
+ }
} else {
if (res)
dlm_lockres_put(res);
}
- dlm_put(dlm);
+ if (!dispatched)
+ dlm_put(dlm);
return response;
}
@@ -2090,7 +2095,6 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
/* queue up work for dlm_assert_master_worker */
- dlm_grab(dlm); /* get an extra ref for the work item */
dlm_init_work_item(dlm, item, dlm_assert_master_worker, NULL);
item->u.am.lockres = res; /* already have a ref */
/* can optionally ignore node numbers higher than this node */
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index ce12e0b1a31f..9e4f862d20fe 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -205,7 +205,7 @@ int dlm_launch_recovery_thread(struct dlm_ctxt *dlm)
mlog(0, "starting dlm recovery thread...\n");
dlm->dlm_reco_thread_task = kthread_run(dlm_recovery_thread, dlm,
- "dlm_reco_thread");
+ "dlm_reco-%s", dlm->name);
if (IS_ERR(dlm->dlm_reco_thread_task)) {
mlog_errno(PTR_ERR(dlm->dlm_reco_thread_task));
dlm->dlm_reco_thread_task = NULL;
@@ -1694,6 +1694,7 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
unsigned int hash;
int master = DLM_LOCK_RES_OWNER_UNKNOWN;
u32 flags = DLM_ASSERT_MASTER_REQUERY;
+ int dispatched = 0;
if (!dlm_grab(dlm)) {
/* since the domain has gone away on this
@@ -1719,9 +1720,11 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
dlm_put(dlm);
/* sender will take care of this and retry */
return ret;
- } else
+ } else {
+ dispatched = 1;
__dlm_lockres_grab_inflight_worker(dlm, res);
- spin_unlock(&res->spinlock);
+ spin_unlock(&res->spinlock);
+ }
} else {
/* put.. incase we are not the master */
spin_unlock(&res->spinlock);
@@ -1730,7 +1733,8 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data,
}
spin_unlock(&dlm->spinlock);
- dlm_put(dlm);
+ if (!dispatched)
+ dlm_put(dlm);
return master;
}
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 2e5e6d5fffe8..c5f6c241ecd7 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -493,7 +493,8 @@ int dlm_launch_thread(struct dlm_ctxt *dlm)
{
mlog(0, "Starting dlm_thread...\n");
- dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm_thread");
+ dlm->dlm_thread_task = kthread_run(dlm_thread, dlm, "dlm-%s",
+ dlm->name);
if (IS_ERR(dlm->dlm_thread_task)) {
mlog_errno(PTR_ERR(dlm->dlm_thread_task));
dlm->dlm_thread_task = NULL;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 1c91103c1333..20276e340339 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2998,7 +2998,8 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
}
/* launch downconvert thread */
- osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc");
+ osb->dc_task = kthread_run(ocfs2_downconvert_thread, osb, "ocfs2dc-%s",
+ osb->uuid_str);
if (IS_ERR(osb->dc_task)) {
status = PTR_ERR(osb->dc_task);
osb->dc_task = NULL;
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index ca3431ee7f24..aac8b86f312e 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -112,6 +112,8 @@ struct ocfs2_inode_info
#define OCFS2_INODE_OPEN_DIRECT 0x00000020
/* Tell the inode wipe code it's not in orphan dir */
#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000040
+/* Entry in orphan dir with 'dio-' prefix */
+#define OCFS2_INODE_DIO_ORPHAN_ENTRY 0x00000080
static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
{
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index ff82b28462a6..13534f4fe5b5 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1090,7 +1090,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
/* Launch the commit thread */
if (!local) {
osb->commit_task = kthread_run(ocfs2_commit_thread, osb,
- "ocfs2cmt");
+ "ocfs2cmt-%s", osb->uuid_str);
if (IS_ERR(osb->commit_task)) {
status = PTR_ERR(osb->commit_task);
osb->commit_task = NULL;
@@ -1507,7 +1507,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
goto out;
osb->recovery_thread_task = kthread_run(__ocfs2_recovery_thread, osb,
- "ocfs2rec");
+ "ocfs2rec-%s", osb->uuid_str);
if (IS_ERR(osb->recovery_thread_task)) {
mlog_errno((int)PTR_ERR(osb->recovery_thread_task));
osb->recovery_thread_task = NULL;
@@ -2021,6 +2021,7 @@ struct ocfs2_orphan_filldir_priv {
struct dir_context ctx;
struct inode *head;
struct ocfs2_super *osb;
+ enum ocfs2_orphan_reco_type orphan_reco_type;
};
static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
@@ -2036,12 +2037,22 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
if (name_len == 2 && !strncmp("..", name, 2))
return 0;
+ /* do not include dio entry in case of orphan scan */
+ if ((p->orphan_reco_type == ORPHAN_NO_NEED_TRUNCATE) &&
+ (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX,
+ OCFS2_DIO_ORPHAN_PREFIX_LEN)))
+ return 0;
+
/* Skip bad inodes so that recovery can continue */
iter = ocfs2_iget(p->osb, ino,
OCFS2_FI_FLAG_ORPHAN_RECOVERY, 0);
if (IS_ERR(iter))
return 0;
+ if (!strncmp(name, OCFS2_DIO_ORPHAN_PREFIX,
+ OCFS2_DIO_ORPHAN_PREFIX_LEN))
+ OCFS2_I(iter)->ip_flags |= OCFS2_INODE_DIO_ORPHAN_ENTRY;
+
/* Skip inodes which are already added to recover list, since dio may
* happen concurrently with unlink/rename */
if (OCFS2_I(iter)->ip_next_orphan) {
@@ -2060,14 +2071,16 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
static int ocfs2_queue_orphans(struct ocfs2_super *osb,
int slot,
- struct inode **head)
+ struct inode **head,
+ enum ocfs2_orphan_reco_type orphan_reco_type)
{
int status;
struct inode *orphan_dir_inode = NULL;
struct ocfs2_orphan_filldir_priv priv = {
.ctx.actor = ocfs2_orphan_filldir,
.osb = osb,
- .head = *head
+ .head = *head,
+ .orphan_reco_type = orphan_reco_type
};
orphan_dir_inode = ocfs2_get_system_file_inode(osb,
@@ -2170,7 +2183,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
trace_ocfs2_recover_orphans(slot);
ocfs2_mark_recovering_orphan_dir(osb, slot);
- ret = ocfs2_queue_orphans(osb, slot, &inode);
+ ret = ocfs2_queue_orphans(osb, slot, &inode, orphan_reco_type);
ocfs2_clear_recovering_orphan_dir(osb, slot);
/* Error here should be noted, but we want to continue with as
@@ -2186,25 +2199,51 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
iter = oi->ip_next_orphan;
oi->ip_next_orphan = NULL;
- mutex_lock(&inode->i_mutex);
- ret = ocfs2_rw_lock(inode, 1);
- if (ret < 0) {
- mlog_errno(ret);
- goto next;
- }
- /*
- * We need to take and drop the inode lock to
- * force read inode from disk.
- */
- ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (ret) {
- mlog_errno(ret);
- goto unlock_rw;
- }
+ if (oi->ip_flags & OCFS2_INODE_DIO_ORPHAN_ENTRY) {
+ mutex_lock(&inode->i_mutex);
+ ret = ocfs2_rw_lock(inode, 1);
+ if (ret < 0) {
+ mlog_errno(ret);
+ goto unlock_mutex;
+ }
+ /*
+ * We need to take and drop the inode lock to
+ * force read inode from disk.
+ */
+ ret = ocfs2_inode_lock(inode, &di_bh, 1);
+ if (ret) {
+ mlog_errno(ret);
+ goto unlock_rw;
+ }
- di = (struct ocfs2_dinode *)di_bh->b_data;
+ di = (struct ocfs2_dinode *)di_bh->b_data;
- if (inode->i_nlink == 0) {
+ if (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)) {
+ ret = ocfs2_truncate_file(inode, di_bh,
+ i_size_read(inode));
+ if (ret < 0) {
+ if (ret != -ENOSPC)
+ mlog_errno(ret);
+ goto unlock_inode;
+ }
+
+ ret = ocfs2_del_inode_from_orphan(osb, inode,
+ di_bh, 0, 0);
+ if (ret)
+ mlog_errno(ret);
+ }
+unlock_inode:
+ ocfs2_inode_unlock(inode, 1);
+ brelse(di_bh);
+ di_bh = NULL;
+unlock_rw:
+ ocfs2_rw_unlock(inode, 1);
+unlock_mutex:
+ mutex_unlock(&inode->i_mutex);
+
+ /* clear dio flag in ocfs2_inode_info */
+ oi->ip_flags &= ~OCFS2_INODE_DIO_ORPHAN_ENTRY;
+ } else {
spin_lock(&oi->ip_lock);
/* Set the proper information to get us going into
* ocfs2_delete_inode. */
@@ -2212,28 +2251,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
spin_unlock(&oi->ip_lock);
}
- if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) &&
- (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
- ret = ocfs2_truncate_file(inode, di_bh,
- i_size_read(inode));
- if (ret < 0) {
- if (ret != -ENOSPC)
- mlog_errno(ret);
- goto unlock_inode;
- }
-
- ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0);
- if (ret)
- mlog_errno(ret);
- } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */
-unlock_inode:
- ocfs2_inode_unlock(inode, 1);
- brelse(di_bh);
- di_bh = NULL;
-unlock_rw:
- ocfs2_rw_unlock(inode, 1);
-next:
- mutex_unlock(&inode->i_mutex);
iput(inode);
inode = iter;
}
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index 6b6d092b0998..652ece4a9d9e 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -66,7 +66,7 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,
* level.
*/
- flock_lock_file_wait(file,
+ locks_lock_file_wait(file,
&(struct file_lock){.fl_type = F_UNLCK});
ocfs2_file_unlock(file);
@@ -81,7 +81,7 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,
goto out;
}
- ret = flock_lock_file_wait(file, fl);
+ ret = locks_lock_file_wait(file, fl);
if (ret)
ocfs2_file_unlock(file);
@@ -98,7 +98,7 @@ static int ocfs2_do_funlock(struct file *file, int cmd, struct file_lock *fl)
mutex_lock(&fp->fp_mutex);
ocfs2_file_unlock(file);
- ret = flock_lock_file_wait(file, fl);
+ ret = locks_lock_file_wait(file, fl);
mutex_unlock(&fp->fp_mutex);
return ret;
@@ -119,7 +119,7 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
ocfs2_mount_local(osb))
- return flock_lock_file_wait(file, fl);
+ return locks_lock_file_wait(file, fl);
if (fl->fl_type == F_UNLCK)
return ocfs2_do_funlock(file, cmd, fl);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index b7dfac226b1e..3b48ac25d8a7 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -106,8 +106,6 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2);
/* An orphan dir name is an 8 byte value, printed as a hex string */
#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
-#define OCFS2_DIO_ORPHAN_PREFIX "dio-"
-#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4
static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
@@ -657,9 +655,18 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
return status;
}
- return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
+ status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
parent_fe_bh, handle, inode_ac,
fe_blkno, suballoc_loc, suballoc_bit);
+ if (status < 0) {
+ u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit);
+ int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode,
+ inode_ac->ac_bh, suballoc_bit, bg_blkno, 1);
+ if (tmp)
+ mlog_errno(tmp);
+ }
+
+ return status;
}
static int ocfs2_mkdir(struct inode *dir,
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index e173329eb830..1155918d6784 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -26,6 +26,9 @@
#ifndef OCFS2_NAMEI_H
#define OCFS2_NAMEI_H
+#define OCFS2_DIO_ORPHAN_PREFIX "dio-"
+#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4
+
extern const struct inode_operations ocfs2_dir_iops;
struct dentry *ocfs2_get_parent(struct dentry *child);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index e5d57cd32505..252119860e6c 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2920,16 +2920,13 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
struct page *page;
pgoff_t page_index;
- unsigned int from, to, readahead_pages;
+ unsigned int from, to;
loff_t offset, end, map_end;
struct address_space *mapping = inode->i_mapping;
trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
new_cluster, new_len);
- readahead_pages =
- (ocfs2_cow_contig_clusters(sb) <<
- OCFS2_SB(sb)->s_clustersize_bits) >> PAGE_CACHE_SHIFT;
offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
/*
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index d83d2602cf2b..fc6d25f6d444 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -1920,7 +1920,10 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
res, &bits_left);
if (!status) {
- hint = ocfs2_group_from_res(res);
+ if (ocfs2_is_cluster_bitmap(ac->ac_inode))
+ hint = res->sr_bg_blkno;
+ else
+ hint = ocfs2_group_from_res(res);
goto set_hint;
}
if (status < 0 && status != -ENOSPC) {
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index ebfdea78659b..e9164f09841b 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -7229,9 +7229,10 @@ leave:
/*
* 'security' attributes support
*/
-static size_t ocfs2_xattr_security_list(struct dentry *dentry, char *list,
+static size_t ocfs2_xattr_security_list(const struct xattr_handler *handler,
+ struct dentry *dentry, char *list,
size_t list_size, const char *name,
- size_t name_len, int type)
+ size_t name_len)
{
const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
@@ -7244,8 +7245,9 @@ static size_t ocfs2_xattr_security_list(struct dentry *dentry, char *list,
return total_len;
}
-static int ocfs2_xattr_security_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int ocfs2_xattr_security_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
if (strcmp(name, "") == 0)
return -EINVAL;
@@ -7253,8 +7255,9 @@ static int ocfs2_xattr_security_get(struct dentry *dentry, const char *name,
name, buffer, size);
}
-static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+static int ocfs2_xattr_security_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
if (strcmp(name, "") == 0)
return -EINVAL;
@@ -7319,9 +7322,10 @@ const struct xattr_handler ocfs2_xattr_security_handler = {
/*
* 'trusted' attributes support
*/
-static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list,
+static size_t ocfs2_xattr_trusted_list(const struct xattr_handler *handler,
+ struct dentry *dentry, char *list,
size_t list_size, const char *name,
- size_t name_len, int type)
+ size_t name_len)
{
const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
@@ -7337,8 +7341,9 @@ static size_t ocfs2_xattr_trusted_list(struct dentry *dentry, char *list,
return total_len;
}
-static int ocfs2_xattr_trusted_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
if (strcmp(name, "") == 0)
return -EINVAL;
@@ -7346,8 +7351,9 @@ static int ocfs2_xattr_trusted_get(struct dentry *dentry, const char *name,
name, buffer, size);
}
-static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
if (strcmp(name, "") == 0)
return -EINVAL;
@@ -7366,9 +7372,10 @@ const struct xattr_handler ocfs2_xattr_trusted_handler = {
/*
* 'user' attributes support
*/
-static size_t ocfs2_xattr_user_list(struct dentry *dentry, char *list,
+static size_t ocfs2_xattr_user_list(const struct xattr_handler *handler,
+ struct dentry *dentry, char *list,
size_t list_size, const char *name,
- size_t name_len, int type)
+ size_t name_len)
{
const size_t prefix_len = XATTR_USER_PREFIX_LEN;
const size_t total_len = prefix_len + name_len + 1;
@@ -7385,8 +7392,9 @@ static size_t ocfs2_xattr_user_list(struct dentry *dentry, char *list,
return total_len;
}
-static int ocfs2_xattr_user_get(struct dentry *dentry, const char *name,
- void *buffer, size_t size, int type)
+static int ocfs2_xattr_user_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *buffer, size_t size)
{
struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
@@ -7398,8 +7406,9 @@ static int ocfs2_xattr_user_get(struct dentry *dentry, const char *name,
buffer, size);
}
-static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+static int ocfs2_xattr_user_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 84d693d37428..871fcb67be97 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -81,11 +81,11 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
if (len == 0)
return 0;
- old_file = ovl_path_open(old, O_RDONLY);
+ old_file = ovl_path_open(old, O_LARGEFILE | O_RDONLY);
if (IS_ERR(old_file))
return PTR_ERR(old_file);
- new_file = ovl_path_open(new, O_WRONLY);
+ new_file = ovl_path_open(new, O_LARGEFILE | O_WRONLY);
if (IS_ERR(new_file)) {
error = PTR_ERR(new_file);
goto out_fput;
@@ -267,7 +267,7 @@ out:
out_cleanup:
ovl_cleanup(wdir, newdentry);
- goto out;
+ goto out2;
}
/*
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index d9da5a4e9382..ec0c2a050043 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -363,6 +363,9 @@ struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags)
ovl_path_upper(dentry, &realpath);
}
+ if (realpath.dentry->d_flags & DCACHE_OP_SELECT_INODE)
+ return realpath.dentry->d_op->d_select_inode(realpath.dentry, file_flags);
+
return d_backing_inode(realpath.dentry);
}
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 79073d68b475..e38ee0fed24a 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -544,6 +544,7 @@ static void ovl_put_super(struct super_block *sb)
mntput(ufs->upper_mnt);
for (i = 0; i < ufs->numlower; i++)
mntput(ufs->lower_mnt[i]);
+ kfree(ufs->lower_mnt);
kfree(ufs->config.lowerdir);
kfree(ufs->config.upperdir);
@@ -1048,6 +1049,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
oe->lowerstack[i].dentry = stack[i].dentry;
oe->lowerstack[i].mnt = ufs->lower_mnt[i];
}
+ kfree(stack);
root_dentry->d_fsdata = oe;
diff --git a/fs/pipe.c b/fs/pipe.c
index 8865f7963700..42cf8ddf0e55 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -366,18 +366,17 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
int offset = buf->offset + buf->len;
if (ops->can_merge && offset + chars <= PAGE_SIZE) {
- int error = ops->confirm(pipe, buf);
- if (error)
+ ret = ops->confirm(pipe, buf);
+ if (ret)
goto out;
ret = copy_page_from_iter(buf->page, offset, chars, from);
if (unlikely(ret < chars)) {
- error = -EFAULT;
+ ret = -EFAULT;
goto out;
}
do_wakeup = 1;
- buf->len += chars;
- ret = chars;
+ buf->len += ret;
if (!iov_iter_count(from))
goto out;
}
@@ -693,17 +692,20 @@ int create_pipe_files(struct file **res, int flags)
d_instantiate(path.dentry, inode);
- err = -ENFILE;
f = alloc_file(&path, FMODE_WRITE, &pipefifo_fops);
- if (IS_ERR(f))
+ if (IS_ERR(f)) {
+ err = PTR_ERR(f);
goto err_dentry;
+ }
f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));
f->private_data = inode->i_pipe;
res[0] = alloc_file(&path, FMODE_READ, &pipefifo_fops);
- if (IS_ERR(res[0]))
+ if (IS_ERR(res[0])) {
+ err = PTR_ERR(res[0]);
goto err_file;
+ }
path_get(&path);
res[0]->private_data = inode->i_pipe;
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 4fb17ded7d47..4adde1e2cbec 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -762,18 +762,21 @@ posix_acl_to_xattr(struct user_namespace *user_ns, const struct posix_acl *acl,
EXPORT_SYMBOL (posix_acl_to_xattr);
static int
-posix_acl_xattr_get(struct dentry *dentry, const char *name,
- void *value, size_t size, int type)
+posix_acl_xattr_get(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ void *value, size_t size)
{
struct posix_acl *acl;
int error;
+ if (strcmp(name, "") != 0)
+ return -EINVAL;
if (!IS_POSIXACL(d_backing_inode(dentry)))
return -EOPNOTSUPP;
if (d_is_symlink(dentry))
return -EOPNOTSUPP;
- acl = get_acl(d_backing_inode(dentry), type);
+ acl = get_acl(d_backing_inode(dentry), handler->flags);
if (IS_ERR(acl))
return PTR_ERR(acl);
if (acl == NULL)
@@ -786,19 +789,22 @@ posix_acl_xattr_get(struct dentry *dentry, const char *name,
}
static int
-posix_acl_xattr_set(struct dentry *dentry, const char *name,
- const void *value, size_t size, int flags, int type)
+posix_acl_xattr_set(const struct xattr_handler *handler,
+ struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags)
{
struct inode *inode = d_backing_inode(dentry);
struct posix_acl *acl = NULL;
int ret;
+ if (strcmp(name, "") != 0)
+ return -EINVAL;
if (!IS_POSIXACL(inode))
return -EOPNOTSUPP;
if (!inode->i_op->set_acl)
return -EOPNOTSUPP;
- if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
+ if (handler->flags == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
return value ? -EACCES : 0;
if (!inode_owner_or_capable(inode))
return -EPERM;
@@ -815,28 +821,22 @@ posix_acl_xattr_set(struct dentry *dentry, const char *name,
}
}
- ret = inode->i_op->set_acl(inode, acl, type);
+ ret = inode->i_op->set_acl(inode, acl, handler->flags);
out:
posix_acl_release(acl);
return ret;
}
static size_t
-posix_acl_xattr_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
+posix_acl_xattr_list(const struct xattr_handler *handler,
+ struct dentry *dentry, char *list, size_t list_size,
+ const char *name, size_t name_len)
{
- const char *xname;
+ const char *xname = handler->prefix;
size_t size;
if (!IS_POSIXACL(d_backing_inode(dentry)))
- return -EOPNOTSUPP;
- if (d_is_symlink(dentry))
- return -EOPNOTSUPP;
-
- if (type == ACL_TYPE_ACCESS)
- xname = POSIX_ACL_XATTR_ACCESS;
- else
- xname = POSIX_ACL_XATTR_DEFAULT;
+ return 0;
size = strlen(xname) + 1;
if (list && size <= list_size)
diff --git a/fs/proc/array.c b/fs/proc/array.c
index f60f0121e331..d73291f5f0fc 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -91,18 +91,18 @@
static inline void task_name(struct seq_file *m, struct task_struct *p)
{
char *buf;
+ size_t size;
char tcomm[sizeof(p->comm)];
+ int ret;
get_task_comm(tcomm, p);
seq_puts(m, "Name:\t");
- buf = m->buf + m->count;
- /* Ignore error for now */
- buf += string_escape_str(tcomm, buf, m->size - m->count,
- ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+ size = seq_get_buf(m, &buf);
+ ret = string_escape_str(tcomm, buf, size, ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
+ seq_commit(m, ret < size ? ret : -1);
- m->count = buf - m->buf;
seq_putc(m, '\n');
}
@@ -375,7 +375,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task, int whole)
{
- unsigned long vsize, eip, esp, wchan = ~0UL;
+ unsigned long vsize, eip, esp, wchan = 0;
int priority, nice;
int tty_pgrp = -1, tty_nr = 0;
sigset_t sigign, sigcatch;
@@ -507,7 +507,19 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
seq_put_decimal_ull(m, ' ', task->blocked.sig[0] & 0x7fffffffUL);
seq_put_decimal_ull(m, ' ', sigign.sig[0] & 0x7fffffffUL);
seq_put_decimal_ull(m, ' ', sigcatch.sig[0] & 0x7fffffffUL);
- seq_put_decimal_ull(m, ' ', wchan);
+
+ /*
+ * We used to output the absolute kernel address, but that's an
+ * information leak - so instead we show a 0/1 flag here, to signal
+ * to user-space whether there's a wchan field in /proc/PID/wchan.
+ *
+ * This works with older implementations of procps as well.
+ */
+ if (wchan)
+ seq_puts(m, " 1");
+ else
+ seq_puts(m, " 0");
+
seq_put_decimal_ull(m, ' ', 0);
seq_put_decimal_ull(m, ' ', 0);
seq_put_decimal_ll(m, ' ', task->exit_signal);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b25eee4cead5..bd3e9e68125b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -430,13 +430,10 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns,
wchan = get_wchan(task);
- if (lookup_symbol_name(wchan, symname) < 0) {
- if (!ptrace_may_access(task, PTRACE_MODE_READ))
- return 0;
- seq_printf(m, "%lu", wchan);
- } else {
+ if (wchan && ptrace_may_access(task, PTRACE_MODE_READ) && !lookup_symbol_name(wchan, symname))
seq_printf(m, "%s", symname);
- }
+ else
+ seq_putc(m, '0');
return 0;
}
@@ -1035,6 +1032,16 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
return simple_read_from_buffer(buf, count, ppos, buffer, len);
}
+/*
+ * /proc/pid/oom_adj exists solely for backwards compatibility with previous
+ * kernels. The effective policy is defined by oom_score_adj, which has a
+ * different scale: oom_adj grew exponentially and oom_score_adj grows linearly.
+ * Values written to oom_adj are simply mapped linearly to oom_score_adj.
+ * Processes that become oom disabled via oom_adj will still be oom disabled
+ * with this implementation.
+ *
+ * oom_adj cannot be removed since existing userspace binaries use it.
+ */
static ssize_t oom_adj_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 6e5fcd00733e..3c2a915c695a 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -291,11 +291,19 @@ static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
*/
int proc_fd_permission(struct inode *inode, int mask)
{
- int rv = generic_permission(inode, mask);
+ struct task_struct *p;
+ int rv;
+
+ rv = generic_permission(inode, mask);
if (rv == 0)
- return 0;
- if (task_tgid(current) == proc_pid(inode))
+ return rv;
+
+ rcu_read_lock();
+ p = pid_task(proc_pid(inode), PIDTYPE_PID);
+ if (p && same_thread_group(p, current))
rv = 0;
+ rcu_read_unlock();
+
return rv;
}
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index d3ebf2e61853..9155a5a0d3b9 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -27,7 +27,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
{
struct sysinfo i;
unsigned long committed;
- struct vmalloc_info vmi;
long cached;
long available;
unsigned long pagecache;
@@ -49,8 +48,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
if (cached < 0)
cached = 0;
- get_vmalloc_info(&vmi);
-
for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
pages[lru] = global_page_state(NR_LRU_BASE + lru);
@@ -191,8 +188,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
K(vm_commit_limit()),
K(committed),
(unsigned long)VMALLOC_TOTAL >> 10,
- vmi.used >> 10,
- vmi.largest_chunk >> 10
+ 0ul, // used to be vmalloc 'used'
+ 0ul // used to be vmalloc 'largest_chunk'
#ifdef CONFIG_MEMORY_FAILURE
, atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)
#endif
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index fdda62e6115e..fe5b6e6c4671 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -948,7 +948,7 @@ static struct ctl_dir *get_subdir(struct ctl_dir *dir,
found:
subdir->header.nreg++;
failed:
- if (unlikely(IS_ERR(subdir))) {
+ if (IS_ERR(subdir)) {
pr_err("sysctl could not get directory: ");
sysctl_print_dir(dir);
pr_cont("/%*.*s %ld\n",
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e2d46adb54b4..187b3b5f242e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -70,6 +70,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
ptes >> 10,
pmds >> 10,
swap << (PAGE_SHIFT-10));
+ hugetlb_report_usage(m, mm);
}
unsigned long task_vsize(struct mm_struct *mm)
@@ -446,6 +447,8 @@ struct mem_size_stats {
unsigned long anonymous;
unsigned long anonymous_thp;
unsigned long swap;
+ unsigned long shared_hugetlb;
+ unsigned long private_hugetlb;
u64 pss;
u64 swap_pss;
};
@@ -625,12 +628,44 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
seq_putc(m, '\n');
}
+#ifdef CONFIG_HUGETLB_PAGE
+static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct mem_size_stats *mss = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ struct page *page = NULL;
+
+ if (pte_present(*pte)) {
+ page = vm_normal_page(vma, addr, *pte);
+ } else if (is_swap_pte(*pte)) {
+ swp_entry_t swpent = pte_to_swp_entry(*pte);
+
+ if (is_migration_entry(swpent))
+ page = migration_entry_to_page(swpent);
+ }
+ if (page) {
+ int mapcount = page_mapcount(page);
+
+ if (mapcount >= 2)
+ mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
+ else
+ mss->private_hugetlb += huge_page_size(hstate_vma(vma));
+ }
+ return 0;
+}
+#endif /* HUGETLB_PAGE */
+
static int show_smap(struct seq_file *m, void *v, int is_pid)
{
struct vm_area_struct *vma = v;
struct mem_size_stats mss;
struct mm_walk smaps_walk = {
.pmd_entry = smaps_pte_range,
+#ifdef CONFIG_HUGETLB_PAGE
+ .hugetlb_entry = smaps_hugetlb_range,
+#endif
.mm = vma->vm_mm,
.private = &mss,
};
@@ -652,6 +687,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
"Referenced: %8lu kB\n"
"Anonymous: %8lu kB\n"
"AnonHugePages: %8lu kB\n"
+ "Shared_Hugetlb: %8lu kB\n"
+ "Private_Hugetlb: %7lu kB\n"
"Swap: %8lu kB\n"
"SwapPss: %8lu kB\n"
"KernelPageSize: %8lu kB\n"
@@ -667,6 +704,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
mss.referenced >> 10,
mss.anonymous >> 10,
mss.anonymous_thp >> 10,
+ mss.shared_hugetlb >> 10,
+ mss.private_hugetlb >> 10,
mss.swap >> 10,
(unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)),
vma_kernel_pagesize(vma) >> 10,
@@ -753,36 +792,37 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
pte_t ptent = *pte;
if (pte_present(ptent)) {
+ ptent = ptep_modify_prot_start(vma->vm_mm, addr, pte);
ptent = pte_wrprotect(ptent);
- ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
+ ptent = pte_clear_soft_dirty(ptent);
+ ptep_modify_prot_commit(vma->vm_mm, addr, pte, ptent);
} else if (is_swap_pte(ptent)) {
ptent = pte_swp_clear_soft_dirty(ptent);
+ set_pte_at(vma->vm_mm, addr, pte, ptent);
}
-
- set_pte_at(vma->vm_mm, addr, pte, ptent);
}
+#else
+static inline void clear_soft_dirty(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *pte)
+{
+}
+#endif
+#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
- pmd_t pmd = *pmdp;
+ pmd_t pmd = pmdp_huge_get_and_clear(vma->vm_mm, addr, pmdp);
pmd = pmd_wrprotect(pmd);
- pmd = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
+ pmd = pmd_clear_soft_dirty(pmd);
if (vma->vm_flags & VM_SOFTDIRTY)
vma->vm_flags &= ~VM_SOFTDIRTY;
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
}
-
#else
-
-static inline void clear_soft_dirty(struct vm_area_struct *vma,
- unsigned long addr, pte_t *pte)
-{
-}
-
static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index 916b8e23d968..360ae43f590c 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -1,5 +1,5 @@
config PSTORE
- bool "Persistent store support"
+ tristate "Persistent store support"
default n
select ZLIB_DEFLATE
select ZLIB_INFLATE
diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile
index e647d8e81712..b8803cc07fce 100644
--- a/fs/pstore/Makefile
+++ b/fs/pstore/Makefile
@@ -2,12 +2,12 @@
# Makefile for the linux pstorefs routines.
#
-obj-y += pstore.o
+obj-$(CONFIG_PSTORE) += pstore.o
pstore-objs += inode.o platform.o
-obj-$(CONFIG_PSTORE_FTRACE) += ftrace.o
+pstore-$(CONFIG_PSTORE_FTRACE) += ftrace.o
-obj-$(CONFIG_PSTORE_PMSG) += pmsg.o
+pstore-$(CONFIG_PSTORE_PMSG) += pmsg.o
ramoops-objs += ram.o ram_core.o
obj-$(CONFIG_PSTORE_RAM) += ramoops.o
diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c
index 76a4eeb92982..d4887705bb61 100644
--- a/fs/pstore/ftrace.c
+++ b/fs/pstore/ftrace.c
@@ -104,22 +104,23 @@ static const struct file_operations pstore_knob_fops = {
.write = pstore_ftrace_knob_write,
};
+static struct dentry *pstore_ftrace_dir;
+
void pstore_register_ftrace(void)
{
- struct dentry *dir;
struct dentry *file;
if (!psinfo->write_buf)
return;
- dir = debugfs_create_dir("pstore", NULL);
- if (!dir) {
+ pstore_ftrace_dir = debugfs_create_dir("pstore", NULL);
+ if (!pstore_ftrace_dir) {
pr_err("%s: unable to create pstore directory\n", __func__);
return;
}
- file = debugfs_create_file("record_ftrace", 0600, dir, NULL,
- &pstore_knob_fops);
+ file = debugfs_create_file("record_ftrace", 0600, pstore_ftrace_dir,
+ NULL, &pstore_knob_fops);
if (!file) {
pr_err("%s: unable to create record_ftrace file\n", __func__);
goto err_file;
@@ -127,5 +128,17 @@ void pstore_register_ftrace(void)
return;
err_file:
- debugfs_remove(dir);
+ debugfs_remove(pstore_ftrace_dir);
+}
+
+void pstore_unregister_ftrace(void)
+{
+ mutex_lock(&pstore_ftrace_lock);
+ if (pstore_ftrace_enabled) {
+ unregister_ftrace_function(&pstore_ftrace_ops);
+ pstore_ftrace_enabled = 0;
+ }
+ mutex_unlock(&pstore_ftrace_lock);
+
+ debugfs_remove_recursive(pstore_ftrace_dir);
}
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 3adcc4669fac..d8c439d813ce 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -178,6 +178,7 @@ static loff_t pstore_file_llseek(struct file *file, loff_t off, int whence)
}
static const struct file_operations pstore_file_operations = {
+ .owner = THIS_MODULE,
.open = pstore_file_open,
.read = pstore_file_read,
.llseek = pstore_file_llseek,
@@ -287,7 +288,7 @@ static const struct super_operations pstore_ops = {
static struct super_block *pstore_sb;
-int pstore_is_mounted(void)
+bool pstore_is_mounted(void)
{
return pstore_sb != NULL;
}
@@ -456,6 +457,7 @@ static void pstore_kill_sb(struct super_block *sb)
}
static struct file_system_type pstore_fs_type = {
+ .owner = THIS_MODULE,
.name = "pstore",
.mount = pstore_mount,
.kill_sb = pstore_kill_sb,
@@ -479,5 +481,12 @@ out:
}
module_init(init_pstore_fs)
+static void __exit exit_pstore_fs(void)
+{
+ unregister_filesystem(&pstore_fs_type);
+ sysfs_remove_mount_point(fs_kobj, "pstore");
+}
+module_exit(exit_pstore_fs)
+
MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>");
MODULE_LICENSE("GPL");
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index c36ba2cd0b5d..e38a22b31282 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -41,14 +41,18 @@ pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec)
#ifdef CONFIG_PSTORE_FTRACE
extern void pstore_register_ftrace(void);
+extern void pstore_unregister_ftrace(void);
#else
static inline void pstore_register_ftrace(void) {}
+static inline void pstore_unregister_ftrace(void) {}
#endif
#ifdef CONFIG_PSTORE_PMSG
extern void pstore_register_pmsg(void);
+extern void pstore_unregister_pmsg(void);
#else
static inline void pstore_register_pmsg(void) {}
+static inline void pstore_unregister_pmsg(void) {}
#endif
extern struct pstore_info *psinfo;
@@ -59,6 +63,6 @@ extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
int count, char *data, bool compressed,
size_t size, struct timespec time,
struct pstore_info *psi);
-extern int pstore_is_mounted(void);
+extern bool pstore_is_mounted(void);
#endif
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 791743deedf1..588461bb2dd4 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -237,6 +237,14 @@ static void allocate_buf_for_compression(void)
}
+static void free_buf_for_compression(void)
+{
+ kfree(stream.workspace);
+ stream.workspace = NULL;
+ kfree(big_oops_buf);
+ big_oops_buf = NULL;
+}
+
/*
* Called when compression fails, since the printk buffer
* would be fetched for compression calling it again when
@@ -353,6 +361,19 @@ static struct kmsg_dumper pstore_dumper = {
.dump = pstore_dump,
};
+/*
+ * Register with kmsg_dump to save last part of console log on panic.
+ */
+static void pstore_register_kmsg(void)
+{
+ kmsg_dump_register(&pstore_dumper);
+}
+
+static void pstore_unregister_kmsg(void)
+{
+ kmsg_dump_unregister(&pstore_dumper);
+}
+
#ifdef CONFIG_PSTORE_CONSOLE
static void pstore_console_write(struct console *con, const char *s, unsigned c)
{
@@ -390,8 +411,14 @@ static void pstore_register_console(void)
{
register_console(&pstore_console);
}
+
+static void pstore_unregister_console(void)
+{
+ unregister_console(&pstore_console);
+}
#else
static void pstore_register_console(void) {}
+static void pstore_unregister_console(void) {}
#endif
static int pstore_write_compat(enum pstore_type_id type,
@@ -410,8 +437,6 @@ static int pstore_write_compat(enum pstore_type_id type,
* read function right away to populate the file system. If not
* then the pstore mount code will call us later to fill out
* the file system.
- *
- * Register with kmsg_dump to save last part of console log on panic.
*/
int pstore_register(struct pstore_info *psi)
{
@@ -442,7 +467,7 @@ int pstore_register(struct pstore_info *psi)
if (pstore_is_mounted())
pstore_get_records(0);
- kmsg_dump_register(&pstore_dumper);
+ pstore_register_kmsg();
if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) {
pstore_register_console();
@@ -462,12 +487,28 @@ int pstore_register(struct pstore_info *psi)
*/
backend = psi->name;
+ module_put(owner);
+
pr_info("Registered %s as persistent store backend\n", psi->name);
return 0;
}
EXPORT_SYMBOL_GPL(pstore_register);
+void pstore_unregister(struct pstore_info *psi)
+{
+ pstore_unregister_pmsg();
+ pstore_unregister_ftrace();
+ pstore_unregister_console();
+ pstore_unregister_kmsg();
+
+ free_buf_for_compression();
+
+ psinfo = NULL;
+ backend = NULL;
+}
+EXPORT_SYMBOL_GPL(pstore_unregister);
+
/*
* Read all the records from the persistent store. Create
* files in our filesystem. Don't warn about -EEXIST errors
diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c
index feb5dd2948b4..7de20cd3797f 100644
--- a/fs/pstore/pmsg.c
+++ b/fs/pstore/pmsg.c
@@ -37,6 +37,8 @@ static ssize_t write_pmsg(struct file *file, const char __user *buf,
if (buffer_size > PMSG_MAX_BOUNCE_BUFFER_SIZE)
buffer_size = PMSG_MAX_BOUNCE_BUFFER_SIZE;
buffer = vmalloc(buffer_size);
+ if (!buffer)
+ return -ENOMEM;
mutex_lock(&pmsg_lock);
for (i = 0; i < count; ) {
@@ -112,3 +114,10 @@ err_class:
err:
return;
}
+
+void pstore_unregister_pmsg(void)
+{
+ device_destroy(pmsg_class, MKDEV(pmsg_major, 0));
+ class_destroy(pmsg_class);
+ unregister_chrdev(pmsg_major, PMSG_NAME);
+}
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 6c26c4daaec9..319c3a60cfa5 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -578,30 +578,27 @@ fail_out:
return err;
}
-static int __exit ramoops_remove(struct platform_device *pdev)
+static int ramoops_remove(struct platform_device *pdev)
{
-#if 0
- /* TODO(kees): We cannot unload ramoops since pstore doesn't support
- * unregistering yet.
- */
struct ramoops_context *cxt = &oops_cxt;
- iounmap(cxt->virt_addr);
- release_mem_region(cxt->phys_addr, cxt->size);
+ pstore_unregister(&cxt->pstore);
cxt->max_dump_cnt = 0;
- /* TODO(kees): When pstore supports unregistering, call it here. */
kfree(cxt->pstore.buf);
cxt->pstore.bufsize = 0;
+ persistent_ram_free(cxt->mprz);
+ persistent_ram_free(cxt->fprz);
+ persistent_ram_free(cxt->cprz);
+ ramoops_free_przs(cxt);
+
return 0;
-#endif
- return -EBUSY;
}
static struct platform_driver ramoops_driver = {
.probe = ramoops_probe,
- .remove = __exit_p(ramoops_remove),
+ .remove = ramoops_remove,
.driver = {
.name = "ramoops",
},
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index ba1323a94924..a586467f6ff6 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -70,6 +70,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
unsigned order;
void *data;
int ret;
+ gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
/* make various checks */
order = get_order(newsize);
@@ -84,7 +85,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
/* allocate enough contiguous pages to be able to satisfy the
* request */
- pages = alloc_pages(mapping_gfp_mask(inode->i_mapping), order);
+ pages = alloc_pages(gfp, order);
if (!pages)
return -ENOMEM;
@@ -108,7 +109,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
struct page *page = pages + loop;
ret = add_to_page_cache_lru(page, inode->i_mapping, loop,
- GFP_KERNEL);
+ gfp);
if (ret < 0)
goto add_error;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 5f1c9c29eb8c..47f96988fdd4 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -712,9 +712,6 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode
2 * (REISERFS_QUOTA_INIT_BLOCKS(dir->i_sb) +
REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb));
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
retval = dquot_initialize(dir);
if (retval)
return retval;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index e87f9b52bf06..66b26fdfff8d 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -778,7 +778,7 @@ reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer,
if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
return -EOPNOTSUPP;
- return handler->get(dentry, name, buffer, size, handler->flags);
+ return handler->get(handler, dentry, name, buffer, size);
}
/*
@@ -797,7 +797,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value,
if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
return -EOPNOTSUPP;
- return handler->set(dentry, name, value, size, flags, handler->flags);
+ return handler->set(handler, dentry, name, value, size, flags);
}
/*
@@ -814,7 +814,7 @@ int reiserfs_removexattr(struct dentry *dentry, const char *name)
if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
return -EOPNOTSUPP;
- return handler->set(dentry, name, NULL, 0, XATTR_REPLACE, handler->flags);
+ return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE);
}
struct listxattr_buf {
@@ -842,14 +842,14 @@ static int listxattr_filler(struct dir_context *ctx, const char *name,
if (!handler) /* Unsupported xattr name */
return 0;
if (b->buf) {
- size = handler->list(b->dentry, b->buf + b->pos,
- b->size, name, namelen,
- handler->flags);
+ size = handler->list(handler, b->dentry,
+ b->buf + b->pos, b->size, name,
+ namelen);
if (size > b->size)
return -ERANGE;
} else {
- size = handler->list(b->dentry, NULL, 0, name,
- namelen, handler->flags);
+ size = handler->list(handler, b->dentry,
+ NULL, 0, name, namelen);
}
b->pos += size;
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 9a3b0616f283..ac659af431ae 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -9,8 +9,8 @@
#include <linux/uaccess.h>
static int
-security_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
- int handler_flags)
+security_get(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, void *buffer, size_t size)
{
if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
return -EINVAL;
@@ -22,8 +22,8 @@ security_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
}
static int
-security_set(struct dentry *dentry, const char *name, const void *buffer,
- size_t size, int flags, int handler_flags)
+security_set(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, const void *buffer, size_t size, int flags)
{
if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX))
return -EINVAL;
@@ -34,8 +34,9 @@ security_set(struct dentry *dentry, const char *name, const void *buffer,
return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
}
-static size_t security_list(struct dentry *dentry, char *list, size_t list_len,
- const char *name, size_t namelen, int handler_flags)
+static size_t security_list(const struct xattr_handler *handler,
+ struct dentry *dentry, char *list, size_t list_len,
+ const char *name, size_t namelen)
{
const size_t len = namelen + 1;
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index e4f1343714e0..a338adf1b8b4 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -8,8 +8,8 @@
#include <linux/uaccess.h>
static int
-trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
- int handler_flags)
+trusted_get(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, void *buffer, size_t size)
{
if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
return -EINVAL;
@@ -21,8 +21,8 @@ trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
}
static int
-trusted_set(struct dentry *dentry, const char *name, const void *buffer,
- size_t size, int flags, int handler_flags)
+trusted_set(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, const void *buffer, size_t size, int flags)
{
if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX))
return -EINVAL;
@@ -33,8 +33,9 @@ trusted_set(struct dentry *dentry, const char *name, const void *buffer,
return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
}
-static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int handler_flags)
+static size_t trusted_list(const struct xattr_handler *handler,
+ struct dentry *dentry, char *list, size_t list_size,
+ const char *name, size_t name_len)
{
const size_t len = name_len + 1;
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index d0b08d3e5689..39c9667191c5 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -7,8 +7,8 @@
#include <linux/uaccess.h>
static int
-user_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
- int handler_flags)
+user_get(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, void *buffer, size_t size)
{
if (strlen(name) < sizeof(XATTR_USER_PREFIX))
@@ -19,8 +19,8 @@ user_get(struct dentry *dentry, const char *name, void *buffer, size_t size,
}
static int
-user_set(struct dentry *dentry, const char *name, const void *buffer,
- size_t size, int flags, int handler_flags)
+user_set(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, const void *buffer, size_t size, int flags)
{
if (strlen(name) < sizeof(XATTR_USER_PREFIX))
return -EINVAL;
@@ -30,8 +30,9 @@ user_set(struct dentry *dentry, const char *name, const void *buffer,
return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
}
-static size_t user_list(struct dentry *dentry, char *list, size_t list_size,
- const char *name, size_t name_len, int handler_flags)
+static size_t user_list(const struct xattr_handler *handler,
+ struct dentry *dentry, char *list, size_t list_size,
+ const char *name, size_t name_len)
{
const size_t len = name_len + 1;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 225586e141ca..e85664b7c7d9 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -13,6 +13,7 @@
#include <linux/cred.h>
#include <linux/mm.h>
#include <linux/printk.h>
+#include <linux/string_helpers.h>
#include <asm/uaccess.h>
#include <asm/page.h>
@@ -25,12 +26,17 @@ static void seq_set_overflow(struct seq_file *m)
static void *seq_buf_alloc(unsigned long size)
{
void *buf;
+ gfp_t gfp = GFP_KERNEL;
/*
- * __GFP_NORETRY to avoid oom-killings with high-order allocations -
- * it's better to fall back to vmalloc() than to kill things.
+ * For high order allocations, use __GFP_NORETRY to avoid oom-killing -
+ * it's better to fall back to vmalloc() than to kill things. For small
+ * allocations, just use GFP_KERNEL which will oom kill, thus no need
+ * for vmalloc fallback.
*/
- buf = kmalloc(size, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
+ if (size > PAGE_SIZE)
+ gfp |= __GFP_NORETRY | __GFP_NOWARN;
+ buf = kmalloc(size, gfp);
if (!buf && size > PAGE_SIZE)
buf = vmalloc(size);
return buf;
@@ -377,26 +383,12 @@ EXPORT_SYMBOL(seq_release);
*/
void seq_escape(struct seq_file *m, const char *s, const char *esc)
{
- char *end = m->buf + m->size;
- char *p;
- char c;
+ char *buf;
+ size_t size = seq_get_buf(m, &buf);
+ int ret;
- for (p = m->buf + m->count; (c = *s) != '\0' && p < end; s++) {
- if (!strchr(esc, c)) {
- *p++ = c;
- continue;
- }
- if (p + 3 < end) {
- *p++ = '\\';
- *p++ = '0' + ((c & 0300) >> 6);
- *p++ = '0' + ((c & 070) >> 3);
- *p++ = '0' + (c & 07);
- continue;
- }
- seq_set_overflow(m);
- return;
- }
- m->count = p - m->buf;
+ ret = string_escape_str(s, buf, size, ESCAPE_OCTAL, esc);
+ seq_commit(m, ret < size ? ret : -1);
}
EXPORT_SYMBOL(seq_escape);
@@ -773,6 +765,8 @@ void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
{
const u8 *ptr = buf;
int i, linelen, remaining = len;
+ char *buffer;
+ size_t size;
int ret;
if (rowsize != 16 && rowsize != 32)
@@ -794,15 +788,12 @@ void seq_hex_dump(struct seq_file *m, const char *prefix_str, int prefix_type,
break;
}
+ size = seq_get_buf(m, &buffer);
ret = hex_dump_to_buffer(ptr + i, linelen, rowsize, groupsize,
- m->buf + m->count, m->size - m->count,
- ascii);
- if (ret >= m->size - m->count) {
- seq_set_overflow(m);
- } else {
- m->count += ret;
- seq_putc(m, '\n');
- }
+ buffer, size, ascii);
+ seq_commit(m, ret < size ? ret : -1);
+
+ seq_putc(m, '\n');
}
}
EXPORT_SYMBOL(seq_hex_dump);
diff --git a/fs/splice.c b/fs/splice.c
index 5fc1e50a7f30..801c21cd77fe 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -360,7 +360,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
break;
error = add_to_page_cache_lru(page, mapping, index,
- GFP_KERNEL & mapping_gfp_mask(mapping));
+ mapping_gfp_constraint(mapping, GFP_KERNEL));
if (unlikely(error)) {
page_cache_release(page);
if (error == -EEXIST)
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
index e5e0ddf5b143..6a4cc344085c 100644
--- a/fs/squashfs/xattr.c
+++ b/fs/squashfs/xattr.c
@@ -68,8 +68,8 @@ ssize_t squashfs_listxattr(struct dentry *d, char *buffer,
name_size = le16_to_cpu(entry.size);
handler = squashfs_xattr_handler(le16_to_cpu(entry.type));
if (handler)
- prefix_size = handler->list(d, buffer, rest, NULL,
- name_size, handler->flags);
+ prefix_size = handler->list(handler, d, buffer, rest,
+ NULL, name_size);
if (prefix_size) {
if (buffer) {
if (prefix_size + name_size + 1 > rest) {
@@ -212,88 +212,68 @@ failed:
}
-/*
- * User namespace support
- */
-static size_t squashfs_user_list(struct dentry *d, char *list, size_t list_size,
- const char *name, size_t name_len, int type)
+static size_t squashfs_xattr_handler_list(const struct xattr_handler *handler,
+ struct dentry *d, char *list,
+ size_t list_size, const char *name,
+ size_t name_len)
{
- if (list && XATTR_USER_PREFIX_LEN <= list_size)
- memcpy(list, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
- return XATTR_USER_PREFIX_LEN;
+ int len = strlen(handler->prefix);
+
+ if (list && len <= list_size)
+ memcpy(list, handler->prefix, len);
+ return len;
}
-static int squashfs_user_get(struct dentry *d, const char *name, void *buffer,
- size_t size, int type)
+static int squashfs_xattr_handler_get(const struct xattr_handler *handler,
+ struct dentry *d, const char *name,
+ void *buffer, size_t size)
{
if (name[0] == '\0')
return -EINVAL;
- return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_USER, name,
+ return squashfs_xattr_get(d_inode(d), handler->flags, name,
buffer, size);
}
+/*
+ * User namespace support
+ */
static const struct xattr_handler squashfs_xattr_user_handler = {
.prefix = XATTR_USER_PREFIX,
- .list = squashfs_user_list,
- .get = squashfs_user_get
+ .flags = SQUASHFS_XATTR_USER,
+ .list = squashfs_xattr_handler_list,
+ .get = squashfs_xattr_handler_get
};
/*
* Trusted namespace support
*/
-static size_t squashfs_trusted_list(struct dentry *d, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
+static size_t squashfs_trusted_xattr_handler_list(const struct xattr_handler *handler,
+ struct dentry *d, char *list,
+ size_t list_size, const char *name,
+ size_t name_len)
{
if (!capable(CAP_SYS_ADMIN))
return 0;
-
- if (list && XATTR_TRUSTED_PREFIX_LEN <= list_size)
- memcpy(list, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
- return XATTR_TRUSTED_PREFIX_LEN;
-}
-
-static int squashfs_trusted_get(struct dentry *d, const char *name,
- void *buffer, size_t size, int type)
-{
- if (name[0] == '\0')
- return -EINVAL;
-
- return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_TRUSTED, name,
- buffer, size);
+ return squashfs_xattr_handler_list(handler, d, list, list_size, name,
+ name_len);
}
static const struct xattr_handler squashfs_xattr_trusted_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
- .list = squashfs_trusted_list,
- .get = squashfs_trusted_get
+ .flags = SQUASHFS_XATTR_TRUSTED,
+ .list = squashfs_trusted_xattr_handler_list,
+ .get = squashfs_xattr_handler_get
};
/*
* Security namespace support
*/
-static size_t squashfs_security_list(struct dentry *d, char *list,
- size_t list_size, const char *name, size_t name_len, int type)
-{
- if (list && XATTR_SECURITY_PREFIX_LEN <= list_size)
- memcpy(list, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
- return XATTR_SECURITY_PREFIX_LEN;
-}
-
-static int squashfs_security_get(struct dentry *d, const char *name,
- void *buffer, size_t size, int type)
-{
- if (name[0] == '\0')
- return -EINVAL;
-
- return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_SECURITY, name,
- buffer, size);
-}
-
static const struct xattr_handler squashfs_xattr_security_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .list = squashfs_security_list,
- .get = squashfs_security_get
+ .flags = SQUASHFS_XATTR_SECURITY,
+ .list = squashfs_xattr_handler_list,
+ .get = squashfs_xattr_handler_get
};
static const struct xattr_handler *squashfs_xattr_handler(int type)
diff --git a/fs/stat.c b/fs/stat.c
index cccc1aab9a8b..d4a61d8dc021 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -367,8 +367,6 @@ static long cp_new_stat64(struct kstat *stat, struct stat64 __user *statbuf)
INIT_STRUCT_STAT64_PADDING(tmp);
#ifdef CONFIG_MIPS
/* mips has weird padding, so we don't get 64 bits there */
- if (!new_valid_dev(stat->dev) || !new_valid_dev(stat->rdev))
- return -EOVERFLOW;
tmp.st_dev = new_encode_dev(stat->dev);
tmp.st_rdev = new_encode_dev(stat->rdev);
#else
diff --git a/fs/sync.c b/fs/sync.c
index fbc98ee62044..dd5d1711c7ac 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -86,7 +86,12 @@ static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
{
- filemap_fdatawait(bdev->bd_inode->i_mapping);
+ /*
+ * We keep the error status of individual mapping so that
+ * applications can catch the writeback error using fsync(2).
+ * See filemap_fdatawait_keep_errors() for details.
+ */
+ filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
}
/*
@@ -343,7 +348,8 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
}
if (flags & SYNC_FILE_RANGE_WRITE) {
- ret = filemap_fdatawrite_range(mapping, offset, endbyte);
+ ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
+ WB_SYNC_NONE);
if (ret < 0)
goto out_put;
}
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 6c95628ea377..f35523d4fa3a 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -108,6 +108,7 @@ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf,
{
const struct sysfs_ops *ops = sysfs_file_ops(of->kn);
struct kobject *kobj = of->kn->parent->priv;
+ size_t len;
/*
* If buf != of->prealloc_buf, we don't know how
@@ -115,7 +116,8 @@ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf,
*/
if (pos || WARN_ON_ONCE(buf != of->prealloc_buf))
return 0;
- return ops->show(kobj, of->kn->priv, buf);
+ len = ops->show(kobj, of->kn->priv, buf);
+ return min(count, len);
}
/* kernfs write callback for regular sysfs files */
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 39a019936768..e1236594fffe 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -352,3 +352,47 @@ void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name,
}
}
EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group);
+
+/**
+ * __compat_only_sysfs_link_entry_to_kobj - add a symlink to a kobject pointing
+ * to a group or an attribute
+ * @kobj: The kobject containing the group.
+ * @target_kobj: The target kobject.
+ * @target_name: The name of the target group or attribute.
+ */
+int __compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj,
+ struct kobject *target_kobj,
+ const char *target_name)
+{
+ struct kernfs_node *target;
+ struct kernfs_node *entry;
+ struct kernfs_node *link;
+
+ /*
+ * We don't own @target_kobj and it may be removed at any time.
+ * Synchronize using sysfs_symlink_target_lock. See sysfs_remove_dir()
+ * for details.
+ */
+ spin_lock(&sysfs_symlink_target_lock);
+ target = target_kobj->sd;
+ if (target)
+ kernfs_get(target);
+ spin_unlock(&sysfs_symlink_target_lock);
+ if (!target)
+ return -ENOENT;
+
+ entry = kernfs_find_and_get(target_kobj->sd, target_name);
+ if (!entry) {
+ kernfs_put(target);
+ return -ENOENT;
+ }
+
+ link = kernfs_create_link(kobj->sd, target_name, entry);
+ if (IS_ERR(link) && PTR_ERR(link) == -EEXIST)
+ sysfs_warn_dup(kobj->sd, target_name);
+
+ kernfs_put(entry);
+ kernfs_put(target);
+ return IS_ERR(link) ? PTR_ERR(link) : 0;
+}
+EXPORT_SYMBOL_GPL(__compat_only_sysfs_link_entry_to_kobj);
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index cbc8d5d2755a..c66f2423e1f5 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -340,8 +340,12 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
dput(dentry);
dentry = ERR_PTR(-EEXIST);
}
- if (IS_ERR(dentry))
+
+ if (IS_ERR(dentry)) {
mutex_unlock(&parent->d_inode->i_mutex);
+ simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+ }
+
return dentry;
}
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index ba66d508006a..7ff7712f284e 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -35,3 +35,18 @@ config UBIFS_FS_ZLIB
default y
help
Zlib compresses better than LZO but it is slower. Say 'Y' if unsure.
+
+config UBIFS_ATIME_SUPPORT
+ bool "Access time support" if UBIFS_FS
+ depends on UBIFS_FS
+ default n
+ help
+ Originally UBIFS did not support atime, because it looked like a bad idea due
+ increased flash wear. This option adds atime support and it is disabled by default
+ to preserve the old behavior. If you enable this option, UBIFS starts updating atime,
+ which means that file-system read operations will cause writes (inode atime
+ updates). This may affect file-system performance and increase flash device wear,
+ so be careful. How often atime is updated depends on the selected strategy:
+ strictatime is the "heavy", relatime is "lighter", etc.
+
+ If unsure, say 'N'
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 4c46a9865fa7..595ca0debe11 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2573,7 +2573,7 @@ int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf,
{
int err, failing;
- if (c->dbg->pc_happened)
+ if (dbg_is_power_cut(c))
return -EROFS;
failing = power_cut_emulated(c, lnum, 1);
@@ -2595,7 +2595,7 @@ int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf,
{
int err;
- if (c->dbg->pc_happened)
+ if (dbg_is_power_cut(c))
return -EROFS;
if (power_cut_emulated(c, lnum, 1))
return -EROFS;
@@ -2611,7 +2611,7 @@ int dbg_leb_unmap(struct ubifs_info *c, int lnum)
{
int err;
- if (c->dbg->pc_happened)
+ if (dbg_is_power_cut(c))
return -EROFS;
if (power_cut_emulated(c, lnum, 0))
return -EROFS;
@@ -2627,7 +2627,7 @@ int dbg_leb_map(struct ubifs_info *c, int lnum)
{
int err;
- if (c->dbg->pc_happened)
+ if (dbg_is_power_cut(c))
return -EROFS;
if (power_cut_emulated(c, lnum, 0))
return -EROFS;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 5c27c66c224a..e49bd2808bf3 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -449,13 +449,14 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx)
}
out:
+ kfree(file->private_data);
+ file->private_data = NULL;
+
if (err != -ENOENT) {
ubifs_err(c, "cannot find next direntry, error %d", err);
return err;
}
- kfree(file->private_data);
- file->private_data = NULL;
/* 2 is a special value indicating that there are no more direntries */
ctx->pos = 2;
return 0;
@@ -787,9 +788,6 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
dbg_gen("dent '%pd' in dir ino %lu", dentry, dir->i_ino);
- if (!new_valid_dev(rdev))
- return -EINVAL;
-
if (S_ISBLK(mode) || S_ISCHR(mode)) {
dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS);
if (!dev)
@@ -1188,6 +1186,9 @@ const struct inode_operations ubifs_dir_inode_operations = {
.getxattr = ubifs_getxattr,
.listxattr = ubifs_listxattr,
.removexattr = ubifs_removexattr,
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+ .update_time = ubifs_update_time,
+#endif
};
const struct file_operations ubifs_dir_operations = {
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index a3dfe2ae79f2..0edc12856147 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1354,6 +1354,47 @@ static inline int mctime_update_needed(const struct inode *inode,
return 0;
}
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+/**
+ * ubifs_update_time - update time of inode.
+ * @inode: inode to update
+ *
+ * This function updates time of the inode.
+ */
+int ubifs_update_time(struct inode *inode, struct timespec *time,
+ int flags)
+{
+ struct ubifs_inode *ui = ubifs_inode(inode);
+ struct ubifs_info *c = inode->i_sb->s_fs_info;
+ struct ubifs_budget_req req = { .dirtied_ino = 1,
+ .dirtied_ino_d = ALIGN(ui->data_len, 8) };
+ int iflags = I_DIRTY_TIME;
+ int err, release;
+
+ err = ubifs_budget_space(c, &req);
+ if (err)
+ return err;
+
+ mutex_lock(&ui->ui_mutex);
+ if (flags & S_ATIME)
+ inode->i_atime = *time;
+ if (flags & S_CTIME)
+ inode->i_ctime = *time;
+ if (flags & S_MTIME)
+ inode->i_mtime = *time;
+
+ if (!(inode->i_sb->s_flags & MS_LAZYTIME))
+ iflags |= I_DIRTY_SYNC;
+
+ release = ui->dirty;
+ __mark_inode_dirty(inode, iflags);
+ mutex_unlock(&ui->ui_mutex);
+ if (release)
+ ubifs_release_budget(c, &req);
+ return 0;
+}
+#endif
+
/**
* update_ctime - update mtime and ctime of an inode.
* @inode: inode to update
@@ -1537,6 +1578,9 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
if (err)
return err;
vma->vm_ops = &ubifs_file_vm_ops;
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+ file_accessed(file);
+#endif
return 0;
}
@@ -1557,6 +1601,9 @@ const struct inode_operations ubifs_file_inode_operations = {
.getxattr = ubifs_getxattr,
.listxattr = ubifs_listxattr,
.removexattr = ubifs_removexattr,
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+ .update_time = ubifs_update_time,
+#endif
};
const struct inode_operations ubifs_symlink_inode_operations = {
@@ -1568,6 +1615,9 @@ const struct inode_operations ubifs_symlink_inode_operations = {
.getxattr = ubifs_getxattr,
.listxattr = ubifs_listxattr,
.removexattr = ubifs_removexattr,
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+ .update_time = ubifs_update_time,
+#endif
};
const struct file_operations ubifs_file_operations = {
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index dc9f27e9d61b..9a517109da0f 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -1498,11 +1498,10 @@ static struct ubifs_nnode *dirty_cow_nnode(struct ubifs_info *c,
}
/* nnode is being committed, so copy it */
- n = kmalloc(sizeof(struct ubifs_nnode), GFP_NOFS);
+ n = kmemdup(nnode, sizeof(struct ubifs_nnode), GFP_NOFS);
if (unlikely(!n))
return ERR_PTR(-ENOMEM);
- memcpy(n, nnode, sizeof(struct ubifs_nnode));
n->cnext = NULL;
__set_bit(DIRTY_CNODE, &n->flags);
__clear_bit(COW_CNODE, &n->flags);
@@ -1549,11 +1548,10 @@ static struct ubifs_pnode *dirty_cow_pnode(struct ubifs_info *c,
}
/* pnode is being committed, so copy it */
- p = kmalloc(sizeof(struct ubifs_pnode), GFP_NOFS);
+ p = kmemdup(pnode, sizeof(struct ubifs_pnode), GFP_NOFS);
if (unlikely(!p))
return ERR_PTR(-ENOMEM);
- memcpy(p, pnode, sizeof(struct ubifs_pnode));
p->cnext = NULL;
__set_bit(DIRTY_CNODE, &p->flags);
__clear_bit(COW_CNODE, &p->flags);
diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h
index ee7cb5ebb6e8..8ece6ca58c0b 100644
--- a/fs/ubifs/misc.h
+++ b/fs/ubifs/misc.h
@@ -155,13 +155,8 @@ static inline int ubifs_wbuf_sync(struct ubifs_wbuf *wbuf)
*/
static inline int ubifs_encode_dev(union ubifs_dev_desc *dev, dev_t rdev)
{
- if (new_valid_dev(rdev)) {
- dev->new = cpu_to_le32(new_encode_dev(rdev));
- return sizeof(dev->new);
- } else {
- dev->huge = cpu_to_le64(huge_encode_dev(rdev));
- return sizeof(dev->huge);
- }
+ dev->new = cpu_to_le32(new_encode_dev(rdev));
+ return sizeof(dev->new);
}
/**
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 695fc71d5244..586d59347fff 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -789,7 +789,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum,
corrupted_rescan:
/* Re-scan the corrupted data with verbose messages */
ubifs_err(c, "corruption %d", ret);
- ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
+ ubifs_scan_a_node(c, buf, len, lnum, offs, 0);
corrupted:
ubifs_scanned_corruption(c, lnum, offs, buf);
err = -EUCLEAN;
@@ -1331,8 +1331,7 @@ void ubifs_destroy_size_tree(struct ubifs_info *c)
struct size_entry *e, *n;
rbtree_postorder_for_each_entry_safe(e, n, &c->size_tree, rb) {
- if (e->inode)
- iput(e->inode);
+ iput(e->inode);
kfree(e);
}
@@ -1533,8 +1532,7 @@ int ubifs_recover_size(struct ubifs_info *c)
err = fix_size_in_place(c, e);
if (err)
return err;
- if (e->inode)
- iput(e->inode);
+ iput(e->inode);
}
}
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 9547a27868ad..1fd90c079537 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -128,7 +128,10 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
if (err)
goto out_ino;
- inode->i_flags |= (S_NOCMTIME | S_NOATIME);
+ inode->i_flags |= S_NOCMTIME;
+#ifndef CONFIG_UBIFS_ATIME_SUPPORT
+ inode->i_flags |= S_NOATIME;
+#endif
set_nlink(inode, le32_to_cpu(ino->nlink));
i_uid_write(inode, le32_to_cpu(ino->uid));
i_gid_write(inode, le32_to_cpu(ino->gid));
@@ -2037,7 +2040,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
if (c->max_inode_sz > MAX_LFS_FILESIZE)
sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE;
sb->s_op = &ubifs_super_operations;
- sb->s_xattr = ubifs_xattr_handlers;
mutex_lock(&c->umount_mutex);
err = mount_ubifs(c);
@@ -2139,7 +2141,12 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags,
if (err)
goto out_deact;
/* We do not support atime */
- sb->s_flags |= MS_ACTIVE | MS_NOATIME;
+ sb->s_flags |= MS_ACTIVE;
+#ifndef CONFIG_UBIFS_ATIME_SUPPORT
+ sb->s_flags |= MS_NOATIME;
+#else
+ ubifs_msg(c, "full atime support is enabled.");
+#endif
}
/* 'fill_super()' opens ubi again so we must close it here */
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 957f5757f374..fa9a20cc60d6 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -198,11 +198,10 @@ static struct ubifs_znode *copy_znode(struct ubifs_info *c,
{
struct ubifs_znode *zn;
- zn = kmalloc(c->max_znode_sz, GFP_NOFS);
+ zn = kmemdup(znode, c->max_znode_sz, GFP_NOFS);
if (unlikely(!zn))
return ERR_PTR(-ENOMEM);
- memcpy(zn, znode, c->max_znode_sz);
zn->cnext = NULL;
__set_bit(DIRTY_ZNODE, &zn->flags);
__clear_bit(COW_ZNODE, &zn->flags);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index de759022f3d6..a5697de763f5 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -858,9 +858,9 @@ struct ubifs_compressor {
* @mod_dent: non-zero if the operation removes or modifies an existing
* directory entry
* @new_ino: non-zero if the operation adds a new inode
- * @new_ino_d: now much data newly created inode contains
+ * @new_ino_d: how much data newly created inode contains
* @dirtied_ino: how many inodes the operation makes dirty
- * @dirtied_ino_d: now much data dirtied inode contains
+ * @dirtied_ino_d: how much data dirtied inode contains
* @idx_growth: how much the index will supposedly grow
* @data_growth: how much new data the operation will supposedly add
* @dd_growth: how much data that makes other data dirty the operation will
@@ -1470,7 +1470,6 @@ extern spinlock_t ubifs_infos_lock;
extern atomic_long_t ubifs_clean_zn_cnt;
extern struct kmem_cache *ubifs_inode_slab;
extern const struct super_operations ubifs_super_operations;
-extern const struct xattr_handler *ubifs_xattr_handlers[];
extern const struct address_space_operations ubifs_file_address_operations;
extern const struct file_operations ubifs_file_operations;
extern const struct inode_operations ubifs_file_inode_operations;
@@ -1746,6 +1745,9 @@ int ubifs_calc_dark(const struct ubifs_info *c, int spc);
/* file.c */
int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync);
int ubifs_setattr(struct dentry *dentry, struct iattr *attr);
+#ifdef CONFIG_UBIFS_ATIME_SUPPORT
+int ubifs_update_time(struct inode *inode, struct timespec *time, int flags);
+#endif
/* dir.c */
struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 96f3448b6eb4..e8b01b721e99 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -200,6 +200,7 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
int err;
struct ubifs_inode *host_ui = ubifs_inode(host);
struct ubifs_inode *ui = ubifs_inode(inode);
+ void *buf = NULL;
struct ubifs_budget_req req = { .dirtied_ino = 2,
.dirtied_ino_d = ALIGN(size, 8) + ALIGN(host_ui->data_len, 8) };
@@ -208,14 +209,17 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
if (err)
return err;
- kfree(ui->data);
- ui->data = kmemdup(value, size, GFP_NOFS);
- if (!ui->data) {
+ buf = kmemdup(value, size, GFP_NOFS);
+ if (!buf) {
err = -ENOMEM;
goto out_free;
}
+ mutex_lock(&ui->ui_mutex);
+ kfree(ui->data);
+ ui->data = buf;
inode->i_size = ui->ui_size = size;
ui->data_len = size;
+ mutex_unlock(&ui->ui_mutex);
mutex_lock(&host_ui->ui_mutex);
host->i_ctime = ubifs_current_time(host);
@@ -409,6 +413,7 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
ubifs_assert(inode->i_size == ui->data_len);
ubifs_assert(ubifs_inode(host)->xattr_size > ui->data_len);
+ mutex_lock(&ui->ui_mutex);
if (buf) {
/* If @buf is %NULL we are supposed to return the length */
if (ui->data_len > size) {
@@ -423,6 +428,7 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
err = ui->data_len;
out_iput:
+ mutex_unlock(&ui->ui_mutex);
iput(inode);
out_unlock:
kfree(xent);
@@ -582,46 +588,6 @@ out_free:
return err;
}
-static size_t security_listxattr(struct dentry *d, char *list, size_t list_size,
- const char *name, size_t name_len, int flags)
-{
- const int prefix_len = XATTR_SECURITY_PREFIX_LEN;
- const size_t total_len = prefix_len + name_len + 1;
-
- if (list && total_len <= list_size) {
- memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
- memcpy(list + prefix_len, name, name_len);
- list[prefix_len + name_len] = '\0';
- }
-
- return total_len;
-}
-
-static int security_getxattr(struct dentry *d, const char *name, void *buffer,
- size_t size, int flags)
-{
- return ubifs_getxattr(d, name, buffer, size);
-}
-
-static int security_setxattr(struct dentry *d, const char *name,
- const void *value, size_t size, int flags,
- int handler_flags)
-{
- return ubifs_setxattr(d, name, value, size, flags);
-}
-
-static const struct xattr_handler ubifs_xattr_security_handler = {
- .prefix = XATTR_SECURITY_PREFIX,
- .list = security_listxattr,
- .get = security_getxattr,
- .set = security_setxattr,
-};
-
-const struct xattr_handler *ubifs_xattr_handlers[] = {
- &ubifs_xattr_security_handler,
- NULL,
-};
-
static int init_xattrs(struct inode *inode, const struct xattr *xattr_array,
void *fs_info)
{
@@ -652,11 +618,8 @@ int ubifs_init_security(struct inode *dentry, struct inode *inode,
{
int err;
- mutex_lock(&inode->i_mutex);
err = security_inode_init_security(inode, dentry, qstr,
&init_xattrs, 0);
- mutex_unlock(&inode->i_mutex);
-
if (err) {
struct ubifs_info *c = dentry->i_sb->s_fs_info;
ubifs_err(c, "cannot initialize security for inode %lu, error %d",
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 634e676072cb..50311703135b 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -467,8 +467,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
* the fault_*wqh.
*/
spin_lock(&ctx->fault_pending_wqh.lock);
- __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0, &range);
- __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, &range);
+ __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, &range);
+ __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, &range);
spin_unlock(&ctx->fault_pending_wqh.lock);
wake_up_poll(&ctx->fd_wqh, POLLHUP);
@@ -650,10 +650,10 @@ static void __wake_userfault(struct userfaultfd_ctx *ctx,
spin_lock(&ctx->fault_pending_wqh.lock);
/* wake all in the range and autoremove */
if (waitqueue_active(&ctx->fault_pending_wqh))
- __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL, 0,
+ __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
range);
if (waitqueue_active(&ctx->fault_wqh))
- __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, 0, range);
+ __wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, range);
spin_unlock(&ctx->fault_pending_wqh.lock);
}
@@ -1287,8 +1287,10 @@ static struct file *userfaultfd_file_create(int flags)
file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx,
O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
- if (IS_ERR(file))
+ if (IS_ERR(file)) {
+ mmput(ctx->mm);
kmem_cache_free(userfaultfd_ctx_cachep, ctx);
+ }
out:
return file;
}
diff --git a/fs/xattr.c b/fs/xattr.c
index 072fee1258dd..9b932b95d74e 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -720,7 +720,7 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s
handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
if (!handler)
return -EOPNOTSUPP;
- return handler->get(dentry, name, buffer, size, handler->flags);
+ return handler->get(handler, dentry, name, buffer, size);
}
/*
@@ -735,15 +735,15 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
if (!buffer) {
for_each_xattr_handler(handlers, handler) {
- size += handler->list(dentry, NULL, 0, NULL, 0,
- handler->flags);
+ size += handler->list(handler, dentry, NULL, 0,
+ NULL, 0);
}
} else {
char *buf = buffer;
for_each_xattr_handler(handlers, handler) {
- size = handler->list(dentry, buf, buffer_size,
- NULL, 0, handler->flags);
+ size = handler->list(handler, dentry, buf, buffer_size,
+ NULL, 0);
if (size > buffer_size)
return -ERANGE;
buf += size;
@@ -767,7 +767,7 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz
handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
if (!handler)
return -EOPNOTSUPP;
- return handler->set(dentry, name, value, size, flags, handler->flags);
+ return handler->set(handler, dentry, name, value, size, flags);
}
/*
@@ -782,8 +782,7 @@ generic_removexattr(struct dentry *dentry, const char *name)
handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
if (!handler)
return -EOPNOTSUPP;
- return handler->set(dentry, name, NULL, 0,
- XATTR_REPLACE, handler->flags);
+ return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE);
}
EXPORT_SYMBOL(generic_getxattr);
@@ -791,6 +790,30 @@ EXPORT_SYMBOL(generic_listxattr);
EXPORT_SYMBOL(generic_setxattr);
EXPORT_SYMBOL(generic_removexattr);
+/**
+ * xattr_full_name - Compute full attribute name from suffix
+ *
+ * @handler: handler of the xattr_handler operation
+ * @name: name passed to the xattr_handler operation
+ *
+ * The get and set xattr handler operations are called with the remainder of
+ * the attribute name after skipping the handler's prefix: for example, "foo"
+ * is passed to the get operation of a handler with prefix "user." to get
+ * attribute "user.foo". The full name is still "there" in the name though.
+ *
+ * Note: the list xattr handler operation when called from the vfs is passed a
+ * NULL name; some file systems use this operation internally, with varying
+ * semantics.
+ */
+const char *xattr_full_name(const struct xattr_handler *handler,
+ const char *name)
+{
+ size_t prefix_len = strlen(handler->prefix);
+
+ return name - prefix_len;
+}
+EXPORT_SYMBOL(xattr_full_name);
+
/*
* Allocate new xattr and copy in the value; but leave the name to callers.
*/
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index a096841bd06c..f64639176670 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -84,6 +84,7 @@ xfs-y += xfs_aops.o \
xfs_message.o \
xfs_mount.o \
xfs_mru_cache.o \
+ xfs_stats.o \
xfs_super.o \
xfs_symlink.o \
xfs_sysfs.o \
@@ -118,7 +119,6 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \
xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o
xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
-xfs-$(CONFIG_PROC_FS) += xfs_stats.o
xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o
xfs-$(CONFIG_COMPAT) += xfs_ioctl32.o
xfs-$(CONFIG_NFSD_PNFS) += xfs_pnfs.o
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index a7a3a63bb360..686ba6fb20dd 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -55,8 +55,9 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
return ptr;
if (!(++retries % 100))
xfs_err(NULL,
- "possible memory allocation deadlock in %s (mode:0x%x)",
- __func__, lflags);
+ "%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)",
+ current->comm, current->pid,
+ (unsigned int)size, __func__, lflags);
congestion_wait(BLK_RW_ASYNC, HZ/50);
} while (1);
}
@@ -120,8 +121,9 @@ kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
return ptr;
if (!(++retries % 100))
xfs_err(NULL,
- "possible memory allocation deadlock in %s (mode:0x%x)",
- __func__, lflags);
+ "%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
+ current->comm, current->pid,
+ __func__, lflags);
congestion_wait(BLK_RW_ASYNC, HZ/50);
} while (1);
}
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index ffad7f20342f..3479294c1d58 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -482,7 +482,9 @@ xfs_agfl_verify(
be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
return false;
}
- return true;
+
+ return xfs_log_check_lsn(mp,
+ be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn));
}
static void
@@ -651,8 +653,8 @@ xfs_alloc_ag_vextent(
-((long)(args->len)));
}
- XFS_STATS_INC(xs_allocx);
- XFS_STATS_ADD(xs_allocb, args->len);
+ XFS_STATS_INC(args->mp, xs_allocx);
+ XFS_STATS_ADD(args->mp, xs_allocb, args->len);
return error;
}
@@ -1808,8 +1810,8 @@ xfs_free_ag_extent(
if (!isfl)
xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len);
- XFS_STATS_INC(xs_freex);
- XFS_STATS_ADD(xs_freeb, len);
+ XFS_STATS_INC(mp, xs_freex);
+ XFS_STATS_ADD(mp, xs_freeb, len);
trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright);
@@ -2259,9 +2261,13 @@ xfs_agf_verify(
{
struct xfs_agf *agf = XFS_BUF_TO_AGF(bp);
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
- !uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
return false;
+ if (!xfs_log_check_lsn(mp,
+ be64_to_cpu(XFS_BUF_TO_AGF(bp)->agf_lsn)))
+ return false;
+ }
if (!(agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
@@ -2503,7 +2509,7 @@ xfs_alloc_vextent(
* Try near allocation first, then anywhere-in-ag after
* the first a.g. fails.
*/
- if ((args->userdata == XFS_ALLOC_INITIAL_USER_DATA) &&
+ if ((args->userdata & XFS_ALLOC_INITIAL_USER_DATA) &&
(mp->m_flags & XFS_MOUNT_32BITINODES)) {
args->fsbno = XFS_AGB_TO_FSB(mp,
((mp->m_agfrotor / rotorstep) %
@@ -2634,6 +2640,14 @@ xfs_alloc_vextent(
XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
args->len);
#endif
+
+ /* Zero the extent if we were asked to do so */
+ if (args->userdata & XFS_ALLOC_USERDATA_ZERO) {
+ error = xfs_zero_extent(args->ip, args->fsbno, args->len);
+ if (error)
+ goto error0;
+ }
+
}
xfs_perag_put(args->pag);
return 0;
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index ca1c8168373a..0ecde4d5cac8 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -101,6 +101,7 @@ typedef struct xfs_alloc_arg {
struct xfs_mount *mp; /* file system mount point */
struct xfs_buf *agbp; /* buffer for a.g. freelist header */
struct xfs_perag *pag; /* per-ag struct for this agno */
+ struct xfs_inode *ip; /* for userdata zeroing method */
xfs_fsblock_t fsbno; /* file system block number */
xfs_agnumber_t agno; /* allocation group number */
xfs_agblock_t agbno; /* allocation group-relative block # */
@@ -120,15 +121,16 @@ typedef struct xfs_alloc_arg {
char wasdel; /* set if allocation was prev delayed */
char wasfromfl; /* set if allocation is from freelist */
char isfl; /* set if is freelist blocks - !acctg */
- char userdata; /* set if this is user data */
+ char userdata; /* mask defining userdata treatment */
xfs_fsblock_t firstblock; /* io first block allocated */
} xfs_alloc_arg_t;
/*
* Defines for userdata
*/
-#define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/
-#define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */
+#define XFS_ALLOC_USERDATA (1 << 0)/* allocation is for user data*/
+#define XFS_ALLOC_INITIAL_USER_DATA (1 << 1)/* special case start of file */
+#define XFS_ALLOC_USERDATA_ZERO (1 << 2)/* zero extent on allocation */
xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp,
struct xfs_perag *pag, xfs_extlen_t need);
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index ff065578969f..f949818fa1c7 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -125,7 +125,7 @@ xfs_attr_get(
uint lock_mode;
int error;
- XFS_STATS_INC(xs_attr_get);
+ XFS_STATS_INC(ip->i_mount, xs_attr_get);
if (XFS_FORCED_SHUTDOWN(ip->i_mount))
return -EIO;
@@ -209,7 +209,7 @@ xfs_attr_set(
int rsvd = (flags & ATTR_ROOT) != 0;
int error, err2, committed, local;
- XFS_STATS_INC(xs_attr_set);
+ XFS_STATS_INC(mp, xs_attr_set);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
@@ -412,7 +412,7 @@ xfs_attr_remove(
xfs_fsblock_t firstblock;
int error;
- XFS_STATS_INC(xs_attr_remove);
+ XFS_STATS_INC(mp, xs_attr_remove);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 33df52d97ec7..aa187f7ba2dd 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -41,6 +41,7 @@
#include "xfs_buf_item.h"
#include "xfs_cksum.h"
#include "xfs_dir2.h"
+#include "xfs_log.h"
/*
@@ -266,6 +267,8 @@ xfs_attr3_leaf_verify(
return false;
if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
+ return false;
} else {
if (ichdr.magic != XFS_ATTR_LEAF_MAGIC)
return false;
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index f38f9bd81557..5ab95ffa4ae9 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -107,7 +107,7 @@ xfs_attr3_rmt_verify(
if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt))
return false;
if (be32_to_cpu(rmt->rm_offset) +
- be32_to_cpu(rmt->rm_bytes) > XATTR_SIZE_MAX)
+ be32_to_cpu(rmt->rm_bytes) > XFS_XATTR_SIZE_MAX)
return false;
if (rmt->rm_owner == 0)
return false;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 8e2010d53b07..119c2422aac7 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -948,14 +948,16 @@ xfs_bmap_local_to_extents(
bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
/*
- * Initialise the block and copy the data
+ * Initialize the block, copy the data and log the remote buffer.
*
- * Note: init_fn must set the buffer log item type correctly!
+ * The callout is responsible for logging because the remote format
+ * might differ from the local format and thus we don't know how much to
+ * log here. Note that init_fn must also set the buffer log item type
+ * correctly.
*/
init_fn(tp, bp, ip, ifp);
- /* account for the change in fork size and log everything */
- xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
+ /* account for the change in fork size */
xfs_idata_realloc(ip, -ifp->if_bytes, whichfork);
xfs_bmap_local_to_extents_empty(ip, whichfork);
flags |= XFS_ILOG_CORE;
@@ -1435,7 +1437,7 @@ xfs_bmap_search_extents(
xfs_ifork_t *ifp; /* inode fork pointer */
xfs_bmbt_rec_host_t *ep; /* extent record pointer */
- XFS_STATS_INC(xs_look_exlist);
+ XFS_STATS_INC(ip->i_mount, xs_look_exlist);
ifp = XFS_IFORK_PTR(ip, fork);
ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp);
@@ -1732,7 +1734,7 @@ xfs_bmap_add_extent_delay_real(
ASSERT(!bma->cur ||
(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
- XFS_STATS_INC(xs_add_exlist);
+ XFS_STATS_INC(mp, xs_add_exlist);
#define LEFT r[0]
#define RIGHT r[1]
@@ -2286,7 +2288,7 @@ xfs_bmap_add_extent_unwritten_real(
ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
ASSERT(!isnullstartblock(new->br_startblock));
- XFS_STATS_INC(xs_add_exlist);
+ XFS_STATS_INC(mp, xs_add_exlist);
#define LEFT r[0]
#define RIGHT r[1]
@@ -2946,7 +2948,7 @@ xfs_bmap_add_extent_hole_real(
ASSERT(!bma->cur ||
!(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
- XFS_STATS_INC(xs_add_exlist);
+ XFS_STATS_INC(mp, xs_add_exlist);
state = 0;
if (whichfork == XFS_ATTR_FORK)
@@ -3800,8 +3802,13 @@ xfs_bmap_btalloc(
args.wasdel = ap->wasdel;
args.isfl = 0;
args.userdata = ap->userdata;
- if ((error = xfs_alloc_vextent(&args)))
+ if (ap->userdata & XFS_ALLOC_USERDATA_ZERO)
+ args.ip = ap->ip;
+
+ error = xfs_alloc_vextent(&args);
+ if (error)
return error;
+
if (tryagain && args.fsbno == NULLFSBLOCK) {
/*
* Exact allocation failed. Now try with alignment
@@ -4036,7 +4043,7 @@ xfs_bmapi_read(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- XFS_STATS_INC(xs_blk_mapr);
+ XFS_STATS_INC(mp, xs_blk_mapr);
ifp = XFS_IFORK_PTR(ip, whichfork);
@@ -4221,7 +4228,7 @@ xfs_bmapi_delay(
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- XFS_STATS_INC(xs_blk_mapw);
+ XFS_STATS_INC(mp, xs_blk_mapw);
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
@@ -4300,11 +4307,14 @@ xfs_bmapi_allocate(
/*
* Indicate if this is the first user data in the file, or just any
- * user data.
+ * user data. And if it is userdata, indicate whether it needs to
+ * be initialised to zero during allocation.
*/
if (!(bma->flags & XFS_BMAPI_METADATA)) {
bma->userdata = (bma->offset == 0) ?
XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
+ if (bma->flags & XFS_BMAPI_ZERO)
+ bma->userdata |= XFS_ALLOC_USERDATA_ZERO;
}
bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
@@ -4419,6 +4429,17 @@ xfs_bmapi_convert_unwritten(
mval->br_state = (mval->br_state == XFS_EXT_UNWRITTEN)
? XFS_EXT_NORM : XFS_EXT_UNWRITTEN;
+ /*
+ * Before insertion into the bmbt, zero the range being converted
+ * if required.
+ */
+ if (flags & XFS_BMAPI_ZERO) {
+ error = xfs_zero_extent(bma->ip, mval->br_startblock,
+ mval->br_blockcount);
+ if (error)
+ return error;
+ }
+
error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx,
&bma->cur, mval, bma->firstblock, bma->flist,
&tmp_logflags);
@@ -4512,6 +4533,18 @@ xfs_bmapi_write(
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ /* zeroing is for currently only for data extents, not metadata */
+ ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) !=
+ (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO));
+ /*
+ * we can allocate unwritten extents or pre-zero allocated blocks,
+ * but it makes no sense to do both at once. This would result in
+ * zeroing the unwritten extent twice, but it still being an
+ * unwritten extent....
+ */
+ ASSERT((flags & (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO)) !=
+ (XFS_BMAPI_PREALLOC | XFS_BMAPI_ZERO));
+
if (unlikely(XFS_TEST_ERROR(
(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE),
@@ -4525,7 +4558,7 @@ xfs_bmapi_write(
ifp = XFS_IFORK_PTR(ip, whichfork);
- XFS_STATS_INC(xs_blk_mapw);
+ XFS_STATS_INC(mp, xs_blk_mapw);
if (*firstblock == NULLFSBLOCK) {
if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
@@ -4718,12 +4751,12 @@ xfs_bmap_del_extent(
xfs_filblks_t temp2; /* for indirect length calculations */
int state = 0;
- XFS_STATS_INC(xs_del_exlist);
+ mp = ip->i_mount;
+ XFS_STATS_INC(mp, xs_del_exlist);
if (whichfork == XFS_ATTR_FORK)
state |= BMAP_ATTRFORK;
- mp = ip->i_mount;
ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
(uint)sizeof(xfs_bmbt_rec_t)));
@@ -5070,7 +5103,7 @@ xfs_bunmapi(
*done = 1;
return 0;
}
- XFS_STATS_INC(xs_blk_unmap);
+ XFS_STATS_INC(mp, xs_blk_unmap);
isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
start = bno;
bno = start + len - 1;
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 6aaa0c1c7200..a160f8a5a3fc 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -52,9 +52,9 @@ struct xfs_bmalloca {
xfs_extlen_t minleft; /* amount must be left after alloc */
bool eof; /* set if allocating past last extent */
bool wasdel; /* replacing a delayed allocation */
- bool userdata;/* set if is user data */
bool aeof; /* allocated space at eof */
bool conv; /* overwriting unwritten extents */
+ char userdata;/* userdata mask */
int flags;
};
@@ -109,6 +109,14 @@ typedef struct xfs_bmap_free
*/
#define XFS_BMAPI_CONVERT 0x040
+/*
+ * allocate zeroed extents - this requires all newly allocated user data extents
+ * to be initialised to zero. It will be ignored if XFS_BMAPI_METADATA is set.
+ * Use in conjunction with XFS_BMAPI_CONVERT to convert unwritten extents found
+ * during the allocation range to zeroed written extents.
+ */
+#define XFS_BMAPI_ZERO 0x080
+
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
{ XFS_BMAPI_METADATA, "METADATA" }, \
@@ -116,7 +124,8 @@ typedef struct xfs_bmap_free
{ XFS_BMAPI_PREALLOC, "PREALLOC" }, \
{ XFS_BMAPI_IGSTATE, "IGSTATE" }, \
{ XFS_BMAPI_CONTIG, "CONTIG" }, \
- { XFS_BMAPI_CONVERT, "CONVERT" }
+ { XFS_BMAPI_CONVERT, "CONVERT" }, \
+ { XFS_BMAPI_ZERO, "ZERO" }
static inline int xfs_bmapi_aflag(int w)
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index f7d7ee7a2607..af1bbee5586e 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -32,6 +32,7 @@
#include "xfs_trace.h"
#include "xfs_cksum.h"
#include "xfs_alloc.h"
+#include "xfs_log.h"
/*
* Cursor allocation zone.
@@ -222,7 +223,7 @@ xfs_btree_check_ptr(
* long-form btree header.
*
* Prior to calculting the CRC, pull the LSN out of the buffer log item and put
- * it into the buffer so recovery knows what the last modifcation was that made
+ * it into the buffer so recovery knows what the last modification was that made
* it to disk.
*/
void
@@ -243,8 +244,14 @@ bool
xfs_btree_lblock_verify_crc(
struct xfs_buf *bp)
{
- if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
+ return false;
return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
+ }
return true;
}
@@ -254,7 +261,7 @@ xfs_btree_lblock_verify_crc(
* short-form btree header.
*
* Prior to calculting the CRC, pull the LSN out of the buffer log item and put
- * it into the buffer so recovery knows what the last modifcation was that made
+ * it into the buffer so recovery knows what the last modification was that made
* it to disk.
*/
void
@@ -275,8 +282,14 @@ bool
xfs_btree_sblock_verify_crc(
struct xfs_buf *bp)
{
- if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
+ return false;
return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
+ }
return true;
}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 8f18bab73ea5..992dec0638f3 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -84,31 +84,38 @@ union xfs_btree_rec {
/*
* Generic stats interface
*/
-#define __XFS_BTREE_STATS_INC(type, stat) \
- XFS_STATS_INC(xs_ ## type ## _2_ ## stat)
-#define XFS_BTREE_STATS_INC(cur, stat) \
+#define __XFS_BTREE_STATS_INC(mp, type, stat) \
+ XFS_STATS_INC(mp, xs_ ## type ## _2_ ## stat)
+#define XFS_BTREE_STATS_INC(cur, stat) \
do { \
+ struct xfs_mount *__mp = cur->bc_mp; \
switch (cur->bc_btnum) { \
- case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(abtb, stat); break; \
- case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break; \
- case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \
- case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \
- case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(fibt, stat); break; \
+ case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(__mp, abtb, stat); break; \
+ case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(__mp, abtc, stat); break; \
+ case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(__mp, bmbt, stat); break; \
+ case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \
+ case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \
case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
} \
} while (0)
-#define __XFS_BTREE_STATS_ADD(type, stat, val) \
- XFS_STATS_ADD(xs_ ## type ## _2_ ## stat, val)
+#define __XFS_BTREE_STATS_ADD(mp, type, stat, val) \
+ XFS_STATS_ADD(mp, xs_ ## type ## _2_ ## stat, val)
#define XFS_BTREE_STATS_ADD(cur, stat, val) \
do { \
+ struct xfs_mount *__mp = cur->bc_mp; \
switch (cur->bc_btnum) { \
- case XFS_BTNUM_BNO: __XFS_BTREE_STATS_ADD(abtb, stat, val); break; \
- case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \
- case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \
- case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \
- case XFS_BTNUM_FINO: __XFS_BTREE_STATS_ADD(fibt, stat, val); break; \
- case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
+ case XFS_BTNUM_BNO: \
+ __XFS_BTREE_STATS_ADD(__mp, abtb, stat, val); break; \
+ case XFS_BTNUM_CNT: \
+ __XFS_BTREE_STATS_ADD(__mp, abtc, stat, val); break; \
+ case XFS_BTNUM_BMAP: \
+ __XFS_BTREE_STATS_ADD(__mp, bmbt, stat, val); break; \
+ case XFS_BTNUM_INO: \
+ __XFS_BTREE_STATS_ADD(__mp, ibt, stat, val); break; \
+ case XFS_BTNUM_FINO: \
+ __XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \
+ case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
} \
} while (0)
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index be43248a5822..e89a0f8f827c 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -39,6 +39,7 @@
#include "xfs_trace.h"
#include "xfs_cksum.h"
#include "xfs_buf_item.h"
+#include "xfs_log.h"
/*
* xfs_da_btree.c
@@ -150,6 +151,8 @@ xfs_da3_node_verify(
return false;
if (be64_to_cpu(hdr3->info.blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->info.lsn)))
+ return false;
} else {
if (ichdr.magic != XFS_DA_NODE_MAGIC)
return false;
@@ -322,6 +325,7 @@ xfs_da3_node_create(
if (xfs_sb_version_hascrc(&mp->m_sb)) {
struct xfs_da3_node_hdr *hdr3 = bp->b_addr;
+ memset(hdr3, 0, sizeof(struct xfs_da3_node_hdr));
ichdr.magic = XFS_DA3_NODE_MAGIC;
hdr3->info.blkno = cpu_to_be64(bp->b_bn);
hdr3->info.owner = cpu_to_be64(args->dp->i_ino);
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 9de401d297e5..2fb53a5c0a74 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -271,7 +271,7 @@ xfs_dir_createname(
rval = xfs_dir_ino_validate(tp->t_mountp, inum);
if (rval)
return rval;
- XFS_STATS_INC(xs_dir_create);
+ XFS_STATS_INC(dp->i_mount, xs_dir_create);
}
args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
@@ -365,7 +365,7 @@ xfs_dir_lookup(
int lock_mode;
ASSERT(S_ISDIR(dp->i_d.di_mode));
- XFS_STATS_INC(xs_dir_lookup);
+ XFS_STATS_INC(dp->i_mount, xs_dir_lookup);
/*
* We need to use KM_NOFS here so that lockdep will not throw false
@@ -444,7 +444,7 @@ xfs_dir_removename(
int v; /* type-checking value */
ASSERT(S_ISDIR(dp->i_d.di_mode));
- XFS_STATS_INC(xs_dir_remove);
+ XFS_STATS_INC(dp->i_mount, xs_dir_remove);
args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS);
if (!args)
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 4778d1dd511a..9c10e2b8cfcb 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -34,6 +34,7 @@
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
+#include "xfs_log.h"
/*
* Local function prototypes.
@@ -71,6 +72,8 @@ xfs_dir3_block_verify(
return false;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return false;
} else {
if (hdr3->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))
return false;
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 824131e71bc5..af71a84f343c 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -31,6 +31,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_cksum.h"
+#include "xfs_log.h"
/*
* Check the consistency of the data block.
@@ -224,6 +225,8 @@ xfs_dir3_data_verify(
return false;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return false;
} else {
if (hdr3->magic != cpu_to_be32(XFS_DIR2_DATA_MAGIC))
return false;
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index f300240ebb8d..3923e1f94697 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -33,6 +33,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_cksum.h"
+#include "xfs_log.h"
/*
* Local function declarations.
@@ -164,6 +165,8 @@ xfs_dir3_leaf_verify(
return false;
if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(leaf3->info.lsn)))
+ return false;
} else {
if (leaf->hdr.info.magic != cpu_to_be16(magic))
return false;
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index cc28e924545b..70b0cb2fd556 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -33,6 +33,7 @@
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_cksum.h"
+#include "xfs_log.h"
/*
* Function declarations.
@@ -97,6 +98,8 @@ xfs_dir3_free_verify(
return false;
if (be64_to_cpu(hdr3->blkno) != bp->b_bn)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(hdr3->lsn)))
+ return false;
} else {
if (hdr->magic != cpu_to_be32(XFS_DIR2_FREE_MAGIC))
return false;
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 9590a069e556..8774498ce0ff 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -60,6 +60,14 @@ struct xfs_ifork;
#define XFS_SB_VERSION_MOREBITSBIT 0x8000
/*
+ * The size of a single extended attribute on disk is limited by
+ * the size of index values within the attribute entries themselves.
+ * These are be16 fields, so we can only support attribute data
+ * sizes up to 2^16 bytes in length.
+ */
+#define XFS_XATTR_SIZE_MAX (1 << 16)
+
+/*
* Supported feature bit list is just all bits in the versionnum field because
* we've used them all up and understand them all. Except, of course, for the
* shared superblock bit, which nobody knows what it does and so is unsupported.
@@ -1483,13 +1491,17 @@ struct xfs_acl {
*/
#define XFS_ACL_MAX_ENTRIES(mp) \
(xfs_sb_version_hascrc(&mp->m_sb) \
- ? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
+ ? (XFS_XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \
sizeof(struct xfs_acl_entry) \
: 25)
-#define XFS_ACL_MAX_SIZE(mp) \
+#define XFS_ACL_SIZE(cnt) \
(sizeof(struct xfs_acl) + \
- sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp)))
+ sizeof(struct xfs_acl_entry) * cnt)
+
+#define XFS_ACL_MAX_SIZE(mp) \
+ XFS_ACL_SIZE(XFS_ACL_MAX_ENTRIES((mp)))
+
/* On-disk XFS extended attribute names */
#define SGI_ACL_FILE "SGI_ACL_FILE"
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 89689c6a43e2..b2b73a998d42 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -490,6 +490,16 @@ typedef struct xfs_swapext
#define XFS_FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */
/*
+ * ioctl limits
+ */
+#ifdef XATTR_LIST_MAX
+# define XFS_XATTR_LIST_MAX XATTR_LIST_MAX
+#else
+# define XFS_XATTR_LIST_MAX 65536
+#endif
+
+
+/*
* ioctl commands that are used by Linux filesystems
*/
#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 54deb2d12ac6..70c1db99f6a7 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -38,6 +38,7 @@
#include "xfs_icreate_item.h"
#include "xfs_icache.h"
#include "xfs_trace.h"
+#include "xfs_log.h"
/*
@@ -2500,9 +2501,14 @@ xfs_agi_verify(
struct xfs_mount *mp = bp->b_target->bt_mount;
struct xfs_agi *agi = XFS_BUF_TO_AGI(bp);
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
- !uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
+ return false;
+ if (!xfs_log_check_lsn(mp,
+ be64_to_cpu(XFS_BUF_TO_AGI(bp)->agi_lsn)))
return false;
+ }
+
/*
* Validate the magic number of the agi block.
*/
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 47425140f343..a0b071d881a0 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -35,6 +35,7 @@
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
+#include "xfs_log.h"
/*
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -163,6 +164,15 @@ xfs_mount_validate_sb(
"Filesystem can not be safely mounted by this kernel.");
return -EINVAL;
}
+ } else if (xfs_sb_version_hascrc(sbp)) {
+ /*
+ * We can't read verify the sb LSN because the read verifier is
+ * called before the log is allocated and processed. We know the
+ * log is set up before write verifier (!check_version) calls,
+ * so just check it here.
+ */
+ if (!xfs_log_check_lsn(mp, sbp->sb_lsn))
+ return -EFSCORRUPTED;
}
if (xfs_sb_version_has_pquotino(sbp)) {
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index 8f8af05b3f13..cb6fd20a4d3d 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -31,6 +31,7 @@
#include "xfs_cksum.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
+#include "xfs_log.h"
/*
@@ -60,6 +61,7 @@ xfs_symlink_hdr_set(
if (!xfs_sb_version_hascrc(&mp->m_sb))
return 0;
+ memset(dsl, 0, sizeof(struct xfs_dsymlink_hdr));
dsl->sl_magic = cpu_to_be32(XFS_SYMLINK_MAGIC);
dsl->sl_offset = cpu_to_be32(offset);
dsl->sl_bytes = cpu_to_be32(size);
@@ -116,6 +118,8 @@ xfs_symlink_verify(
return false;
if (dsl->sl_owner == 0)
return false;
+ if (!xfs_log_check_lsn(mp, be64_to_cpu(dsl->sl_lsn)))
+ return false;
return true;
}
@@ -183,6 +187,7 @@ xfs_symlink_local_to_remote(
if (!xfs_sb_version_hascrc(&mp->m_sb)) {
bp->b_ops = NULL;
memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
+ xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
return;
}
@@ -198,4 +203,6 @@ xfs_symlink_local_to_remote(
buf = bp->b_addr;
buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes);
+ xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsymlink_hdr) +
+ ifp->if_bytes - 1);
}
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 4b641676f258..6bb470fbb8e8 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -37,16 +37,19 @@
STATIC struct posix_acl *
xfs_acl_from_disk(
- struct xfs_acl *aclp,
- int max_entries)
+ const struct xfs_acl *aclp,
+ int len,
+ int max_entries)
{
struct posix_acl_entry *acl_e;
struct posix_acl *acl;
- struct xfs_acl_entry *ace;
+ const struct xfs_acl_entry *ace;
unsigned int count, i;
+ if (len < sizeof(*aclp))
+ return ERR_PTR(-EFSCORRUPTED);
count = be32_to_cpu(aclp->acl_cnt);
- if (count > max_entries)
+ if (count > max_entries || XFS_ACL_SIZE(count) != len)
return ERR_PTR(-EFSCORRUPTED);
acl = posix_acl_alloc(count, GFP_KERNEL);
@@ -160,10 +163,11 @@ xfs_get_acl(struct inode *inode, int type)
*/
if (error == -ENOATTR)
goto out_update_cache;
+ acl = ERR_PTR(error);
goto out;
}
- acl = xfs_acl_from_disk(xfs_acl, XFS_ACL_MAX_ENTRIES(ip->i_mount));
+ acl = xfs_acl_from_disk(xfs_acl, len, XFS_ACL_MAX_ENTRIES(ip->i_mount));
if (IS_ERR(acl))
goto out;
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 3841b07f27bf..52f8255d6bdf 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -20,7 +20,6 @@
struct inode;
struct posix_acl;
-struct xfs_inode;
#ifdef CONFIG_XFS_POSIX_ACL
extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
@@ -36,4 +35,7 @@ static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type)
# define posix_acl_access_exists(inode) 0
# define posix_acl_default_exists(inode) 0
#endif /* CONFIG_XFS_POSIX_ACL */
+
+extern void xfs_forget_acl(struct inode *inode, const char *name, int xflags);
+
#endif /* __XFS_ACL_H__ */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 50ab2879b9da..29e7e5dd5178 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -172,6 +172,12 @@ xfs_setfilesize_ioend(
current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
+ /* we abort the update if there was an IO error */
+ if (ioend->io_error) {
+ xfs_trans_cancel(tp);
+ return ioend->io_error;
+ }
+
return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
}
@@ -212,14 +218,17 @@ xfs_end_io(
ioend->io_error = -EIO;
goto done;
}
- if (ioend->io_error)
- goto done;
/*
* For unwritten extents we need to issue transactions to convert a
* range to normal written extens after the data I/O has finished.
+ * Detecting and handling completion IO errors is done individually
+ * for each case as different cleanup operations need to be performed
+ * on error.
*/
if (ioend->io_type == XFS_IO_UNWRITTEN) {
+ if (ioend->io_error)
+ goto done;
error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
ioend->io_size);
} else if (ioend->io_append_trans) {
@@ -1250,13 +1259,28 @@ xfs_vm_releasepage(
* the DIO. There is only going to be one reference to the ioend and its life
* cycle is constrained by the DIO completion code. hence we don't need
* reference counting here.
+ *
+ * Note that for DIO, an IO to the highest supported file block offset (i.e.
+ * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
+ * bit variable. Hence if we see this overflow, we have to assume that the IO is
+ * extending the file size. We won't know for sure until IO completion is run
+ * and the actual max write offset is communicated to the IO completion
+ * routine.
+ *
+ * For DAX page faults, we are preparing to never see unwritten extents here,
+ * nor should we ever extend the inode size. Hence we will soon have nothing to
+ * do here for this case, ensuring we don't have to provide an IO completion
+ * callback to free an ioend that we don't actually need for a fault into the
+ * page at offset (2^63 - 1FSB) bytes.
*/
+
static void
xfs_map_direct(
struct inode *inode,
struct buffer_head *bh_result,
struct xfs_bmbt_irec *imap,
- xfs_off_t offset)
+ xfs_off_t offset,
+ bool dax_fault)
{
struct xfs_ioend *ioend;
xfs_off_t size = bh_result->b_size;
@@ -1269,6 +1293,13 @@ xfs_map_direct(
trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap);
+ if (dax_fault) {
+ ASSERT(type == XFS_IO_OVERWRITE);
+ trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type,
+ imap);
+ return;
+ }
+
if (bh_result->b_private) {
ioend = bh_result->b_private;
ASSERT(ioend->io_size > 0);
@@ -1283,7 +1314,8 @@ xfs_map_direct(
ioend->io_size, ioend->io_type,
imap);
} else if (type == XFS_IO_UNWRITTEN ||
- offset + size > i_size_read(inode)) {
+ offset + size > i_size_read(inode) ||
+ offset + size < 0) {
ioend = xfs_alloc_ioend(inode, type);
ioend->io_offset = offset;
ioend->io_size = size;
@@ -1345,7 +1377,8 @@ __xfs_get_blocks(
sector_t iblock,
struct buffer_head *bh_result,
int create,
- bool direct)
+ bool direct,
+ bool dax_fault)
{
struct xfs_inode *ip = XFS_I(inode);
struct xfs_mount *mp = ip->i_mount;
@@ -1393,18 +1426,20 @@ __xfs_get_blocks(
if (error)
goto out_unlock;
+ /* for DAX, we convert unwritten extents directly */
if (create &&
(!nimaps ||
(imap.br_startblock == HOLESTARTBLOCK ||
- imap.br_startblock == DELAYSTARTBLOCK))) {
+ imap.br_startblock == DELAYSTARTBLOCK) ||
+ (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
if (direct || xfs_get_extsz_hint(ip)) {
/*
- * Drop the ilock in preparation for starting the block
- * allocation transaction. It will be retaken
- * exclusively inside xfs_iomap_write_direct for the
- * actual allocation.
+ * xfs_iomap_write_direct() expects the shared lock. It
+ * is unlocked on return.
*/
- xfs_iunlock(ip, lockmode);
+ if (lockmode == XFS_ILOCK_EXCL)
+ xfs_ilock_demote(ip, lockmode);
+
error = xfs_iomap_write_direct(ip, offset, size,
&imap, nimaps);
if (error)
@@ -1441,6 +1476,12 @@ __xfs_get_blocks(
goto out_unlock;
}
+ if (IS_DAX(inode) && create) {
+ ASSERT(!ISUNWRITTEN(&imap));
+ /* zeroing is not needed at a higher layer */
+ new = 0;
+ }
+
/* trim mapping down to size requested */
if (direct || size > (1 << inode->i_blkbits))
xfs_map_trim_size(inode, iblock, bh_result,
@@ -1458,7 +1499,8 @@ __xfs_get_blocks(
set_buffer_unwritten(bh_result);
/* direct IO needs special help */
if (create && direct)
- xfs_map_direct(inode, bh_result, &imap, offset);
+ xfs_map_direct(inode, bh_result, &imap, offset,
+ dax_fault);
}
/*
@@ -1505,7 +1547,7 @@ xfs_get_blocks(
struct buffer_head *bh_result,
int create)
{
- return __xfs_get_blocks(inode, iblock, bh_result, create, false);
+ return __xfs_get_blocks(inode, iblock, bh_result, create, false, false);
}
int
@@ -1515,7 +1557,17 @@ xfs_get_blocks_direct(
struct buffer_head *bh_result,
int create)
{
- return __xfs_get_blocks(inode, iblock, bh_result, create, true);
+ return __xfs_get_blocks(inode, iblock, bh_result, create, true, false);
+}
+
+int
+xfs_get_blocks_dax_fault(
+ struct inode *inode,
+ sector_t iblock,
+ struct buffer_head *bh_result,
+ int create)
+{
+ return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
}
static void
@@ -1614,45 +1666,6 @@ xfs_end_io_direct_write(
__xfs_end_io_direct_write(inode, ioend, offset, size);
}
-/*
- * For DAX we need a mapping buffer callback for unwritten extent conversion
- * when page faults allocate blocks and then zero them. Note that in this
- * case the mapping indicated by the ioend may extend beyond EOF. We most
- * definitely do not want to extend EOF here, so we trim back the ioend size to
- * EOF.
- */
-#ifdef CONFIG_FS_DAX
-void
-xfs_end_io_dax_write(
- struct buffer_head *bh,
- int uptodate)
-{
- struct xfs_ioend *ioend = bh->b_private;
- struct inode *inode = ioend->io_inode;
- ssize_t size = ioend->io_size;
-
- ASSERT(IS_DAX(ioend->io_inode));
-
- /* if there was an error zeroing, then don't convert it */
- if (!uptodate)
- ioend->io_error = -EIO;
-
- /*
- * Trim update to EOF, so we don't extend EOF during unwritten extent
- * conversion of partial EOF blocks.
- */
- spin_lock(&XFS_I(inode)->i_flags_lock);
- if (ioend->io_offset + size > i_size_read(inode))
- size = i_size_read(inode) - ioend->io_offset;
- spin_unlock(&XFS_I(inode)->i_flags_lock);
-
- __xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size);
-
-}
-#else
-void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { }
-#endif
-
static inline ssize_t
xfs_vm_do_dio(
struct inode *inode,
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 86afd1ac7895..f6ffc9ae5ceb 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -58,7 +58,8 @@ int xfs_get_blocks(struct inode *inode, sector_t offset,
struct buffer_head *map_bh, int create);
int xfs_get_blocks_direct(struct inode *inode, sector_t offset,
struct buffer_head *map_bh, int create);
-void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate);
+int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
+ struct buffer_head *map_bh, int create);
extern void xfs_count_page_state(struct page *, int *, int *);
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 65fb37a18e92..0ef7c2ed3f8a 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -511,7 +511,7 @@ xfs_attr_list_int(
xfs_inode_t *dp = context->dp;
uint lock_mode;
- XFS_STATS_INC(xs_attr_list);
+ XFS_STATS_INC(dp->i_mount, xs_attr_list);
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
return -EIO;
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 3bf4ad0d19e4..dbae6490a79a 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -57,6 +57,35 @@ xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
}
/*
+ * Routine to zero an extent on disk allocated to the specific inode.
+ *
+ * The VFS functions take a linearised filesystem block offset, so we have to
+ * convert the sparse xfs fsb to the right format first.
+ * VFS types are real funky, too.
+ */
+int
+xfs_zero_extent(
+ struct xfs_inode *ip,
+ xfs_fsblock_t start_fsb,
+ xfs_off_t count_fsb)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ xfs_daddr_t sector = xfs_fsb_to_db(ip, start_fsb);
+ sector_t block = XFS_BB_TO_FSBT(mp, sector);
+ ssize_t size = XFS_FSB_TO_B(mp, count_fsb);
+
+ if (IS_DAX(VFS_I(ip)))
+ return dax_clear_blocks(VFS_I(ip), block, size);
+
+ /*
+ * let the block layer decide on the fastest method of
+ * implementing the zeroing.
+ */
+ return sb_issue_zeroout(mp->m_super, block, count_fsb, GFP_NOFS);
+
+}
+
+/*
* Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
* caller. Frees all the extents that need freeing, which must be done
* last due to locking considerations. We never free any extents in
@@ -229,6 +258,13 @@ xfs_bmap_rtalloc(
xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
+
+ /* Zero the extent if we were asked to do so */
+ if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) {
+ error = xfs_zero_extent(ap->ip, ap->blkno, ap->length);
+ if (error)
+ return error;
+ }
} else {
ap->length = 0;
}
@@ -1027,7 +1063,7 @@ xfs_alloc_file_space(
xfs_bmap_init(&free_list, &firstfsb);
error = xfs_bmapi_write(tp, ip, startoffset_fsb,
allocatesize_fsb, alloc_type, &firstfsb,
- 0, imapp, &nimaps, &free_list);
+ resblks, imapp, &nimaps, &free_list);
if (error) {
goto error0;
}
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 8ecffb35935b..3243cdf97f33 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -201,7 +201,7 @@ _xfs_buf_alloc(
atomic_set(&bp->b_pin_count, 0);
init_waitqueue_head(&bp->b_waiters);
- XFS_STATS_INC(xb_create);
+ XFS_STATS_INC(target->bt_mount, xb_create);
trace_xfs_buf_init(bp, _RET_IP_);
return bp;
@@ -354,15 +354,16 @@ retry:
*/
if (!(++retries % 100))
xfs_err(NULL,
- "possible memory allocation deadlock in %s (mode:0x%x)",
+ "%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
+ current->comm, current->pid,
__func__, gfp_mask);
- XFS_STATS_INC(xb_page_retries);
+ XFS_STATS_INC(bp->b_target->bt_mount, xb_page_retries);
congestion_wait(BLK_RW_ASYNC, HZ/50);
goto retry;
}
- XFS_STATS_INC(xb_page_found);
+ XFS_STATS_INC(bp->b_target->bt_mount, xb_page_found);
nbytes = min_t(size_t, size, PAGE_SIZE - offset);
size -= nbytes;
@@ -516,7 +517,7 @@ _xfs_buf_find(
new_bp->b_pag = pag;
spin_unlock(&pag->pag_buf_lock);
} else {
- XFS_STATS_INC(xb_miss_locked);
+ XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
spin_unlock(&pag->pag_buf_lock);
xfs_perag_put(pag);
}
@@ -529,11 +530,11 @@ found:
if (!xfs_buf_trylock(bp)) {
if (flags & XBF_TRYLOCK) {
xfs_buf_rele(bp);
- XFS_STATS_INC(xb_busy_locked);
+ XFS_STATS_INC(btp->bt_mount, xb_busy_locked);
return NULL;
}
xfs_buf_lock(bp);
- XFS_STATS_INC(xb_get_locked_waited);
+ XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
}
/*
@@ -549,7 +550,7 @@ found:
}
trace_xfs_buf_find(bp, flags, _RET_IP_);
- XFS_STATS_INC(xb_get_locked);
+ XFS_STATS_INC(btp->bt_mount, xb_get_locked);
return bp;
}
@@ -603,7 +604,7 @@ found:
}
}
- XFS_STATS_INC(xb_get);
+ XFS_STATS_INC(target->bt_mount, xb_get);
trace_xfs_buf_get(bp, flags, _RET_IP_);
return bp;
}
@@ -643,7 +644,7 @@ xfs_buf_read_map(
trace_xfs_buf_read(bp, flags, _RET_IP_);
if (!XFS_BUF_ISDONE(bp)) {
- XFS_STATS_INC(xb_get_read);
+ XFS_STATS_INC(target->bt_mount, xb_get_read);
bp->b_ops = ops;
_xfs_buf_read(bp, flags);
} else if (flags & XBF_ASYNC) {
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index a989a9c7edb7..642d55d10075 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -666,7 +666,7 @@ xfs_readdir(
return -EIO;
ASSERT(S_ISDIR(dp->i_d.di_mode));
- XFS_STATS_INC(xs_dir_getdents);
+ XFS_STATS_INC(dp->i_mount, xs_dir_getdents);
args.dp = dp;
args.geo = dp->i_mount->m_dir_geo;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 30cb3afb67f0..7ac6c5c586cb 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -75,9 +75,9 @@ xfs_qm_dqdestroy(
ASSERT(list_empty(&dqp->q_lru));
mutex_destroy(&dqp->q_qlock);
- kmem_zone_free(xfs_qm_dqzone, dqp);
- XFS_STATS_DEC(xs_qm_dquot);
+ XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
+ kmem_zone_free(xfs_qm_dqzone, dqp);
}
/*
@@ -605,7 +605,7 @@ xfs_qm_dqread(
break;
}
- XFS_STATS_INC(xs_qm_dquot);
+ XFS_STATS_INC(mp, xs_qm_dquot);
trace_xfs_dqread(dqp);
@@ -747,12 +747,12 @@ restart:
mutex_unlock(&qi->qi_tree_lock);
trace_xfs_dqget_hit(dqp);
- XFS_STATS_INC(xs_qm_dqcachehits);
+ XFS_STATS_INC(mp, xs_qm_dqcachehits);
*O_dqpp = dqp;
return 0;
}
mutex_unlock(&qi->qi_tree_lock);
- XFS_STATS_INC(xs_qm_dqcachemisses);
+ XFS_STATS_INC(mp, xs_qm_dqcachemisses);
/*
* Dquot cache miss. We don't want to keep the inode lock across
@@ -806,7 +806,7 @@ restart:
mutex_unlock(&qi->qi_tree_lock);
trace_xfs_dqget_dup(dqp);
xfs_qm_dqdestroy(dqp);
- XFS_STATS_INC(xs_qm_dquot_dups);
+ XFS_STATS_INC(mp, xs_qm_dquot_dups);
goto restart;
}
@@ -846,7 +846,7 @@ xfs_qm_dqput(
trace_xfs_dqput_free(dqp);
if (list_lru_add(&qi->qi_lru, &dqp->q_lru))
- XFS_STATS_INC(xs_qm_dquot_unused);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused);
}
xfs_dqunlock(dqp);
}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e78feb400e22..f5392ab2def1 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -242,19 +242,30 @@ xfs_file_fsync(
}
/*
- * All metadata updates are logged, which means that we just have
- * to flush the log up to the latest LSN that touched the inode.
+ * All metadata updates are logged, which means that we just have to
+ * flush the log up to the latest LSN that touched the inode. If we have
+ * concurrent fsync/fdatasync() calls, we need them to all block on the
+ * log force before we clear the ili_fsync_fields field. This ensures
+ * that we don't get a racing sync operation that does not wait for the
+ * metadata to hit the journal before returning. If we race with
+ * clearing the ili_fsync_fields, then all that will happen is the log
+ * force will do nothing as the lsn will already be on disk. We can't
+ * race with setting ili_fsync_fields because that is done under
+ * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
+ * until after the ili_fsync_fields is cleared.
*/
xfs_ilock(ip, XFS_ILOCK_SHARED);
if (xfs_ipincount(ip)) {
if (!datasync ||
- (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP))
+ (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
lsn = ip->i_itemp->ili_last_lsn;
}
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
- if (lsn)
+ if (lsn) {
error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
+ ip->i_itemp->ili_fsync_fields = 0;
+ }
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
/*
* If we only have a single device, and the log force about was
@@ -287,7 +298,7 @@ xfs_file_read_iter(
xfs_fsize_t n;
loff_t pos = iocb->ki_pos;
- XFS_STATS_INC(xs_read_calls);
+ XFS_STATS_INC(mp, xs_read_calls);
if (unlikely(iocb->ki_flags & IOCB_DIRECT))
ioflags |= XFS_IO_ISDIRECT;
@@ -365,7 +376,7 @@ xfs_file_read_iter(
ret = generic_file_read_iter(iocb, to);
if (ret > 0)
- XFS_STATS_ADD(xs_read_bytes, ret);
+ XFS_STATS_ADD(mp, xs_read_bytes, ret);
xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
@@ -383,7 +394,7 @@ xfs_file_splice_read(
int ioflags = 0;
ssize_t ret;
- XFS_STATS_INC(xs_read_calls);
+ XFS_STATS_INC(ip->i_mount, xs_read_calls);
if (infilp->f_mode & FMODE_NOCMTIME)
ioflags |= XFS_IO_INVIS;
@@ -401,7 +412,7 @@ xfs_file_splice_read(
else
ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
if (ret > 0)
- XFS_STATS_ADD(xs_read_bytes, ret);
+ XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret);
xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
@@ -482,6 +493,8 @@ xfs_zero_eof(
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
ASSERT(offset > isize);
+ trace_xfs_zero_eof(ip, isize, offset - isize);
+
/*
* First handle zeroing the block on which isize resides.
*
@@ -574,6 +587,7 @@ xfs_file_aio_write_checks(
struct xfs_inode *ip = XFS_I(inode);
ssize_t error = 0;
size_t count = iov_iter_count(from);
+ bool drained_dio = false;
restart:
error = generic_write_checks(iocb, from);
@@ -611,12 +625,13 @@ restart:
bool zero = false;
spin_unlock(&ip->i_flags_lock);
- if (*iolock == XFS_IOLOCK_SHARED) {
- xfs_rw_iunlock(ip, *iolock);
- *iolock = XFS_IOLOCK_EXCL;
- xfs_rw_ilock(ip, *iolock);
- iov_iter_reexpand(from, count);
-
+ if (!drained_dio) {
+ if (*iolock == XFS_IOLOCK_SHARED) {
+ xfs_rw_iunlock(ip, *iolock);
+ *iolock = XFS_IOLOCK_EXCL;
+ xfs_rw_ilock(ip, *iolock);
+ iov_iter_reexpand(from, count);
+ }
/*
* We now have an IO submission barrier in place, but
* AIO can do EOF updates during IO completion and hence
@@ -626,6 +641,7 @@ restart:
* no-op.
*/
inode_dio_wait(inode);
+ drained_dio = true;
goto restart;
}
error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero);
@@ -867,7 +883,7 @@ xfs_file_write_iter(
ssize_t ret;
size_t ocount = iov_iter_count(from);
- XFS_STATS_INC(xs_write_calls);
+ XFS_STATS_INC(ip->i_mount, xs_write_calls);
if (ocount == 0)
return 0;
@@ -883,7 +899,7 @@ xfs_file_write_iter(
if (ret > 0) {
ssize_t err;
- XFS_STATS_ADD(xs_write_bytes, ret);
+ XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
/* Handle various SYNC-type writes */
err = generic_write_sync(file, iocb->ki_pos - ret, ret);
@@ -1477,7 +1493,7 @@ xfs_file_llseek(
*
* mmap_sem (MM)
* sb_start_pagefault(vfs, freeze)
- * i_mmap_lock (XFS - truncate serialisation)
+ * i_mmaplock (XFS - truncate serialisation)
* page_lock (MM)
* i_lock (XFS - extent map serialisation)
*/
@@ -1503,10 +1519,9 @@ xfs_filemap_page_mkwrite(
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (IS_DAX(inode)) {
- ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
- xfs_end_io_dax_write);
+ ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL);
} else {
- ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
+ ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
ret = block_page_mkwrite_return(ret);
}
@@ -1538,7 +1553,7 @@ xfs_filemap_fault(
* changes to xfs_get_blocks_direct() to map unwritten extent
* ioend for conversion on read-only mappings.
*/
- ret = __dax_fault(vma, vmf, xfs_get_blocks_direct, NULL);
+ ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL);
} else
ret = filemap_fault(vma, vmf);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1546,6 +1561,13 @@ xfs_filemap_fault(
return ret;
}
+/*
+ * Similar to xfs_filemap_fault(), the DAX fault path can call into here on
+ * both read and write faults. Hence we need to handle both cases. There is no
+ * ->pmd_mkwrite callout for huge pages, so we have a single function here to
+ * handle both cases here. @flags carries the information on the type of fault
+ * occuring.
+ */
STATIC int
xfs_filemap_pmd_fault(
struct vm_area_struct *vma,
@@ -1562,15 +1584,54 @@ xfs_filemap_pmd_fault(
trace_xfs_filemap_pmd_fault(ip);
- sb_start_pagefault(inode->i_sb);
- file_update_time(vma->vm_file);
+ if (flags & FAULT_FLAG_WRITE) {
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+ }
+
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_direct,
- xfs_end_io_dax_write);
+ ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault,
+ NULL);
xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
- sb_end_pagefault(inode->i_sb);
+ if (flags & FAULT_FLAG_WRITE)
+ sb_end_pagefault(inode->i_sb);
+
+ return ret;
+}
+
+/*
+ * pfn_mkwrite was originally inteneded to ensure we capture time stamp
+ * updates on write faults. In reality, it's need to serialise against
+ * truncate similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite()
+ * here and cycle the XFS_MMAPLOCK_SHARED to ensure we serialise the fault
+ * barrier in place.
+ */
+static int
+xfs_filemap_pfn_mkwrite(
+ struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+
+ struct inode *inode = file_inode(vma->vm_file);
+ struct xfs_inode *ip = XFS_I(inode);
+ int ret = VM_FAULT_NOPAGE;
+ loff_t size;
+
+ trace_xfs_filemap_pfn_mkwrite(ip);
+
+ sb_start_pagefault(inode->i_sb);
+ file_update_time(vma->vm_file);
+
+ /* check if the faulting page hasn't raced with truncate */
+ xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+ size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ if (vmf->pgoff >= size)
+ ret = VM_FAULT_SIGBUS;
+ xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+ sb_end_pagefault(inode->i_sb);
return ret;
+
}
static const struct vm_operations_struct xfs_file_vm_ops = {
@@ -1578,6 +1639,7 @@ static const struct vm_operations_struct xfs_file_vm_ops = {
.pmd_fault = xfs_filemap_pmd_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = xfs_filemap_page_mkwrite,
+ .pfn_mkwrite = xfs_filemap_pfn_mkwrite,
};
STATIC int
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 0a326bd64d4e..d7a490f24ead 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -63,7 +63,7 @@ xfs_inode_alloc(
return NULL;
}
- XFS_STATS_INC(vn_active);
+ XFS_STATS_INC(mp, vn_active);
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!spin_is_locked(&ip->i_flags_lock));
ASSERT(!xfs_isiflocked(ip));
@@ -129,7 +129,7 @@ xfs_inode_free(
/* asserts to verify all state is correct here */
ASSERT(atomic_read(&ip->i_pincount) == 0);
ASSERT(!xfs_isiflocked(ip));
- XFS_STATS_DEC(vn_active);
+ XFS_STATS_DEC(ip->i_mount, vn_active);
call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
}
@@ -159,7 +159,7 @@ xfs_iget_cache_hit(
spin_lock(&ip->i_flags_lock);
if (ip->i_ino != ino) {
trace_xfs_iget_skip(ip);
- XFS_STATS_INC(xs_ig_frecycle);
+ XFS_STATS_INC(mp, xs_ig_frecycle);
error = -EAGAIN;
goto out_error;
}
@@ -177,7 +177,7 @@ xfs_iget_cache_hit(
*/
if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
trace_xfs_iget_skip(ip);
- XFS_STATS_INC(xs_ig_frecycle);
+ XFS_STATS_INC(mp, xs_ig_frecycle);
error = -EAGAIN;
goto out_error;
}
@@ -259,7 +259,7 @@ xfs_iget_cache_hit(
xfs_ilock(ip, lock_flags);
xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
- XFS_STATS_INC(xs_ig_found);
+ XFS_STATS_INC(mp, xs_ig_found);
return 0;
@@ -342,7 +342,7 @@ xfs_iget_cache_miss(
error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
if (unlikely(error)) {
WARN_ON(error != -EEXIST);
- XFS_STATS_INC(xs_ig_dup);
+ XFS_STATS_INC(mp, xs_ig_dup);
error = -EAGAIN;
goto out_preload_end;
}
@@ -412,7 +412,7 @@ xfs_iget(
if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
return -EINVAL;
- XFS_STATS_INC(xs_ig_attempts);
+ XFS_STATS_INC(mp, xs_ig_attempts);
/* get the perag structure and ensure that it's inode capable */
pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
@@ -429,7 +429,7 @@ again:
goto out_error_or_again;
} else {
rcu_read_unlock();
- XFS_STATS_INC(xs_ig_missed);
+ XFS_STATS_INC(mp, xs_ig_missed);
error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
flags, lock_flags);
@@ -965,7 +965,7 @@ reclaim:
xfs_ifunlock(ip);
xfs_iunlock(ip, XFS_ILOCK_EXCL);
- XFS_STATS_INC(xs_ig_reclaims);
+ XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
/*
* Remove the inode from the per-AG radix tree.
*
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index dc40a6d5ae0d..8ee393996b7d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2365,6 +2365,7 @@ retry:
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
+ iip->ili_fsync_fields = 0;
iip->ili_logged = 1;
xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
&iip->ili_item.li_lsn);
@@ -3271,8 +3272,8 @@ xfs_iflush_cluster(
}
if (clcount) {
- XFS_STATS_INC(xs_icluster_flushcnt);
- XFS_STATS_ADD(xs_icluster_flushinode, clcount);
+ XFS_STATS_INC(mp, xs_icluster_flushcnt);
+ XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
}
out_free:
@@ -3345,7 +3346,7 @@ xfs_iflush(
struct xfs_dinode *dip;
int error;
- XFS_STATS_INC(xs_iflush_count);
+ XFS_STATS_INC(mp, xs_iflush_count);
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
ASSERT(xfs_isiflocked(ip));
@@ -3560,6 +3561,7 @@ xfs_iflush_int(
*/
iip->ili_last_fields = iip->ili_fields;
iip->ili_fields = 0;
+ iip->ili_fsync_fields = 0;
iip->ili_logged = 1;
xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 62bd80f4edd9..d14b12b8cfef 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -719,6 +719,7 @@ xfs_iflush_abort(
* attempted.
*/
iip->ili_fields = 0;
+ iip->ili_fsync_fields = 0;
}
/*
* Release the inode's flush lock since we're done with it.
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 488d81254e28..4c7722e325b3 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -34,6 +34,7 @@ typedef struct xfs_inode_log_item {
unsigned short ili_logged; /* flushed logged data */
unsigned int ili_last_fields; /* fields when flushed */
unsigned int ili_fields; /* fields to be logged */
+ unsigned int ili_fsync_fields; /* logged since last fsync */
} xfs_inode_log_item_t;
static inline int xfs_inode_clean(xfs_inode_t *ip)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index ea7d85af5310..d42738deec6d 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -40,6 +40,7 @@
#include "xfs_symlink.h"
#include "xfs_trans.h"
#include "xfs_pnfs.h"
+#include "xfs_acl.h"
#include <linux/capability.h>
#include <linux/dcache.h>
@@ -411,7 +412,7 @@ xfs_attrlist_by_handle(
if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t)))
return -EFAULT;
if (al_hreq.buflen < sizeof(struct attrlist) ||
- al_hreq.buflen > XATTR_LIST_MAX)
+ al_hreq.buflen > XFS_XATTR_LIST_MAX)
return -EINVAL;
/*
@@ -455,7 +456,7 @@ xfs_attrmulti_attr_get(
unsigned char *kbuf;
int error = -EFAULT;
- if (*len > XATTR_SIZE_MAX)
+ if (*len > XFS_XATTR_SIZE_MAX)
return -EINVAL;
kbuf = kmem_zalloc_large(*len, KM_SLEEP);
if (!kbuf)
@@ -482,17 +483,22 @@ xfs_attrmulti_attr_set(
__uint32_t flags)
{
unsigned char *kbuf;
+ int error;
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return -EPERM;
- if (len > XATTR_SIZE_MAX)
+ if (len > XFS_XATTR_SIZE_MAX)
return -EINVAL;
kbuf = memdup_user(ubuf, len);
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
- return xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
+ error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
+ if (!error)
+ xfs_forget_acl(inode, name, flags);
+ kfree(kbuf);
+ return error;
}
int
@@ -501,9 +507,14 @@ xfs_attrmulti_attr_remove(
unsigned char *name,
__uint32_t flags)
{
+ int error;
+
if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
return -EPERM;
- return xfs_attr_remove(XFS_I(inode), name, flags);
+ error = xfs_attr_remove(XFS_I(inode), name, flags);
+ if (!error)
+ xfs_forget_acl(inode, name, flags);
+ return error;
}
STATIC int
@@ -1028,7 +1039,7 @@ xfs_ioctl_setattr_xflags(
xfs_diflags_to_linux(ip);
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- XFS_STATS_INC(xs_ig_attrchg);
+ XFS_STATS_INC(mp, xs_ig_attrchg);
return 0;
}
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index b88bdc85dd3d..1a05d8ae327d 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -356,7 +356,7 @@ xfs_compat_attrlist_by_handle(
sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
return -EFAULT;
if (al_hreq.buflen < sizeof(struct attrlist) ||
- al_hreq.buflen > XATTR_LIST_MAX)
+ al_hreq.buflen > XFS_XATTR_LIST_MAX)
return -EINVAL;
/*
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 1f86033171c8..f4f5b43cf647 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -131,20 +131,30 @@ xfs_iomap_write_direct(
uint qblocks, resblks, resrtextents;
int committed;
int error;
-
- error = xfs_qm_dqattach(ip, 0);
- if (error)
- return error;
+ int lockmode;
+ int bmapi_flags = XFS_BMAPI_PREALLOC;
rt = XFS_IS_REALTIME_INODE(ip);
extsz = xfs_get_extsz_hint(ip);
+ lockmode = XFS_ILOCK_SHARED; /* locked by caller */
+
+ ASSERT(xfs_isilocked(ip, lockmode));
offset_fsb = XFS_B_TO_FSBT(mp, offset);
last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
if ((offset + count) > XFS_ISIZE(ip)) {
+ /*
+ * Assert that the in-core extent list is present since this can
+ * call xfs_iread_extents() and we only have the ilock shared.
+ * This should be safe because the lock was held around a bmapi
+ * call in the caller and we only need it to access the in-core
+ * list.
+ */
+ ASSERT(XFS_IFORK_PTR(ip, XFS_DATA_FORK)->if_flags &
+ XFS_IFEXTENTS);
error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
if (error)
- return error;
+ goto out_unlock;
} else {
if (nmaps && (imap->br_startblock == HOLESTARTBLOCK))
last_fsb = MIN(last_fsb, (xfs_fileoff_t)
@@ -174,9 +184,35 @@ xfs_iomap_write_direct(
}
/*
+ * Drop the shared lock acquired by the caller, attach the dquot if
+ * necessary and move on to transaction setup.
+ */
+ xfs_iunlock(ip, lockmode);
+ error = xfs_qm_dqattach(ip, 0);
+ if (error)
+ return error;
+
+ /*
* Allocate and setup the transaction
*/
tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
+
+ /*
+ * For DAX, we do not allocate unwritten extents, but instead we zero
+ * the block before we commit the transaction. Ideally we'd like to do
+ * this outside the transaction context, but if we commit and then crash
+ * we may not have zeroed the blocks and this will be exposed on
+ * recovery of the allocation. Hence we must zero before commit.
+ * Further, if we are mapping unwritten extents here, we need to zero
+ * and convert them to written so that we don't need an unwritten extent
+ * callback for DAX. This also means that we need to be able to dip into
+ * the reserve block pool if there is no space left but we need to do
+ * unwritten extent conversion.
+ */
+ if (IS_DAX(VFS_I(ip))) {
+ bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
+ tp->t_flags |= XFS_TRANS_RESERVE;
+ }
error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
resblks, resrtextents);
/*
@@ -187,7 +223,8 @@ xfs_iomap_write_direct(
return error;
}
- xfs_ilock(ip, XFS_ILOCK_EXCL);
+ lockmode = XFS_ILOCK_EXCL;
+ xfs_ilock(ip, lockmode);
error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks, 0, quota_flag);
if (error)
@@ -202,8 +239,8 @@ xfs_iomap_write_direct(
xfs_bmap_init(&free_list, &firstfsb);
nimaps = 1;
error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
- XFS_BMAPI_PREALLOC, &firstfsb, 0,
- imap, &nimaps, &free_list);
+ bmapi_flags, &firstfsb, resblks, imap,
+ &nimaps, &free_list);
if (error)
goto out_bmap_cancel;
@@ -213,6 +250,7 @@ xfs_iomap_write_direct(
error = xfs_bmap_finish(&tp, &free_list, &committed);
if (error)
goto out_bmap_cancel;
+
error = xfs_trans_commit(tp);
if (error)
goto out_unlock;
@@ -229,7 +267,7 @@ xfs_iomap_write_direct(
error = xfs_alert_fsblock_zero(ip, imap);
out_unlock:
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_iunlock(ip, lockmode);
return error;
out_bmap_cancel:
@@ -670,7 +708,7 @@ xfs_iomap_write_allocate(
count_fsb = imap->br_blockcount;
map_start_fsb = imap->br_startoff;
- XFS_STATS_ADD(xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
+ XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, count_fsb));
while (count_fsb != 0) {
/*
@@ -750,9 +788,9 @@ xfs_iomap_write_allocate(
* pointer that the caller gave to us.
*/
error = xfs_bmapi_write(tp, ip, map_start_fsb,
- count_fsb, 0,
- &first_block, 1,
- imap, &nimaps, &free_list);
+ count_fsb, 0, &first_block,
+ nres, imap, &nimaps,
+ &free_list);
if (error)
goto trans_cancel;
@@ -777,7 +815,7 @@ xfs_iomap_write_allocate(
if ((offset_fsb >= imap->br_startoff) &&
(offset_fsb < (imap->br_startoff +
imap->br_blockcount))) {
- XFS_STATS_INC(xs_xstrat_quick);
+ XFS_STATS_INC(mp, xs_xstrat_quick);
return 0;
}
@@ -866,8 +904,8 @@ xfs_iomap_write_unwritten(
xfs_bmap_init(&free_list, &firstfsb);
nimaps = 1;
error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb,
- XFS_BMAPI_CONVERT, &firstfsb,
- 1, &imap, &nimaps, &free_list);
+ XFS_BMAPI_CONVERT, &firstfsb, resblks,
+ &imap, &nimaps, &free_list);
if (error)
goto error_on_bmapi_transaction;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 8294132e6a3c..245268a0cdf0 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -695,7 +695,7 @@ xfs_setattr_nonsize(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- XFS_STATS_INC(xs_ig_attrchg);
+ XFS_STATS_INC(mp, xs_ig_attrchg);
if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(tp);
@@ -922,7 +922,7 @@ xfs_setattr_size(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- XFS_STATS_INC(xs_ig_attrchg);
+ XFS_STATS_INC(mp, xs_ig_attrchg);
if (mp->m_flags & XFS_MOUNT_WSYNC)
xfs_trans_set_sync(tp);
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 85f883dd6207..ec0e239a0fa9 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -171,6 +171,13 @@ struct xfs_kobj {
struct completion complete;
};
+struct xstats {
+ struct xfsstats __percpu *xs_stats;
+ struct xfs_kobj xs_kobj;
+};
+
+extern struct xstats xfsstats;
+
/* Kernel uid/gid conversion. These are used to convert to/from the on disk
* uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally.
* The conversion here is type only, the value will remain the same since we
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index aaadee0969c9..f52c72a1a06f 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -268,7 +268,7 @@ xlog_grant_head_wait(
__set_current_state(TASK_UNINTERRUPTIBLE);
spin_unlock(&head->lock);
- XFS_STATS_INC(xs_sleep_logspace);
+ XFS_STATS_INC(log->l_mp, xs_sleep_logspace);
trace_xfs_log_grant_sleep(log, tic);
schedule();
@@ -379,7 +379,7 @@ xfs_log_regrant(
if (XLOG_FORCED_SHUTDOWN(log))
return -EIO;
- XFS_STATS_INC(xs_try_logspace);
+ XFS_STATS_INC(mp, xs_try_logspace);
/*
* This is a new transaction on the ticket, so we need to change the
@@ -448,7 +448,7 @@ xfs_log_reserve(
if (XLOG_FORCED_SHUTDOWN(log))
return -EIO;
- XFS_STATS_INC(xs_try_logspace);
+ XFS_STATS_INC(mp, xs_try_logspace);
ASSERT(*ticp == NULL);
tic = xlog_ticket_alloc(log, unit_bytes, cnt, client, permanent,
@@ -1768,7 +1768,7 @@ xlog_sync(
int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
int size;
- XFS_STATS_INC(xs_log_writes);
+ XFS_STATS_INC(log->l_mp, xs_log_writes);
ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
/* Add for LR header */
@@ -1805,7 +1805,7 @@ xlog_sync(
bp = iclog->ic_bp;
XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)));
- XFS_STATS_ADD(xs_log_blocks, BTOBB(count));
+ XFS_STATS_ADD(log->l_mp, xs_log_blocks, BTOBB(count));
/* Do we need to split this write into 2 parts? */
if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
@@ -2422,11 +2422,20 @@ xlog_write(
&partial_copy_len);
xlog_verify_dest_ptr(log, ptr);
- /* copy region */
+ /*
+ * Copy region.
+ *
+ * Unmount records just log an opheader, so can have
+ * empty payloads with no data region to copy. Hence we
+ * only copy the payload if the vector says it has data
+ * to copy.
+ */
ASSERT(copy_len >= 0);
- memcpy(ptr, reg->i_addr + copy_off, copy_len);
- xlog_write_adv_cnt(&ptr, &len, &log_offset, copy_len);
-
+ if (copy_len > 0) {
+ memcpy(ptr, reg->i_addr + copy_off, copy_len);
+ xlog_write_adv_cnt(&ptr, &len, &log_offset,
+ copy_len);
+ }
copy_len += start_rec_copy + sizeof(xlog_op_header_t);
record_cnt++;
data_cnt += contwr ? copy_len : 0;
@@ -2913,7 +2922,7 @@ restart:
iclog = log->l_iclog;
if (iclog->ic_state != XLOG_STATE_ACTIVE) {
- XFS_STATS_INC(xs_log_noiclogs);
+ XFS_STATS_INC(log->l_mp, xs_log_noiclogs);
/* Wait for log writes to have flushed */
xlog_wait(&log->l_flush_wait, &log->l_icloglock);
@@ -3165,11 +3174,19 @@ xlog_state_switch_iclogs(
}
if (log->l_curr_block >= log->l_logBBsize) {
+ /*
+ * Rewind the current block before the cycle is bumped to make
+ * sure that the combined LSN never transiently moves forward
+ * when the log wraps to the next cycle. This is to support the
+ * unlocked sample of these fields from xlog_valid_lsn(). Most
+ * other cases should acquire l_icloglock.
+ */
+ log->l_curr_block -= log->l_logBBsize;
+ ASSERT(log->l_curr_block >= 0);
+ smp_wmb();
log->l_curr_cycle++;
if (log->l_curr_cycle == XLOG_HEADER_MAGIC_NUM)
log->l_curr_cycle++;
- log->l_curr_block -= log->l_logBBsize;
- ASSERT(log->l_curr_block >= 0);
}
ASSERT(iclog == log->l_iclog);
log->l_iclog = iclog->ic_next;
@@ -3212,7 +3229,7 @@ _xfs_log_force(
struct xlog_in_core *iclog;
xfs_lsn_t lsn;
- XFS_STATS_INC(xs_log_force);
+ XFS_STATS_INC(mp, xs_log_force);
xlog_cil_force(log);
@@ -3297,7 +3314,7 @@ maybe_sleep:
spin_unlock(&log->l_icloglock);
return -EIO;
}
- XFS_STATS_INC(xs_log_force_sleep);
+ XFS_STATS_INC(mp, xs_log_force_sleep);
xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
/*
* No need to grab the log lock here since we're
@@ -3362,7 +3379,7 @@ _xfs_log_force_lsn(
ASSERT(lsn != 0);
- XFS_STATS_INC(xs_log_force);
+ XFS_STATS_INC(mp, xs_log_force);
lsn = xlog_cil_force_lsn(log, lsn);
if (lsn == NULLCOMMITLSN)
@@ -3411,7 +3428,7 @@ try_again:
(XLOG_STATE_WANT_SYNC | XLOG_STATE_SYNCING))) {
ASSERT(!(iclog->ic_state & XLOG_STATE_IOERROR));
- XFS_STATS_INC(xs_log_force_sleep);
+ XFS_STATS_INC(mp, xs_log_force_sleep);
xlog_wait(&iclog->ic_prev->ic_write_wait,
&log->l_icloglock);
@@ -3441,7 +3458,7 @@ try_again:
spin_unlock(&log->l_icloglock);
return -EIO;
}
- XFS_STATS_INC(xs_log_force_sleep);
+ XFS_STATS_INC(mp, xs_log_force_sleep);
xlog_wait(&iclog->ic_force_wait, &log->l_icloglock);
/*
* No need to grab the log lock here since we're
@@ -4023,3 +4040,45 @@ xlog_iclogs_empty(
return 1;
}
+/*
+ * Verify that an LSN stamped into a piece of metadata is valid. This is
+ * intended for use in read verifiers on v5 superblocks.
+ */
+bool
+xfs_log_check_lsn(
+ struct xfs_mount *mp,
+ xfs_lsn_t lsn)
+{
+ struct xlog *log = mp->m_log;
+ bool valid;
+
+ /*
+ * norecovery mode skips mount-time log processing and unconditionally
+ * resets the in-core LSN. We can't validate in this mode, but
+ * modifications are not allowed anyways so just return true.
+ */
+ if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+ return true;
+
+ /*
+ * Some metadata LSNs are initialized to NULL (e.g., the agfl). This is
+ * handled by recovery and thus safe to ignore here.
+ */
+ if (lsn == NULLCOMMITLSN)
+ return true;
+
+ valid = xlog_valid_lsn(mp->m_log, lsn);
+
+ /* warn the user about what's gone wrong before verifier failure */
+ if (!valid) {
+ spin_lock(&log->l_icloglock);
+ xfs_warn(mp,
+"Corruption warning: Metadata has LSN (%d:%d) ahead of current LSN (%d:%d). "
+"Please unmount and run xfs_repair (>= v4.3) to resolve.",
+ CYCLE_LSN(lsn), BLOCK_LSN(lsn),
+ log->l_curr_cycle, log->l_curr_block);
+ spin_unlock(&log->l_icloglock);
+ }
+
+ return valid;
+}
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 09d91d3166cd..aa533a7d50f2 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -181,5 +181,6 @@ bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
void xfs_log_work_queue(struct xfs_mount *mp);
void xfs_log_worker(struct work_struct *work);
void xfs_log_quiesce(struct xfs_mount *mp);
+bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 950f3f94720c..8daba7491b13 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -560,4 +560,55 @@ static inline void xlog_wait(wait_queue_head_t *wq, spinlock_t *lock)
remove_wait_queue(wq, &wait);
}
+/*
+ * The LSN is valid so long as it is behind the current LSN. If it isn't, this
+ * means that the next log record that includes this metadata could have a
+ * smaller LSN. In turn, this means that the modification in the log would not
+ * replay.
+ */
+static inline bool
+xlog_valid_lsn(
+ struct xlog *log,
+ xfs_lsn_t lsn)
+{
+ int cur_cycle;
+ int cur_block;
+ bool valid = true;
+
+ /*
+ * First, sample the current lsn without locking to avoid added
+ * contention from metadata I/O. The current cycle and block are updated
+ * (in xlog_state_switch_iclogs()) and read here in a particular order
+ * to avoid false negatives (e.g., thinking the metadata LSN is valid
+ * when it is not).
+ *
+ * The current block is always rewound before the cycle is bumped in
+ * xlog_state_switch_iclogs() to ensure the current LSN is never seen in
+ * a transiently forward state. Instead, we can see the LSN in a
+ * transiently behind state if we happen to race with a cycle wrap.
+ */
+ cur_cycle = ACCESS_ONCE(log->l_curr_cycle);
+ smp_rmb();
+ cur_block = ACCESS_ONCE(log->l_curr_block);
+
+ if ((CYCLE_LSN(lsn) > cur_cycle) ||
+ (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block)) {
+ /*
+ * If the metadata LSN appears invalid, it's possible the check
+ * above raced with a wrap to the next log cycle. Grab the lock
+ * to check for sure.
+ */
+ spin_lock(&log->l_icloglock);
+ cur_cycle = log->l_curr_cycle;
+ cur_block = log->l_curr_block;
+ spin_unlock(&log->l_icloglock);
+
+ if ((CYCLE_LSN(lsn) > cur_cycle) ||
+ (CYCLE_LSN(lsn) == cur_cycle && BLOCK_LSN(lsn) > cur_block))
+ valid = false;
+ }
+
+ return valid;
+}
+
#endif /* __XFS_LOG_PRIV_H__ */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 512a0945d52a..c5ecaacdd218 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -3431,7 +3431,7 @@ xlog_recover_add_to_cont_trans(
* previous record. Copy the rest of the header.
*/
if (list_empty(&trans->r_itemq)) {
- ASSERT(len < sizeof(struct xfs_trans_header));
+ ASSERT(len <= sizeof(struct xfs_trans_header));
if (len > sizeof(struct xfs_trans_header)) {
xfs_warn(log->l_mp, "%s: bad header length", __func__);
return -EIO;
@@ -4609,9 +4609,19 @@ xlog_recover(
int error;
/* find the tail of the log */
- if ((error = xlog_find_tail(log, &head_blk, &tail_blk)))
+ error = xlog_find_tail(log, &head_blk, &tail_blk);
+ if (error)
return error;
+ /*
+ * The superblock was read before the log was available and thus the LSN
+ * could not be verified. Check the superblock LSN against the current
+ * LSN now that it's known.
+ */
+ if (xfs_sb_version_hascrc(&log->l_mp->m_sb) &&
+ !xfs_log_check_lsn(log->l_mp, log->l_mp->m_sb.sb_lsn))
+ return -EINVAL;
+
if (tail_blk != head_blk) {
/* There used to be a comment here:
*
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index d8b67547ab34..11792d888e4e 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -17,6 +17,7 @@
#include "xfs.h"
#include "xfs_fs.h"
+#include "xfs_error.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
@@ -43,6 +44,7 @@ void func(const struct xfs_mount *mp, const char *fmt, ...) \
{ \
struct va_format vaf; \
va_list args; \
+ int level; \
\
va_start(args, fmt); \
\
@@ -51,6 +53,11 @@ void func(const struct xfs_mount *mp, const char *fmt, ...) \
\
__xfs_printk(kern_level, mp, &vaf); \
va_end(args); \
+ \
+ if (!kstrtoint(kern_level, 0, &level) && \
+ level <= LOGLEVEL_ERR && \
+ xfs_error_level >= XFS_ERRLEVEL_HIGH) \
+ xfs_stack_trace(); \
} \
define_xfs_printk_level(xfs_emerg, KERN_EMERG);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index bf92e0c037c7..bb753b359bee 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -47,6 +47,16 @@ static DEFINE_MUTEX(xfs_uuid_table_mutex);
static int xfs_uuid_table_size;
static uuid_t *xfs_uuid_table;
+void
+xfs_uuid_table_free(void)
+{
+ if (xfs_uuid_table_size == 0)
+ return;
+ kmem_free(xfs_uuid_table);
+ xfs_uuid_table = NULL;
+ xfs_uuid_table_size = 0;
+}
+
/*
* See if the UUID is unique among mounted XFS filesystems.
* Mount fails if UUID is nil or a FS with the same UUID is already mounted.
@@ -693,10 +703,15 @@ xfs_mountfs(
if (error)
goto out;
- error = xfs_uuid_mount(mp);
+ error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype,
+ &mp->m_kobj, "stats");
if (error)
goto out_remove_sysfs;
+ error = xfs_uuid_mount(mp);
+ if (error)
+ goto out_del_stats;
+
/*
* Set the minimum read and write sizes
*/
@@ -971,6 +986,8 @@ xfs_mountfs(
xfs_da_unmount(mp);
out_remove_uuid:
xfs_uuid_unmount(mp);
+ out_del_stats:
+ xfs_sysfs_del(&mp->m_stats.xs_kobj);
out_remove_sysfs:
xfs_sysfs_del(&mp->m_kobj);
out:
@@ -1047,6 +1064,7 @@ xfs_unmountfs(
xfs_warn(mp, "Unable to update superblock counters. "
"Freespace may not be correct on next mount.");
+
xfs_log_unmount(mp);
xfs_da_unmount(mp);
xfs_uuid_unmount(mp);
@@ -1056,6 +1074,7 @@ xfs_unmountfs(
#endif
xfs_free_perag(mp);
+ xfs_sysfs_del(&mp->m_stats.xs_kobj);
xfs_sysfs_del(&mp->m_kobj);
}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 7999e91cd49a..b57098481c10 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -127,6 +127,7 @@ typedef struct xfs_mount {
int64_t m_low_space[XFS_LOWSP_MAX];
/* low free space thresholds */
struct xfs_kobj m_kobj;
+ struct xstats m_stats; /* per-fs stats */
struct workqueue_struct *m_buf_workqueue;
struct workqueue_struct *m_data_workqueue;
@@ -312,6 +313,7 @@ typedef struct xfs_perag {
int pagb_count; /* pagb slots in use */
} xfs_perag_t;
+extern void xfs_uuid_table_free(void);
extern int xfs_log_sbcount(xfs_mount_t *);
extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
extern int xfs_mountfs(xfs_mount_t *mp);
@@ -336,4 +338,7 @@ extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
extern void xfs_set_low_space_thresholds(struct xfs_mount *);
+int xfs_zero_extent(struct xfs_inode *ip, xfs_fsblock_t start_fsb,
+ xfs_off_t count_fsb);
+
#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index ab4a6066f7ca..dc6221942b85 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -181,6 +181,11 @@ xfs_fs_map_blocks(
ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) {
+ /*
+ * xfs_iomap_write_direct() expects to take ownership of
+ * the shared ilock.
+ */
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
error = xfs_iomap_write_direct(ip, offset, length,
&imap, nimaps);
if (error)
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index eac9549efd52..532ab79d38fe 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -184,7 +184,7 @@ xfs_qm_dqpurge(
*/
ASSERT(!list_empty(&dqp->q_lru));
list_lru_del(&qi->qi_lru, &dqp->q_lru);
- XFS_STATS_DEC(xs_qm_dquot_unused);
+ XFS_STATS_DEC(mp, xs_qm_dquot_unused);
xfs_qm_dqdestroy(dqp);
return 0;
@@ -448,11 +448,11 @@ xfs_qm_dquot_isolate(
*/
if (dqp->q_nrefs) {
xfs_dqunlock(dqp);
- XFS_STATS_INC(xs_qm_dqwants);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dqwants);
trace_xfs_dqreclaim_want(dqp);
list_lru_isolate(lru, &dqp->q_lru);
- XFS_STATS_DEC(xs_qm_dquot_unused);
+ XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
return LRU_REMOVED;
}
@@ -496,19 +496,19 @@ xfs_qm_dquot_isolate(
ASSERT(dqp->q_nrefs == 0);
list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose);
- XFS_STATS_DEC(xs_qm_dquot_unused);
+ XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
trace_xfs_dqreclaim_done(dqp);
- XFS_STATS_INC(xs_qm_dqreclaims);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaims);
return LRU_REMOVED;
out_miss_busy:
trace_xfs_dqreclaim_busy(dqp);
- XFS_STATS_INC(xs_qm_dqreclaim_misses);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
return LRU_SKIP;
out_unlock_dirty:
trace_xfs_dqreclaim_busy(dqp);
- XFS_STATS_INC(xs_qm_dqreclaim_misses);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
xfs_dqunlock(dqp);
spin_lock(lru_lock);
return LRU_RETRY;
@@ -525,7 +525,7 @@ xfs_qm_shrink_scan(
unsigned long freed;
int error;
- if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
+ if ((sc->gfp_mask & (__GFP_FS|__GFP_DIRECT_RECLAIM)) != (__GFP_FS|__GFP_DIRECT_RECLAIM))
return 0;
INIT_LIST_HEAD(&isol.buffers);
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index f2240383d4bb..8686df6c7609 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -18,20 +18,21 @@
#include "xfs.h"
#include <linux/proc_fs.h>
-DEFINE_PER_CPU(struct xfsstats, xfsstats);
+struct xstats xfsstats;
-static int counter_val(int idx)
+static int counter_val(struct xfsstats __percpu *stats, int idx)
{
int val = 0, cpu;
for_each_possible_cpu(cpu)
- val += *(((__u32 *)&per_cpu(xfsstats, cpu) + idx));
+ val += *(((__u32 *)per_cpu_ptr(stats, cpu) + idx));
return val;
}
-static int xfs_stat_proc_show(struct seq_file *m, void *v)
+int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
{
int i, j;
+ int len = 0;
__uint64_t xs_xstrat_bytes = 0;
__uint64_t xs_write_bytes = 0;
__uint64_t xs_read_bytes = 0;
@@ -65,54 +66,59 @@ static int xfs_stat_proc_show(struct seq_file *m, void *v)
};
/* Loop over all stats groups */
+
for (i = j = 0; i < ARRAY_SIZE(xstats); i++) {
- seq_printf(m, "%s", xstats[i].desc);
+ len += snprintf(buf + len, PATH_MAX - len, "%s",
+ xstats[i].desc);
/* inner loop does each group */
for (; j < xstats[i].endpoint; j++)
- seq_printf(m, " %u", counter_val(j));
- seq_putc(m, '\n');
+ len += snprintf(buf + len, PATH_MAX - len, " %u",
+ counter_val(stats, j));
+ len += snprintf(buf + len, PATH_MAX - len, "\n");
}
/* extra precision counters */
for_each_possible_cpu(i) {
- xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes;
- xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes;
- xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
+ xs_xstrat_bytes += per_cpu_ptr(stats, i)->xs_xstrat_bytes;
+ xs_write_bytes += per_cpu_ptr(stats, i)->xs_write_bytes;
+ xs_read_bytes += per_cpu_ptr(stats, i)->xs_read_bytes;
}
- seq_printf(m, "xpc %Lu %Lu %Lu\n",
+ len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n",
xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
- seq_printf(m, "debug %u\n",
+ len += snprintf(buf + len, PATH_MAX-len, "debug %u\n",
#if defined(DEBUG)
1);
#else
0);
#endif
- return 0;
+
+ return len;
}
-static int xfs_stat_proc_open(struct inode *inode, struct file *file)
+void xfs_stats_clearall(struct xfsstats __percpu *stats)
{
- return single_open(file, xfs_stat_proc_show, NULL);
+ int c;
+ __uint32_t vn_active;
+
+ xfs_notice(NULL, "Clearing xfsstats");
+ for_each_possible_cpu(c) {
+ preempt_disable();
+ /* save vn_active, it's a universal truth! */
+ vn_active = per_cpu_ptr(stats, c)->vn_active;
+ memset(per_cpu_ptr(stats, c), 0, sizeof(*stats));
+ per_cpu_ptr(stats, c)->vn_active = vn_active;
+ preempt_enable();
+ }
}
-static const struct file_operations xfs_stat_proc_fops = {
- .owner = THIS_MODULE,
- .open = xfs_stat_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
-
/* legacy quota interfaces */
#ifdef CONFIG_XFS_QUOTA
static int xqm_proc_show(struct seq_file *m, void *v)
{
/* maximum; incore; ratio free to inuse; freelist */
seq_printf(m, "%d\t%d\t%d\t%u\n",
- 0,
- counter_val(XFSSTAT_END_XQMSTAT),
- 0,
- counter_val(XFSSTAT_END_XQMSTAT + 1));
+ 0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT),
+ 0, counter_val(xfsstats.xs_stats, XFSSTAT_END_XQMSTAT + 1));
return 0;
}
@@ -136,7 +142,7 @@ static int xqmstat_proc_show(struct seq_file *m, void *v)
seq_printf(m, "qm");
for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++)
- seq_printf(m, " %u", counter_val(j));
+ seq_printf(m, " %u", counter_val(xfsstats.xs_stats, j));
seq_putc(m, '\n');
return 0;
}
@@ -155,44 +161,35 @@ static const struct file_operations xqmstat_proc_fops = {
};
#endif /* CONFIG_XFS_QUOTA */
+#ifdef CONFIG_PROC_FS
int
xfs_init_procfs(void)
{
if (!proc_mkdir("fs/xfs", NULL))
+ return -ENOMEM;
+
+ if (!proc_symlink("fs/xfs/stat", NULL,
+ "/sys/fs/xfs/stats/stats"))
goto out;
- if (!proc_create("fs/xfs/stat", 0, NULL,
- &xfs_stat_proc_fops))
- goto out_remove_xfs_dir;
#ifdef CONFIG_XFS_QUOTA
if (!proc_create("fs/xfs/xqmstat", 0, NULL,
&xqmstat_proc_fops))
- goto out_remove_stat_file;
+ goto out;
if (!proc_create("fs/xfs/xqm", 0, NULL,
&xqm_proc_fops))
- goto out_remove_xqmstat_file;
+ goto out;
#endif
return 0;
-#ifdef CONFIG_XFS_QUOTA
- out_remove_xqmstat_file:
- remove_proc_entry("fs/xfs/xqmstat", NULL);
- out_remove_stat_file:
- remove_proc_entry("fs/xfs/stat", NULL);
-#endif
- out_remove_xfs_dir:
- remove_proc_entry("fs/xfs", NULL);
- out:
+out:
+ remove_proc_subtree("fs/xfs", NULL);
return -ENOMEM;
}
void
xfs_cleanup_procfs(void)
{
-#ifdef CONFIG_XFS_QUOTA
- remove_proc_entry("fs/xfs/xqm", NULL);
- remove_proc_entry("fs/xfs/xqmstat", NULL);
-#endif
- remove_proc_entry("fs/xfs/stat", NULL);
- remove_proc_entry("fs/xfs", NULL);
+ remove_proc_subtree("fs/xfs", NULL);
}
+#endif /* CONFIG_PROC_FS */
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index c8f238b8299a..483b0eff1988 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -19,8 +19,6 @@
#define __XFS_STATS_H__
-#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF)
-
#include <linux/percpu.h>
/*
@@ -215,15 +213,29 @@ struct xfsstats {
__uint64_t xs_read_bytes;
};
-DECLARE_PER_CPU(struct xfsstats, xfsstats);
+int xfs_stats_format(struct xfsstats __percpu *stats, char *buf);
+void xfs_stats_clearall(struct xfsstats __percpu *stats);
+extern struct xstats xfsstats;
-/*
- * We don't disable preempt, not too worried about poking the
- * wrong CPU's stat for now (also aggregated before reporting).
- */
-#define XFS_STATS_INC(v) (per_cpu(xfsstats, current_cpu()).v++)
-#define XFS_STATS_DEC(v) (per_cpu(xfsstats, current_cpu()).v--)
-#define XFS_STATS_ADD(v, inc) (per_cpu(xfsstats, current_cpu()).v += (inc))
+#define XFS_STATS_INC(mp, v) \
+do { \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v++; \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v++; \
+} while (0)
+
+#define XFS_STATS_DEC(mp, v) \
+do { \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v--; \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v--; \
+} while (0)
+
+#define XFS_STATS_ADD(mp, v, inc) \
+do { \
+ per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v += (inc); \
+ per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v += (inc); \
+} while (0)
+
+#if defined(CONFIG_PROC_FS)
extern int xfs_init_procfs(void);
extern void xfs_cleanup_procfs(void);
@@ -231,10 +243,6 @@ extern void xfs_cleanup_procfs(void);
#else /* !CONFIG_PROC_FS */
-# define XFS_STATS_INC(count)
-# define XFS_STATS_DEC(count)
-# define XFS_STATS_ADD(count, inc)
-
static inline int xfs_init_procfs(void)
{
return 0;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 904f637cfa5f..36bd8825bfb0 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -838,17 +838,18 @@ xfs_init_mount_workqueues(
goto out_destroy_unwritten;
mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
- WQ_FREEZABLE, 0, mp->m_fsname);
+ WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
if (!mp->m_reclaim_workqueue)
goto out_destroy_cil;
mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
- WQ_FREEZABLE|WQ_HIGHPRI, 0, mp->m_fsname);
+ WQ_MEM_RECLAIM|WQ_FREEZABLE|WQ_HIGHPRI, 0,
+ mp->m_fsname);
if (!mp->m_log_workqueue)
goto out_destroy_reclaim;
mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
- WQ_FREEZABLE, 0, mp->m_fsname);
+ WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname);
if (!mp->m_eofblocks_workqueue)
goto out_destroy_log;
@@ -922,7 +923,7 @@ xfs_fs_destroy_inode(
trace_xfs_destroy_inode(ip);
- XFS_STATS_INC(vn_reclaim);
+ XFS_STATS_INC(ip->i_mount, vn_reclaim);
ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
@@ -983,8 +984,8 @@ xfs_fs_evict_inode(
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
- XFS_STATS_INC(vn_rele);
- XFS_STATS_INC(vn_remove);
+ XFS_STATS_INC(ip->i_mount, vn_rele);
+ XFS_STATS_INC(ip->i_mount, vn_remove);
xfs_inactive(ip);
}
@@ -1474,9 +1475,16 @@ xfs_fs_fill_super(
if (error)
goto out_destroy_workqueues;
+ /* Allocate stats memory before we do operations that might use it */
+ mp->m_stats.xs_stats = alloc_percpu(struct xfsstats);
+ if (!mp->m_stats.xs_stats) {
+ error = -ENOMEM;
+ goto out_destroy_counters;
+ }
+
error = xfs_readsb(mp, flags);
if (error)
- goto out_destroy_counters;
+ goto out_free_stats;
error = xfs_finish_flags(mp);
if (error)
@@ -1545,9 +1553,11 @@ xfs_fs_fill_super(
xfs_filestream_unmount(mp);
out_free_sb:
xfs_freesb(mp);
+ out_free_stats:
+ free_percpu(mp->m_stats.xs_stats);
out_destroy_counters:
xfs_destroy_percpu_counters(mp);
-out_destroy_workqueues:
+ out_destroy_workqueues:
xfs_destroy_mount_workqueues(mp);
out_close_devices:
xfs_close_devices(mp);
@@ -1574,6 +1584,7 @@ xfs_fs_put_super(
xfs_unmountfs(mp);
xfs_freesb(mp);
+ free_percpu(mp->m_stats.xs_stats);
xfs_destroy_percpu_counters(mp);
xfs_destroy_mount_workqueues(mp);
xfs_close_devices(mp);
@@ -1838,19 +1849,32 @@ init_xfs_fs(void)
xfs_kset = kset_create_and_add("xfs", NULL, fs_kobj);
if (!xfs_kset) {
error = -ENOMEM;
- goto out_sysctl_unregister;;
+ goto out_sysctl_unregister;
}
+ xfsstats.xs_kobj.kobject.kset = xfs_kset;
+
+ xfsstats.xs_stats = alloc_percpu(struct xfsstats);
+ if (!xfsstats.xs_stats) {
+ error = -ENOMEM;
+ goto out_kset_unregister;
+ }
+
+ error = xfs_sysfs_init(&xfsstats.xs_kobj, &xfs_stats_ktype, NULL,
+ "stats");
+ if (error)
+ goto out_free_stats;
+
#ifdef DEBUG
xfs_dbg_kobj.kobject.kset = xfs_kset;
error = xfs_sysfs_init(&xfs_dbg_kobj, &xfs_dbg_ktype, NULL, "debug");
if (error)
- goto out_kset_unregister;
+ goto out_remove_stats_kobj;
#endif
error = xfs_qm_init();
if (error)
- goto out_remove_kobj;
+ goto out_remove_dbg_kobj;
error = register_filesystem(&xfs_fs_type);
if (error)
@@ -1859,11 +1883,15 @@ init_xfs_fs(void)
out_qm_exit:
xfs_qm_exit();
- out_remove_kobj:
+ out_remove_dbg_kobj:
#ifdef DEBUG
xfs_sysfs_del(&xfs_dbg_kobj);
- out_kset_unregister:
+ out_remove_stats_kobj:
#endif
+ xfs_sysfs_del(&xfsstats.xs_kobj);
+ out_free_stats:
+ free_percpu(xfsstats.xs_stats);
+ out_kset_unregister:
kset_unregister(xfs_kset);
out_sysctl_unregister:
xfs_sysctl_unregister();
@@ -1889,6 +1917,8 @@ exit_xfs_fs(void)
#ifdef DEBUG
xfs_sysfs_del(&xfs_dbg_kobj);
#endif
+ xfs_sysfs_del(&xfsstats.xs_kobj);
+ free_percpu(xfsstats.xs_stats);
kset_unregister(xfs_kset);
xfs_sysctl_unregister();
xfs_cleanup_procfs();
@@ -1896,6 +1926,7 @@ exit_xfs_fs(void)
xfs_mru_cache_uninit();
xfs_destroy_workqueues();
xfs_destroy_zones();
+ xfs_uuid_table_free();
}
module_init(init_xfs_fs);
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index a0c8067cea6f..aed74d3f8da9 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -19,6 +19,7 @@
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include "xfs_error.h"
+#include "xfs_stats.h"
static struct ctl_table_header *xfs_table_header;
@@ -31,22 +32,12 @@ xfs_stats_clear_proc_handler(
size_t *lenp,
loff_t *ppos)
{
- int c, ret, *valp = ctl->data;
- __uint32_t vn_active;
+ int ret, *valp = ctl->data;
ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
if (!ret && write && *valp) {
- xfs_notice(NULL, "Clearing xfsstats");
- for_each_possible_cpu(c) {
- preempt_disable();
- /* save vn_active, it's a universal truth! */
- vn_active = per_cpu(xfsstats, c).vn_active;
- memset(&per_cpu(xfsstats, c), 0,
- sizeof(struct xfsstats));
- per_cpu(xfsstats, c).vn_active = vn_active;
- preempt_enable();
- }
+ xfs_stats_clearall(xfsstats.xs_stats);
xfs_stats_clear = 0;
}
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index aa03670851d8..ee70f5dec9dc 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -21,11 +21,13 @@
#include "xfs_log_format.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
+#include "xfs_stats.h"
struct xfs_sysfs_attr {
struct attribute attr;
- ssize_t (*show)(char *buf, void *data);
- ssize_t (*store)(const char *buf, size_t count, void *data);
+ ssize_t (*show)(struct kobject *kobject, char *buf);
+ ssize_t (*store)(struct kobject *kobject, const char *buf,
+ size_t count);
};
static inline struct xfs_sysfs_attr *
@@ -38,6 +40,8 @@ to_attr(struct attribute *attr)
static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RW(name)
#define XFS_SYSFS_ATTR_RO(name) \
static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RO(name)
+#define XFS_SYSFS_ATTR_WO(name) \
+ static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_WO(name)
#define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr
@@ -51,14 +55,42 @@ struct kobj_type xfs_mp_ktype = {
.release = xfs_sysfs_release,
};
+STATIC ssize_t
+xfs_sysfs_object_show(
+ struct kobject *kobject,
+ struct attribute *attr,
+ char *buf)
+{
+ struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+ return xfs_attr->show ? xfs_attr->show(kobject, buf) : 0;
+}
+
+STATIC ssize_t
+xfs_sysfs_object_store(
+ struct kobject *kobject,
+ struct attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+
+ return xfs_attr->store ? xfs_attr->store(kobject, buf, count) : 0;
+}
+
+static const struct sysfs_ops xfs_sysfs_ops = {
+ .show = xfs_sysfs_object_show,
+ .store = xfs_sysfs_object_store,
+};
+
#ifdef DEBUG
/* debug */
STATIC ssize_t
log_recovery_delay_store(
+ struct kobject *kobject,
const char *buf,
- size_t count,
- void *data)
+ size_t count)
{
int ret;
int val;
@@ -77,8 +109,8 @@ log_recovery_delay_store(
STATIC ssize_t
log_recovery_delay_show(
- char *buf,
- void *data)
+ struct kobject *kobject,
+ char *buf)
{
return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.log_recovery_delay);
}
@@ -89,52 +121,87 @@ static struct attribute *xfs_dbg_attrs[] = {
NULL,
};
+struct kobj_type xfs_dbg_ktype = {
+ .release = xfs_sysfs_release,
+ .sysfs_ops = &xfs_sysfs_ops,
+ .default_attrs = xfs_dbg_attrs,
+};
+
+#endif /* DEBUG */
+
+/* stats */
+
+static inline struct xstats *
+to_xstats(struct kobject *kobject)
+{
+ struct xfs_kobj *kobj = to_kobj(kobject);
+
+ return container_of(kobj, struct xstats, xs_kobj);
+}
+
STATIC ssize_t
-xfs_dbg_show(
- struct kobject *kobject,
- struct attribute *attr,
- char *buf)
+stats_show(
+ struct kobject *kobject,
+ char *buf)
{
- struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+ struct xstats *stats = to_xstats(kobject);
- return xfs_attr->show ? xfs_attr->show(buf, NULL) : 0;
+ return xfs_stats_format(stats->xs_stats, buf);
}
+XFS_SYSFS_ATTR_RO(stats);
STATIC ssize_t
-xfs_dbg_store(
- struct kobject *kobject,
- struct attribute *attr,
- const char *buf,
- size_t count)
+stats_clear_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
{
- struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
+ int ret;
+ int val;
+ struct xstats *stats = to_xstats(kobject);
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
- return xfs_attr->store ? xfs_attr->store(buf, count, NULL) : 0;
+ if (val != 1)
+ return -EINVAL;
+
+ xfs_stats_clearall(stats->xs_stats);
+ return count;
}
+XFS_SYSFS_ATTR_WO(stats_clear);
-static struct sysfs_ops xfs_dbg_ops = {
- .show = xfs_dbg_show,
- .store = xfs_dbg_store,
+static struct attribute *xfs_stats_attrs[] = {
+ ATTR_LIST(stats),
+ ATTR_LIST(stats_clear),
+ NULL,
};
-struct kobj_type xfs_dbg_ktype = {
+struct kobj_type xfs_stats_ktype = {
.release = xfs_sysfs_release,
- .sysfs_ops = &xfs_dbg_ops,
- .default_attrs = xfs_dbg_attrs,
+ .sysfs_ops = &xfs_sysfs_ops,
+ .default_attrs = xfs_stats_attrs,
};
-#endif /* DEBUG */
-
/* xlog */
+static inline struct xlog *
+to_xlog(struct kobject *kobject)
+{
+ struct xfs_kobj *kobj = to_kobj(kobject);
+
+ return container_of(kobj, struct xlog, l_kobj);
+}
+
STATIC ssize_t
log_head_lsn_show(
- char *buf,
- void *data)
+ struct kobject *kobject,
+ char *buf)
{
- struct xlog *log = data;
int cycle;
int block;
+ struct xlog *log = to_xlog(kobject);
spin_lock(&log->l_icloglock);
cycle = log->l_curr_cycle;
@@ -147,12 +214,12 @@ XFS_SYSFS_ATTR_RO(log_head_lsn);
STATIC ssize_t
log_tail_lsn_show(
- char *buf,
- void *data)
+ struct kobject *kobject,
+ char *buf)
{
- struct xlog *log = data;
int cycle;
int block;
+ struct xlog *log = to_xlog(kobject);
xlog_crack_atomic_lsn(&log->l_tail_lsn, &cycle, &block);
return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block);
@@ -161,12 +228,13 @@ XFS_SYSFS_ATTR_RO(log_tail_lsn);
STATIC ssize_t
reserve_grant_head_show(
- char *buf,
- void *data)
+ struct kobject *kobject,
+ char *buf)
+
{
- struct xlog *log = data;
int cycle;
int bytes;
+ struct xlog *log = to_xlog(kobject);
xlog_crack_grant_head(&log->l_reserve_head.grant, &cycle, &bytes);
return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
@@ -175,12 +243,12 @@ XFS_SYSFS_ATTR_RO(reserve_grant_head);
STATIC ssize_t
write_grant_head_show(
- char *buf,
- void *data)
+ struct kobject *kobject,
+ char *buf)
{
- struct xlog *log = data;
int cycle;
int bytes;
+ struct xlog *log = to_xlog(kobject);
xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &bytes);
return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes);
@@ -195,45 +263,8 @@ static struct attribute *xfs_log_attrs[] = {
NULL,
};
-static inline struct xlog *
-to_xlog(struct kobject *kobject)
-{
- struct xfs_kobj *kobj = to_kobj(kobject);
- return container_of(kobj, struct xlog, l_kobj);
-}
-
-STATIC ssize_t
-xfs_log_show(
- struct kobject *kobject,
- struct attribute *attr,
- char *buf)
-{
- struct xlog *log = to_xlog(kobject);
- struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
-
- return xfs_attr->show ? xfs_attr->show(buf, log) : 0;
-}
-
-STATIC ssize_t
-xfs_log_store(
- struct kobject *kobject,
- struct attribute *attr,
- const char *buf,
- size_t count)
-{
- struct xlog *log = to_xlog(kobject);
- struct xfs_sysfs_attr *xfs_attr = to_attr(attr);
-
- return xfs_attr->store ? xfs_attr->store(buf, count, log) : 0;
-}
-
-static struct sysfs_ops xfs_log_ops = {
- .show = xfs_log_show,
- .store = xfs_log_store,
-};
-
struct kobj_type xfs_log_ktype = {
.release = xfs_sysfs_release,
- .sysfs_ops = &xfs_log_ops,
+ .sysfs_ops = &xfs_sysfs_ops,
.default_attrs = xfs_log_attrs,
};
diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h
index 240eee35f342..be692e59938d 100644
--- a/fs/xfs/xfs_sysfs.h
+++ b/fs/xfs/xfs_sysfs.h
@@ -22,6 +22,7 @@
extern struct kobj_type xfs_mp_ktype; /* xfs_mount */
extern struct kobj_type xfs_dbg_ktype; /* debug */
extern struct kobj_type xfs_log_ktype; /* xlog */
+extern struct kobj_type xfs_stats_ktype; /* stats */
static inline struct xfs_kobj *
to_kobj(struct kobject *kobject)
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 5ed36b1e04c1..877079eb0f8f 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -689,6 +689,7 @@ DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
DEFINE_INODE_EVENT(xfs_filemap_fault);
DEFINE_INODE_EVENT(xfs_filemap_pmd_fault);
DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite);
+DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite);
DECLARE_EVENT_CLASS(xfs_iref_class,
TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
@@ -1312,6 +1313,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize);
+DEFINE_SIMPLE_IO_EVENT(xfs_zero_eof);
DECLARE_EVENT_CLASS(xfs_itrunc_class,
TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index a0ab1dae9c31..748b16aff45a 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -930,9 +930,9 @@ __xfs_trans_commit(
*/
if (sync) {
error = _xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL);
- XFS_STATS_INC(xs_trans_sync);
+ XFS_STATS_INC(mp, xs_trans_sync);
} else {
- XFS_STATS_INC(xs_trans_async);
+ XFS_STATS_INC(mp, xs_trans_async);
}
return error;
@@ -955,7 +955,7 @@ out_unreserve:
xfs_trans_free_items(tp, NULLCOMMITLSN, !!error);
xfs_trans_free(tp);
- XFS_STATS_INC(xs_trans_empty);
+ XFS_STATS_INC(mp, xs_trans_empty);
return error;
}
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 1098cf490189..aa67339b9537 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -349,7 +349,7 @@ xfsaild_push(
xfs_ail_min_lsn(ailp))) {
ailp->xa_log_flush = 0;
- XFS_STATS_INC(xs_push_ail_flush);
+ XFS_STATS_INC(mp, xs_push_ail_flush);
xfs_log_force(mp, XFS_LOG_SYNC);
}
@@ -371,7 +371,7 @@ xfsaild_push(
goto out_done;
}
- XFS_STATS_INC(xs_push_ail);
+ XFS_STATS_INC(mp, xs_push_ail);
lsn = lip->li_lsn;
while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) {
@@ -385,7 +385,7 @@ xfsaild_push(
lock_result = lip->li_ops->iop_push(lip, &ailp->xa_buf_list);
switch (lock_result) {
case XFS_ITEM_SUCCESS:
- XFS_STATS_INC(xs_push_ail_success);
+ XFS_STATS_INC(mp, xs_push_ail_success);
trace_xfs_ail_push(lip);
ailp->xa_last_pushed_lsn = lsn;
@@ -403,7 +403,7 @@ xfsaild_push(
* re-try the flushing relatively soon if most of the
* AIL is beeing flushed.
*/
- XFS_STATS_INC(xs_push_ail_flushing);
+ XFS_STATS_INC(mp, xs_push_ail_flushing);
trace_xfs_ail_flushing(lip);
flushing++;
@@ -411,14 +411,14 @@ xfsaild_push(
break;
case XFS_ITEM_PINNED:
- XFS_STATS_INC(xs_push_ail_pinned);
+ XFS_STATS_INC(mp, xs_push_ail_pinned);
trace_xfs_ail_pinned(lip);
stuck++;
ailp->xa_log_flush++;
break;
case XFS_ITEM_LOCKED:
- XFS_STATS_INC(xs_push_ail_locked);
+ XFS_STATS_INC(mp, xs_push_ail_locked);
trace_xfs_ail_locked(lip);
stuck++;
@@ -497,6 +497,7 @@ xfsaild(
long tout = 0; /* milliseconds */
current->flags |= PF_MEMALLOC;
+ set_freezable();
while (!kthread_should_stop()) {
if (tout && tout <= 20)
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 17280cd71934..b97f1df910ab 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -108,6 +108,15 @@ xfs_trans_log_inode(
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
/*
+ * Record the specific change for fdatasync optimisation. This
+ * allows fdatasync to skip log forces for inodes that are only
+ * timestamp dirty. We do this before the change count so that
+ * the core being logged in this case does not impact on fdatasync
+ * behaviour.
+ */
+ ip->i_itemp->ili_fsync_fields |= flags;
+
+ /*
* First time we log the inode in a transaction, bump the inode change
* counter if it is configured for this to occur. We don't use
* inode_inc_version() because there is no need for extra locking around
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index c036815183cb..839b35ca21c6 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -32,9 +32,10 @@
static int
-xfs_xattr_get(struct dentry *dentry, const char *name,
- void *value, size_t size, int xflags)
+xfs_xattr_get(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, void *value, size_t size)
{
+ int xflags = handler->flags;
struct xfs_inode *ip = XFS_I(d_inode(dentry));
int error, asize = size;
@@ -53,11 +54,35 @@ xfs_xattr_get(struct dentry *dentry, const char *name,
return asize;
}
+void
+xfs_forget_acl(
+ struct inode *inode,
+ const char *name,
+ int xflags)
+{
+ /*
+ * Invalidate any cached ACLs if the user has bypassed the ACL
+ * interface. We don't validate the content whatsoever so it is caller
+ * responsibility to provide data in valid format and ensure i_mode is
+ * consistent.
+ */
+ if (xflags & ATTR_ROOT) {
+#ifdef CONFIG_XFS_POSIX_ACL
+ if (!strcmp(name, SGI_ACL_FILE))
+ forget_cached_acl(inode, ACL_TYPE_ACCESS);
+ else if (!strcmp(name, SGI_ACL_DEFAULT))
+ forget_cached_acl(inode, ACL_TYPE_DEFAULT);
+#endif
+ }
+}
+
static int
-xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
- size_t size, int flags, int xflags)
+xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry,
+ const char *name, const void *value, size_t size, int flags)
{
- struct xfs_inode *ip = XFS_I(d_inode(dentry));
+ int xflags = handler->flags;
+ struct xfs_inode *ip = XFS_I(d_inode(dentry));
+ int error;
if (strcmp(name, "") == 0)
return -EINVAL;
@@ -70,8 +95,12 @@ xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
if (!value)
return xfs_attr_remove(ip, (unsigned char *)name, xflags);
- return xfs_attr_set(ip, (unsigned char *)name,
+ error = xfs_attr_set(ip, (unsigned char *)name,
(void *)value, size, xflags);
+ if (!error)
+ xfs_forget_acl(d_inode(dentry), name, xflags);
+
+ return error;
}
static const struct xattr_handler xfs_xattr_user_handler = {